From b5e0edb17603e434ae8a6eabad02b9501c8154e8 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 13:47:19 +0100 Subject: [PATCH 01/15] Create python-package.yml --- .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .github/workflows/python-package.yml diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 00000000..14a4e65b --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,40 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest From 686786f09a8f17c28eecd5e572395c1c9e35f8d7 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 14:51:05 +0100 Subject: [PATCH 02/15] Update python-package.yml --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 14a4e65b..28885d0d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11"] + python-version: ["3.9-dev", "3.10-dev", "3.11-dev"] steps: - uses: actions/checkout@v3 From 3ee97d267feeee5d87b9c7305bff4e3108e6224a Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 14:53:19 +0100 Subject: [PATCH 03/15] Update python-package.yml --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 28885d0d..856af22a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9-dev", "3.10-dev", "3.11-dev"] + python-version: ["3.9", "3.10", "3.11"] steps: - uses: actions/checkout@v3 @@ -26,6 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + sudo apt install python-dev python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi From 37a5b573bcd5f77108522cf96b1557c63b7c32b1 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 14:59:36 +0100 Subject: [PATCH 04/15] Update python-package.yml --- .github/workflows/python-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 856af22a..14a4e65b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,7 +26,6 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - sudo apt install python-dev python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi From eef870afe4e7d3f2e0358d8feee082c42f4dda05 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 15:47:17 +0100 Subject: [PATCH 05/15] Update python-package.yml --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 14a4e65b..f7e7e7e0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,6 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + sudo apt install python3-dev python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi From 90e0e1200d0b5b49e45131abb43d8dba25737b8d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 16:26:06 +0100 Subject: [PATCH 06/15] remove fasttext, correct aut, and include langdetect --- README.md | 6 ------ main.py | 26 ++++++----------------- requirements.txt | 2 +- superstyl/preproc/tuyau.py | 43 ++++++++++++++++---------------------- 4 files changed, 25 insertions(+), 52 deletions(-) diff --git a/README.md b/README.md index 946ac70e..b617b53e 100755 --- a/README.md +++ b/README.md @@ -5,17 +5,11 @@ You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip` ```bash -# Only if you don't have it -sudo apt install python3.9-dev -# then git clone https://github.com/SupervisedStylometry/SuperStyl.git cd SuperStyl virtualenv -p python3.9 env source env/bin/activate pip install -r requirements.txt -# And get the model for language prediction -mkdir superstyl/preproc/models -wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/ ``` ## Workflow diff --git a/main.py b/main.py index f3de7916..55589b26 100755 --- a/main.py +++ b/main.py @@ -26,7 +26,6 @@ parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str) parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int) parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int) - parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str) parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int) parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False) parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training? @@ -44,32 +43,19 @@ help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)", default=False) parser.add_argument('--identify_lang', action='store_true', - help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model", + help="if true, should the language of each text be guessed, using langdetect (default is False)", default=False) args = parser.parse_args() - if args.identify_lang: - model = fasttext.load_model("superstyl/preproc/models/lid.176.bin") - else: - model=None - print(".......loading texts.......") - if args.c: - # "debug_authors.csv" - correct_aut = pandas.read_csv(args.c) - # a bit hacky. Improve later - correct_aut.index = list(correct_aut.loc[:, "Original"]) - myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym) + if args.sampling: + myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step, + units=args.sample_units, feature="tokens", format=args.x, + keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples) else: - if args.sampling: - myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step, - units=args.sample_units, feature="tokens", format=args.x, - keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples) - - else: - myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym) + myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym) print(".......getting features.......") diff --git a/requirements.txt b/requirements.txt index d768ec96..22a1745f 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -fasttext==0.9.2 +langdetect==1.0.9 joblib==1.2.0 lxml==4.9.1 nltk==3.6.6 diff --git a/superstyl/preproc/tuyau.py b/superstyl/preproc/tuyau.py index 6465c4b7..fda95fde 100755 --- a/superstyl/preproc/tuyau.py +++ b/superstyl/preproc/tuyau.py @@ -3,12 +3,12 @@ import unidecode import nltk.tokenize import random +import langdetect -def XML_to_text(path, correct_aut=None): +def XML_to_text(path): """ Get main text from xml file :param path: path to the file to transform - :param correct_aut: optional data frame of metadata correction (authors) :return: a tuple with auts, and string (the text). """ @@ -45,18 +45,14 @@ def XML_to_text(path, correct_aut=None): else: aut = auts[0] - if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]): - print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"]) - aut = correct_aut.loc[aut, "Actual"] return aut, re.sub(r"\s+", " ", str(myxsl(my_doc))) -def TXT_to_text(path, correct_aut=None): +def TXT_to_text(path): """ Get main text from xml file :param path: path to the file to transform - :param correct_aut: optional data frame of metadata correction (authors) :return: a tuple with auts, and string (the text). """ @@ -70,15 +66,14 @@ def TXT_to_text(path, correct_aut=None): return aut, re.sub(r"\s+", " ", str(' '.join(txt))) -def identify_lang(string, model): +def detect_lang(string): """ Get the language from a string :param string: a string, duh - :param model, the fasttext model :return: the language """ - return model.predict(string) # , k = 3) + return langdetect.detect(string) # , k = 3) def normalise(text, keep_punct=False, keep_sym=False): @@ -98,14 +93,13 @@ def normalise(text, keep_punct=False, keep_sym=False): return out -def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False): +def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False): """ Loads a collection of documents into a 'myTexts' object for further processing. TODO: a proper class :param paths: path to docs - :param identify_lang: what model to use for language guessing of the texts (default: None) + :param identify_lang: whether or not try to identify lang (default: False) :param format: format of the source files (implemented values: txt [default], xml) - :param correct_aut: optional data frame of metadata correction (authors) :param keep_punct: whether or not to keep punctuation and caps. :param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode). :return: a myTexts object @@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_p name = path.split('/')[-1] if format=='xml': - aut, text = XML_to_text(path, correct_aut=correct_aut) + aut, text = XML_to_text(path) else: - aut, text = TXT_to_text(path) # implement correct_aut + aut, text = TXT_to_text(path) - if identify_lang is not None: - lang, cert = identify_lang(text, identify_lang) - lang = lang[0].replace("__label__", "") + if identify_lang: + lang = detect_lang(text) else: lang = "NA" @@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format= def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False, - keep_sym=False, max_samples=None, identify_lang=None): + keep_sym=False, max_samples=None, identify_lang=False): """ Loads a collection of documents into a 'myTexts' object for further processing BUT with samples ! :param paths: path to docs @@ -227,20 +220,20 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED) :param keep_punct: whether or not to keep punctuation and caps. :param max_samples: maximum number of samples per author. - :param identify_lang: what model to use for language guessing of the texts (default: None) + :param identify_lang: whether or not try to identify lang (default: False) """ myTexts = [] for path in paths: aut = path.split('/')[-1].split('_')[0] - if identify_lang is not None: + if identify_lang: if format == 'xml': - aut, text = XML_to_text(path, correct_aut=correct_aut) + aut, text = XML_to_text(path) else: - aut, text = TXT_to_text(path) # implement correct_aut + aut, text = TXT_to_text(path) + + lang = detect_lang(text) - lang, cert = identify_lang(text, identify_lang) - lang = lang[0].replace("__label__", "") else: lang = 'NA' From ced09698753cd66a26f393430061d9404377ae6d Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 16:40:22 +0100 Subject: [PATCH 07/15] rename --- {tests => test}/test_main.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {tests => test}/test_main.py (100%) diff --git a/tests/test_main.py b/test/test_main.py similarity index 100% rename from tests/test_main.py rename to test/test_main.py From 12a69acc4b4ac60af6f3d8924af5bca82e99f36c Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 16:45:51 +0100 Subject: [PATCH 08/15] grrrr --- {test => tests}/test_main.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {test => tests}/test_main.py (100%) diff --git a/test/test_main.py b/tests/test_main.py similarity index 100% rename from test/test_main.py rename to tests/test_main.py From 8f21273e513a2fab64b95e9e669399cf7a512c16 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 16:48:43 +0100 Subject: [PATCH 09/15] fix tests --- .github/workflows/python-package.yml | 4 ++-- tests/__init__.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 tests/__init__.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f7e7e7e0..93d35ad0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -36,6 +36,6 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest + - name: Test with unittest run: | - pytest + python -m unittest diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b From 2133e2337c15ef33b8d7f77503d79c327b4ce09e Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:00:37 +0100 Subject: [PATCH 10/15] codecov --- .github/workflows/python-package.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 93d35ad0..d855b0ac 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -36,6 +36,9 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with unittest + - name: Generate Report run: | - python -m unittest + pip install coverage + coverage run -m unittest + - name: Upload Coverage to Codecov + uses: codecov/codecov-action@v4 From 5f84735899c5a624d4951ef4c09389249ff20bf6 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:02:01 +0100 Subject: [PATCH 11/15] codecov --- codecov.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 00000000..3b859168 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,2 @@ +ignore: + - "*/tests/*" \ No newline at end of file From c1091dc0d1abec738872e3288877102f825008b1 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:12:19 +0100 Subject: [PATCH 12/15] coverage with token --- .github/workflows/python-package.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index d855b0ac..78e8c298 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -42,3 +42,10 @@ jobs: coverage run -m unittest - name: Upload Coverage to Codecov uses: codecov/codecov-action@v4 + with: + fail_ci_if_error: true # optional (default = false) + flags: unittests # optional + name: codecov-umbrella # optional + token: ${{ secrets.CODECOV_TOKEN }} # required + slug: SupervisedStylometry/SuperStyl + verbose: true # optional (default = false) From 08659991c000168e14f882db24a0a30882b8975f Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:15:13 +0100 Subject: [PATCH 13/15] badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b617b53e..7475e119 100755 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # SUPERvised STYLometry +[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl) + ## Installing You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip` From 5418336cd46d35462da801eba31b1297090dd581 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:24:08 +0100 Subject: [PATCH 14/15] remove dev --- .github/workflows/python-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 78e8c298..e45ea890 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -26,7 +26,6 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - sudo apt install python3-dev python -m pip install --upgrade pip python -m pip install flake8 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi From 5806c45f90e417fb0d791c67f6cacd923db58738 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Camps Date: Tue, 13 Feb 2024 17:41:42 +0100 Subject: [PATCH 15/15] finally quickjumped pandas a few years in the future --- requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 22a1745f..6226c220 100755 --- a/requirements.txt +++ b/requirements.txt @@ -2,14 +2,15 @@ langdetect==1.0.9 joblib==1.2.0 lxml==4.9.1 nltk==3.6.6 -numpy==1.22.0 +numpy==1.26.4 pybind11==2.8.1 scikit-learn==1.2.1 scipy==1.10.0 six==1.16.0 tqdm==4.64.1 unidecode==1.3.2 -pandas==1.3.4 +pandas==2.2.0 +pyarrow==15.0.0 argparse==1.4.0 regex==2022.10.31 matplotlib==3.6.2