diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 00000000..e45ea890
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,50 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Generate Report
+      run: |
+        pip install coverage
+        coverage run -m unittest
+    - name: Upload Coverage to Codecov
+      uses: codecov/codecov-action@v4
+      with:
+        fail_ci_if_error: true # optional (default = false)
+        flags: unittests # optional
+        name: codecov-umbrella # optional
+        token: ${{ secrets.CODECOV_TOKEN }} # required
+        slug: SupervisedStylometry/SuperStyl
+        verbose: true # optional (default = false)
diff --git a/README.md b/README.md
index 946ac70e..7475e119 100755
--- a/README.md
+++ b/README.md
@@ -1,21 +1,17 @@
 # SUPERvised STYLometry
 
+[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl)
+
 ## Installing
 
 You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`
 
 ```bash
-# Only if you don't have it
-sudo apt install python3.9-dev
-# then
 git clone https://github.com/SupervisedStylometry/SuperStyl.git
 cd SuperStyl
 virtualenv -p python3.9 env
 source env/bin/activate
 pip install -r requirements.txt
-# And get the model for language prediction
-mkdir superstyl/preproc/models
-wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/
 ```
 
 ## Workflow
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000..3b859168
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,2 @@
+ignore:
+  - "*/tests/*"
\ No newline at end of file
diff --git a/main.py b/main.py
index f3de7916..55589b26 100755
--- a/main.py
+++ b/main.py
@@ -26,7 +26,6 @@
     parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
-    parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
     parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
@@ -44,32 +43,19 @@
                         help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
                         default=False)
     parser.add_argument('--identify_lang', action='store_true',
-                        help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model",
+                        help="if true, should the language of each text be guessed, using langdetect (default is False)",
                         default=False)
 
     args = parser.parse_args()
 
-    if args.identify_lang:
-        model = fasttext.load_model("superstyl/preproc/models/lid.176.bin")
-    else:
-        model=None
-
     print(".......loading texts.......")
 
-    if args.c:
-        # "debug_authors.csv"
-        correct_aut = pandas.read_csv(args.c)
-        # a bit hacky. Improve later
-        correct_aut.index = list(correct_aut.loc[:, "Original"])
-        myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+    if args.sampling:
+        myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step,
+                                      units=args.sample_units, feature="tokens", format=args.x,
+                                      keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)
 
     else:
-        if args.sampling:
-            myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step,
-                                          units=args.sample_units, feature="tokens", format=args.x,
-                                          keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)
-
-        else:
-            myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+        myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
 
     print(".......getting features.......")
diff --git a/requirements.txt b/requirements.txt
index d768ec96..6226c220 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,16 @@
-fasttext==0.9.2
+langdetect==1.0.9
 joblib==1.2.0
 lxml==4.9.1
 nltk==3.6.6
-numpy==1.22.0
+numpy==1.26.4
 pybind11==2.8.1
 scikit-learn==1.2.1
 scipy==1.10.0
 six==1.16.0
 tqdm==4.64.1
 unidecode==1.3.2
-pandas==1.3.4
+pandas==2.2.0
+pyarrow==15.0.0
 argparse==1.4.0
 regex==2022.10.31
 matplotlib==3.6.2
diff --git a/superstyl/preproc/tuyau.py b/superstyl/preproc/tuyau.py
index 6465c4b7..fda95fde 100755
--- a/superstyl/preproc/tuyau.py
+++ b/superstyl/preproc/tuyau.py
@@ -3,12 +3,12 @@
 import unidecode
 import nltk.tokenize
 import random
+import langdetect
 
 
-def XML_to_text(path, correct_aut=None):
+def XML_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """
@@ -45,18 +45,14 @@
     else:
         aut = auts[0]
 
-    if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]):
-        print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"])
-        aut = correct_aut.loc[aut, "Actual"]
 
     return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))
 
 
-def TXT_to_text(path, correct_aut=None):
+def TXT_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """
 
@@ -70,15 +66,14 @@
     return aut, re.sub(r"\s+", " ", str(' '.join(txt)))
 
 
-def identify_lang(string, model):
+def detect_lang(string):
     """
     Get the language from a string
     :param string: a string, duh
-    :param model, the fasttext model
     :return: the language
     """
 
-    return model.predict(string) # , k = 3)
+    return langdetect.detect(string) # , k = 3)
 
 
 def normalise(text, keep_punct=False, keep_sym=False):
@@ -98,14 +93,13 @@
     return out
 
 
-def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
+def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing.
     TODO: a proper class
     :param paths: path to docs
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not try to identify lang (default: False)
     :param format: format of the source files (implemented values: txt [default], xml)
-    :param correct_aut: optional data frame of metadata correction (authors)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
     :return: a myTexts object
@@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_p
         name = path.split('/')[-1]
 
         if format=='xml':
-            aut, text = XML_to_text(path, correct_aut=correct_aut)
+            aut, text = XML_to_text(path)
         else:
-            aut, text = TXT_to_text(path) # implement correct_aut
+            aut, text = TXT_to_text(path)
 
-        if identify_lang is not None:
-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
+        if identify_lang:
+            lang = detect_lang(text)
 
         else:
             lang = "NA"
 
@@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format=
 
 
 def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
-                    keep_sym=False, max_samples=None, identify_lang=None):
+                    keep_sym=False, max_samples=None, identify_lang=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
     :param paths: path to docs
     :param size: sample size
     :param step: step for sampling with overlap (default is step = size, which means no overlap)
     :param units: the units to use for sampling, (words, verses; default: verses)
     :param feature: the type of tokens to extract (default is tokens, not lemmas or POS)
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param max_samples: maximum number of samples per author.
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not try to identify lang (default: False)
     """
 
     myTexts = []
     for path in paths:
         aut = path.split('/')[-1].split('_')[0]
-
         if identify_lang:
             if format == 'xml':
-                aut, text = XML_to_text(path, correct_aut=correct_aut)
+                aut, text = XML_to_text(path)
             else:
-                aut, text = TXT_to_text(path) # implement correct_aut
+                aut, text = TXT_to_text(path)
+
+            lang = detect_lang(text)
 
-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
         else:
             lang = 'NA'
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
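Note on the test setup: the workflow runs the suite with `coverage run -m unittest`, which performs standard unittest discovery from the repository root, and the empty `tests/__init__.py` is presumably added so that discovery treats `tests/` as an importable package. A minimal sketch of the kind of test module this setup would collect; the file name `tests/test_tuyau.py` and the assertion are illustrative only, not part of the patch:

```python
# Hypothetical tests/test_tuyau.py -- illustrative only, not included in this patch.
import unittest

from superstyl.preproc import tuyau


class TestDetectLang(unittest.TestCase):
    def test_detect_lang_returns_language_code(self):
        # detect_lang() now wraps langdetect.detect(), which yields a plain
        # language code instead of fasttext's (labels, probabilities) tuple.
        self.assertEqual(tuyau.detect_lang("The quick brown fox jumps over the lazy dog."), "en")


if __name__ == "__main__":
    unittest.main()
```

Locally, the same data the workflow uploads to Codecov can be inspected with `coverage report` (or `coverage html`) after the `coverage run` step.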
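Note on the language-identification change: dropping the fasttext model download and the `lang, cert = ...` unpacking follows directly from what `langdetect` returns. A minimal sketch of the difference, assuming `langdetect==1.0.9` as pinned in `requirements.txt` (the example string is illustrative):

```python
# Minimal sketch of the behavioural difference behind this patch (illustrative only).
import langdetect

# langdetect.detect() returns a plain language code as a string, e.g. "fr".
lang = langdetect.detect("Longtemps, je me suis couché de bonne heure.")
print(lang)  # expected: "fr"

# The removed fasttext path returned a (labels, probabilities) pair such as
# (('__label__fr',), array([0.99])), hence the old two-step unpacking
#     lang, cert = identify_lang(text, model)
#     lang = lang[0].replace("__label__", "")
# that the new detect_lang() no longer needs.
```

One caveat worth keeping in mind: `langdetect` is probabilistic, so results on short or mixed-language strings can vary between runs unless `langdetect.DetectorFactory.seed` is fixed.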