Merge pull request #39 from SupervisedStylometry/tests
Tests
Jean-Baptiste-Camps authored Feb 13, 2024
2 parents b719f07 + 5806c45 commit d8e8c9c
Showing 7 changed files with 82 additions and 54 deletions.
50 changes: 50 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,50 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        python -m pip install flake8 pytest
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Generate Report
      run: |
        pip install coverage
        coverage run -m unittest
    - name: Upload Coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        fail_ci_if_error: true # optional (default = false)
        flags: unittests # optional
        name: codecov-umbrella # optional
        token: ${{ secrets.CODECOV_TOKEN }} # required
        slug: SupervisedStylometry/SuperStyl
        verbose: true # optional (default = false)
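
The same checks can be approximated locally before pushing; a minimal sketch, assuming an activated virtualenv at the repository root (the final `coverage report` step is added here for local inspection and is not part of the workflow):

```bash
# approximate the CI job locally (assumes an activated virtualenv)
python -m pip install --upgrade pip
python -m pip install flake8 coverage
pip install -r requirements.txt
# hard lint: fail only on syntax errors and undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# run the test suite under coverage, then print a summary
coverage run -m unittest
coverage report
```
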
8 changes: 2 additions & 6 deletions README.md
@@ -1,21 +1,17 @@
# SUPERvised STYLometry

+[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl)

## Installing

You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`

```bash
# Only if you don't have it
sudo apt install python3.9-dev
# then
git clone https://github.com/SupervisedStylometry/SuperStyl.git
cd SuperStyl
virtualenv -p python3.9 env
source env/bin/activate
pip install -r requirements.txt
-# And get the model for language prediction
-mkdir superstyl/preproc/models
-wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/
```

## Workflow
2 changes: 2 additions & 0 deletions codecov.yml
@@ -0,0 +1,2 @@
ignore:
- "*/tests/*"
26 changes: 6 additions & 20 deletions main.py
@@ -26,7 +26,6 @@
 parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
 parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
 parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
-parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
 parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
 parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
 parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
@@ -44,32 +43,19 @@
                     help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
                     default=False)
 parser.add_argument('--identify_lang', action='store_true',
-                    help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model",
+                    help="if true, should the language of each text be guessed, using langdetect (default is False)",
                     default=False)
 args = parser.parse_args()

-if args.identify_lang:
-    model = fasttext.load_model("superstyl/preproc/models/lid.176.bin")
-else:
-    model=None
-
 print(".......loading texts.......")

-if args.c:
-    # "debug_authors.csv"
-    correct_aut = pandas.read_csv(args.c)
-    # a bit hacky. Improve later
-    correct_aut.index = list(correct_aut.loc[:, "Original"])
-    myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+if args.sampling:
+    myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step,
+                                  units=args.sample_units, feature="tokens", format=args.x,
+                                  keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)

 else:
-    if args.sampling:
-        myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step,
-                                      units=args.sample_units, feature="tokens", format=args.x,
-                                      keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)
-
-    else:
-        myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+    myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)

 print(".......getting features.......")

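
Net effect on the command line: `--identify_lang` is now a plain boolean switch that no longer needs a downloaded fasttext model, and the `-c` metadata-correction option is gone. A hypothetical invocation (corpus paths invented for illustration; `-s`, `-t` and `-n` as defined by the argument parser above):

```bash
# hypothetical example: word unigram features, with language guessing via langdetect
python main.py -s corpus/*.txt -t words -n 1 --identify_lang
```
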
7 changes: 4 additions & 3 deletions requirements.txt
@@ -1,15 +1,16 @@
-fasttext==0.9.2
+langdetect==1.0.9
 joblib==1.2.0
 lxml==4.9.1
 nltk==3.6.6
-numpy==1.22.0
+numpy==1.26.4
 pybind11==2.8.1
 scikit-learn==1.2.1
 scipy==1.10.0
 six==1.16.0
 tqdm==4.64.1
 unidecode==1.3.2
-pandas==1.3.4
+pandas==2.2.0
+pyarrow==15.0.0
 argparse==1.4.0
 regex==2022.10.31
 matplotlib==3.6.2
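
An existing environment built from the old requirements will still carry fasttext; a minimal upgrade sketch (assumes pip inside the project virtualenv):

```bash
# drop the old language-identification dependency, then sync the rest
pip uninstall -y fasttext
pip install -r requirements.txt
```
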
43 changes: 18 additions & 25 deletions superstyl/preproc/tuyau.py
@@ -3,12 +3,12 @@
 import unidecode
 import nltk.tokenize
 import random
+import langdetect

-def XML_to_text(path, correct_aut=None):
+def XML_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """

@@ -45,18 +45,14 @@ def XML_to_text(path, correct_aut=None):

     else:
         aut = auts[0]
-        if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]):
-            print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"])
-            aut = correct_aut.loc[aut, "Actual"]

     return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))


-def TXT_to_text(path, correct_aut=None):
+def TXT_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """

@@ -70,15 +66,14 @@ def TXT_to_text(path, correct_aut=None):
     return aut, re.sub(r"\s+", " ", str(' '.join(txt)))


-def identify_lang(string, model):
+def detect_lang(string):
     """
     Get the language from a string
     :param string: a string, duh
-    :param model, the fasttext model
     :return: the language
     """

-    return model.predict(string) # , k = 3)
+    return langdetect.detect(string) # , k = 3)


 def normalise(text, keep_punct=False, keep_sym=False):
@@ -98,14 +93,13 @@ def normalise(text, keep_punct=False, keep_sym=False):
     return out


-def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
+def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing.
     TODO: a proper class
     :param paths: path to docs
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not try to identify lang (default: False)
     :param format: format of the source files (implemented values: txt [default], xml)
-    :param correct_aut: optional data frame of metadata correction (authors)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
     :return: a myTexts object
@@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
         name = path.split('/')[-1]

         if format=='xml':
-            aut, text = XML_to_text(path, correct_aut=correct_aut)
+            aut, text = XML_to_text(path)

         else:
-            aut, text = TXT_to_text(path) # implement correct_aut
+            aut, text = TXT_to_text(path)

-        if identify_lang is not None:
-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
+        if identify_lang:
+            lang = detect_lang(text)
         else:
             lang = "NA"

@@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format=


 def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
-                    keep_sym=False, max_samples=None, identify_lang=None):
+                    keep_sym=False, max_samples=None, identify_lang=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
     :param paths: path to docs
@@ -227,20 +220,20 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param max_samples: maximum number of samples per author.
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not try to identify lang (default: False)
     """
     myTexts = []
     for path in paths:
         aut = path.split('/')[-1].split('_')[0]
-        if identify_lang is not None:
+        if identify_lang:
             if format == 'xml':
-                aut, text = XML_to_text(path, correct_aut=correct_aut)
+                aut, text = XML_to_text(path)

             else:
-                aut, text = TXT_to_text(path) # implement correct_aut
+                aut, text = TXT_to_text(path)

+            lang = detect_lang(text)

-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
         else:
             lang = 'NA'
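
The API difference behind most of these edits: fasttext's `predict` returns label tuples plus probabilities, so the old code had to strip a `__label__` prefix and discard the certainty value, whereas `langdetect.detect` returns an ISO 639-1 code directly. A minimal sketch (example string invented):

```python
import langdetect

# old behaviour (fasttext): model.predict(text) -> (('__label__fr',), array([0.99])),
# hence the deleted lang[0].replace("__label__", "") step above.
# new behaviour (langdetect): the language code comes back directly.
# note: langdetect is probabilistic; very short strings may be misidentified.
lang = langdetect.detect("Ceci est un texte en français.")
print(lang)  # expected: 'fr'
```
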
Empty file added tests/__init__.py
