From b5e0edb17603e434ae8a6eabad02b9501c8154e8 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 13:47:19 +0100
Subject: [PATCH 01/15] Create python-package.yml

---
 .github/workflows/python-package.yml | 40 ++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 .github/workflows/python-package.yml

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 00000000..14a4e65b
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,40 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+name: Python package
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install flake8 pytest
+        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+    - name: Lint with flake8
+      run: |
+        # stop the build if there are Python syntax errors or undefined names
+        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Test with pytest
+      run: |
+        pytest

From 686786f09a8f17c28eecd5e572395c1c9e35f8d7 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 14:51:05 +0100
Subject: [PATCH 02/15] Update python-package.yml

---
 .github/workflows/python-package.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 14a4e65b..28885d0d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9-dev", "3.10-dev", "3.11-dev"]
 
     steps:
     - uses: actions/checkout@v3

From 3ee97d267feeee5d87b9c7305bff4e3108e6224a Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 14:53:19 +0100
Subject: [PATCH 03/15] Update python-package.yml

---
 .github/workflows/python-package.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 28885d0d..856af22a 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9-dev", "3.10-dev", "3.11-dev"]
+        python-version: ["3.9", "3.10", "3.11"]
 
     steps:
     - uses: actions/checkout@v3
@@ -26,6 +26,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
+        sudo apt install python-dev
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From 37a5b573bcd5f77108522cf96b1557c63b7c32b1 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 14:59:36 +0100
Subject: [PATCH 04/15] Update python-package.yml

---
 .github/workflows/python-package.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 856af22a..14a4e65b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -26,7 +26,6 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        sudo apt install python-dev
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From eef870afe4e7d3f2e0358d8feee082c42f4dda05 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 15:47:17 +0100
Subject: [PATCH 05/15] Update python-package.yml

---
 .github/workflows/python-package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 14a4e65b..f7e7e7e0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -26,6 +26,7 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
+        sudo apt install python3-dev
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From 90e0e1200d0b5b49e45131abb43d8dba25737b8d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 16:26:06 +0100
Subject: [PATCH 06/15] remove fasttext, correct aut, and include langdetect

---
 README.md                  |  6 ------
 main.py                    | 26 ++++++-----------------
 requirements.txt           |  2 +-
 superstyl/preproc/tuyau.py | 43 ++++++++++++++++----------------------
 4 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/README.md b/README.md
index 946ac70e..b617b53e 100755
--- a/README.md
+++ b/README.md
@@ -5,17 +5,11 @@
 You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`
 
 ```bash
-# Only if you don't have it
-sudo apt install python3.9-dev
-# then
 git clone https://github.com/SupervisedStylometry/SuperStyl.git
 cd SuperStyl
 virtualenv -p python3.9 env
 source env/bin/activate
 pip install -r requirements.txt
-# And get the model for language prediction
-mkdir superstyl/preproc/models
-wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -P ./superstyl/preproc/models/
 ```
 
 ## Workflow
diff --git a/main.py b/main.py
index f3de7916..55589b26 100755
--- a/main.py
+++ b/main.py
@@ -26,7 +26,6 @@
     parser.add_argument('-t', action='store', help="types of features (words or chars)", type=str)
     parser.add_argument('-n', action='store', help="n grams lengths (default 1)", default=1, type=int)
     parser.add_argument('-p', action='store', help="Processes to use (default 1)", default=1, type=int)
-    parser.add_argument('-c', action='store', help="Path to file with metadata corrections", default=None, type=str)
     parser.add_argument('-k', action='store', help="How many most frequent?", default=5000, type=int)
     parser.add_argument('--absolute_freqs', action='store_true', help="switch to get absolute instead of relative freqs", default=False)
     parser.add_argument('--z_scores', action='store_true', help="Use z-scores?", default=False) # TODO: remove this as already covered in model training?
@@ -44,32 +43,19 @@
                         help="if true, same as keep_punct, plus no Unidecode, and numbers are kept as well (default is False)",
                         default=False)
     parser.add_argument('--identify_lang', action='store_true',
-                        help="if true, should the language of each text be guessed, using a fasttext model (default is False) -- Necessitates downloading the model",
+                        help="if true, should the language of each text be guessed, using langdetect (default is False)",
                         default=False)
     args = parser.parse_args()
 
-    if args.identify_lang:
-        model = fasttext.load_model("superstyl/preproc/models/lid.176.bin")
-    else:
-        model=None
-
     print(".......loading texts.......")
 
-    if args.c:
-        # "debug_authors.csv"
-        correct_aut = pandas.read_csv(args.c)
-        # a bit hacky. Improve later
-        correct_aut.index = list(correct_aut.loc[:, "Original"])
-        myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, correct_aut=correct_aut, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+    if args.sampling:
+        myTexts = tuy.docs_to_samples(args.s, identify_lang=args.identify_lang, size=args.sample_size, step=args.sample_step,
+                                  units=args.sample_units, feature="tokens", format=args.x,
+                                      keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)
 
     else:
-        if args.sampling:
-            myTexts = tuy.docs_to_samples(args.s, identify_lang=model, size=args.sample_size, step=args.sample_step,
-                                      units=args.sample_units, feature="tokens", format=args.x,
-                                          keep_punct=args.keep_punct, keep_sym=args.keep_sym, max_samples=args.max_samples)
-
-        else:
-            myTexts = tuy.load_texts(args.s, identify_lang=model, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
+        myTexts = tuy.load_texts(args.s, identify_lang=args.identify_lang, format=args.x, keep_punct=args.keep_punct, keep_sym=args.keep_sym)
 
     print(".......getting features.......")
 
diff --git a/requirements.txt b/requirements.txt
index d768ec96..22a1745f 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-fasttext==0.9.2
+langdetect==1.0.9
 joblib==1.2.0
 lxml==4.9.1
 nltk==3.6.6
diff --git a/superstyl/preproc/tuyau.py b/superstyl/preproc/tuyau.py
index 6465c4b7..fda95fde 100755
--- a/superstyl/preproc/tuyau.py
+++ b/superstyl/preproc/tuyau.py
@@ -3,12 +3,12 @@
 import unidecode
 import nltk.tokenize
 import random
+import langdetect
 
-def XML_to_text(path, correct_aut=None):
+def XML_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """
 
@@ -45,18 +45,14 @@ def XML_to_text(path, correct_aut=None):
 
         else:
             aut = auts[0]
-            if correct_aut is not None and aut in list(correct_aut.loc[:, "Original"]):
-                print("correcting " + aut + " to " + correct_aut.loc[aut, "Actual"])
-                aut = correct_aut.loc[aut, "Actual"]
 
         return aut, re.sub(r"\s+", " ", str(myxsl(my_doc)))
 
 
-def TXT_to_text(path, correct_aut=None):
+def TXT_to_text(path):
     """
     Get main text from xml file
     :param path: path to the file to transform
-    :param correct_aut: optional data frame of metadata correction (authors)
     :return: a tuple with auts, and string (the text).
     """
 
@@ -70,15 +66,14 @@ def TXT_to_text(path, correct_aut=None):
     return aut, re.sub(r"\s+", " ", str(' '.join(txt)))
 
 
-def identify_lang(string, model):
+def detect_lang(string):
     """
     Get the language from a string
     :param string: a string, duh
-    :param model, the fasttext model
     :return: the language
     """
 
-    return model.predict(string)  # , k = 3)
+    return langdetect.detect(string)
 
 
 def normalise(text, keep_punct=False, keep_sym=False):
@@ -98,14 +93,13 @@ def normalise(text, keep_punct=False, keep_sym=False):
     return out
 
 
-def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_punct=False, keep_sym=False):
+def load_texts(paths, identify_lang=False, format="txt", keep_punct=False, keep_sym=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing.
     TODO: a proper class
     :param paths: path to docs
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not to try to identify the language of each text (default: False)
     :param format: format of the source files (implemented values: txt [default], xml)
-    :param correct_aut: optional data frame of metadata correction (authors)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param keep_sym: whether or not to keep punctuation, caps, letter variants and numbers (no unidecode).
     :return: a myTexts object
@@ -118,14 +112,13 @@ def load_texts(paths, identify_lang=None, format="txt", correct_aut=None, keep_p
         name = path.split('/')[-1]
 
         if format=='xml':
-            aut, text = XML_to_text(path, correct_aut=correct_aut)
+            aut, text = XML_to_text(path)
 
         else:
-            aut, text = TXT_to_text(path)  # implement correct_aut
+            aut, text = TXT_to_text(path)
 
-        if identify_lang is not None:
-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
+        if identify_lang:
+            lang = detect_lang(text)
         else:
             lang = "NA"
 
@@ -215,7 +208,7 @@ def get_samples(path, size, step=None, units="verses", feature="tokens", format=
 
 
 def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", format="tei", keep_punct=False,
-                    keep_sym=False, max_samples=None, identify_lang=None):
+                    keep_sym=False, max_samples=None, identify_lang=False):
     """
     Loads a collection of documents into a 'myTexts' object for further processing BUT with samples !
     :param paths: path to docs
@@ -227,20 +220,20 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
     :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
     :param keep_punct: whether or not to keep punctuation and caps.
     :param max_samples: maximum number of samples per author.
-    :param identify_lang: what model to use for language guessing of the texts (default: None)
+    :param identify_lang: whether or not to try to identify the language of each text (default: False)
     """
     myTexts = []
     for path in paths:
         aut = path.split('/')[-1].split('_')[0]
-        if identify_lang is not None:
+        if identify_lang:
             if format == 'xml':
-                aut, text = XML_to_text(path, correct_aut=correct_aut)
+                aut, text = XML_to_text(path)
 
             else:
-                aut, text = TXT_to_text(path)  # implement correct_aut
+                aut, text = TXT_to_text(path)
+
+            lang = detect_lang(text)
 
-            lang, cert = identify_lang(text, identify_lang)
-            lang = lang[0].replace("__label__", "")
         else:
             lang = 'NA'
 

From ced09698753cd66a26f393430061d9404377ae6d Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 16:40:22 +0100
Subject: [PATCH 07/15] rename

---
 {tests => test}/test_main.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {tests => test}/test_main.py (100%)

diff --git a/tests/test_main.py b/test/test_main.py
similarity index 100%
rename from tests/test_main.py
rename to test/test_main.py

From 12a69acc4b4ac60af6f3d8924af5bca82e99f36c Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 16:45:51 +0100
Subject: [PATCH 08/15] grrrr

---
 {test => tests}/test_main.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {test => tests}/test_main.py (100%)

diff --git a/test/test_main.py b/tests/test_main.py
similarity index 100%
rename from test/test_main.py
rename to tests/test_main.py

From 8f21273e513a2fab64b95e9e669399cf7a512c16 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 16:48:43 +0100
Subject: [PATCH 09/15] fix tests

---
 .github/workflows/python-package.yml | 4 ++--
 tests/__init__.py                    | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 tests/__init__.py

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index f7e7e7e0..93d35ad0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -36,6 +36,6 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with pytest
+    - name: Test with unittest
       run: |
-        pytest
+         python -m unittest
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b

From 2133e2337c15ef33b8d7f77503d79c327b4ce09e Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:00:37 +0100
Subject: [PATCH 10/15] codecov

---
 .github/workflows/python-package.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 93d35ad0..d855b0ac 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -36,6 +36,9 @@ jobs:
         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-    - name: Test with unittest
+    - name: Generate Report
       run: |
-         python -m unittest
+          pip install coverage
+          coverage run -m unittest
+    - name: Upload Coverage to Codecov
+      uses: codecov/codecov-action@v4

From 5f84735899c5a624d4951ef4c09389249ff20bf6 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:02:01 +0100
Subject: [PATCH 11/15] codecov

---
 codecov.yml | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 codecov.yml

diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000..3b859168
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,2 @@
+ignore:
+  - "*/tests/*"
\ No newline at end of file

From c1091dc0d1abec738872e3288877102f825008b1 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:12:19 +0100
Subject: [PATCH 12/15] coverage with token

---
 .github/workflows/python-package.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index d855b0ac..78e8c298 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -42,3 +42,10 @@ jobs:
           coverage run -m unittest
     - name: Upload Coverage to Codecov
       uses: codecov/codecov-action@v4
+      with:
+        fail_ci_if_error: true # optional (default = false)
+        flags: unittests # optional
+        name: codecov-umbrella # optional
+        token: ${{ secrets.CODECOV_TOKEN }} # required
+        slug: SupervisedStylometry/SuperStyl
+        verbose: true # optional (default = false)

From 08659991c000168e14f882db24a0a30882b8975f Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:15:13 +0100
Subject: [PATCH 13/15] badge

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b617b53e..7475e119 100755
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # SUPERvised STYLometry
 
+[![codecov](https://codecov.io/github/SupervisedStylometry/SuperStyl/graph/badge.svg?token=TY5HCBOOKL)](https://codecov.io/github/SupervisedStylometry/SuperStyl)
+
 ## Installing
 
 You will need python3.9 or later, the corresponding `-dev` package, `virtualenv` and `pip`

From 5418336cd46d35462da801eba31b1297090dd581 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:24:08 +0100
Subject: [PATCH 14/15] remove dev

---
 .github/workflows/python-package.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 78e8c298..e45ea890 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -26,7 +26,6 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
-        sudo apt install python3-dev
         python -m pip install --upgrade pip
         python -m pip install flake8 pytest
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi

From 5806c45f90e417fb0d791c67f6cacd923db58738 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Camps <jbcamps@hotmail.com>
Date: Tue, 13 Feb 2024 17:41:42 +0100
Subject: [PATCH 15/15] finally quickjumped pandas a few years in the future

---
 requirements.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 22a1745f..6226c220 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,14 +2,15 @@ langdetect==1.0.9
 joblib==1.2.0
 lxml==4.9.1
 nltk==3.6.6
-numpy==1.22.0
+numpy==1.26.4
 pybind11==2.8.1
 scikit-learn==1.2.1
 scipy==1.10.0
 six==1.16.0
 tqdm==4.64.1
 unidecode==1.3.2
-pandas==1.3.4
+pandas==2.2.0
+pyarrow==15.0.0
 argparse==1.4.0
 regex==2022.10.31
 matplotlib==3.6.2