Merge pull request #47 from SupervisedStylometry/globalTests
Global tests
Jean-Baptiste-Camps authored Feb 20, 2024
2 parents 296fc21 + a2b9a24 commit 67a14a6
Showing 10 changed files with 313 additions and 179 deletions.
23 changes: 0 additions & 23 deletions README.md
@@ -144,29 +144,6 @@ You can cite it using the CITATION.cff file (and Github cite functionnalities),
@software{Camps_SUPERvised_STYLometry_SuperStyl_2021,author = {Camps, Jean-Baptiste},doi = {...},month = {...},title = {{SUPERvised STYLometry (SuperStyl)}},version = {...},year = {2021}}


-### FastText models
-
-## FastText
-
-If you use these models, please cite the following papers:
-
-[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
-
-@article{joulin2016bag,
-  title={Bag of Tricks for Efficient Text Classification},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1607.01759},
-  year={2016}
-}
-
-[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
-
-@article{joulin2016fasttext,
-  title={FastText.zip: Compressing text classification models},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1612.03651},
-  year={2016}
-}



6 changes: 2 additions & 4 deletions main.py
@@ -4,8 +4,6 @@
import superstyl.preproc.embedding as embed
import pandas
import json
-# from multiprocessing import Pool
-from multiprocessing.pool import ThreadPool as Pool
import tqdm
# from importlib import reload
# tuy = reload(tuy)
@@ -27,10 +25,10 @@
parser.add_argument('-s', nargs='+', help="paths to files")
parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
-parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: verses)", default="verses", type=str)
+parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: words)", default="words", type=str)
parser.add_argument('--sample_size', action='store', help="Size for sampling (default: 400)", default=400, type=int)
parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
-parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author (default is all) /!\ Only with sampling",
+parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
default=None, type=int)
parser.add_argument('--keep_punct', action='store_true', help="whether or not to keep punctuation and caps (default is False)",
default=False)
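The net effect of this hunk is that sampling now defaults to 400-word rather than verse units, and --max_samples caps the number of randomly retained samples per author/class. A minimal sketch of the windowing semantics, using a hypothetical make_samples helper (an illustration of what the options mean, not SuperStyl's own implementation):

# Hypothetical illustration of --sample_size / --sample_step (not SuperStyl code).
def make_samples(tokens, size=400, step=None):
    """Cut a token list into samples of `size` words; `step` adds overlap."""
    stride = step if step is not None else size  # no step = no overlap
    samples = [tokens[i:i + size] for i in range(0, len(tokens), stride)]
    # Trailing chunks shorter than `size` are kept here; SuperStyl's handling
    # of the final partial sample may differ.
    return samples

words = ("lorem ipsum " * 1000).split()              # 2000 mock tokens
print(len(make_samples(words)))                      # 5 non-overlapping samples
print(len(make_samples(words, size=400, step=200)))  # 10 overlapping samples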
59 changes: 0 additions & 59 deletions main_to_open-set.py

This file was deleted.

5 changes: 0 additions & 5 deletions requirements_extra.txt

This file was deleted.

59 changes: 32 additions & 27 deletions superstyl/preproc/features_extract.py
@@ -4,15 +4,13 @@
import nltk


-def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
+def count_words(text, feats = "words", n = 1):
    """
    Get word counts from a text
    :param text: the source text
-    :param feat_list: a list of features to be selected
    :param feats: the type of feats (words, chars, etc.)
    :param n: the length of n-grams
-    :param relFreqs: whether to compute relative freqs
-    :return: feature frequencies in text
+    :return: features absolute frequencies in text as a counter
    """

    if feats == "words":
@@ -26,28 +24,23 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
    if n > 1:
        tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]

-    counts = {}
+    counts = Counter()
+    counts.update(tokens)

-    for t in tokens:
-        if t not in counts.keys():
-            counts[t] = 1
-
-        else:
-            counts[t] = counts[t] + 1
+    return counts

-    if relFreqs:
-        total = sum(counts.values())
-        for t in counts.keys():
-            if counts[t] > 0:
-                counts[t] = counts[t] / total
-            else:
-                counts[t] = 0
+def relative_frequencies(wordCounts):
+    """
+    For a counter of word counts, return the relative frequencies
+    :param wordCounts: a dictionary of word counts
+    :return a counter of word relative frequencies
+    """

-    if feat_list:
-        # and keep only the ones in the feature list
-        counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+    total = sum(wordCounts.values())
+    for t in wordCounts.keys():
+        wordCounts[t] = wordCounts[t] / total

-    return counts
+    return wordCounts


def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
@@ -61,29 +54,41 @@
    my_feats = Counter()

    for text in myTexts:
-        counts = count_words(text["text"], feats=feats, n=n, relFreqs=relFreqs)
+        counts = count_words(text["text"], feats=feats, n=n)

        my_feats.update(counts)

+    if relFreqs:
+        my_feats = relative_frequencies(my_feats)
+
    # sort them
    my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

    return my_feats


-def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
+def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False):
    """
    Get counts for a collection of texts
    :param myTexts: the document collection
-    :param feat_list: a list of features to be selected
+    :param feat_list: a list of features to be selected (None for all)
    :param feats: the type of feats (words, chars, etc.)
    :param n: the length of n-grams
    :param relFreqs: whether to compute relative freqs
    :return: the collection with, for each text, a 'wordCounts' dictionary
    """

    for i in enumerate(myTexts):
-        myTexts[i[0]]["wordCounts"] = count_words(
-            myTexts[i[0]]["text"], feat_list=feat_list, feats=feats, n=n, relFreqs=relFreqs)
+        counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n)
+
+        if relFreqs:
+            counts = relative_frequencies(counts)
+
+        if feat_list:
+            # and keep only the ones in the feature list
+            counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+
+        myTexts[i[0]]["wordCounts"] = counts

    return myTexts
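After this refactor, counting, normalisation and feature filtering are three separate steps: count_words returns a raw Counter, relative_frequencies normalises it, and get_counts applies both plus the optional feat_list filter. A usage sketch (the sample text and the "name"/"aut" keys are invented for the example; it assumes superstyl and its nltk dependencies are installed):

from superstyl.preproc.features_extract import count_words, relative_frequencies, get_counts

text = "the cat sat on the mat"
counts = count_words(text, feats="words", n=1)   # Counter({'the': 2, 'cat': 1, ...})
freqs = relative_frequencies(counts)             # values now sum to 1.0

# For a whole collection, with an optional restriction to a feature list:
myTexts = [{"name": "doc1", "aut": "A", "text": text}]
myTexts = get_counts(myTexts, feat_list=["the", "cat"], feats="words", n=1, relFreqs=True)
print(myTexts[0]["wordCounts"])                  # e.g. {'the': 0.333..., 'cat': 0.166...}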
4 changes: 2 additions & 2 deletions superstyl/preproc/tuyau.py
@@ -234,9 +234,9 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
    :param units: the units to use, one of "words" or "verses"
    :param feature: type of tokens to extract (default is tokens, not lemmas or POS)
    :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
-    :param keep_punct: whether or not to keep punctuation and caps.
+    :param keep_punct: whether to keep punctuation and caps.
    :param max_samples: maximum number of samples per author/class.
-    :param identify_lang: whether or not try to identify lang (default: False)
+    :param identify_lang: whether to try to identify lang (default: False)
    :return: a myTexts object
    """
    myTexts = []
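Based on the signature and docstring above, a call to the sampler might look like the following sketch; the corpus path is invented, and the keyword names are taken from the visible docstring rather than verified against the full (truncated) signature:

from superstyl.preproc.tuyau import docs_to_samples

# 400-word samples with a 200-word step (overlap), at most 10 per author/class:
myTexts = docs_to_samples(["corpus/author1_novel.txt"], size=400, step=200,
                          units="words", format="txt", keep_punct=False,
                          max_samples=10, identify_lang=False)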