Merge pull request #47 from SupervisedStylometry/globalTests
Global tests
Jean-Baptiste-Camps authored Feb 20, 2024
2 parents 296fc21 + a2b9a24 commit 67a14a6
Showing 10 changed files with 313 additions and 179 deletions.
23 changes: 0 additions & 23 deletions README.md
@@ -144,29 +144,6 @@ You can cite it using the CITATION.cff file (and Github cite functionnalities),
@software{Camps_SUPERvised_STYLometry_SuperStyl_2021,author = {Camps, Jean-Baptiste},doi = {...},month = {...},title = {{SUPERvised STYLometry (SuperStyl)}},version = {...},year = {2021}}


-### FastText models
-
-## FastText
-
-If you use these models, please cite the following papers:
-
-[1] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, Bag of Tricks for Efficient Text Classification
-
-@article{joulin2016bag,
-  title={Bag of Tricks for Efficient Text Classification},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1607.01759},
-  year={2016}
-}
-
-[2] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, FastText.zip: Compressing text classification models
-
-@article{joulin2016fasttext,
-  title={FastText.zip: Compressing text classification models},
-  author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
-  journal={arXiv preprint arXiv:1612.03651},
-  year={2016}
-}



6 changes: 2 additions & 4 deletions main.py
@@ -4,8 +4,6 @@
import superstyl.preproc.embedding as embed
import pandas
import json
-# from multiprocessing import Pool
-from multiprocessing.pool import ThreadPool as Pool
import tqdm
# from importlib import reload
# tuy = reload(tuy)
@@ -27,10 +25,10 @@
parser.add_argument('-s', nargs='+', help="paths to files")
parser.add_argument('-x', action='store', help="format (txt, xml or tei)", default="txt")
parser.add_argument('--sampling', action='store_true', help="Sample the texts?", default=False)
-parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: verses)", default="verses", type=str)
+parser.add_argument('--sample_units', action='store', help="Units of length for sampling (words, verses; default: words)", default="words", type=str)
parser.add_argument('--sample_size', action='store', help="Size for sampling (default: 400)", default=400, type=int)
parser.add_argument('--sample_step', action='store', help="Step for sampling with overlap (default is no overlap)", default=None, type=int)
-parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author (default is all) /!\ Only with sampling",
+parser.add_argument('--max_samples', action='store', help="Maximum number of (randomly selected) samples per author/class (default is all)",
default=None, type=int)
parser.add_argument('--keep_punct', action='store_true', help="whether or not to keep punctuation and caps (default is False)",
default=False)
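The net effect of this hunk is that sampling now defaults to 400-word rather than verse units, and --max_samples caps the number of randomly retained samples per author/class. A minimal sketch of the windowing semantics, using a hypothetical make_samples helper (an illustration of what the options mean, not SuperStyl's own implementation):

# Hypothetical illustration of --sample_size / --sample_step (not SuperStyl code).
def make_samples(tokens, size=400, step=None):
    """Cut a token list into samples of `size` words; `step` adds overlap."""
    stride = step if step is not None else size  # no step = no overlap
    samples = [tokens[i:i + size] for i in range(0, len(tokens), stride)]
    # Trailing chunks shorter than `size` are kept here; SuperStyl's handling
    # of the final partial sample may differ.
    return samples

words = ("lorem ipsum " * 1000).split()              # 2000 mock tokens
print(len(make_samples(words)))                      # 5 non-overlapping samples
print(len(make_samples(words, size=400, step=200)))  # 10 overlapping samples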
59 changes: 0 additions & 59 deletions main_to_open-set.py

This file was deleted.

5 changes: 0 additions & 5 deletions requirements_extra.txt

This file was deleted.

59 changes: 32 additions & 27 deletions superstyl/preproc/features_extract.py
@@ -4,15 +4,13 @@
import nltk


-def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
+def count_words(text, feats = "words", n = 1):
    """
    Get word counts from a text
    :param text: the source text
-    :param feat_list: a list of features to be selected
    :param feats: the type of feats (words, chars, etc.)
    :param n: the length of n-grams
-    :param relFreqs: whether to compute relative freqs
-    :return: feature frequencies in text
+    :return: features absolute frequencies in text as a counter
    """

    if feats == "words":
@@ -26,28 +24,23 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
    if n > 1:
        tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]

-    counts = {}
+    counts = Counter()
+    counts.update(tokens)

-    for t in tokens:
-        if t not in counts.keys():
-            counts[t] = 1
-
-        else:
-            counts[t] = counts[t] + 1
+    return counts

-    if relFreqs:
-        total = sum(counts.values())
-        for t in counts.keys():
-            if counts[t] > 0:
-                counts[t] = counts[t] / total
-            else:
-                counts[t] = 0
+def relative_frequencies(wordCounts):
+    """
+    For a counter of word counts, return the relative frequencies
+    :param wordCounts: a dictionary of word counts
+    :return a counter of word relative frequencies
+    """

-    if feat_list:
-        # and keep only the ones in the feature list
-        counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+    total = sum(wordCounts.values())
+    for t in wordCounts.keys():
+        wordCounts[t] = wordCounts[t] / total

-    return counts
+    return wordCounts


def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
@@ -61,29 +54,41 @@
    my_feats = Counter()

    for text in myTexts:
-        counts = count_words(text["text"], feats=feats, n=n, relFreqs=relFreqs)
+        counts = count_words(text["text"], feats=feats, n=n)

        my_feats.update(counts)

+    if relFreqs:
+        my_feats = relative_frequencies(my_feats)
+
    # sort them
    my_feats = [(i, my_feats[i]) for i in sorted(my_feats, key=my_feats.get, reverse=True)]

    return my_feats


-def get_counts(myTexts, feat_list, feats = "words", n = 1, relFreqs = False):
+def get_counts(myTexts, feat_list=None, feats = "words", n = 1, relFreqs = False):
    """
    Get counts for a collection of texts
    :param myTexts: the document collection
-    :param feat_list: a list of features to be selected
+    :param feat_list: a list of features to be selected (None for all)
    :param feats: the type of feats (words, chars, etc.)
    :param n: the length of n-grams
    :param relFreqs: whether to compute relative freqs
    :return: the collection with, for each text, a 'wordCounts' dictionary
    """

    for i in enumerate(myTexts):
-        myTexts[i[0]]["wordCounts"] = count_words(
-            myTexts[i[0]]["text"], feat_list=feat_list, feats=feats, n=n, relFreqs=relFreqs)
+        counts = count_words(myTexts[i[0]]["text"], feats=feats, n=n)
+
+        if relFreqs:
+            counts = relative_frequencies(counts)
+
+        if feat_list:
+            # and keep only the ones in the feature list
+            counts = {f: counts[f] for f in feat_list if f in counts.keys()}
+
+        myTexts[i[0]]["wordCounts"] = counts

    return myTexts
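After this refactor, counting, normalisation and feature filtering are three separate steps: count_words returns a raw Counter, relative_frequencies normalises it, and get_counts applies both plus the optional feat_list filter. A usage sketch (the sample text and the "name"/"aut" keys are invented for the example; it assumes superstyl and its nltk dependencies are installed):

from superstyl.preproc.features_extract import count_words, relative_frequencies, get_counts

text = "the cat sat on the mat"
counts = count_words(text, feats="words", n=1)   # Counter({'the': 2, 'cat': 1, ...})
freqs = relative_frequencies(counts)             # values now sum to 1.0

# For a whole collection, with an optional restriction to a feature list:
myTexts = [{"name": "doc1", "aut": "A", "text": text}]
myTexts = get_counts(myTexts, feat_list=["the", "cat"], feats="words", n=1, relFreqs=True)
print(myTexts[0]["wordCounts"])                  # e.g. {'the': 0.333..., 'cat': 0.166...}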
4 changes: 2 additions & 2 deletions superstyl/preproc/tuyau.py
@@ -234,9 +234,9 @@ def docs_to_samples(paths, size, step=None, units="verses", feature="tokens", fo
    :param units: the units to use, one of "words" or "verses"
    :param feature: type of tokens to extract (default is tokens, not lemmas or POS)
    :param format: type of document, one of full text, TEI or simple XML (ONLY TEI and TXT IMPLEMENTED)
-    :param keep_punct: whether or not to keep punctuation and caps.
+    :param keep_punct: whether to keep punctuation and caps.
    :param max_samples: maximum number of samples per author/class.
-    :param identify_lang: whether or not try to identify lang (default: False)
+    :param identify_lang: whether to try to identify lang (default: False)
    :return: a myTexts object
    """
    myTexts = []
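Based on the signature and docstring above, a call to the sampler might look like the following sketch; the corpus path is invented, and the keyword names are taken from the visible docstring rather than verified against the full (truncated) signature:

from superstyl.preproc.tuyau import docs_to_samples

# 400-word samples with a 200-word step (overlap), at most 10 per author/class:
myTexts = docs_to_samples(["corpus/author1_novel.txt"], size=400, step=200,
                          units="words", format="txt", keep_punct=False,
                          max_samples=10, identify_lang=False)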