From 510572e4e856dc9588aeb785bcb62d5b3b1fbf3e Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:44:39 +0100
Subject: [PATCH 01/14] Update svm.py

---
 superstyl/svm.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 9ae2d2aa..8e54bccb 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -11,6 +11,9 @@
 import imblearn.over_sampling as over
 import imblearn.combine as comb
 import imblearn.pipeline as imbp
+from collections import Counter
+
+
 def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True, balance=False, class_weights=False, kernel="LinearSVC", final_pred=False, get_coefs=False):
@@ -100,11 +103,16 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         if balance == 'upsampling':
             estimators.append(('sampling', over.RandomOverSampler(random_state=42)))
 
-        if balance == 'SMOTE':
-            estimators.append(('sampling', over.SMOTE(random_state=42)))
+        if balance in ['SMOTE', 'SMOTETomek']:
+            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size
+            min_class_size = min(Counter(classes).values())
+            n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
+
+            if balance == 'SMOTE':
+                estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))
+            elif balance == 'SMOTETomek':
+                estimators.append(('sampling', comb.SMOTETomek(n_neighbors=n_neighbors, random_state=42)))
 
-        if balance == 'SMOTETomek':
-            estimators.append(('sampling', comb.SMOTETomek(random_state=42)))
 
     print(".......... choosing SVM ........")

From 9a025dad42380d063914b3cf23e5f46c84f70a37 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:46:05 +0100
Subject: [PATCH 02/14] Update svm.py

---
 superstyl/svm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 8e54bccb..a75fddf1 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -104,7 +104,8 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             estimators.append(('sampling', over.RandomOverSampler(random_state=42)))
 
         if balance in ['SMOTE', 'SMOTETomek']:
-            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size
+            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size:
+            # Ensures that the resampling method does not attempt to use more neighbors than available samples in the minority class, which produced the error.
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
 

From 1dd6cd8aa7c489eba31950f413adcb6bcb178c32 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:46:50 +0100
Subject: [PATCH 03/14] Update svm.py

---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index a75fddf1..5d67e033 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -61,7 +61,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
     if dim_reduc == 'pca':
         print(".......... using PCA ........")
         estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default
-        # wich is: n_components = min(n_samples, n_features)
+        # which is: n_components = min(n_samples, n_features)
 
 # if dim_reduc == 'som':
 #     print(".......... using SOM ........") # TODO: fix SOM

From 96923dc99540e3b2fe18111ef50fefa7f9ffc5f7 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:54:18 +0100
Subject: [PATCH 04/14] Update svm.py

---
 superstyl/svm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 5d67e033..57913644 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -108,7 +108,9 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             # Ensures that the resampling method does not attempt to use more neighbors than available samples in the minority class, which produced the error.
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
-
+            # In case we have to temper with the n_neighbors, we print a warning message to the user (might be written more clearly, but we want a short message, right?)
+                if n_neighbors >= min_class_size:
+                    print(f"Warning: Adjusting n_neighbors for SMOTE / SMOTETomek to {n_neighbors} due to small class size.")
             if balance == 'SMOTE':
                 estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))
             elif balance == 'SMOTETomek':

From 1d36eb4a8624d7bdaf5a0a7533146533d2fcfe9f Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:03:26 +0100
Subject: [PATCH 05/14] Update svm.py

---
 superstyl/svm.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 57913644..947f05eb 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -62,18 +62,6 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         print(".......... using PCA ........")
         estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default
         # which is: n_components = min(n_samples, n_features)
-
-# if dim_reduc == 'som':
-#     print(".......... using SOM ........") # TODO: fix SOM
-#     som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5) # initialization of 50x50 SOM
-#     # TODO: set robust defaults, and calculate number of columns automatically
-#     som.train_random(train.values, 100)
-#     # too long to compute
-#     # som.quantization_error(train)
-#     print(".......... assigning SOM coordinates to texts ........")
-#     train = som.quantization(train.values)
-#     test = som.quantization(test.values)
-
     if norms:
         # Z-scores
         print(".......... using normalisations ........")
@@ -84,10 +72,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         estimators.append(('normalizer', preproc.Normalizer()))
 
     if balance is not None:
-        # cf. machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification
-        # https://github.com/scikit-learn-contrib/imbalanced-learn
-        # Tons of option, look up the best ones
-
+
         print(".......... implementing strategy to solve imbalance in data ........")
 
         if balance == 'downsampling':

From f760f0e98a7d82d52f03a24e467eaa06277b31c3 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:14:40 +0100
Subject: [PATCH 06/14] Update svm.py

Writing coefficients to .csv file and displaying a message warning the user of it.
---
 superstyl/svm.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 947f05eb..6dc0a76d 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -242,3 +242,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
     plt.title("Coefficients for "+current_class)
     plt.savefig('coefs_' + current_class + '.png', bbox_inches='tight')
     # TODO: write them to disk as CSV files
+    # FOLLOW-UP: New code to write coefficients to disk as CSV
+    # I also give back a message notifying of the file creation and showing the file name.
+    # First: pairing feature names with their coefficients
+    coefficients_df = pd.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
+    # Sorting the dataframe by values of coefficients in descending order
+    coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
+    # Writing to CSV
+    coefficients_filename = 'coefs_' + current_class + '.csv'
+    coefficients_df.to_csv(coefficients_filename, index=False)
+    print(f"Coefficients for {current_class} written to {coefficients_filename}")

From 4c2db81aa3e2430a76441d418d01785b3ef094dd Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:33:27 +0100
Subject: [PATCH 07/14] Update features_extract.py

Comments on the function name and description.
---
 superstyl/preproc/features_extract.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 8fda4661..937f0c41 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -14,6 +14,8 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     :param relFreqs: whether to compute relative freqs
     :return: feature frequencies in text
     """
+    # Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.
+    # Same for the first sentence of the paragraph that I find confusing.
 
     if feats == "words":
         tokens = nltk.tokenize.wordpunct_tokenize(text)

From d82594a368710c3c44eae8e634f771e7642987da Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:42:28 +0100
Subject: [PATCH 08/14] Update features_select.py

Correcting typo and completing bibliographic reference.
---
 superstyl/preproc/features_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/preproc/features_select.py b/superstyl/preproc/features_select.py
index 1810e1da..3c28b345 100755
--- a/superstyl/preproc/features_select.py
+++ b/superstyl/preproc/features_select.py
@@ -3,7 +3,7 @@
 def filter_ngrams(feat_list, affixes=True, punct=True):
     """
     Filter a list of features in input to yield a selection of n-grams, according to the parameters,
-    following Sapktota et al.
+    following Sapkota et al., NAACL 2015
     feat_list: the feature list (typically, coming of main.py and loaded)
     affixes: affixes (n-grams beginning or ending by space)
     punct: n-grams containing punctuation

From 539f78b927cdd0b27dd285092272d3c77d829c3a Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:52:40 +0100
Subject: [PATCH 09/14] Update features_extract.py

Including POS in English. May have to upgrade to Spacy POS-tagger, but will work in English for now.
---
 superstyl/preproc/features_extract.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 937f0c41..b99478c5 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -9,7 +9,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     Get word counts from a text
     :param text: the source text
     :param feat_list: a list of features to be selected
-    :param feats: the type of feats (words, chars, etc.)
+    :param feats: the type of feats: words, chars, POS (supported only for English)
     :param n: the length of n-grams
     :param relFreqs: whether to compute relative freqs
     :return: feature frequencies in text
@@ -23,11 +23,26 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
-    if feats == "chars":
+    elif feats == "chars":
         tokens = list(text.replace(' ', '_'))
         if n > 1:
             tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
 
+    #Adding POS for English language with NLTK
+
+    elif feats == "pos":
+        words = nltk.tokenize.word_tokenize(text)
+        pos_tags = [pos for word, pos in nltk.pos_tag(words)]
+        if n > 1:
+            tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
+        else:
+            tokens = pos_tags
+
+    #Adding an error message in case some distracted guy like me would enter something wrong:
+    else:
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")
+
+
     counts = {}
 
     for t in tokens:
@@ -56,7 +71,7 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
     """
     :param myTexts: a 'myTexts' object, containing documents to be processed
     :param feat_list: a list of features to be selected
-    :param feats: type of feats (words, chars)
+    :param feats: type of feats (words, chars, POS)
     :param n: n-grams length
     :return: list of features, with total frequency
     """

From 8cccf8e358da6c3c5997d5f7a68752698f1f5516 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 08:13:24 +0100
Subject: [PATCH 10/14] Update features_extract.py

I am tired: I rounded up what is already an integer... Now corrected to a length, not a rounded-up value (which would be the same, but looks stupid).
---
 superstyl/preproc/features_extract.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index b99478c5..dd61b53d 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -38,6 +38,11 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         else:
             tokens = pos_tags
 
+    # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
+    #elif feats == "sentenceLength":
+    #    sentences = nltk.tokenize.sent_tokenize(text)
+    #    tokens = tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
+
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
         raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")

From 51eeaf7d943d71e23ff27817b6441f150d8dbb31 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 12:51:30 +0100
Subject: [PATCH 11/14] Update features_extract.py

Minor correction
---
 superstyl/preproc/features_extract.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index dd61b53d..842ade3b 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -19,7 +19,6 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     if feats == "words":
         tokens = nltk.tokenize.wordpunct_tokenize(text)
-
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
@@ -28,8 +27,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         if n > 1:
             tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
 
-    #Adding POS for English language with NLTK
-
+    #POS in english with NLTK - need to propose spacy later on
     elif feats == "pos":
         words = nltk.tokenize.word_tokenize(text)
         pos_tags = [pos for word, pos in nltk.pos_tag(words)]
@@ -41,7 +39,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
     #elif feats == "sentenceLength":
     #    sentences = nltk.tokenize.sent_tokenize(text)
-    #    tokens = tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
+    #    tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
         raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")

From 4650dafb6451a20418e387998335d8464dc499a0 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 13:00:57 +0100
Subject: [PATCH 12/14] Update svm.py - indentation

Minor: indentation problem solved on l.97
---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 6dc0a76d..5a079c3c 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -94,7 +94,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
             # In case we have to temper with the n_neighbors, we print a warning message to the user (might be written more clearly, but we want a short message, right?)
-                if n_neighbors >= min_class_size:
+            if n_neighbors >= min_class_size:
                     print(f"Warning: Adjusting n_neighbors for SMOTE / SMOTETomek to {n_neighbors} due to small class size.")
             if balance == 'SMOTE':
                 estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))

From 1427bcc84773f5e6a8f64ebd7c9ebab00e08f350 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 15:50:12 +0100
Subject: [PATCH 13/14] Update svm.py

---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 5a079c3c..55488047 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -245,7 +245,7 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
     # FOLLOW-UP: New code to write coefficients to disk as CSV
     # I also give back a message notifying of the file creation and showing the file name.
     # First: pairing feature names with their coefficients
-    coefficients_df = pd.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
+    coefficients_df = pandas.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
     # Sorting the dataframe by values of coefficients in descending order
     coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
     # Writing to CSV
     coefficients_filename = 'coefs_' + current_class + '.csv'
     coefficients_df.to_csv(coefficients_filename, index=False)
     print(f"Coefficients for {current_class} written to {coefficients_filename}")

From 748c68eb4b77bafd5ea85d48c8bb993831613ee6 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 16:06:48 +0100
Subject: [PATCH 14/14] Update svm.py

Validating the values of dimension reduction and sampling.
---
 superstyl/svm.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 55488047..461813c6 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -36,9 +36,15 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
     :return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
     confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
     """
-    # TODO: fix n samples in SMOTE and SMOTETomek
-    # ValueError: Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6
-    #
+    valid_dim_reduc_options = {None, 'pca'}
+    valid_balance_options = {None, 'downsampling', 'upsampling', 'Tomek', 'SMOTE', 'SMOTETomek'}
+    # Validate dimension reduction parameter
+    if dim_reduc not in valid_dim_reduc_options:
+        raise ValueError(f"Invalid dimensionality reduction option: '{dim_reduc}'. Valid options are {valid_dim_reduc_options}.")
+    # Validate 'balance' parameter
+    if balance not in valid_balance_options:
+        raise ValueError(f"Invalid balance option: '{balance}'. Valid options are {valid_balance_options}.")
+
     print(".......... Formatting data ........")
 
     # Save the classes
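The thread running through patches 01, 02, 04 and 12 is that the SMOTE-based resamplers must never ask for more neighbours than the smallest class can supply. The following self-contained sketch (not superstyl code) shows that cap in isolation. It is written against the current imblearn API, where SMOTE exposes the neighbour count as k_neighbors and SMOTETomek receives a pre-configured SMOTE through its smote= argument; the build_balanced_pipeline helper and the toy data are purely illustrative.

from collections import Counter

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC


def build_balanced_pipeline(y, balance="SMOTE", random_state=42):
    # SMOTE interpolates between a sample and its k nearest neighbours of the
    # same class, so k must stay below the size of the smallest class.
    min_class_size = min(Counter(y).values())
    k_neighbors = min(5, min_class_size - 1)  # 5 is imblearn's default
    if k_neighbors < 1:
        raise ValueError("The smallest class needs at least 2 samples for SMOTE.")
    smote = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    sampler = smote if balance == "SMOTE" else SMOTETomek(smote=smote, random_state=random_state)
    return Pipeline([("sampling", sampler), ("model", LinearSVC())])


if __name__ == "__main__":
    # Toy imbalanced data: roughly 4 minority samples, so k_neighbors drops to 3.
    X, y = make_classification(n_samples=100, weights=[0.96], random_state=0)
    pipeline = build_balanced_pipeline(y, balance="SMOTETomek")
    pipeline.fit(X, y)
    print(Counter(pipeline.predict(X)))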
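Patches 06 and 13 add a CSV export of the per-class coefficients next to the existing plot, ranking features by absolute weight so that strongly negative markers are kept alongside strongly positive ones. A minimal standalone sketch of the same export, with purely illustrative feature names and values:

import pandas


def write_coefficients_csv(coefs, feature_names, current_class):
    # Pair each feature with its coefficient, rank by |coefficient|, write to disk.
    coefficients_df = pandas.DataFrame({"Feature Name": feature_names, "Coefficient": coefs})
    coefficients_df = coefficients_df.reindex(
        coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
    filename = "coefs_" + current_class + ".csv"
    coefficients_df.to_csv(filename, index=False)
    print(f"Coefficients for {current_class} written to {filename}")
    return filename


if __name__ == "__main__":
    write_coefficients_csv([0.8, -1.2, 0.05], ["the", "of_the", "and"], "AuthorA")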
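Patches 09 to 11 add a "pos" feature type: the text is tokenised, tagged with NLTK's English POS tagger, and the tags (or tag n-grams) are counted like any other feature. The sketch below reproduces that counting logic outside superstyl; the count_pos_ngrams helper is illustrative, and it assumes the usual NLTK resources (punkt and the averaged perceptron tagger) have already been downloaded.

from collections import Counter

import nltk


def count_pos_ngrams(text, n=2, rel_freqs=True):
    # Tag the words, then count POS n-grams (joined with "_" as in the patches).
    words = nltk.tokenize.word_tokenize(text)
    pos_tags = [pos for _, pos in nltk.pos_tag(words)]
    tokens = ["_".join(t) for t in nltk.ngrams(pos_tags, n)] if n > 1 else pos_tags
    counts = Counter(tokens)
    if rel_freqs:
        total = sum(counts.values())
        return {feat: freq / total for feat, freq in counts.items()}
    return dict(counts)


if __name__ == "__main__":
    print(count_pos_ngrams("The quick brown fox jumps over the lazy dog.", n=2))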