From 510572e4e856dc9588aeb785bcb62d5b3b1fbf3e Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:44:39 +0100
Subject: [PATCH 01/14] Update svm.py

---
 superstyl/svm.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 9ae2d2aa..8e54bccb 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -11,6 +11,9 @@
 import imblearn.over_sampling as over
 import imblearn.combine as comb
 import imblearn.pipeline as imbp
+from collections import Counter
+
+
 def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True, balance=False, class_weights=False, kernel="LinearSVC", final_pred=False, get_coefs=False):
@@ -100,11 +103,16 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         if balance == 'upsampling':
             estimators.append(('sampling', over.RandomOverSampler(random_state=42)))
 
-        if balance == 'SMOTE':
-            estimators.append(('sampling', over.SMOTE(random_state=42)))
+        if balance in ['SMOTE', 'SMOTETomek']:
+            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size
+            min_class_size = min(Counter(classes).values())
+            n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
+
+            if balance == 'SMOTE':
+                estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))
+            elif balance == 'SMOTETomek':
+                estimators.append(('sampling', comb.SMOTETomek(n_neighbors=n_neighbors, random_state=42)))
 
-        if balance == 'SMOTETomek':
-            estimators.append(('sampling', comb.SMOTETomek(random_state=42)))
 
     print(".......... choosing SVM ........")

From 9a025dad42380d063914b3cf23e5f46c84f70a37 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:46:05 +0100
Subject: [PATCH 02/14] Update svm.py

---
 superstyl/svm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 8e54bccb..a75fddf1 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -104,7 +104,8 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             estimators.append(('sampling', over.RandomOverSampler(random_state=42)))
 
         if balance in ['SMOTE', 'SMOTETomek']:
-            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size
+            # Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size:
+            # Ensures that the resampling method does not attempt to use more neighbors than available samples in the minority class, which produced the error.
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
 

From 1dd6cd8aa7c489eba31950f413adcb6bcb178c32 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:46:50 +0100
Subject: [PATCH 03/14] Update svm.py

---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index a75fddf1..5d67e033 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -61,7 +61,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
     if dim_reduc == 'pca':
         print(".......... using PCA ........")
         estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default
-        # wich is: n_components = min(n_samples, n_features)
+        # which is: n_components = min(n_samples, n_features)
 
 # if dim_reduc == 'som':
 #     print(".......... using SOM ........") # TODO: fix SOM

From 96923dc99540e3b2fe18111ef50fefa7f9ffc5f7 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 06:54:18 +0100
Subject: [PATCH 04/14] Update svm.py

---
 superstyl/svm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 5d67e033..57913644 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -108,7 +108,9 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             # Ensures that the resampling method does not attempt to use more neighbors than available samples in the minority class, which produced the error.
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
-
+            # In case we have to temper with the n_neighbors, we print a warning message to the user (might be written more clearly, but we want a short message, right?)
+                if n_neighbors >= min_class_size:
+                    print(f"Warning: Adjusting n_neighbors for SMOTE / SMOTETomek to {n_neighbors} due to small class size.")
             if balance == 'SMOTE':
                 estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))
             elif balance == 'SMOTETomek':

From 1d36eb4a8624d7bdaf5a0a7533146533d2fcfe9f Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:03:26 +0100
Subject: [PATCH 05/14] Update svm.py

---
 superstyl/svm.py | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 57913644..947f05eb 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -62,18 +62,6 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         print(".......... using PCA ........")
         estimators.append(('dim_reduc', decomp.PCA()))  # chosen with default
         # which is: n_components = min(n_samples, n_features)
-
-# if dim_reduc == 'som':
-#     print(".......... using SOM ........") # TODO: fix SOM
-#     som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5) # initialization of 50x50 SOM
-#     # TODO: set robust defaults, and calculate number of columns automatically
-#     som.train_random(train.values, 100)
-#     # too long to compute
-#     # som.quantization_error(train)
-#     print(".......... assigning SOM coordinates to texts ........")
-#     train = som.quantization(train.values)
-#     test = som.quantization(test.values)
-
     if norms:
         # Z-scores
         print(".......... using normalisations ........")
@@ -84,10 +72,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
         estimators.append(('normalizer', preproc.Normalizer()))
 
     if balance is not None:
-        # cf. machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification
-        # https://github.com/scikit-learn-contrib/imbalanced-learn
-        # Tons of option, look up the best ones
-
+
         print(".......... implementing strategy to solve imbalance in data ........")
 
         if balance == 'downsampling':

From f760f0e98a7d82d52f03a24e467eaa06277b31c3 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:14:40 +0100
Subject: [PATCH 06/14] Update svm.py

Writing coefficients to .csv file and displaying a message warning the user of it.
---
 superstyl/svm.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 947f05eb..6dc0a76d 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -242,3 +242,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
     plt.title("Coefficients for "+current_class)
     plt.savefig('coefs_' + current_class + '.png', bbox_inches='tight')
     # TODO: write them to disk as CSV files
+    # FOLLOW-UP: New code to write coefficients to disk as CSV
+    # I also give back a message notifying of the file creation and showing the file name.
+    # First: pairing feature names with their coefficients
+    coefficients_df = pd.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
+    # Sorting the dataframe by values of coefficients in descending order
+    coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
+    # Writing to CSV
+    coefficients_filename = 'coefs_' + current_class + '.csv'
+    coefficients_df.to_csv(coefficients_filename, index=False)
+    print(f"Coefficients for {current_class} written to {coefficients_filename}")

From 4c2db81aa3e2430a76441d418d01785b3ef094dd Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:33:27 +0100
Subject: [PATCH 07/14] Update features_extract.py

Comments on the function name and description.
---
 superstyl/preproc/features_extract.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 8fda4661..937f0c41 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -14,6 +14,8 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     :param relFreqs: whether to compute relative freqs
     :return: feature frequencies in text
     """
+    # Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.
+    # Same for the first sentence of the paragraph that I find confusing.
 
     if feats == "words":
         tokens = nltk.tokenize.wordpunct_tokenize(text)

From d82594a368710c3c44eae8e634f771e7642987da Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:42:28 +0100
Subject: [PATCH 08/14] Update features_select.py

Correcting typo and completing bibliographic reference.
---
 superstyl/preproc/features_select.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/preproc/features_select.py b/superstyl/preproc/features_select.py
index 1810e1da..3c28b345 100755
--- a/superstyl/preproc/features_select.py
+++ b/superstyl/preproc/features_select.py
@@ -3,7 +3,7 @@
 def filter_ngrams(feat_list, affixes=True, punct=True):
     """
     Filter a list of features in input to yield a selection of n-grams, according to the parameters,
-    following Sapktota et al.
+    following Sapkota et al., NAACL 2015
     feat_list: the feature list (typically, coming of main.py and loaded)
     affixes: affixes (n-grams beginning or ending by space)
     punct: n-grams containing punctuation

From 539f78b927cdd0b27dd285092272d3c77d829c3a Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 07:52:40 +0100
Subject: [PATCH 09/14] Update features_extract.py

Including POS in English. May have to upgrade to Spacy POS-tagger, but will work in English for now.
---
 superstyl/preproc/features_extract.py | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index 937f0c41..b99478c5 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -9,7 +9,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     Get word counts from a text
     :param text: the source text
     :param feat_list: a list of features to be selected
-    :param feats: the type of feats (words, chars, etc.)
+    :param feats: the type of feats: words, chars, POS (supported only for English)
     :param n: the length of n-grams
     :param relFreqs: whether to compute relative freqs
     :return: feature frequencies in text
@@ -23,11 +23,26 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
-    if feats == "chars":
+    elif feats == "chars":
         tokens = list(text.replace(' ', '_'))
         if n > 1:
             tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
 
+    #Adding POS for English language with NLTK
+
+    elif feats == "pos":
+        words = nltk.tokenize.word_tokenize(text)
+        pos_tags = [pos for word, pos in nltk.pos_tag(words)]
+        if n > 1:
+            tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
+        else:
+            tokens = pos_tags
+
+    #Adding an error message in case some distracted guy like me would enter something wrong:
+    else:
+        raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")
+
+
     counts = {}
 
     for t in tokens:
@@ -56,7 +71,7 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
     """
     :param myTexts: a 'myTexts' object, containing documents to be processed
     :param feat_list: a list of features to be selected
-    :param feats: type of feats (words, chars)
+    :param feats: type of feats (words, chars, POS)
     :param n: n-grams length
     :return: list of features, with total frequency
     """

From 8cccf8e358da6c3c5997d5f7a68752698f1f5516 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 08:13:24 +0100
Subject: [PATCH 10/14] Update features_extract.py

I am tired: I rounded up what is already an integer... Now corrected to a length, not a rounded-up value (which would be the same, but looks stupid).
---
 superstyl/preproc/features_extract.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index b99478c5..dd61b53d 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -38,6 +38,11 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         else:
             tokens = pos_tags
 
+    # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
+    #elif feats == "sentenceLength":
+    #    sentences = nltk.tokenize.sent_tokenize(text)
+    #    tokens = tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
+
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
         raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")

From 51eeaf7d943d71e23ff27817b6441f150d8dbb31 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 12:51:30 +0100
Subject: [PATCH 11/14] Update features_extract.py

Minor correction
---
 superstyl/preproc/features_extract.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/superstyl/preproc/features_extract.py b/superstyl/preproc/features_extract.py
index dd61b53d..842ade3b 100755
--- a/superstyl/preproc/features_extract.py
+++ b/superstyl/preproc/features_extract.py
@@ -19,7 +19,6 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     if feats == "words":
         tokens = nltk.tokenize.wordpunct_tokenize(text)
-
         if n > 1:
             tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]
 
@@ -28,8 +27,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
         if n > 1:
             tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]
 
-    #Adding POS for English language with NLTK
-
+    #POS in english with NLTK - need to propose spacy later on
     elif feats == "pos":
         words = nltk.tokenize.word_tokenize(text)
         pos_tags = [pos for word, pos in nltk.pos_tag(words)]
@@ -41,7 +39,7 @@ def count_words(text, feat_list=None, feats = "words", n = 1, relFreqs = False):
     # Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
     #elif feats == "sentenceLength":
     #    sentences = nltk.tokenize.sent_tokenize(text)
-    #    tokens = tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
+    #    tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]
     #Adding an error message in case some distracted guy like me would enter something wrong:
     else:
         raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")

From 4650dafb6451a20418e387998335d8464dc499a0 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 13:00:57 +0100
Subject: [PATCH 12/14] Update svm.py - indentation

Minor: indentation problem solved on l.97
---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 6dc0a76d..5a079c3c 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -94,7 +94,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
             min_class_size = min(Counter(classes).values())
             n_neighbors = min(5, min_class_size - 1)  # Default n_neighbors in SMOTE is 5
             # In case we have to temper with the n_neighbors, we print a warning message to the user (might be written more clearly, but we want a short message, right?)
-                if n_neighbors >= min_class_size:
+            if n_neighbors >= min_class_size:
                     print(f"Warning: Adjusting n_neighbors for SMOTE / SMOTETomek to {n_neighbors} due to small class size.")
             if balance == 'SMOTE':
                 estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))

From 1427bcc84773f5e6a8f64ebd7c9ebab00e08f350 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 15:50:12 +0100
Subject: [PATCH 13/14] Update svm.py

---
 superstyl/svm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 5a079c3c..55488047 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -245,7 +245,7 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
     # FOLLOW-UP: New code to write coefficients to disk as CSV
     # I also give back a message notifying of the file creation and showing the file name.
     # First: pairing feature names with their coefficients
-    coefficients_df = pd.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
+    coefficients_df = pandas.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
     # Sorting the dataframe by values of coefficients in descending order
     coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
     # Writing to CSV
     coefficients_filename = 'coefs_' + current_class + '.csv'
     coefficients_df.to_csv(coefficients_filename, index=False)
     print(f"Coefficients for {current_class} written to {coefficients_filename}")

From 748c68eb4b77bafd5ea85d48c8bb993831613ee6 Mon Sep 17 00:00:00 2001
From: Florian Cafiero
Date: Sat, 17 Feb 2024 16:06:48 +0100
Subject: [PATCH 14/14] Update svm.py

Validating the values of dimension reduction and sampling.
---
 superstyl/svm.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/superstyl/svm.py b/superstyl/svm.py
index 55488047..461813c6 100755
--- a/superstyl/svm.py
+++ b/superstyl/svm.py
@@ -36,9 +36,15 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
     :return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
     confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
     """
-    # TODO: fix n samples in SMOTE and SMOTETomek
-    # ValueError: Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6
-    #
+    valid_dim_reduc_options = {None, 'pca'}
+    valid_balance_options = {None, 'downsampling', 'upsampling', 'Tomek', 'SMOTE', 'SMOTETomek'}
+    # Validate dimension reduction parameter
+    if dim_reduc not in valid_dim_reduc_options:
+        raise ValueError(f"Invalid dimensionality reduction option: '{dim_reduc}'. Valid options are {valid_dim_reduc_options}.")
+    # Validate 'balance' parameter
+    if balance not in valid_balance_options:
+        raise ValueError(f"Invalid balance option: '{balance}'. Valid options are {valid_balance_options}.")
+
     print(".......... Formatting data ........")
 
     # Save the classes
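The thread running through patches 01, 02, 04 and 12 is that the SMOTE-based resamplers must never ask for more neighbours than the smallest class can supply. The following self-contained sketch (not superstyl code) shows that cap in isolation. It is written against the current imblearn API, where SMOTE exposes the neighbour count as k_neighbors and SMOTETomek receives a pre-configured SMOTE through its smote= argument; the build_balanced_pipeline helper and the toy data are purely illustrative.

from collections import Counter

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC


def build_balanced_pipeline(y, balance="SMOTE", random_state=42):
    # SMOTE interpolates between a sample and its k nearest neighbours of the
    # same class, so k must stay below the size of the smallest class.
    min_class_size = min(Counter(y).values())
    k_neighbors = min(5, min_class_size - 1)  # 5 is imblearn's default
    if k_neighbors < 1:
        raise ValueError("The smallest class needs at least 2 samples for SMOTE.")
    smote = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    sampler = smote if balance == "SMOTE" else SMOTETomek(smote=smote, random_state=random_state)
    return Pipeline([("sampling", sampler), ("model", LinearSVC())])


if __name__ == "__main__":
    # Toy imbalanced data: roughly 4 minority samples, so k_neighbors drops to 3.
    X, y = make_classification(n_samples=100, weights=[0.96], random_state=0)
    pipeline = build_balanced_pipeline(y, balance="SMOTETomek")
    pipeline.fit(X, y)
    print(Counter(pipeline.predict(X)))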
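Patches 06 and 13 add a CSV export of the per-class coefficients next to the existing plot, ranking features by absolute weight so that strongly negative markers are kept alongside strongly positive ones. A minimal standalone sketch of the same export, with purely illustrative feature names and values:

import pandas


def write_coefficients_csv(coefs, feature_names, current_class):
    # Pair each feature with its coefficient, rank by |coefficient|, write to disk.
    coefficients_df = pandas.DataFrame({"Feature Name": feature_names, "Coefficient": coefs})
    coefficients_df = coefficients_df.reindex(
        coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
    filename = "coefs_" + current_class + ".csv"
    coefficients_df.to_csv(filename, index=False)
    print(f"Coefficients for {current_class} written to {filename}")
    return filename


if __name__ == "__main__":
    write_coefficients_csv([0.8, -1.2, 0.05], ["the", "of_the", "and"], "AuthorA")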
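Patches 09 to 11 add a "pos" feature type: the text is tokenised, tagged with NLTK's English POS tagger, and the tags (or tag n-grams) are counted like any other feature. The sketch below reproduces that counting logic outside superstyl; the count_pos_ngrams helper is illustrative, and it assumes the usual NLTK resources (punkt and the averaged perceptron tagger) have already been downloaded.

from collections import Counter

import nltk


def count_pos_ngrams(text, n=2, rel_freqs=True):
    # Tag the words, then count POS n-grams (joined with "_" as in the patches).
    words = nltk.tokenize.word_tokenize(text)
    pos_tags = [pos for _, pos in nltk.pos_tag(words)]
    tokens = ["_".join(t) for t in nltk.ngrams(pos_tags, n)] if n > 1 else pos_tags
    counts = Counter(tokens)
    if rel_freqs:
        total = sum(counts.values())
        return {feat: freq / total for feat, freq in counts.items()}
    return dict(counts)


if __name__ == "__main__":
    print(count_pos_ngrams("The quick brown fox jumps over the lazy dog.", n=2))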