Skip to content

Commit

Permalink
Merge pull request #49 from floriancafiero/master
Browse files Browse the repository at this point in the history
Hyperstyl a.1
  • Loading branch information
Jean-Baptiste-Camps authored Feb 20, 2024
2 parents 67a14a6 + 143d386 commit 82858cd
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 30 deletions.
28 changes: 23 additions & 5 deletions superstyl/preproc/features_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,42 @@

def count_words(text, feats = "words", n = 1):
"""
Get word counts from a text
Get feature counts from a text (words, chars or POS n-grams)
:param text: the source text
:param feats: the type of feats (words, chars, etc.)
:param feats: the type of feats: words, chars, POS (supported only for English)
:param n: the length of n-grams
:return: features absolute frequencies in text as a counter
"""
# Should this be called count_words ? It counts other features as well... count_features ? It's just a grep and replace away.

if feats == "words":
tokens = nltk.tokenize.wordpunct_tokenize(text)

if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]

if feats == "chars":
elif feats == "chars":
tokens = list(text.replace(' ', '_'))
if n > 1:
tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]

#POS in english with NLTK - need to propose spacy later on
elif feats == "pos":
words = nltk.tokenize.word_tokenize(text)
pos_tags = [pos for word, pos in nltk.pos_tag(words)]
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]
else:
tokens = pos_tags

# Adding sentence length ; still commented as it is a work in progress, an integer won't do, a quantile would be better
#elif feats == "sentenceLength":
# sentences = nltk.tokenize.sent_tokenize(text)
# tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]

#Adding an error message in case some distracted guy like me would enter something wrong:
else:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")

counts = Counter()
counts.update(tokens)

Expand All @@ -47,7 +65,7 @@ def get_feature_list(myTexts, feats="words", n=1, relFreqs=True):
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars)
:param feats: type of feats (words, chars, POS)
:param n: n-grams length
:return: list of features, with total frequency
"""
Expand Down
2 changes: 1 addition & 1 deletion superstyl/preproc/features_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
def filter_ngrams(feat_list, affixes=True, punct=True):
"""
Filter a list of features in input to yield a selection of n-grams, according to the parameters,
following Sapktota et al.
following Sapkota et al., NAACL 2015
feat_list: the feature list (typically, coming of main.py and loaded)
affixes: affixes (n-grams beginning or ending by space)
punct: n-grams containing punctuation
Expand Down
60 changes: 36 additions & 24 deletions superstyl/svm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
import imblearn.over_sampling as over
import imblearn.combine as comb
import imblearn.pipeline as imbp
from collections import Counter



def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True, balance=False, class_weights=False, kernel="LinearSVC",
final_pred=False, get_coefs=False):
Expand All @@ -33,9 +36,15 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
:return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
"""
# TODO: fix n samples in SMOTE and SMOTETomek
# ValueError: Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6
#
valid_dim_reduc_options = {None, 'pca'}
valid_balance_options = {None, 'downsampling', 'upsampling', 'Tomek', 'SMOTE', 'SMOTETomek'}
# Validate dimension reduction parameter
if dim_reduc not in valid_dim_reduc_options:
raise ValueError(f"Invalid dimensionality reduction option: '{dim_reduc}'. Valid options are {valid_dim_reduc_options}.")
# Validate 'balance' parameter
if balance not in valid_balance_options:
raise ValueError(f"Invalid balance option: '{balance}'. Valid options are {valid_balance_options}.")


print(".......... Formatting data ........")
# Save the classes
Expand All @@ -58,19 +67,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
if dim_reduc == 'pca':
print(".......... using PCA ........")
estimators.append(('dim_reduc', decomp.PCA())) # chosen with default
# wich is: n_components = min(n_samples, n_features)

# if dim_reduc == 'som':
# print(".......... using SOM ........") # TODO: fix SOM
# som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5) # initialization of 50x50 SOM
# # TODO: set robust defaults, and calculate number of columns automatically
# som.train_random(train.values, 100)
# # too long to compute
# # som.quantization_error(train)
# print(".......... assigning SOM coordinates to texts ........")
# train = som.quantization(train.values)
# test = som.quantization(test.values)

# which is: n_components = min(n_samples, n_features)
if norms:
# Z-scores
print(".......... using normalisations ........")
Expand All @@ -81,10 +78,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
estimators.append(('normalizer', preproc.Normalizer()))

if balance is not None:
# cf. machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification
# https://github.com/scikit-learn-contrib/imbalanced-learn
# Tons of option, look up the best ones


print(".......... implementing strategy to solve imbalance in data ........")

if balance == 'downsampling':
Expand All @@ -100,11 +94,19 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
if balance == 'upsampling':
estimators.append(('sampling', over.RandomOverSampler(random_state=42)))

if balance == 'SMOTE':
estimators.append(('sampling', over.SMOTE(random_state=42)))
if balance in ['SMOTE', 'SMOTETomek']:
# Adjust n_neighbors for SMOTE/SMOTETomek based on smallest class size:
# Ensures that the resampling method does not attempt to use more neighbors than available samples in the minority class, which produced the error.
min_class_size = min(Counter(classes).values())
n_neighbors = min(5, min_class_size - 1) # Default n_neighbors in SMOTE is 5
# In case we have to temper with the n_neighbors, we print a warning message to the user (might be written more clearly, but we want a short message, right?)
if n_neighbors >= min_class_size:
print(f"Warning: Adjusting n_neighbors for SMOTE / SMOTETomek to {n_neighbors} due to small class size.")
if balance == 'SMOTE':
estimators.append(('sampling', over.SMOTE(n_neighbors=n_neighbors, random_state=42)))
elif balance == 'SMOTETomek':
estimators.append(('sampling', comb.SMOTETomek(n_neighbors=n_neighbors, random_state=42)))

if balance == 'SMOTETomek':
estimators.append(('sampling', comb.SMOTETomek(random_state=42)))

print(".......... choosing SVM ........")

Expand Down Expand Up @@ -246,3 +248,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
plt.title("Coefficients for "+current_class)
plt.savefig('coefs_' + current_class + '.png', bbox_inches='tight')
# TODO: write them to disk as CSV files
# FOLLOW-UP: New code to write coefficients to disk as CSV
# I also give back a message notifying of the file creation and showing the file name.
# First: pairing feature names with their coefficients
coefficients_df = pandas.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
# Sorting the dataframe by values of coefficients in descending order
coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
# Writing to CSV
coefficients_filename = 'coefs_' + current_class + '.csv'
coefficients_df.to_csv(coefficients_filename, index=False)
print(f"Coefficients for {current_class} written to {coefficients_filename}")

0 comments on commit 82858cd

Please sign in to comment.