Hyperstyl a.1 #49
Merged · 15 commits · Feb 20, 2024
28 changes: 23 additions & 5 deletions superstyl/preproc/features_extract.py
@@ -6,24 +6,42 @@

def count_words(text, feats = "words", n = 1):
"""
Get word counts from a text
Get feature counts from a text (words, chars or POS n-grams)
:param text: the source text
:param feats: the type of feats (words, chars, etc.)
:param feats: the type of feats: words, chars, POS (supported only for English)
:param n: the length of n-grams
:return: features absolute frequencies in text as a counter
"""
# Should this be called count_words? It counts other features as well... count_features? It's just a grep-and-replace away.

if feats == "words":
tokens = nltk.tokenize.wordpunct_tokenize(text)

if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(tokens, n))]

if feats == "chars":
elif feats == "chars":
tokens = list(text.replace(' ', '_'))
if n > 1:
tokens = ["".join(t) for t in list(nltk.ngrams(tokens, n))]

# POS in English with NLTK; spaCy support to be proposed later on
elif feats == "pos":
words = nltk.tokenize.word_tokenize(text)
pos_tags = [pos for word, pos in nltk.pos_tag(words)]
if n > 1:
tokens = ["_".join(t) for t in list(nltk.ngrams(pos_tags, n))]

else:
tokens = pos_tags


# Adding sentence length; still commented out as it is a work in progress: a raw integer won't do, a quantile would be better
#elif feats == "sentenceLength":
# sentences = nltk.tokenize.sent_tokenize(text)
# tokens = [str(len(nltk.tokenize.word_tokenize(sentence))) for sentence in sentences]

# Adding an error message in case someone distracted (like me) enters an unsupported feature type:
else:
raise ValueError("Unsupported feature type. Choose from 'words', 'chars', or 'pos'.")


counts = Counter()
counts.update(tokens)
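For orientation, a minimal usage sketch of the updated function (the sample text and the expected outputs in comments are illustrative, not from the PR; the pos branch needs NLTK's punkt and averaged_perceptron_tagger resources):

import nltk
from superstyl.preproc.features_extract import count_words

nltk.download("punkt")                        # tokeniser models, needed by word_tokenize
nltk.download("averaged_perceptron_tagger")   # English POS tagger, needed by pos_tag

text = "The cat sat on the mat."
print(count_words(text, feats="words", n=1))  # Counter({'The': 1, 'cat': 1, 'sat': 1, ...})
print(count_words(text, feats="chars", n=2))  # character bigrams, spaces rendered as '_'
print(count_words(text, feats="pos", n=2))    # POS-tag bigrams such as 'DT_NN' (English only)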

@@ -47,7 +65,7 @@
"""
:param myTexts: a 'myTexts' object, containing documents to be processed
:param feat_list: a list of features to be selected
:param feats: type of feats (words, chars)
:param feats: type of feats (words, chars, POS)
:param n: n-grams length
:return: list of features, with total frequency
"""
2 changes: 1 addition & 1 deletion superstyl/preproc/features_select.py
@@ -3,7 +3,7 @@
def filter_ngrams(feat_list, affixes=True, punct=True):
"""
Filter a list of features in input to yield a selection of n-grams, according to the parameters,
following Sapktota et al.
following Sapkota et al., NAACL 2015
feat_list: the feature list (typically produced by main.py and loaded)
affixes: affixes (n-grams beginning or ending by space)
punct: n-grams containing punctuation
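For context, a minimal sketch of the selection this implements, assuming feat_list holds (feature, frequency) pairs for char n-grams with spaces rendered as '_' (the sample n-grams are hypothetical):

from superstyl.preproc.features_select import filter_ngrams

# Hypothetical char 3-gram counts: '_th' is an affix n-gram (begins with a space),
# 'e,_' contains punctuation, 'cat' is a plain word-internal n-gram
feat_list = [("_th", 120), ("he_", 110), ("e,_", 12), ("cat", 40)]

# With affixes=True and punct=True, affix and punctuation n-grams are retained,
# following the selection of Sapkota et al., NAACL 2015
selected = filter_ngrams(feat_list, affixes=True, punct=True)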
60 changes: 36 additions & 24 deletions superstyl/svm.py
@@ -11,6 +11,9 @@
import imblearn.over_sampling as over
import imblearn.combine as comb
import imblearn.pipeline as imbp
from collections import Counter



def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True, balance=False, class_weights=False, kernel="LinearSVC",
final_pred=False, get_coefs=False):
@@ -33,9 +36,15 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
:return: returns a pipeline with a fitted svm model, and if possible prints evaluation and writes to disk:
confusion_matrix.csv, misattributions.csv and (if required) FINAL_PREDICTIONS.csv
"""
# TODO: fix n samples in SMOTE and SMOTETomek
# ValueError: Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6
#
valid_dim_reduc_options = {None, 'pca'}
valid_balance_options = {None, 'downsampling', 'upsampling', 'Tomek', 'SMOTE', 'SMOTETomek'}
# Validate dimension reduction parameter
if dim_reduc not in valid_dim_reduc_options:
raise ValueError(f"Invalid dimensionality reduction option: '{dim_reduc}'. Valid options are {valid_dim_reduc_options}.")
# Validate 'balance' parameter
if balance not in valid_balance_options:
raise ValueError(f"Invalid balance option: '{balance}'. Valid options are {valid_balance_options}.")


print(".......... Formatting data ........")
# Save the classes
@@ -58,19 +67,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
if dim_reduc == 'pca':
print(".......... using PCA ........")
estimators.append(('dim_reduc', decomp.PCA())) # chosen with default
# wich is: n_components = min(n_samples, n_features)

# if dim_reduc == 'som':
# print(".......... using SOM ........") # TODO: fix SOM
# som = minisom.MiniSom(20, 20, nfeats, sigma=0.3, learning_rate=0.5) # initialization of 50x50 SOM
# # TODO: set robust defaults, and calculate number of columns automatically
# som.train_random(train.values, 100)
# # too long to compute
# # som.quantization_error(train)
# print(".......... assigning SOM coordinates to texts ........")
# train = som.quantization(train.values)
# test = som.quantization(test.values)

# which is: n_components = min(n_samples, n_features)
if norms:
# Z-scores
print(".......... using normalisations ........")
@@ -81,10 +78,7 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
estimators.append(('normalizer', preproc.Normalizer()))
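For orientation, a minimal sketch of how a named-steps list like estimators is later assembled into a single pipeline (the LinearSVC step is an assumption, standing in for the kernel choice made further down in train_svm):

import sklearn.decomposition as decomp
import sklearn.preprocessing as preproc
import sklearn.svm as sk_svm
import imblearn.pipeline as imbp

estimators = [
    ('dim_reduc', decomp.PCA()),           # n_components defaults to min(n_samples, n_features)
    ('scaler', preproc.StandardScaler()),  # Z-scores
    ('normalizer', preproc.Normalizer()),  # unit-norm rows
    ('model', sk_svm.LinearSVC()),         # placeholder for the selected kernel
]
pipe = imbp.Pipeline(estimators)           # imblearn's Pipeline also accepts samplers as steps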

if balance is not None:
# cf. machinelearningmastery.com/combine-oversampling-and-undersampling-for-imbalanced-classification
# https://github.com/scikit-learn-contrib/imbalanced-learn
# Tons of option, look up the best ones


print(".......... implementing strategy to solve imbalance in data ........")

if balance == 'downsampling':
@@ -100,11 +94,19 @@ def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True
if balance == 'upsampling':
estimators.append(('sampling', over.RandomOverSampler(random_state=42)))

if balance == 'SMOTE':
estimators.append(('sampling', over.SMOTE(random_state=42)))
if balance in ['SMOTE', 'SMOTETomek']:
# Cap the number of SMOTE neighbours to the smallest class size:
# the resampler must not request more neighbours than there are minority samples, which is what produced the ValueError above.
# (Note that SMOTE still needs at least 2 samples in the smallest class.)
min_class_size = min(Counter(classes).values())
k_neighbors = min(5, min_class_size - 1) # imblearn's default k_neighbors for SMOTE is 5
# Warn the user whenever the default had to be lowered
if k_neighbors < 5:
print(f"Warning: adjusting k_neighbors for SMOTE / SMOTETomek to {k_neighbors} due to small class size.")
if balance == 'SMOTE':
estimators.append(('sampling', over.SMOTE(k_neighbors=k_neighbors, random_state=42)))
elif balance == 'SMOTETomek':
# SMOTETomek takes a configured SMOTE instance rather than a neighbour count
estimators.append(('sampling', comb.SMOTETomek(smote=over.SMOTE(k_neighbors=k_neighbors, random_state=42), random_state=42)))

if balance == 'SMOTETomek':
estimators.append(('sampling', comb.SMOTETomek(random_state=42)))
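A self-contained sketch of the capping logic (toy labels; imblearn's SMOTE exposes the neighbour count as k_neighbors):

from collections import Counter
import imblearn.over_sampling as over

classes = ["A"] * 50 + ["B"] * 4                 # smallest class has only 4 samples
min_class_size = min(Counter(classes).values())
k_neighbors = min(5, min_class_size - 1)         # capped to 3 here, below the default of 5
sampler = over.SMOTE(k_neighbors=k_neighbors, random_state=42)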

print(".......... choosing SVM ........")

@@ -246,3 +248,13 @@ def plot_coefficients(coefs, feature_names, current_class, top_features=10):
plt.title("Coefficients for "+current_class)
plt.savefig('coefs_' + current_class + '.png', bbox_inches='tight')
# TODO: write them to disk as CSV files
# Follow-up: write the coefficients to disk as CSV, then report the created file name.
# First: pair feature names with their coefficients
coefficients_df = pandas.DataFrame({'Feature Name': feature_names, 'Coefficient': coefs})
# Sort the dataframe by absolute coefficient value, in descending order
coefficients_df = coefficients_df.reindex(coefficients_df.Coefficient.abs().sort_values(ascending=False).index)
# Writing to CSV
coefficients_filename = 'coefs_' + current_class + '.csv'
coefficients_df.to_csv(coefficients_filename, index=False)
print(f"Coefficients for {current_class} written to {coefficients_filename}")