diff --git a/requirements.txt b/requirements.txt
index e4b64f1e..c1cf7d0c 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ argparse>=1.4.0
 regex>=2022.10.31
 matplotlib>=3.6.2
 imbalanced-learn>=0.8.1
+gensim>=4.3.2
\ No newline at end of file
diff --git a/superstyl/preproc/embedding.py b/superstyl/preproc/embedding.py
index ef5d22d9..6d0d8404 100644
--- a/superstyl/preproc/embedding.py
+++ b/superstyl/preproc/embedding.py
@@ -1,41 +1,29 @@
 import numpy as np
 from scipy import spatial
+import gensim.models
 
-def load_glove_embeddings(path):
+def load_embeddings(path):
     """
-    Load GloVe embeddings from a txt file and return a dictionary mapping words to vectors.
+    Load word2vec embeddings from a txt file and return a model mapping words to vectors.
     :param path: the path to the embeddings txt file
     :return a dictionary of words and vectors
     """
-    # path = "glove.6B.100d.txt"
-    embeddings_dict = {}
-    with open(path, 'r', encoding='utf-8') as f:
-        for line in f:
-            values = line.split()
-            word = values[0]
-            vector = np.asarray(values[1:], "float32")
-            embeddings_dict[word] = vector
-    return embeddings_dict
+    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=False)
+    return model
 
 
-def find_similar_words(embeddings_dict, word, topn=10):
+def find_similar_words(model, word, topn=10):
     """
     Find and return the most similar words to a given word based on cosine similarity.
-    :param embeddings_dict: the dictionnary of embeddings
+    :param model: the embedding model
     :param word: the word for which closest are retrieved
     :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
-    :return a list of the topn closest words (including original word itself)
+    :return a list of the topn closest words
     """
-    if word not in embeddings_dict:
+    if word not in model:
         return None
     else:
-        similarities = {}
-        target_embedding = embeddings_dict[word]
-        for other_word, other_embedding in embeddings_dict.items():
-            similarity = 1 - spatial.distance.cosine(target_embedding, other_embedding)
-            similarities[other_word] = similarity
-        sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
-        return [s[0] for s in sorted_similarities[0:topn]]
+        return [s[0] for s in model.most_similar(word, topn=topn)]
 
 # For tests
 # myTexts = [{'name': 'Letter1', 'aut': 'Smith', 'text': 'This is the text', 'lang': 'en', 'wordCounts': {'the': 1, 'this': 1}},
@@ -43,19 +31,19 @@ def find_similar_words(embeddings_dict, word, topn=10):
 #             'wordCounts': {'the': 1, 'also': 1}}]
 # feat_list = ['the']
 # feat = "the"
-def get_embedded_counts(myTexts, feat_list, embeddings_dict, topn=10):
+def get_embedded_counts(myTexts, feat_list, model, topn=10):
     """
     Replace absolute frequencies by frequencies relative to a given semantic neighbouring
     (i.e., some sort of relative frequency among 'paronyms'), using a Glove embedding (cf. Eder, 2022).
     :param myTexts: the document collection
     :param feat_list: a list of features to be selected
-    :param embeddings_dict: the dictionnary of embeddings
+    :param model: the embeddings model
     :param topn: the n closest (as per cosine similarity) words on which to compute relative frequency
     :return: the myTexts collection with, for each text, a 'wordCounts' dictionary with said semantic relative frequencies
     """
 
     for feat in feat_list:
-        similars = find_similar_words(embeddings_dict, feat, topn=topn)
+        similars = find_similar_words(model, feat, topn=topn)
         if similars is None:
             # IN THAT CASE, we do not include it in the embedded freqs
             continue
@@ -69,10 +57,9 @@ def get_embedded_counts(myTexts, feat_list, embeddings_dict, topn=10):
 
                 # then, initialise
                 myTexts[i[0]]["embedded"] = {}
 
-            total = sum([myTexts[i[0]]["wordCounts"][s] for s in similars if s in myTexts[i[0]]["wordCounts"].keys()])
+            total = sum([myTexts[i[0]]["wordCounts"][s] for s in [feat]+similars if s in myTexts[i[0]]["wordCounts"].keys()])
 
             myTexts[i[0]]["embedded"][feat] = myTexts[i[0]]["wordCounts"][feat] / total
 
-
     return myTexts
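
A minimal usage sketch of the two reworked helpers, assuming an embeddings file in word2vec text format; the path and query word below are illustrative only:

import gensim.models

# Load a text-format word2vec file (hypothetical path)
model = gensim.models.KeyedVectors.load_word2vec_format("embeddings.txt", binary=False)

# KeyedVectors supports membership tests, which find_similar_words() relies on
if "the" in model:
    # most_similar() returns (word, cosine similarity) pairs, most similar first
    print([w for w, _ in model.most_similar("the", topn=10)])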
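
Note the matching change in get_embedded_counts: the old find_similar_words() docstring said the neighbour list included the original word itself (its own cosine similarity being 1), whereas gensim's most_similar() excludes the query word, so the denominator is now summed over [feat]+similars. A worked example on the toy counts from the test comments above, with the neighbour list stubbed in (hypothetical values):

# Stub standing in for find_similar_words(model, 'the', topn=10)
wordCounts = {'the': 1, 'this': 1}   # Letter1 from the test comments
feat = 'the'
similars = ['this', 'a']             # most_similar() would not return 'the' itself

# [feat]+similars puts the target word back into the denominator
total = sum(wordCounts[s] for s in [feat] + similars if s in wordCounts)
print(wordCounts[feat] / total)      # 1 / (1 + 1) = 0.5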