forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvector_preprocess.py
119 lines (83 loc) · 3.06 KB
/
vector_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import numpy as np
import pickle
import os
from gensim.models import Word2Vec as w2v
# Mean neighbour similarity recorded by every wordsAverage call that found
# at least one similar word; summarised by the script once all words are done.
mean_cos_array = []


# Dr. Soon's idea
def wordsAverage(word, model, top_n=5):
    """Return the average vector of the neighbours most similar to *word*.

    The ``top_n`` most similar words are requested from ``model``. Their mean
    similarity score is appended to the module-level ``mean_cos_array`` and
    the vectors of the neighbours scoring strictly above that mean are
    averaged.

    Args:
        word: Query word passed to ``model.most_similar``.
        model: Object providing ``most_similar(word, topn=...)``, returning
            ``(word, score)`` pairs, and ``model[word]`` vector lookup.
        top_n: Number of similar words to request. Defaults to 5.

    Returns:
        The averaged neighbour vector as a numpy array with the same shape
        as the model's vectors. If no neighbour scores strictly above the
        mean (all scores tie, or only one neighbour was returned), every
        returned neighbour is averaged instead of dividing by zero. If the
        model returns no neighbours at all, a zero vector is returned and
        nothing is appended to ``mean_cos_array``.
    """
    # Get words that are similar. This returns tuples in a list
    similar_words = model.most_similar(word, topn=top_n)

    # Nothing to average: return zeros rather than a NaN vector, and keep the
    # NaN mean of an empty list out of the global statistics
    if not similar_words:
        return np.zeros_like(model[word], dtype="float32")

    # Calculate the mean cosine similarity among the neighbours and record it
    mean_cos_distance = np.mean([score for _, score in similar_words])
    mean_cos_array.append(mean_cos_distance)

    # Keep only the neighbours scoring above the mean
    words_above_mean = [
        neighbour for neighbour, score in similar_words
        if score > mean_cos_distance
    ]

    # With tied scores (or a single neighbour) nothing is strictly above the
    # mean; fall back to all neighbours so the division below is never by zero
    if not words_above_mean:
        words_above_mean = [neighbour for neighbour, _ in similar_words]

    # Size the accumulator from the model's own vectors instead of assuming
    # 300 dimensions
    avgWordsFeature = np.zeros_like(model[words_above_mean[0]], dtype="float32")

    # Sum the selected neighbours' vectors
    for neighbour in words_above_mean:
        avgWordsFeature = np.add(avgWordsFeature, model[neighbour])

    # Average them out
    return np.divide(avgWordsFeature, float(len(words_above_mean)))
# Function to transform the unique words
def createVectorDictionary(unique_words, model):
    """Map every word known to ``model`` to its averaged neighbour vector.

    Words for which ``word in model`` is false are left out of the result.
    A progress line is printed for every 100th word, starting with the first.

    Args:
        unique_words: Sized sequence of words to transform.
        model: Word-vector model, forwarded to ``wordsAverage``.

    Returns:
        A dict from word to the vector returned by ``wordsAverage``.
    """
    averaged_vectors = {}

    for position, token in enumerate(unique_words):

        # Progress report
        if position % 100 == 0:
            print("%d out of %d preprocessed" % (position, len(unique_words)))

        # Words the model has never seen cannot be transformed
        if token not in model:
            continue

        averaged_vectors[token] = wordsAverage(token, model)

    return averaged_vectors
# Function to get the unique words
def getUniqueWords(comments):
    """Return the distinct whitespace-separated words found in *comments*.

    Args:
        comments: Iterable of comment strings. Entries that are not strings
            (for example the NaN pandas uses for empty cells) are skipped.

    Returns:
        A list of unique words in order of first appearance.
    """
    unique_words = []

    # Membership is tested against a set so each lookup is O(1); the list
    # alone would make the whole pass quadratic in the vocabulary size
    seen = set()

    # Loop comment by comment
    for comment in comments:

        # Missing values have no words to contribute
        if not isinstance(comment, str):
            continue

        # Loop word by word
        for word in comment.split():

            # Record the word the first time it is met
            if word not in seen:
                seen.add(word)
                unique_words.append(word)

    return unique_words
# Clear the console ('cls' only exists on Windows, 'clear' elsewhere)
os.system('cls' if os.name == 'nt' else 'clear')

# Load Word2Vec model here
# NOTE(review): load_word2vec_format on the Word2Vec class is the legacy
# gensim entry point; newer releases expose it on KeyedVectors instead.
# Confirm against the gensim version this project pins.
print("LOADING WORD2VEC MODEL\n\n")
FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

# Load the dataset here and load comments
df = pd.read_csv('clean_dataset.csv')
X = df['Comment']

# Get unique words
print("GETTING UNIQUE WORDS LIST\n\n")
unique_words = getUniqueWords(X)

# Create the dictionary (200K vs 20K)
# After creating, save using pickle
print("CREATING WORD-TRANSFORMED DICTIONARY\n\n")
vector_dict = createVectorDictionary(unique_words, model)

#print("SAVING THE DICTIONARY")
#FILE = "Word Dictionaries/vect_dict_5.p"
#pickle.dump(vector_dict, open(FILE, "wb"))

# Get overall mean cosine similarity
if len(mean_cos_array) > 0:
    overall_cos_mean = np.mean(mean_cos_array)
    overall_cos_median = np.median(mean_cos_array)

    # np.bincount only accepts non-negative integers, so it cannot be used on
    # these float similarities. Take the most frequent value after rounding
    # to the same 4 decimals used for printing.
    rounded_means = np.round(mean_cos_array, 4)
    distinct_means, occurrences = np.unique(rounded_means, return_counts=True)
    overall_cos_mode = distinct_means[occurrences.argmax()]

    print("COSINE MEAN: ", round(overall_cos_mean, 4))
    print("COSINE MEDIAN: ", round(overall_cos_median, 4))
    print("COSINE MODE: ", round(overall_cos_mode, 4))
else:
    # No similarity was recorded, so there is nothing to summarise
    print("NO COSINE STATISTICS: no mean similarity was recorded")