# cluster_trans.py -- forked from tazeek/BullyDetect (65 lines, 42 loc, 1.5 KB)
import pickle
import os
import numpy as np
from gensim.models import Word2Vec as w2v
# Clear the terminal before printing progress output. 'cls' only exists on
# Windows (os.name == 'nt'); other platforms use 'clear'.
os.system('cls' if os.name == 'nt' else 'clear')
def getAverageCluster(word_list, model, num_features=300):
    """Return the element-wise mean of the vectors of all words in a cluster.

    Args:
        word_list: Sized collection of words; each one is looked up as
            ``model[word]``.
        model: Object supporting ``model[word]`` and returning a vector of
            length ``num_features`` for each word.
        num_features: Dimensionality of the word vectors. Defaults to 300,
            the value that used to be hard-coded in this function.

    Returns:
        A NumPy array of shape ``(num_features,)`` holding the mean vector.
        For an empty ``word_list`` the all-zero vector is returned instead of
        the NaN vector that a division by zero would produce.

    Raises:
        Whatever ``model[word]`` raises for a word it does not contain
        (e.g. ``KeyError`` for a plain dict); the lookup error is not caught.
    """
    # Get number of words
    total_words = len(word_list)

    # Pre-initialize the accumulator; one slot per vector dimension
    avgWordsFeature = np.zeros((num_features,), dtype="float32")

    # Nothing to average: avoid 0 / 0, which would fill the result with NaN
    if total_words == 0:
        return avgWordsFeature

    # Sum the feature vectors word by word
    for word in word_list:
        avgWordsFeature = np.add(avgWordsFeature, model[word])

    # Divide the sum by the word count to get the mean
    return np.divide(avgWordsFeature, total_words)
# Script body: load the pickled cluster dictionary, attach the mean Word2Vec
# vector of each cluster's words under 'average_vector', and pickle the result.

# Load the Cluster dictionary
print("LOADING CLUSTER DICTIONARY \n\n")

cluster_num = 250
FILE = "Word Dictionaries/dict_" + str(cluster_num) + "C.pk"

# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager guarantees the handle is closed after reading.
with open(FILE, "rb") as cluster_file:
    array_dict_cluster = pickle.load(cluster_file)

# Load Word2Vec model
print("LOADING WORD2VEC MODEL \n\n")

FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
# NOTE(review): newer gensim releases expose this loader as
# KeyedVectors.load_word2vec_format -- confirm against the installed version.
model = w2v.load_word2vec_format(FILE, binary=True)

# Loop cluster by cluster
print("STARTING TRANSFORMATIONS \n\n")

for index, cluster in enumerate(array_dict_cluster):

    # Print update every 10 clusters
    if index % 10 == 0:
        print("%i Clusters out of %i transformed" % (index, len(array_dict_cluster)))

    # Get the word list
    words = cluster['word_list']

    # Average the word vectors of this cluster
    avg_cluster = getAverageCluster(words, model)

    # Store in new key (mutates the loaded cluster entry in place)
    cluster['average_vector'] = avg_cluster

# Save the File; closing the handle via 'with' ensures the data is flushed
FILE = "Word Dictionaries/trans_dict_" + str(cluster_num) + "C.pk"
with open(FILE, "wb") as out_file:
    pickle.dump(array_dict_cluster, out_file)