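# avg_hybrid.py -- part of a fork of tazeek/BullyDetect.
#
# Summary of the script below: each comment is turned into one averaged
# feature vector. In-vocabulary words use their precomputed vectors from
# vector_dict, out-of-vocabulary words fall back to a prefix-matched average
# over the Word2Vec vocabulary, and a comment with no usable words falls
# back to character-level averaging. The resulting feature matrix is passed
# to the shared evaluate() routine.
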
import pandas as pd
import numpy as np
import os
import pickle
from gensim.models import Word2Vec as w2v
from evaluation import evaluate

def handleMissingWord(word, model, num_features):

    # Number of leading characters used as the prefix
    n = 5
    count = 0.

    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,), dtype="float32")

    # Keep only the first n characters of the missing word
    word = word[:n]

    # Average the vectors of all vocabulary words sharing that prefix
    for model_word in model.index2word:

        if model_word.startswith(word):
            featureVec = np.add(featureVec, model[model_word])
            count += 1.

    # Divide by the number of matches (left as zeros if nothing matched)
    if count != 0:
        featureVec = np.divide(featureVec, count)

    return featureVec
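
# Illustrative example (hypothetical words): for an unseen token like
# "haterzzz", the prefix "hater" is compared against every vocabulary word,
# and the vectors of matches such as "hater" and "haters" are averaged into
# the returned vector.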

def characterVec(words, model, num_features):

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.

    for word in words:
        nwords += 1.
        featureVec = np.add(featureVec, vector_dict[word])

    featureVec = np.divide(featureVec, nwords)

    return featureVec
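
# Note on the fallback above: characterVec iterates over the comment string
# character by character and looks each character up in the module-level
# vector_dict, so it assumes single-character keys are present there.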

# One of the Kaggle tests
def makeFeatureVec(words, model, vector_dict, num_features):

    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,), dtype="float32")

    # Count number of words
    nwords = 0.

    # Loop word by word. In-vocabulary words use their dictionary vector;
    # missing words fall back to the prefix-matched average.
    for word in words.split():

        if word in model: #and word not in stop_words:
            nwords += 1.
            featureVec = np.add(featureVec, vector_dict[word])
        else:
            missingWord = handleMissingWord(word, model, num_features)
            featureVec = np.add(featureVec, missingWord)
            nwords += 1.

    # If the comment yielded no words at all, fall back to character level;
    # otherwise divide the total by the number of words to get the average
    if nwords == 0:
        featureVec = characterVec(words, model, num_features)
    else:
        featureVec = np.divide(featureVec, nwords)

    return featureVec
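
# Worked example (hypothetical input): for the comment "you are so dum",
# with "you", "are", "so" in the vocabulary and "dum" missing, the result is
# the sum of the three dictionary vectors plus the prefix-matched vector for
# "dum", divided by 4.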

# One of the Kaggle tests
def getAvgFeatureVecs(comments, model, vector_dict, num_features):

    # Initialize empty counter
    counter = 0

    # Preallocate a 2D numpy array for speed
    reviewFeatureVecs = np.zeros((len(comments), num_features), dtype="float32")

    for comment in comments:

        # Call function that gets the average vectors
        reviewFeatureVecs[counter] = makeFeatureVec(comment, model, vector_dict, num_features)

        # Increment counter
        counter += 1

    return reviewFeatureVecs
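
# The returned matrix has one row per comment and num_features columns,
# e.g. shape (len(comments), 300) for the 300-dimensional vectors used below.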

# Clear the console
os.system('cls')

# Load Word2Vec model here
print("LOADING WORD2VEC MODEL\n\n")
FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
model = w2v.load_word2vec_format(FILE, binary=True)

# Load the dataset here
df = pd.read_csv('clean_dataset.csv')

# Separate out comments and labels
X, y = df['Comment'], df['Insult']

# Load the dictionary
print("LOADING DICTIONARY\n\n")
FILE = "Word Dictionaries/vect_dict_10.p"
vector_dict = pickle.load(open(FILE, "rb"))

# Transform the data
print("TRANSFORMING DATA\n\n")
X = getAvgFeatureVecs(X, model, vector_dict, 300)

# Get the Python file's name. Remove the .py extension
file_name = os.path.basename(__file__)
file_name = file_name.replace(".py", "")

# Evaluate models
evaluate(X, y, file_name)