forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_avg_hybrid.py
84 lines (58 loc) · 2.01 KB
/
word_avg_hybrid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
import numpy as np
import os
import pickle
from gensim.models import Word2Vec as w2v
from evaluation import evaluate
# Use each word as a feature
def makeFeatureVec(comment, model, vect_dict, num_features):
    """Encode a comment as a fixed-length vector with one slot per word.

    Slot ``i`` describes the ``i``-th whitespace-separated token of
    ``comment``: the mean (``np.mean``) of ``vect_dict[token]`` when the
    token is known, or ``-1.0`` when it is not. Tokens past ``num_features``
    are dropped, and slots beyond the end of a shorter comment stay ``0.0``.

    Args:
        comment: Text to encode; it is split with ``comment.split()``.
        model: Container supporting ``word in model``; it is only used for
            membership tests here.
        vect_dict: Mapping from a word to the values that are averaged into
            that word's single feature.
        num_features: Length of the returned vector, i.e. the maximum number
            of words that are encoded.

    Returns:
        A 1-D ``float32`` NumPy array of length ``num_features``.
    """
    # Pre-initialize the output; untouched slots keep the value 0.0
    featureVec = np.zeros((num_features,), dtype="float32")

    # zip() stops at the shorter input, so at most num_features words are
    # written and over-long comments are truncated without an index check.
    for slot, word in zip(range(num_features), comment.split()):
        # A word must be present in BOTH lookups. Testing only the model
        # raised KeyError whenever the dictionary lacked one of its words;
        # such words now receive the same "not found" marker instead.
        if word in model and word in vect_dict:
            featureVec[slot] = np.mean(vect_dict[word])
        else:
            featureVec[slot] = -1.0

    return featureVec
# One of the kaggle tests
def commentFeatureVecs(comments, model, vect_dict, num_features):
    """Build the feature matrix for a collection of comments.

    Every comment is encoded with ``makeFeatureVec`` and stored as one row,
    in iteration order.

    Args:
        comments: Sized iterable of comments; both ``len(comments)`` and
            iteration over it are used.
        model: Passed through unchanged to ``makeFeatureVec``.
        vect_dict: Passed through unchanged to ``makeFeatureVec``.
        num_features: Number of columns; also passed to ``makeFeatureVec``.

    Returns:
        A ``float32`` NumPy array of shape ``(len(comments), num_features)``.
    """
    # Preallocate the 2D result so each row is written in place
    reviewFeatureVecs = np.zeros((len(comments), num_features), dtype="float32")

    # enumerate() supplies the row index, replacing a hand-maintained counter
    for row, comment in enumerate(comments):
        reviewFeatureVecs[row] = makeFeatureVec(comment, model, vect_dict, num_features)

    return reviewFeatureVecs
# Clear the console. 'cls' only exists on Windows, so use 'clear' elsewhere.
os.system('cls' if os.name == 'nt' else 'clear')

# Load Word2Vec model here
print("LOADING WORD2VEC MODEL\n\n")
FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
# NOTE(review): newer gensim releases appear to expose this loader on
# KeyedVectors rather than Word2Vec -- confirm against the installed version.
model = w2v.load_word2vec_format(FILE, binary=True)

# Load the dataset here
df = pd.read_csv('balanced_dataset.csv')

# Separate out comments and labels
X, y = df['Comment'], df['Insult']

# Load the dictionary
print("LOADING DICTIONARY\n\n")
FILE = "Word Dictionaries/vect_dict_5.p"
# NOTE: unpickling can execute arbitrary code, so this file must come from a
# trusted source. The context manager closes the handle once loading is done
# (the handle was previously left open).
with open(FILE, "rb") as dict_file:
    vect_dict = pickle.load(dict_file)

# Transform the data
print("TRANSFORMING DATASET \n\n")
MAX_WORDS = 300
X = commentFeatureVecs(X, model, vect_dict, MAX_WORDS)

# Get the Python file's name without its extension. splitext() removes only
# the trailing extension, whereas replace(".py", "") would also strip any
# ".py" occurring earlier in the name.
file_name = os.path.splitext(os.path.basename(__file__))[0]

# Evaluate models
evaluate(X, y, file_name)