forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_feature.py
78 lines (53 loc) · 1.81 KB
/
word_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec as w2v
from evaluation import evaluate
# Use each word as a feature
def makeFeatureVec(comment, model, num_features):
    """Encode a comment as a fixed-length vector with one scalar per word.

    Slot ``i`` of the result holds the mean of the embedding of the i-th
    whitespace-separated word of ``comment``, or ``-1.0`` when that word is
    not in ``model``. Comments with more than ``num_features`` words are
    truncated; shorter comments leave the remaining slots at ``0.0``.

    Args:
        comment: Text to encode. ``None`` or a float NaN (e.g. a missing
            value read from a CSV) is treated as a comment with no words.
        model: Word-to-vector lookup supporting ``word in model`` and
            ``model[word]``.
        num_features: Length of the returned vector, i.e. the maximum
            number of words that are encoded.

    Returns:
        A 1-D ``float32`` numpy array of length ``num_features``.
    """
    # Pre-initialize the output; untouched slots stay 0.0 (padding)
    featureVec = np.zeros((num_features,), dtype="float32")

    # A missing comment has no words to encode, so return pure padding
    # instead of failing on ``.split()``.
    if comment is None or (isinstance(comment, float) and np.isnan(comment)):
        return featureVec

    # Keep only as many words as there are slots (longer comments are cut)
    words = comment.split()[:num_features]

    for i, word in enumerate(words):
        if word in model:
            # Collapse the word's embedding to a single scalar
            featureVec[i] = np.mean(model[word])
        else:
            # -1.0 marks a word that is missing from the model
            featureVec[i] = -1.0

    return featureVec
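# Build the feature matrix for a whole collection of comments
# (original note: "one of the kaggle tests" -- presumably adapted from a Kaggle experiment)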
def commentFeatureVecs(comments, model, num_features):
    """Build the feature matrix for a collection of comments.

    Each comment is encoded with ``makeFeatureVec`` and stored as one row.

    Args:
        comments: Sized iterable of comments (must support ``len``).
        model: Word-to-vector lookup passed through to ``makeFeatureVec``.
        num_features: Number of columns, passed through to
            ``makeFeatureVec``.

    Returns:
        A 2-D ``float32`` numpy array of shape
        ``(len(comments), num_features)``.
    """
    # Preallocate the 2D result so rows can be filled in place
    reviewFeatureVecs = np.zeros((len(comments), num_features), dtype="float32")

    # enumerate supplies the row index; no hand-maintained counter needed
    for row, comment in enumerate(comments):
        reviewFeatureVecs[row] = makeFeatureVec(comment, model, num_features)

    return reviewFeatureVecs
# Clear the terminal; 'cls' only exists on Windows, other systems use 'clear'
os.system('cls' if os.name == 'nt' else 'clear')

# Load Word2Vec model here
print("LOADING WORD2VEC MODEL\n\n")

FILE = "W2V Models/w2v_reddit_unigram_300d.bin"
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than Word2Vec -- confirm against the installed gensim version.
model = w2v.load_word2vec_format(FILE, binary=True)

# Load the dataset here
df = pd.read_csv('balanced_dataset.csv')

# Separate out comments and labels
X , y = df['Comment'], df['Insult']

# Transform the data: one row per comment, one column per word position
print("TRANSFORMING DATA \n\n")

# Number of word slots per comment, passed as num_features
MAX_WORDS = 300
X = commentFeatureVecs(X, model, MAX_WORDS)

# Get this Python file's name without its extension. splitext strips only
# the trailing extension, whereas replace(".py", "") would also remove a
# ".py" occurring elsewhere in the name.
file_name = os.path.splitext(os.path.basename(__file__))[0]

# Evaluate models
evaluate(X,y, file_name)