# avg_sentence.py -- forked from tazeek/BullyDetect
import pandas as pd
import numpy as np
import os
from gensim.models import Word2Vec as w2v
from evaluation import evaluate
# Clear the terminal. 'cls' exists only on Windows; POSIX shells use 'clear'.
os.system('cls' if os.name == 'nt' else 'clear')

# Load Word2Vec model here
print("LOADING WORD2VEC MODEL \n\n")
FILE = "W2V Models/w2v_reddit_unigram_300d.bin"

# gensim >= 1.0 moved load_word2vec_format onto KeyedVectors, and calling it
# on Word2Vec no longer works there. Fall back to the legacy entry point only
# when KeyedVectors is unavailable (gensim < 1.0). Either object supports the
# `word in model` and `model[word]` lookups used later in this script.
try:
    from gensim.models import KeyedVectors
except ImportError:
    model = w2v.load_word2vec_format(FILE, binary=True)
else:
    model = KeyedVectors.load_word2vec_format(FILE, binary=True)
# Read the dataset CSV from the current working directory
print("LOADING DATASET \n\n")
df = pd.read_csv('balanced_dataset.csv')

# Inputs come from the 'Comment' column, targets from the 'Insult' column
X = df['Comment']
y = df['Insult']
# Transform the set
# Each comment is reduced to ONE scalar feature: the mean over every
# component of the embeddings of its in-vocabulary words.
print("TRANSFORMING DATA \n\n")


def _mean_embedding(sentence, vectors):
    """Return the scalar mean of the embeddings of the words in *sentence*.

    Args:
        sentence: Comment text; it is split on whitespace into words.
        vectors: Word-embedding lookup supporting ``word in vectors`` and
            ``vectors[word]``.

    Returns:
        The mean over all components of all found word vectors, or 0.0 when
        none of the words are in the vocabulary.
    """
    # Split into words first: iterating a str directly yields characters,
    # which would average single-letter embeddings instead of word embeddings.
    known = [vectors[word] for word in sentence.split() if word in vectors]

    # np.mean([]) produces NaN (plus a RuntimeWarning); use a neutral value
    # so downstream consumers never receive NaN features.
    return np.mean(known) if known else 0.0


X = np.array([_mean_embedding(sentence, model) for sentence in X])

# NOTE: With only ONE feature, reshape to a column of shape (n_samples, 1)
X = X.reshape(len(X), 1)
# Use this script's base file name, with ".py" removed, as the run label
file_name = os.path.basename(__file__).replace(".py", "")

# Pass the features, the labels and the run label to evaluate()
evaluate(X, y, file_name)