forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster_freq.py
87 lines (59 loc) · 2.06 KB
/
cluster_freq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import pickle
import os
import pandas as pd
import numpy as np
from evaluation import evaluate
# Transform the data
def createBagCentroids(comment, clusters, cluster_dictionary):
    """Turn a single comment into a cluster-frequency vector.

    Each whitespace-separated token that appears in cluster_dictionary
    increments the count of its assigned cluster.  Unknown tokens map to
    index -1, i.e. the final slot, which is the extra cluster reserved
    for unseen words by the caller.

    Parameters: comment (str), clusters (int, vector length),
    cluster_dictionary (dict: word -> cluster index).
    Returns a float32 numpy vector of length `clusters`.
    """
    # Pre-allocated for speed; float32 matches the caller's matrix dtype.
    counts = np.zeros(clusters, dtype="float32")
    for token in comment.split():
        # -1 (unseen word) addresses the last slot of the vector.
        counts[cluster_dictionary.get(token, -1)] += 1
    return counts
# Read in comment by comment
def transformation(comments, cluster_dictionary):
    """Transform every comment into a bag-of-centroids row vector.

    Parameters: comments (iterable of str with a length, e.g. a pandas
    Series), cluster_dictionary (dict: word -> cluster index).
    Returns a float32 numpy matrix of shape (len(comments), clusters).
    """
    # Number of columns = highest cluster index + 1, plus one extra
    # trailing column that collects words not present in the dictionary.
    clusters = max(cluster_dictionary.values()) + 2
    # Pre-allocate the full matrix for speed.
    centroids_bag = np.zeros((len(comments), clusters), dtype="float32")
    # enumerate replaces the original hand-maintained counter variable.
    for row, comment in enumerate(comments):
        centroids_bag[row] = createBagCentroids(comment, clusters, cluster_dictionary)
    return centroids_bag
# Function to load the cluster dictionary
def loadClusterSet(FILE):
    """Load the pickled word -> cluster-index mapping from FILE.

    Uses a context manager so the file handle is closed even if
    unpickling raises (the original `pickle.load(open(...))` leaked
    the handle).

    NOTE(review): pickle.load executes arbitrary code from the file;
    only use on locally generated, trusted model files.
    """
    with open(FILE, "rb") as handle:
        return pickle.load(handle)
def _main():
    """Script driver: load the dataset, build bag-of-centroids
    features from the saved K-Means clustering, and run evaluation."""
    # Clear the console (Windows-specific 'cls'; harmless no-op elsewhere).
    os.system('cls')

    # Read the cleaned dataset from disk.
    print("LOADING DATASET \n\n")
    df = pd.read_csv('clean_dataset.csv')

    # Split raw comment text (features) from the binary insult labels.
    X, y = df['Comment'], df['Insult']

    # Word -> cluster-index mapping produced by a prior K-Means run.
    print("LOADING CLUSTER DICTIONARY \n\n")
    FILE = "K-Means Models/full_500C.pk"
    cluster_dictionary = loadClusterSet(FILE)

    # Vectorize every comment into its bag-of-centroids representation.
    print("TRANSFORMING DATA \n\n")
    X = transformation(X, cluster_dictionary)

    # Tag the evaluation run with this script's basename minus ".py".
    file_name = os.path.basename(__file__)
    file_name = file_name.replace(".py", "")

    # Hand off to the shared evaluation harness.
    evaluate(X, y, file_name)


# Executed unconditionally, matching the original flat-script behavior.
_main()