forked from tazeek/BullyDetect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf_train.py
35 lines (26 loc) · 903 Bytes
/
tfidf_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit two TF-IDF vectorizers on the comment text of the cleaned dataset and
# pickle each fitted vectorizer to disk for later use.

# Load the dataset
print("LOADING DATASET\n\n")
df = pd.read_csv('clean_dataset.csv')

# Load the comments
print("LOADING COMMENTS\n\n")
X = df['Comment']

# Initialize the models. Mind the naming: `tfidf_stop_words` uses the default
# settings (stop words are KEPT in the vocabulary), while `tfidf` is built
# with stop_words='english' (English stop words are filtered out).
print("INITIALIZING TFIDF WITH STOP WORDS\n\n")
tfidf_stop_words = TfidfVectorizer()

print("INITIALIZING TFIDF WITHOUT STOP WORDS\n\n")
tfidf = TfidfVectorizer(stop_words='english')

# Train both on the same comments
print("TRAINING TFIDF WITH STOP WORDS\n\n")
tfidf_stop_words.fit(X)

print("TRAINING TFIDF WITHOUT STOP WORDS\n\n")
tfidf.fit(X)

# Save both models. Each file is opened in a `with` block so the handle is
# flushed and closed deterministically, even if pickling raises; passing a
# bare open(...) to pickle.dump leaves the close to garbage collection.
print("SAVING TFIDF WITH STOP WORDS\n\n")
# NOTE(review): "tfidif" looks like a typo, but the path is kept unchanged
# because code that loads this model may depend on the exact filename.
FILE = "TFIDF models/tfidif_stop.pk"
with open(FILE, "wb") as model_file:
    pickle.dump(tfidf_stop_words, model_file)

print("SAVING TFIDIF WITHOUT STOP WORDS\n\n")
FILE = "TFIDF models/tfidf_normal.pk"
with open(FILE, "wb") as model_file:
    pickle.dump(tfidf, model_file)