train.py (forked from tazeek/BullyDetect)

#!/usr/bin/env python3
from gensim.models import Word2Vec, Phrases
from pymongo import MongoClient

import logging
import multiprocessing
import os

# Utilize the full power of the machine (Linux-only: reset the process
# affinity mask so all CPU cores may be used by the worker threads)
os.system("taskset -p 0xff %d" % os.getpid())

class MySentences:
    """Stream sentences from MongoDB one at a time, so the corpus never has
    to fit in memory. Word2Vec makes several passes over the corpus, which
    is why this is a restartable iterable rather than a one-shot generator."""

    def __iter__(self):
        client = MongoClient()      # First: connect to MongoDB
        db = client['reddit']       # Second: connect to the database
        collection = db['full_3']   # Third: get the collection
        for fragments in collection.find():
            for sentence in fragments['sentence_list']:
                yield sentence
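
# Optional sanity check (a hedged sketch, assuming the 'reddit'/'full_3'
# collection above is populated): peek at one streamed sentence before
# committing to a full training run, then remove or re-comment it.
#
#   first_sentence = next(iter(MySentences()))
#   print(first_sentence[:10])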

word_list = MySentences()
print("SENTENCE STREAM INITIALIZED.....\n\n")  # Sentences are streamed lazily, not loaded up front

# Logging configuration: gensim reports training progress via the logging module
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Word2Vec Parameters (START)
sg = 1                                  # Use the Skip-Gram model (0 would mean CBOW)
size = 300                              # Dimensionality of the projection layer
window = 5                              # Window of surrounding words
alpha = 0.025                           # Initial learning rate of the neural network
min_count = 5                           # Minimum frequency for a word to enter the vocabulary
workers = multiprocessing.cpu_count()   # Number of worker threads
max_vocab_size = 8000000                # Maximum number of unique words kept in RAM
negative = 10                           # Number of noise words drawn for negative sampling
sample = 0.001                          # Subsampling threshold for frequent words
hs = 0                                  # Disable hierarchical softmax, so negative sampling is used
iterations = 5                          # Passes over the corpus, also called epochs ('iter' in gensim)
# Word2Vec Parameters (END)

# WORD2VEC RAM FORMULA (IN GIGABYTES):
# (Estimated Number of Unique Words x Dimension Size x 12)/1,000,000,000
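
# Worked example of the formula above (an estimate, not a measurement): with
# the 8,000,000-word vocabulary cap and 300 dimensions used here,
#   (8,000,000 x 300 x 12) / 1,000,000,000 ~= 28.8 GB
# so max_vocab_size is what keeps the worst-case memory footprint bounded.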

# Initialize bigram transformer (disabled in this run; see the sketch below)
#bigram_transformer = Phrases(word_list)
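
# A minimal sketch of how the disabled bigram step above could be wired in:
# the Phrases transformer would wrap the sentence stream before it reaches
# Word2Vec, so frequent word pairs get merged into single tokens, e.g.:
#
#   bigram_transformer = Phrases(word_list)
#   word_list = bigram_transformer[word_list]
#
# This run trains on the raw unigram stream despite the "bigram" model name.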

os.system('clear')  # Clear the terminal ('cls' is Windows-only; taskset above implies Linux)

# Initialize and train the Word2Vec model (MySentences is re-iterated once per pass)
model = Word2Vec(word_list, sg=sg, size=size, window=window, alpha=alpha,
                 min_count=min_count, workers=workers, max_vocab_size=max_vocab_size,
                 negative=negative, hs=hs, iter=iterations, sample=sample)

model_name = "w2v_reddit_bigram_300d(FINAL)"

model.init_sims(replace=True)  # Replace vectors with their L2-normalized form to trim memory
model.wv.save_word2vec_format(model_name + '.bin', binary=True)  # Trained vectors live on model.wv in gensim >= 1.0
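
# Hedged usage sketch (assumes the gensim KeyedVectors API): the saved .bin
# file can be reloaded for querying without retraining, e.g.:
#
#   from gensim.models import KeyedVectors
#   w2v = KeyedVectors.load_word2vec_format(model_name + '.bin', binary=True)
#   print(w2v.most_similar('school', topn=5))  # 'school' is an arbitrary probe word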