word2vec-sg.py
# Developers: Yue Ka Leung, Tang Yu Hin
# This script implements the word2vec Skip-Gram model
# Modified from https://www.geeksforgeeks.org/implement-your-own-word2vecskip-gram-model-in-python/
# Import the required libraries
import numpy as np
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# Compute softmax values for a set of scores x.
def softmax(x): # x is an array of scores
    e_x = np.exp(x - np.max(x)) # subtract the max for numerical stability
    return e_x / e_x.sum() # the returned array sums to 1 --> a probability distribution
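# For example (values rounded), softmax(np.array([1.0, 2.0, 3.0])) is roughly
# [0.09, 0.24, 0.67]: larger scores get exponentially larger shares of the
# probability mass, and the entries sum to 1.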
# Define the word2vec class
class word2vec(object):
    def __init__(self):
        self.N = 10 # number of neurons in the hidden layer of the neural network
        self.X_train = []
        self.y_train = []
        self.window_size = 2
        self.alpha = 0.001 # learning rate
        self.words = []
        self.word_index = {}

    def initialize(self, V, data):
        self.V = V # number of unique words in the vocabulary
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) # weights between input layer and hidden layer
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) # weights between hidden layer and output layer
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i
    def feed_forward(self, X):
        self.h = np.dot(self.W.T, X).reshape(self.N, 1) # hidden layer vector
        self.u = np.dot(self.W1.T, self.h) # output layer scores
        self.y = softmax(self.u) # output layer probabilities
        return self.y
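    # Note: for a one-hot input X with a 1 at index k, np.dot(self.W.T, X) selects
    # the k-th row of W, so the hidden layer h is the N-dimensional embedding of the
    # centre word, and u = W1.T h scores every vocabulary word as a context word.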
    def backpropagate(self, x, t):
        e = self.y - np.asarray(t).reshape(self.V, 1) # prediction error at the output layer
        dLdW1 = np.dot(self.h, e.T) # gradient of the loss with respect to W1
        X = np.array(x).reshape(self.V, 1)
        dLdW = np.dot(X, np.dot(self.W1, e).T) # gradient of the loss with respect to W
        self.W1 = self.W1 - self.alpha * dLdW1 # update W1
        self.W = self.W - self.alpha * dLdW # update W
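    # Gradient sketch: treating e as the error signal at the output scores u, the
    # chain rule through u = W1.T h and h = W.T x gives dL/dW1 = h e^T and
    # dL/dW = x (W1 e)^T, which are exactly the two updates applied above.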
    def train(self, epochs):
        for x in range(1, epochs + 1):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.y_train[j])
                # accumulate the loss for this centre word / context pair
                C = 0
                for m in range(self.V):
                    if self.y_train[j][m]:
                        self.loss += -1 * self.u[m][0]
                        C += 1
                self.loss += C * np.log(np.sum(np.exp(self.u)))
            print("epoch ", x, " loss = ", self.loss)
            self.alpha *= 1 / (1 + self.alpha * x) # decay the learning rate after each epoch
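    # The quantity printed each epoch is the summed skip-gram loss
    # L = -sum_c u_c + C * log(sum_j exp(u_j)), where the first sum runs over the
    # distinct context words of the current centre word and C is their count.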
    def predict(self, word, number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for i in range(self.V)]
            X[index] = 1 # one-hot vector for the query word
            prediction = self.feed_forward(X)
            # rank all vocabulary words by predicted probability, highest first
            ranked_indices = np.argsort(prediction.flatten())[::-1]
            top_context_words = [self.words[i] for i in ranked_indices[:number_of_predictions]]
            return top_context_words
        else:
            print("Word not found in dictionary")
# Preprocess the corpus so it can be analysed (strip punctuation, lowercase, drop stop words, etc.)
def preprocessing(corpus):
    stop_words = set(stopwords.words('english'))
    training_data = []
    sentences = corpus.split(".")
    for i in range(len(sentences)):
        sentences[i] = sentences[i].strip()
        sentence = sentences[i].split()
        # lowercase and strip punctuation first so that capitalised stop words are also removed
        x = [word.strip(string.punctuation).lower() for word in sentence]
        x = [word for word in x if word and word not in stop_words]
        training_data.append(x)
    return training_data
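# For example, preprocessing("The cat sat on the mat. It slept.") returns
# [['cat', 'sat', 'mat'], ['slept'], []]: stop words such as "the", "on" and "it"
# are dropped, and the empty trailing sentence is kept as an empty list.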
def get_train_data(filename):
    with open(filename, "r", encoding="utf-8") as fo: # context manager ensures the file is closed
        corpus = fo.read()
    return corpus
def prepare_data_for_training(sentences, w2v):
    # count word frequencies to build the vocabulary
    data = {}
    for sentence in sentences:
        for word in sentence:
            if word not in data:
                data[word] = 1
            else:
                data[word] += 1
    V = len(data)
    data = sorted(list(data.keys()))
    vocab = {}
    for i in range(len(data)):
        vocab[data[i]] = i
    # build one-hot centre-word / context-word training pairs
    for sentence in sentences:
        for i in range(len(sentence)):
            center_word = [0 for x in range(V)]
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)]
            # collect every word within window_size positions on either side of the centre word
            for j in range(i - w2v.window_size, i + w2v.window_size + 1):
                if i != j and j >= 0 and j < len(sentence):
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word)
            w2v.y_train.append(context)
    w2v.initialize(V, data)
    return w2v.X_train, w2v.y_train
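# For example, with the sentence ['cat', 'sat', 'mat'] and window_size = 2, the
# centre word 'sat' contributes a one-hot X_train row for 'sat' and a y_train row
# with 1s at the indices of 'cat' and 'mat' (indices come from the sorted vocabulary).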
# Parameters below can be adjusted for a different text file and target word
filename = "train-data-1.txt"
target_word = "food"
epochs = 1000
# End of adjustable parameters
corpus = get_train_data(filename)
training_data = preprocessing(corpus)
w2v = word2vec()
prepare_data_for_training(training_data, w2v)
w2v.train(epochs)
print(w2v.predict(target_word, 3)) # print the top 3 predicted context words for the target word
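# A minimal follow-on sketch (not part of the original script): after training, each
# row of w2v.W is the learned N-dimensional embedding of the word with the same index,
# so the vector for target_word can be inspected as below, assuming the word occurs
# in the training corpus.
if target_word in w2v.word_index:
    embedding = w2v.W[w2v.word_index[target_word]] # 10-dimensional vector for target_word
    print(embedding)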