Skip to content

Commit

Permalink
update to 0.1c version
Browse files Browse the repository at this point in the history
  • Loading branch information
tmikolov committed Sep 6, 2014
1 parent 5815e5d commit 891d84c
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 93 deletions.
8 changes: 4 additions & 4 deletions demo-analogy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
echo -----------------------------------------------------------------------------------------------------
echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
echo ---------------------------------------------------------------------------------------------------
echo Note that for the word analogy to perform well, the model should be trained on much larger data set
echo Example input: paris france berlin
echo -----------------------------------------------------------------------------------------------------
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
echo ---------------------------------------------------------------------------------------------------
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./word-analogy vectors.bin
2 changes: 1 addition & 1 deletion demo-classes.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
sort classes.txt -k 2 -n > classes.sorted.txt
echo The word classes were saved to file classes.sorted.txt
19 changes: 9 additions & 10 deletions demo-phrase-accuracy.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
make
if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
if [ ! -e news.2012.en.shuffled ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz -f
fi
echo ----------------------------------------------------------------------------------------------------------------
echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
echo To achieve better accuracy, larger training set is needed
echo ----------------------------------------------------------------------------------------------------------------
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
./compute-accuracy vectors-phrase.bin <questions-phrases.txt
sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors-phrase.bin < questions-phrases.txt
15 changes: 9 additions & 6 deletions demo-phrases.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
make
if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
if [ ! -e news.2012.en.shuffled ]; then
wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
gzip -d news.2012.en.shuffled.gz -f
fi
time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance vectors-phrase.bin
sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
./distance vectors-phrase.bin
2 changes: 1 addition & 1 deletion demo-word-accuracy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./compute-accuracy vectors.bin 30000 < questions-words.txt
# to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
4 changes: 2 additions & 2 deletions demo-word.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ if [ ! -e text8 ]; then
wget http://mattmahoney.net/dc/text8.zip -O text8.gz
gzip -d text8.gz -f
fi
time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
./distance vectors.bin
time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
./distance vectors.bin
4 changes: 2 additions & 2 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
CC = gcc
#The -Ofast might not work with older versions of gcc; in that case, use -O2
CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
#Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result

all: word2vec word2phrase distance word-analogy compute-accuracy

Expand Down
154 changes: 87 additions & 67 deletions word2vec.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ struct vocab_word {
char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;
int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 0;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;

int hs = 1, negative = 0;
int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

Expand Down Expand Up @@ -337,29 +337,32 @@ void ReadVocab() {

void InitNet() {
long long a, b;
unsigned long long next_random = 1;
a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
if (hs) {
a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1[a * layer1_size + b] = 0;
}
if (negative>0) {
a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1neg[a * layer1_size + b] = 0;
}
for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size;
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
next_random = next_random * (unsigned long long)25214903917 + 11;
syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
}
CreateBinaryTree();
}

void *TrainModelThread(void *id) {
long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
long long l1, l2, c, target, label;
long long l1, l2, c, target, label, local_iter = iter;
unsigned long long next_random = (long long)id;
real f, g;
clock_t now;
Expand All @@ -374,11 +377,11 @@ void *TrainModelThread(void *id) {
if ((debug_mode > 1)) {
now=clock();
printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
word_count_actual / (real)(train_words + 1) * 100,
word_count_actual / (real)(iter * train_words + 1) * 100,
word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
fflush(stdout);
}
alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
}
if (sentence_length == 0) {
Expand All @@ -400,8 +403,16 @@ void *TrainModelThread(void *id) {
}
sentence_position = 0;
}
if (feof(fi)) break;
if (word_count > train_words / num_threads) break;
if (feof(fi) || (word_count > train_words / num_threads)) {
word_count_actual += word_count - last_word_count;
local_iter--;
if (local_iter == 0) break;
word_count = 0;
last_word_count = 0;
sentence_length = 0;
fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
continue;
}
word = sen[sentence_position];
if (word == -1) continue;
for (c = 0; c < layer1_size; c++) neu1[c] = 0;
Expand All @@ -410,58 +421,63 @@ void *TrainModelThread(void *id) {
b = next_random % window;
if (cbow) { //train the cbow architecture
// in -> hidden
cw = 0;
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
c = sentence_position - window + a;
if (c < 0) continue;
if (c >= sentence_length) continue;
last_word = sen[c];
if (last_word == -1) continue;
for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
cw++;
}
if (hs) for (d = 0; d < vocab[word].codelen; d++) {
f = 0;
l2 = vocab[word].point[d] * layer1_size;
// Propagate hidden -> output
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
if (f <= -MAX_EXP) continue;
else if (f >= MAX_EXP) continue;
else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
// 'g' is the gradient multiplied by the learning rate
g = (1 - vocab[word].code[d] - f) * alpha;
// Propagate errors output -> hidden
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
// Learn weights hidden -> output
for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
}
// NEGATIVE SAMPLING
if (negative > 0) for (d = 0; d < negative + 1; d++) {
if (d == 0) {
target = word;
label = 1;
} else {
next_random = next_random * (unsigned long long)25214903917 + 11;
target = table[(next_random >> 16) % table_size];
if (target == 0) target = next_random % (vocab_size - 1) + 1;
if (target == word) continue;
label = 0;
if (cw) {
for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
if (hs) for (d = 0; d < vocab[word].codelen; d++) {
f = 0;
l2 = vocab[word].point[d] * layer1_size;
// Propagate hidden -> output
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
if (f <= -MAX_EXP) continue;
else if (f >= MAX_EXP) continue;
else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
// 'g' is the gradient multiplied by the learning rate
g = (1 - vocab[word].code[d] - f) * alpha;
// Propagate errors output -> hidden
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
// Learn weights hidden -> output
for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
}
// NEGATIVE SAMPLING
if (negative > 0) for (d = 0; d < negative + 1; d++) {
if (d == 0) {
target = word;
label = 1;
} else {
next_random = next_random * (unsigned long long)25214903917 + 11;
target = table[(next_random >> 16) % table_size];
if (target == 0) target = next_random % (vocab_size - 1) + 1;
if (target == word) continue;
label = 0;
}
l2 = target * layer1_size;
f = 0;
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
if (f > MAX_EXP) g = (label - 1) * alpha;
else if (f < -MAX_EXP) g = (label - 0) * alpha;
else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
}
// hidden -> in
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
c = sentence_position - window + a;
if (c < 0) continue;
if (c >= sentence_length) continue;
last_word = sen[c];
if (last_word == -1) continue;
for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
}
l2 = target * layer1_size;
f = 0;
for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
if (f > MAX_EXP) g = (label - 1) * alpha;
else if (f < -MAX_EXP) g = (label - 0) * alpha;
else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
}
// hidden -> in
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
c = sentence_position - window + a;
if (c < 0) continue;
if (c >= sentence_length) continue;
last_word = sen[c];
if (last_word == -1) continue;
for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
}
} else { //train skip-gram
for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
Expand Down Expand Up @@ -611,7 +627,7 @@ int ArgPos(char *str, int argc, char **argv) {
int main(int argc, char **argv) {
int i;
if (argc == 1) {
printf("WORD VECTOR estimation toolkit v 0.1b\n\n");
printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
printf("Options:\n");
printf("Parameters for training:\n");
printf("\t-train <file>\n");
Expand All @@ -623,18 +639,20 @@ int main(int argc, char **argv) {
printf("\t-window <int>\n");
printf("\t\tSet max skip length between words; default is 5\n");
printf("\t-sample <float>\n");
printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency");
printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n");
printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
printf("\t-hs <int>\n");
printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n");
printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
printf("\t-negative <int>\n");
printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n");
printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
printf("\t-threads <int>\n");
printf("\t\tUse <int> threads (default 1)\n");
printf("\t\tUse <int> threads (default 12)\n");
printf("\t-iter <int>\n");
printf("\t\tRun more training iterations (default 5)\n");
printf("\t-min-count <int>\n");
printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
printf("\t-alpha <float>\n");
printf("\t\tSet the starting learning rate; default is 0.025\n");
printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
printf("\t-classes <int>\n");
printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
printf("\t-debug <int>\n");
Expand All @@ -646,9 +664,9 @@ int main(int argc, char **argv) {
printf("\t-read-vocab <file>\n");
printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
printf("\t-cbow <int>\n");
printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n");
printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
printf("\nExamples:\n");
printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n");
printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
return 0;
}
output_file[0] = 0;
Expand All @@ -661,13 +679,15 @@ int main(int argc, char **argv) {
if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
if (cbow) alpha = 0.05;
if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
Expand All @@ -679,4 +699,4 @@ int main(int argc, char **argv) {
}
TrainModel();
return 0;
}
}

0 comments on commit 891d84c

Please sign in to comment.