diff --git a/demo-analogy.sh b/demo-analogy.sh
index 3d1fcf7..bc2b53f 100644
--- a/demo-analogy.sh
+++ b/demo-analogy.sh
@@ -3,9 +3,9 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-echo -----------------------------------------------------------------------------------------------------
-echo Note that for the word analogy to perform well, the models should be trained on much larger data sets
+echo ---------------------------------------------------------------------------------------------------
+echo Note that for the word analogy to perform well, the model should be trained on a much larger data set
 echo Example input: paris france berlin
-echo -----------------------------------------------------------------------------------------------------
-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+echo ---------------------------------------------------------------------------------------------------
+time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 ./word-analogy vectors.bin
diff --git a/demo-classes.sh b/demo-classes.sh
index b0b9d99..a48d180 100644
--- a/demo-classes.sh
+++ b/demo-classes.sh
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-time ./word2vec -train text8 -output classes.txt -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -classes 500
+time ./word2vec -train text8 -output classes.txt -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -iter 15 -classes 500
 sort classes.txt -k 2 -n > classes.sorted.txt
 echo The word classes were saved to file classes.sorted.txt
diff --git a/demo-phrase-accuracy.sh b/demo-phrase-accuracy.sh
index eb2b392..1654069 100644
--- a/demo-phrase-accuracy.sh
+++ b/demo-phrase-accuracy.sh
@@ -1,12 +1,11 @@
 make
-if [ ! -e text8 ]; then
-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-  gzip -d text8.gz -f
+if [ ! -e news.2012.en.shuffled ]; then
+  wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
+  gzip -d news.2012.en.shuffled.gz -f
 fi
-echo ----------------------------------------------------------------------------------------------------------------
-echo Note that the accuracy and coverage of the test set questions is going to be low with this small training corpus
-echo To achieve better accuracy, larger training set is needed
-echo ----------------------------------------------------------------------------------------------------------------
-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2 -min-count 3
-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1 -min-count 3
-./compute-accuracy vectors-phrase.bin < questions-phrases.txt
+sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
+./compute-accuracy vectors-phrase.bin < questions-phrases.txt
diff --git a/demo-phrases.sh b/demo-phrases.sh
index c833b81..daf1778 100644
--- a/demo-phrases.sh
+++ b/demo-phrases.sh
@@ -1,8 +1,11 @@
 make
-if [ ! -e text8 ]; then
-  wget http://mattmahoney.net/dc/text8.zip -O text8.gz
-  gzip -d text8.gz -f
+if [ ! -e news.2012.en.shuffled ]; then
+  wget http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2012.en.shuffled.gz
+  gzip -d news.2012.en.shuffled.gz -f
 fi
-time ./word2phrase -train text8 -output text8-phrase -threshold 500 -debug 2
-time ./word2vec -train text8-phrase -output vectors-phrase.bin -cbow 0 -size 300 -window 10 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-./distance vectors-phrase.bin
\ No newline at end of file
+sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" < news.2012.en.shuffled | tr -c "A-Za-z'_ \n" " " > news.2012.en.shuffled-norm0
+time ./word2phrase -train news.2012.en.shuffled-norm0 -output news.2012.en.shuffled-norm0-phrase0 -threshold 200 -debug 2
+time ./word2phrase -train news.2012.en.shuffled-norm0-phrase0 -output news.2012.en.shuffled-norm0-phrase1 -threshold 100 -debug 2
+tr A-Z a-z < news.2012.en.shuffled-norm0-phrase1 > news.2012.en.shuffled-norm1-phrase1
+time ./word2vec -train news.2012.en.shuffled-norm1-phrase1 -output vectors-phrase.bin -cbow 1 -size 200 -window 10 -negative 25 -hs 0 -sample 1e-5 -threads 20 -binary 1 -iter 15
+./distance vectors-phrase.bin
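A note on the pipeline the two scripts above now share: word2phrase scores adjacent word pairs and joins frequent collocations with an underscore, so chaining two passes (threshold 200, then 100) allows phrases of up to four words to form. A minimal C sketch of the scoring rule, paraphrased from word2phrase.c; the function name and the counts in main are illustrative, not the tool's own:

    #include <stdio.h>

    /* Paraphrase of the bigram score word2phrase applies on each pass.
     * Pairs scoring above -threshold are merged into a single token. */
    float PhraseScore(long long pair_count, long long count_a, long long count_b,
                      long long min_count, long long train_words) {
      if (pair_count < min_count) return 0;  /* rare pairs are discarded */
      return (pair_count - min_count) / (float)count_a / (float)count_b * (float)train_words;
    }

    int main(void) {
      /* hypothetical counts: a real collocation vs. an unrelated pair */
      printf("new york: %.1f\n", PhraseScore(80000, 900000, 200000, 5, 1000000000));
      printf("the cat:  %.1f\n", PhraseScore(1200, 60000000, 800000, 5, 1000000000));
      return 0;
    }

Lowering the threshold on the second pass is what lets already-joined bigrams merge again into longer phrases.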
diff --git a/demo-word-accuracy.sh b/demo-word-accuracy.sh
index ffe828a..37c4b98 100644
--- a/demo-word-accuracy.sh
+++ b/demo-word-accuracy.sh
@@ -3,6 +3,6 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
+time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
 ./compute-accuracy vectors.bin 30000 < questions-words.txt
 # to compute accuracy with the full vocabulary, use: ./compute-accuracy vectors.bin < questions-words.txt
diff --git a/demo-word.sh b/demo-word.sh
index 0df5bd5..1ad52f3 100644
--- a/demo-word.sh
+++ b/demo-word.sh
@@ -3,5 +3,5 @@ if [ ! -e text8 ]; then
   wget http://mattmahoney.net/dc/text8.zip -O text8.gz
   gzip -d text8.gz -f
 fi
-time ./word2vec -train text8 -output vectors.bin -cbow 0 -size 200 -window 5 -negative 0 -hs 1 -sample 1e-3 -threads 12 -binary 1
-./distance vectors.bin
\ No newline at end of file
+time ./word2vec -train text8 -output vectors.bin -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 20 -binary 1 -iter 15
+./distance vectors.bin
diff --git a/makefile b/makefile
index d446b1a..24ce917 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CC = gcc
-#The -Ofast might not work with older versions of gcc; in that case, use -O2
-CFLAGS = -lm -pthread -Ofast -march=native -Wall -funroll-loops -Wno-unused-result
+#Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
+CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
 
 all: word2vec word2phrase distance word-analogy compute-accuracy
 
diff --git a/word2vec.c b/word2vec.c
index b92dbcb..6763cfa 100644
--- a/word2vec.c
+++ b/word2vec.c
@@ -37,15 +37,15 @@ struct vocab_word {
 char train_file[MAX_STRING], output_file[MAX_STRING];
 char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
 struct vocab_word *vocab;
-int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1, min_reduce = 1;
+int binary = 0, cbow = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
 int *vocab_hash;
 long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
-long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
-real alpha = 0.025, starting_alpha, sample = 0;
+long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
+real alpha = 0.025, starting_alpha, sample = 1e-3;
 real *syn0, *syn1, *syn1neg, *expTable;
 clock_t start;
 
-int hs = 1, negative = 0;
+int hs = 0, negative = 5;
 const int table_size = 1e8;
 int *table;
 
@@ -337,29 +337,32 @@ void ReadVocab() {
 
 void InitNet() {
   long long a, b;
+  unsigned long long next_random = 1;
   a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
   if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
   if (hs) {
     a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
     if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
   }
   if (negative>0) {
     a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
     if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
-    for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
+    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
   }
-  for (b = 0; b < layer1_size; b++) for (a = 0; a < vocab_size; a++)
-   syn0[a * layer1_size + b] = (rand() / (real)RAND_MAX - 0.5) / layer1_size;
+  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
+    next_random = next_random * (unsigned long long)25214903917 + 11;
+    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
+  }
   CreateBinaryTree();
 }
 
 void *TrainModelThread(void *id) {
-  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
+  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
   long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
-  long long l1, l2, c, target, label;
+  long long l1, l2, c, target, label, local_iter = iter;
   unsigned long long next_random = (long long)id;
   real f, g;
   clock_t now;
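Two things changed in InitNet besides the loop reordering (which now walks each row of the matrices sequentially, a friendlier access pattern for the cache): syn0 is seeded by an explicit linear congruential generator instead of rand(), using the same multiplier 25214903917 and increment 11 as drand48, so initialization is reproducible across C libraries. A standalone sketch of the new rule; the layer1_size value below is illustrative:

    #include <stdio.h>

    int main(void) {
      const long long layer1_size = 200;   /* illustrative; -size sets this */
      unsigned long long next_random = 1;  /* fixed seed, as in InitNet() */
      for (int i = 0; i < 5; i++) {
        /* same update as word2vec.c: a drand48-style LCG */
        next_random = next_random * (unsigned long long)25214903917 + 11;
        /* low 16 bits -> uniform in [0,1), then centered and scaled */
        float w = (((next_random & 0xFFFF) / (float)65536) - 0.5f) / layer1_size;
        printf("%+.6f\n", w);
      }
      return 0;
    }

Each weight lands in [-0.5/layer1_size, 0.5/layer1_size), the same range as before; only the source of randomness changed.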
@@ -374,11 +377,11 @@ void *TrainModelThread(void *id) {
     if ((debug_mode > 1)) {
       now=clock();
       printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
-       word_count_actual / (real)(train_words + 1) * 100,
+       word_count_actual / (real)(iter * train_words + 1) * 100,
        word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
       fflush(stdout);
     }
-    alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
+    alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
     if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
   }
   if (sentence_length == 0) {
@@ -400,8 +403,16 @@ void *TrainModelThread(void *id) {
       }
       sentence_position = 0;
     }
-    if (feof(fi)) break;
-    if (word_count > train_words / num_threads) break;
+    if (feof(fi) || (word_count > train_words / num_threads)) {
+      word_count_actual += word_count - last_word_count;
+      local_iter--;
+      if (local_iter == 0) break;
+      word_count = 0;
+      last_word_count = 0;
+      sentence_length = 0;
+      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
+      continue;
+    }
     word = sen[sentence_position];
     if (word == -1) continue;
     for (c = 0; c < layer1_size; c++) neu1[c] = 0;
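The two hunks above are the heart of the new -iter option: when a thread exhausts its slice of the file it rewinds with fseek and decrements local_iter instead of stopping, and the learning rate now anneals linearly over iter * train_words words with a floor of 0.01% of the starting rate. A self-contained sketch of the resulting schedule; the corpus size and starting rate are illustrative (text8 is roughly 17M tokens):

    #include <stdio.h>

    int main(void) {
      const float starting_alpha = 0.05f;      /* CBOW default after this patch */
      const long long iter = 5;
      const long long train_words = 17000000;  /* illustrative corpus size */
      /* sample the schedule at five evenly spaced points */
      for (int k = 0; k <= 4; k++) {
        long long word_count_actual = k * (iter * train_words) / 4;
        float alpha = starting_alpha * (1 - word_count_actual / (float)(iter * train_words + 1));
        if (alpha < starting_alpha * 0.0001f) alpha = starting_alpha * 0.0001f;  /* floor */
        printf("%3d%% trained: alpha = %f\n", k * 25, alpha);
      }
      return 0;
    }

Because the denominator is iter * train_words rather than train_words, five passes decay the rate once across the whole run instead of restarting the schedule each epoch.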
@@ -410,6 +421,7 @@ void *TrainModelThread(void *id) {
     b = next_random % window;
     if (cbow) {  //train the cbow architecture
       // in -> hidden
+      cw = 0;
       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
         c = sentence_position - window + a;
         if (c < 0) continue;
@@ -417,51 +429,55 @@ void *TrainModelThread(void *id) {
         last_word = sen[c];
         if (last_word == -1) continue;
         for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
+        cw++;
       }
-      if (hs) for (d = 0; d < vocab[word].codelen; d++) {
-        f = 0;
-        l2 = vocab[word].point[d] * layer1_size;
-        // Propagate hidden -> output
-        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
-        if (f <= -MAX_EXP) continue;
-        else if (f >= MAX_EXP) continue;
-        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
-        // 'g' is the gradient multiplied by the learning rate
-        g = (1 - vocab[word].code[d] - f) * alpha;
-        // Propagate errors output -> hidden
-        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
-        // Learn weights hidden -> output
-        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
-      }
-      // NEGATIVE SAMPLING
-      if (negative > 0) for (d = 0; d < negative + 1; d++) {
-        if (d == 0) {
-          target = word;
-          label = 1;
-        } else {
-          next_random = next_random * (unsigned long long)25214903917 + 11;
-          target = table[(next_random >> 16) % table_size];
-          if (target == 0) target = next_random % (vocab_size - 1) + 1;
-          if (target == word) continue;
-          label = 0;
+      if (cw) {
+        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
+        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
+          f = 0;
+          l2 = vocab[word].point[d] * layer1_size;
+          // Propagate hidden -> output
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
+          if (f <= -MAX_EXP) continue;
+          else if (f >= MAX_EXP) continue;
+          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+          // 'g' is the gradient multiplied by the learning rate
+          g = (1 - vocab[word].code[d] - f) * alpha;
+          // Propagate errors output -> hidden
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
+          // Learn weights hidden -> output
+          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
+        }
+        // NEGATIVE SAMPLING
+        if (negative > 0) for (d = 0; d < negative + 1; d++) {
+          if (d == 0) {
+            target = word;
+            label = 1;
+          } else {
+            next_random = next_random * (unsigned long long)25214903917 + 11;
+            target = table[(next_random >> 16) % table_size];
+            if (target == 0) target = next_random % (vocab_size - 1) + 1;
+            if (target == word) continue;
+            label = 0;
+          }
+          l2 = target * layer1_size;
+          f = 0;
+          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
+          if (f > MAX_EXP) g = (label - 1) * alpha;
+          else if (f < -MAX_EXP) g = (label - 0) * alpha;
+          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
+          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
+          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
+        }
+        // hidden -> in
+        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
+          c = sentence_position - window + a;
+          if (c < 0) continue;
+          if (c >= sentence_length) continue;
+          last_word = sen[c];
+          if (last_word == -1) continue;
+          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
         }
-        l2 = target * layer1_size;
-        f = 0;
-        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
-        if (f > MAX_EXP) g = (label - 1) * alpha;
-        else if (f < -MAX_EXP) g = (label - 0) * alpha;
-        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
-        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
-        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
-      }
-      // hidden -> in
-      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
-        c = sentence_position - window + a;
-        if (c < 0) continue;
-        if (c >= sentence_length) continue;
-        last_word = sen[c];
-        if (last_word == -1) continue;
-        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
       }
     } else {  //train skip-gram
       for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
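Most of the hunk above is re-indentation from wrapping the update in if (cw), but it carries one behavioral change: the CBOW hidden layer is now the average of the cw context vectors rather than their sum, and when no valid context word is found the update is skipped entirely. A minimal standalone sketch of just that step; the names are illustrative and this is not the trainer itself:

    /* Build the CBOW hidden layer the way the patched code does: sum the
     * context word vectors, then divide by their count. The pre-patch
     * behavior was the bare sum, whose scale grew with the window size. */
    void cbow_hidden(const float *syn0, const long long *context, int cw,
                     long long layer1_size, float *neu1) {
      long long c;
      int i;
      for (c = 0; c < layer1_size; c++) neu1[c] = 0;
      for (i = 0; i < cw; i++)
        for (c = 0; c < layer1_size; c++)
          neu1[c] += syn0[c + context[i] * layer1_size];
      if (cw == 0) return;  /* empty window: no update at all */
      for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
    }

Averaging keeps the magnitude of the hidden activation, and therefore of the gradients, independent of how many context words happen to fall inside the sampled window.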
@@ -611,7 +627,7 @@ int ArgPos(char *str, int argc, char **argv) {
 int main(int argc, char **argv) {
   int i;
   if (argc == 1) {
-    printf("WORD VECTOR estimation toolkit v 0.1b\n\n");
+    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
     printf("Options:\n");
     printf("Parameters for training:\n");
     printf("\t-train <file>\n");
@@ -623,18 +639,20 @@ int main(int argc, char **argv) {
     printf("\t-window <int>\n");
     printf("\t\tSet max skip length between words; default is 5\n");
     printf("\t-sample <float>\n");
-    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency");
-    printf(" in the training data will be randomly down-sampled; default is 0 (off), useful value is 1e-5\n");
+    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
+    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
     printf("\t-hs <int>\n");
-    printf("\t\tUse Hierarchical Softmax; default is 1 (0 = not used)\n");
+    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
     printf("\t-negative <int>\n");
-    printf("\t\tNumber of negative examples; default is 0, common values are 5 - 10 (0 = not used)\n");
+    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
     printf("\t-threads <int>\n");
-    printf("\t\tUse <int> threads (default 1)\n");
+    printf("\t\tUse <int> threads (default 12)\n");
+    printf("\t-iter <int>\n");
+    printf("\t\tRun more training iterations (default 5)\n");
     printf("\t-min-count <int>\n");
     printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
     printf("\t-alpha <float>\n");
-    printf("\t\tSet the starting learning rate; default is 0.025\n");
+    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
     printf("\t-classes <int>\n");
     printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
     printf("\t-debug <int>\n");
@@ -646,9 +664,9 @@ int main(int argc, char **argv) {
     printf("\t-read-vocab <file>\n");
     printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
     printf("\t-cbow <int>\n");
-    printf("\t\tUse the continuous bag of words model; default is 0 (skip-gram model)\n");
+    printf("\t\tUse the continuous bag of words model; default is 1 (use 0 for skip-gram model)\n");
     printf("\nExamples:\n");
-    printf("./word2vec -train data.txt -output vec.txt -debug 2 -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1\n\n");
+    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3\n\n");
     return 0;
   }
   output_file[0] = 0;
@@ -661,6 +679,7 @@ int main(int argc, char **argv) {
   if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
+  if (cbow) alpha = 0.05;
   if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
   if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
   if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
@@ -668,6 +687,7 @@ int main(int argc, char **argv) {
   if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
+  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
   if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
   vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
@@ -679,4 +699,4 @@ int main(int argc, char **argv) {
   }
   TrainModel();
   return 0;
-}
\ No newline at end of file
+}
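One ordering subtlety in the argument parsing above: if (cbow) alpha = 0.05; executes after -cbow is read but before -alpha, so the new CBOW default never clobbers an explicitly supplied learning rate. A self-contained toy demonstrating the ordering; ArgPos is simplified from word2vec.c, and this sketch assumes every flag is followed by a value:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Simplified ArgPos: index of a flag, or -1 if absent. The real
     * word2vec.c version also errors out when the value is missing. */
    static int ArgPos(char *str, int argc, char **argv) {
      for (int a = 1; a < argc; a++) if (!strcmp(str, argv[a])) return a;
      return -1;
    }

    int main(int argc, char **argv) {
      int i, cbow = 1;            /* new default */
      float alpha = 0.025f;       /* skip-gram default */
      if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
      if (cbow) alpha = 0.05f;    /* CBOW default, applied first... */
      if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
      printf("cbow=%d alpha=%g\n", cbow, alpha);  /* ...so -alpha still wins */
      return 0;
    }

Run with no arguments it prints alpha=0.05; run as ./a.out -alpha 0.01 it prints alpha=0.01, showing the user override takes effect.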