forked from tmikolov/word2vec
Showing 9 changed files with 1,922 additions and 0 deletions.
@@ -0,0 +1,23 @@
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  // Input embeddings, 128-byte aligned for cache/SIMD friendliness
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (a != 0 || syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  if (hs) {
    // Hierarchical-softmax output weights, zero-initialized
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (a != 0 || syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
  }
  if (negative > 0) {
    // Negative-sampling output weights, zero-initialized
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (a != 0 || syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
  }
  // Input embeddings start uniform in [-0.5, 0.5) / layer1_size: the low 16 bits
  // of a linear congruential generator are scaled into [0, 1), then centered
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }
  CreateBinaryTree();
}
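
To make the initialization rule concrete, here is a minimal stand-alone sketch (a hypothetical demo, not part of this commit; layer1_size = 100 is an assumed dimensionality and float stands in for the repo's real typedef). It reproduces the first few syn0 values and shows that every weight starts in [-0.5/layer1_size, 0.5/layer1_size):

#include <stdio.h>

/* Hypothetical demo: reproduce InitNet's syn0 initialization for the
   first few weights. Assumes layer1_size = 100 and real = float. */
int main() {
  const int layer1_size = 100;
  unsigned long long next_random = 1;  /* InitNet starts the LCG state at 1 */
  for (int i = 0; i < 5; i++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    float v = (((next_random & 0xFFFF) / (float)65536) - 0.5f) / layer1_size;
    printf("%+.6f\n", v);  /* each value lies in [-0.005, +0.005) */
  }
  return 0;
}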
@@ -0,0 +1,70 @@
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  if (pt == NULL) {printf("Memory allocation failed\n"); exit(1);}
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  InitNet();
  if (negative > 0) InitUnigramTable();
  start = clock();
  // Launch the worker threads, then wait for all of them to finish
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  if (fo == NULL) {printf("Cannot open %s\n", output_file); exit(1);}
  if (classes == 0) {
    // Save the word vectors: a text header, then one row per word
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;  // round-robin initial assignment
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;  // start counts at 1 so an empty cluster never divides by zero
      // Accumulate member vectors into each centroid
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        // Normalize each centroid to unit length so dot products act as cosine similarity
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      // Reassign every word to its highest-scoring centroid
      for (c = 0; c < vocab_size; c++) {
        closev = -10;  // best similarity found so far
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
  free(pt);
}
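
The binary branch above fixes the on-disk format: a text header with vocab_size and layer1_size, then for each word the token, one separator space, layer1_size raw real values, and a newline. The sketch below is a hypothetical loader for that format (assuming real is float, as in the upstream word2vec.c; the upstream distance.c reads the file in much the same way):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical sketch: read vectors written by TrainModel with binary = 1.
   Assumes real = float, matching the upstream word2vec.c typedef. */
int main(int argc, char **argv) {
  if (argc < 2) { fprintf(stderr, "usage: %s vectors.bin\n", argv[0]); return 1; }
  FILE *f = fopen(argv[1], "rb");
  if (f == NULL) { fprintf(stderr, "cannot open %s\n", argv[1]); return 1; }
  long long words, size;
  fscanf(f, "%lld %lld", &words, &size);  /* header: vocab_size layer1_size */
  float *vec = (float *)malloc(size * sizeof(float));
  char word[1000];
  for (long long i = 0; i < words; i++) {
    fscanf(f, "%999s", word);             /* token; %s stops at the space */
    fgetc(f);                             /* consume the single separator space */
    fread(vec, sizeof(float), size, f);   /* layer1_size raw floats */
    if (i < 3) printf("%s  %f %f ...\n", word, vec[0], vec[1]);
  }
  free(vec);
  fclose(f);
  return 0;
}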
@@ -0,0 +1,183 @@
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;  // per-thread RNG seed
  char eof = 0;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));   // hidden layer (CBOW context average)
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));  // accumulated gradient for the hidden layer
  FILE *fi = fopen(train_file, "rb");
  // Each thread starts at its own offset into the training file
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if (debug_mode > 1) {
        now = clock();
        // 13 is '\r': redraw the progress line in place
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // Linearly decay the learning rate, with a floor at 0.01% of the starting value
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi, &eof);
        if (eof) break;
        if (word == -1) continue;  // out-of-vocabulary token
        word_count++;
        if (word == 0) break;      // index 0 is the </s> sentence delimiter
        // Subsampling randomly discards frequent words while keeping the ranking the same:
        // a word with count cn is kept with probability
        // (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (eof || (word_count > train_words / num_threads)) {
      // This thread has processed its share of the file: start the next epoch or stop
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;  // shrink the context window on both sides by a random amount
    if (cbow) {  // train the cbow architecture
      // in -> hidden: average the context word vectors
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        // HIERARCHICAL SOFTMAX: one logistic update per node on the word's Huffman path
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          // Skip the update when the sigmoid is saturated
          if (f <= -MAX_EXP || f >= MAX_EXP) continue;
          f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING: the true target plus 'negative' noise words from the unigram table
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in: push the accumulated gradient back to every context word
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else {  // train skip-gram: each context word predicts the center word
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          // Skip the update when the sigmoid is saturated
          if (f <= -MAX_EXP || f >= MAX_EXP) continue;
          f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
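
Both training branches above read the sigmoid out of expTable rather than calling exp() in the inner loops. The table is precomputed once at startup elsewhere in word2vec.c (that code is not part of this commit); the sketch below mirrors that precomputation and the lookup used above, with EXP_TABLE_SIZE = 1000 and MAX_EXP = 6 as defined upstream:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define EXP_TABLE_SIZE 1000  /* table resolution, as defined in upstream word2vec.c */
#define MAX_EXP 6            /* inputs outside (-MAX_EXP, MAX_EXP) are treated as saturated */

int main() {
  float *expTable = (float *)malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
  for (int i = 0; i < EXP_TABLE_SIZE; i++) {
    /* Cell i holds sigmoid(x) for x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
       a uniform grid over [-MAX_EXP, MAX_EXP) */
    expTable[i] = (float)exp((i / (float)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);  /* e^x / (e^x + 1) = sigmoid(x) */
  }
  /* Lookup exactly as in the training loop; note EXP_TABLE_SIZE / MAX_EXP / 2
     is integer arithmetic (1000 / 6 / 2 = 83), so the grid is slightly coarse */
  float fdot = 1.5f;
  float sig = expTable[(int)((fdot + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
  printf("sigmoid(%.1f) ~= %f\n", fdot, sig);  /* roughly 0.81 */
  free(expTable);
  return 0;
}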