# Trigrams.py
from nltk import trigrams
from collections import defaultdict
import random
from pathlib import Path
from time import sleep


def trainTrigramModel(corpus_path):
    '''Trains a trigram language model.

    args
        corpus_path : the path to the training corpus
    returns
        model : a dictionary containing the trigram model
    '''
    # smoothing value to handle unattested words in test data
    # (keep this at zero for best model quality)
    smoothing = 0.0
    model = defaultdict(lambda: defaultdict(lambda: smoothing))
    # TODO make this use headline snap data somehow, not iterate through txt files in a directory
    # train the model on the full corpus of headline snaps
    print("Training a trigram model on the database of Headline Snaps ...")
    with open(corpus_path, 'r', encoding='utf-8') as f:
        sents = f.readlines()
    # count every (w1, w2) -> w3 transition, padding each sentence on both sides
    for sentence in sents:
        sentence = sentence.split()
        for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
            model[(w1, w2)][w3] += 1
    # transform the counts into probabilities
    for w1_w2 in model:
        total_count = float(sum(model[w1_w2].values()))
        for w3 in model[w1_w2]:
            model[w1_w2][w3] /= total_count
    sleep(1)
    print("Done.")
    return model
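
# A minimal sketch of how the trained model can be inspected (kept as a comment so it
# does not execute on import). The corpus path and the bigram context below are
# placeholders for illustration, not files or keys guaranteed to exist: each (w1, w2)
# key maps to a dict of possible next words and their conditional probabilities,
# which should sum to roughly 1.0 for any context seen in training.
#
#     model = trainTrigramModel('headline_snaps.txt')   # placeholder path
#     context = ('the', 'president')                    # hypothetical bigram context
#     for word, prob in model[context].items():
#         print(word, prob)
#     print(sum(model[context].values()))               # ~1.0 if the context was observed
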

def generateSentence(trigram_model, corpus_path):
    '''Generates a sentence according to a trigram model.

    args
        trigram_model : the trigram model to use in sentence generation
        corpus_path : the path to the corpus file used for the novelty check
    returns
        generated : the generated sentence
    '''
    # start from the same left padding used during training: (None, None) marks a sentence start
    text = [None, None]
    generated = ''
    sentence_finished = False
    # counter to keep track of how many times we have tried to generate a novel sentence
    fail_counter = 0
    max_tries = 10
    sleep(1)
    print("Generating a new Headline Snap ...\n")
    sleep(1)
    while not sentence_finished:
        # sample the next word from the distribution conditioned on the last two tokens
        r = random.random()
        accumulator = .0
        for word in trigram_model[tuple(text[-2:])].keys():
            accumulator += trigram_model[tuple(text[-2:])][word]
            if accumulator >= r:
                text.append(word)
                break
        # preemptively create the generated sentence (string) here,
        # because we need to check it against the corpus anyway
        generated = ' '.join([t for t in text if t])
        # check if the sentence is complete AND not present in the corpus verbatim
        if (text[-2:] == [None, None]) and not checkAgainstCorpus(generated, corpus_path):
            sentence_finished = True
        # otherwise (meaning it is in the corpus), check only whether it is finished
        elif text[-2:] == [None, None]:
            # increment the number of novel-sentence generation attempts
            fail_counter += 1
            # DEBUG
            # print("Times failed: ", fail_counter)
            if fail_counter == max_tries:
                # if we reach max tries, give up and return the sentence anyway
                sentence_finished = True
                continue
            # reset the sentence to the starting value (empty with padding) before trying again
            text = [None, None]
    disclaimer = "\n" if (fail_counter < max_tries) else " (maximum reached; sentence not novel)\n"
    print("Non-novel generations before novelty achieved:", str(fail_counter) + disclaimer)
    return generated
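
# A minimal, self-contained sketch of the cumulative-probability sampling used in the
# while loop above (kept as a comment so it does not execute on import). The toy
# distribution is made up for illustration: drawing r in [0, 1) and walking the running
# total until it reaches r picks each word with probability equal to its weight.
#
#     import random
#     toy_dist = {'news': 0.5, 'flash': 0.3, 'update': 0.2}
#     r = random.random()
#     accumulator = 0.0
#     for word, prob in toy_dist.items():
#         accumulator += prob
#         if accumulator >= r:
#             print("sampled:", word)
#             break
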

def checkAgainstCorpus(sentence, corpus_path):
    '''Checks whether a given sentence is present in a given corpus file.

    args
        sentence : the sentence to check
        corpus_path : the path to the corpus file to be compared against
    returns
        bool : True if the sentence appears verbatim in the corpus, False otherwise
    '''
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        # use .read().splitlines() rather than readlines()
        # so list items do not end with '\n',
        # which lets the sentence being checked match a corpus line exactly
        sents = corpus_file.read().splitlines()
    return sentence in sents
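

# Example usage when the module is run directly: train on the corpus, then generate one
# novel Headline Snap. The corpus path is a placeholder assumed for illustration; adjust
# it to wherever the headline snap corpus actually lives before running this file.
if __name__ == "__main__":
    corpus = 'headline_snaps.txt'  # placeholder path, not guaranteed to exist in this repo
    trigram_model = trainTrigramModel(corpus)
    snap = generateSentence(trigram_model, corpus)
    print(snap)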