Skip to content

Commit

Permalink
improvements to how sentences are generated
Browse files Browse the repository at this point in the history
  • Loading branch information
frnsys committed Jul 8, 2014
1 parent 4f78460 commit 8a2d576
Showing 1 changed file with 40 additions and 43 deletions.
83 changes: 40 additions & 43 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,19 @@
import re
from textblob import TextBlob

# Part-of-speech tags (Penn Treebank names, as reported by TextBlob's
# ``pos_tags``) whose tokens may be swapped out for a ``{{ TAG }}``
# placeholder when a tweet is turned into a pattern.
ELIGIBLE_TAGS = [
    'CD',    # cardinal numbers
    'JJ',    # adjectives
    'NN',    # singular nouns
    'NNP',   # singular proper nouns
    'NNPS',  # plural proper nouns
    'NNS',   # plural nouns
    'VBN',   # verbs, past participle
    'VBG',   # verbs, gerund / present participle
    'VB',    # verbs, base form
    'RB',    # adverbs
]

def main():
try:
import config
Expand All @@ -27,7 +40,7 @@ def main():
# Collect ten pages of a user's tweets and write them as JSON to data/data.txt.
def tweets():
user_tweets = []
# Pages 0 through 9 are requested and concatenated into a single list.
for i in range(10):
# NOTE(review): this flattened diff view shows two fetch lines side by side;
# presumably the 'frnsys' call is the removed line and the 'brian_justie'
# call is its replacement -- confirm against the real run.py.
user_tweets += twitter.tweets('frnsys', page=i)
user_tweets += twitter.tweets('brian_justie', page=i)
# Opening with 'w' overwrites any previously saved tweets.
with open('data/data.txt', 'w') as outfile:
json.dump(user_tweets, outfile)

Expand All @@ -39,74 +52,58 @@ def process():

# Chains of POS tags to build
# tweets out of.
speech_patterns = {}
speech_patterns = []
for tweet in json.load(f):
pattern_ = []

if '@' not in tweet['body']: # Trying without any @ mentions.
text = tweet['body']

# Remove urls
# Remove urls and @mentions.
text = re.sub(r"(?:\@|https?\://)\S+", "", text)
pattern = text

for t in TextBlob(text).pos_tags:
token = t[0]
tag = t[1]

pattern_.append(tag)

if tag == '-NONE-':
# Preserve hashtags.
if token[0] == '#':
continue

if tag not in speech_parts:
speech_parts[tag] = []
speech_parts[tag].append(token)
if tag in ELIGIBLE_TAGS and len(token) > 2:
pattern = pattern.replace(token, '{{{{ {0} }}}}'.format(tag))

if tag == '-NONE-':
continue

pattern = '.'.join(pattern_)
if pattern not in speech_patterns:
speech_patterns[pattern] = 0
speech_patterns[pattern] += 1
if tag not in speech_parts:
speech_parts[tag] = []
speech_parts[tag].append(token.lower())

speech_patterns.append(pattern)

with open('data/speech_parts.json', 'w') as outfile:
json.dump(speech_parts, outfile)
json.dump(speech_parts, outfile, indent=4, sort_keys=True)

with open('data/speech_patterns.json', 'w') as outfile:
json.dump(speech_patterns, outfile)
json.dump(speech_patterns, outfile, indent=4, sort_keys=True )

# Print one generated tweet assembled from the saved speech patterns and parts.
# NOTE(review): this is a flattened diff view, so superseded lines and their
# replacements appear side by side below (including the _weighted_choice
# helper); which lines survive should be confirmed against the real run.py.
def generate():
with open('data/speech_parts.json', 'r') as f:
speech_parts = json.load(f)
with open('data/speech_patterns.json', 'r') as f:
speech_patterns = json.load(f)

# Two selection strategies are visible here: a weighted pick, which needs a
# dict of pattern -> count, and a uniform pick, which needs a list of patterns.
pattern = _weighted_choice(speech_patterns)
pattern = random.choice(speech_patterns)
tweet = pattern

# Captures the tag name inside every "{{ TAG }}" placeholder of the pattern.
p = re.compile(r'\{\{\s*([A-Za-z]+)\s*\}\}')
tags = p.findall(pattern)

tweet = []
for tag in pattern.split('.'):
for tag in tags:
# Pick a random stored token for this tag; raises KeyError if the tag has
# no entry in speech_parts.
token = random.choice(speech_parts[tag])
tweet.append(token)
print(' '.join(tweet))


def _weighted_choice(choices):
"""
Random selects a key from a dictionary,
where each key's value is its probability weight.
"""
# Randomly select a value between 0 and
# the sum of all the weights.
rand = random.uniform(0, sum(choices.values()))

# Seek through the dict until a key is found
# resulting in the random value.
summ = 0.0
for key, value in choices.items():
summ += value
if rand < summ: return key

# If this returns False,
# it's likely because the knowledge is empty.
return False
# Fill only the first remaining placeholder for this tag (count=1), so repeated
# tags each receive their own randomly chosen token.
# NOTE(review): token is passed as the re.sub replacement string, so a
# backslash inside a token would be read as an escape or group reference;
# a callable replacement (lambda m: token) would avoid that.
tweet = re.sub(r'(\{\{\s*' + re.escape(tag) + r'\s*\}\})', token, tweet, 1)
print(tweet)



if __name__ == '__main__':
Expand Down

0 comments on commit 8a2d576

Please sign in to comment.