# hashTageAnalyze.py
import os
import sys
import json
import re
import jsonpickle
# General:
from collections import defaultdict
from retrying import retry
import tweepy  # To consume Twitter's API
from tweepy import OAuthHandler
import pandas as pd  # To handle data
import numpy as np  # For number computing
from textblob import TextBlob  # For sentiment scoring
# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline  (notebook magic; harmless as a comment in a plain script)
def get_auth():
    consumer_key = 'qcq2FiLTgjsOiAz7SZSa8NT8Q'
    consumer_secret = 'b4fOmhWboQZkMYFYiWImVxkZUk6AeuK0mRshCrPyNHVnT6sdqL'
    access_token = '619712240-QfErMlhFzeEtK7Ru1cmERKns9WFPDuIXUlHMjEJW'
    access_secret = 'aBouYO4ournJvwuSd4yeSiXHgCrOTaENwY5uURCcp9GX0'
    # User-context alternative:
    # auth = OAuthHandler(consumer_key, consumer_secret)
    # auth.set_access_token(access_token, access_secret)
    # api = tweepy.API(auth)
    # Application-only auth gives a higher search rate limit:
    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True,
                     retry_count=100, retry_delay=10)
    return api
api = get_auth()
searchQuery = '#wall'  # this is what we're searching for
maxTweets = 10000000   # some arbitrarily large upper bound
tweetsPerQry = 100     # 100 is the max the search API permits per request
fName = 'tweets.txt'   # we'll store the tweets in a text file, one JSON object per line

# If results from a specific ID onwards are required, set since_id to that ID;
# otherwise default to no lower limit and go as far back as the API allows.
sinceId = None

# If only results below a specific ID are required, set max_id to that ID;
# otherwise default to no upper limit and start from the most recent tweet
# matching the search query.
max_id = -1
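
# Optional sanity check before the long download (not part of the original
# script; a minimal sketch assuming tweepy 3.x and the standard v1.1
# rate_limit_status() payload). With application-only auth the standard search
# endpoint allows roughly 450 requests per 15-minute window; this prints what
# is left in the current window. The name search_quota is illustrative only.
search_quota = api.rate_limit_status()['resources']['search']['/search/tweets']
print("Search calls remaining this window: {0}/{1}".format(
    search_quota['remaining'], search_quota['limit']))
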
tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        try:
            if (max_id <= 0):
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry)
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            since_id=sinceId)
            else:
                if (not sinceId):
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1))
                else:
                    new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                            max_id=str(max_id - 1),
                                            since_id=sinceId)
            if not new_tweets:
                print("No more tweets found")
                break
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                        '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            # Page backwards: the next request asks only for tweets older than
            # the oldest one we just received.
            max_id = new_tweets[-1].id
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

print("Downloaded {0} tweets, saved to {1}".format(tweetCount, fName))
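
# The analysis step the file name suggests is not in the download loop above;
# the sketch below is a hedged guess at it, using the otherwise-unused imports
# (json, defaultdict, TextBlob, pandas). It reads the JSON-lines file written
# above, tallies co-occurring hashtags, and scores each tweet's text with
# TextBlob polarity. The helper name analyze_saved_tweets and its column names
# are illustrative, not taken from the original file.
def analyze_saved_tweets(path=fName):
    tag_counts = defaultdict(int)
    rows = []
    with open(path, 'r') as infile:
        for line in infile:
            tweet = json.loads(line)
            text = tweet.get('text', '')
            # Hashtags come back under entities.hashtags in the v1.1 payload.
            for tag in tweet.get('entities', {}).get('hashtags', []):
                tag_counts[tag['text'].lower()] += 1
            rows.append({'id': tweet.get('id'),
                         'text': text,
                         'polarity': TextBlob(text).sentiment.polarity})
    if not rows:
        print("No tweets found in {0}".format(path))
        return pd.DataFrame(), tag_counts
    df = pd.DataFrame(rows)
    print(df['polarity'].describe())
    print("Top co-occurring hashtags:",
          sorted(tag_counts.items(), key=lambda kv: kv[1], reverse=True)[:10])
    return df, tag_counts

# Example usage once tweets.txt has been written:
# df, tag_counts = analyze_saved_tweets()
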
# Earlier HTML-scraping approach, kept commented out for reference:
# from urllib.request import urlopen
# from bs4 import BeautifulSoup
#
# file = "5_twitterBBC.csv"
# f = open(file, "w")
# Headers = "tweet_user, tweet_text, replies, retweets\n"
# f.write(Headers)
# url = "https://twitter.com/realDonaldTrump/status/1092787440560078849"
# html = urlopen(url)
# soup = BeautifulSoup(html, "html.parser")
# print(soup.prettify())
# # Gets the tweets
# tweets = soup.find_all("li", attrs={"class": "js-stream-item"})
# # Writes each fetched tweet to the file
# for tweet in tweets:
#     try:
#         if tweet.find('p', {"class": 'tweet-text'}):
#             tweet_user = tweet.find('span', {"class": 'username'}).text.strip()
#             tweet_text = tweet.find('p', {"class": 'tweet-text'}).text.encode('utf8').strip()
#             replies = tweet.find('span', {"class": "ProfileTweet-actionCount"}).text.strip()
#             retweets = tweet.find('span', {"class": "ProfileTweet-action--retweet"}).text.strip()
#             # String interpolation technique
#             f.write(f'{tweet_user},/^{tweet_text}$/,{replies},{retweets}\n')
#     except AttributeError:
#         pass
# f.close()