clean_tweet.py
import re
import pandas as pd
from collections import defaultdict
import urllib.request
from bs4 import BeautifulSoup
# Earlier scratch work, kept as a note: shortened t.co links were resolved by
# fetching the page (requests.get / requests.head) and reading the <title>
# tag with BeautifulSoup. clean_tweet() below does the same via urllib.request.
# Reactions.csv: column 0 holds a source tweet, the remaining columns its replies.
df = pd.read_csv("Reactions.csv", header=None)
def clean_tweet(tweet):
    """Resolve a bare t.co link to its page title, then strip mentions,
    URLs, and non-alphanumeric characters and collapse whitespace."""
    if tweet.startswith("https://t.co/"):
        with urllib.request.urlopen(tweet) as page:
            soup = BeautifulSoup(page, features="lxml")
        tweet = soup.title.string
    # Drop @mentions, any character outside [0-9A-Za-z space tab], and
    # anything that looks like a URL; then normalize the whitespace.
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet).split())
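# Quick sanity check of clean_tweet() on a made-up tweet (hypothetical sample
# text, not from the dataset); the t.co branch needs network access, so only
# the regex path is exercised here:
#   clean_tweet("@user Check this out! https://t.co/abc123 #wow")
#   -> 'Check this out wow'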
# Map each cleaned source tweet (column 0) to the list of its cleaned replies.
tweetReplyDict = defaultdict(list)
for index, row in df.iterrows():
    row_list = []
    key = ''
    for indexRow, elements in enumerate(row):
        if indexRow == 0:
            key = clean_tweet(elements)
            continue
        if pd.isnull(elements):
            continue
        row_list.append(clean_tweet(elements))
    tweetReplyDict[key] = row_list
# Rows are the cleaned source tweets; columns are their replies.
df_clean = pd.DataFrame.from_dict(tweetReplyDict, orient='index')
print(df_clean)
df_clean.to_csv('Cleaned_Reactions.csv', header=False)
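# Expected output shape (a sketch, assuming Reactions.csv is present): each row
# of Cleaned_Reactions.csv begins with the cleaned source tweet (the index),
# followed by its cleaned replies. Rows with fewer replies end in empty cells,
# since DataFrame.from_dict pads shorter lists with NaN.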