-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper.py
244 lines (170 loc) · 7.23 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
# Helper functions for a WhatsApp chat-analysis dashboard: overall stats,
# timelines, activity maps, word clouds, and sentiment-filtered variants.
import streamlit as st
from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
#import emoji
# Shared URL extractor instance; fetch_stats uses it to count links per message.
extract = URLExtract()
def fetch_stats(selected_user, df):
    """Return (message count, word count, media-message count, link count).

    ``selected_user == 'Overall'`` means the whole chat; otherwise the frame
    is restricted to that user's rows first.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = df.shape[0]
    # Total whitespace-separated tokens across all messages.
    word_total = sum(len(msg.split()) for msg in df['message'])
    # WhatsApp exports media as this literal placeholder line.
    num_media_messages = df[df['message'] == '<Media omitted>\n'].shape[0]
    # URLs found by the module-level URLExtract instance.
    link_total = sum(len(extract.find_urls(msg)) for msg in df['message'])

    return num_messages, word_total, num_media_messages, link_total
def most_busy_users(df):
    """Return (top-5 message counts per user, percent-contribution DataFrame).

    The second return value has columns ``name`` (user) and ``percent``
    (share of all messages, rounded to 2 decimals).
    """
    counts = df['user'].value_counts()
    x = counts.head()
    percent = round((counts / df.shape[0]) * 100, 2)
    # Build the result frame with explicit column names: relying on
    # reset_index().rename(columns={'index': ..., 'user': ...}) silently
    # breaks on pandas >= 2.0, where reset_index names the columns
    # 'user'/'count' instead of 'index'/'user'.
    percent_df = percent.reset_index()
    percent_df.columns = ['name', 'percent']
    return x, percent_df
def create_wordcloud(selected_user,df):
f = open('stop.txt', 'r')
stop_words = f.read()
if selected_user != 'Overall':
df = df[df['user'] == selected_user]
temp = df[df['user'] != 'group_notification']
temp = temp[temp['message'] != '<Media omitted>\n']
def remove_stop_words(message):
y = []
for word in message.lower().split():
if word not in stop_words:
y.append(word)
return " ".join(y)
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')
temp['message'] = temp['message'].apply(remove_stop_words)
df_wc = wc.generate(temp['message'].str.cat(sep=" "))
return df_wc
def most_common_words(selected_user, df):
    """Return a DataFrame of the 20 most common non-stop words.

    Column 0 is the word, column 1 its count. Group notifications and
    media placeholders are excluded.
    """
    # Exact-word stop list. Reading the file as a single string (original
    # code) made the membership check a substring test, which removed any
    # word that merely appears inside a stop word (e.g. 'he' inside 'the').
    with open('stop.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df
def monthly_timeline(selected_user, df):
    """Return message counts per (year, month) with a 'Month-Year' label.

    Rows are in chronological order.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
    # Sort by (year, month_num), not month_num alone: the original
    # sort_values('month_num') interleaved months of different years
    # (e.g. Jan-2021 plotted before Dec-2020).
    timeline = timeline.sort_values(['year', 'month_num'], ignore_index=True)
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline
def daily_timeline(selected_user, df):
    """Message count per calendar date ('Overall' = every user)."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    return df.groupby('only_date').count()['message'].reset_index()
def week_activity_map(selected_user, df):
    """Counts of messages per weekday name for the selected user."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return frame['day_name'].value_counts()
def month_activity_map(selected_user, df):
    """Counts of messages per month name for the selected user."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    return frame['month'].value_counts()
def activity_heatmap(selected_user, df):
    """Weekday x hour-period heatmap of message counts (0 where no data)."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    heat = frame.pivot_table(index='day_name', columns='period',
                             values='message', aggfunc='count')
    return heat.fillna(0)
def sentiment_week_activity_map(selected_user, df, k):
    """Weekday message counts restricted to sentiment value k (0/1/-1)."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    frame = frame[frame['value'] == k]
    return frame['day_name'].value_counts()
# Will return count of messages of selected user per month having k(0/1/-1) sentiment
def sentiment_month_activity_map(selected_user, df, k):
    """Per-month message counts restricted to sentiment value k (0/1/-1)."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    frame = frame[frame['value'] == k]
    return frame['month'].value_counts()
# Will return hear map containing count of messages having k(0/1/-1) sentiment
def sentiment_activity_heatmap(selected_user, df, k):
    """Weekday x period heatmap of messages with sentiment value k (0/1/-1)."""
    frame = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    frame = frame[frame['value'] == k]
    heat = frame.pivot_table(index='day_name', columns='period',
                             values='message', aggfunc='count')
    return heat.fillna(0)
# Will return count of messages of selected user per date having k(0/1/-1) sentiment
def sentiment_daily_timeline(selected_user, df, k):
    """Per-date message counts restricted to sentiment value k (0/1/-1)."""
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    subset = df[df['value'] == k]
    return subset.groupby('only_date').count()['message'].reset_index()
# Will return count of messages of selected user per {year + month number + month} having k(0/1/-1) sentiment
def sentiment_monthly_timeline(selected_user, df, k):
    """Per-(year, month) message counts for sentiment value k, with labels.

    Adds a 'time' column of the form 'Month-Year'.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]
    df = df[df['value'] == k]
    timeline = df.groupby(['year', 'month_num', 'month']).count()['message'].reset_index()
    labels = [f"{month}-{year}" for month, year in zip(timeline['month'], timeline['year'])]
    timeline['time'] = labels
    return timeline
# Will return percentage of message contributed having k(0/1/-1) sentiment
def sentiment_percentage(df, k):
    """Return each user's percent share of messages with sentiment value k.

    Columns are 'name' (user) and 'user' (percent) — historical names kept
    for existing callers.
    """
    subset = df[df['value'] == k]
    percent = round((subset['user'].value_counts() / subset.shape[0]) * 100, 2)
    # Set column names explicitly: the original reset_index().rename(...)
    # depended on pandas < 2.0 naming the columns 'index'/'user'; on
    # pandas >= 2.0 reset_index yields 'user'/'count' and the rename
    # silently produced the wrong columns.
    out = percent.reset_index()
    out.columns = ['name', 'user']
    return out
# Return wordcloud from words in message
def sentiment_create_wordcloud(selected_user, df, k):
    """Return a WordCloud of messages whose sentiment value equals k.

    Skips group notifications and media placeholders, and removes stop
    words listed one-per-line in ``stop.txt``.
    """
    # Read the stop list as a SET of words. Keeping it as one big string
    # (the original code) made `word not in stop_words` a SUBSTRING test,
    # which wrongly dropped any word occurring inside any stop word.
    with open('stop.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Remove entries of no significance.
    temp = df[df['user'] != 'group_notification']
    # .copy() so assignments below don't hit a view of df.
    temp = temp[temp['message'] != '<Media omitted>\n'].copy()

    def remove_stop_words(message):
        return " ".join(w for w in message.lower().split() if w not in stop_words)

    # Keep only rows with sentiment k (equivalent to the original NaN-masking:
    # str.cat skips missing values), then strip stop words.
    temp = temp[temp['value'] == k]
    temp['message'] = temp['message'].apply(remove_stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc
# Return set of most common words having k(0/1/-1) sentiment
def sentiment_most_common_words(selected_user, df, k):
    """Return the 20 most common non-stop words among k-sentiment messages.

    Column 0 is the word, column 1 its count.
    """
    # Exact-word stop list; reading the file as one string (original code)
    # made the membership check a substring test, removing any word that
    # merely appears inside a stop word.
    with open('stop.txt', 'r') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []
    for message in temp['message'][temp['value'] == k]:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    # DataFrame of the 20 most common entries.
    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df