#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""The STATS runtime provides the statistics operation for the bot,
which is run daily after midnight UTC.
"""
import datetime
import os
import re
import sys
import time
import traceback
from ast import literal_eval
from calendar import monthrange
from collections import Counter, OrderedDict
from random import sample
from threading import Thread
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
import praw
import prawcore
import psutil
import requests
from requests.exceptions import ConnectionError
import connection
import database
import timekeeping
from artemis_stream import stream_query_access
from common import flair_sanitizer, logger, main_error_log, markdown_escaper
from settings import INFO, FILE_ADDRESS, SETTINGS
from text import *
# Number of regular top-level routine runs that have been made.
CYCLES = 0
AGGS_ENABLED = True
"""SUBREDDIT TRAFFIC RETRIEVAL"""
def subreddit_traffic_daily_estimator(subreddit_name):
"""Looks at the DAILY traffic up to now in the current month and
estimates the total traffic for this month.
:param subreddit_name: The name of a Reddit subreddit.
:return: A dictionary indexed with various values, including
averages and estimated totals. `None` if inaccessible.
"""
daily_traffic_dictionary = {}
output_dictionary = {}
total_uniques = []
total_pageviews = []
# Get the current month as a YYYY-MM string.
current_month = timekeeping.month_convert_to_string(time.time())
# Retrieve traffic data as a dictionary.
# The speed of this function is determined by how fast `traffic()`
# gets data from the site itself. If the bot does not have the
# ability to access this data, return `None`.
try:
traffic_data = reddit.subreddit(subreddit_name).traffic()
except prawcore.exceptions.NotFound:
logger.info("Traffic Estimator: I do not have access to the traffic data.")
return None
daily_data = traffic_data["day"]
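# Each entry in `daily_data` is a list beginning with
# [unix_timestamp, uniques, pageviews, ...], e.g. (hypothetical):
#     [1614556800, 120, 540]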
# Iterate over the data. If there's data for a day, we'll save it.
for date in daily_data:
date_string = timekeeping.time_convert_to_string(date[0])
date_uniques = date[1]
if date_uniques != 0 and current_month in date_string:
date_pageviews = date[2]
daily_traffic_dictionary[date_string] = [date_uniques, date_pageviews]
# Evaluate our data.
num_of_recorded_days = len(daily_traffic_dictionary.keys())
if num_of_recorded_days == 0:
return None # Exit if we have no valid data.
for date, value in daily_traffic_dictionary.items():
total_uniques.append(value[0])
total_pageviews.append(value[1])
# Calculate the daily average of uniques and page views.
days_uniques_recorded = len(total_uniques)
average_uniques = int(sum(total_uniques) / days_uniques_recorded)
average_pageviews = int(sum(total_pageviews) / len(total_pageviews))
# Get the number of days in the month and calculate the estimated
# amount for the month.
year = datetime.datetime.now().year
days_in_month = monthrange(year, datetime.datetime.now().month)[1]
output_dictionary["average_uniques"] = average_uniques
output_dictionary["average_pageviews"] = average_pageviews
output_dictionary["estimated_pageviews"] = average_pageviews * days_in_month
# We now have to calculate the estimated uniques based on the
# current total recorded, if we already have data for this month
# that we can estimate off of. Otherwise just give a rough estimate.
current_sum_uniques = traffic_data["month"][0][1]
if current_sum_uniques != 0:
avg_daily_unique = current_sum_uniques / days_uniques_recorded
output_dictionary["estimated_uniques"] = int(avg_daily_unique * days_in_month)
else:
output_dictionary["estimated_uniques"] = average_uniques * days_in_month
return output_dictionary
def subreddit_traffic_recorder(subreddit_name):
"""Retrieve the recorded monthly traffic statistics for a subreddit
and store them in our database. If traffic data is already stored
locally, this function will also merge the new data with the
cached data.
:param subreddit_name: The name of a Reddit subreddit.
:return: A dictionary indexed by YYYY-MM with the traffic data for
that month.
"""
subreddit_name = subreddit_name.lower()
traffic_dictionary = {}
current_month = timekeeping.month_convert_to_string(time.time())
# Retrieve traffic data as a dictionary.
try:
sub_object = reddit.subreddit(subreddit_name)
traffic_data = sub_object.traffic()
except prawcore.exceptions.NotFound:
# We likely do not have the ability to access this.
logger.info("Traffic Recorder: I do not have access to traffic data for this subreddit.")
return
# Save the specific information.
monthly_data = traffic_data["month"]
# Iterate over the months.
for month in monthly_data:
# Convert the listed data into actual variables.
# Account for UTC with the time.
unix_month_time = month[0] + 86400
month_uniques = month[1]
month_pageviews = month[2]
year_month = timekeeping.month_convert_to_string(unix_month_time)
# This would also save the data for the current month, which is
# fine except that traffic data usually has initial gaps.
# Therefore, this function is run twice. It will update the
# numbers with whatever is most recent.
if current_month != year_month and month_uniques > 0 and month_pageviews > 0:
traffic_dictionary[year_month] = [month_uniques, month_pageviews]
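# At this point `traffic_dictionary` resembles (hypothetical
# numbers): {"2021-02": [3500, 14200], "2021-03": [3900, 16750]}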
# Check for pre-existing traffic data stored in the database.
sql_command = "SELECT * FROM subreddit_traffic WHERE subreddit = ?"
database.CURSOR_STATS.execute(sql_command, (subreddit_name,))
result = database.CURSOR_STATS.fetchone()
# If the data has not been saved before, add it as a new entry.
# Otherwise, if saved traffic data already exists merge the data.
if result is None:
data_package = (subreddit_name, str(traffic_dictionary))
database.CURSOR_STATS.execute("INSERT INTO subreddit_traffic VALUES (?, ?)", data_package)
database.CONN_STATS.commit()
logger.debug("Traffic Recorder: Traffic data for r/{} added.".format(subreddit_name))
else:
existing_dictionary = literal_eval(result[1])
new_dictionary = existing_dictionary.copy()
new_dictionary.update(traffic_dictionary)
update_command = "UPDATE subreddit_traffic SET traffic = ? WHERE subreddit = ?"
database.CURSOR_STATS.execute(update_command, (str(new_dictionary), subreddit_name))
database.CONN_STATS.commit()
logger.debug("Traffic Recorder: r/{} data merged.".format(subreddit_name))
return traffic_dictionary
# noinspection PyTypeChecker
def subreddit_traffic_retriever(subreddit_name):
"""Function that looks at the monthly traffic data for a subreddit
and returns it as a Markdown table.
If available it will also incorporate the estimated monthly targets
for the current month. This function also calculates the
month-to-month change and the averages for the entire period.
There are sections which use Pushshift, but they have been
commented out while aggregations are down. Hopefully they can be
re-enabled in the future.
:param subreddit_name: The name of a Reddit subreddit.
:return: A Markdown table with all the months we have data for.
"""
correlated_data = {}
formatted_lines = []
all_uniques = []
all_pageviews = []
all_uniques_changes = []
all_pageviews_changes = []
top_month_uniques = None
top_month_pageviews = None
basic_line = "| {} | {} | {:,} | *{}%* | {} | {:,} | *{}%* | {} |"
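# A formatted row resembles (hypothetical numbers):
#     | 2021-03 | ➕ | 3,900 | *11.43%* | ➕ | 16,750 | *17.96%* | ≈1:4 |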
# Look for the traffic data in our database.
subreddit_name = subreddit_name.lower()
database.CURSOR_STATS.execute(
"SELECT * FROM subreddit_traffic WHERE subreddit = ?", (subreddit_name,)
)
results = database.CURSOR_STATS.fetchone()
# If we have data, convert it back into a dictionary.
# Otherwise, return `None`.
if results is not None:
traffic_dictionary = literal_eval(results[1])
if not len(traffic_dictionary): # Empty dictionary.
return None
else:
return None
# Fetch some submission / comment data from Pushshift's database
# for integration into the overall traffic table.
for search_type in ["submission", "comment"]:
correlated_data[search_type] = {}
earliest_month = list(traffic_dictionary.keys())[0] + "-01"
stat_query = (
"https://api.pushshift.io/reddit/search/{}/?subreddit={}&after={}"
"&aggs=created_utc&frequency=month&size=0".format(
search_type, subreddit_name, earliest_month
)
)
retrieved_data = subreddit_pushshift_access(stat_query)
if "aggs" not in retrieved_data:
correlated_data[search_type] = {}
else:
returned_months = retrieved_data["aggs"]["created_utc"]
for entry in returned_months:
month = timekeeping.month_convert_to_string(entry["key"])
count = entry["doc_count"]
if not count:
formatted_count = "N/A"
else:
formatted_count = "{:,}".format(count)
correlated_data[search_type][month] = formatted_count
# Iterate over our dictionary.
for key in sorted(traffic_dictionary, reverse=True):
# We get the previous month's data so we can track changes.
month_t = datetime.datetime.strptime(key, "%Y-%m").date()
previous_month = (month_t + datetime.timedelta(-15)).strftime("%Y-%m")
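# Parsing "%Y-%m" pins the date to the first of the month, so
# subtracting 15 days always lands in the previous month, e.g.
# "2021-04" -> 2021-04-01 - 15 days -> 2021-03-17 -> "2021-03".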
current_uniques = traffic_dictionary[key][0]
current_pageviews = traffic_dictionary[key][1]
# We SKIP this month for averaging if there's nothing there.
# Both uniques and pageviews are ZERO.
if current_uniques == 0 and current_pageviews == 0:
continue
all_uniques.append(current_uniques)
all_pageviews.append(current_pageviews)
# Get the ratio of uniques to pageviews. We round the ratio to
# the nearest integer.
if current_uniques != 0:
ratio_uniques_pageviews = "≈1:{:.0f}".format(current_pageviews / current_uniques)
else:
ratio_uniques_pageviews = "---"
# Try to get comparative data from the previous month.
try:
previous_uniques = traffic_dictionary[previous_month][0]
previous_pageviews = traffic_dictionary[previous_month][1]
# Determine the changes in uniques/page views relative to
# the previous month.
raw_uniques = current_uniques - previous_uniques
uniques_change = round((raw_uniques / previous_uniques) * 100, 2)
raw_pageviews = current_pageviews - previous_pageviews
pageviews_change = round((raw_pageviews / previous_pageviews) * 100, 2)
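# Worked example (hypothetical): 2,000 uniques last month and
# 2,500 this month give raw_uniques = 500 and a uniques_change
# of 25.0 percent.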
all_uniques_changes.append(uniques_change)
all_pageviews_changes.append(pageviews_change)
except (KeyError, ZeroDivisionError):
# If we do not have valid data from the previous month,
# put placeholder blank lines instead.
uniques_change = "---"
pageviews_change = "---"
# Format our necessary symbols to easily indicate the
# month-over-month change in the table.
if uniques_change != "---":
if uniques_change > 0:
uniques_symbol = "➕"
elif uniques_change < 0:
uniques_symbol = "🔻"
else:
uniques_symbol = "🔹"
else:
uniques_symbol = ""
if pageviews_change != "---":
if pageviews_change > 0:
pageviews_symbol = "➕"
elif pageviews_change < 0:
pageviews_symbol = "🔻"
else:
pageviews_symbol = "🔹"
else:
pageviews_symbol = ""
# Format the table line and add it to the list.
line = basic_line.format(
key,
uniques_symbol,
current_uniques,
uniques_change,
pageviews_symbol,
current_pageviews,
pageviews_change,
ratio_uniques_pageviews,
# correlated_data["submission"].get(key, "N/A"),
# correlated_data["comment"].get(key, "N/A"),
)
formatted_lines.append(line)
# Here we look for the top months we have in the recorded data.
if len(all_uniques) != 0 and len(all_pageviews) != 0:
top_uniques = max(all_uniques)
top_pageviews = max(all_pageviews)
for key, data in traffic_dictionary.items():
if top_uniques in data:
top_month_uniques = key
if top_pageviews in data:
top_month_pageviews = key
else:
top_uniques = top_pageviews = None
# Get the estimated CURRENT monthly average for this month.
# This is generated from the current daily data.
daily_data = subreddit_traffic_daily_estimator(subreddit_name)
if daily_data is not None:
# We have daily estimated data that we can parse.
# Get month data and the current month as a YYYY-MM string.
current_month = timekeeping.month_convert_to_string(time.time())
current_month_dt = datetime.datetime.strptime(current_month, "%Y-%m").date()
prev_month = (current_month_dt + datetime.timedelta(-15)).strftime("%Y-%m")
# Estimate the change.
estimated_uniques = daily_data["estimated_uniques"]
estimated_pageviews = daily_data["estimated_pageviews"]
# Get the previous month's data for comparison.
# This will fail if the keys are not included in the dictionary
# or if a variable for division is set to zero.
try:
previous_uniques = traffic_dictionary[prev_month][0]
previous_pageviews = traffic_dictionary[prev_month][1]
uniques_diff = estimated_uniques - previous_uniques
pageviews_diff = estimated_pageviews - previous_pageviews
est_uniques_change = round((uniques_diff / previous_uniques) * 100, 2)
est_pageviews_change = round((pageviews_diff / previous_pageviews) * 100, 2)
ratio_raw = round(estimated_pageviews / estimated_uniques, 0)
ratio_est_uniques_pageviews = "≈1:{}".format(int(ratio_raw))
x_ratio = 1 + (est_pageviews_change * 0.01) # Est. ratio
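# Worked example (hypothetical): an estimated pageviews change of
# +12.5% gives x_ratio = 1.125, so 400 posts last month would be
# interpolated to an estimated 450 posts this month.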
# Interpolate estimated number of posts and comments based
# on the Pushshift data and the ratio we have for pageviews.
now_posts = correlated_data["submission"].get(prev_month, "0").replace(",", "")
if now_posts != "N/A":
now_posts = int(now_posts)
est_posts = "{:,.0f}".format(now_posts * x_ratio)
else:
est_posts = "N/A"
now_comments = correlated_data["comment"].get(prev_month, "0").replace(",", "")
if now_comments != "N/A":
now_comments = int(now_comments)
est_comments = "{:,.0f}".format(now_comments * x_ratio)
else:
est_comments = "N/A"
except (KeyError, ZeroDivisionError):
est_uniques_change = est_pageviews_change = ratio_est_uniques_pageviews = "---"
est_posts = est_comments = "N/A"
estimated_line = basic_line.format(
"*{} (estimated)*".format(current_month),
"",
estimated_uniques,
est_uniques_change,
"",
estimated_pageviews,
est_pageviews_change,
ratio_est_uniques_pageviews,
est_posts,
est_comments,
)
# Insert at the start of the formatted lines list, position 0.
formatted_lines.insert(0, estimated_line)
# Get the averages of both the total amounts and the percentages.
# If there's no data, set the averages to zero.
try:
num_avg_uniques = round(sum(all_uniques) / len(all_uniques), 2)
num_avg_pageviews = round(sum(all_pageviews) / len(all_pageviews), 2)
except ZeroDivisionError:
num_avg_uniques = num_avg_pageviews = 0
# Make sure we have month over month data, because if we don't have
# more than one month's worth of data, we can't calculate the
# average per month increase.
if len(all_uniques_changes) > 0 and len(all_pageviews_changes) > 0:
num_avg_uniques_change = round(sum(all_uniques_changes) / len(all_uniques_changes), 2)
num_pageviews_changes = round(sum(all_pageviews_changes) / len(all_pageviews_changes), 2)
else:
num_avg_uniques_change = num_pageviews_changes = 0
# Form the Markdown for the "Average" section.
average_section = (
"* *Average Monthly Uniques*: {:,}\n* *Average Monthly Pageviews*: {:,}\n"
"* *Average Monthly Uniques Change*: {:+}%"
"\n* *Average Monthly Pageviews Change*: {:+}%\n"
)
average_section = average_section.format(
num_avg_uniques, num_avg_pageviews, num_avg_uniques_change, num_pageviews_changes
)
# Get the difference of the top months from the average and
# form the Markdown for the "Top" section that follows.
# Get the percentage increase for uniques and pageviews.
if top_uniques is not None and top_pageviews is not None:
if num_avg_uniques != 0 and num_avg_pageviews != 0:
i_uniques = (top_uniques - num_avg_uniques) / num_avg_uniques
i_pageviews = (top_pageviews - num_avg_pageviews) / num_avg_pageviews
top_increase_uniques = ", {:+.2%} more than the average month".format(i_uniques)
top_increase_pageviews = ", {:+.2%} more than the average month".format(i_pageviews)
else:
top_increase_uniques = top_increase_pageviews = ""
top_section = (
"* *Top Month for Uniques*: {} ({:,} uniques{})\n"
"* *Top Month for Pageviews*: {} ({:,} pageviews{})\n\n"
)
top_section = top_section.format(
top_month_uniques,
top_uniques,
top_increase_uniques,
top_month_pageviews,
top_pageviews,
top_increase_pageviews,
)
else:
# Leave it blank if there's not enough data to derive a
# top section.
top_section = ""
# Form the overall Markdown table with the header and body text.
header = (
"\n| Month | 📈 | Uniques | Uniques % Change | 📉 | "
"Pageviews | Pageviews % Change | Uniques : Pageviews | "
"\n|-------|----|---------|------------------|----|------|"
"--------------------|---------------------|\n"
)
body = average_section + top_section + header + "\n".join(formatted_lines)
return body
"""SUBREDDIT STATISTICS RETRIEVAL"""
def subreddit_pushshift_probe(test_count=5):
"""This function does a check of Pushshift to see if aggregations
are enabled or not, as Artemis depends on aggregations for some
statistics.
:param test_count: How many subreddit queries to test.
:return: `True` if aggregations are valid, `False` if they
are not.
"""
global AGGS_ENABLED
aggs_valid_count = 0
# Return early if there are fewer monitored subreddits than the test count.
if len(MONITORED_SUBREDDITS) < test_count:
AGGS_ENABLED = True
return
# Select some random subreddits to test.
random_selection = sample(MONITORED_SUBREDDITS, test_count)
# Get a timestamp from two weeks ago to test against.
two_weeks_prior = int(time.time() - 1209600)
start_search_at = timekeeping.month_convert_to_string(two_weeks_prior)
# We run a regular query to test if the database itself is up,
# then follow that up with an aggregations query. If aggregations
# are off, then there will be no `aggs` in the result for the query.
for subreddit in random_selection:
regular_query = (
"https://api.pushshift.io/reddit/search/"
"submission/?subreddit={}&after={}"
"&size=25".format(subreddit, start_search_at)
)
aggs_query = regular_query + "&aggs=author"
regular_data = subreddit_pushshift_access(regular_query)
aggs_data = subreddit_pushshift_access(aggs_query)
if regular_data and "aggs" in aggs_data:
logger.info("Pushshift Probe: r/{} aggregations are valid.".format(subreddit))
aggs_valid_count += 1
else:
logger.info("Pushshift Probe: r/{} aggregations are invalid.".format(subreddit))
# If there was no valid aggregate data, return False.
logger.info(
"Pushshift Probe: Detected {} valid aggregation "
"results out of {} tests.".format(aggs_valid_count, test_count)
)
if aggs_valid_count > 0:
AGGS_ENABLED = True
else:
AGGS_ENABLED = False
logger.info("Pushshift Probe: Aggregations are currently invalid.")
return
def subreddit_pushshift_access(query_string, retries=3, stream_possible=False):
"""This function is called by others as the main point of query to
Pushshift. It contains code to account for JSON decoding errors and
to retry if it encounters such problems. It also converts JSON data
into a Python dictionary.
:param query_string: The exact API call we want to make.
:param retries: The number of times (as an integer) that we want to
try connecting to the API. Default is 3.
:param stream_possible: Boolean that tells whether or not this query
can be covered by the stream database. If
aggregations are disabled and this is `True`
then the function will consult the stream.
:return: An empty dictionary if there was a connection error,
otherwise, a dictionary.
"""
# A temporary check to see if aggregations are currently active.
# If not and this isn't something we can get stream data for,
# the function returns an empty dictionary straightaway,
# as it will not get real data anyway.
if "&aggs" in query_string and not AGGS_ENABLED:
if stream_possible:
return stream_query_access(query_string)
else:
return {}
# Regular function iteration.
for _ in range(retries):
try:
returned_data = requests.get(query_string)
returned_data = returned_data.json()
return returned_data # Return data as soon as it is found.
except (ValueError, ConnectionError, HTTPError, requests.exceptions.ChunkedEncodingError):
continue
return {}
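# Illustrative usage of the above (hypothetical subreddit), modeled
# on the aggregation queries used elsewhere in this module:
#     query = ("https://api.pushshift.io/reddit/search/submission/"
#              "?subreddit=translator&after=2021-01-01"
#              "&aggs=created_utc&frequency=month&size=0")
#     data = subreddit_pushshift_access(query, stream_possible=True)
#     # On success, `data` resembles:
#     # {"aggs": {"created_utc": [{"key": 1609459200,
#     #                            "doc_count": 321}, ...]}, ...}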
def subreddit_subscribers_recorder(subreddit_name, check_pushshift=False):
"""A quick routine that gets the number of subscribers for a
specific subreddit and saves it to our database.
This is intended to be run daily at midnight UTC.
:param subreddit_name: The name of a Reddit subreddit.
:param check_pushshift: Whether we want to get the live count of
the subscribers from Reddit (normal mode)
or we want to try and get the more accurate
one from Pushshift. This is because Artemis
may have been added at the end of a UTC day
and its current subscriber count would not
be as accurate as an earlier one.
:return: Nothing.
"""
# Get the date by converting the time to YYYY-MM-DD in UTC.
current_time = time.time()
current_day = timekeeping.convert_to_string(current_time)
# `check_pushshift`: We want to get a more accurate count from the
# start of the day. Set `current_subs` to `None` if there is no
# information retrieved. If we can get data, it'll be in a dict
# format: {'2018-11-11': 9999}
if check_pushshift:
ps_subscribers = subreddit_subscribers_pushshift_historical_recorder(
subreddit_name, fetch_today=True
)
if len(ps_subscribers.keys()) == 0:
current_subs = None
else:
current_subs = ps_subscribers[current_day]
else:
current_subs = None
# Get the current state of subscribers. If an exception is thrown
# the subreddit is likely quarantined or banned.
if current_subs is None:
try:
current_subs = reddit.subreddit(subreddit_name).subscribers
except prawcore.exceptions.Forbidden:
current_subs = 0
# Insert the subscribers information into our database.
data_package = {current_day: current_subs}
logger.debug(
"Subscribers Recorder: {}, r/{}: {:,} subscribers.".format(
current_day, subreddit_name, current_subs
)
)
database.subscribers_insert(subreddit_name, data_package)
return
def subreddit_subscribers_retriever(subreddit_name):
"""Function that looks at the stored subscriber data and returns it
as a Markdown table.
It keeps the daily information from the last 6 months and past that
only returns monthly information.
:param subreddit_name: The name of a Reddit subreddit.
:return: A Markdown-formatted table with the daily change in
subscribers and total number.
"""
formatted_lines = []
day_changes = []
# Check to see if this has been stored before.
# If there is, get the dictionary. Exit if there is no data.
subscriber_dictionary = database.subscribers_retrieve(subreddit_name)
if subscriber_dictionary is None:
return None
# Get the founding date of the subreddit, by checking the local
# database, or the object itself if not monitored. If the local
# database does not contain the date, check the subreddit.
try:
created = database.extended_retrieve(subreddit_name)["created_utc"]
founding_date = timekeeping.convert_to_string(created)
except (KeyError, TypeError):
try:
founding_epoch = reddit.subreddit(subreddit_name).created_utc
founding_date = timekeeping.convert_to_string(founding_epoch)
except prawcore.exceptions.Forbidden:
# No access to a private subreddit. In case of this very
# unlikely situation, set the founding date to the epoch.
founding_date = "1970-01-01"
# Iterate over the data. Format the lines together and get their net
# change as well. We sort in this case, by newest first.
list_of_dates = list(sorted(subscriber_dictionary.keys()))
list_of_dates.reverse()
for date in list_of_dates:
day_index = list_of_dates.index(date)
logger.debug(
"Subscribers Retriever for r/{}: {}, index {}".format(subreddit_name, date, day_index)
)
# Get some date variables and the template for each line.
day_t = datetime.datetime.strptime(date, "%Y-%m-%d").date()
previous_day = day_t + datetime.timedelta(-1)
previous_day = str(previous_day.strftime("%Y-%m-%d"))
line = "| {} | {:,} | {:+,} |"
subscriber_count = subscriber_dictionary[date]
# This is a regular day in the last 180 entries. If we are past
# 180 days (about half a year) then we get only the starts of
# the months for a shorter table.
day_limit = SETTINGS.num_display_subscriber_days
if previous_day in subscriber_dictionary and day_index <= day_limit:
subscriber_previous = subscriber_dictionary[previous_day]
net_change = subscriber_count - subscriber_previous
elif day_index > day_limit and "-01" in date[-3:]:
# Try to get the previous month's entry, which is not
# immediately previous to this one.
try:
later_line = formatted_lines[-1]
later_date = later_line.split("|")[1].strip()
subscriber_later = int(later_line.split("|")[2].strip().replace(",", ""))
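# e.g. a previous line of "| 2020-06-01 | 12,345 | +23 |" yields
# later_date "2020-06-01" and subscriber_later 12345.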
except IndexError:
later_date = founding_date
subscriber_later = 1
# Get the average change of subscribers per day.
days_difference = timekeeping.num_days_between(later_date, date)
if days_difference != 0:
subscriber_delta = subscriber_later - subscriber_count
net_change = round(subscriber_delta / days_difference, 2)
else:
net_change = 0
else:
continue
# If there was a change, append the change to our list.
if net_change != 0:
day_changes.append(net_change)
new_line = line.format(date, subscriber_count, net_change)
if day_index <= day_limit or (day_index > day_limit and "-01" in date[-3:]):
formatted_lines.append(new_line)
# Get the average change of subscribers per day.
if len(day_changes) >= 2:
average_change = sum(day_changes) / len(day_changes)
average_change_text = "*Average Daily Change (overall)*: {:+,.2f} subscribers\n\n"
average_change_section = average_change_text.format(average_change)
# If the subreddit is actually growing, get the average growth.
milestone_estimated = subreddit_subscribers_estimator(subreddit_name)
if milestone_estimated is not None:
average_change_section += "{}\n\n".format(milestone_estimated)
else:
average_change_section = ""
# Get the milestone chart, if possible. This charts which days the
# subreddit reached a certain number of subscribers.
milestone_section = subreddit_subscribers_milestone_chart_former(subreddit_name)
if milestone_section is None:
milestone_section = ""
# Format the actual body of the table.
subscribers_header = (
"\n\n### Log\n\n"
"| Date | Subscribers | Average Daily Change |\n"
"|------|-------------|----------------------|\n"
)
# The last line is appended, which is the start date of the sub,
# and form the text together.
founding_line = "\n| {} | Created | --- |".format(founding_date)
body = average_change_section + milestone_section
body += subscribers_header + "\n".join(formatted_lines) + founding_line
return body
def subreddit_subscribers_estimator(subreddit_name):
"""This function tries to estimate how long it'll be until the
subreddit reaches the next subscriber milestone. This is based off
the value `sample_size`, which is the most recent number of entries
that are evaluated.
:param subreddit_name: The name of a Reddit subreddit.
:return: A Markdown string with the average daily change and the
estimated next milestone, or `None` if there is no stored data.
"""
next_milestone = None
sample_size = SETTINGS.subscriber_sample_size
last_few_entries = []
# Access the database.
results = database.subscribers_retrieve(subreddit_name)
# Exit if there is no data.
if results is None:
return None
# Look through our results, specifically the last X ones of
# `sample_size`. In this case it means we look through the last two
# weeks to get the average. We order the data from newest first to
# oldest last.
last_few_days = list(sorted(results.keys()))[-sample_size:]
last_few_days.reverse()
for day in last_few_days:
last_few_entries.append(results[day])
# Get the current number of subscribers, and calculate the average
# daily change in recent days.
current_number = results[last_few_days[0]]
average_changes = [s - t for s, t in zip(last_few_entries, last_few_entries[1:])]
average_daily_change = sum(average_changes) / len(average_changes)
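# Worked example (hypothetical): entries of [1050, 1040, 1025]
# (newest first) give changes of [10, 15] and an average daily
# change of +12.50 subscribers.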
# Iterate over the milestones. Calculate the next milestone this
# subreddit will reach.
for milestone in SETTINGS.milestones:
if milestone > current_number:
next_milestone = milestone
break
# Format the daily change text.
if average_daily_change != 0:
average_daily_format = (
"*Average Daily Change (last {} entries)*: "
"{:+,.2f} subscribers\n\n".format(sample_size, average_daily_change)
)
else:
average_daily_format = None
# We now know what the next milestone is. Calculate the number of
# days until then.
if next_milestone is not None:
# Check how many days we estimate until the next milestone.
difference_between = next_milestone - current_number
if average_daily_change != 0:
days_until_milestone = int(difference_between / float(average_daily_change))
else:
days_until_milestone = 10000 # Assign it a really long number
unix_next_milestone = time.time() + (days_until_milestone * 86400)
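# Worked example (hypothetical): at 2,150 subscribers and growing
# +12.5 per day, a next milestone of 2,500 is 350 subscribers
# away, or int(350 / 12.5) = 28 days from now.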
# If the days are too far from now (over two years) or if the
# subreddit is shrinking, don't return milestones.
if days_until_milestone > SETTINGS.subscriber_milestone_upper or days_until_milestone < 0:
milestone_format = None
else:
# Format the next milestone as a string. If the next
# milestone is within four months, just include it as days.
# Otherwise, format the next time string in months instead.
unix_next_milestone_string = timekeeping.convert_to_string(unix_next_milestone)
if days_until_milestone <= SETTINGS.subscriber_milestone_format_days:
if days_until_milestone == 0:
time_until_string = "(today!)"
else:
time_until_string = "({} days from now)".format(days_until_milestone)
else:
# Otherwise, format it as months. 30.44 is the average
# number of days in a month.
time_until_string = "({:.2f} months from now)".format(days_until_milestone / 30.44)
milestone_format = "*Next Subscriber Milestone (estimated)*: {:,} subscribers on {} {}"
milestone_format = milestone_format.format(
next_milestone, unix_next_milestone_string, time_until_string
)
else:
milestone_format = None
# Then we put together the two lines, if possible.
if average_daily_format is not None:
if milestone_format is not None:
returned_body = average_daily_format + milestone_format
else:
returned_body = average_daily_format
return returned_body
else:
return None
def subreddit_subscribers_milestone_chart_former(subreddit_name):
"""This function is backwards-looking; that is, it looks back and
determines when a subreddit passed certain subscriber milestones.
:param subreddit_name: The name of a Reddit subreddit.
:return: A Markdown table.
"""
# Create a dictionary with milestones to derive the chart from.
dictionary_milestones = {}
formatted_lines = []
# Check to see if we have stored data. We order this by date,
# oldest item first, newest item last.
dictionary_total = database.subscribers_retrieve(subreddit_name)
# Exit if there is no data.
if dictionary_total is None:
return None
# Get the last number of recorded subscribers.
current_subscribers = dictionary_total[list(sorted(dictionary_total.keys()))[-1]]
milestones_to_check = [x for x in SETTINGS.milestones if x <= current_subscribers]
# We iterate over the data we have, starting with the OLDEST date
# we have data for.
for date in list(sorted(dictionary_total.keys())):
date_subscribers = dictionary_total[date]
# Iterate over the subscriber milestones that we have defined.
for milestone in milestones_to_check:
if date_subscribers > milestone:
continue
else:
# We get the next day for the milestone.
d_type = "%Y-%m-%d"
time_delta = datetime.timedelta(days=1)
next_day = (datetime.datetime.strptime(date, d_type) + time_delta).strftime(d_type)
# We add the next date here.
dictionary_milestones[milestone] = next_day
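# Because later dates keep overwriting the entry, the final value
# for each milestone is the day after the last date on which the
# count was still at or below it, i.e. the date it was passed.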
# Iterate over the dictionary of milestones to make sure to remove
# any milestone that might not yet be attained. Delete the key if
# it's somehow larger than our current subscriber count.
for key in [key for key in dictionary_milestones if key > current_subscribers]:
del dictionary_milestones[key]
# Form the Markdown table from our dictionary of data. We also add
# a first entry for the founding of the sub.
header = (
"### Milestones\n\n\n"
"| Date Reached | Subscriber Milestone | Average Daily Change "
"| Days From Previous Milestone |\n"
"|--------------|----------------------|----------------------|"
"------------------------------|\n"
)
founding_date = timekeeping.convert_to_string(reddit.subreddit(subreddit_name).created_utc)
founding_line = "\n| {} | Created | --- |\n\n".format(founding_date)
for milestone, date in list(sorted(dictionary_milestones.items())):
# If we have previous items in this list, we want to calculate
# the daily growth between this milestone and the previous one.
if len(formatted_lines) != 0:
previous_date = formatted_lines[-1].split("|")[1].strip()
previous_milestone = int(formatted_lines[-1].split("|")[2].strip().replace(",", ""))
else:
# If there is no previous entry, we start from the founding
# of the subreddit.
previous_date = founding_date
previous_milestone = 1
# Calculate the number of days between the two milestones and
# the changes in between. Start by obtaining the subscriber
# change between the last milestone.
milestone_delta = milestone - previous_milestone
days_difference = timekeeping.num_days_between(previous_date, date)
# Calculate the average daily change. If the difference in days
# is zero, set the delta value to a generic string.
if days_difference != 0:
daily_delta = "{:+,.2f}".format(milestone_delta / days_difference)
else:
daily_delta = "---"
# Create a new line for the table and add it to our list.
new_line = "| {} | {:,} | {} | {} |".format(date, milestone, daily_delta, days_difference)
formatted_lines.append(new_line)
# Join everything together, sorted by newest first, and replace
# any double linebreaks so that the table is intact. Milestones
# larger than the current subscriber count were already removed
# above.
formatted_lines.reverse()
body = "{}{}".format(header, "\n".join(formatted_lines)) + founding_line
body = body.replace("\n\n", "\n")
return body
def subreddit_subscribers_pushshift_historical_recorder(subreddit_name, fetch_today=False):
"""Pushshift's API stores subscriber data for subreddits from about
2018-03-15. This function will go back until then and get the
subscribers for each day if it can, namely by grabbing data in
chunks and analyzing them for subscriber count.
:param subreddit_name: Name of a subreddit.
:param fetch_today: Whether we should get just today's stats, or a
list of stats from March 15, 2018 onwards.
:return: A dictionary of dates (YYYY-MM-DD) and their subscriber
counts, e.g. {'2018-11-11': 9999}.
"""
subscribers_dictionary = {}
chunk_size = SETTINGS.pushshift_subscriber_chunks
logger.info(
"Subscribers PS: Retrieving historical subscribers for r/{}...".format(subreddit_name)
)
# If we just want to get today's stats just create a list with today
# as the only component. Otherwise, fetch a list of days since March
# 15, which is when subscriber information became available on
# Pushshift's database.
if not fetch_today:
yesterday = int(time.time()) - 86400
yesterday_string = timekeeping.convert_to_string(yesterday)
# Insert check for subreddit age. If the subreddit was created
# after the default start date of March 15, 2018, use the
# creation date as the starting point instead.
subreddit_created = int(reddit.subreddit(subreddit_name).created_utc)
subreddit_created_date = timekeeping.convert_to_string(subreddit_created)
if SETTINGS.pushshift_subscriber_start > subreddit_created_date:
start_date = SETTINGS.pushshift_subscriber_start
else:
start_date = subreddit_created_date
logger.info("Subscribers PS: Retrieval will start from {}.".format(start_date))
list_of_days_to_get = timekeeping.get_series_of_days(start_date, yesterday_string)
else:
today_string = timekeeping.convert_to_string(time.time())
list_of_days_to_get = [today_string]
api_search_query = (
"https://api.pushshift.io/reddit/search/submission/"
"?subreddit={}&after={}&before={}&sort_type=created_utc"
"&fields=subreddit_subscribers,created_utc&size=750"
)
# Get the data from Pushshift as JSON. We try to get a submission
# per day and record the subscribers.
list_chunked = [
list_of_days_to_get[i : i + chunk_size]
for i in range(0, len(list_of_days_to_get), chunk_size)
]
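# e.g. (hypothetical) a chunk_size of 90 splits 200 days into
# chunks of 90, 90, and 20 days.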
# Iterate over our chunks of days.
for chunk in list_chunked:
processed_days = []
# Set time variables.
first_day = chunk[0]
start_time = timekeeping.convert_to_unix(first_day)
last_day = chunk[-1]