diff --git a/.gitignore b/.gitignore
index 54e3a89..d2b099b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,13 +1,11 @@
 follower/data/mementos.txt
 follower/data/NonParsedMementos.txt
-core/config/data/followercount
 */__pycache__
 */*/__pycache__
 *.pyc
 output/
 .RData
 dist/
-core/
 build/
 mementos.txt
 test.py
diff --git a/fch/core/__init__.py b/fch/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fch/core/config/__init__.py b/fch/core/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fch/core/config/configreader.py b/fch/core/config/configreader.py
new file mode 100644
index 0000000..1067c2c
--- /dev/null
+++ b/fch/core/config/configreader.py
@@ -0,0 +1,44 @@
+import pickle
+import os
+
+
+class ConfigurationReader:
+    """
+    This class reads the pickled configuration of the program.
+
+    Attributes:
+        __conf_record (dict): Configuration dictionary
+        start_time (int): Start time of the analysis
+        end_time (int): End time of the analysis
+        out (str): Default output path
+        debug (bool): Debug mode
+        tlangs (list): List of Twitter languages
+        dbname (str): Database name
+        tdomain (list): List of Twitter domains
+        tpath (str): Twitter path
+        turl (str): Twitter URL template
+        frequency (int): Sampling frequency
+        intermediate (str): Intermediary directory
+    """
+    def __init__(self, db_config=None):
+        """
+        This is the constructor for the ConfigurationReader class.
+
+        Parameters:
+            db_config (str): Name of the configuration file (defaults to "followercount")
+        """
+        self.db_config = db_config if db_config else "followercount"
+        with open(os.path.join(os.path.dirname(__file__), "data", self.db_config), "rb") as ofile:
+            self.__conf_record = pickle.load(ofile)
+        self.start_time = self.__conf_record["Start_Timestamp"]
+        self.end_time = self.__conf_record["End_Timestamp"]
+        self.out = self.__conf_record["Output_Dir"]
+        self.debug = self.__conf_record["Debug_Mode"]
+        self.tlangs = self.__conf_record["Twitter_Languages"]
+        self.dbname = self.__conf_record["Database"]
+        self.tdomain = self.__conf_record["Twitter_Domain"]
+        self.tpath = self.__conf_record["Twitter_Path"]
+        self.turl = self.__conf_record["Twitter_Url"]
+        self.frequency = self.__conf_record["Frequency"]
+        self.intermediate = self.__conf_record["Internediary_Dir"]
\ No newline at end of file
diff --git a/fch/core/config/configwriter.py b/fch/core/config/configwriter.py
new file mode 100644
index 0000000..c59dcc4
--- /dev/null
+++ b/fch/core/config/configwriter.py
@@ -0,0 +1,39 @@
+import pickle
+import configparser
+import os
+
+
+class ConfigurationWriter:
+    """
+    This class writes the configuration of the program.
+
+    Attributes:
+        __config (ConfigParser): ConfigParser object
+        __conf_path (str): Path to the configuration directory
+        __conf_record (dict): Configuration dictionary
+    """
+
+    def __init__(self, **kwargs):
+        """
+        This is the constructor for the ConfigurationWriter class.
+
+        Parameters:
+            kwargs (dict): Keyword arguments overriding values from config.ini
+        """
+        self.__config = configparser.ConfigParser()
+        self.__conf_path = os.path.join(os.path.dirname(__file__), "data")
+        self.__config.read(os.path.join(self.__conf_path, "config.ini"))
+        self.__conf_record = dict(Start_Timestamp=kwargs.get('st', int(self.__config['SETUP']['START_TIMESTAMP'])),
+                                  End_Timestamp=kwargs.get('et', int(self.__config['SETUP']['END_TIMESTAMP'])),
+                                  Output_Dir=kwargs.get('f', None),
+                                  Internediary_Dir=self.__config['SETUP']['INTERMIDIARY_DIR'],
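+                                  # Note: the "Internediary_Dir" key (and INTERMIDIARY_DIR in config.ini)
+                                  # is misspelled but kept as-is; the bundled pickle files use this key.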
+                                  Frequency=kwargs.get('freq', int(self.__config['SETUP']['FREQUENCY'])),
+                                  Database=kwargs.get("db_conf", "followercount"),
+                                  Debug_Mode=kwargs.get("debug", self.__config['SETUP']['DEBUG_MODE'] == "True"),
+                                  Twitter_Languages=self.__config['TWITTER']['LANGUAGES'].split(" "),
+                                  Twitter_Domain=self.__config['TWITTER']['DOMAIN'].split(" "),
+                                  Twitter_Path=self.__config['TWITTER']['PATH'],
+                                  Twitter_Url=self.__config['TWITTER']['URL'])
+        with open(os.path.join(self.__conf_path, self.__conf_record["Database"]), "wb") as ofile:
+            pickle.dump(self.__conf_record, ofile)
diff --git a/fch/core/config/data/README_config.txt b/fch/core/config/data/README_config.txt
new file mode 100644
index 0000000..e1a92e3
--- /dev/null
+++ b/fch/core/config/data/README_config.txt
@@ -0,0 +1,70 @@
+********************
+[TWITTER]
+********************
+
+**********
+MODE:
+This sets the mode of your analysis for finding deleted tweets.
+
+Supported Values:
+0: Get deleted tweets based on the Twitter API's user_timeline method (most recent 3200 tweets)
+1: Provide a user-defined start and end time for the analysis
+**********
+
+**********
+START_TIMESTAMP:
+This sets the start timestamp for fetching mementos. It accepts a value in Memento datetime format (e.g., 20190107235959).
+* Note: MODE 1 is required for using this option.
+**********
+
+**********
+END_TIMESTAMP:
+This sets the end timestamp for fetching mementos. It accepts a value in Memento datetime format (e.g., 20190807235959).
+* Note: MODE 1 is required for using this option.
+**********
+
+
+**********
+URL_CANONICALIZATION: F
+This option enables the creation of canonicalized Twitter URLs.
+For example, a Twitter URL will be expanded with language variations and the with_replies path:
+
+https://twitter.com/dougjones (1 URL)
+https://twitter.com/dougjones/with_replies (1 URL)
+https://twitter.com/dougjones?lang=en (47 URLs for 47 languages)
+https://twitter.com/dougjones/with_replies?lang=en (47 URLs for 47 languages)
+Total: 96 URLs for each URI-R
+
+Supported Values:
+T: Set to True
+F: Set to False
+**********
+
+
+********************
+[COMMON]
+********************
+
+**********
+OUTPUT_DIR:
+This option sets the default output directory.
+**********
+
+**********
+DEBUG_MODE:
+This option sets the debug mode.
+
+Supported Values:
+True: Set to True
+False: Set to False
+**********
+
+********************
+[TWITTER_LANGUAGES]
+********************
+
+**********
+LANGUAGES: fr en ar ja es de it id pt ko tr ru nl fil ms zh-tw zh-cn hi no sv fi da pl hu fa he ur th uk ca ga el eu cs gl ro hr en-gb vi bn bg sr sk gu mr ta kn
+
+This option lists all the languages supported in Twitter URLs. When a Twitter
+URL contains a language that is not already present in this list, add it to the list.
+**********
\ No newline at end of file
diff --git a/fch/core/config/data/config.ini b/fch/core/config/data/config.ini
new file mode 100644
index 0000000..6386f79
--- /dev/null
+++ b/fch/core/config/data/config.ini
@@ -0,0 +1,13 @@
+[SETUP]
+START_TIMESTAMP: -1
+END_TIMESTAMP: -1
+OUTPUT: None
+INTERMIDIARY_DIR: /tmp
+DEBUG_MODE: False
+FREQUENCY: 0
+
+[TWITTER]
+LANGUAGES: fr en ar ja es de it id pt ko tr ru nl fil ms zh-tw zh-cn hi no sv fi da pl hu fa he ur th uk ca ga el eu cs gl ro hr en-gb vi bn bg sr sk gu mr ta kn
+DOMAIN: https://twitter.com https://mobile.twitter.com
+PATH: with_replies
+URL: scheme://domain/handle/path?lang=lang
diff --git a/fch/core/config/data/deletedtweets b/fch/core/config/data/deletedtweets
new file mode 100644
index 0000000..01ee13d
Binary files /dev/null and b/fch/core/config/data/deletedtweets differ
diff --git a/fch/core/config/data/followercount b/fch/core/config/data/followercount
new file mode 100644
index 0000000..be0a154
Binary files /dev/null and b/fch/core/config/data/followercount differ
diff --git a/fch/core/datamanager.py b/fch/core/datamanager.py
new file mode 100644
index 0000000..5e677a5
--- /dev/null
+++ b/fch/core/datamanager.py
@@ -0,0 +1,308 @@
+import os
+import requests
+import json
+import csv
+import time
+import sys
+
+from warcio.warcwriter import WARCWriter
+from warcio.statusandheaders import StatusAndHeaders
+from warcio.archiveiterator import ArchiveIterator
+
+from fch.core.utils.util_functions import Utils
+
+
+class DataManager:
+    """
+    This class is for Data Management.
+
+    Attributes:
+        __config (ConfigurationReader): Configuration object
+        __constants (Constants): For constants
+        __memento_dir (str): Memento directory
+        __timemap_dir (str): TimeMap directory
+        __pmemento_dir (str): Parsed Memento directory
+        __dtweet_dir (str): Deleted Tweets directory
+        __json_dir (str): JSON files directory
+    """
+    def __init__(self, config, constants):
+        """
+        This is the constructor for the DataManager class.
+
+        Parameters:
+            config (ConfigurationReader): Configuration object
+            constants (Constants): For constants
+        """
+        self.__config = config
+        self.__constants = constants
+        self.__memento_dir = os.path.join(self.__config.intermediate, "Mementos")
+        self.__timemap_dir = os.path.join(self.__config.intermediate, "TimeMap")
+        self.__pmemento_dir = os.path.join(self.__config.intermediate, "ParsedMementos")
+        self.__dtweet_dir = os.path.join(self.__config.intermediate, "DeletedTweets")
+        self.__json_dir = os.path.join(self.__config.intermediate, "JsonOutputs")
+
+    def set_twitter_handle(self, thandle):
+        """
+        Sets the Twitter handle.
+
+        Parameters:
+            thandle (str): Twitter handle
+        """
+        self.__thandle = thandle
+
+    def write_memento(self, murl=None):
+        """
+        This function writes a memento as a WARC record.
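+        Records are stored under Mementos/<handle>/<domain>/<archive>/<wrep><lang>/<timestamp>.warc.gz.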
+
+        Parameters:
+            murl (str): URI-M
+
+        Returns:
+            (bool): True on Success and False on Failure
+        """
+        try:
+            if self.lookup_memento(murl):
+                return True
+            else:
+                response = Utils.get_murl_info(murl, self.__thandle)
+                mpath = os.path.join(self.__memento_dir, response["handle"].lower(), response["domain"],
+                                     response["archive"], response["wrep"] + response["lang"])
+                os.makedirs(mpath, exist_ok=True)
+                try:
+                    mpath = os.path.join(mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
+                    with open(mpath, "wb") as output:
+                        writer = WARCWriter(output, gzip=True)
+                        resp = requests.get(murl,
+                                            headers={'Accept-Encoding': 'identity'},
+                                            stream=True, timeout=120)
+
+                        # get raw headers from urllib3
+                        headers_list = resp.raw.headers.items()
+                        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.1')
+                        record = writer.create_warc_record(mpath, 'response',
+                                                           payload=resp.raw,
+                                                           http_headers=http_headers)
+                        writer.write_record(record)
+                        return True
+                except requests.exceptions.TooManyRedirects as err:
+                    sys.stderr.write(murl + " Too many redirects" + "\n")
+                except requests.exceptions.ConnectTimeout as err:
+                    sys.stderr.write(murl + " Connection timeout" + "\n")
+                except Exception as e:
+                    sys.stderr.write("Memento Write Error: " + str(e) + " URL: " + murl + "\n")
+        except Exception as e:
+            sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
+        return False
+
+    def read_memento(self, murl=None):
+        """
+        This function reads memento content.
+
+        Parameters:
+            murl (str): URI-M
+
+        Returns:
+            (str): Content on Success and None on Failure
+        """
+        mpath = self.lookup_memento(murl)
+        response = Utils.get_murl_info(murl, self.__thandle)
+        if mpath:
+            if self.__constants.WARC_EXT in mpath:
+                try:
+                    with open(mpath, 'rb') as stream:
+                        for record in ArchiveIterator(stream):
+                            if record.rec_type == 'response':
+                                if self.__config.debug: sys.stdout.write(str(murl) + " Content Size: " + str(record.rec_headers.get_header('Content-Length')) + "\n")
+                                # Heuristic: discard implausibly small captures (truncated or error pages)
+                                if (int(response["timestamp"]) < 20090101000000 and int(record.rec_headers.get_header('Content-Length')) < 1000) or (int(response["timestamp"]) > 20200101000000 and int(record.rec_headers.get_header('Content-Length')) < 100000):
+                                    return None
+                                else:
+                                    return record.content_stream().read()
+                except Exception as e:
+                    sys.stderr.write("Memento Read Error: " + str(e) + "\n")
+            elif ".html" in mpath:
+                try:
+                    with open(mpath, "r") as stream:
+                        return stream.read()
+                except Exception as e:
+                    sys.stderr.write("Memento Read Error: " + str(e) + "\n")
+        return None
+
+    def lookup_memento(self, murl=None):
+        """
+        This function looks up a memento on disk.
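+        It first checks for a WARC record (as written by write_memento), then falls back to a legacy .html capture.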
+
+        Parameters:
+            murl (str): URI-M
+
+        Returns:
+            (str): Path of the memento on Success and None on Failure
+        """
+        try:
+            response = Utils.get_murl_info(murl, self.__thandle)
+            # The directory layout must mirror write_memento (wrep and lang form one path component)
+            mpath = os.path.join(self.__memento_dir, response["handle"].lower(), response["domain"], response["archive"],
+                                 response["wrep"] + response["lang"], response["timestamp"] + self.__constants.WARC_EXT)
+            if os.path.exists(mpath) and os.stat(mpath).st_size > 0:
+                return mpath
+            else:
+                # Legacy layout for mementos saved as plain HTML
+                mpath = os.path.join(self.__memento_dir, response["handle"].lower(), response["archive"],
+                                     response["wrep"], response["lang"], response["timestamp"] + ".html")
+                if os.path.exists(mpath):
+                    return mpath
+            return None
+        except Exception as e:
+            sys.stderr.write("Memento Lookup Error: " + str(murl) + " " + str(e) + "\n")
+
+    def write_timemap(self, turl=None, tm_content=None):
+        """
+        This function writes a TimeMap.
+
+        Parameters:
+            turl (str): Twitter URL
+            tm_content (str): TimeMap content
+        Returns:
+            (bool): True on Success and False on Failure
+        """
+        tresponse = Utils.get_turl_info(turl)
+        tmpath = os.path.join(self.__timemap_dir, tresponse["handle"].lower(), tresponse["domain"],
+                              tresponse["wrep"] + tresponse["lang"])
+        os.makedirs(tmpath, exist_ok=True)
+        millis = int(round(time.time() * 1000))
+        try:
+            tmpath = os.path.join(tmpath, str(Utils.epochtime_to_memento(millis)) + self.__constants.TM_EXT)
+            with open(tmpath, "w") as tm_ofile:
+                tm_ofile.write(tm_content)
+            return True
+        except Exception as e:
+            sys.stderr.write("TimeMap Write Error: " + str(e) + "\n")
+            return False
+
+    def read_timemap(self, turl=None):
+        """
+        This function reads a TimeMap.
+
+        Parameters:
+            turl (str): Twitter URL
+
+        Returns:
+            (list): Content on Success and None on Failure
+        """
+        if self.lookup_timemap(turl):
+            try:
+                tresponse = Utils.get_turl_info(turl)
+                tmpath = os.path.join(self.__timemap_dir, tresponse["handle"].lower(),
+                                      tresponse["domain"], tresponse["wrep"] + tresponse["lang"])
+                urims = []
+                for time_map in os.listdir(tmpath):
+                    with open(os.path.join(tmpath, time_map), "r") as tm_ofile:
+                        for line in tm_ofile:
+                            if not (line.startswith("@") or line.startswith("!")):
+                                if line not in urims:
+                                    urims.append(line)
+                return urims
+            except Exception as e:
+                sys.stderr.write("TimeMap Read Error: " + str(e) + "\n")
+        return None
+
+    def lookup_timemap(self, turl=None):
+        """
+        This function checks whether a TimeMap exists.
+
+        Parameters:
+            turl (str): Twitter URL
+
+        Returns:
+            (bool): True on Success and False on Failure
+        """
+        try:
+            tresponse = Utils.get_turl_info(turl)
+            tmpath = os.path.join(self.__timemap_dir, tresponse["handle"].lower(),
+                                  tresponse["domain"], tresponse["wrep"] + tresponse["lang"])
+            if os.path.exists(tmpath) and len(os.listdir(tmpath)) > 0:
+                return True
+            return False
+        except Exception as e:
+            sys.stderr.write("LookUp TimeMap: " + str(turl) + " " + str(e) + "\n")
+
+    def write_follower_count(self, thandle="john", fcontent=None):
+        """
+        This function writes the follower count data.
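+        With an output directory configured, rows are written to output/followerCSV/<handle>.csv; otherwise they are dumped to stdout as JSON.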
+
+        Parameters:
+            thandle (str): Twitter Handle
+            fcontent (dict): File content
+        Returns:
+            (bool): True on Success and False on Failure
+        """
+        try:
+            if self.__config.out:
+                fpath = os.path.join(os.getcwd(), "output")
+                if not os.path.exists(fpath):
+                    os.mkdir(fpath)
+                fpath = os.path.join(fpath, "followerCSV")
+                if not os.path.exists(fpath):
+                    os.mkdir(fpath)
+                if not os.path.exists(os.path.join(os.getcwd(), "output", "graphs")):
+                    os.mkdir(os.path.join(os.getcwd(), "output", "graphs"))
+                with open(os.path.join(fpath, thandle + ".csv"), "w") as csv_file:
+                    fieldnames = ["MementoTimestamp", "URI-M", "FollowerCount", "DateTime"]
+                    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+                    writer.writeheader()
+                    for row in fcontent:
+                        writer.writerow(row)
+            else:
+                for row in fcontent:
+                    row.pop("DateTime")
+                fcontent = json.dumps(fcontent)
+                sys.stdout.write(str(fcontent))
+            return True
+        except Exception as e:
+            sys.stderr.write("write_follower_count: " + str(e) + "\n")
+            return False
+
+    def lookup_follower_count(self, thandle="john"):
+        """
+        This function looks up the follower count CSV for a handle.
+
+        Parameters:
+            thandle (str): Twitter Handle
+
+        Returns:
+            (bool): True if a CSV exists for the handle, False otherwise
+        """
+        if self.__config.out:
+            fpath = os.path.join(os.getcwd(), "output", "followerCSV")
+            if os.path.exists(os.path.join(fpath, thandle + ".csv")):
+                return True
+        return False
diff --git a/fch/core/mementodownloader.py b/fch/core/mementodownloader.py
new file mode 100644
index 0000000..7161cbb
--- /dev/null
+++ b/fch/core/mementodownloader.py
@@ -0,0 +1,99 @@
+import time
+import requests
+import sys
+import concurrent.futures
+from fch.core.utils.util_functions import Utils
+from fch.follower.followerparser import FollowerParser
+
+
+class MementoDownloader:
+    def __init__(self, thandle, turl, constants, dmanager, conf_reader):
+        self.__thandle = thandle
+        self.__turl = turl
+        self.__dmanager = dmanager
+        self.__constants = constants
+        self.__conf_reader = conf_reader
+        self.__parse_memento = FollowerParser(thandle, constants, dmanager, conf_reader)
+
+    def get_memento(self):
+        """
+        Gets URI-Ms and fetches the corresponding mementos using concurrent threads,
+        writing them out via the DataManager.
+        """
+        todo_frontier = self.__parse_timemap()
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Frontier: " + str(todo_frontier) + "\n")
+        if todo_frontier:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                for frontier_list in todo_frontier:
+                    future_result = {executor.submit(self.__download_memento, url):
+                                     url for url in frontier_list["urims"]}
+                    for future in concurrent.futures.as_completed(future_result):
+                        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: result: " + str(future.result()) + "\n")
+
+    def __parse_timemap(self):
+        """
+        Parses the CDXJ TimeMap and selects only the mementos within the analysis
+        time range (e.g., above the timestamp of the oldest of the 3200 most recent
+        tweets fetched from live Twitter).
+        """
+        todo_frontier = []
+
+        # Memento counters:
+        #   Index 0: total URLs
+        #   Index 1: already downloaded mementos
+        #   Index 2: mementos still to be downloaded
+        mcount = [0, 0, 0]
+        mintime, maxtime = Utils.get_timerange(self.__constants, self.__conf_reader)
+        if self.__conf_reader.debug: sys.stdout.write("__parse_timemap: Minimum Live Timestamp: {} Maximum Live Timestamp: {}".format(mintime, maxtime) + "\n")
+        timemap_content = Utils.parse_timemap(self.__dmanager, self.__constants, self.__turl,
+                                              self.__conf_reader, mintime, maxtime)
+        if self.__conf_reader.debug: sys.stdout.write("__parse_timemap: " + str(timemap_content) + "\n")
+        for memento in (timemap_content or []):
+            response = Utils.get_murl_info(memento, self.__thandle)
+            # Skip archive.is / archive.md mementos, as we do not parse them
+            if response["archive"] not in ["archive.is", "archive.md"]:
+                if response["timestamp"].isdigit():
+                    if mintime <= int(response["timestamp"]) <= maxtime:
+                        # Count the total number of mementos for the Twitter handle within the time range
+                        mcount[0] += 1
+                        memento_present = self.__dmanager.lookup_memento(memento)
+                        if memento_present:
+                            mcount[1] += 1
+                        else:
+                            mcount[2] += 1
+                            frontier_present = False
+                            for entry in todo_frontier:
+                                if entry["archive"] == response["archive"]:
+                                    frontier_present = True
+                                    entry["urims"].append(memento["uri"])
+                                    entry["count"] += 1
+                                    break
+                            if not frontier_present:
+                                json_object = {"archive": response["archive"], "count": 1,
+                                               "urims": [memento["uri"]]}
+                                todo_frontier.append(json_object)
+        # Write logs for each user
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Twitter Handle: " + self.__thandle + "\n")
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Date-Time: " + str(time.strftime("%b %d %Y %H:%M:%S", time.gmtime())) + "\n")
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Total Memento URLs: " + str(mcount[0]) + "\n")
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Number of Mementos already downloaded: " + str(mcount[1]) + "\n")
+        if self.__conf_reader.debug: sys.stdout.write("fetch_mementos: Number of Mementos for consideration: " + str(mcount[2]) + "\n")
+        return todo_frontier
+
+    def __download_memento(self, murl):
+        """
+        Fetches a single memento and writes the response via the DataManager.
+        """
+        if self.__conf_reader.debug: sys.stdout.write("__download_memento:" + murl + "\n")
+        try:
+            self.__dmanager.write_memento(murl)
+        except requests.exceptions.ConnectionError as err:
+            sys.stderr.write("__download_memento: ConnectionError: " + murl + " " + str(err) + "\n")
+        except Exception as err:
+            sys.stderr.write("__download_memento: " + murl + " " + str(err) + "\n")
diff --git a/fch/core/timemapdownloader.py b/fch/core/timemapdownloader.py
new file mode 100644
index 0000000..46878d8
--- /dev/null
+++ b/fch/core/timemapdownloader.py
@@ -0,0 +1,59 @@
+import requests
+import sys
+import subprocess
+
+class TimeMapDownloader:
+    """
+    This class is for downloading TimeMaps of a Twitter handle.
+
+    Note:
+        Always run a MemGator server
+        Command: memgator --contimeout=10s --agent=msiddiqu@cs.odu.edu server
+        Docker Command: docker run -p 1208:1208 ibnesayeed/memgator --contimeout=10s --agent=msiddiqu@cs.odu.edu
+        server
+
+    Attributes:
+        __thandle (str): Twitter Handle
+        __constants (Constants): For constants
+        __dmanager (DataManager): Allows Data Management
+        __conf_reader (ConfigReader): ConfigReader object
+    """
+    def __init__(self, thandle, constants, dmanager, config_reader):
+        """
+        This is the constructor for the TimeMapDownloader class.
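+        Note: fetch_timemap below is hardcoded to the public MemGator instance at
+        memgator.cs.odu.edu; swap in Constants.MEMGATOR_URL (see the commented-out
+        line there) to use a local MemGator server instead.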
+
+        Parameters:
+            thandle (str): Twitter Handle
+            constants (Constants): For constants
+            dmanager (DataManager): Allows Data Management
+            config_reader (ConfigReader): ConfigReader object
+        """
+        self.__thandle = thandle
+        self.__constants = constants
+        self.__dmanager = dmanager
+        self.__conf_reader = config_reader
+
+    def fetch_timemap(self, turl):
+        """
+        This function fetches the TimeMap for a Twitter URL.
+
+        Parameters:
+            turl (str): Twitter URL
+
+        Returns:
+            (bool): True on Success and False on Failure
+        """
+        # command = self.__constants.MEMGATOR_URL + self.__constants.MEMGATOR_FORMAT + self.__constants.FSLASH + turl
+        command = "https://memgator.cs.odu.edu/timemap/cdxj/" + turl
+        try:
+            response = requests.get(command)
+            # result = subprocess.run("docker container run -it --rm oduwsdl/memgator --format=cdxj --contact=msidd003@odu.edu --contimeout=10s https://twitter.com/m_nsiddique", shell=True, stdout=subprocess.PIPE, universal_newlines=True)
+            if response.status_code == 200:
+                self.__dmanager.write_timemap(turl, response.content.decode('ascii'))
+                if self.__conf_reader.debug: sys.stdout.write("fetch_timemap: " + str(response.status_code) + "\n")
+                return True
+            else:
+                if self.__conf_reader.debug: sys.stdout.write("fetch_timemap: " + str(response.status_code) + "\n")
+                if self.__conf_reader.debug: sys.stdout.write("fetch_timemap: No timemap found: " + turl + "\n")
+        except Exception as err:
+            sys.stderr.write("Fetch Timemap: Error: " + turl + " " + str(err) + "\n")
+        return False
diff --git a/fch/core/utils/__init__.py b/fch/core/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fch/core/utils/constants.py b/fch/core/utils/constants.py
new file mode 100644
index 0000000..f1b7f86
--- /dev/null
+++ b/fch/core/utils/constants.py
@@ -0,0 +1,96 @@
+class Constants:
+    """
+    This class is for Constants.
+
+    Attributes:
+        JSON_EXT (str): JSON file extension
+        TM_EXT (str): TimeMap file extension
+        PARSE_MEM_EXT (str): Parsed Memento file extension
+        WARC_EXT (str): Memento records stored as WARC
+        DT_EXT (str): Deleted Tweets file extension
+        USCORE (str): Underscore separator
+        FSLASH (str): Forward slash separator
+        ERROR404 (str): Represents 404 error text found in mementos
+        TWEET_DEL (str): Original Deleted Tweet
+        ACCT_DEL (str): Account Deleted
+        ACCT_SUS (str): Account Suspended
+        ACCT_PRI (str): Account Private
+        UNRT (str): Unretweet a Tweet
+        RT_ACCT_DEL (str): Retweet Account Deleted
+        RT_ACCT_SUS (str): Retweet Account Suspended
+        RT_ACCT_PRI (str): Retweet Account made Private
+        ORG_TWEET_OF_RT_DEL (str): Original Tweet for Retweet Deleted
+        ORG_ACCT_OF_TWEET_DEL (str): Account for Original Tweet Deleted
+        ORG_ACCT_OF_TWEET_SUS (str): Account for Original Tweet Suspended
+        ORG_ACCT_OF_TWEET_PRI (str): Account for Original Tweet goes Private
+        TWEET_404 (int): Twitter Error Code for Tweet not found
+        ACCT_DEL (int): Twitter Error Code for Account Deleted
+        ACCT_SUS (int): Twitter Error Code for Account Suspended
+        ACCT_PRI (int): Twitter Error Code for Account Private
+        OT (str): Original Tweet
+        RT (str): Retweet
+        OT_OF_RT (str): Original Tweet of a Retweet
+        MEMGATOR_URL (str): MemGator URL
+        MEMGATOR_FORMAT (str): MemGator data format
+        TWITTER_URL (str): Twitter URL
+        MOBILE_TWITTER_URL (str): Twitter mobile URL
+        TWITTER_FOUND_DATE (str): Twitter founding date ("20060321120000")
+        PRE_SNOWFLAKE_BEGIN_TID (int): First pre-Snowflake tweet ID (20)
+        PRE_SNOWFLAKE_END_TID (int): Last pre-Snowflake tweet ID (29700859247)
+    """
+
+    def __init__(self):
+        """
+        This is the constructor for the Constants class.
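+        Note: the Twitter error-code attributes reuse the ACCT_DEL, ACCT_SUS and
+        ACCT_PRI names, so the integer codes assigned later in this constructor
+        overwrite the deletion-type strings assigned earlier.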
+
+        Parameters:
+
+        """
+        # File Extensions
+        self.JSON_EXT = ".json"
+        self.TM_EXT = ".tm"
+        self.PARSE_MEM_EXT = ".parsed"
+        self.WARC_EXT = ".warc.gz"
+        self.DT_EXT = ".del"
+
+        # Separators
+        self.USCORE = "_"
+        self.FSLASH = "/"
+
+        # Error
+        self.ERROR404 = "404 page not found"
+
+        # Tweet Deletion Types
+        self.TWEET_DEL = "A1"
+        self.ACCT_DEL = "A2"
+        self.ACCT_SUS = "A3"
+        self.ACCT_PRI = "A4"
+        self.UNRT = "AB1"
+        self.RT_ACCT_DEL = "AB2"
+        self.RT_ACCT_SUS = "AB3"
+        self.RT_ACCT_PRI = "AB4"
+        self.ORG_TWEET_OF_RT_DEL = "B1"
+        self.ORG_ACCT_OF_TWEET_DEL = "B2"
+        self.ORG_ACCT_OF_TWEET_SUS = "B3"
+        self.ORG_ACCT_OF_TWEET_PRI = "B4"
+
+        # Twitter Error Codes (note: these overwrite ACCT_DEL/ACCT_SUS/ACCT_PRI above)
+        self.TWEET_404 = 144
+        self.ACCT_DEL = 34
+        self.ACCT_SUS = 63
+        self.ACCT_PRI = 179
+
+        # Tweet Types
+        self.OT = "OT"
+        self.RT = "RT"
+        self.OT_OF_RT = "OTR"
+
+        # Memgator Variables
+        self.MEMGATOR_URL = "http://localhost:1208/timemap/"
+        self.MEMGATOR_FORMAT = "cdxj"
+        self.TWITTER_URL = "http://twitter.com/"
+        self.MOBILE_TWITTER_URL = "https://mobile.twitter.com"
+        self.TWITTER_FOUND_DATE = "20060321120000"
+        self.PRE_SNOWFLAKE_BEGIN_TID = 20
+        self.PRE_SNOWFLAKE_END_TID = 29700859247
\ No newline at end of file
diff --git a/fch/core/utils/util_functions.py b/fch/core/utils/util_functions.py
new file mode 100644
index 0000000..2323d52
--- /dev/null
+++ b/fch/core/utils/util_functions.py
@@ -0,0 +1,271 @@
+import datetime
+import ast
+import re
+import bs4
+import os
+import sys
+
+
+class Utils:
+    """
+    This is a class for Utility Functions.
+    """
+
+    @staticmethod
+    def check_memento(dmanager, memento):
+        """
+        Checks whether a memento's HTML is parseable (its html tag carries a lang attribute).
+        """
+        mcontent = dmanager.read_memento(memento)
+        if mcontent is not None:
+            soup = bs4.BeautifulSoup(mcontent, "html.parser")
+            if soup.find("html") is None or not soup.find("html").has_attr("lang"):
+                return False
+            else:
+                return True
+
+    @staticmethod
+    def memento_to_epochtime(mtime):
+        """
+        Function to convert a Memento datetime to epoch seconds (UTC)
+
+        Parameters:
+            mtime (str): Memento Datetime
+
+        Returns:
+            (int): Memento Datetime in epoch seconds on Success and None on Failure
+        """
+        try:
+            mdate = datetime.datetime.strptime(mtime, "%Y%m%d%H%M%S")
+            epoch = datetime.datetime.utcfromtimestamp(0)
+            mepoch = int((mdate - epoch).total_seconds())
+            return mepoch
+        except Exception as e:
+            if mtime != "-1":
+                sys.stderr.write("memento_to_epochtime: " + str(mtime) + " " + str(e) + "\n")
+            return None
+
+    @staticmethod
+    def epochtime_to_memento(tmillis):
+        """
+        Function to convert a timestamp in milliseconds to a Memento datetime
+
+        Parameters:
+            tmillis (int): Time in milliseconds
+
+        Returns:
+            (str): Memento Datetime (YYYYMMDDHHMMSS) on Success and None on Failure
+        """
+        try:
+            mdate = datetime.datetime.fromtimestamp(tmillis / 1000)
+            return mdate.strftime("%Y%m%d%H%M%S")
+        except Exception as e:
+            sys.stderr.write("epochtime_to_memento: " + str(e) + "\n")
+            return None
+
+    @staticmethod
+    def get_timerange(constants, config, db_live=None):
+        """
+        Function to get the minimum and maximum timestamps for the analysis
+
+        Parameters:
+            constants (Constants): for Constants
+            config (ConfigurationReader): for Configuration
+            db_live (collection): Live Tweets Collection
+
+        Returns:
+            (int): Minimum Timestamp
+            (int): Maximum Timestamp
+        """
+        min_time = str(config.start_time)
+        max_time = str(config.end_time)
+        if not Utils.memento_to_epochtime(min_time) and not Utils.memento_to_epochtime(max_time):
+            min_time = constants.TWITTER_FOUND_DATE
+            cur_time = datetime.datetime.now()
+            max_time = cur_time.strftime("%Y%m%d%H%M%S")
+        elif not Utils.memento_to_epochtime(min_time) and Utils.memento_to_epochtime(max_time):
+            min_time = constants.TWITTER_FOUND_DATE
+        elif Utils.memento_to_epochtime(min_time) and not Utils.memento_to_epochtime(max_time):
+            cur_time = datetime.datetime.now()
+            max_time = cur_time.strftime("%Y%m%d%H%M%S")
+        return int(min_time), int(max_time)
+
+    @staticmethod
+    def parse_timemap(dmanager, constants, turl, config_reader=None, stime=None, etime=None):
+        """
+        This function parses the TimeMap between the start and end times and returns the URI-Ms.
+
+        Parameters:
+            dmanager (DataManager): DataManager object
+            constants (Constants): Constants
+            turl (str): Twitter URL
+            config_reader (ConfigurationReader): Configuration object
+            stime (int): Start Time
+            etime (int): End Time
+
+        Returns:
+            lurims (list): List of URI-Ms
+        """
+        try:
+            # Reuse a previously written mementos.txt cache if present
+            if os.path.exists(os.path.join(os.getcwd(), "mementos.txt")):
+                lurims = []
+                with open(os.path.join(os.getcwd(), "mementos.txt"), "r") as fobj:
+                    for line in fobj:
+                        lurims.append(ast.literal_eval(line.rstrip()))
+                return lurims
+
+            timemap_content = dmanager.read_timemap(turl)
+            if timemap_content:
+                lurims = []
+                srange = Utils.memento_to_epochtime(str(stime))
+                erange = Utils.memento_to_epochtime(str(stime)) + int(config_reader.frequency)
+                for line in timemap_content:
+                    if constants.ERROR404 in line:
+                        return None
+                    elif not (line.startswith("!") or line.startswith("@")) and line.rstrip():
+                        line_split = line.rstrip().split(" ", 1)
+                        memento = ast.literal_eval(line_split[1])
+                        if config_reader.debug: sys.stdout.write("parse_timemap: " + str(memento) + "\n")
+                        response = Utils.get_turl_info(turl)
+                        response = Utils.get_murl_info(memento, response["handle"])
+                        if response["archive"] not in ["archive.is", "archive.today", "perma.cc", "webarchive.loc.gov", "web.archive.bibalex.org"]:
+                            mtime = line_split[0]
+                            if config_reader.debug: sys.stdout.write("parse_timemap: " + str(mtime) + "\n")
+                            if stime <= int(mtime) <= etime:
+                                if config_reader.frequency == 0:
+                                    lurims.append(memento)
+                                else:
+                                    mtime = Utils.memento_to_epochtime(mtime)
+                                    if srange <= mtime <= erange:
+                                        # if Utils.check_memento(dmanager, memento):
+                                        srange = erange
+                                        erange += int(config_reader.frequency)
+                                        lurims.append(memento)
+                                    elif mtime > erange:
+                                        # if Utils.check_memento(dmanager, memento):
+                                        lurims.append(memento)
+                                        while srange <= mtime:
+                                            srange = erange
+                                            erange += int(config_reader.frequency)
+                            elif Utils.memento_to_epochtime(str(etime)) < Utils.memento_to_epochtime(mtime):
+                                break
+                with open(os.path.join(os.getcwd(), "mementos.txt"), "w") as fobj:
+                    if config_reader.debug: sys.stdout.write("parse_timemap: Going to write mementos.txt file" + "\n")
+                    for urim in lurims:
+                        fobj.write(str(urim) + "\n")
+                return lurims
+        except Exception as e:
+            sys.stderr.write("parse_timemap: " + str(e) + "\n")
+        return None
+
+    @staticmethod
+    def get_turl_info(turl):
+        """
+        This function parses the semantics of a Twitter URL.
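+        For example (illustrative), https://mobile.twitter.com/dougjones/with_replies?lang=fr
+        parses to domain=mobile, handle=dougjones, wrep=with_replies_, lang=fr.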
+
+        Parameters:
+            turl (str): Twitter URL
+
+        Returns:
+            (dict): Dictionary containing Domain, Handle, with_replies and lang information
+        """
+        reg = re.compile(r'https?://(www\.)?(?P<domain>mobile)?([\w\.\-]+)(:\d+)?/(?P<handle>\w+)'
+                         r'((\/(?P<wrep>with_replies))?(\/?\?((lang|locale)=(?P<lang>[\w\-]+))?.*)?)?', re.I)
+        response = reg.match(turl)
+        response = response.groupdict()
+        response["lang"] = (response["lang"] if response["lang"] else "default")
+        response["wrep"] = (response["wrep"] + "_" if response["wrep"] else "")
+        response["domain"] = (response["domain"] if response["domain"] else "desktop")
+        return response
+
+    @staticmethod
+    def get_murl_info(*args):
+        """
+        This function parses the semantics of a URI-M.
+
+        Parameters:
+            args (variable arguments): Index 0: URI-M / TimeMap CDXJ entry, Index 1: Twitter handle
+
+        Returns:
+            (dict): Dictionary containing Archive, Memento Timestamp, Domain, Handle, with_replies and lang information
+        """
+        flag = True
+        try:
+            archive_list = ["perma.cc", "archive.is"]
+            mrecord = ast.literal_eval(str(args[0]))
+            murl = mrecord["uri"]
+            for archive in archive_list:
+                if archive in murl:
+                    flag = False
+        except Exception as e:
+            murl = args[0]
+
+        if flag:
+            reg = re.compile(r'https?://(www\.)?(?P<archive>[\w\.\-]+)(:\d+)?(\/\w+)?(/archive)?/(?P<timestamp>\d+)([a-z]'
+                             r'{2}_)?/(?P<url>https?://(www\.)?(?P<domain>mobile)?[\w\.\-]+(:\d+)?(\/)+(?P<handle>'
+                             r'\w+)'
+                             r'((\/(?P<wrep>with_replies))?\/?\?((lang|locale)=(?P<lang>[\w\-]+))?.*)?)', re.I)
+            response = reg.match(murl)
+            response = response.groupdict()
+            response["lang"] = (response["lang"] if response["lang"] else "default")
+            response["wrep"] = (response["wrep"] + "_" if response["wrep"] else "")
+            response["domain"] = (response["domain"] if response["domain"] else "desktop")
+        else:
+            reg = re.compile(r'https?://(www\.)?(?P<archive>[\w\.\-]+)(:\d+)?/.*', re.I)
+            response = reg.match(murl)
+            response = response.groupdict()
+            mdate = datetime.datetime.strptime(mrecord["datetime"], "%a, %d %b %Y %H:%M:%S %Z")
+            response["timestamp"] = mdate.strftime("%Y%m%d%H%M%S")
+            response["lang"] = "default"
+            response["wrep"] = ""
+            response["domain"] = "desktop"
+        if len(args) > 1:
+            response["TwitterURL"] = "https://twitter.com/" + args[1].lower()
+            response["handle"] = args[1].lower()
+        else:
+            response["TwitterURL"] = ""
+            response["handle"] = ""
+        return response
+
+    @staticmethod
+    def convert_digits_to_english(number):
+        """
+        Converts a numeral string written in any of the supported scripts to an int.
+        """
+        # ldigits good for: fr, es, de, it, id, pt, tr, ru, ar, en, ja, ko, nl, fil, ms, hi, no, sv, fi, da, po, hu,
+        # fa, he, ur, th, uk, ca, ga, el, eu, cs, gl, ro, hr, en-gb, bn, bg, sr, sk, gu, mr, kn, ta, va
+        # zh-tw, zh-cn, vi: to be added
+        ldigits = [
+            "0٠০ 〇*零०۰-๐૦௦೦零",
+            "1١১一१۱א๑૧௧೧壹",
+            "2٢২二२۲ב๒૨௨೨貳",
+            "3٣৩三३۳๓ג૩௩೩叄",
+            "4٤৪四४۴ד๔૪௪೪肆",
+            "5٥৫五५۵๕ה૫௫೫伍",
+            "6٦৬六६۶ו๖૬௬೬陸",
+            "7٧৭七७۷ז๗૭௭೭柒",
+            "8٨৮八८۸๘૮ח௮೮捌",
+            "9٩৯九९۹ט๙૯௯೯玖",
+            "十י௰拾"]
+        # Mapping for ko/ja multiplicative numerals (not applied yet)
+        lmapping = {
+            "ja": "廿卅百千万億兆京",
+            "value": [20, 30, 100, 1000, 10000, 100000000, 1000000000000, 10000000000000000]}
+        # NOTE: the Japanese numeral system is positional-multiplicative; this simple digit walk does not handle it
+        conv = 0
+        for digit in number:
+            for index in range(len(ldigits)):
+                if digit in ldigits[index]:
+                    conv = (10 * conv) + index
+                    break
+        return conv
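
Usage sketch for the configuration round trip, assuming the fch package is importable; the keyword names (st, et, freq, db_conf, debug) and the attribute names follow ConfigurationWriter and ConfigurationReader above:

    from fch.core.config.configwriter import ConfigurationWriter
    from fch.core.config.configreader import ConfigurationReader

    # Write the "followercount" pickle under fch/core/config/data/
    # (unset values fall back to config.ini)
    ConfigurationWriter(st=20190107235959, et=20190807235959,
                        freq=0, debug=True, db_conf="followercount")

    # Read it back
    config = ConfigurationReader("followercount")
    print(config.start_time, config.end_time, config.frequency, config.debug)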
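Illustrative call to Utils.get_murl_info, which drives the on-disk layout used by DataManager; the URI-M below is a hypothetical Internet Archive capture:

    from fch.core.utils.util_functions import Utils

    info = Utils.get_murl_info("https://web.archive.org/web/20190107235959/"
                               "https://twitter.com/dougjones/with_replies?lang=en",
                               "dougjones")
    # info["archive"]   -> "web.archive.org"
    # info["timestamp"] -> "20190107235959"
    # info["handle"]    -> "dougjones"
    # info["wrep"]      -> "with_replies_"  (trailing underscore added by the parser)
    # info["lang"]      -> "en"
    # info["domain"]    -> "desktop"        (no mobile. prefix in the embedded URL)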
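Quick illustration of Utils.convert_digits_to_english, which walks a numeral string once and maps each character through the per-digit rows of ldigits (conv = 10 * conv + row index):

    from fch.core.utils.util_functions import Utils

    print(Utils.convert_digits_to_english("42"))    # -> 42 (ASCII digits)
    print(Utils.convert_digits_to_english("٤٢"))    # -> 42 (Arabic-Indic digits)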