Look up existing t.co image mappings
Look up the short/long URL mappings in the image (media) entity data too.
Also create a log and store it in the archive directory as a record of how
the data has been updated.
edsu committed Nov 20, 2022
1 parent 1210185 commit 43b45a2
Showing 3 changed files with 19 additions and 5 deletions.
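
For context, the tweet.js records that read_url_map consumes look roughly like
this. The shape is reconstructed from the field names the diff touches (url,
expanded_url, and the optional media list), so treat it as an illustrative
sketch rather than the full schema:

    # One hypothetical record from tweet.js; real records carry many more fields.
    tweet = {
        "tweet": {
            "entities": {
                "urls": [
                    {"url": "https://t.co/abc123",
                     "expanded_url": "https://example.com/post"}
                ],
                # media entities carry t.co short urls too, which this
                # commit now folds into the short/long mapping
                "media": [
                    {"url": "https://t.co/xyz789",
                     "expanded_url": "https://twitter.com/user/status/1/photo/1"}
                ],
            }
        }
    }
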
2 changes: 1 addition & 1 deletion setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup
 
-__version__ = "0.0.6"
+__version__ = "0.0.7"
 
 def long_description():
     with open("README.md") as f:
2 changes: 1 addition & 1 deletion test.py
@@ -6,7 +6,7 @@
 
 def test_read_mapping():
     url_map = twitter_archive_unshorten.read_url_map('test-data/data/tweet.js')
-    assert len(url_map) == 42
+    assert len(url_map) == 45
 
 def test_unshorten():
     urls = [
20 changes: 17 additions & 3 deletions twitter_archive_unshorten.py
@@ -16,6 +16,7 @@
 import sys
 import json
 import time
+import logging
 import urllib.error
 import urllib.request
 
@@ -29,6 +30,9 @@ def main():
     archive_dir = sys.argv[1]
     sanity_check(archive_dir)
 
+    logging.basicConfig(filename=join(archive_dir, "twitter-archive-unshorten.log"), level=logging.INFO)
+    logging.info("rewriting t.co urls with https://github.com/docnow/twitter-archive-unshorten")
+
     # find all the short urls in the archive
     short_urls = get_short_urls(archive_dir)
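
The logging setup added above is the standard library's file-logging pattern:
point the root logger at a file once, and every later logging.info call
appends to it. A minimal standalone sketch (the archive path is hypothetical;
in the script it comes from sys.argv[1]):

    import logging
    from os.path import join

    archive_dir = "my-archive"  # hypothetical; the directory must already exist
    logging.basicConfig(
        filename=join(archive_dir, "twitter-archive-unshorten.log"),
        level=logging.INFO,
    )
    logging.info("this line goes to the log file, not the console")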

@@ -94,8 +98,9 @@ def rewrite_files(archive_dir, url_map):
             # remember the mapping only contains https keys
             lookup_url = re.sub(r'^http://', 'https://', short_url)
             if lookup_url in url_map:
-                rewrote += 1
+                logging.info(f"rewriting {short_url} to {url_map[lookup_url]} in {path}")
                 line = line.replace(short_url, url_map[lookup_url])
+                rewrote += 1
             else:
                 print(f"{lookup_url} not found")
             lines.append(line)
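
Note that the loop above normalizes each short url to https before the lookup
because the map is keyed only by https urls, while tweets may embed the http
form. A minimal sketch of that lookup-and-rewrite step; the url_map contents
and the findall pattern standing in for get_short_urls are hypothetical:

    import re

    url_map = {"https://t.co/abc123": "https://example.com/long"}  # hypothetical

    line = 'see http://t.co/abc123 for details'
    rewrote = 0
    for short_url in re.findall(r'https?://t\.co/\w+', line):
        # the mapping only contains https keys, so normalize first
        lookup_url = re.sub(r'^http://', 'https://', short_url)
        if lookup_url in url_map:
            line = line.replace(short_url, url_map[lookup_url])
            rewrote += 1

    print(line)     # see https://example.com/long for details
    print(rewrote)  # 1
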
@@ -133,13 +138,17 @@ def unshorten(urls, archive_dir):
 
         # if we already know what the long url is we can skip it
         if short_url in url_map:
+            logging.info(f"already have long url for {short_url}")
             continue
 
+        logging.info(f"looking up {short_url}")
+
         try:
             urllib.request.urlopen(short_url)
         except urllib.error.HTTPError as e:
             if e.code == 301:
                 long_url = e.headers.get('Location')
+                logging.info(f"got {long_url} for {short_url}")
                 # an unescaped " will break JSON serialization
                 long_url = long_url.replace('"', '%22')
                 url_map[short_url] = long_url
@@ -150,6 +159,7 @@ def unshorten(urls, archive_dir):
 
         # periodically dump the mappings we have
         if archive_dir != "" and len(url_map) % 10 == 0:
+            logging.info(f"writing {len(url_map)} urls to {url_map_file}")
             json.dump(url_map, open(url_map_file, "w"), indent=2)
 
         # try not to awaken the dragon
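
Dumping the map on every tenth entry is a cheap checkpoint: if the run is
interrupted, at most nine lookups are lost and the next run resumes from the
saved file. A sketch of the pattern with a hypothetical output path; a with
block is used here so the file handle is closed promptly:

    import json

    url_map = {}
    url_map_file = "t.co.json"  # hypothetical path

    def record(short_url, long_url):
        url_map[short_url] = long_url
        # checkpoint every 10 entries so progress survives an interruption
        if len(url_map) % 10 == 0:
            with open(url_map_file, "w") as f:
                json.dump(url_map, f, indent=2)
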
@@ -166,16 +176,20 @@ def read_url_map(path):
     data = json.loads(text)
     url_map = {}
     for tweet in data:
+        entities = tweet['tweet']['entities']['urls']
+        entities.extend(tweet['tweet']['entities'].get('media', []))
         for url in tweet['tweet']['entities']['urls']:
-            short_url = url['url']
-            short_url = re.sub(r'^http://', 'https://', short_url)
+            short_url = re.sub(r'^http://', 'https://', url['url'])
             if short_url.startswith('https://t.co/'):
                 url_map[short_url] = url['expanded_url']
 
     return url_map
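
One subtlety makes this hunk work: entities is a reference to the same list
object as tweet['tweet']['entities']['urls'], not a copy, so extend() grows
the very list the unchanged for loop iterates over, and the media entries are
picked up without touching the loop itself. A minimal demonstration:

    d = {"urls": [{"url": "a"}]}
    entities = d["urls"]               # alias, not a copy
    entities.extend([{"url": "b"}])    # mutates the shared list
    assert d["urls"][1]["url"] == "b"  # the loop over d["urls"] sees it

This does mutate the parsed tweet data in place, which seems harmless for a
read-once mapping pass.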

 # Some shenanigans so urllib gets the redirect but doesn't follow it.
 # It would be nice to be able to use requests here but I didn't want to
 # make people install anything extra.
+#
+# It might be worth revisiting this since it is pip installed now.
+
 class NoRedirect(urllib.request.HTTPRedirectHandler):
     def redirect_request(self, *_):
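
The redirect capture works because a handler whose redirect_request returns
None makes urllib raise the 301 as an HTTPError instead of following it; the
Location header on that error then carries the long url. A self-contained
sketch of the pattern (the short url is hypothetical):

    import urllib.error
    import urllib.request

    class NoRedirect(urllib.request.HTTPRedirectHandler):
        def redirect_request(self, *_):
            # returning None tells urllib not to follow the redirect,
            # so urlopen raises HTTPError for the 301 instead
            return None

    urllib.request.install_opener(urllib.request.build_opener(NoRedirect))

    try:
        urllib.request.urlopen("https://t.co/abc123")  # hypothetical
    except urllib.error.HTTPError as e:
        if e.code == 301:
            print(e.headers.get("Location"))  # the unshortened url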
