weatherScrapper.py
##Copyright 2015 Christopher King
##This file is part of Pennslyvania Weather Closings.
##
##Pennslyvania Weather Closings is free software: you can redistribute it and/or modify
##it under the terms of the GNU General Public License as published by
##the Free Software Foundation, either version 3 of the License, or
##(at your option) any later version.
##
##Pennslyvania Weather Closings is distributed in the hope that it will be useful,
##but WITHOUT ANY WARRANTY; without even the implied warranty of
##MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
##GNU General Public License for more details.
##
##You should have received a copy of the GNU General Public License
##along with Pennslyvania Weather Closings.  If not, see <http://www.gnu.org/licenses/>.
from urllib import urlopen
import re
from itertools import chain
import argparse
import sys
javascript = re.compile(r"ibsys\.htvClosings\.init\(({.*?})\)") #The closings are embedded in a JavaScript function call. This will extract the dictionary to group 1.
wgalAddress = "http://www.wgal.com/weather/closings"
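## Inferred from the parsing below (not stated by the original author): the matched
## JavaScript call is expected to look roughly like
##   ibsys.htvClosings.init({"A": {"institutions": [{"name": "...", ...}, ...]}, ...})
## i.e. a dict keyed by letter, where each letter's "institutions" list holds one
## dict per closed school or business.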
def closings(useCache=True, cache="closings.html", verbose=False):
    """Returns a dictionary containing information about closed schools and businesses.
    Keyword Arguments:
    useCache -- If True, the function will read from the cache instead of going online (default True)
    cache -- The file that a cache of the website will be stored in. If None or the file does not exist, useCache will be treated as False (default "closings.html")
    verbose -- If True, debugging messages are written to stderr (default False)
    """
    if not cache: useCache = False
    if useCache:
        if verbose:
            print >> sys.stderr, "DEBUG - reading HTML from %s, not Interweb" % cache
        try:
            with open(cache, 'r') as cacheFile:
                wgalSite = cacheFile.read()
        except IOError:
            return closings(useCache=False, cache=cache, verbose=verbose)
    else:
        wgalSite = urlopen(wgalAddress).read()
        if cache:
            with open(cache, 'w') as cacheFile:
                cacheFile.write(wgalSite)
    rawData = eval(re.search(javascript, wgalSite).group(1)) #The parsed data is a valid python dictionary
    organizedDict = {place.pop('name'): place for place in #Change the name into the key
                     chain(*(letter['institutions'] for letter in rawData.values() if isinstance(letter, dict)))} #The raw data is organized by letter. Extract "institutions" from each and chain them together.
    return organizedDict
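## A minimal usage sketch (an illustration, not part of the original script): pass
## cache=None to force a live fetch, then iterate over the returned {name: details}
## mapping. The keys inside each details dict come from the site and are not fixed here.
##   info = closings(useCache=False, cache=None)
##   for name in sorted(info):
##       print name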
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process closing information from external website')
    parser.add_argument('--usecache', dest='usecache', action='store_true', default=False, help='use the HTML cached on the local machine')
    parser.add_argument('--verbose', dest='verbose', action='store_true', default=False, help='write debugging to stderr')
    parser.add_argument('--cache', dest='cache', action='store', default="closings.html", type=str, help='name of the cache file')
    args = parser.parse_args()
    if args.verbose:
        print >> sys.stderr, "DEBUG - command-line argument(s): usecache=%s, cache=%s, verbose=%s" % (args.usecache, args.cache, args.verbose)
    for school, data in closings(args.usecache, cache=args.cache, verbose=args.verbose).iteritems():
        print school
        for key, value in data.iteritems():
            print '\t' + key.upper() + ': ' + value