-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathalloc.py
107 lines (80 loc) · 2.97 KB
/
alloc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import array
import collections
import os
import sys
import requests
# File-name and URL patterns; both are filled in with a station id and a year.
TEMPLATE_FILE = "station_{station}_{year}.csv"
TEMPLATE_URL = "https://www.ncei.noaa.gov/data/global-hourly/access/{year}/{station}.csv"

# Command-line inputs: argv[1] is a comma-separated list of stations,
# argv[2] is a dash-separated pair of years ("start-end").
stations = sys.argv[1].split(",")
years = list(map(int, sys.argv[2].split("-")))
start_year, end_year = years[0], years[1]
def download_data(station, year):
    """Download one station/year CSV and store it in the working directory.

    The remote address is built from ``TEMPLATE_URL`` and the local file name
    from ``TEMPLATE_FILE``. Nothing is written unless the server answers with
    HTTP status 200.

    Args:
        station: Station identifier substituted into the URL and file name.
        year: Year substituted into the URL and file name.

    Returns:
        None.
    """
    my_url = TEMPLATE_URL.format(station=station, year=year)
    req = requests.get(my_url)
    if req.status_code != 200:
        return  # any status other than 200 (e.g. no data published): skip
    # The context manager closes the file even if the write raises.
    with open(TEMPLATE_FILE.format(station=station, year=year), "wt") as w:
        w.write(req.text)
def download_all_data(stations, start_year, end_year):
    """Download every station/year file that is not already on disk.

    Walks all stations and all years from ``start_year`` to ``end_year``
    (both inclusive) and calls ``download_data`` for each combination whose
    local file, named via ``TEMPLATE_FILE``, does not exist yet.
    """
    for station_id in stations:
        for yr in range(start_year, end_year + 1):
            local_name = TEMPLATE_FILE.format(station=station_id, year=yr)
            if os.path.exists(local_name):
                continue  # already present locally, nothing to fetch
            download_data(station_id, yr)
def get_all_files(stations, start_year, end_year):
    """Load every station/year file into memory as a list of byte values.

    Args:
        stations: Iterable of station identifiers.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).

    Returns:
        A ``collections.defaultdict(list)`` mapping each station to a list
        with one entry per year, in ascending year order. Each entry is the
        file's raw content converted to a ``list`` of ints (one per byte).

    Raises:
        OSError: Propagated from ``open`` when a file cannot be opened, for
            example because it was never downloaded.
    """
    all_files = collections.defaultdict(list)
    for station in stations:
        for year in range(start_year, end_year + 1):
            # ``with`` releases the handle even if reading or the list
            # conversion fails part-way through.
            with open(TEMPLATE_FILE.format(station=station, year=year), 'rb') as f:
                content = list(f.read())
            all_files[station].append(content)
    return all_files
# Fixed inputs for the walkthrough below; they replace any values previously
# bound to these three names.
stations = ['01044099999']
start_year = 2005
end_year = 2021

# Fetch whatever is missing locally, then load each file as a list of ints.
download_all_data(stations, start_year, end_year)
all_files = get_all_files(stations, start_year, end_year)
list(all_files)  # bare expression: the result is discarded in a script

# sys.getsizeof is shallow: it measures the container object itself and does
# not follow references into what the container holds.
print(sys.getsizeof(all_files))
print(sys.getsizeof(all_files.values()))
print(sys.getsizeof(list(all_files.values())))

# Per-year contents of the first station, then the first of those files.
station_content = all_files[stations[0]]
print(len(station_content))
print(sys.getsizeof(station_content))
single_file_data = station_content[0]
print(len(single_file_data))
print(sys.getsizeof(single_file_data))
print(type(single_file_data))
print(single_file_data[0])

# Reference sizes: two strings of different length, and a one-element list
# wrapping each of them.
print(sys.getsizeof('text'))
print(sys.getsizeof('longer text'))
print(sys.getsizeof(['text']))
print(sys.getsizeof(['longer text']))

# Size, type and identity of individual elements of the loaded file.
print(sys.getsizeof(single_file_data[0]))
print(type(single_file_data[0]))
id(single_file_data[2])  # bare expression: the result is discarded in a script

# Count how many distinct objects (by identity) the list refers to.
all_ids = set(map(id, single_file_data))
print(len(all_ids))

# Same data as one-character strings, and then as a single joined string.
single_file_str_list = list(map(chr, single_file_data))
print(sys.getsizeof(single_file_str_list[0]))
single_file_str = ''.join(single_file_str_list)
print(sys.getsizeof(single_file_str))
def get_all_files_clean(stations, start_year, end_year):
    """Load every station/year file into memory as a ``bytes`` object.

    Unlike a per-byte list, each file's content is kept exactly as returned
    by a binary ``read()``.

    Args:
        stations: Iterable of station identifiers.
        start_year: First year to load (inclusive).
        end_year: Last year to load (inclusive).

    Returns:
        A ``collections.defaultdict(list)`` mapping each station to a list
        with one ``bytes`` object per year, in ascending year order.

    Raises:
        OSError: Propagated from ``open`` when a file cannot be opened, for
            example because it was never downloaded.
    """
    all_files = collections.defaultdict(list)
    for station in stations:
        for year in range(start_year, end_year + 1):
            # ``with`` releases the handle even if the read fails.
            with open(TEMPLATE_FILE.format(station=station, year=year), 'rb') as f:
                content = f.read()
            all_files[station].append(content)
    return all_files
# Load the same files again, this time keeping each one as a bytes object,
# and look at the first file of the first station.
all_files_clean = get_all_files_clean(stations, start_year, end_year)
first_station_files = all_files_clean[stations[0]]
single_file_data = first_station_files[0]

# Type and shallow size of the whole file held as a single object.
print(type(single_file_data))
print(sys.getsizeof(single_file_data))

# Bare expressions: both results are discarded when run as a script.
len(single_file_data)  # number of bytes in the file
(len(single_file_data) + 1) // 2  # half of that count, rounded up