-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathinep-scrapper.py
118 lines (90 loc) · 3.51 KB
/
inep-scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import asyncio
import aiohttp
import argparse
import helpers
import settings
import os
from parsel import Selector
# Bytes pulled from the HTTP response per write; large enough to avoid
# thousands of tiny writes when downloading big archives.
_DOWNLOAD_CHUNK_SIZE = 64 * 1024


def _prompt_index(count):
    """Ask the user for a menu index and return it.

    Args:
        count: Number of entries in the menu that was just printed.

    Returns:
        The chosen index, guaranteed to satisfy ``0 <= index < count``.

    Raises:
        ValueError: If the input is not an integer or is out of range.
    """
    print(f"Escolha uma opção entre 0 e {count - 1}")
    option = int(input("Opção: "))
    # Valid indexes stop at count - 1; accepting ``count`` itself would
    # surface later as an uncaught IndexError.
    if not 0 <= option < count:
        raise ValueError(f"option {option} is outside 0..{count - 1}")
    return option


async def _download_file(session, file_url, storage_dir):
    """Stream ``file_url`` into ``storage_dir`` using an open aiohttp session.

    The file is stored under the last path component of the URL.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error status.
    """
    file_name = os.path.basename(file_url)
    print(f"Requisitando o arquivo {file_name}")

    # No overall deadline: the download may legitimately take a long time.
    no_timeout = aiohttp.ClientTimeout(total=None)
    async with session.get(file_url, timeout=no_timeout) as resp:
        resp.raise_for_status()
        target_path = os.path.join(storage_dir, file_name)
        with open(target_path, "wb") as output_f:
            async for chunk in resp.content.iter_chunked(_DOWNLOAD_CHUNK_SIZE):
                output_f.write(chunk)

    print(f"O arquivo {file_url} foi baixado com sucesso!")


async def main():
    """Interactively pick INEP data sections and download their files.

    Steps:
        1. Resolve the page URL from ``--url`` or ``settings.INEP_URL``.
        2. Fetch the page and build the section list via
           ``helpers.generate_sections``.
        3. Let the user choose a section and, when applicable, a subsection.
        4. Download every selected file into ``settings.DATA_DIR``.

    Returns early, after printing a message, when no URL is configured or
    when the user enters an invalid option.

    Raises:
        aiohttp.ClientResponseError: If the page or a file request returns an
            error status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--url", type=str, help="URL para o site de microdados do INEP")
    args = parser.parse_args()

    url = args.url if args.url else settings.INEP_URL
    if url is None:
        print("Nenhuma URL para base do INEP foi definida")
        return

    helpers.print_init(url)

    # Only the page fetch needs the network here; the session is released
    # before waiting on user input.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as req:
            req.raise_for_status()
            content = await req.text()

    sel = Selector(text=content)
    title = sel.xpath("normalize-space(string(//title))").get()
    page_update = sel.xpath(
        "normalize-space(string(//span[@class='page-update']))"
    ).get()
    helpers.print_welcome(title=title, subtitle=page_update)

    sections = helpers.generate_sections(
        sel.xpath(
            "//div[contains(@class, 'anchor__content') and @data-anchor]"
        ).getall()
    )
    for section in sections:
        print(section)

    # select section
    try:
        selected_section = sections[_prompt_index(len(sections))]
    except ValueError:
        print("Opção inválida")
        return
    print(f"Categoria escolhida: {selected_section}")

    download_files = []

    # select subsection
    if selected_section.isDefault():
        # The default entry stands for "everything": collect every
        # subsection URL of every listed section.
        print("Baixando todos os dados de todas as categorias...")
        for section in sections:
            download_files.extend(sub.url() for sub in section.subsections())
    else:
        # subsections is a method; it must be called for the emptiness
        # check to mean anything.
        subsections = selected_section.subsections()
        if subsections:
            for subsection in subsections:
                print(f"{subsection}")

            try:
                selected_subsection = subsections[_prompt_index(len(subsections))]
            except ValueError:
                print("Opção inválida")
                return

            print(f"Baixando dados de {selected_section} - {selected_subsection}...")
            if selected_subsection.isDefault():
                download_files.extend(sub.url() for sub in subsections)
            else:
                download_files.append(selected_subsection.url())

    storage_dir = settings.DATA_DIR
    # Creates missing parents too and tolerates an already existing directory.
    os.makedirs(storage_dir, exist_ok=True)

    # One session is shared by all downloads so connections can be reused.
    async with aiohttp.ClientSession() as session:
        for file_url in download_files:
            if file_url is None:
                # Entries without a URL have nothing to download.
                continue
            await _download_file(session, file_url, storage_dir)
if __name__ == "__main__":
    # asyncio.run creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; asyncio.get_event_loop() without a running
    # loop is deprecated and no longer creates one on recent Python versions.
    asyncio.run(main())