diff --git a/standalone-crawlers/TJMG #1/teste.py b/standalone-crawlers/TJMG #1/teste.py deleted file mode 100644 index a030aca2..00000000 --- a/standalone-crawlers/TJMG #1/teste.py +++ /dev/null @@ -1,14 +0,0 @@ -from util import * -import os -import time -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -import speech_recognition as sr - -browser = webdriver.Chrome(executable_path=r'chromedriver.exe') - -browse.get('https://www4.tjmg.jus.br/juridico/sf/proc_resultado.jsp?comrCodigo=24&numero=1&listaProcessos=13108047') - -botao_baixar_audio = browser.find_element_by_link_text("Baixar o áudio") -botao_baixar_audio.click() - diff --git a/standalone-crawlers/TJMG #1/tjmg.py b/standalone-crawlers/TJMG #1/tjmg.py deleted file mode 100644 index 16e98d7d..00000000 --- a/standalone-crawlers/TJMG #1/tjmg.py +++ /dev/null @@ -1,29 +0,0 @@ -from util import * -import os -import time -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -import speech_recognition as sr - -#### MAIN #### - -browser = webdriver.Chrome(executable_path=r'chromedriver.exe') - -anexos_txt = open('urls_de_anexos_em_pdf.txt','w') -anexos_txt.write('') -anexos_txt.close() - - -comarca_arq = open("comarcas.txt", 'r') -comarcas_vet = [] -for i in comarca_arq: - comarcas_vet.append(int(i)) -comarca_arq.close() - - -continuar(comarcas_vet, browser) - -#minera(range(19,0,-1), comarcas_vet) - - - diff --git a/standalone-crawlers/TJMG #1/util.py b/standalone-crawlers/TJMG #1/util.py deleted file mode 100644 index 4358e345..00000000 --- a/standalone-crawlers/TJMG #1/util.py +++ /dev/null @@ -1,339 +0,0 @@ -from util import * -import os -import time -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -import speech_recognition as sr - -def dv10(pNumProc): - vDigito = -1 - vSoma = 0 - v1NumProc = pNumProc[0:12] - if v1NumProc=="": - return 0 - vTamanho = len(v1NumProc) - vMultiplicador = (vTamanho % 2) + 1 # 1 se par, 2 se impar - for j in range(vTamanho): - vParcela = int(v1NumProc[j:j+1]) * vMultiplicador - if vParcela >= 10: - vParcela = (vParcela % 10) + 1 - vMultiplicador = 3 - vMultiplicador # Alterna entre 1 e 2 - vSoma += vParcela - vDigito = (10 - (vSoma % 10)) % 10 - return vDigito - - -def monta_cod_in(num): - return str(num) + str(dv10(num)) - -def monta_num_unica(num_processo, ano, jtr, comarca): - codigo_in = monta_cod_in(str(comarca)+str(ano)[-2:]+preencheZeros(num_processo,6))[-7:] - dv_num_unica = calcula_mod97(codigo_in, ano, jtr, comarca) - - num_unica = codigo_in + '-' - num_unica = num_unica + str(dv_num_unica)+'.' 
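The dv10 routine above is, in effect, the classic Luhn mod-10 check digit computed over at most the first 12 digits of the internal TJMG code: every second digit counted from the rightmost is doubled, two-digit products are reduced by summing their digits, and the check digit tops the total up to a multiple of 10; monta_cod_in simply appends it. A minimal standalone sketch (the function name and the sample value are illustrative, not taken from the crawler):

```python
def luhn_check_digit(payload: str) -> int:
    """The digit dv10 computes: Luhn mod 10 over the payload digits."""
    total = 0
    for pos, ch in enumerate(reversed(payload)):
        d = int(ch)
        if pos % 2 == 0:        # double every second digit, rightmost first
            d *= 2
            if d > 9:
                d -= 9          # equivalent to dv10's (vParcela % 10) + 1
        total += d
    return (10 - total % 10) % 10

# "7992739871" is the textbook Luhn example; its check digit is 3.
assert luhn_check_digit("7992739871") == 3
```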
- num_unica = num_unica + str(ano)+'.'+str(jtr)[0]+'.'+str(jtr)[1:]+'.'+preencheZeros(str(comarca),4) - return num_unica - - -def calcula_mod97(NNNNNNN, AAAA, JTR, OOOO): - valor1 = ""; - resto1 = 0; - valor2 = ""; - resto2 = 0; - valor3 = ""; - valor1 = preencheZeros(NNNNNNN, 7); - resto1 = int(valor1) % 97; - valor2 = preencheZeros(resto1, 2) + preencheZeros(AAAA, 4) + preencheZeros(JTR, 3); - resto2 = int(valor2) % 97; - valor3 = preencheZeros(resto2, 2) + preencheZeros(OOOO, 4) + "00"; - return preencheZeros(98 - (int(valor3) % 97), 2 ); - -def valida_mod97( NNNNNNN, DD, AAAA, JTR, OOOO): - valor1 = ""; - resto1 = 0; - valor2 = ""; - resto2 = 0; - valor3 = ""; - valor1 = preencheZeros(NNNNNNN, 7); - resto1 = int(valor1) % 97; - valor2 = preencheZeros(resto1, 2) + preencheZeros(AAAA, 4) + preencheZeros(JTR, 3); - resto2 = int(valor2) % 97; - valor3 = preencheZeros(resto2, 2) + preencheZeros(OOOO, 4) + preencheZeros(DD, 2); - return ((int(valor3) % 97) == 1) - -def preencheZeros(numero, quantidade): - temp = str(numero); - retorno = ""; - if quantidade < len(temp): - return temp - else: - for i in range(quantidade - len(temp)): - retorno = "0" + retorno; - return retorno + temp -def quebra_captcha(browser, time_sleep=2): - captcha_chave = "" - botao_baixar_audio = browser.find_element_by_link_text("Baixar o áudio") - botao_baixar_audio.click() - - esperar_arquivo(r'C:\\Users\\Tales Panoutsos\\Downloads\\audio.wav', 2) - captcha_chave = audio_to_frase('C:\\Users\\Tales Panoutsos\\Downloads\\audio.wav') - captcha_form = browser.find_element_by_xpath("/html/body/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/input") - - browser.execute_script("arguments[0].value='';", captcha_form) - if len(captcha_chave)>4: - browser.execute_script("arguments[0].value='"+captcha_chave[-5:]+"';", captcha_form) - else: - browser.execute_script("arguments[0].value='00000';", captcha_form) - captcha_form.send_keys(Keys.ENTER) - - os.system('cd "C:/Users/Tales Panoutsos/Downloads" & del audio.wav') - time.sleep(time_sleep) - return - - -def descobrir_fim(browser, comarca, ano, tolerancia): - cod_inicial_atual = 500000 - cod_inicial_a_somar = 250000 - while cod_inicial_a_somar>1: - if cod_inicial_a_somar<100: - tolerancia = cod_inicial_a_somar - if verificar_paginas(browser, comarca, ano, cod_inicial_atual, tolerancia)==1: - cod_inicial_atual = cod_inicial_atual + cod_inicial_a_somar - else: - cod_inicial_atual = cod_inicial_atual - cod_inicial_a_somar - cod_inicial_a_somar = int(cod_inicial_a_somar/2) - return cod_inicial_atual - -def carregar_pagina(browser,url): - browser.get(url) - - - - -def esperar_arquivo(path, time_sleep): - while not os.path.exists(r""+path): - print('Esperando arquivo') - time.sleep(2) - - -def audio_to_frase(nome_audio): - r = sr.Recognizer() - with sr.WavFile(nome_audio) as source: - audio = r.record(source) - try: - return(r.recognize_google(audio, language = "pt-BR")) - except LookupError: - return("error") - -def link_maker(ano, cod_unico, cod_comarca, tipo_pagina): - link = 'www4.tjmg.jus.br/juridico/sf/proc_'+tipo_pagina+'.jsp?listaProcessos='+preencheZeros(ano,2)+preencheZeros(cod_unico,6)+'&comrCodigo='+str(int(cod_comarca))+'&numero=1' - return link - - -def verificar_paginas(browser,comarca,ano,cod_inicial,num_pags): - cod = 0 - while cod < num_pags: - link = link_maker(ano, cod_inicial+cod, comarca, 'complemento') - carregar_pagina(browser,"https://"+link) - code_da_pag = browser.page_source - - verificar_exibição_da_pagina(browser, code_da_pag, link) - - if 
code_da_pag.find('NUMERAÇÃO ÚNICA:')!=-1: - break - cod = cod + 1 - - if cod == num_pags: - return 0 - else: - return 1 -def verificar_exibição_da_pagina(browser, code_da_pag, url): - try: - while browser.find_elements_by_xpath("/html/body/table/tbody/tr[3]/td/table/tbody/tr[2]/td[2]/input") != []: - quebra_captcha(browser) - code_da_pag = browser.page_source - except: - print('Captcha em pagina de download de pdf') - - while code_da_pag.find("A Consulta Processual aos processos de 1a. Instância encontra-se indisponível.") !=-1: - print('Fora do ar') - time.sleep(5) - carregar_pagina(browser,url) - code_da_pag = browser.page_source - - while code_da_pag.find('Não é possível acessar esse site')!=-1: - print('Não é possível acessar esse site') - time.sleep(5) - carregar_pagina(browser,url) - code_da_pag = browser.page_source - - while code_da_pag.find('ERR_INTERNET_DISCONNECTED')!=-1: - print('Sem internet') - time.sleep(5) - carregar_pagina(browser,url) - code_da_pag = browser.page_source - -def coleta_2(browser,ano,com,cod, instancia = 1, num_pagina = 0): - link = link_maker(ano, cod, com, 'resultado', instancia, num_pagina) - carregar_pagina(browser,"https://"+link) - - verificar_exibição_da_pagina(browser, browser.page_source, link) - code_da_pag = browser.page_source - - if code_da_pag.find('NUMERAÇÃO ÚNICA:')!=-1: - tipos_paginas = [] - urls = [] - links_pag_para_baixar = browser.find_elements_by_xpath('/html/body/table/tbody/tr/td/b/a') - - - - if links_pag_para_baixar!=[]: - arq_de_urls_para_baixar = open('urls_para_baixar.txt', 'a') - arq_de_urls_para_baixar.write(preencheZeros(str(int(com)),4)+preencheZeros(str(ano),2)+preencheZeros(str(cod),6)+', '+str(instancia)+', '+str(num_pagina)+', ') ##### MUDANÇA - for link_pag_para_baixar in links_pag_para_baixar: - link_text = link_pag_para_baixar.get_attribute('href') - - if instancia==1: - tipo_pagina = link_text[link_text.find('proc_')+len('proc_'):link_text.find('.jsp?')] ##### MUDANÇA - else: - tipo_pagina = link_text[link_text.find('proc_')+len('proc_'):link_text.find('2.jsp?')] ##### MUDANÇA - - - if tipo_pagina == 'movimentacoes' or tipo_pagina == 'complemento': - tipos_paginas.append(tipo_pagina) - urls.append(link_text) - else: - arq_de_urls_para_baixar.write(link_pag_para_baixar.get_attribute('href') +', ') - - #arq_de_urls_para_baixar.write('\n') - #arq_de_urls_para_baixar.close() - - - for (tipo_pagina,url) in zip(tipos_paginas,urls): - carregar_pagina(browser,url) - verificar_exibição_da_pagina(browser, browser.page_source, url) - code_da_pag = browser.page_source - if instancia==1: - nome_arquivo = tipo_pagina+' - ' + preencheZeros(str(int(com)),4)+preencheZeros(str(ano),2)+preencheZeros(str(cod),6)+".html" - else: - nome_arquivo = tipo_pagina + '2('+ num_pagina +')- ' + preencheZeros(str(int(com)),4)+preencheZeros(str(ano),2)+preencheZeros(str(cod),6)+".html" ##### MUDANÇA - - with open("paginas\\"+nome_arquivo, 'w') as page: - page.write(str(browser.page_source.encode("utf-8", 'ignore'), 'utf-8', 'ignore')) - - - butoes_anexo = browser.find_elements_by_xpath('/html/body/table/tbody/tr/td[1]/a') - for butao in butoes_anexo: - butao.click() - if tipo_pagina == "movimentacoes": - links_anexos = browser.find_elements_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[4]/a') - if tipo_pagina == "complemento": - links_anexos = browser.find_elements_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr') - links = [] - num_anexo=0 - - if links_anexos!=[]: - #anexos_em_pdf = open('urls_de_anexos_em_pdf.txt','a') - 
#print(anexos_em_pdf) - #arq_de_urls_para_baixar.write(preencheZeros(str(int(com)),4)+preencheZeros(str(ano),2)+preencheZeros(str(cod),6)+', '+str(instancia)+', '+str(num_pagina)+', ') - for link_web_element in links_anexos: - nome_arquivo_pdf_antigo = link_web_element.text - - if link_web_element.text[-4:]=='.pdf' or link_web_element.text[-5:]=='.html': - url_anexo = link_web_element.get_attribute('href') - arq_de_urls_para_baixar.write(str(url_anexo)+', ') - - else: - funcao_para_extrair_url = str(link_web_element.get_attribute('onclick')) - inicio_codigo_do_arq = funcao_para_extrair_url.find("'") - fim_codigo_do_arq = funcao_para_extrair_url.find("'",inicio_codigo_do_arq) - codigo_do_arq = funcao_para_extrair_url[inicio_codigo_do_arq:fim_codigo_do_arq] - - if codigo_do_arq!= '': - url_anexo = 'www4.tjmg.jus.br/juridico/sf/relatorioAcordao?numeroVerificador='+str(codigo_do_arq) - arq_de_urls_para_baixar.write(str(url_anexo)+', ') - - - # for link_anexo in links: - # carregar_pagina(browser,link_anexo) - # verificar_exibição_da_pagina(browser, browser.page_source,url) - # code_da_pag = browser.page_source - # nome_arquivo_anexo = "anexo("+ str(num_anexo) + ") - " + preencheZeros(str(int(com)),4)+preencheZeros(str(ano),2)+preencheZeros(str(cod),6)+".html" - # try: - # with open("paginas/"+nome_arquivo_anexo, 'w') as page: - # page.write(str(browser.page_source.encode("utf-8", 'ignore'),'utf-8', 'ignore')) - # except: - # arq_de_urls_para_baixar.write(str(link_anexo)+', ') - # num_anexo=num_anexo+1 - arq_de_urls_para_baixar.write('\n') - arq_de_urls_para_baixar.close() - - return 1 - else: - return 0 - - -def link_maker(ano, cod_unico, cod_comarca, tipo_pagina, instancia = 1, num = 0): - if instancia == 1: - link = 'www4.tjmg.jus.br/juridico/sf/proc_'+tipo_pagina+'.jsp?listaProcessos='+preencheZeros(ano,2)+preencheZeros(cod_unico,6)+'&comrCodigo='+str(int(cod_comarca))+'&numero='+str(instancia) - else: - link = 'www4.tjmg.jus.br/juridico/sf/proc_'+tipo_pagina+'2.jsp?listaProcessos=1'+preencheZeros(cod_comarca,4)+ preencheZeros(ano,2) +preencheZeros(cod_unico,6)+'0'+preencheZeros(num,3) - return link - - - -def minera(browser, range_ano, comarca_vet, comarca_inicio = 2, comarca_final= -1, cod_inicial = 0, cod_final = 999999, procurar_fim = 1, tolerancia_descobrir_fim=25, tolerancia_paginas_vazias = 100): - pos_inicial_comarca = comarca_vet.index(comarca_inicio) - if comarca_final!=-1: - pos_final_comarca = comarca_vet.index(comarca_final) - else: - pos_final_comarca = comarca_vet[-1] - - for ano in range_ano: - for com in comarca_vet[pos_inicial_comarca : pos_final_comarca+1]: - paginas_vazias = 0 - if procurar_fim == 1: - cod_final = descobrir_fim(browser,com, ano, tolerancia_descobrir_fim) - print("Comarca: " + str(com) + " Codigo Final:"+str(cod_final)) - pagina_coletada=0 - cod=cod_inicial - while cod tolerancia_paginas_vazias and cod > cod_final: - break - - cod=cod+1 - -def continuar(comarca_vet, browser): - parada = open('parada.txt','r') - onde_parou = parada.read() - parada.close() - onde_parou = onde_parou.split() - - ano = int(onde_parou[0]) - comarca = int(onde_parou[1]) - codigo = int(onde_parou[2]) - - prox_comarca = comarca_vet[comarca_vet.index(comarca)+1] - - minera(browser, [ano], [comarca], comarca_inicio=comarca, cod_inicial = codigo) - minera(browser, [ano], comarca_vet, comarca_inicio=prox_comarca) - minera(browser, range(ano-1,0,-1), comarca_vet) - diff --git a/standalone-crawlers/TRF2/README.md b/standalone-crawlers/TRF2/README.md deleted file mode 100644 index 
9b9332ec..00000000
--- a/standalone-crawlers/TRF2/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# TRF2
-## Opções
---links : Realiza consultas com combinações de caracteres (e.g. aa *) para coletar links atualizados. Default é False.
-
---processes : Coleta dados de processos a partir dos links obtidos pela flag acima. Default é False.
-
---screen : Abre o browser na tela do computador (desativa o modo headless). Default é False.
-
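These three switches are plain boolean flags; a minimal sketch of how they can be wired with argparse, mirroring the parser defined at the bottom of crawler_trf2.py below (the help strings here are paraphrased):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--links', action='store_true',
                    help='query letter combinations and save fresh process links')
parser.add_argument('--processes', action='store_true',
                    help='follow the saved links and collect each process')
parser.add_argument('--screen', action='store_true',
                    help='open a visible browser window (turn headless mode off)')
args = parser.parse_args()    # e.g. python crawler_trf2.py --processes --screen
```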
-Exemplo de uso: - -$ python crawler_trf2.py --processes --screen \ No newline at end of file diff --git a/standalone-crawlers/TRF2/crawler_trf2.py b/standalone-crawlers/TRF2/crawler_trf2.py deleted file mode 100644 index cda98ee3..00000000 --- a/standalone-crawlers/TRF2/crawler_trf2.py +++ /dev/null @@ -1,314 +0,0 @@ -import selenium -import time -import os -import glob -import urllib.request -import matplotlib.pyplot as plt -import cv2 -import numpy as np -import pytesseract -import argparse - -from selenium import webdriver -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as ec -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import StaleElementReferenceException -from itertools import product -from string import ascii_lowercase -from PIL import Image - -SCREEN_ON = False - -# Utility functions -def init_webdriver(path_to_driver=None, use_window=False): - """ - Creates a webdriver suited for the task. - - Keyword arguments: - path_to_driver -- path to geckodriver (firefox) driver executable. Leave None if the driver is set in path. - use_window -- if True, a window of the firefox webdriver will be opened - """ - fp = webdriver.FirefoxProfile() - # "Brute force" solution to download every mime-type without asking - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("browser.download.defaultFolder", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("pdfjs.disabled", True) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk", 
"application/vnd.hzn-3d-crossword;video/3gpp;video/3gpp2;application/vnd.mseq;application/vnd.3m.post-it-notes;application/vnd.3gpp.pic-bw-large;application/vnd.3gpp.pic-bw-small;application/vnd.3gpp.pic-bw-var;application/vnd.3gp2.tcap;application/x-7z-compressed;application/x-abiword;application/x-ace-compressed;application/vnd.americandynamics.acc;application/vnd.acucobol;application/vnd.acucorp;audio/adpcm;application/x-authorware-bin;application/x-athorware-map;application/x-authorware-seg;application/vnd.adobe.air-application-installer-package+zip;application/x-shockwave-flash;application/vnd.adobe.fxp;application/pdf;application/vnd.cups-ppd;application/x-director;applicaion/vnd.adobe.xdp+xml;application/vnd.adobe.xfdf;audio/x-aac;application/vnd.ahead.space;application/vnd.airzip.filesecure.azf;application/vnd.airzip.filesecure.azs;application/vnd.amazon.ebook;application/vnd.amiga.ami;applicatin/andrew-inset;application/vnd.android.package-archive;application/vnd.anser-web-certificate-issue-initiation;application/vnd.anser-web-funds-transfer-initiation;application/vnd.antix.game-component;application/vnd.apple.installe+xml;application/applixware;application/vnd.hhe.lesson-player;application/vnd.aristanetworks.swi;text/x-asm;application/atomcat+xml;application/atomsvc+xml;application/atom+xml;application/pkix-attr-cert;audio/x-aiff;video/x-msvieo;application/vnd.audiograph;image/vnd.dxf;model/vnd.dwf;text/plain-bas;application/x-bcpio;application/octet-stream;image/bmp;application/x-bittorrent;application/vnd.rim.cod;application/vnd.blueice.multipass;application/vnd.bm;application/x-sh;image/prs.btif;application/vnd.businessobjects;application/x-bzip;application/x-bzip2;application/x-csh;text/x-c;application/vnd.chemdraw+xml;text/css;chemical/x-cdx;chemical/x-cml;chemical/x-csml;application/vn.contact.cmsg;application/vnd.claymore;application/vnd.clonk.c4group;image/vnd.dvb.subtitle;application/cdmi-capability;application/cdmi-container;application/cdmi-domain;application/cdmi-object;application/cdmi-queue;applicationvnd.cluetrust.cartomobile-config;application/vnd.cluetrust.cartomobile-config-pkg;image/x-cmu-raster;model/vnd.collada+xml;text/csv;application/mac-compactpro;application/vnd.wap.wmlc;image/cgm;x-conference/x-cooltalk;image/x-cmx;application/vnd.xara;application/vnd.cosmocaller;application/x-cpio;application/vnd.crick.clicker;application/vnd.crick.clicker.keyboard;application/vnd.crick.clicker.palette;application/vnd.crick.clicker.template;application/vn.crick.clicker.wordbank;application/vnd.criticaltools.wbs+xml;application/vnd.rig.cryptonote;chemical/x-cif;chemical/x-cmdf;application/cu-seeme;application/prs.cww;text/vnd.curl;text/vnd.curl.dcurl;text/vnd.curl.mcurl;text/vnd.crl.scurl;application/vnd.curl.car;application/vnd.curl.pcurl;application/vnd.yellowriver-custom-menu;application/dssc+der;application/dssc+xml;application/x-debian-package;audio/vnd.dece.audio;image/vnd.dece.graphic;video/vnd.dec.hd;video/vnd.dece.mobile;video/vnd.uvvu.mp4;video/vnd.dece.pd;video/vnd.dece.sd;video/vnd.dece.video;application/x-dvi;application/vnd.fdsn.seed;application/x-dtbook+xml;application/x-dtbresource+xml;application/vnd.dvb.ait;applcation/vnd.dvb.service;audio/vnd.digital-winds;image/vnd.djvu;application/xml-dtd;application/vnd.dolby.mlp;application/x-doom;application/vnd.dpgraph;audio/vnd.dra;application/vnd.dreamfactory;audio/vnd.dts;audio/vnd.dts.hd;imag/vnd.dwg;application/vnd.dynageo;application/ecmascript;application/vnd.ecowin.chart;image/vnd.fujixerox.edmics-mmr;imag
e/vnd.fujixerox.edmics-rlc;application/exi;application/vnd.proteus.magazine;application/epub+zip;message/rfc82;application/vnd.enliven;application/vnd.is-xpr;image/vnd.xiff;application/vnd.xfdl;application/emma+xml;application/vnd.ezpix-album;application/vnd.ezpix-package;image/vnd.fst;video/vnd.fvt;image/vnd.fastbidsheet;application/vn.denovo.fcselayout-link;video/x-f4v;video/x-flv;image/vnd.fpx;image/vnd.net-fpx;text/vnd.fmi.flexstor;video/x-fli;application/vnd.fluxtime.clip;application/vnd.fdf;text/x-fortran;application/vnd.mif;application/vnd.framemaker;imae/x-freehand;application/vnd.fsc.weblaunch;application/vnd.frogans.fnc;application/vnd.frogans.ltf;application/vnd.fujixerox.ddd;application/vnd.fujixerox.docuworks;application/vnd.fujixerox.docuworks.binder;application/vnd.fujitu.oasys;application/vnd.fujitsu.oasys2;application/vnd.fujitsu.oasys3;application/vnd.fujitsu.oasysgp;application/vnd.fujitsu.oasysprs;application/x-futuresplash;application/vnd.fuzzysheet;image/g3fax;application/vnd.gmx;model/vn.gtw;application/vnd.genomatix.tuxedo;application/vnd.geogebra.file;application/vnd.geogebra.tool;model/vnd.gdl;application/vnd.geometry-explorer;application/vnd.geonext;application/vnd.geoplan;application/vnd.geospace;applicatio/x-font-ghostscript;application/x-font-bdf;application/x-gtar;application/x-texinfo;application/x-gnumeric;application/vnd.google-earth.kml+xml;application/vnd.google-earth.kmz;application/vnd.grafeq;image/gif;text/vnd.graphviz;aplication/vnd.groove-account;application/vnd.groove-help;application/vnd.groove-identity-message;application/vnd.groove-injector;application/vnd.groove-tool-message;application/vnd.groove-tool-template;application/vnd.groove-vcar;video/h261;video/h263;video/h264;application/vnd.hp-hpid;application/vnd.hp-hps;application/x-hdf;audio/vnd.rip;application/vnd.hbci;application/vnd.hp-jlyt;application/vnd.hp-pcl;application/vnd.hp-hpgl;application/vnd.yamaha.h-script;application/vnd.yamaha.hv-dic;application/vnd.yamaha.hv-voice;application/vnd.hydrostatix.sof-data;application/hyperstudio;application/vnd.hal+xml;text/html;application/vnd.ibm.rights-management;application/vnd.ibm.securecontainer;text/calendar;application/vnd.iccprofile;image/x-icon;application/vnd.igloader;image/ief;application/vnd.immervision-ivp;application/vnd.immervision-ivu;application/reginfo+xml;text/vnd.in3d.3dml;text/vnd.in3d.spot;mode/iges;application/vnd.intergeo;application/vnd.cinderella;application/vnd.intercon.formnet;application/vnd.isac.fcs;application/ipfix;application/pkix-cert;application/pkixcmp;application/pkix-crl;application/pkix-pkipath;applicaion/vnd.insors.igm;application/vnd.ipunplugged.rcprofile;application/vnd.irepository.package+xml;text/vnd.sun.j2me.app-descriptor;application/java-archive;application/java-vm;application/x-java-jnlp-file;application/java-serializd-object;text/x-java-source,java;application/javascript;application/json;application/vnd.joost.joda-archive;video/jpm;image/jpeg;video/jpeg;application/vnd.kahootz;application/vnd.chipnuts.karaoke-mmd;application/vnd.kde.karbon;aplication/vnd.kde.kchart;application/vnd.kde.kformula;application/vnd.kde.kivio;application/vnd.kde.kontour;application/vnd.kde.kpresenter;application/vnd.kde.kspread;application/vnd.kde.kword;application/vnd.kenameaapp;applicatin/vnd.kidspiration;application/vnd.kinar;application/vnd.kodak-descriptor;application/vnd.las.las+xml;application/x-latex;application/vnd.llamagraphics.life-balance.desktop;application/vnd.llamagraphics.life-balance.exchange+xml;application
/vnd.jam;application/vnd.lotus-1-2-3;application/vnd.lotus-approach;application/vnd.lotus-freelance;application/vnd.lotus-notes;application/vnd.lotus-organizer;application/vnd.lotus-screencam;application/vnd.lotus-wordro;audio/vnd.lucent.voice;audio/x-mpegurl;video/x-m4v;application/mac-binhex40;application/vnd.macports.portpkg;application/vnd.osgeo.mapguide.package;application/marc;application/marcxml+xml;application/mxf;application/vnd.wolfrm.player;application/mathematica;application/mathml+xml;application/mbox;application/vnd.medcalcdata;application/mediaservercontrol+xml;application/vnd.mediastation.cdkey;application/vnd.mfer;application/vnd.mfmp;model/mesh;appliation/mads+xml;application/mets+xml;application/mods+xml;application/metalink4+xml;application/vnd.ms-powerpoint.template.macroenabled.12;application/vnd.ms-word.document.macroenabled.12;application/vnd.ms-word.template.macroenabed.12;application/vnd.mcd;application/vnd.micrografx.flo;application/vnd.micrografx.igx;application/vnd.eszigno3+xml;application/x-msaccess;video/x-ms-asf;application/x-msdownload;application/vnd.ms-artgalry;application/vnd.ms-ca-compressed;application/vnd.ms-ims;application/x-ms-application;application/x-msclip;image/vnd.ms-modi;application/vnd.ms-fontobject;application/vnd.ms-excel;application/vnd.ms-excel.addin.macroenabled.12;application/vnd.ms-excelsheet.binary.macroenabled.12;application/vnd.ms-excel.template.macroenabled.12;application/vnd.ms-excel.sheet.macroenabled.12;application/vnd.ms-htmlhelp;application/x-mscardfile;application/vnd.ms-lrm;application/x-msmediaview;aplication/x-msmoney;application/vnd.openxmlformats-officedocument.presentationml.presentation;application/vnd.openxmlformats-officedocument.presentationml.slide;application/vnd.openxmlformats-officedocument.presentationml.slideshw;application/vnd.openxmlformats-officedocument.presentationml.template;application/vnd.openxmlformats-officedocument.spreadsheetml.sheet;application/vnd.openxmlformats-officedocument.spreadsheetml.template;application/vnd.openxmformats-officedocument.wordprocessingml.document;application/vnd.openxmlformats-officedocument.wordprocessingml.template;application/x-msbinder;application/vnd.ms-officetheme;application/onenote;audio/vnd.ms-playready.media.pya;vdeo/vnd.ms-playready.media.pyv;application/vnd.ms-powerpoint;application/vnd.ms-powerpoint.addin.macroenabled.12;application/vnd.ms-powerpoint.slide.macroenabled.12;application/vnd.ms-powerpoint.presentation.macroenabled.12;appliation/vnd.ms-powerpoint.slideshow.macroenabled.12;application/vnd.ms-project;application/x-mspublisher;application/x-msschedule;application/x-silverlight-app;application/vnd.ms-pki.stl;application/vnd.ms-pki.seccat;application/vn.visio;video/x-ms-wm;audio/x-ms-wma;audio/x-ms-wax;video/x-ms-wmx;application/x-ms-wmd;application/vnd.ms-wpl;application/x-ms-wmz;video/x-ms-wmv;video/x-ms-wvx;application/x-msmetafile;application/x-msterminal;application/msword;application/x-mswrite;application/vnd.ms-works;application/x-ms-xbap;application/vnd.ms-xpsdocument;audio/midi;application/vnd.ibm.minipay;application/vnd.ibm.modcap;application/vnd.jcp.javame.midlet-rms;application/vnd.tmobile-ivetv;application/x-mobipocket-ebook;application/vnd.mobius.mbk;application/vnd.mobius.dis;application/vnd.mobius.plc;application/vnd.mobius.mqy;application/vnd.mobius.msl;application/vnd.mobius.txf;application/vnd.mobius.daf;tex/vnd.fly;application/vnd.mophun.certificate;application/vnd.mophun.application;video/mj2;audio/mpeg;video/vnd.mpegurl;video/mpeg;ap
plication/mp21;audio/mp4;video/mp4;application/mp4;application/vnd.apple.mpegurl;application/vnd.msician;application/vnd.muvee.style;application/xv+xml;application/vnd.nokia.n-gage.data;application/vnd.nokia.n-gage.symbian.install;application/x-dtbncx+xml;application/x-netcdf;application/vnd.neurolanguage.nlu;application/vnd.na;application/vnd.noblenet-directory;application/vnd.noblenet-sealer;application/vnd.noblenet-web;application/vnd.nokia.radio-preset;application/vnd.nokia.radio-presets;text/n3;application/vnd.novadigm.edm;application/vnd.novadim.edx;application/vnd.novadigm.ext;application/vnd.flographit;audio/vnd.nuera.ecelp4800;audio/vnd.nuera.ecelp7470;audio/vnd.nuera.ecelp9600;application/oda;application/ogg;audio/ogg;video/ogg;application/vnd.oma.dd2+xml;applicatin/vnd.oasis.opendocument.text-web;application/oebps-package+xml;application/vnd.intu.qbo;application/vnd.openofficeorg.extension;application/vnd.yamaha.openscoreformat;audio/webm;video/webm;application/vnd.oasis.opendocument.char;application/vnd.oasis.opendocument.chart-template;application/vnd.oasis.opendocument.database;application/vnd.oasis.opendocument.formula;application/vnd.oasis.opendocument.formula-template;application/vnd.oasis.opendocument.grapics;application/vnd.oasis.opendocument.graphics-template;application/vnd.oasis.opendocument.image;application/vnd.oasis.opendocument.image-template;application/vnd.oasis.opendocument.presentation;application/vnd.oasis.opendocumen.presentation-template;application/vnd.oasis.opendocument.spreadsheet;application/vnd.oasis.opendocument.spreadsheet-template;application/vnd.oasis.opendocument.text;application/vnd.oasis.opendocument.text-master;application/vnd.asis.opendocument.text-template;image/ktx;application/vnd.sun.xml.calc;application/vnd.sun.xml.calc.template;application/vnd.sun.xml.draw;application/vnd.sun.xml.draw.template;application/vnd.sun.xml.impress;application/vnd.sun.xl.impress.template;application/vnd.sun.xml.math;application/vnd.sun.xml.writer;application/vnd.sun.xml.writer.global;application/vnd.sun.xml.writer.template;application/x-font-otf;application/vnd.yamaha.openscoreformat.osfpvg+xml;application/vnd.osgi.dp;application/vnd.palm;text/x-pascal;application/vnd.pawaafile;application/vnd.hp-pclxl;application/vnd.picsel;image/x-pcx;image/vnd.adobe.photoshop;application/pics-rules;image/x-pict;application/x-chat;aplication/pkcs10;application/x-pkcs12;application/pkcs7-mime;application/pkcs7-signature;application/x-pkcs7-certreqresp;application/x-pkcs7-certificates;application/pkcs8;application/vnd.pocketlearn;image/x-portable-anymap;image/-portable-bitmap;application/x-font-pcf;application/font-tdpfr;application/x-chess-pgn;image/x-portable-graymap;image/png;image/x-portable-pixmap;application/pskc+xml;application/vnd.ctc-posml;application/postscript;application/xfont-type1;application/vnd.powerbuilder6;application/pgp-encrypted;application/pgp-signature;application/vnd.previewsystems.box;application/vnd.pvi.ptid1;application/pls+xml;application/vnd.pg.format;application/vnd.pg.osasli;tex/prs.lines.tag;application/x-font-linux-psf;application/vnd.publishare-delta-tree;application/vnd.pmi.widget;application/vnd.quark.quarkxpress;application/vnd.epson.esf;application/vnd.epson.msf;application/vnd.epson.ssf;applicaton/vnd.epson.quickanime;application/vnd.intu.qfx;video/quicktime;application/x-rar-compressed;audio/x-pn-realaudio;audio/x-pn-realaudio-plugin;application/rsd+xml;application/vnd.rn-realmedia;application/vnd.realvnc.bed;applicatin/vnd.recordare.musicx
ml;application/vnd.recordare.musicxml+xml;application/relax-ng-compact-syntax;application/vnd.data-vision.rdz;application/rdf+xml;application/vnd.cloanto.rp9;application/vnd.jisp;application/rtf;text/richtex;application/vnd.route66.link66+xml;application/rss+xml;application/shf+xml;application/vnd.sailingtracker.track;image/svg+xml;application/vnd.sus-calendar;application/sru+xml;application/set-payment-initiation;application/set-reistration-initiation;application/vnd.sema;application/vnd.semd;application/vnd.semf;application/vnd.seemail;application/x-font-snf;application/scvp-vp-request;application/scvp-vp-response;application/scvp-cv-request;application/svp-cv-response;application/sdp;text/x-setext;video/x-sgi-movie;application/vnd.shana.informed.formdata;application/vnd.shana.informed.formtemplate;application/vnd.shana.informed.interchange;application/vnd.shana.informed.package;application/thraud+xml;application/x-shar;image/x-rgb;application/vnd.epson.salt;application/vnd.accpac.simply.aso;application/vnd.accpac.simply.imp;application/vnd.simtech-mindmapper;application/vnd.commonspace;application/vnd.ymaha.smaf-audio;application/vnd.smaf;application/vnd.yamaha.smaf-phrase;application/vnd.smart.teacher;application/vnd.svd;application/sparql-query;application/sparql-results+xml;application/srgs;application/srgs+xml;application/sml+xml;application/vnd.koan;text/sgml;application/vnd.stardivision.calc;application/vnd.stardivision.draw;application/vnd.stardivision.impress;application/vnd.stardivision.math;application/vnd.stardivision.writer;application/vnd.tardivision.writer-global;application/vnd.stepmania.stepchart;application/x-stuffit;application/x-stuffitx;application/vnd.solent.sdkm+xml;application/vnd.olpc-sugar;audio/basic;application/vnd.wqd;application/vnd.symbian.install;application/smil+xml;application/vnd.syncml+xml;application/vnd.syncml.dm+wbxml;application/vnd.syncml.dm+xml;application/x-sv4cpio;application/x-sv4crc;application/sbml+xml;text/tab-separated-values;image/tiff;application/vnd.to.intent-module-archive;application/x-tar;application/x-tcl;application/x-tex;application/x-tex-tfm;application/tei+xml;text/plain;application/vnd.spotfire.dxp;application/vnd.spotfire.sfs;application/timestamped-data;applicationvnd.trid.tpt;application/vnd.triscape.mxs;text/troff;application/vnd.trueapp;application/x-font-ttf;text/turtle;application/vnd.umajin;application/vnd.uoml+xml;application/vnd.unity;application/vnd.ufdl;text/uri-list;application/nd.uiq.theme;application/x-ustar;text/x-uuencode;text/x-vcalendar;text/x-vcard;application/x-cdlink;application/vnd.vsf;model/vrml;application/vnd.vcx;model/vnd.mts;model/vnd.vtu;application/vnd.visionary;video/vnd.vivo;applicatin/ccxml+xml,;application/voicexml+xml;application/x-wais-source;application/vnd.wap.wbxml;image/vnd.wap.wbmp;audio/x-wav;application/davmount+xml;application/x-font-woff;application/wspolicy+xml;image/webp;application/vnd.webturb;application/widget;application/winhlp;text/vnd.wap.wml;text/vnd.wap.wmlscript;application/vnd.wap.wmlscriptc;application/vnd.wordperfect;application/vnd.wt.stf;application/wsdl+xml;image/x-xbitmap;image/x-xpixmap;image/x-xwindowump;application/x-x509-ca-cert;application/x-xfig;application/xhtml+xml;application/xml;application/xcap-diff+xml;application/xenc+xml;application/patch-ops-error+xml;application/resource-lists+xml;application/rls-services+xml;aplication/resource-lists-diff+xml;application/xslt+xml;application/xop+xml;application/x-xpinstall;application/xspf+xml;application/vnd.mozilla
.xul+xml;chemical/x-xyz;text/yaml;application/yang;application/yin+xml;application/vnd.ul;application/zip;application/vnd.handheld-entertainment+xml;application/vnd.zzazz.deck+xml") - options = Options() - - if not use_window: - options.headless = True - - if path_to_driver is None: - driver = webdriver.Firefox(options=options, firefox_profile=fp) - driver.set_page_load_timeout(30) - return driver - else: - driver = webdriver.Firefox(executable_path=path_to_driver, options=options, firefox_profile=fp) - driver.set_page_load_timeout(30) - return driver - -def get_captcha(driver, download_path='captchas/captcha.png'): - """Takes a screenshot of the current captcha page, crops the captcha and saves it.""" - element = driver.find_element_by_xpath('//*[@id="imgCaptcha"]') - location = element.location - size = element.size - driver.save_screenshot('image.png') - x = location['x'] - y = location['y'] - width = location['x'] + size['width'] - height = location['y'] + size['height'] - im = Image.open('image.png') - im = im.crop((int(x), int(y), int(width), int(height))) - im.save('captchas/captcha.png') - - -def guess_captcha(image_path='captchas/captcha.png'): - """Aplies a mask to captcha image and uses OCR to guess the captcha code.""" - img = cv2.imread(image_path) - result = img.copy() - image = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) - lower = np.array([0,220,50]) - upper = np.array([0,255,255]) - mask = cv2.inRange(image, lower, upper) - result = cv2.bitwise_and(img, img, mask=mask) - result = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY) - return pytesseract.image_to_string(result) - -def save_page_source(driver, save_path, outer_folder, process_code): - """Saves the page source HTML.""" - html = driver.find_element_by_id('divInfraAreaProcesso').get_attribute('outerHTML') - with open('{}/{}/{}/{}_source.html'.format(save_path, outer_folder, process_code, process_code), 'w') as f: - f.write(html) - -def get_process_code(driver): - """Obtains and processes the process code for folder creation purposes.""" - process_code = driver.find_element_by_id('txtNumProcesso').get_attribute('innerHTML') - process_code = process_code.replace('.', '') - process_code = process_code.replace('-', '') - return process_code - -def save_attachments(driver, save_path, outer_folder, process_code): - """Tries to find attachments and save them.""" - list_attachments = driver.find_elements_by_class_name('infraLinkDocumento') - if list_attachments: - attachment_links = [entry.get_attribute('href') for entry in list_attachments] - for j, attachment in enumerate(attachment_links): - try: # If file, download and wait for timeout. 
- # "Fix" for https://github.com/mozilla/geckodriver/issues/1065 - driver.get(attachment) - except: # Move downloaded file (gets latest file and moves it) - move_from = os.path.join(os.getcwd(), 'tmp') - list_of_files = glob.glob(move_from + "/*") - latest_file = max(list_of_files, key=os.path.getctime) - file_extension = latest_file.split('.')[-1] - move_to = '{}/{}/{}/{}_attachment_{}.{}'.format(save_path, outer_folder, process_code, process_code, j, file_extension) - os.rename(latest_file, move_to) - print('Attachment moved') - continue - try: - driver.find_element_by_tag_name('iframe') - WebDriverWait(driver, 10).until(ec.frame_to_be_available_and_switch_to_it("conteudoIframe")) - except: - pass - - with open('{}/{}/{}/{}_attachment_{}.html'.format(save_path, outer_folder, process_code, process_code, j), 'w', encoding='utf-8') as f: - f.write(driver.page_source) - -def get_links(process_page, save_path='processlinks.txt'): - """ - Generate combinations of letters and use as queries. - Search returns CPF/CPNJ and we save the links for later use. - - Keyword arguments: - process_page -- link with main page - save_path -- path to text file containing links - """ - keywords = [''.join(i) for i in product(ascii_lowercase, repeat = 2)] - waitmore = set('aeioumy') - driver = init_webdriver(use_window=SCREEN_ON) - driver.get(process_page) - - i = 1 - for query in keywords: - attempt = 0 - print(i) - if (i % 10 == 1): # Captcha shows up each 10 queries in this case - time.sleep(5) - driver = break_captcha(driver, process_page) - name_field = driver.find_element_by_id('txtStrParte') - name_field.click() - name_field.clear() - name_field.send_keys('{} *'.format(query)) - else: - name_field = driver.find_element_by_id('txtStrParte') - name_field.click() - name_field.clear() - name_field.send_keys('{} *'.format(query)) - - name_field.send_keys(Keys.RETURN) - - print('Query: {}'.format(query)) - i += 1 - if not waitmore.isdisjoint(query): # Check if bigram contains troublesome characters, wait longer if so - wait = 50 - else: - wait = 20 - - try: - time.sleep(3) - WebDriverWait(driver, wait).until(ec.presence_of_element_located((By.CLASS_NAME, 'infraCaption'))) - except: - print('No results') - continue - while True: # Keep trying until page loads completely - print("Attempt {}, waiting for navigation to load...".format(attempt)) - attempt += 1 - time.sleep(1) - - if attempt == 100: - print("Something went wrong, page not loading. 
Please try again.") - exit() - try: - print('Results loaded') - entries = driver.find_elements_by_tag_name('tr')[1:] # First element is table header - print('Entries found: {}'.format(len(entries))) - links = [row.find_element_by_tag_name('a').get_attribute('href') for row in entries] - print('Links obtained: {}'.format(len(links))) - with open(save_path, 'a+') as f: - for link in links: - f.write('{}\n'.format(link)) - except NoSuchElementException: - print("NoSuchElementException") - continue - except StaleElementReferenceException: - print("StaleElementReferenceException") - continue - break - -def explore_links(driver, process_links, save_path, subfolders=100): - """Explore the CPF/CNPJ link and visit/save each listed process.""" - for link in process_links: - try: - driver.get(link) - # Get process code for naming purposes - process_code = get_process_code(driver) - outer_folder = int(process_code) % subfolders - except: - print('Could not find process number, moving on to next entry') - print('Link with issue at process level (saved to failed.txt):') - print(link) - with open('failed.txt', 'a+') as f: - f.write(link) - f.write('\n') - continue - # Create process dir - try: - os.makedirs('{}/{}/{}'.format(save_path, outer_folder, process_code)) - except OSError: - print('Creation of the directory {} failed'.format(process_code)) - break # Duplicate entry - # Check for additional events: - # Sometimes we'll need to click a red button in order to show the complete info - try: - list_all = driver.find_element(By.XPATH, "//a[@style='color:red;']") - driver.get(list_all.get_attribute('href')) - except: - pass - # Write page source - try: - save_page_source(driver, save_path, outer_folder, process_code) - except: - print('Could not find process info, moving on to next entry') - print('Link with issue at info level (saved to failed.txt):') - print(link) - with open('failed.txt', 'a+') as f: - f.write(link) - f.write('\n') - continue - # Check for attached files - save_attachments(driver, save_path, outer_folder, process_code) - -def break_captcha(driver, link): - """Keeps trying to break captcha.""" - attempt = 1 - captcha_check = True - while captcha_check: - print('Breaking captcha. 
Attempt: {}'.format(attempt)) - driver.close() - driver.quit() - driver = init_webdriver(use_window=SCREEN_ON) - driver.get(link) - attempt += 1 - get_captcha(driver) - - # Trying to break captcha with "informed" guesses - captcha = guess_captcha() - captcha = captcha.replace('l', '1') - captcha = captcha.replace('S', '9') - - captcha_field = driver.find_element_by_id('txtCaptcha') - captcha_field.send_keys(captcha) - captcha_field.send_keys(Keys.RETURN) - time.sleep(1) - captcha_check = driver.find_elements_by_id('txtCaptcha') - return driver - -def crawler(links_path='processlinks.txt', save_path='../processos', initial_line=0): - """Crawl through list of links and download necessary files.""" - i = 0 # Line count - driver = init_webdriver(use_window=SCREEN_ON) - with open('processlinks.txt', 'r') as f: - for name in f: - print(i, name) - if (i % 200 == 0 and i >= initial_line): # Close the browser to avoid memory issues - print(i) - driver = break_captcha(driver, name) - if (i < initial_line): # Used to skip to a particular line in links file - i += 1 - continue - i += 1 - name = name.strip('\n') - driver.get(name) - try: - process_entries = [entry for entry in driver.find_elements_by_tag_name('td') if entry.find_elements_by_tag_name('a')] - process_links = [entry.find_element_by_tag_name('a').get_attribute('href') for entry in process_entries] - # process_code = driver.find_element_by_id('txtNumProcesso').get_attribute('innerHTML') - except: - print('Could not find process number, moving on to next entry') - print('Link with issue at name level (saved to failed.txt):') - print(name) - with open('failed.txt', 'a+') as f: - f.write(name) - f.write('\n') - continue - explore_links(driver, process_links, save_path) - - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--links', dest='links', action='store_true', - default=False, help='query for new links and save them to text file') - parser.add_argument('--processes', dest='processes', action='store_true', - default=False, help='follow link file to collect processes') - parser.add_argument('--screen', dest='screen', action='store_true', - default=False, help='turn headless mode off') - - args = parser.parse_args() - - process_page = 'https://eproc.trf2.jus.br/eproc/externo_controlador.php?acao=processo_consulta_publica&acao_origem=&acao_retorno=processo_consulta_publica' - if args.screen: - SCREEN_ON = True - if args.links: - get_links(process_page) - if args.processes: - crawler(initial_line=0) - \ No newline at end of file diff --git a/standalone-crawlers/TRF2n2/crawler.py b/standalone-crawlers/TRF2n2/crawler.py deleted file mode 100644 index 4cab8e5f..00000000 --- a/standalone-crawlers/TRF2n2/crawler.py +++ /dev/null @@ -1,500 +0,0 @@ -import selenium -import os -import shutil -import sys -import time - -from joblib import Parallel, delayed - -import urllib.parse - -from selenium import webdriver -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver import ActionChains -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC - -FOLDER_PATH = "coleta" - -JTR_IDENTIFIER = 402 -TRF2_URL = "http://portal.trf2.jus.br/portal/consulta/cons_procs.asp" - -FIRST_YEAR = 2014 -LAST_YEAR = 2020 - -ORIGINS = [0, 9999] -MAX_SEQ = 9999999 - -#DEBUG = True 
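The constants just above (JTR_IDENTIFIER = 402, ORIGINS, MAX_SEQ) parameterise the CNJ-format process numbers NNNNNNN-DD.AAAA.J.TR.OOOO that this crawler enumerates; verif_code and build_from_data further down compute the check pair DD as 98 − (base·100 mod 97), the MOD 97-10 scheme also used by calcula_mod97 in the TJMG utilities. A self-contained sketch of the same construction (the function name is illustrative):

```python
def cnj_process_number(seq: int, year: int, jtr: int, origin: int) -> str:
    """NNNNNNN-DD.AAAA.J.TR.OOOO with DD = 98 - (base * 100 % 97)."""
    base = int(f"{seq:07d}{year:04d}{jtr:03d}{origin:04d}")
    dd = 98 - (base * 100) % 97
    return f"{seq:07d}-{dd:02d}.{year:04d}.{jtr // 100}.{jtr % 100:02d}.{origin:04d}"

# J=4 (federal courts), TR=02 (2nd region) gives JTR_IDENTIFIER == 402:
print(cnj_process_number(1, 2014, 402, 0))   # 0000001-02.2014.4.02.0000
```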
-#PRINTING_INTERVAL = 5000 - -MIN_MISS_SEQ = 100 - -RETRY_LIMIT = 100 -WAIT_INTERVAL = 2 - -DOWNLOAD_ATTACHMENTS = True - -DOWNLOAD_FOLDER = os.path.join("/datalake/ufmg/coletatrf2", FOLDER_PATH) -TMP_DOWNLOAD = os.path.join(os.getcwd(), "tmp") - -### GENERAL UTILS - -def wait_for_file(folder_path, pooling_interval): - """ - Waits until a downloaded file shows up in the temporary folder - - :param folder_path: folder in which to search - :return: the downloaded file name - """ - downloaded_file = None - - time.sleep(10) - - while not downloaded_file: - time.sleep(pooling_interval) - for file in os.listdir(folder_path): - # Don't consider Firefox temporary files - if os.path.splitext(file)[1] != ".part": - downloaded_file = file - - return downloaded_file - - -def download_page(driver, file_name): - """ - Downloads the current page - - :param driver: a Firefox driver for Selenium inside the desired page - :param file_name: name of the file to create - """ - with open(file_name, 'wb') as html_file: - html_file.write(driver.page_source.encode('iso-8859-1')) - - -def bin_search(driver, year, origin): - """ - Checks how many entries we should "scan" - - :param driver: a Firefox driver for Selenium - :param year: the year for which we're checking - :param origin: the origin for which we're checking - :return: last position which should be checked manually - """ - begin = 0 - end = MAX_SEQ - last_hit = -1 - while begin < end: - mid = (begin + end) // 2 - # check at least MIN_MISS_SEQ entries before declaring a miss - all_miss = True - for i in range(mid, min(mid + MIN_MISS_SEQ, end + 1, MAX_SEQ + 1)): - code = build_from_data(year, origin, i) - if check_number(driver, code): - all_miss = False - last_hit = i - # not worth checking all the rest to avoid using a break stmt - break - - REVERSE = False - if all_miss: - if not REVERSE: - end = mid - 1 - else: - begin = i + 1 - else: - if not REVERSE: - begin = last_hit + 1 - else: - end = mid - 1 - - return last_hit + MIN_MISS_SEQ + 1 - - -### DRIVER INITIALIZATION AND CONNECTION - -def init_driver(tmp_folder, headless=True, timeout=30): - """ - Initializes the Firefox Driver - - :param headless: temp folder used for downloads - :param headless: if set to true, no window is rendered by Selenium - :param timeout: number of seconds to wait for a page to load - :return: the configured driver - """ - - if not os.path.exists(tmp_folder): - os.makedirs(tmp_folder) - - # clear tmp directory - for file in os.listdir(tmp_folder): - os.remove(os.path.join(tmp_folder, file)) - - fp = webdriver.FirefoxProfile() - # Download files inside a folder called tmp in the current dir - fp.set_preference("browser.download.folderList", 2) - fp.set_preference("browser.download.dir", tmp_folder) - fp.set_preference("browser.download.downloadDir", tmp_folder) - fp.set_preference("browser.download.defaultFolder", tmp_folder) - fp.set_preference("pdfjs.disabled", True) - fp.set_preference("plugin.scan.Acrobat", "99.0"); - fp.set_preference("plugin.scan.plid.all", False); - fp.set_preference("browser.download.manager.showWhenStarting", False) - fp.set_preference("browser.download.manager.focusWhenStarting", False) - fp.set_preference("browser.download.manager.closeWhenDone", True) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf"); - #fp.set_preference("dom.max_script_run_time", 1) - - options = Options() - options.headless = headless - - driver = webdriver.Firefox(options=options, firefox_profile=fp) - driver.set_page_load_timeout(timeout) - return 
driver - - -def load_or_retry(driver, url): - """ - Tries to GET the supplied url using the driver - - :param driver: the driver to access the page from - :param url: url to load - :return: the driver in the desired url - """ - tries = 0 - - # tries the required number of times - while tries < RETRY_LIMIT: - try: - # leaves the loop if URL is correctly loaded - driver.get(url) - break - except: - tries += 1 - time.sleep(WAIT_INTERVAL * tries) - - if tries >= RETRY_LIMIT: - raise Exception("Couldn't reach {}".format(url)) - - return driver - - -### CODE PROCESSING - -def verif_code(num_proc): - """ - Calculates the verificaton code for a given process number - - :param num_proc: the process number for which to generate the code - :return: the verif number for the supplied code - """ - num_proc = int(num_proc) - val = 98 - ((num_proc * 100) % 97) - - return val - - -def build_from_data(year, origin, sequence): - """ - Generates a process code from its parameters - - :param year: year of the related process - :param origin: identifier for the originating unit for this process - :param sequence: sequential identifier for the process - :return: the corresponding process with its verifying digits - """ - - p_fmt = "{:07d}{:04d}{:03d}{:04d}" - ver_code = verif_code(p_fmt.format(sequence, year, JTR_IDENTIFIER, origin)) - - res_fmt = "{:07d}{:02d}{:04d}{:03d}{:04d}" - return res_fmt.format(sequence, ver_code, year, JTR_IDENTIFIER, origin) - - -def generate_codes(first_year, last_year): - """ - Generates all the codes to be downloaded - - :return: a generator instance which iterates over all possible codes - """ - for year in range(first_year, last_year + 1): - for origin in ORIGINS: - for n in range(0, MAX_SEQ + 1): - yield build_from_data(year, origin, n) - - - -### INTERACTION WITH TRF2 WEBPAGE - -def load_process_page(driver, num_proc): - """ - Fills in the form and submits it to go to a desired process - - :param driver: a Firefox driver for Selenium - :param num_proc: the process number to try - :return: the driver in the desired location - """ - driver = load_or_retry(driver, TRF2_URL) - - # process number input - loaded = False - while not loaded: - try: - WebDriverWait(driver, 10).until( - EC.presence_of_element_located( - (By.NAME, "NumProc") - ) - ) - elem = driver.find_element_by_name("NumProc") - loaded = True - except: - # there has been some internal server error, try again - print("Server error, trying again") - driver = load_or_retry(driver, TRF2_URL) - - elem.clear() - elem.send_keys(num_proc) - - # Checks the box to download everything - elem = driver.find_element_by_name("baixado") - elem.click() - - # CAPTCHA "solving" - elem = driver.find_element_by_id("gabarito") - catpcha_sol = elem.get_attribute("value") - - # Entering the CAPTCHA result - if len(catpcha_sol) > 0: - # Writes the result - elem = driver.find_element_by_name("captchacode") - elem.send_keys(catpcha_sol) - else: - # Checks the "none" box - elem = driver.find_element_by_name("nenhum") - elem.click() - - # Search - elem = driver.find_element_by_name("Pesquisar") - elem.click() - WebDriverWait(driver, 20).until(EC.staleness_of(elem)) - - return driver - - -def process_page(driver, num_proc, tmp_folder, down_folder): - """ - Processes a hit page to download all iframes - - :param driver: a Firefox driver for Selenium, in a process detail page - :param num_proc: the process number for the open page - :param tmp_folder: folder for temporary downloads - :param down_folder: folder for collection storage - """ - - # 
Creates a folder for this process' files - proc_folder = os.path.join(down_folder, str(num_proc)) - if not os.path.exists(proc_folder): - os.mkdir(proc_folder) - - # list with attachment links - att_links = [] - - print("Selecting iframe links") - # selects all iframe links - iframe_sel = "/html/body/form/center/div/table[3]/tbody/tr[1]/td/table/tbody/tr/td[2]/p/font/span/a" - els = driver.find_elements_by_xpath(iframe_sel) - counter = 0 - for el in els: - # open the iframe - el.click() - counter += 1 - - # wait for it to load and then switch into it - time.sleep(2) - driver.switch_to.frame("dir") - print("Tab number {}".format(counter)) - - if counter == 1: - # the first iframe has paging in it - - prev_btn = driver.find_elements_by_link_text("Anterior") - # rewind to first page - while len(prev_btn) > 0: - button = prev_btn[0] - button.click() - WebDriverWait(driver, 10).until(EC.staleness_of(button)) - prev_btn = driver.find_elements_by_link_text("Anterior") - - # now we're at the first page - file_name = str(counter) + "-1.html" - file_name = os.path.join(proc_folder, file_name) - download_page(driver, file_name) - - next_btn = driver.find_elements_by_link_text("Próxima") - # follow to next pages - page_counter = 1 - while len(next_btn) > 0: - page_counter += 1 - button = next_btn[0] - button.click() - WebDriverWait(driver, 10).until(EC.staleness_of(button)) - file_name = str(counter) + "-" + str(page_counter) + ".html" - file_name = os.path.join(proc_folder, file_name) - download_page(driver, file_name) - next_btn = driver.find_elements_by_link_text("Próxima") - - elif counter == 6: - # the sixth frame has attachments, save its links if needed - inner_frame = driver.find_elements_by_css_selector("iframe") - driver.switch_to.frame(inner_frame[0]) - - # Download page - file_name = str(counter) + ".html" - file_name = os.path.join(proc_folder, file_name) - download_page(driver, file_name) - - if DOWNLOAD_ATTACHMENTS: - attachments = driver.find_elements_by_css_selector("a.link-under") - - if len(attachments) > 0: - # create folder for attachments - att_folder = os.path.join(proc_folder, "attachments") - if not os.path.exists(att_folder): - os.mkdir(att_folder) - - # save each attachment - for i in attachments: - i.click() - time.sleep(2) ################################# LEFT - - window_before = driver.window_handles[0] - window_after = driver.window_handles[-1] - driver.switch_to.window(window_after) - - downloaded_file = wait_for_file(tmp_folder, 2) - driver.close() - - new_location = os.path.join(att_folder, downloaded_file) - previous_location = os.path.join(tmp_folder, downloaded_file) - shutil.move(previous_location, new_location) - - # Wait for file to be removed - #wait = WebDriverWait(driver, 30) - #wait.until(lambda _: len(os.listdir(tmp_folder)) == 0) - - driver.switch_to.window(window_before) - - time.sleep(10) # wait for all files to be available - # check if all attachments were downloaded - #print("Assert for {} ".format(num_proc)) - assert len(os.listdir(att_folder)) == len(attachments) - - else: - # Just download the inner iframe's contents - inner_frame = driver.find_elements_by_css_selector("iframe") - if len(inner_frame) > 0: - driver.switch_to.frame(inner_frame[0]) - file_name = str(counter) + ".html" - file_name = os.path.join(proc_folder, file_name) - download_page(driver, file_name) - - driver.switch_to.default_content() - #time.sleep(2) ###################### REMOVED - - """if not DOWNLOAD_ATTACHMENTS: - # save links in a text file - file_name = 
os.path.join(proc_folder, "attachments.txt") - with open(file_name, 'w') as txt_file: - txt_file.write("\n".join(att_links))""" - - -def check_number(driver, num_proc): - """ - Checks if a given number is a hit or a miss - - :param driver: a Firefox driver for Selenium - :param num_proc: the process number to try - :return: true if the checked number was hit, false if it missed - """ - - driver = load_process_page(driver, num_proc) - - if "cons_procs" in driver.current_url: - return False - else: - return True - - -def access_process_url(driver, num_proc, tmp_folder, down_folder): - """ - Checks if it hit a detail page, and if so, download the iframes - - :param driver: a Firefox driver for Selenium - :param num_proc: the process number to try - :param tmp_folder: folder for temporary downloads - :param down_folder: folder for collection storage - :return: true if the checked number was hit, false if it missed - """ - - print("Loading process page") - driver = load_process_page(driver, num_proc) - - if "cons_procs" in driver.current_url: - return False - else: - print("Entry found, processing page") - #print("*** Found process for {}".format(num_proc)) - process_page(driver, num_proc, tmp_folder, down_folder) - return True - - -def run_year(year): - print("Begin year {}".format(year)) - tmp_folder = os.path.join(TMP_DOWNLOAD, str(year)) - down_folder = os.path.join(DOWNLOAD_FOLDER, str(year)) - - if not os.path.exists(down_folder): - os.makedirs(down_folder) - - driver = init_driver(tmp_folder, True) - - hit = 0 - for origin in ORIGINS: - #print("* Year {}:".format(year)) - upper_lim = bin_search(driver, year, origin) - #print("** Verify from 0 to {}".format(upper_lim)) - for i in range(0, upper_lim): - code = build_from_data(year, origin, i) - processed = False - while not processed: - try: - print("{} --- Begin processing {}".format(year, code)) - if access_process_url(driver, code, tmp_folder, down_folder): - hit += 1 - print("{} --- Done processing {}".format(year, code)) - processed = True - except: - print("Error at {}, retrying...".format(year, code)) - pass - sys.stdout.flush() - - print("Year {} had {} hits".format(year, hit)) - sys.stdout.flush() - - time.sleep(5) - - driver.close() - -def main(): - if not os.path.exists(DOWNLOAD_FOLDER): - os.makedirs(DOWNLOAD_FOLDER) - #run_year(2019) - #Parallel(n_jobs=-1)(delayed(run_year)(year) for year in reversed(range(FIRST_YEAR, LAST_YEAR + 1) )) - for year in reversed(range(FIRST_YEAR, LAST_YEAR + 1)): - run_year(year) - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/TRF2n2/scrapycrawler.py b/standalone-crawlers/TRF2n2/scrapycrawler.py deleted file mode 100644 index 2d27ad71..00000000 --- a/standalone-crawlers/TRF2n2/scrapycrawler.py +++ /dev/null @@ -1,124 +0,0 @@ -import json -import logging -import os -import time -import scrapy - -from scrapy.crawler import CrawlerProcess - -JTR_IDENTIFIER = 402 -TRF2_URL = "http://portal.trf2.jus.br/portal/consulta/cons_procs.asp" - -FIRST_YEAR = 1989 -LAST_YEAR = 2019 - -ORIGINS = [0, 9999] -MAX_SEQ = 9999999 - -FOLDER_PATH = "coleta" - -# Functions to generate the process codes - -def verif_code(num_proc): - """ - Calculates the verificaton code for a given process number - - :param num_proc: the process number for which to generate the code - :returns: the verif number for the supplied code - """ - num_proc = int(num_proc) - val = 98 - ((num_proc * 100) % 97) - - return val - - -def build_from_data(year, origin, sequence): - """ - Generates a process code from its 
parameters - - :param year: year of the related process - :param origin: identifier for the originating unit for this process - :param sequence: sequential identifier for the process - :returns: the corresponding process with its verifying digits - """ - - p_fmt = "{:07d}{:04d}{:03d}{:04d}" - ver_code = verif_code(p_fmt.format(sequence, year, JTR_IDENTIFIER, origin)) - - res_fmt = "{:07d}{:02d}{:04d}{:03d}{:04d}" - return res_fmt.format(sequence, ver_code, year, JTR_IDENTIFIER, origin) - - -def generate_codes(first_year, last_year, origins, max_seq): - """ - Generates all the codes to be downloaded - - :returns: a generator instance which iterates over all possible codes - """ - for year in range(first_year, last_year + 1): - for origin in origins: - for n in range(0, max_seq + 1): - yield build_from_data(year, origin, n) - -class Trf2_2Crawler(scrapy.Spider): - name = "trf2_2" - - def __init__(self): - logging.getLogger('scrapy').setLevel(logging.WARNING) - - if not os.path.exists(FOLDER_PATH): - os.makedirs(FOLDER_PATH) - - - def start_requests(self): - req_body = "Botao=Pesquisar&UsarCaptcha=S&gabarito=4&resposta=4&Localidade=0&baixado=0&CodLoc=&NumProc={}++&TipDocPess=0&captchacode=4" - - count = 0 - for code in generate_codes(FIRST_YEAR, LAST_YEAR, ORIGINS, MAX_SEQ): - count += 1 - req = scrapy.Request(TRF2_URL, method='POST', - body=req_body.format(code), - headers={"Content-Type": "application/x-www-form-urlencoded"}, - callback=self.parse_entry) - req.meta['count'] = count - yield req - - def parse_entry(self, response): - count = response.meta['count'] - - page_cont = response.text - - urls = response.css('iframe::attr(src)').extract() - framecount = 0 - for url in urls: - framecount += 1 - full_url = response.urljoin(url) - file_name = str(count) + "-" + str(framecount) + ".html" - page_cont = page_cont.replace(url, file_name) - req = scrapy.Request(full_url, callback=self.parse_iframe) - req.meta['file_name'] = file_name - yield req - - if len(urls) > 0: - with open(os.path.join(FOLDER_PATH, str(count) + ".html"), 'w') as html_file: - html_file.write(page_cont) - - - def parse_iframe(self, response): - proc_code = response.css("#Procs option::text").get() - with open(os.path.join(FOLDER_PATH, response.meta['file_name']), 'w') as html_file: - html_file.write(response.text) - - yield {'code': proc_code} - - -def main(): - process = CrawlerProcess({ - 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0' - }) - - process.crawl(Trf2_2Crawler) - process.start() - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/TRF5/html_retrieve.py b/standalone-crawlers/TRF5/html_retrieve.py deleted file mode 100644 index 463de835..00000000 --- a/standalone-crawlers/TRF5/html_retrieve.py +++ /dev/null @@ -1,55 +0,0 @@ -import urllib.request -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.common.exceptions import NoSuchElementException - -url_base = 'http://www5.trf5.jus.br/precatorio/' - -# define xpaths válidos em páginas de processos nos dois formatos existentes -xpath_model1 = '//*[@id="wrapper"]/h1' -xpath_model2 = '/html/body/p[2]' - -# opções para não abrir navegador -chrome_options = Options() -chrome_options.add_argument("--headless") -driver = webdriver.Chrome(chrome_options=chrome_options) - -# arquivo para armazenar as páginas que tiveram exceção -excs = open('exceptions.txt', 'a') - -# checa se um dado xpath existe -def check_exists_by_xpath(xpath, url): - driver.get(url) - try: - 
driver.find_element_by_xpath(xpath) - except NoSuchElementException: - return False - return True - -# URLs de forma sequencial -i = 1 - -# contador de exceções -exc_counter = 0 - -while True: - url = url_base + str(i) - - if exc_counter >= 3: - break - - # se a página contiver um xpath específico de um dos dois modelos, ela é armazenada - if check_exists_by_xpath(xpath_model1, url) or check_exists_by_xpath(xpath_model2, url): - - exc_counter = 0 - - filename = 'precatorio-' + str("{:06d}".format(i)) + '.html' - path = './pages/' + filename - - # guarda o conteúdo html da página acessada - urllib.request.urlretrieve(url, path) - i += 1 - else: - excs.write(url + '\n') - exc_counter += 1 - i += 1 \ No newline at end of file diff --git a/standalone-crawlers/TRF5/pages/precatorio-000001.html b/standalone-crawlers/TRF5/pages/precatorio-000001.html deleted file mode 100644 index 52c7e67c..00000000 --- a/standalone-crawlers/TRF5/pages/precatorio-000001.html +++ /dev/null @@ -1,191 +0,0 @@ - - - - - - - - - Tribunal Regional Federal da 5ª Região - Consulta Precatório - - - - - - - - -
- - - -
-
- -
-
-
-

Consulta Precatório

-
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - -
PROCESSO
-
8905000010 PRECATORIO 1 PROC. ORIG. 0009541977
-
-
DADOS
-
     ASSUNTO....................: DESCONHECIDO
-     ORIGEM.......................: JUSTIÇA FEDERAL
-     LOCALIDADE...............: NATAL - RN
-     TIPO............................: ENTIDADE
-     DATA AUTUACAO........: 08/08/1988
-     DATA DISTRIB .............: 15/08/1988
-     DATA LIQUIDACAO......: 01/05/1989
-     ELEMENTO DESPESA.: 
-     NATUREZA ALIMENTAR
-
-
PARTES
-
     Reqte: JOAO EVANGELISTA BEZERRA e outros
-     Adv: NICIA MARIA GOMES 
-     Reqdo: UFRN - UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE 
-     Adv: STELA GURGEL GUERRA e outro
-     Deprec: JUIZO FEDERAL DA 1A.VARA-RN 
-
-
FASES
-
     DATA: 19/07/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     DATA: 19/07/1989   FASE: DESPACHO
-     OBSERVACAO: "Oficie-se ao orgão requerido."
-     DATA: 27/07/1989   FASE: OFICIO
-     OBSERVACAO: Nº 280-GP, SOLICITANDO VERBA A UFRN.
-     DATA: 16/08/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     DATA: 13/09/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     OBSERVACAO: COM ANALISE DO ART. 327-RI.
-     DATA: 26/09/1989   FASE: DESPACHO
-     OBSERVACAO: Face a informação de fls. e, tão logo o órgão requerido ...
-     DATA: 17/10/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     OBSERVACAO: DESPACHO DA FASE RETRO.
-     DATA: 25/03/1992   FASE: PRECATORIO DEPOSITADO
-     DATA: 07/12/1992   FASE: REMETIDO À SOF PARA PAGAMENTO
-     DATA: 07/12/1992   FASE: REMETIDO À SOF PARA PAGAMENTO
-     DATA: 25/03/1992   FASE: PRECATORIO DEPOSITADO
-     DATA: 03/09/1997   FASE: VISTAS A AGU
-     OBSERVACAO:  LOTE 3
-     DATA: 12/09/1997   FASE: DEVOLVIDO PELA AGU
-     DATA: 22/02/1999   FASE: RETORNO DA DILIGÊNCIA
-     DATA: 12/05/1999   FASE: PRECATORIO DEPOSITADO
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 000
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 000
-     DATA: 19/08/2002   FASE: DEPÓSITO EFETIVADO
-     OBSERVACAO: VALOR LIQ .
-     DATA: 12/05/2003   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: N
-     DATA: 31/07/2003   FASE: ARQUIVADO - PASTA
-
-
-
-
- - - -
- - - - diff --git a/standalone-crawlers/TRF5/pages/precatorio-000002.html b/standalone-crawlers/TRF5/pages/precatorio-000002.html deleted file mode 100644 index a4dd4922..00000000 --- a/standalone-crawlers/TRF5/pages/precatorio-000002.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - - - Tribunal Regional Federal da 5ª Região - Consulta Precatório - - - - - - - - -
- - - -
-
- -
-
-
-

Consulta Precatório

-
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - -
PROCESSO
-
8905000029 PRECATORIO 2 PROC. ORIG. 0009605339
-
-
DADOS
-
     ASSUNTO....................: DESCONHECIDO
-     ORIGEM.......................: JUSTIÇA FEDERAL
-     LOCALIDADE...............: NATAL - RN
-     TIPO............................: ENTIDADE
-     DATA AUTUACAO........: 08/08/1988
-     DATA DISTRIB .............: 15/08/1988
-     DATA LIQUIDACAO......: 19/01/1988
-     ELEMENTO DESPESA.: 
-     NATUREZA ALIMENTAR
-
-
PARTES
-
     Reqte: EVALDO CABRAL DA SILVA e outros
-     Adv: NICIA MARIA GOMES 
-     Reqdo: UFRN - UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE 
-     Adv: NILSON ROBERTO CAVALCANTI MELO 
-     Deprec: JUIZO FEDERAL DA 1A.VARA-RN 
-
-
FASES
-
     DATA: 19/07/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     DATA: 19/07/1989   FASE: DESPACHO
-     OBSERVACAO: "Oficie-se ao órgão requerido."
-     DATA: 27/07/1989   FASE: OFICIO
-     OBSERVACAO: Nº 281-GP, SOLICITANDO VERBA A UFRN.
-     DATA: 16/08/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     DATA: 13/09/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     OBSERVACAO: COM ANALISE DO ART. 327-RI.
-     DATA: 26/09/1989   FASE: DESPACHO
-     OBSERVACAO: "Tendo em vista a informação de fls., baixem os autos a ...
-     DATA: 17/10/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     OBSERVACAO: Despacho da fase retro.
-     DATA: 23/11/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     DATA: 25/03/1992   FASE: PRECATORIO DEPOSITADO
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 000
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 000
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 00009
-
-
-
-
- - - -
- - - - diff --git a/standalone-crawlers/TRF5/pages/precatorio-000003.html b/standalone-crawlers/TRF5/pages/precatorio-000003.html deleted file mode 100644 index 0e1b6689..00000000 --- a/standalone-crawlers/TRF5/pages/precatorio-000003.html +++ /dev/null @@ -1,181 +0,0 @@ - - - - - - - - - Tribunal Regional Federal da 5ª Região - Consulta Precatório - - - - - - - - -
- - - -
-
- -
-
-
-

Consulta Precatório

-
- - -
- - - - - - - - - - - - - - - - - - - - - - - - - -
PROCESSO
-
8905000037 PRECATORIO 3 PROC. ORIG. 0009569421
-
-
DADOS
-
     ASSUNTO....................: DESCONHECIDO
-     ORIGEM.......................: JUSTIÇA FEDERAL
-     LOCALIDADE...............: NATAL - RN
-     TIPO............................: ENTIDADE
-     DATA AUTUACAO........: 08/08/1988
-     DATA DISTRIB .............: 15/08/1988
-     DATA LIQUIDACAO......: 25/02/1988
-     ELEMENTO DESPESA.: 
-     NATUREZA ALIMENTAR
-
-
PARTES
-
     Reqte: DONATILA MELO BARBOSA 
-     Adv: ARTHUR PAREDES CUNHA LIMA 
-     Reqdo: UFRN - UNIVERSIDADE FEDERAL DO RIO GRANDE DO NORTE 
-     Adv: NILSON ROBERTO CAVALCANTI MELO 
-     Deprec: JUIZO FEDERAL DA 1A.VARA-RN 
-
-
FASES
-
     DATA: 19/07/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     DATA: 19/07/1989   FASE: DESPACHO
-     OBSERVACAO: "Oficie-se ao órgão requerido."
-     DATA: 27/07/1989   FASE: OFICIO
-     OBSERVACAO: Nº 282-GP, SOLICITANDO VERBA A UFRN.
-     DATA: 16/08/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     DATA: 13/09/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE
-     OBSERVACAO: COM ANALISE DO ART. 327-RI.
-     DATA: 26/09/1989   FASE: DESPACHO
-     OBSERVACAO: "Tendo em vista a informação de fls., baixem os autos a ...
-     DATA: 17/10/1989   FASE: PUBLICADO NO DIARIO OFICIAL DE PE
-     OBSERVACAO: Despacho da fase retro.
-     DATA: 25/03/1992   FASE: PRECATORIO DEPOSITADO
-     DATA: 14/08/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 0003
-     DATA: 11/11/2002   FASE: ARQUIVADO - PASTA
-     OBSERVACAO: 1
-     DATA: 25/03/2003   FASE: PRECATORIO DEPOSITADO
-     OBSERVACAO: OF. GP/PRC - 71/2003 RN
-     DATA: 31/10/2005   FASE: ARQUIVADO - PASTA
-
-
-
-
- - - -
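The three sample pages above (precatorio-000001 through -000003) share the same plain-text layout: a PROCESSO header followed by DADOS, PARTES and FASES blocks whose lines are padded with non-breaking spaces. A minimal post-processing sketch of how those fields could be pulled out of the saved files; BeautifulSoup, the guessed encoding and the parse_precatorio helper are illustrative assumptions, not part of the original crawler:

```python
import re
from pathlib import Path

from bs4 import BeautifulSoup  # assumed dependency, not used by the original crawler

# e.g. "DATA: 19/07/1989   FASE: CONCLUSAO AO JUIZ PRESIDENTE" -> ("19/07/1989", "CONCLUSAO ...")
FASE_RE = re.compile(r"DATA:\s*(\d{2}/\d{2}/\d{4})\s+FASE:\s*(.+)")

def parse_precatorio(path):
    """Extract the PROCESSO header line and the (date, fase) pairs from a saved page."""
    html = Path(path).read_text(encoding="latin-1", errors="ignore")  # encoding guessed
    text = BeautifulSoup(html, "html.parser").get_text("\n")
    lines = [line.strip() for line in text.splitlines()]
    # header looks like "8905000010 PRECATORIO 1 PROC. ORIG. 0009541977"
    header = next((l for l in lines if "PRECATORIO" in l and "PROC. ORIG." in l), None)
    fases = [(m.group(1), m.group(2).strip()) for m in FASE_RE.finditer(text)]
    return {"processo": header, "fases": fases}

# usage: parse_precatorio("pages/precatorio-000001.html")
```

Matching on the visible labels rather than on specific tags keeps the sketch independent of the two page formats (xpath_model1 / xpath_model2) that html_retrieve.py distinguishes.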
- - - - diff --git a/standalone-crawlers/consulta-pregoes/pregoes.py b/standalone-crawlers/consulta-pregoes/pregoes.py deleted file mode 100644 index 2a44cb77..00000000 --- a/standalone-crawlers/consulta-pregoes/pregoes.py +++ /dev/null @@ -1,106 +0,0 @@ -import requests -import logging -import os -import re -import time - -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException - -def getPDF(driver, command, name): - try: - driver.execute_script(command) - if os.path.isfile("/home/rennan/Downloads/" + name): - os.rename("/home/rennan/Downloads/" + name, - destination_dir + "/" + name) - except: - pass - -def download(driver, i, retries=0): - if retries > 10: - logging.info("Nao foi possivel coletar pregao de id:" + str(i)) - return - try: - logging.info("Tentativa de número: " + str(retries)) - url = base + str(i) - response = requests.get(url) - content = response.text - if not_found in content or "Continue acessando: PRODEMGE" in content: - return - logging.info("Coletando pregao de ID: " + str(i)) - folder = "pregao" + str(i%100) - destination_dir = "pregoes/" + folder + "/" + str(i) - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - with open(destination_dir + "/pregao_id" + str(i) + ".html", "w") as f: - f.write(content) - - content = requests.get(url_ata + str(i)).text - if not(chat_not_found in content or "Continue acessando: PRODEMGE" in content): - driver.get(url_ata + str(i)) - logging.info("Coletando atas pregao de ID: " + str(i)) - atas = re.findall(r"habilitarComandoVisualizarAta\('(.*?)'\)", content) - for ata in atas: - print(ata) - time.sleep(.5) - url_down = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/atas/atasGeraisPregao.html?id=" + ata +"&idPregao=801&metodo=visualizarAta" - driver.get(url_down) - if os.path.isfile("/home/rennan/Downloads/ataPregao.pdf"): - os.rename("/home/rennan/Downloads/ataPregao.pdf", destination_dir + "/ataPregao.pdf" + ata) - - content = requests.get(url_ata_especifica + str(i) + "&metodo=visualizar").text - if not(chat_not_found in content or "Continue acessando: PRODEMGE" in content): - driver.get(url_ata + str(i)) - logging.info("Coletando atas pregao de ID: " + str(i)) - atas = re.findall(r"habilitarComandoVisualizarAta\('(.*?)'\)", content) - for ata in atas: - print(ata) - time.sleep(.5) - url_down = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/atas/atasEspecificasLote.html?id=" + ata +"&idPregao=15000&metodo=visualizarAta" - driver.get(url_down) - if os.path.isfile("/home/rennan/Downloads/ataPregao.pdf"): - os.rename("/home/rennan/Downloads/ataPregao.pdf", destination_dir + "/ataPregaoEspecifica.pdf" + ata) - - driver.get(url) - getPDF(driver, "exibirRelatorioQuadroAvisos();", "relatorioConsultaQuadroAvisos.pdf") - getPDF(driver, "exibirRelatorioTermoConclusao();", "termoConclusao.pdf") - - driver.get(url_chat + str(i)) - time.sleep(1) - content = driver.page_source - if not(chat_not_found in content or "Continue acessando: PRODEMGE" in content): - logging.info("Coletando chat pregao de ID: " + str(i)) - folder = "pregao" + str(i%100) - with open(destination_dir + "/pregao_id_chat" + str(i) + ".html", "w") as f: - f.write(content) - except: - retries+=1 - download(driver, i, retries) - -options = 
webdriver.ChromeOptions() -options.add_argument("--headless") -options.add_experimental_option("prefs", { - "download.default_directory": "/home/rennan/Downloads/", - "download.prompt_for_download": False, - "download.directory_upgrade": True, - "safebrowsing_for_trusted_sources_enabled": False, - "safebrowsing.enabled": False -}) -options.add_argument('window-size=1920x1080') -driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=options) - -logging.basicConfig(level=logging.INFO) - -base = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/abaDadosPregao.html?interfaceModal=true&metodo=visualizar&idPregao=" -url_ata = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/atas/atasGeraisPregao.html?interfaceModal=true&idPregao=" -url_ata_especifica = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/atas/atasEspecificasLote.html?aba=abaAtasEspecificasLote&idPregao=" -url_chat = "https://www1.compras.mg.gov.br/processocompra/pregao/consulta/dados/pregao/visualizacaoChatPregao.html?interfaceModal=true&idPregao=" -not_found = "entidadeNaoEncontrada" -chat_not_found = 'O(A) "Pregão" não pode ser alterado(a), pois foi excluído(a) por outro usuário, em acesso concorrente, enquanto esta tela era visualizada.' - -for i in range(120000,200000): - download(driver, i) \ No newline at end of file diff --git a/standalone-crawlers/despesa-pessoal-municipio/.gitignore b/standalone-crawlers/despesa-pessoal-municipio/.gitignore deleted file mode 100644 index c045aaf4..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.csv -*.zip \ No newline at end of file diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/__init__.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/items.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/items.py deleted file mode 100644 index b47772a4..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class DespmunicipioItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/middlewares.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/middlewares.py deleted file mode 100644 index f9849713..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class DespmunicipioSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. 
- s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class DespmunicipioDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/pipelines.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/pipelines.py deleted file mode 100644 index 05d432e2..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - - -class DespmunicipioPipeline(object): - def process_item(self, item, spider): - return item diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/settings.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/settings.py deleted file mode 100644 index 07f7fed1..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/settings.py +++ /dev/null @@ -1,101 +0,0 @@ -# -*- coding: utf-8 -*- -from shutil import which - -# Scrapy settings for despmunicipio project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'despmunicipio' - -SPIDER_MODULES = ['despmunicipio.spiders'] -NEWSPIDER_MODULE = 'despmunicipio.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'despmunicipio (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = False - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'despmunicipio.middlewares.DespmunicipioSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'despmunicipio.middlewares.DespmunicipioDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html -#ITEM_PIPELINES = { -# 'despmunicipio.pipelines.DespmunicipioPipeline': 300, -#} - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -SELENIUM_DRIVER_NAME = 'chrome' -SELENIUM_DRIVER_EXECUTABLE_PATH = which("chromedriver_win32_chr_83.exe") # Windows -# SELENIUM_DRIVER_EXECUTABLE_PATH = which("chromedriver") # Ubuntu -# SELENIUM_DRIVER_ARGUMENTS=[] -SELENIUM_DRIVER_ARGUMENTS=['--headless'] # '--headless' if using chrome instead of firefox -DOWNLOADER_MIDDLEWARES = { - 'scrapy_selenium.SeleniumMiddleware': 0 -} - -DOWNLOAD_DELAY = 1 \ No newline at end of file diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/__init__.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
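The settings.py above wires scrapy_selenium.SeleniumMiddleware into the project, so the spiders in this package (fetch_missing.py and main_crawler.py, shown next) drive a headless Chrome through Scrapy rather than issuing plain HTTP requests. A minimal sketch of that request pattern, with an illustrative spider name and start URL (only the SeleniumRequest / meta["driver"] usage is taken from the real spiders):

```python
import scrapy
from scrapy_selenium import SeleniumRequest

class ReportViewerExampleSpider(scrapy.Spider):
    # illustrative name; the real spiders are 'fetch_missing' and 'main_crawler'
    name = "reportviewer_example"

    def start_requests(self):
        # SeleniumMiddleware (enabled in settings.py) renders the page in headless
        # Chrome before the callback runs, instead of downloading the raw HTML.
        yield SeleniumRequest(
            url="https://reportviewer.tce.mg.gov.br/default.aspx",
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        # The live webdriver is exposed on the request metadata, so the callback
        # can keep clicking and reading the rendered report.
        driver = response.request.meta["driver"]
        self.logger.info("Rendered page title: %s", driver.title)
```

That same driver handle is what fetch_missing.py and main_crawler.py use for the Select and click interactions with the ReportViewer controls.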
diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/fetch_missing.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/fetch_missing.py deleted file mode 100644 index a44a8b34..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/fetch_missing.py +++ /dev/null @@ -1,336 +0,0 @@ -import scrapy -from scrapy.crawler import CrawlerProcess -import requests -import logging -import os -import re -import time -import datetime -from shutil import which - -from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import StaleElementReferenceException -from selenium.webdriver.support.ui import Select - -from PyPDF2.utils import PdfReadError - -import pandas -import unidecode -import json - -class SeleniumSpider(scrapy.Spider): - name = 'fetch_missing' - - def __init__(self, *a, **kw): - super(SeleniumSpider, self).__init__(*a, **kw) - self.city_name_xpath = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/div/" \ - "div[1]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/" \ - "tr/td/div/div/span[2]" - - self.month_name = { - 1: "Janeiro", - 2: "Fevereiro", - 3: "Março", - 4: "Abril", - 5: "Maio", - 6: "Junho", - 7: "Julho", - 8: "Agosto", - 9: "Setembro", - 10: "Outubro", - 11: "Novembro", - 12: "Dezembro", - } - - self.org_name = { - 1: "Legislativo", - 2: "Executivo", - 3: "Município", - } - - self.base_table_xpath = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/" \ - "div/div[1]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[13]/td[2]/table/" \ - "tbody/tr/td/table/tbody/tr" - - self.tables = { - "Despesa_Total_com_Pessoal": "[4]/td[3]/table", - "Exclusoes_da_Despesa_Total_com_Pessoal": "[6]/td[3]/table", - "APURAÇÃO_DO_CUMPRIMENTO_DO_LIMITE_LEGAL": "[8]/td[2]/table", - } - - self.tables_text = { - "Despesa Total com Pessoal": "Despesa_Total_com_Pessoal", - "Exclusões da Despesa Total com Pessoal": "Exclusoes_da_Despesa_Total_com_Pessoal", - "APURAÇÃO DO CUMPRIMENTO DO LIMITE LEGAL": "APURAÇÃO_DO_CUMPRIMENTO_DO_LIMITE_LEGAL", - } - - self.org_select = "ctl00_MainContent_RVRemoto_ctl04_ctl03_ddValue" - self.month_select = "ctl00_MainContent_RVRemoto_ctl04_ctl07_ddValue" - self.search_btn_id = "ctl00_MainContent_RVRemoto_ctl04_ctl00" - - logger = logging.getLogger('scrapy.spidermiddlewares.httperror') - logger.setLevel(logging.INFO) - - def gen_base_url(self): - codes = [] - with open("cod_cidade_ibge.txt", "r") as f: - for code in f: - codes.append(int(code)) - - url = "https://reportviewer.tce.mg.gov.br/default.aspx?server=relatorios.tce.mg.gov.br&" \ - "relatorio=SICOM_Consulta%2fModulo_LRF%2fRelatoriosComuns%2fUC02-ConsultarDespesaPessoalPoder-RL&" - - with open("last_call.txt", "r") as f: - last_call = int(f.read()) - - i = 0 - count = 1117 - # 2019 for all codes - for code in codes: - # for year in range(2014, 2019):# 2020): - year = 2019 - if i >= last_call: - with open("last_call.txt", "w+") as f: - f.write(str(i)) - - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 
Next: city {i} of {count} - year {year}" - ) - yield (url + f"municipioSelecionado={code}&exercicioSelecionado={year}", {"year": year}) - i += 1 - - f = open("missing_files.txt", "r") - missing_files = json.loads(f.read()) - f.close() - - # 2014~2018 for codes missing - for code in missing_files["code"]: - for year in range(2014, 2019): - if i >= last_call: - with open("last_call.txt", "w+") as f: - f.write(str(i)) - - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Next: city {i} of {count} - year {year}" - ) - yield (url + f"municipioSelecionado={code}&exercicioSelecionado={year}", {"year": year}) - i += 1 - - # city code with specific years missing - for miss in missing_files["year"]: - self.logger.info("))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))" + str(miss)) - code = miss["code"].split("-")[0] - year = miss["year"].split("-")[0] - if i >= last_call: - with open("last_call.txt", "w+") as f: - f.write(str(i)) - - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Next: city {i} of {count} - year {year}" - ) - yield (url + f"municipioSelecionado={code}&exercicioSelecionado={year}", {"year": year}) - i += 1 - - - def next_call(self, gen): - try: - url, metadata = next(gen) - metadata["generator"] = gen - metadata["url"] = url - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Calling {url}") - return SeleniumRequest(url=url, meta=metadata, callback=self.parse, dont_filter=True) - except StopIteration: - pass - - def start_requests(self): - gen = self.gen_base_url() - yield self.next_call(gen) - - def parse(self, response): - driver = response.request.meta['driver'] - year = response.request.meta['year'] - base_url = response.request.meta['url'] - - self.wait_element(driver, self.city_name_xpath) - city = driver.find_element_by_xpath(self.city_name_xpath).text - city = unidecode.unidecode(city.replace(" ", "")) - - self.wait_element(driver, el_id=self.month_select) - select = Select(driver.find_element_by_id(self.month_select)) - if select.first_selected_option.text == "Envio Incompleto": - self.export_error(city, driver, year, base_url) - else: - self.get_tables(city, driver, year, base_url) - - yield self.next_call(response.request.meta['generator']) - - def export_error(self, city, driver, year, base_url): - self.prepare_path(city, year) - with open(f"tabelas_de_despesa/{city}/{year}/ERROR.log", "w+") as f: - f.write("Um ou mais Órgãos está com envio incompleto no Período/Data Base/Exercício!!!\n") - f.write(base_url) - - def get_tables(self, city, driver, year, base_url): - for org in range(3, 0, -1): - attempt_at_org = 8 - while attempt_at_org: - try: - self.select_org(driver, str(org)) - # time.sleep(1) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At org {self.org_name[org]} - {attempt_at_org}") - - self.prepare_path(city, year, self.org_name[org]) - - for month in range(1, 13): - attempt_at_month = 8 - while attempt_at_month: - try: - fname = f"tabelas_de_despesa/{city}/{year}/{self.org_name[org]}/{self.month_name[month]}-" - - self.wait_element(driver, el_id=self.month_select) - select = Select(driver.find_element_by_id(self.month_select)) - try: - select.select_by_value(str(month)) - except NoSuchElementException: - for tid in self.tables: - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {self.org_name[org]} {self.month_name[month]} não existe" - ) - with open(fname + tid + ".csv", "w+") as f: - f.write(f"Mes não existe\n{base_url}") - break - - 
time.sleep(1) - - self.click_element(driver, self.search_btn_id) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {self.org_name[org]} {self.month_name[month]}") - - time.sleep(1) - - self.wait_results_to_load(driver, self.org_name[org], self.month_name[month]) - - attempt_at_table = 8 - while attempt_at_table > 0: - try: - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {self.org_name[org]} {self.month_name[month]}" + - f" - {attempt_at_org},{attempt_at_month},{attempt_at_table}" - ) - tds = driver.find_elements_by_tag_name("td") - for td in tds: - if td.text in self.tables_text: - table_html = td.find_element_by_xpath("..").find_element_by_xpath("..").find_element_by_xpath("..").get_attribute('outerHTML') - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {self.org_name[org]} {self.month_name[month]}" + - f" - {attempt_at_org},{attempt_at_month},{attempt_at_table} - found {td.text}" - ) - df = pandas.read_html(table_html)[0] - df.to_csv(fname + self.tables_text[td.text] + ".csv", index=False) - break - except StaleElementReferenceException: - attempt_at_table -= 1 - self.logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> stale at table attempt") - time.sleep(1) - # time.sleep(0.3) - break - except StaleElementReferenceException: - attempt_at_month -= 1 - self.logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> stale at month attempt") - time.sleep(1) - break - except StaleElementReferenceException: - attempt_at_org -= 1 - self.logger.info(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> stale at org attempt") - time.sleep(1) - - def click_element(self, driver, btn_id): - attempt = 1 - while attempt <= 8: - try: - btn = driver.find_element_by_id(btn_id) - btn.click() - return True - except StaleElementReferenceException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to get hold of element " + btn_id) - return False - - def select_org(self, driver, org): - attempt = 1 - while attempt <= 8: - try: - select = Select(driver.find_element_by_id(self.org_select)) - select.select_by_value(org) - return True - except StaleElementReferenceException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to get hold of org select") - return False - - def prepare_path(self, city, year, org=None): - self.create_folder(f"tabelas_de_despesa/{city}") - self.create_folder(f"tabelas_de_despesa/{city}/{year}") - if org is not None: - self.create_folder(f"tabelas_de_despesa/{city}/{year}/{org}") - - def create_folder(self, folder_path): - if not os.path.exists(folder_path): - os.mkdir(folder_path) - - def wait_element(self, driver, xpath=None, el_id=None): - attempt = 1 - while attempt <= 8: - try: - if xpath is not None: - driver.find_element_by_xpath(xpath) - return True - elif el_id is not None: - driver.find_element_by_id(el_id) - return True - else: - raise TypeError - except NoSuchElementException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to locate element at " + str(xpath) + "-" + str(el_id)) - return False - - def wait_results_to_load(self, driver, target_org, target_month): - info_path = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/div/div[1]/div/" \ - "table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[9]/td[3]/table/tbody/tr/td/div/" \ - "div/span" - attempt = 0 - while attempt < 16: - try: - org_span = driver.find_element_by_xpath(info_path + "[3]") - self.logger.debug(f"$$$$$$$$$$$$$ curr = <{org_span.text}> / target = <{target_org}>") - 
if org_span.text == target_org: - month_span = driver.find_element_by_xpath(info_path + "[5]") - self.logger.debug(f"$$$$$$$$$$$$$ curr = <{month_span.text}> / target = <{target_month}>") - if month_span.text == target_month: - return True - - except NoSuchElementException: - pass - except StaleElementReferenceException: - pass - attempt += 1 - time.sleep(1) - - def file_exists(self, file_name): - try: - with open(file_name) as f: - pass - except FileNotFoundError: - return False - return True diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/main_crawler.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/main_crawler.py deleted file mode 100644 index 792da4c6..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/despmunicipio/spiders/main_crawler.py +++ /dev/null @@ -1,263 +0,0 @@ -import scrapy -from scrapy.crawler import CrawlerProcess -import requests -import logging -import os -import re -import time -import datetime -from shutil import which - -from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import StaleElementReferenceException -from selenium.webdriver.support.ui import Select - -from PyPDF2.utils import PdfReadError - -import pandas -import unidecode - -class SeleniumSpider(scrapy.Spider): - name = 'main_crawler' - - def __init__(self, *a, **kw): - super(SeleniumSpider, self).__init__(*a, **kw) - self.city_name_xpath = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/div/" \ - "div[1]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[2]/td[3]/table/tbody/" \ - "tr/td/div/div/span[2]" - - self.month_name = { - 1: "Janeiro", - 2: "Fevereiro", - 3: "Março", - 4: "Abril", - 5: "Maio", - 6: "Junho", - 7: "Julho", - 8: "Agosto", - 9: "Setembro", - 10: "Outubro", - 11: "Novembro", - 12: "Dezembro", - } - - self.org_name = { - 1: "Legislativo", - 2: "Executivo", - 3: "Município", - } - - self.base_table_xpath = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/" \ - "div/div[1]/div/table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[13]/td[2]/table/" \ - "tbody/tr/td/table/tbody/tr" - - self.tables = { - "Despesa_Total_com_Pessoal": "[4]/td[3]/table", - "Exclusoes_da_Despesa_Total_com_Pessoal": "[6]/td[3]/table", - "APURAÇÃO_DO_CUMPRIMENTO_DO_LIMITE_LEGAL": "[8]/td[2]/table", - } - - self.org_select = "ctl00_MainContent_RVRemoto_ctl04_ctl03_ddValue" - self.month_select = "ctl00_MainContent_RVRemoto_ctl04_ctl07_ddValue" - self.search_btn_id = "ctl00_MainContent_RVRemoto_ctl04_ctl00" - - def gen_base_url(self): - codes = [] - with open("cod_cidade_ibge.txt", "r") as f: - for code in f: - codes.append(int(code)) - - url = "https://reportviewer.tce.mg.gov.br/default.aspx?server=relatorios.tce.mg.gov.br&" \ - "relatorio=SICOM_Consulta%2fModulo_LRF%2fRelatoriosComuns%2fUC02-ConsultarDespesaPessoalPoder-RL&" - - with open("last_call.txt", "r") as f: - last_call = int(f.read()) - - i = 0 - count = len(codes) - for code in codes: - for year in range(2014, 2019):# 2020): - if i >= last_call: - with open("last_call.txt", "w+") as f: - 
f.write(str(i)) - - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Next: city {i} of {count} - year {year}" - ) - yield (url + f"municipioSelecionado={code}&exercicioSelecionado={year}", {"year": year}) - i += 1 - - def next_call(self, gen): - try: - url, metadata = next(gen) - metadata["generator"] = gen - metadata["url"] = url - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Calling {url}") - return SeleniumRequest(url=url, meta=metadata, callback=self.parse, dont_filter=True) - except StopIteration: - pass - - def start_requests(self): - gen = self.gen_base_url() - yield self.next_call(gen) - - def parse(self, response): - driver = response.request.meta['driver'] - year = response.request.meta['year'] - base_url = response.request.meta['url'] - - self.wait_element(driver, self.city_name_xpath) - city = driver.find_element_by_xpath(self.city_name_xpath).text - city = unidecode.unidecode(city.replace(" ", "")) - - self.wait_element(driver, el_id=self.month_select) - select = Select(driver.find_element_by_id(self.month_select)) - if select.first_selected_option.text == "Envio Incompleto": - self.export_error(city, driver, year, base_url) - else: - self.get_tables(city, driver, year, base_url) - - yield self.next_call(response.request.meta['generator']) - - def export_error(self, city, driver, year, base_url): - self.prepare_path(city, year) - with open(f"tabelas_de_despesa/{city}/{year}/ERROR.log", "w+") as f: - f.write("Um ou mais Órgãos está com envio incompleto no Período/Data Base/Exercício!!!\n") - f.write(base_url) - - def get_tables(self, city, driver, year, base_url): - for org in range(3, 0, -1): - self.select_org(driver, str(org)) - time.sleep(1) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At org {self.org_name[org]}") - - self.prepare_path(city, year, self.org_name[org]) - - for month in range(1, 13): - fname = f"tabelas_de_despesa/{city}/{year}/{self.org_name[org]}/{self.month_name[month]}-" - - self.wait_element(driver, el_id=self.month_select) - select = Select(driver.find_element_by_id(self.month_select)) - try: - select.select_by_value(str(month)) - except NoSuchElementException: - for tid in self.tables: - self.logger.info( - f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Mes {self.month_name[month]} não existe" - ) - with open(fname + tid + ".csv", "w+") as f: - f.write(f"Mes não existe\n{base_url}") - continue - - time.sleep(1) - - self.click_element(driver, self.search_btn_id) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At month {self.month_name[month]}") - - time.sleep(1) - - self.wait_results_to_load(driver, self.org_name[org], self.month_name[month]) - - for tid in self.tables: - self.wait_element(driver, self.base_table_xpath + self.tables[tid]) - - for tid in self.tables: - table_html = driver.find_element_by_xpath( - self.base_table_xpath + self.tables[tid] - ).get_attribute('outerHTML') - df = pandas.read_html(table_html)[0] - df.to_csv(fname + tid + ".csv", index=False) - - time.sleep(0.3) - - def click_element(self, driver, btn_id): - attempt = 1 - while attempt <= 8: - try: - btn = driver.find_element_by_id(btn_id) - btn.click() - return True - except StaleElementReferenceException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to get hold of element " + btn_id) - return False - - def select_org(self, driver, org): - attempt = 1 - while attempt <= 8: - try: - select = Select(driver.find_element_by_id(self.org_select)) - select.select_by_value(org) - return 
True - except StaleElementReferenceException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to get hold of org select") - return False - - def prepare_path(self, city, year, org=None): - self.create_folder(f"tabelas_de_despesa/{city}") - self.create_folder(f"tabelas_de_despesa/{city}/{year}") - if org is not None: - self.create_folder(f"tabelas_de_despesa/{city}/{year}/{org}") - - def create_folder(self, folder_path): - if not os.path.exists(folder_path): - os.mkdir(folder_path) - - def wait_element(self, driver, xpath=None, el_id=None): - attempt = 1 - while attempt <= 8: - try: - if xpath is not None: - driver.find_element_by_xpath(xpath) - return True - elif el_id is not None: - driver.find_element_by_id(el_id) - return True - else: - raise TypeError - except NoSuchElementException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to locate element at " + str(xpath) + "-" + str(el_id)) - return False - - def wait_results_to_load(self, driver, target_org, target_month): - info_path = "//form/div[3]/section/div[1]/table/tbody/tr/td/span/div/table/tbody/tr[5]/td[3]/div/div[1]/div/" \ - "table/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr/td/table/tbody/tr[9]/td[3]/table/tbody/tr/td/div/" \ - "div/span" - attempt = 0 - while attempt < 16: - try: - org_span = driver.find_element_by_xpath(info_path + "[3]") - self.logger.debug(f"$$$$$$$$$$$$$ curr = <{org_span.text}> / target = <{target_org}>") - if org_span.text == target_org: - month_span = driver.find_element_by_xpath(info_path + "[5]") - self.logger.debug(f"$$$$$$$$$$$$$ curr = <{month_span.text}> / target = <{target_month}>") - if month_span.text == target_month: - return True - - except NoSuchElementException: - pass - except StaleElementReferenceException: - pass - attempt += 1 - time.sleep(1) - - def file_exists(self, file_name): - try: - with open(file_name) as f: - pass - except FileNotFoundError: - return False - return True diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/execute.py b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/execute.py deleted file mode 100644 index 3b2376f9..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/execute.py +++ /dev/null @@ -1,22 +0,0 @@ -import subprocess - -with open("execute_log.txt", "w+") as f: - pass - -while True: - with open("last_call.txt", "r") as f: - last_call = int(f.read()) - - if last_call >= 1107: - break - - with open("execute_log.txt", "a") as f: - f.write(f"{last_call}\n") - - process = subprocess.Popen( - "python -m scrapy crawl fetch_missing --loglevel=INFO", - shell=True, stdout=subprocess.PIPE - ) - process.wait() - - \ No newline at end of file diff --git a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/scrapy.cfg b/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/scrapy.cfg deleted file mode 100644 index dda2eedb..00000000 --- a/standalone-crawlers/despesa-pessoal-municipio/despmunicipio/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = despmunicipio.settings - -[deploy] -#url = http://localhost:6800/ -project = despmunicipio diff --git a/standalone-crawlers/despesa-pessoal/despesa.py b/standalone-crawlers/despesa-pessoal/despesa.py deleted file mode 100644 index 0babae15..00000000 --- a/standalone-crawlers/despesa-pessoal/despesa.py +++ /dev/null @@ -1,22 +0,0 @@ -import requests, os, 
time -base = "http://transparencia.mg.gov.br/estado-pessoal/despesa-com-pessoal/despesapessoal-orgaosFiltro/" -headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} - -#year iteration -for year in range(2009, 2021): - #months iteration - for month in range(1,13): - query = "{}/{}/{}".format(month, month, year) - if year >= 2020 and month >=3: - exit() - url = base + query - print(url) - # exit() - response = requests.get(url, headers=headers) - content = response.text - destination_dir = "./despesa-pessoal" - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - with open(destination_dir + "/ano-mes" + query.replace("/", "-") + ".html", "w") as f: - f.write(content) - time.sleep(2) diff --git a/standalone-crawlers/diario-oficial-ammg/crawler_ammg.py b/standalone-crawlers/diario-oficial-ammg/crawler_ammg.py deleted file mode 100644 index ee979b4b..00000000 --- a/standalone-crawlers/diario-oficial-ammg/crawler_ammg.py +++ /dev/null @@ -1,194 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.webdriver import WebDriver -from selenium.common.exceptions import ElementClickInterceptedException -from selenium.common.exceptions import ElementNotInteractableException -from selenium.webdriver.support.select import Select - -import datetime -import time -import json -from pandas import date_range - -SCREEN_ON = True - -class LimitOfAttemptsReached(Exception): - pass - -def initChromeWebdriver ( - path_to_driver: str = None, - use_window: bool = False, - ignore_console: bool = True -) -> WebDriver: - """ - Returns a ready to use chrome driver. - It will not display warning and errors that the site prints on the - navigator console. - Keyword arguments: - path_to_driver -- path to the chrome driver executable. No need to - pass if the driver is set in path. - use_window -- if True, a instance of chrome will be opened - and one could watch the execution of the program - print_console -- if true, log information printed in the navigator - console will be printed in the output - """ - # If the chrome webdriver is added to path, leave path_to_driver blank - # else, pass path to executable - - # Setting Chrome Browser - chrome_options = webdriver.ChromeOptions() - - chrome_options.add_argument('--window-size=1420,1080') - # chrome_options.add_argument("disable-popup-blocking") - if ignore_console: - # will only print fatal errors - chrome_options.add_argument('log-level=3') - - if not use_window: - # --no-sandbox: needed for chrome to start headless. - # Its a security feature on chrome, see linkg - # https://www.google.com/googlebooks/chrome/med_26.html - # So, be careful with the pages being opened. - chrome_options.add_argument('--no-sandbox') - # --headless: web browser without a graphical user interface. 
- chrome_options.add_argument('--headless') - # - chrome_options.add_argument('--disable-gpu') - - if path_to_driver is None: - # if driver is not configured to path, it will raise an excpetion - # selenium.common.exceptions.WebDriverException: Message: ‘geckodriver’ executable needs to be in PATH - driver = webdriver.Chrome(options=chrome_options) - driver.set_page_load_timeout(60) - return driver - else: - driver = webdriver.Chrome(path_to_driver, options=chrome_options) - driver.set_page_load_timeout(60) - return driver - -def checkForUrl(driver, prev, dtype, spider_manager_id, btn_id): - container = driver.find_element_by_id(spider_manager_id) - css = container.value_of_css_property("display") - if css == "block": - print("link found", dtype) - - # must check display and if url changed, because it was repeating urls - anchor = driver.find_element_by_id(btn_id).get_attribute("href") - if anchor == prev: - print("but repeated...") - time.sleep(1) - anchor = driver.find_element_by_id(btn_id).get_attribute("href") - if anchor == prev: - print("still repeated, giving up") - return "" - else: - print("new url found, returning...") - return anchor - else: - return anchor - return "" - -def checkForUrlNova(driver, prev): - return checkForUrl(driver, prev, "Nova", "containerDownloadNova", "btDownloadSimples2") - -def checkForUrlExtra(driver, prev): - return checkForUrl(driver, prev, "extra", "exibeExtra", "btDownloadExtra") - -def checkForUrlOld(driver, prev): - return checkForUrl(driver, prev, "Old", "containerDownload", "btDownloadSimples") - -def addProgress(urls): - print(f"Saving progress, adding {len(urls)} links") - try: - f = open("links_ammg.json", "r") - data = json.loads(f.read()) - f.close() - except FileNotFoundError: - data = [] - - data = data + urls - - f = open("links_ammg.json", "w+") - f.write(json.dumps(data, indent=1)) - f.close() - -def lastFetchedDate(): - try: - f = open("links_ammg.json", "r") - data = json.loads(f.read()) - f.close() - last_date = datetime.datetime.fromisoformat(data[-1]["date"]) - last_date += datetime.timedelta(days=1) - return last_date.strftime('%Y-%m-%d') - except FileNotFoundError: - return "2014-01-01" - -def waitUntilInteractable(driver, xpath): - popup_header = driver.find_element_by_xpath(xpath) - attempt = 0 - while attempt < 16: - try: - popup_header.click() - return - except ElementNotInteractableException: - attempt += 1 - print("Element not interactable yet, sleeping...") - time.sleep(1) - raise LimitOfAttemptsReached() - -def crawler(): - driver = initChromeWebdriver( - # 'chromedriver_win_79-0-3945-36.exe', - # use_window=SCREEN_ON - ) - - target = "http://www.diariomunicipal.com.br/amm-mg/" - driver.get(target) - - urls = [] - - prev_old_url = "" - prev_new_url = "" - prev_extra_url = "" - - for dt in date_range(lastFetchedDate(), datetime.datetime.now().strftime('%Y-%m-%d')): - print(f"starting {dt.strftime('%Y-%m-%d')} at {datetime.datetime.now()}") - - waitUntilInteractable(driver, "//*[@id=\"hora\"]") # wait for popup to close - - Select(driver.find_element_by_id("calendar_year")).select_by_value(str(dt.year)) - time.sleep(1) - Select(driver.find_element_by_id("calendar_month")).select_by_value(str(dt.month)) - time.sleep(1) - Select(driver.find_element_by_id("calendar_day")).select_by_value(str(dt.day)) - time.sleep(1) - - driver.find_element_by_css_selector(".selected").click() - waitUntilInteractable(driver, "//*[@id=\"popup\"]/div/article/header") # waits for popup to open - - anchor = checkForUrlOld(driver, prev_old_url) - if 
anchor != "": - urls.append({"date": dt.strftime('%Y-%m-%d'), "type": "regular", "url": anchor}) - prev_old_url = anchor - else: - anchor = checkForUrlNova(driver, prev_new_url) - if anchor != "": - urls.append({"date": dt.strftime('%Y-%m-%d'), "type": "regular", "url": anchor}) - prev_new_url = anchor - - anchor = checkForUrlExtra(driver, prev_extra_url) - if anchor != "": - urls.append({"date": dt.strftime('%Y-%m-%d'), "type": "extra", "url": anchor}) - prev_extra_url = anchor - - if len(urls) >= 100: - addProgress(urls) - urls = [] - - driver.find_element_by_xpath("//*[@id=\"popup\"]/div/article/a").click() # close popup - time.sleep(1) - - driver.close() - addProgress(urls) - print("Done.") - -crawler() diff --git a/standalone-crawlers/diario-oficial-ammg/list_files.py b/standalone-crawlers/diario-oficial-ammg/list_files.py deleted file mode 100644 index 6dfd4e3c..00000000 --- a/standalone-crawlers/diario-oficial-ammg/list_files.py +++ /dev/null @@ -1,25 +0,0 @@ -import json -import pprint -pp = pprint.PrettyPrinter(indent=2) - -with open("links_ammg.json", "r") as f: - data = json.loads(f.read()) - -url_index = {} -for d, i in zip(data, range(len(data))): - url_index[d["url"]] = i - -with open("scrapy_download_pdfs.log", "r") as f: - for line in f: - if line[:11] == "{'file_urls": - attr = json.loads(line.replace("'", "\""))["files"][0] - - data[url_index[attr['url']]]['path'] = attr['path'] - data[url_index[attr['url']]]['checksum'] = attr['checksum'] - -for d in data: - if len(d) == 3: - print("Probelm with:", d["date"], d["url"]) - -with open("info.txt", "w+") as f: - f.write(json.dumps(data, indent=1)) diff --git a/standalone-crawlers/diario-oficial-ammg/scrapy_download_pdfs.py b/standalone-crawlers/diario-oficial-ammg/scrapy_download_pdfs.py deleted file mode 100644 index 2a69cecc..00000000 --- a/standalone-crawlers/diario-oficial-ammg/scrapy_download_pdfs.py +++ /dev/null @@ -1,36 +0,0 @@ -import scrapy -from scrapy.crawler import CrawlerProcess -import json - -class MySpider(scrapy.Spider): - name = 'ammg-spider' - start_urls = [ - 'https://stackoverflow.com/questions/57245315/using-scrapy-how-to-download-pdf-files-from-some-extracted-links' - ] - - def parse(self, response): - f = open("links_ammg.json", 'r') - links = json.loads(f.read()) - f.close() - - for link in links: - # print(link[2]) - yield { - 'file_urls': [link['url']] - } - -c = CrawlerProcess({ - 'USER_AGENT': 'Mozilla/5.0', - - # save in file as CSV, JSON or XML - # 'FEED_FORMAT': 'csv', # csv, json, xml - # 'FEED_URI': 'output.csv', # - - # download files to `FILES_STORE/full` - # it needs `yield {'file_urls': [url]}` in `parse()` - 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1}, - 'FILES_STORE': '.', - 'DNS_TIMEOUT': 180 -}) -c.crawl(MySpider) -c.start() diff --git a/standalone-crawlers/diario-oficial-araguari/diario-oficial-araguari.py b/standalone-crawlers/diario-oficial-araguari/diario-oficial-araguari.py deleted file mode 100644 index 064c46a9..00000000 --- a/standalone-crawlers/diario-oficial-araguari/diario-oficial-araguari.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import scrapy -from scrapy.http import Request - -from scrapy.crawler import CrawlerProcess - -COLLECTION_URL = "https://www.araguari.mg.gov.br/assets/uploads/correio/" -FOLDER_PATH = "diario-oficial-araguari-scrapy" - - -class DOMAraguari(scrapy.Spider): - - name = "diario-oficial-araguari" - start_urls = [COLLECTION_URL] - - def __init__(self): - if not os.path.exists(FOLDER_PATH): - os.makedirs(FOLDER_PATH) - - def 
parse(self, response): - links = [a.attrib["href"] for a in response.css('td > a:not([href *= "assets"])')] - for link in links: - yield Request(url=response.urljoin(link), callback=self.save_file) - - def save_file(self, response): - file_name = response.url.split('/')[-1] - directory = os.path.join(FOLDER_PATH, "{}".format(file_name[-3:])) - - if not os.path.exists(directory): - os.makedirs(directory) - - path = os.path.join(directory, file_name) - self.logger.info('Saving file %s', file_name) - with open(path, 'wb') as f: - f.write(response.body) - -def main(): - process = CrawlerProcess({ - 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0', - 'DOWNLOAD_TIMEOUT': 1800, - 'DOWNLOAD_WARNSIZE': 500000000, - 'RETRY_TIMES': 5 - }) - - process.crawl(DOMAraguari) - process.start() - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/diario-oficial-de-mg/.gitignore b/standalone-crawlers/diario-oficial-de-mg/.gitignore deleted file mode 100644 index 219f88c9..00000000 --- a/standalone-crawlers/diario-oficial-de-mg/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -chromedriver_win_79-0-3945-36.exe -*.json -*.zip diff --git a/standalone-crawlers/diario-oficial-de-mg/crawler.py b/standalone-crawlers/diario-oficial-de-mg/crawler.py deleted file mode 100644 index 44cceb1c..00000000 --- a/standalone-crawlers/diario-oficial-de-mg/crawler.py +++ /dev/null @@ -1,708 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.chrome.webdriver import WebDriver -from selenium.webdriver.remote.webelement import WebElement -from selenium.common.exceptions import NoSuchElementException -from selenium.common.exceptions import NoSuchFrameException -from selenium.common.exceptions import StaleElementReferenceException -from selenium.common.exceptions import ElementClickInterceptedException - -from shutil import copyfile - -from PyPDF2.utils import PdfReadError -from PyPDF2 import PdfFileReader - -import requests -import json -import datetime -import locale -import time -import os -import PyPDF2 -from tika import parser - -SCREEN_ON = True - -class LimitOfAttemptsReached(Exception): - pass - -def initChromeWebdriver ( - path_to_driver: str = None, - use_window: bool = False, - ignore_console: bool = True -) -> WebDriver: - """ - Returns a ready to use chrome driver. - It will not display warning and errors that the site prints on the - navigator console. - - Keyword arguments: - path_to_driver -- path to the chrome driver executable. No need to - pass if the driver is set in path. - use_window -- if True, a instance of chrome will be opened - and one could watch the execution of the program - print_console -- if true, log information printed in the navigator - console will be printed in the output - """ - # If the chrome webdriver is added to path, leave path_to_driver blank - # else, pass path to executable - - # Setting Chrome Browser - chrome_options = webdriver.ChromeOptions() - - chrome_options.add_argument('--window-size=1420,1080') - - if ignore_console: - # will only print fatal errors - chrome_options.add_argument('log-level=3') - - if not use_window: - # --no-sandbox: needed for chrome to start headless. - # Its a security feature on chrome, see linkg - # https://www.google.com/googlebooks/chrome/med_26.html - # So, be careful with the pages being opened. - chrome_options.add_argument('--no-sandbox') - # --headless: web browser without a graphical user interface. 
- chrome_options.add_argument('--headless') - # - chrome_options.add_argument('--disable-gpu') - - if path_to_driver is None: - # if driver is not configured to path, it will raise an excpetion - # selenium.common.exceptions.WebDriverException: Message: ‘geckodriver’ executable needs to be in PATH - driver = webdriver.Chrome(options=chrome_options) - driver.set_page_load_timeout(60) - return driver - else: - driver = webdriver.Chrome(path_to_driver, options=chrome_options) - driver.set_page_load_timeout(60) - return driver - -def getLastNewspaperDate(driver: WebDriver) -> datetime.date: - """Finds the date of the last newspaper published.""" - last_newspaper = driver.find_element_by_xpath( - "//div[@id='links-constantes-direita']/fieldset/table/tbody/tr/td/a" - ) - last_newspaper_date = last_newspaper.get_attribute("text") - # now its like "Jornal de 01/02/2020" - last_newspaper_date = last_newspaper_date.split(" ")[-1] - # now its like "01/02/2020" - last_newspaper_date = last_newspaper_date.split("/") - # now its like ["01", "02", "2020"] - last_newspaper_date = datetime.date( - int(last_newspaper_date[2]), - int(last_newspaper_date[1]), - int(last_newspaper_date[0]) - ) - - return last_newspaper_date - -def getYear(driver: WebDriver, date: datetime.date) -> WebElement: - """Finds the anchor for the year of , if it exists.""" - yearly_navigation = driver.find_element_by_xpath( - "//div[@id='links-constantes-direita']/fieldset/table/tbody" - ) - anchors = yearly_navigation.find_elements_by_tag_name("a") - - print("Trying to find year...") - - for a in anchors: - if a.get_attribute("text") == str(date.year): - return a - - return None - -def getMonth(driver: WebDriver, date: datetime.date) -> WebElement: - """Finds the anchor for the month of , if it exists.""" - month_table = driver.find_element_by_id("id-lista-subcomunidades") - month_links = month_table.find_elements_by_tag_name("td") - - print("Trying to find month...") - - target_month = month_links[date.month - 1] - try: - return target_month.find_element_by_tag_name("a") - except NoSuchElementException: - return None - -def getDay(driver: WebDriver, date: datetime.date) -> WebElement: - """Finds the anchor for the day of , if it exists.""" - # will explore the column of the corresponding weekday to find the date - print("Trying to find day...") - - column = date.weekday() + 1 # weekday() -> 0 == monday, 6 == sunday - if column == 7: # if sunday - column = 0 - - calendar_table = driver.find_element_by_id("id-lista-subcomunidades") - trs = calendar_table.find_elements_by_tag_name("tr") - trs = trs[2:] # ignores lines with month name and week day - - target_day_td = None - - for tr in trs: - tds = tr.find_elements_by_tag_name("td") - - text = tds[column].text - if text != "-" and int(text) == date.day: - target_day_td = tds[column] - - if target_day_td is None: - return None - - try: - return target_day_td.find_element_by_tag_name("a") - except NoSuchElementException: - return None - -def waitNavigationToLoad(driver: WebDriver, action_type: str): - """ - Waits for a element to load, then returns it. - For 128 attempts it will ignore NoSuchElementException and - StaleElementReferenceException if they occur while trying to get the - element. - - type -- should be in ['month', 'day', 'news']. - Waits for main navigation div to load. 
- """ - if action_type == 'month': - target_size = [12] - elif action_type == 'day': - target_size = [6, 7, 8] - elif action_type == 'news': - pass - else: - print("Wrong value for type") - exit() - - time.sleep(2) - - attempt = 1 - - while 1: - try: - nvg_table = driver.find_element_by_id("id-lista-subcomunidades") - - if action_type in ["month", "day"]: - rows = nvg_table.find_elements_by_tag_name("tr") - if len(rows) in target_size: - return - else: - if "Seg" not in nvg_table.text: - time.sleep(2) - return - - except NoSuchElementException: - print("NoSuchElementException") - pass - except StaleElementReferenceException: - print("StaleElementReferenceException") - pass - - print(f"Attempt #{attempt} failed, waiting {action_type} navigation to load...") - attempt += 1 - time.sleep(1) - - if attempt == 128: - raise LimitOfAttemptsReached("Limit of attempts reached, aborting") - -def getNewspaperLink(driver: WebDriver, date: datetime.date) -> str: - """ - Checks if there is newspaper for a given date. - Returns url of the pdf viewer page if so, None otherwise. - """ - year_anchor = getYear(driver, date) - if year_anchor is None: - print(f"Unable to find year option for year {date.year}") - return None - year_anchor.click() - waitNavigationToLoad(driver, "month") - - month_anchor = getMonth(driver, date) - if month_anchor is None: - print("Target month have no newspaper") - return None - month_anchor.click() - waitNavigationToLoad(driver, "day") - - day_anchor = getDay(driver, date) - if day_anchor is None: - print("This day have no newspapers") - return None - day_anchor.click() - waitNavigationToLoad(driver, "news") - - news_btn = driver.find_element_by_id("id-lista-subcomunidades") - anchors = news_btn.find_elements_by_tag_name("a") - - links = {} - for a in anchors: - if len(a.text) == 0: # there are some broken links with empty texts - continue - links[a.text] = a.get_attribute("onclick").split("'")[1] - - print("Url found!") - return links - -def switchFrame(driver: WebDriver, attempts: int = 1): - """Swithcs driver to content frames when it loads.""" - try: - driver.switch_to.frame("blockrandom") - except NoSuchFrameException: - try: - h1 = driver.find_element_by_tag_name("h1") - if h1.text == "504 Gateway Time-out": - print("FATAL ERROR: 504 Gateway Time-out.") - exit() - except NoSuchElementException: - print(f"Attempt #{attempts} failed...") - - if attempts == 128: - raise LimitOfAttemptsReached("Limit of attempts reached, aborting") - - pass - - time.sleep(1) - switchFrame(driver, attempts + 1) - -def saveProgress(urls_listed: []): - """Saves list of urls to file 'temp.json'.""" - open("temp.json", "w+").write(json.dumps(urls_listed, indent=1)) - -def listNewspaperToDownload() -> [(datetime.date, str)]: - """Finds all newspapers not downloaded and returns a list with them.""" - # set language to be used for month and weekday names - print(">>>>>>>> Checking for new newspaper") - - locale.setlocale(locale.LC_TIME, "pt_BR.utf8") - - driver = initChromeWebdriver( - # 'chromedriver_win_79-0-3945-36.exe', - use_window=SCREEN_ON - ) - target = "http://www.iof.mg.gov.br/index.php?/ultima-edicao.html" - - driver.get(target) - print("Loaded page...") - switchFrame(driver) - - # to check if there is new data available - print("Getting date...") - last_newspaper_date = getLastNewspaperDate(driver) - try: - configs = json.loads(open("config.json").read()) - last_date_crawled = datetime.date.fromisoformat( - configs["last_date_crawled"] - ) - except FileNotFoundError: - last_date_crawled = 
datetime.date(2005, 6, 30) - - if last_newspaper_date <= last_date_crawled: - print("No new newspaper to crawl") - return [] - - print( - f"Last date crawled: {last_date_crawled}," - " last newspaper: {last_newspaper_date}" - ) - - try: # trying to continue previous progress - urls = json.loads(open("temp.json", "r").read()) - print("Continuing progress") - - except FileNotFoundError: - urls = [] - - if len(urls) > 0: - last_saved = datetime.date.fromisoformat(urls[-1]["date"]) - next_date = last_saved + datetime.timedelta(days=1) - else: - next_date = last_date_crawled + datetime.timedelta(days=1) - - last_saved = 0 - count_results = 0 - - while next_date <= last_newspaper_date: - time.sleep(5) # politiness? - print(f">>>> {count_results}. Checking for date:", next_date) - new_urls = getNewspaperLink(driver, next_date) - if new_urls is not None: - new_urls["date"] = str(next_date) - urls.append(new_urls) - count_results += 1 - - next_date += datetime.timedelta(days=1) - - if count_results > last_saved + 50: - saveProgress(urls) - last_saved = count_results - - print("Number of dates with newspaper:", len(urls)) - - driver.close() - - return urls - -def mergePdfFiles(files_to_merge: [str], output_file_address: str): - """Merge pdf files listed in one single file in the order they are.""" - pdf_merger = PyPDF2.PdfFileMerger() - for text_pdf_file in files_to_merge: - pdf_merger.append(PyPDF2.PdfFileReader(text_pdf_file, strict=False)) - pdf_merger.write(output_file_address) - pdf_merger.close() - -def pdf2Txt(source_file: str, final_file: str): - """Converts pdf file in a txt file.""" - raw = parser.from_file(source_file) - open(final_file, "w", encoding="utf-8").write(raw['content']) - -def createFolders(): - """Creates folders 'jornais/pdf/' and 'jornais/txt' if they do not exists.""" - try: - os.mkdir('jornais') - except FileExistsError: - pass - - try: - os.mkdir('jornais/pdf') - except FileExistsError: - pass - - try: - os.mkdir('jornais/txt') - except FileExistsError: - pass - -def openNextPage(driver: WebDriver): - """Interacts with page to load next pdf page.""" - attempts = 0 - print("Opening next page") - while True: - try: - next_page_div = driver.find_element_by_id( - "id-div-pagina-posterior" - ) - next_page_btn = next_page_div.find_element_by_tag_name("a") - next_page_btn.click() - time.sleep(1) - return - except NoSuchElementException: - print("NoSuchElementException") - time.sleep(1) - pass - except StaleElementReferenceException: - print("StaleElementReferenceException") - time.sleep(1) - pass - except ElementClickInterceptedException: - print("ElementClickInterceptedException") - time.sleep(1) - pass - - print(f"Attempt #{attempts} failed!") - attempts += 1 - if attempts == 16: - print("ERROR Limits of attempt reached") - exit() - -def getContentFrame(driver: WebDriver) -> WebElement: - """Locate the main iframe tag and returns it.""" - attempts = 0 - print("Getting content page") - while True: - try: - frame = driver.find_element_by_tag_name("iframe") - return frame - except NoSuchElementException: - print("NoSuchElementException") - time.sleep(1) - pass - except StaleElementReferenceException: - print("StaleElementReferenceException") - time.sleep(1) - pass - - print(f"Attempt #{attempts} failed!") - attempts += 1 - - if attempts == 128: - raise LimitOfAttemptsReached("Limit of attempts reached, aborting") - -def getFileNameFromUrl(pdf_source: str) -> str: - """Extracts file name from url, converting special characters.""" - return 
pdf_source.split("/")[-1].split("?")[0].replace("%20", " ") - -def downloadPdf(pdf_source: str, fname: str) -> bool: - """Download pdf stored in url pdf_source and saves it in fname.""" - attempt = 0 - - if fileDownloaded(fname): - print("Already saved") - return True - - while True: - print(pdf_source) - - myfile = requests.get(pdf_source) - if len(myfile.content) > 0: - f = open(fname, "wb") - f.write(myfile.content) - f.close() - - downloaded_file = getFileNameFromUrl(pdf_source) - try: - os.remove(downloaded_file) - except FileNotFoundError: - pass - - print("Page saved") - - print("Checking pdf...") - try: - pdf_merger = PyPDF2.PdfFileMerger() - pdf_merger.append(PyPDF2.PdfFileReader(fname, strict=False)) - print("ok") - return True - except PdfReadError: - print("PdfReadError, trying again...") - pass - - else: - print("File not Downloaded or empty.") - - print(f"Attempt #{attempt} to download pdf failed...", attempt) - attempt += 1 - if attempt == 16: - # not raising exception because some pdfs just have corrupted pages - print("ERROR Limit of attempts reached!") - - try: - os.remove(fname) - except FileNotFoundError: - pass - - return False - time.sleep(1) - -def downloadPdfPages(driver: WebDriver, date: str, n_pages: int, pdf_name: str) -> bool: - """ - Download all pages of a newspaper and merge them. - - It will create temporary pdf files to contain the pages, then merge - the files and delete them. File will be named 'jornais/pdf/.pdf' - """ - print("Number of pages:", n_pages) - files_to_merge = [] - - everything_ok = True - - page_count = 0 - while page_count < n_pages: - everything_ok_with_this_page = True - - print(f"Starting page ({page_count}/{n_pages-1})", getNow()) - if page_count > 0: - openNextPage(driver) - - pdf_frame = getContentFrame(driver) - pdf_source = pdf_frame.get_attribute("src") - - fname = f'jornais/temp-{str(date)}-{page_count}.pdf' - - if fileDownloaded(fname): - files_to_merge.append(fname) - else: - if not downloadPdf(pdf_source, fname): - everything_ok = False - everything_ok_with_this_page = False - else: - files_to_merge.append(fname) - - if everything_ok_with_this_page and page_count == 0: - n_pages_download = checkPagesDownloaded(fname) - if n_pages_download > 1: - if n_pages_download != n_pages: - print("More than one page downloaded, but not all. Giving up on this file.") - return False - else: - print("All pages in page one. Done.") - copyfile(fname, pdf_name) - os.remove(fname) - return True - - page_count += 1 - - print("Merging pdf pages...") - driver.close() - mergePdfFiles(files_to_merge, pdf_name) - - print("Deleting temporary pdf") - for file in files_to_merge: - os.remove(file) - - return everything_ok - -def fileDownloaded(fname: str) -> bool: - try: - f = open(fname, "r") - except FileNotFoundError: - return False - f.close() - return True - -def getNumberOfPages(driver: WebDriver,) -> str: - attempts = 0 - while attempts < 8: - try: - n_pages = driver.find_element_by_id("id-div-numero-pagina-corrente").text - - if n_pages is not None: - return n_pages - - except NoSuchElementException: - try: - error_message = driver.find_element_by_id("id-area-principal-esquerda").text - print("Problematic date. 
Returned page with message:") - print(error_message) - print("Ignoring it.") - return "" - except NoSuchElementException: - pass - - print(f"Attempts {attempts} for get page number failed...") - attempts += 1 - time.sleep(1) - - raise LimitOfAttemptsReached("ERROR Limit of attempts reached!") - -def downloadNewspaper(newspaper: dict) -> bool: - """ - Download correspondent newspaper, saves it in pdf and txt. - - Final files will have the address 'jornais/pdf/.pdf', - 'jornais/txt/.txt'. - """ - date = newspaper['date'] - everything_ok = True - - downloaded_something = False - - for news_type in newspaper: - everything_ok_with_this = True - - if news_type == 'date': - continue - - news_type_holder = news_type.replace("/", " ") - - print(">>>>>>>> Starting download for", date, "-", news_type, "-", newspaper[news_type]) - - pdf_name = f'jornais/pdf/{str(date)}-{news_type_holder}.pdf' - - # Delete 3 lines - if fileDownloaded(pdf_name): - print("Ja salvo") - continue - - downloaded_something = True - - driver = initChromeWebdriver( - # 'chromedriver_win_79-0-3945-36.exe', - use_window=SCREEN_ON - ) - target = newspaper[news_type] - - driver.get(target) - time.sleep(1) - - - n_pages = getNumberOfPages(driver) - if len(n_pages) == 0: - continue - - n_pages = n_pages.split(" ")[-1] - - # some newspapers have no page number and all pages are together - if n_pages == "de": - print("Unique pdf, starting download...") - pdf_frame = getContentFrame(driver) - pdf_source = pdf_frame.get_attribute("src") - - pdf_name = f'jornais/pdf/{str(date)}-{news_type_holder}.pdf' - if not downloadPdf(pdf_source, pdf_name): - everything_ok_with_this = False - everything_ok = everything_ok and everything_ok_with_this - else: - n_pages = int(n_pages) - if not downloadPdfPages(driver, date, n_pages, pdf_name): - everything_ok_with_this = False - everything_ok = everything_ok and everything_ok_with_this - - if everything_ok_with_this: - print("Creating txt version...") - pdf2Txt(pdf_name, f'jornais/txt/{str(date)}-{news_type_holder}.txt') - - print("Sleeping...") - time.sleep(60) # politiness? - - # return everything_ok - return downloaded_something - -def crawler(): - """ - Crawler to download newspaper from the page: - http://www.iof.mg.gov.br/index.php?/ultima-edicao.html. - - It saves the date of the last newspaper downloaded in the file config.json. - If file is not found, it will starting looking for newspapers after - 2015/06/30. - It will save newspaper in pdf and txt in folders 'jornais/pdf' and - 'jornais/txt'. - - If it is not the first run and there are new newspaper, just execute - the script again and it will check for updates. 
- """ - - urls_to_download = listNewspaperToDownload() - - saveProgress(urls_to_download) - urls_to_download = json.loads(open("temp.json", "r").read()) # DELETE - - problematic_pdf = [] - - createFolders() - for i in range(len(urls_to_download)): - url = urls_to_download[i] - - Uncomment - if not downloadNewspaper(url): - problematic_pdf.append(urls_to_download) - - if downloadNewspaper(url): # Delete - saveProgress(urls_to_download[i + 1:]) - - f = open("config.json", "w+") - last_date_crawled = str(urls_to_download[-1]["date"]) - f.write(json.dumps({"last_date_crawled": last_date_crawled}, indent=1)) - f.close() - - print("Probably corrupted or mising pages:") - print(problematic_pdf) - - try: - os.remove("temp.json") - except FileNotFoundError: - pass - -def getNow(): - date = datetime.datetime.now() - return f"{date.hour}:{date.minute}:{date.second}" - -def checkPagesDownloaded(pdf_name: str) -> int: - f = open(pdf_name, "rb") - reader = PdfFileReader(f) - n_pages = reader.getNumPages() - f.close() - return n_pages - -if __name__ == "__main__": - crawler() \ No newline at end of file diff --git a/standalone-crawlers/diario-oficial-itabirito/crawler.py b/standalone-crawlers/diario-oficial-itabirito/crawler.py deleted file mode 100644 index 20b80d47..00000000 --- a/standalone-crawlers/diario-oficial-itabirito/crawler.py +++ /dev/null @@ -1,88 +0,0 @@ -# -*- coding: utf-8 -*- -import os -from datetime import datetime -from tqdm import tqdm -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -import csv - - -driver = webdriver.Firefox() -driver.get("http://www.itabirito.mg.gov.br/oficio/") -assert "Prefeitura" in driver.title - -elements = driver.find_elements_by_class_name('clr-prefeitura') -links_pgs = [] -for elem in elements: - try: - link = driver.find_element_by_link_text(elem.text) - if link: - links_pgs.append(link.get_attribute("href")) - except: - pass - -numeroRegistros = len(links_pgs)-1 - -pbar = tqdm(total=numeroRegistros) - -registrosSalvos = 1 -title_list=[] -paragraphs_list=[] -external_links_list=[] -pdfs_list=[] - -dataConsulta = datetime.today().strftime('%Y_%m_%d') - -if not os.path.exists(dataConsulta): - os.makedirs(dataConsulta) - -# if not os.path.exists(dataConsulta): -# os.makedirs(dataConsulta) - -while(registrosSalvos<=numeroRegistros): - - # print('links_pgs[registrosSalvos]: ', links_pgs[registrosSalvos]) - - driver.get(links_pgs[registrosSalvos]) - - with open(os.path.join(dataConsulta,'publicacao_oficial_'+str(registrosSalvos)+'.html'), 'w') as f: - f.write(driver.page_source) - - title = driver.find_elements_by_class_name('clr-prefeitura')[1].text - - paragraphs = driver.find_element_by_xpath("//div[@class='content col-xs-12 col-sm-12 col-md-12 col-lg-12']//p").text - - links = driver.find_elements_by_xpath("//div[@class='content col-xs-12 col-sm-12 col-md-12 col-lg-12']//a") - pdfs="" - external_links="" - for link in links: - if ('.pdf' in str(link.get_attribute("href")) ): - pdfs+=link.get_attribute("href")+"\n" - else: - external_links+=link.get_attribute("href")+"\n" - - title_list.append(title) - paragraphs_list.append(paragraphs) - external_links_list.append(external_links) - pdfs_list.append(pdfs) - - registrosSalvos+=1 - pbar.update(1) - -if os.path.exists('diario-oficial-itabirito.csv'): - append_write = 'a' # append if already exists -else: - append_write = 'w' # make a new file if not - -with open('diario-oficial-itabirito.csv', append_write) as csvfile: - fieldnames = ['titulo', 'paragrafo','links_externos', 
'pdfs_anexos'] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - - writer.writeheader() - for i in range(0, registrosSalvos-1): - writer.writerow({'titulo': title_list[i], 'paragrafo': paragraphs_list[i], 'links_externos':external_links_list[i], 'pdfs_anexos':pdfs_list[i]}) - - -pbar.close() -driver.close() -driver.quit() \ No newline at end of file diff --git a/standalone-crawlers/diario-oficial-pocos-de-caldas/crawler.py b/standalone-crawlers/diario-oficial-pocos-de-caldas/crawler.py deleted file mode 100644 index 5b3524ff..00000000 --- a/standalone-crawlers/diario-oficial-pocos-de-caldas/crawler.py +++ /dev/null @@ -1,125 +0,0 @@ -import selenium -import os -import time - -from selenium import webdriver -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC - -DOC_URL = "https://pocosdecaldas.mg.gov.br/transparencia/diario-oficial-do-municipio/" - -DEBUG = True - -RETRY_LIMIT = 10 - -def init_driver(headless=True, timeout=30): - """ - Initializes the Firefox Driver - - :param headless: if set to true, no window is rendered by Selenium - :param timeout: number of seconds to wait for a page to load - :return: the configured driver - """ - - fp = webdriver.FirefoxProfile() - # Download files inside a folder called tmp in the current dir - fp.set_preference("browser.download.folderList", 2) - fp.set_preference("browser.download.dir", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("browser.download.defaultFolder", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("pdfjs.disabled", True) - fp.set_preference("plugin.scan.Acrobat", "99.0"); - fp.set_preference("plugin.scan.plid.all", False); - fp.set_preference("browser.download.manager.showWhenStarting", False) - fp.set_preference("browser.download.manager.focusWhenStarting", False) - fp.set_preference("browser.download.manager.closeWhenDone", True) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf"); - fp.set_preference("dom.max_script_run_time", 1) - - options = Options() - options.headless = headless - - driver = webdriver.Firefox(options=options, firefox_profile=fp) - driver.set_page_load_timeout(timeout) - return driver - - -def load_or_retry(driver, url): - """ - Tries to GET the supplied url using the driver - - :param driver: the driver to access the page from - :param url: url to load - :returns: the driver in the desired url - """ - tries = 0 - - # tries the required number of times - while tries < RETRY_LIMIT: - try: - # leaves the loop if URL is correctly loaded - driver.get(url) - break - except: - tries += 1 - - if tries >= RETRY_LIMIT: - raise Exception("Couldn't reach {}".format(url)) - - return driver - - -def download_pdfs_url(driver, url): - """ - Downloads the PDF files for the given url - - :param driver: a Firefox driver for Selenium - :param url: the url to get the files from - """ - - driver = load_or_retry(driver, url) - - main_page_sel = "article > .inner-article > .entry-text > p a" - - driver.find_element_by_css_selector(main_page_sel).click() - - time.sleep(10) - - entry_limit_formset = driver.\ - find_elements_by_css_selector(".form-horizontal .form-group")[4] - entry_limit_form = entry_limit_formset.\ - find_element_by_css_selector(".form-control") - entry_limit_form.clear() - entry_limit_form.send_keys("100000") - - send_button 
= driver.find_element_by_css_selector(".form-horizontal button") - send_button.click() - - time.sleep(5) - - down_btn_selector = ".main .main-body table table .link i.fa.fa-download" - - WebDriverWait(driver, 30).until(\ - EC.presence_of_element_located((By.CSS_SELECTOR, down_btn_selector))\ - ) - - down_btns = driver.find_elements_by_css_selector(down_btn_selector) - - for btn in down_btns: - btn.click() - time.sleep(2) - - time.sleep(60) - - -def main(): - driver = init_driver(True, 30) - - download_pdfs_url(driver, DOC_URL) - - driver.close() - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/diario-oficial-uberaba/diario-oficial-uberaba.py b/standalone-crawlers/diario-oficial-uberaba/diario-oficial-uberaba.py deleted file mode 100644 index f75e139c..00000000 --- a/standalone-crawlers/diario-oficial-uberaba/diario-oficial-uberaba.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import time -from selenium import webdriver -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as ec - -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.options import Options - -for i in range(2020, 2021): - - directory = "/mnt/louise/diario-oficial-uberaba/" + str(i) - if not os.path.exists(directory): - os.makedirs(directory) - - fp = webdriver.FirefoxProfile() - - mime_types = "application/pdf, application/zip" - fp.set_preference("browser.download.folderList", 2) - fp.set_preference("browser.download.dir", directory) - fp.set_preference("browser.download.downloadDir", directory) - fp.set_preference("browser.download.defaultFolder", directory) - fp.set_preference("pdfjs.disabled", True) - fp.set_preference("plugin.scan.Acrobat", "99.0"); - fp.set_preference("plugin.scan.plid.all", False); - fp.set_preference("browser.download.manager.showWhenStarting", False) - fp.set_preference("browser.download.manager.focusWhenStarting", False) - fp.set_preference("browser.download.manager.closeWhenDone", True) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk", mime_types) - - options = Options() - options.headless = True - driver = webdriver.Firefox(options=options, firefox_profile=fp, service_log_path = os.path.join(os.getcwd(), "geckodriver.log")) - - driver.get("http://www.uberaba.mg.gov.br/portal/galeriaarquivosd,portavoz,arquivos,{}".format(i)) - WebDriverWait(driver, 60).until(ec.presence_of_element_located((By.CSS_SELECTOR, "div.claGaleriaBoxFileTable"))) - - pages = driver.find_elements_by_css_selector("div.claGaleriaPaginas:first-of-type a") - npages = len(pages) - - for p in range(npages): - pages = driver.find_elements_by_css_selector("div.claGaleriaPaginas:first-of-type a") - pages[p].click() - time.sleep(10) - WebDriverWait(driver, 60).until(ec.presence_of_element_located((By.CSS_SELECTOR, "img.claCursorPointer"))) - - elem = driver.find_elements_by_css_selector("img.claCursorPointer") - for i in range(len(elem)): - time.sleep(2) - elem[i].click() - - time.sleep(20) - - time.sleep(60) - - driver.quit() - -driver.close() diff --git a/standalone-crawlers/diario-oficial-uberlandia/crawler.py b/standalone-crawlers/diario-oficial-uberlandia/crawler.py deleted file mode 100644 index 9e87cdac..00000000 --- a/standalone-crawlers/diario-oficial-uberlandia/crawler.py +++ /dev/null @@ -1,193 +0,0 @@ -import selenium -import os -import time - -from selenium import webdriver -from 
selenium.webdriver.firefox.firefox_profile import FirefoxProfile -from selenium.webdriver.firefox.options import Options -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC - -DOC_URL_BEFORE_2018 = "https://www.uberlandia.mg.gov.br/{}/{}/?post_type=diario_oficial" -DOC_URL_AFTER_2018 = "https://www.uberlandia.mg.gov.br/{}/{}/?post_type=diariooficial" - -DEBUG = True - -RETRY_LIMIT = 10 - -def init_driver(headless=True, timeout=30): - """ - Initializes the Firefox Driver - - :param headless: if set to true, no window is rendered by Selenium - :param timeout: number of seconds to wait for a page to load - :return: the configured driver - """ - - fp = webdriver.FirefoxProfile() - # Download files inside a folder called tmp in the current dir - fp.set_preference("browser.download.folderList", 2) - fp.set_preference("browser.download.dir", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("browser.download.defaultFolder", os.path.join(os.getcwd(), 'tmp')) - fp.set_preference("pdfjs.disabled", True) - fp.set_preference("plugin.scan.Acrobat", "99.0"); - fp.set_preference("plugin.scan.plid.all", False); - fp.set_preference("browser.download.manager.showWhenStarting", False) - fp.set_preference("browser.download.manager.focusWhenStarting", False) - fp.set_preference("browser.download.manager.closeWhenDone", True) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf"); - fp.set_preference("dom.max_script_run_time", 1) - - options = Options() - options.headless = headless - - driver = webdriver.Firefox(options=options, firefox_profile=fp) - driver.set_page_load_timeout(timeout) - return driver - - -def load_or_retry(driver, url): - """ - Tries to GET the supplied url using the driver - - :param driver: the driver to access the page from - :param url: url to load - :returns: the driver in the desired url - """ - tries = 0 - - # tries the required number of times - while tries < RETRY_LIMIT: - try: - # leaves the loop if URL is correctly loaded - driver.get(url) - break - except: - tries += 1 - - if tries >= RETRY_LIMIT: - raise Exception("Couldn't reach {}".format(url)) - - return driver - - -def generate_urls(first_year, last_year): - """ - Generates all the urls to be downloaded - - :returns: a generator instance which iterates over all possible urls - """ - - # The urls for before and after 2018 are different - for year in range(first_year, 2018): - for month in range(1, 13): - yield DOC_URL_BEFORE_2018.format(year, month) - - - for year in range(2018, last_year + 1): - for month in range(1, 13): - yield DOC_URL_AFTER_2018.format(year, month) - - -def download_pdfs_url(driver, url): - """ - Downloads the PDF files for a given url (which is given by a month/year - combination) - - :param driver: a Firefox driver for Selenium - :param url: the url to get the files from - :return: True if there was data for this period, False otherwise - """ - - driver = load_or_retry(driver, url) - - finished = False - - while True: - # Get all the PDF links in page - elems = driver.find_elements_by_css_selector(\ - "section article a.elementor-post__read-more") - - num_elems = len(elems) - - if len(elems) == 0: - return False - - list_url = driver.current_url - for el in range(num_elems): - # Get all the PDF links in page again (page might have been - # refreshed) - elems = driver.find_elements_by_css_selector(\ - "section 
article a.elementor-post__read-more") - link_href = elems[el].get_attribute("href") - try: - # This should download the file, but this url also hangs the - # driver, so we wrap it around a try/except block to catch the - # timeout exception - driver.get(link_href) - except: - pass - - load_or_retry(driver, list_url) - time.sleep(1) - - next_button = driver.find_elements_by_css_selector("a.page-numbers.next") - if len(next_button) != 0: - # go to next page - curr_url = driver.current_url - next_button[0].click() - wait = WebDriverWait(driver, 10) - wait.until(lambda driver: driver.current_url != curr_url) - else: - # there is no next page, finish - return True - -def countEntries(driver, url): - """ - Counts the amount of PDF files for a given url - - :param driver: a Firefox driver for Selenium - :param url: the url in which to look for the files - :return: Number of entries for this period - """ - - driver = load_or_retry(driver, url) - - finished = False - - total = 0 - - while True: - # Get all the PDF links in page - elems = driver.find_elements_by_css_selector(\ - "section article a.elementor-post__read-more") - - total += len(elems) - time.sleep(1) - - if len(elems) == 0: - return total - - next_button = driver.find_elements_by_css_selector("a.page-numbers.next") - if len(next_button) != 0: - # go to next page - next_button[0].click() - else: - # there is no next page, finish - return total - - - -def main(): - driver = init_driver(True, 30) - - for url in generate_urls(2005, 2020): - download_pdfs_url(driver, url) - print(url) - - driver.close() - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/diario-oficial-uniao/.gitignore b/standalone-crawlers/diario-oficial-uniao/.gitignore deleted file mode 100644 index 13329052..00000000 --- a/standalone-crawlers/diario-oficial-uniao/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -*.zip -*.pyc diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/check_missing_files.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/check_missing_files.py deleted file mode 100644 index 6294f253..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/check_missing_files.py +++ /dev/null @@ -1,17 +0,0 @@ -import os -import json - -file_names = {} -with open("list_of_files.txt", "r") as f: - for line in f: - f_dict = json.loads(line) - file_names[f_dict["file_name"]] = f_dict - -files_downloaded = [] -for f in os.listdir("jornais-completos"): - files_downloaded.append(f) -files_downloaded = set(files_downloaded) - -for f in file_names: - if f not in files_downloaded: - print(f) diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/__init__.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/items.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/items.py deleted file mode 100644 index 55ff5d07..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/items.py +++ /dev/null @@ -1,14 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your scraped items -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/items.html - -import scrapy - - -class DiariouniaoItem(scrapy.Item): - # define the fields for your item here like: - # name = scrapy.Field() - pass diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/middlewares.py 
b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/middlewares.py deleted file mode 100644 index 638bce35..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/middlewares.py +++ /dev/null @@ -1,103 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define here the models for your spider middleware -# -# See documentation in: -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -from scrapy import signals - - -class DiariouniaoSpiderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the spider middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_spider_input(self, response, spider): - # Called for each response that goes through the spider - # middleware and into the spider. - - # Should return None or raise an exception. - return None - - def process_spider_output(self, response, result, spider): - # Called with the results returned from the Spider, after - # it has processed the response. - - # Must return an iterable of Request, dict or Item objects. - for i in result: - yield i - - def process_spider_exception(self, response, exception, spider): - # Called when a spider or process_spider_input() method - # (from other spider middleware) raises an exception. - - # Should return either None or an iterable of Request, dict - # or Item objects. - pass - - def process_start_requests(self, start_requests, spider): - # Called with the start requests of the spider, and works - # similarly to the process_spider_output() method, except - # that it doesn’t have a response associated. - - # Must return only requests (not items). - for r in start_requests: - yield r - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) - - -class DiariouniaoDownloaderMiddleware(object): - # Not all methods need to be defined. If a method is not defined, - # scrapy acts as if the downloader middleware does not modify the - # passed objects. - - @classmethod - def from_crawler(cls, crawler): - # This method is used by Scrapy to create your spiders. - s = cls() - crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) - return s - - def process_request(self, request, spider): - # Called for each request that goes through the downloader - # middleware. - - # Must either: - # - return None: continue processing this request - # - or return a Response object - # - or return a Request object - # - or raise IgnoreRequest: process_exception() methods of - # installed downloader middleware will be called - return None - - def process_response(self, request, response, spider): - # Called with the response returned from the downloader. - - # Must either; - # - return a Response object - # - return a Request object - # - or raise IgnoreRequest - return response - - def process_exception(self, request, exception, spider): - # Called when a download handler or a process_request() - # (from other downloader middleware) raises an exception. 
- - # Must either: - # - return None: continue processing this exception - # - return a Response object: stops process_exception() chain - # - return a Request object: stops process_exception() chain - pass - - def spider_opened(self, spider): - spider.logger.info('Spider opened: %s' % spider.name) diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/pipelines.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/pipelines.py deleted file mode 100644 index 02f4611d..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/pipelines.py +++ /dev/null @@ -1,11 +0,0 @@ -# -*- coding: utf-8 -*- - -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html - - -class DiariouniaoPipeline(object): - def process_item(self, item, spider): - return item diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/settings.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/settings.py deleted file mode 100644 index feb1d033..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/settings.py +++ /dev/null @@ -1,100 +0,0 @@ -# -*- coding: utf-8 -*- -from shutil import which -# Scrapy settings for diariouniao project -# -# For simplicity, this file contains only settings considered important or -# commonly used. You can find more settings consulting the documentation: -# -# https://docs.scrapy.org/en/latest/topics/settings.html -# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# https://docs.scrapy.org/en/latest/topics/spider-middleware.html - -BOT_NAME = 'diariouniao' - -SPIDER_MODULES = ['diariouniao.spiders'] -NEWSPIDER_MODULE = 'diariouniao.spiders' - - -# Crawl responsibly by identifying yourself (and your website) on the user-agent -#USER_AGENT = 'diariouniao (+http://www.yourdomain.com)' - -# Obey robots.txt rules -ROBOTSTXT_OBEY = True - -# Configure maximum concurrent requests performed by Scrapy (default: 16) -#CONCURRENT_REQUESTS = 32 - -# Configure a delay for requests for the same website (default: 0) -# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay -# See also autothrottle settings and docs -#DOWNLOAD_DELAY = 3 -# The download delay setting will honor only one of: -#CONCURRENT_REQUESTS_PER_DOMAIN = 16 -#CONCURRENT_REQUESTS_PER_IP = 16 - -# Disable cookies (enabled by default) -#COOKIES_ENABLED = False - -# Disable Telnet Console (enabled by default) -#TELNETCONSOLE_ENABLED = False - -# Override the default request headers: -#DEFAULT_REQUEST_HEADERS = { -# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', -# 'Accept-Language': 'en', -#} - -# Enable or disable spider middlewares -# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html -#SPIDER_MIDDLEWARES = { -# 'diariouniao.middlewares.DiariouniaoSpiderMiddleware': 543, -#} - -# Enable or disable downloader middlewares -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -#DOWNLOADER_MIDDLEWARES = { -# 'diariouniao.middlewares.DiariouniaoDownloaderMiddleware': 543, -#} - -# Enable or disable extensions -# See https://docs.scrapy.org/en/latest/topics/extensions.html -#EXTENSIONS = { -# 'scrapy.extensions.telnet.TelnetConsole': None, -#} - -# Configure item pipelines -# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html -# ITEM_PIPELINES = { -# 
'diariouniao.pipelines.DiariouniaoPipeline': 100, -# } - -# Enable and configure the AutoThrottle extension (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/autothrottle.html -#AUTOTHROTTLE_ENABLED = True -# The initial download delay -#AUTOTHROTTLE_START_DELAY = 5 -# The maximum download delay to be set in case of high latencies -#AUTOTHROTTLE_MAX_DELAY = 60 -# The average number of requests Scrapy should be sending in parallel to -# each remote server -#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 -# Enable showing throttling stats for every response received: -#AUTOTHROTTLE_DEBUG = False - -# Enable and configure HTTP caching (disabled by default) -# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings -#HTTPCACHE_ENABLED = True -#HTTPCACHE_EXPIRATION_SECS = 0 -#HTTPCACHE_DIR = 'httpcache' -#HTTPCACHE_IGNORE_HTTP_CODES = [] -#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' - -SELENIUM_DRIVER_NAME = 'chrome' -SELENIUM_DRIVER_EXECUTABLE_PATH = which("../chromedriver_win32_chr_81.exe") -# SELENIUM_DRIVER_ARGUMENTS=[] -SELENIUM_DRIVER_ARGUMENTS=['--headless'] # '--headle ss' if using chrome instead of firefox -DOWNLOADER_MIDDLEWARES = { - 'scrapy_selenium.SeleniumMiddleware': 0 -} - -DOWNLOAD_DELAY = 1 \ No newline at end of file diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/__init__.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/__init__.py deleted file mode 100644 index ebd689ac..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
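Note: the settings.py shown above wires scrapy_selenium's SeleniumMiddleware into the diariouniao project (driver name, executable path resolved with which(), and the '--headless' argument). The following is a minimal sketch, not part of the original repository, of how a spider consumes that configuration: it yields SeleniumRequest objects and reads the live WebDriver back from response.request.meta['driver'], the same pattern used by the download_missing_files.py, list_files.py and main_crawler.py spiders below. The spider name and target URL here are placeholders.

import scrapy
from scrapy_selenium import SeleniumRequest


class SeleniumConfigDemoSpider(scrapy.Spider):
    # Hypothetical name; the real spiders are 'download_missing_files',
    # 'list_files' and 'main_crawler'.
    name = 'selenium_config_demo'

    def start_requests(self):
        # SeleniumRequest is routed through scrapy_selenium.SeleniumMiddleware,
        # which is registered in DOWNLOADER_MIDDLEWARES in the settings.py above.
        yield SeleniumRequest(
            url='http://www.in.gov.br/leiturajornal',  # placeholder target
            callback=self.parse,
            dont_filter=True,
        )

    def parse(self, response):
        # The middleware exposes the live (headless) Chrome instance here, so
        # the spider can use Selenium waits and clicks while Scrapy handles
        # scheduling and politeness (DOWNLOAD_DELAY = 1).
        driver = response.request.meta['driver']
        self.logger.info("Page title via Selenium: %s", driver.title)
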
diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/download_missing_files.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/download_missing_files.py deleted file mode 100644 index e4657107..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/download_missing_files.py +++ /dev/null @@ -1,83 +0,0 @@ -import scrapy -from scrapy.crawler import CrawlerProcess -import requests -import logging -import os -import re -import time -import datetime -from shutil import which - -from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException - -from PyPDF2.utils import PdfReadError -from PyPDF2 import PdfFileReader - -import pandas # for date_range -import json - -class SeleniumSpider(scrapy.Spider): - name = 'download_missing_files' - - def __init__(self, *a, **kw): - super(SeleniumSpider, self).__init__(*a, **kw) - - def start_requests(self): - files = {} - - with open("list_of_files.txt", "r") as f: - file_names = {} - with open("list_of_files.txt", "r") as f: - for line in f: - f_dict = json.loads(line) - file_names[f_dict["file_name"]] = f_dict - - files_downloaded = [] - for f in os.listdir("jornais-completos"): - files_downloaded.append(f) - files_downloaded = set(files_downloaded) - - for f in file_names: - if f not in files_downloaded: - for p in range(int(file_names[f]['totalArquivos'])): - url_data = file_names[f].copy() - url = f"http://pesquisa.in.gov.br/imprensa/servlet/INPDFViewer?jornal={url_data['jornal']}" \ - f"&pagina={p + 1}&data={url_data['data']}&captchafield=firstAccess" - - url_data["pagina"] = str(p + 1) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> spawning {url_data} {url}") - yield scrapy.Request( - url=url, callback=self.save_page, meta=url_data, - priority=1 # min(1, 1000 - int(args['totalArquivos'])) # give priority to shorter documents - ) - - def save_page(self, response): - metadata = response.request.meta - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At save_page {metadata}") - metadata["data"] = metadata["data"].replace("/", "-") - name = "jornais/" + "_".join([ - metadata["date"], - metadata["jornal"], - "%04d" % (int(metadata["totalArquivos"])), - "%04d" % (int(metadata["pagina"])) - ]) + ".pdf" - with open(name, "wb") as f: - f.write(response.body) - -# c = CrawlerProcess({ -# 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1}, -# 'SELENIUM_DRIVER_NAME': 'chrome', -# 'SELENIUM_DRIVER_EXECUTABLE_PATH': which("../chromedriver_win32_chr_81.exe"), -# 'SELENIUM_DRIVER_ARGUMENTS': [], -# # 'SELENIUM_DRIVER_ARGUMENTS': ['--headless'], # '--headle ss' if using chrome instead of firefox, -# 'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800}, -# }) -# c.crawl(SeleniumSpider) -# c.start() diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/list_files.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/list_files.py deleted file mode 100644 index 97583b37..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/list_files.py +++ /dev/null @@ -1,170 +0,0 @@ -import scrapy -from scrapy.crawler import 
CrawlerProcess -import requests -import logging -import os -import re -import time -import datetime -from shutil import which - -from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException - -from PyPDF2.utils import PdfReadError -from PyPDF2 import PdfFileReader - -import json - -import pandas # for date_range - -class SeleniumSpider(scrapy.Spider): - name = 'list_files' - - def __init__(self, *a, **kw): - super(SeleniumSpider, self).__init__(*a, **kw) - - # def file_exists(file_name): - # try: - # with open(file_name) as f: - # pass - # except FileNotFoundError: - # return False - # return True - - def gen_base_url(self): - today = datetime.datetime.today() - - # for date in pandas.date_range("2020-05-02", today.strftime('%Y-%m-%d')): - # for date in pandas.date_range("2014-01-01", today.strftime('%Y-%m-%d')): - for date in pandas.date_range("2014-01-01", "2020-05-15"): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At gen_base_url") - url = f"http://www.in.gov.br/leiturajornal?data={date.strftime('%d-%m-%Y')}#daypicker" - metadata = {"date": date.strftime('%Y-%m-%d')} - - yield (url, metadata) - - def next_call(self, gen): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At next_call") - try: - url, metadata = next(gen) - metadata["generator"] = gen - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Calling {url}") - return SeleniumRequest(url=url, meta=metadata, callback=self.parse, dont_filter=True) - except StopIteration: - return [] - - def start_requests(self): - gen = self.gen_base_url() - yield self.next_call(gen) - - def parse(self, response): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At parser") - driver = response.request.meta['driver'] - date = response.request.meta['date'] - - list_files = open("list_of_files.txt", "a+") - - if self.move_to_document_list(driver) and self.there_is_documents(driver, date): - table_rows = self.get_document_table(driver) - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} - No. 
of rows: {len(table_rows)}") - - for tr in table_rows: - anchor = tr.find_element_by_xpath("td[1]/a") - link = anchor.get_attribute("onclick") - args = {"date": date} - - for i in link.split('?')[1][:-3].split('&'): - key, value = i.split('=') - args[key] = value - - args["file_name"] = f"{date}_{args['jornal']}.pdf" - list_files.write(json.dumps(args) + "\n") - - # for p in range(int(args['totalArquivos'])): - # url = f"http://pesquisa.in.gov.br/imprensa/servlet/INPDFViewer?jornal={args['jornal']}" \ - # f"&pagina={p + 1}&data={args['data']}&captchafield=firstAccess" - # url_data = args.copy() - - # url_data["pagina"] = str(p + 1) - - # self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> spawning {url_data} {url}") - # yield scrapy.Request( - # url=url, callback=self.save_page, meta=url_data, - # priority=1 # min(1, 1000 - int(args['totalArquivos'])) # give priority to shorter documents - # ) - - list_files.close() - yield self.next_call(response.request.meta['generator']) - - def wait_element(self, driver, xpath): - attempt = 1 - while attempt <= 8: - try: - driver.find_element_by_xpath(xpath) - return True - except NoSuchElementException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to locate element at " + xpath) - return False - - def move_to_document_list(self, driver): - full_btn_xpath = "//div[1]/div[2]/main/div[2]/section/div/div/div/div/div[4]/section/" \ - "div/div[2]/div/div[1]/div[3]/button[4]" - - if not self.wait_element(driver, full_btn_xpath): - return False - - driver.find_element_by_xpath(full_btn_xpath).click() - return True - - def there_is_documents(self, driver, date): - if not self.wait_element(driver, "//div/form/center"): - return False - - center = driver.find_element_by_xpath("//div/form/center") - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} TEXT - " + center.text) - if center.text == "Nenhum registro encontrado para a pesquisa.": - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} - No documents for date") - return False - - return True - - def get_document_table(self, driver): - time.sleep(2) # making sure the table is filled before checking - table_rows_xpath = "//*[@id=\"ResultadoConsulta\"]/tbody/tr" - if not self.wait_element(driver, table_rows_xpath): - return [] - table_rows = driver.find_elements_by_xpath(table_rows_xpath) - return table_rows - - def save_page(self, response): - metadata = response.request.meta - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At save_page {metadata}") - metadata["data"] = metadata["data"].replace("/", "-") - name = "jornais/" + "_".join([ - metadata["date"], - metadata["jornal"], - "%04d" % (int(metadata["totalArquivos"])), - "%04d" % (int(metadata["pagina"])) - ]) + ".pdf" - with open(name, "wb") as f: - f.write(response.body) - -# c = CrawlerProcess({ -# 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1}, -# 'SELENIUM_DRIVER_NAME': 'chrome', -# 'SELENIUM_DRIVER_EXECUTABLE_PATH': which("../chromedriver_win32_chr_81.exe"), -# 'SELENIUM_DRIVER_ARGUMENTS': [], -# # 'SELENIUM_DRIVER_ARGUMENTS': ['--headless'], # '--headle ss' if using chrome instead of firefox, -# 'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800}, -# }) -# c.crawl(SeleniumSpider) -# c.start() diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/main_crawler.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/main_crawler.py deleted file mode 100644 index aa161899..00000000 
--- a/standalone-crawlers/diario-oficial-uniao/diariouniao/diariouniao/spiders/main_crawler.py +++ /dev/null @@ -1,162 +0,0 @@ -import scrapy -from scrapy.crawler import CrawlerProcess -import requests -import logging -import os -import re -import time -import datetime -from shutil import which - -from scrapy_selenium import SeleniumRequest -from selenium import webdriver -from selenium.webdriver.common.by import By -from selenium.webdriver.support.wait import WebDriverWait -from selenium.webdriver.common.action_chains import ActionChains -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException - -from PyPDF2.utils import PdfReadError -from PyPDF2 import PdfFileReader - -import pandas # for date_range - -class SeleniumSpider(scrapy.Spider): - name = 'main_crawler' - - def __init__(self, *a, **kw): - super(SeleniumSpider, self).__init__(*a, **kw) - - # def file_exists(file_name): - # try: - # with open(file_name) as f: - # pass - # except FileNotFoundError: - # return False - # return True - - def gen_base_url(self): - today = datetime.datetime.today() - - # for date in pandas.date_range("2020-05-02", today.strftime('%Y-%m-%d')): - # for date in pandas.date_range("2014-01-01", today.strftime('%Y-%m-%d')): - for date in pandas.date_range("2019-09-25", "2020-05-15"): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At gen_base_url") - url = f"http://www.in.gov.br/leiturajornal?data={date.strftime('%d-%m-%Y')}#daypicker" - metadata = {"date": date.strftime('%Y-%m-%d')} - - yield (url, metadata) - - def next_call(self, gen): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At next_call") - try: - url, metadata = next(gen) - metadata["generator"] = gen - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Calling {url}") - return SeleniumRequest(url=url, meta=metadata, callback=self.parse, dont_filter=True) - except StopIteration: - return [] - - def start_requests(self): - gen = self.gen_base_url() - yield self.next_call(gen) - - def parse(self, response): - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At parser") - driver = response.request.meta['driver'] - date = response.request.meta['date'] - - if self.move_to_document_list(driver) and self.there_is_documents(driver, date): - table_rows = self.get_document_table(driver) - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} - No. 
of rows: {len(table_rows)}") - - for tr in table_rows: - anchor = tr.find_element_by_xpath("td[1]/a") - link = anchor.get_attribute("onclick") - args = {"date": date} - - for i in link.split('?')[1][:-3].split('&'): - key, value = i.split('=') - args[key] = value - - for p in range(int(args['totalArquivos'])): - url = f"http://pesquisa.in.gov.br/imprensa/servlet/INPDFViewer?jornal={args['jornal']}" \ - f"&pagina={p + 1}&data={args['data']}&captchafield=firstAccess" - url_data = args.copy() - - url_data["pagina"] = str(p + 1) - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> spawning {url_data} {url}") - yield scrapy.Request( - url=url, callback=self.save_page, meta=url_data, - priority=1 # min(1, 1000 - int(args['totalArquivos'])) # give priority to shorter documents - ) - - yield self.next_call(response.request.meta['generator']) - - def wait_element(self, driver, xpath): - attempt = 1 - while attempt <= 8: - try: - driver.find_element_by_xpath(xpath) - return True - except NoSuchElementException: - attempt += 1 - time.sleep(1) - self.logger.info("Unable to locate element at " + xpath) - return False - - def move_to_document_list(self, driver): - full_btn_xpath = "//div[1]/div[2]/main/div[2]/section/div/div/div/div/div[4]/section/" \ - "div/div[2]/div/div[1]/div[3]/button[4]" - - if not self.wait_element(driver, full_btn_xpath): - return False - - driver.find_element_by_xpath(full_btn_xpath).click() - return True - - def there_is_documents(self, driver, date): - if not self.wait_element(driver, "//div/form/center"): - return False - - center = driver.find_element_by_xpath("//div/form/center") - - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} TEXT - " + center.text) - if center.text == "Nenhum registro encontrado para a pesquisa.": - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {date} - No documents for date") - return False - - return True - - def get_document_table(self, driver): - time.sleep(2) # making sure the table is filled before checking - table_rows_xpath = "//*[@id=\"ResultadoConsulta\"]/tbody/tr" - if not self.wait_element(driver, table_rows_xpath): - return [] - table_rows = driver.find_elements_by_xpath(table_rows_xpath) - return table_rows - - def save_page(self, response): - metadata = response.request.meta - self.logger.info(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> At save_page {metadata}") - metadata["data"] = metadata["data"].replace("/", "-") - name = "jornais/" + "_".join([ - metadata["date"], - metadata["jornal"], - "%04d" % (int(metadata["totalArquivos"])), - "%04d" % (int(metadata["pagina"])) - ]) + ".pdf" - with open(name, "wb") as f: - f.write(response.body) - -# c = CrawlerProcess({ -# 'ITEM_PIPELINES': {'scrapy.pipelines.files.FilesPipeline': 1}, -# 'SELENIUM_DRIVER_NAME': 'chrome', -# 'SELENIUM_DRIVER_EXECUTABLE_PATH': which("../chromedriver_win32_chr_81.exe"), -# 'SELENIUM_DRIVER_ARGUMENTS': [], -# # 'SELENIUM_DRIVER_ARGUMENTS': ['--headless'], # '--headle ss' if using chrome instead of firefox, -# 'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800}, -# }) -# c.crawl(SeleniumSpider) -# c.start() diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/merge_documents.py b/standalone-crawlers/diario-oficial-uniao/diariouniao/merge_documents.py deleted file mode 100644 index 07467182..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/merge_documents.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import threading -import time -import 
concurrent.futures -import PyPDF2.utils - -def merge_doc_pages(docs): - docs.sort() - - name_parts = docs[0].split("_") - name_key = name_parts[0] + "_" + name_parts[1] - - pdf_merger = PyPDF2.PdfFileMerger() - for text_pdf_file in docs: - pdf_merger.append(PyPDF2.PdfFileReader("jornais/" + text_pdf_file, strict=False)) - pdf_merger.write(f"jornais-completos/{name_key}.pdf") - - print(name_key, "merged!") - - for text_pdf_file in docs: - os.remove("jornais/" + text_pdf_file) - - pdf_merger.close() - -while True: - print("main loop starting...") - docs = {} - folder = "jornais" - docs_completed = [] - for file_name in os.listdir(folder): - name_parts = file_name.split("_") - name_key = name_parts[0] + "_" + name_parts[1] - - if name_key not in docs: - docs[name_key] = { - "n_pages": int(name_parts[2]), - "pages": [] - } - - docs[name_key]["pages"].append(file_name) - if len(docs[name_key]["pages"]) == docs[name_key]["n_pages"]: - print(name_key, "pages downloaded") - docs_completed.append(name_key) - - docs_completed = set(docs_completed) - - with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: - executor.map(merge_doc_pages, [ - docs[doc]["pages"] for doc in docs if doc in docs_completed - ]) - - print("main loop sleeping...") - time.sleep(300) \ No newline at end of file diff --git a/standalone-crawlers/diario-oficial-uniao/diariouniao/scrapy.cfg b/standalone-crawlers/diario-oficial-uniao/diariouniao/scrapy.cfg deleted file mode 100644 index 952c5e6a..00000000 --- a/standalone-crawlers/diario-oficial-uniao/diariouniao/scrapy.cfg +++ /dev/null @@ -1,11 +0,0 @@ -# Automatically created by: scrapy startproject -# -# For more information about the [deploy] section see: -# https://scrapyd.readthedocs.io/en/latest/deploy.html - -[settings] -default = diariouniao.settings - -[deploy] -#url = http://localhost:6800/ -project = diariouniao diff --git a/standalone-crawlers/diario-prefeitura-bh/crawler.py b/standalone-crawlers/diario-prefeitura-bh/crawler.py deleted file mode 100644 index a3ed8c86..00000000 --- a/standalone-crawlers/diario-prefeitura-bh/crawler.py +++ /dev/null @@ -1,53 +0,0 @@ -from datetime import timedelta, date -import os -import requests -import re -from time import sleep - -def daterange(start_date, end_date): - for n in range(int ((end_date - start_date).days)): - yield start_date + timedelta(n) - - -def download(single_date): - try: - date = single_date.strftime("%d/%m/%Y") - print(date) - url = "http://portal6.pbh.gov.br/dom/iniciaEdicao.do?method=DomDia&dia=" + date - content = requests.get(url).text - if "Nenhum Artigo publicado em " in content: - return True - destination_dir = date.split("/")[-1] + "/" + date.replace("/", "-") - if not os.path.exists(destination_dir): - os.makedirs(destination_dir) - with open(destination_dir + "/diario-" + date.replace("/", "-") + ".html", "w") as f: - f.write(content) - base_files = "http://portal6.pbh.gov.br/dom/iniciaEdicao.do?method=DetalheArtigo&pk=" - files = re.findall(r"method=DetalheArtigo&pk=\d+", content) - for f in files: - f = f.split("=")[-1] - url = base_files + f - response = requests.get(url) - content = response.text - with open(destination_dir + "/anexo-" + date.replace("/", "-") + "-" + f +".html", "w") as f: - f.write(content) - if "Capa" in content: - for capa in re.findall(r"/dom\d+ ", content): - capa = capa.replace("/", "").replace(" ", "") - response = requests.get("http://portal6.pbh.gov.br/dom/Files/" + capa + " - assinado.pdf") - with open(destination_dir + "/capa-" + date.replace("/", "-") 
+ ".pdf", 'wb') as f: - f.write(response.content) - return True - except Exception as err: - print(err) - print("Deu erro") - sleep(60) - return False - -start_date = date(2018,11, 28) -end_date = date(2020, 3, 13) -for single_date in daterange(start_date, end_date): - while True: - print("Tentativa") - if download(single_date): - break diff --git a/standalone-crawlers/licitacoes-e/README.md b/standalone-crawlers/licitacoes-e/README.md deleted file mode 100644 index c0d39f01..00000000 --- a/standalone-crawlers/licitacoes-e/README.md +++ /dev/null @@ -1 +0,0 @@ -O codigo não esta completo e não consegue recuperar os dados do dominio pois o reCaptcha consegue identificar a navegaço automatizada. diff --git a/standalone-crawlers/licitacoes-e/licitacoes-e.py b/standalone-crawlers/licitacoes-e/licitacoes-e.py deleted file mode 100644 index 1de2e70f..00000000 --- a/standalone-crawlers/licitacoes-e/licitacoes-e.py +++ /dev/null @@ -1,107 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.support.ui import Select -import datetime -import time -import os - -from PIL import Image -import cv2 -import pytesseract - -import speech_recognition as sr -from os import remove -from os import path -from pydub import AudioSegment - - -# Quebra captcha de audio -def transcribe(audio, limit = 5): - sound = AudioSegment.from_mp3(audio) - AUDIO_FILE = "audio.wav" - sound.export(AUDIO_FILE, format="wav") - r = sr.Recognizer() - resposta = "" - attempts = 0 - with sr.AudioFile(AUDIO_FILE) as source: - while attempts < limit: - try: - audio = r.record(source) # read the entire audio file - resposta = r.recognize_google(audio) - attempts = limit + 1 - except: - attempts += 1 - remove(AUDIO_FILE) - return resposta, attempts - -#Qubra captcha de imagem -def solve_captcha(img_path): - img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) - img[img>110] = 240 - img[img<30] = 240 - img = img[:,20:150] - return pytesseract.image_to_string(img) - - -url = "https://www.licitacoes-e.com.br/aop/pesquisar-licitacao.aop?opcao=preencherPesquisar" - -options = webdriver.ChromeOptions() -options.add_argument("--start-maximized") -options.add_experimental_option("excludeSwitches", ["enable-automation"]) -options.add_experimental_option('useAutomationExtension', False) - -driver = webdriver.Chrome(options=options) -driver.get(url) -print("Pagina carregada") - -# Prenche formulario -# option = input("Situação da licitação: ") -option = "Publicada" -situacao = driver.find_element_by_xpath("""//*[@id="licitacaoPesquisaSituacaoForm"]/div[5]/span/input""") -situacao.clear() -situacao.send_keys(option) - -#Pede para usuario preencher o captcha -while True: - resposta = input("Captcha: ") - driver.find_element_by_xpath("""//*[@id="pQuestionAvancada"]""").send_keys(resposta) - driver.find_element_by_xpath("""//*[@id="licitacaoPesquisaSituacaoForm"]/div[14]/input""").click() - try: - Select(driver.find_element_by_xpath('//*[@id="tCompradores_length"]/label/select')).select_by_visible_text("Todos") - break - except: - continue - -# Para cada linha da tabela -table = driver.find_element_by_xpath("""//*[@id="tCompradores"]""") -for row in table.find_elements_by_xpath(".//tr/td[5]/img"): - - #abrindo nova janela - row.click() - time.sleep(2) - driver.switch_to_window(driver.window_handles[1]) - - # Passando por captcha - #clica no botao do captcha - time.sleep(2) - frame = driver.find_element_by_xpath('//*[@id="html_element"]/div/div/iframe') - driver.switch_to.frame(frame) - 
driver.find_element_by_xpath("//*[@id='recaptcha-anchor']").click() - driver.switch_to.default_content() - #escolhe audio - time.sleep(5) - frame = driver.find_element_by_xpath('//*[@id="bodyPrincipal"]/div[3]/div[4]/iframe') - driver.switch_to.frame(frame) - driver.find_element_by_xpath('//*[@id="rc-imageselect"]/div[3]/div[2]/div[1]/div[1]/div[2]').click() - driver.switch_to.default_content() - - #recupera dados para aquela tabela - # ... - - # Fecha aba atual - driver.close() - driver.switch_to_window(driver.window_handles[0]) - -# Fecha janelas -print("Dados recolhidos") -time.sleep(5) -driver.close() diff --git a/standalone-crawlers/licitacoes-muriae/crawler.py b/standalone-crawlers/licitacoes-muriae/crawler.py deleted file mode 100644 index 53214c73..00000000 --- a/standalone-crawlers/licitacoes-muriae/crawler.py +++ /dev/null @@ -1,147 +0,0 @@ -import json -import logging -import os -import time -import scrapy - -from scrapy.http import TextResponse - -from scrapy.crawler import CrawlerProcess - -COLLECTION_URL = "https://muriae.mg.gov.br/licitacao/" -FOLDER_PATH = "coleta" -ID_FORMAT = "{:03d}" - -class MuriaeCrawler(scrapy.Spider): - name = "licitacoes_muriae" - start_urls = [ - COLLECTION_URL, - ] - - def __init__(self): - logging.getLogger('scrapy').setLevel(logging.WARNING) - - if not os.path.exists(FOLDER_PATH): - os.makedirs(FOLDER_PATH) - - # list of css selectors used to advance from one "layer" to the next - self.follow_list = [\ - '#projects-archive article a.header-button.callout-btn::attr("href")', - 'a.attachment-link::attr("href")'] - # list of css collectors used to advance from one page to the next in - # the same "layer" - self.pagination_list = ['li.nav-previous a::attr("href")', None, None] - # list of folders to store each layer in - self.folder_list = [\ - os.path.join(FOLDER_PATH, "listas"), - os.path.join(FOLDER_PATH, "paginas"), - os.path.join(FOLDER_PATH, "anexos") - ] - - if not os.path.exists(FOLDER_PATH): - os.makedirs(FOLDER_PATH) - - for folder_path in self.folder_list: - if not os.path.exists(folder_path): - os.makedirs(folder_path) - - def parse(self, response): - # get the full path to the current branch we're crawling - current_path = response.meta.get('crawl_path', '[1]') - current_path = json.loads(current_path) - # our current scraping "layer" - current_layer = int(response.meta.get('layer', 0)) - # our current page in this scraping "layer" - current_page = int(response.meta.get('page', 1)) - - file_name_pre = "" - for part in current_path: - if len(file_name_pre) > 0: - file_name_pre += "-" - file_name_pre += ID_FORMAT.format(part) - - # Detect if file is an html or an attachment - is_attachment = ("download" in response.headers.get("Content-Type").decode("utf-8")) - - if not isinstance(response, TextResponse) or is_attachment: - # Binary response or marked as attachment, download it - - file_name = file_name_pre + "--" - if "Content-Disposition" in response.headers: - file_data = response.headers["Content-Disposition"].decode("utf-8").split(";") - for d in file_data: - d_entry = d.split("=") - if len(d_entry) > 1: - name, value = d_entry - if name.strip() == "filename": - file_name += value - - file_name = os.path.join(self.folder_list[current_layer], file_name) - - with open(file_name, 'wb') as f: - f.write(response.body) - else: - # Text response (suppose it's html) - - # check if there is pagination at this layer - if current_layer < len(self.pagination_list) and \ - self.pagination_list[current_layer] is not None: - # go to the next page 
in this layer if it exists - next_page = response.css(self.pagination_list[current_layer]) - if next_page is not None and len(next_page) > 0: - next_page = next_page[0].get() - request = scrapy.Request(next_page, callback=self.parse) - request.meta['layer'] = current_layer - request.meta['page'] = current_page + 1 - # Update the current layer in the crawling tree - updated_path = current_path.copy() - updated_path[-1] = current_page + 1 - request.meta['crawl_path'] = json.dumps(updated_path) - yield request - - # write current file - file_name = file_name_pre + ".html" - file_name = os.path.join(self.folder_list[current_layer], file_name) - - with open(file_name, 'wb') as f: - f.write(response.body) - - # request all pages for next layer contained in this page - count = 1 - if current_layer < len(self.follow_list) and \ - self.follow_list[current_layer] is not None: - entry_links = response.css(self.follow_list[current_layer]) - # Insert another layer into the crawling tree - updated_path = current_path.copy() - updated_path.append(1) - # If next layer has multiple pages - multi_page = current_layer + 1 < len(self.pagination_list) and \ - self.pagination_list[current_layer + 1] is not None - if multi_page: - # Insert a new counter into the crawling tree for that - updated_path.append(0) - - for entry in entry_links: - curr_link = entry.get() - request = scrapy.Request(curr_link, callback=self.parse) - request.meta['page'] = 1 - request.meta['layer'] = current_layer + 1 - if multi_page: - updated_path[-2] = count - updated_path[-1] = 1 - else: - updated_path[-1] = count - request.meta['crawl_path'] = json.dumps(updated_path) - count += 1 - yield request - -def main(): - process = CrawlerProcess({ - 'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0' - }) - - process.crawl(MuriaeCrawler) - process.start() - -if __name__ == "__main__": - main() diff --git a/standalone-crawlers/licitacoes-obras/obras.py b/standalone-crawlers/licitacoes-obras/obras.py deleted file mode 100644 index 04bc5b65..00000000 --- a/standalone-crawlers/licitacoes-obras/obras.py +++ /dev/null @@ -1,20 +0,0 @@ -import requests -import logging -import os - -logging.basicConfig(level=logging.INFO) - -base = "http://geoobras.tce.mg.gov.br/cidadao/Obras/ObrasPaginaInteiraDetalhes.aspx?IDOBRA=" -not_found = "possivel localizar a obra" -for i in range(50000): - # logging.info("Coletando obra de ID: " + str(i)) - url = base + str(i) - response = requests.get(url) - content = response.text - if not_found in content: - continue - folder = "obras" + str(i%100) - if not os.path.exists("obras_tce/" + folder): - os.makedirs("obras_tce/" + folder) - with open("obras_tce/" + folder + "/obra_id" + str(i), "w") as f: - f.write(content) diff --git a/standalone-crawlers/licitacoes-pbh/README.md b/standalone-crawlers/licitacoes-pbh/README.md deleted file mode 100644 index 6f0f573b..00000000 --- a/standalone-crawlers/licitacoes-pbh/README.md +++ /dev/null @@ -1,15 +0,0 @@ -Crawler for https://prefeitura.pbh.gov.br/licitacoes. - -As of March 18th 2020, there are 2210 processes. The crawler should run for approximately 5 hours to achieve full coverage (expected: 13GB of data). 
- -### Features -- Using ``requests`` and ``wget`` -- Downloads the following: - - base html for each process page - - external links - - attached files (pdf/zip/doc/docx/odt/odf/eml) -- Stops after 5 consecutive void pages -- Full coverage - -### TODO -- [ ] Access IDs randomly to avoid blocking \ No newline at end of file diff --git a/standalone-crawlers/licitacoes-pbh/crawler.py b/standalone-crawlers/licitacoes-pbh/crawler.py deleted file mode 100644 index 73c2cbbe..00000000 --- a/standalone-crawlers/licitacoes-pbh/crawler.py +++ /dev/null @@ -1,59 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Licitações' at https://prefeitura.pbh.gov.br/licitacoes -""" -import time -import logging - -from requests.exceptions import ConnectionError -from http.client import RemoteDisconnected -from licitacoes import config -from licitacoes import download - - -download_dir = config.DOWNLOAD_DIR -config.set_logging() - -current_page = config.START_PAGE -start_time = time.time() -retries = 0 -skipped_pages = 0 - -while current_page < config.MAX_PAGES: - try: - # time information - download.progress_information(current_page, start_time) - - # process not found - if download.check_access_forbidden(current_page): - logging.info("Page " + str(current_page) + ": 403 Forbidden") - skipped_pages += 1 - current_page += 1 - if download.check_max_skipped_pages(skipped_pages): - break - else: - continue - elif download.check_page_exists(current_page): - logging.info("Page " + str(current_page) + ": nothing to download") - skipped_pages += 1 - current_page += 1 - if download.check_max_skipped_pages(skipped_pages): - break - else: - continue - else: - skipped_pages = 0 - logging.info("Crawling files on page " + str(current_page)) - - # fetching html - download.crawl_search_page(current_page) - - current_page = current_page + 1 - retries = 0 - - # Check for timeout and retry download - except (TimeoutError, ConnectionError, RemoteDisconnected): - if download.check_max_retries(retries): - break diff --git a/standalone-crawlers/licitacoes-pbh/licitacoes/__init__.py b/standalone-crawlers/licitacoes-pbh/licitacoes/__init__.py deleted file mode 100644 index 8616bf84..00000000 --- a/standalone-crawlers/licitacoes-pbh/licitacoes/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Licitações' at https://prefeitura.pbh.gov.br/licitacoes -""" \ No newline at end of file diff --git a/standalone-crawlers/licitacoes-pbh/licitacoes/config.py b/standalone-crawlers/licitacoes-pbh/licitacoes/config.py deleted file mode 100644 index 5963d4a9..00000000 --- a/standalone-crawlers/licitacoes-pbh/licitacoes/config.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Licitações' at https://prefeitura.pbh.gov.br/licitacoes -""" -import logging - -BASE_FOLDER = "/br.gov.pbh.prefeitura/licitacoes" -DOWNLOAD_DIR = "/Volumes/Work HD/MPMG" -NO_PERMISSION_MESSAGE = "You don't have permission to access" -INVALID_PAGE_MESSAGE = 'Uma escolha inválida foi detectada. Por favor entre em contato com o administrador do site.' 
-MAX_RETRIES = 10 -WAIT_INTERVAL = 30 -START_PAGE = 1 -MAX_PAGES = 100 -MAX_SKIPPED_PAGES = 5 -BASE = "https://prefeitura.pbh.gov.br" -SEARCH_PAGE = "https://prefeitura.pbh.gov.br/licitacoes?field_situacao_value=1&page=" -CSS_SELECTOR_FILES = '.field.field--name-field-icone.field--type-image.field--label-hidden.field__item a[href]' -CSS_SELECTOR_LINKS = '.item_ar_licitacao a[href]' -USER_AGENT_LIST = [ - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' -] - - -def set_logging(): - logging.basicConfig(level=logging.INFO) diff --git a/standalone-crawlers/licitacoes-pbh/licitacoes/download.py b/standalone-crawlers/licitacoes-pbh/licitacoes/download.py deleted file mode 100644 index fdb1c730..00000000 --- a/standalone-crawlers/licitacoes-pbh/licitacoes/download.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Licitações' at https://prefeitura.pbh.gov.br/licitacoes -""" -import logging -import time -import wget -import requests -import os -import random -import mimetypes - -from licitacoes import config -from bs4 import BeautifulSoup -from random import randint - - -def progress_information(current_page, start_time): - if not current_page % 100: - wait_time = randint(1, 30) - e = int(time.time() - start_time) - elapsed_time = f'{e // 3600:02d}:{(e % 3600 // 60):02d}:{e % 60:02d}' - percentage = (current_page / config.MAX_PAGES) * 100 - logging.info("Elapsed time " + elapsed_time + " sec, " + str(percentage) + "% of all pages covered") - logging.info("Waiting " + str(wait_time) + " sec") - time.sleep(wait_time) - - -def get_url(link): - return config.BASE + str(link) - - -def get_search_url(current_page): - return config.SEARCH_PAGE + str(current_page) - - -def get_html_contents(link): - """ - Returns HTML content in binary code - - Preserves text encoding - :param link: current process - :return: url contents - """ - user_agent = random.choice(config.USER_AGENT_LIST) - headers = {'User-Agent': user_agent} - url = get_url(link) - response = requests.get(url, headers=headers) - return response.content - - -def get_html_text(link=None, url=None): - if url is None: - url = get_url(link) - user_agent = random.choice(config.USER_AGENT_LIST) - headers = {'User-Agent': user_agent} - response = requests.get(url, headers=headers) - return response.text - - -def check_page_exists(current_page): - url = get_search_url(current_page) - text = 
get_html_text(url=url) - return config.INVALID_PAGE_MESSAGE in str(text) or "Acesso negado" in str(text) - - -def check_access_forbidden(current_page): - url = get_search_url(current_page) - text = get_html_text(url=url) - return config.NO_PERMISSION_MESSAGE in str(text) - - -def check_max_skipped_pages(skipped_ids): - if skipped_ids > config.MAX_SKIPPED_PAGES: - logging.info(str(skipped_ids) + " consecutive void pages. Nothing else to download") - return True - else: - return False - - -def check_max_retries(retries): - retries += 1 - retry_time = config.WAIT_INTERVAL * retries - logging.info("TimeoutError, retrying in " + str(retry_time) + " sec") - time.sleep(retry_time) - if retries > config.MAX_RETRIES: - logging.info("Max retries reached, terminating crawler") - return True - - -def get_output_dir(link): - return config.DOWNLOAD_DIR + config.BASE_FOLDER + link + '/' - - -def crawl_search_page(current_page): - search_url = get_search_url(current_page) - html_text = get_html_text(url=search_url) - links = get_links(html_text, config.CSS_SELECTOR_LINKS) - for link in set(links): - logging.info("Crawling " + str(link)) - download_html(link) - get_files_links(link) - - -def download_html(link): - content = get_html_contents(link) - output_dir = get_output_dir(link) - _, filename = parse_link(link) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - logging.info("Creating directory: " + str(output_dir)) - with open(output_dir + filename + ".html", "wb") as f: - f.write(content) - - -def download_process_files(base_link, url): - output_dir = get_output_dir(base_link) - try: - wget.download(url, out=output_dir) - time.sleep(1.5) - except: - logging.info('File not available at ' + str(url)) - - -def save_process_links(base_link, url): - output_dir = get_output_dir(base_link) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - logging.info("Creating directory: " + str(output_dir)) - with open(output_dir + "links.txt", "a") as f: - f.write(str(url)) - - -def get_files_links(base_link): - html_text = get_html_text(base_link) - links = get_links(html_text, config.CSS_SELECTOR_FILES) - for link in set(links): - if is_file(link): - logging.info("Downloading file from " + str(link)) - download_process_files(base_link, link) - else: - logging.info("Appending " + str(link) + " to links.txt") - save_process_links(base_link, link) - - -def is_file(url): - return mimetypes.guess_type(url)[0] - - -def get_links(html_text, css_element=config.CSS_SELECTOR_LINKS): - soup = BeautifulSoup(html_text, features='lxml') - links = [] - for link in soup.select(css_element): - links.append(link['href']) - return links - - -def parse_link(link): - return link.rsplit('/', 1) - diff --git a/standalone-crawlers/licitacoes-pocos-caldas/README.md b/standalone-crawlers/licitacoes-pocos-caldas/README.md deleted file mode 100644 index e78e925a..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/README.md +++ /dev/null @@ -1,16 +0,0 @@ -Crawler for https://pocosdecaldas.mg.gov.br/glossario/licitacoes/. - -As of March 30th 2020, there are 3868 processes. The crawler should run for a few hours achieve full coverage. - -### Features -- Using selenium (geckodriver) -- Downloads the following: - - html for search results - - html of each tab in 'Saiba mais...' 
-- Checks process IDs from 1 to 4000 - - Stops after 50 consecutive void IDs - -### TODO -- [ ] Access IDs randomly to avoid blocking -- [ ] Download files in `Participantes do Processo > Contratos` -- [ ] Download files in `Atas de Registro de Preços > Atas` \ No newline at end of file diff --git a/standalone-crawlers/licitacoes-pocos-caldas/crawler.py b/standalone-crawlers/licitacoes-pocos-caldas/crawler.py deleted file mode 100644 index bc3badba..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/crawler.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Licitações' at https://pocosdecaldas.mg.gov.br/ -""" -import logging -import time -from licitacoes import config, form, utils, process -from selenium.common import exceptions - -config.set_logging() -driver = config.get_driver() -driver.set_window_size(1920, 1080) -time.sleep(5) - -browser = utils.load_page(driver) - -logging.info('Filling search form') - -SearchForm = form.SearchForm() -search_results = SearchForm.fill_form(browser) - -process_count = config.START_PROCESS -start_time = time.time() -skipped_ids = 0 - -while process_count < config.MAX_PROCESSES: - try: - Process = process.BiddingProcess(process_count, search_results) - logging.info('Downloading search result ' + str(process_count)) - except exceptions.NoSuchElementException: - logging.error('Process does not exist') - process_count += 1 - skipped_ids += 1 - utils.check_max_skipped_ids(skipped_ids) - continue - utils.progress_information(process_count, start_time) - search_results = Process.extract_process_data(search_results) - process_count += 1 - -driver.close() diff --git a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/__init__.py b/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/__init__.py deleted file mode 100644 index a4142cfc..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"" -"" \ No newline at end of file diff --git a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/config.py b/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/config.py deleted file mode 100644 index c01aa099..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/config.py +++ /dev/null @@ -1,90 +0,0 @@ -import logging - -from selenium import webdriver -from selenium.webdriver.chrome.options import Options as ChromeOptions -from selenium.webdriver.firefox.options import Options as FirefoxOptions - - -BASE_FOLDER = "/br.gov.mg.pocosdecaldas/licitacoes" -DOWNLOAD_DIR = "/Volumes/Work HD/MPMG" - -PROFILE_PATH = "/Users/work/Library/Application Support/Firefox/Profiles/yde1eaqi.mpmg" - -START_PROCESS = 39 -MAX_PROCESSES = 4000 - -MAX_SKIPPED_IDS = 20 - -START_URL = "http://187.49.207.14:8081/portalcidadao/#78c3e513dd43cb27d8a3e2f376196ffc656d7ea577b2c6fbae696bb4f" \ - "e4e5639b796%C5%A690e87f4d63047fdd72ee224f99bf38dc8950906afc423ff6853c05e0a11a2f886c31119c58b6298c4" \ - "2fc9af04c84c6f1a52a741a1e66bf2e691388cd2a945d3362feb4b2aa570e068e90a9b811d197c088f07777e90419980e4" \ - "93642fbe6a4b75e31164bb2c70f08f308f28a9ad1142ec1a016f8453ed9c18ec7d19212daad26019a707cedb449c83bfc1" \ - "b3f31ad3d609ad126923dc87f6a4ea6a6c438e38107134f58a903a4ab4b9da16a974c4907e8c179e545586" - -MAX_RETRIES = 10 -WAIT_INTERVAL = 30 -WEBDRIVER_DELAY = 90 - -MAX_RESULTS_FIELD = '/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div[10]/input' -SORT_BUTTON = '//*[@id="gwt-uid-122"]' -SUBMIT_BUTTON = 
'/html/body/div[5]/div[1]/div[1]/div[2]/div/div[2]/div[11]/button' -FIRST_RESULT = "/html/body/div[5]/div[1]/div[1]/div[2]/div/div[3]/table/tbody/tr[2]/" \ - "td/div/table/tbody/tr/td/table/tbody/tr[2]/td[1]/a" - - -def set_options_firefox(): - firefox_options = FirefoxOptions() - firefox_options.add_argument("--headless") - firefox_options.add_argument("--window-size=1920x1080") - firefox_options.add_argument("--disable-notifications") - firefox_options.add_argument('--no-sandbox') - firefox_options.add_argument('--verbose') - firefox_options.add_argument('--disable-gpu') - firefox_options.add_argument('--disable-software-rasterizer') - firefox_options.set_preference("browser.download.manager.showWhenStarting", False) - firefox_options.set_preference('browser.helperApps.neverAsk.openFile', "application/pdf") - firefox_options.set_preference('browser.helperApps.neverAsk.saveToDisk', "application/pdf") - firefox_options.set_preference('browser.helperApps.neverAsk.openFile', "application/zip") - firefox_options.set_preference('browser.helperApps.neverAsk.saveToDisk', "application/zip") - firefox_options.set_preference('browser.helperApps.neverAsk.openFile', "application/msword") - firefox_options.set_preference('browser.helperApps.neverAsk.saveToDisk', "application/msword") - firefox_options.set_preference("pdfjs.disabled", "true") - return firefox_options - - -def set_options_chrome(): - chrome_options = ChromeOptions() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--window-size=1920x1080") - chrome_options.add_argument("--disable-notifications") - chrome_options.add_argument('--no-sandbox') - chrome_options.add_argument('--verbose') - chrome_options.add_argument('--disable-gpu') - chrome_options.add_argument("user-agent=python3.6") - chrome_options.add_argument('--disable-software-rasterizer') - chrome_options.add_experimental_option("prefs", { - "download.default_directory": DOWNLOAD_DIR, - "download.prompt_for_download": False, - "download.directory_upgrade": True, - "safebrowsing_for_trusted_sources_enabled": False, - "safebrowsing.enabled": False - }) - return chrome_options - - -def get_driver(profile_path=PROFILE_PATH, browser='firefox'): - if browser == 'firefox': - firefox_options = set_options_firefox() - if profile_path: - return webdriver.Firefox(firefox_profile=profile_path, options=firefox_options) - else: - return webdriver.Firefox(options=firefox_options) - elif browser == 'chrome': - chrome_options = set_options_chrome() - return webdriver.Chrome(chrome_options=chrome_options, executable_path="/usr/local/bin/chromedriver") - else: - return None - - -def set_logging(): - logging.basicConfig(level=logging.INFO) diff --git a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/form.py b/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/form.py deleted file mode 100644 index 6c086ea3..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/form.py +++ /dev/null @@ -1,47 +0,0 @@ -""" -Rúbia Reis Guerra -rubia-rg@github -Fills 'Licitaçoes' search form at https://pocosdecaldas.mg.gov.br/ -""" -import time -import logging - -from licitacoes import config, utils -from selenium.common import exceptions - - -class SearchForm: - def __init__(self, sort_button=config.SORT_BUTTON, max_results_field=config.MAX_RESULTS_FIELD, - submit_button=config.SUBMIT_BUTTON): - self.sort_button = sort_button - self.max_results_field = max_results_field - self.submit_button = submit_button - - def sort_results(self, driver): - try: - 
driver.find_element_by_xpath(self.sort_button).click() - except exceptions.NoSuchElementException: - time.sleep(5) - try: - driver.find_element_by_xpath(self.sort_button).click() - except exceptions.NoSuchElementException: - logging.error('Website has not loaded properly') - except exceptions.ElementClickInterceptedException: - element = driver.find_element_by_xpath(self.sort_button).click() - driver.execute_script("arguments[0].scrollIntoView();", element) - element.click() - - def set_max_displayed_results(self, driver): - driver.find_element_by_xpath(self.max_results_field).clear() - driver.find_element_by_xpath(self.max_results_field).send_keys(config.MAX_PROCESSES) - - def submit_results(self, driver): - submit_button = driver.find_element_by_xpath(self.submit_button) - submit_button.click() - - def fill_form(self, driver): - self.sort_results(driver) - self.set_max_displayed_results(driver) - self.submit_results(driver) - utils.wait_page_load(driver, element_xpath=config.FIRST_RESULT, delay=config.WEBDRIVER_DELAY) - return driver diff --git a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/process.py b/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/process.py deleted file mode 100644 index e2a755bf..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/process.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -Rúbia Reis Guerra -rubia-rg@github -Treats 'Licitaçoes' processes at https://pocosdecaldas.mg.gov.br/ -""" -from licitacoes import utils -from selenium.common import exceptions - - -class BiddingProcess: - def __init__(self, process_count, search_results): - self.process = search_results.find_element_by_xpath("/html/body/div[5]/div[1]/div[1]/div[2]/" - "div/div[3]/table/tbody/tr[2]/td/div/table/" - "tbody/tr/td/table/tbody/tr[" + - str(2 * process_count) + "]/td[1]/a") - self.process_id = utils.get_process_id(self.process) - - try: - self.process.click() - except exceptions.ElementClickInterceptedException: - search_results.execute_script("arguments[0].scrollIntoView();", self.process) - self.process.click() - - def extract_html_table(self, tab, driver): - table = self.process.find_element_by_xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[4]/table") - tab = table.find_element_by_xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[4]/table/tbody/" - "tr[2]/td/table/tbody/tr[1]/td/table/tbody/tr/td[" + str(tab) + - "]/table/tbody/tr[2]/td[2]/div/div/div") - tab_title = tab.get_attribute('innerHTML') - tab.click() - if tab_title in 'Participantes do Processo': - self.extract_contracts(driver) - # TODO - # elif tab_title in 'Atas de Registro de Preços' - # self.extract_atas() - table_html = table.get_attribute('innerHTML') - utils.save_html(table_html, self.process_id, tab_title) - - def return_search_results(self): - return_button = self.process.find_element_by_xpath("/html/body/div[5]/div[1]/div[1]/div[2]/div/div[4]/" - "div[1]/a[3]") - return_button.click() - - def extract_process_data(self, driver): - for tab in range(2, 6): - self.extract_html_table(tab, driver) - self.return_search_results() - return driver - - def extract_contracts(self, driver): - for contract_number in range(2, 21): - try: - contract = self.process.find_element_by_xpath( - '/html/body/div[5]/div[1]/div[1]/div[2]/div/div[4]/' - 'table/tbody/tr[2]/td/table/tbody/tr[2]/td/div/div[2]' - '/table/tbody/tr[' + str(contract_number) + - ']/td[5]/table/tbody/tr/td/table/tbody/tr/td/a') - contract.click() - except exceptions.ElementClickInterceptedException: - contract = 
self.process.find_element_by_xpath('/html/body/div[5]/div[1]/div[1]/div[2]/div/div[4]/' - 'table/tbody/tr[2]/td/table/tbody/tr[2]/td/div/div[2]' - '/table/tbody/tr[' + str(contract_number) + - ']/td[5]/table/tbody/tr/td/table/tbody/tr/td/a') - self.process.execute_script("arguments[0].scrollIntoView();", contract) - contract.click() - except exceptions.NoSuchElementException: - - continue - - # TODO - # download_link = contract.find_element_by_xpath("/html/body/div[8]/div/table/tbody/tr[2]/td[2]/div/table/" - # "tbody/tr[1]/td/div/table/tbody/tr[10]/td/div/table/tbody/" - # "tr[2]/td[5]/table/tbody/tr/td/a") - # download_link.click() - utils.wait_page_load(driver, element_xpath="/html/body/div[8]/div/table/tbody/tr[2]/td[2]/div/table" - "/tbody/tr[2]/td/table/tbody/tr/td/button") - contract_table = driver.find_element_by_xpath("/html/body/div[8]/div") - contract_id = contract_table.find_element_by_xpath("/html/body/div[8]/div/table/tbody/tr[2]/td[2]/" - "div/table/tbody/tr[1]/td/div/table/tbody/tr[1]/td[" - "2]/div").get_attribute('innerHTML') - contract_number, contract_year = utils.parse_process_id(contract_id) - contract_filename = "Contrato " + contract_year + '-' + contract_number - contract_html = contract_table.get_attribute('innerHTML') - utils.save_html(contract_html, self.process_id, contract_filename) - close_button = contract.find_element_by_xpath("/html/body/div[8]/div/table/tbody/tr[2]/td[2]/div/table/" - "tbody/tr[2]/td/table/tbody/tr/td/button") - close_button.click() - - # TODO - # def extract_atas(self): - # pass diff --git a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/utils.py b/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/utils.py deleted file mode 100644 index 97736137..00000000 --- a/standalone-crawlers/licitacoes-pocos-caldas/licitacoes/utils.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging -import time -import os - -from licitacoes import config -from random import randint -from selenium.common import exceptions -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as ec -from selenium.webdriver.common.by import By - - -def progress_information(current_id, start_time): - if not current_id % 50: - wait_time = randint(1, 30) - e = int(time.time() - start_time) - elapsed_time = f'{e // 3600:02d}:{(e % 3600 // 60):02d}:{e % 60:02d}' - percentage = (current_id // config.MAX_PROCESSES) * 100 - logging.info("Elapsed time " + elapsed_time + " sec, " + str(percentage) + "% of all IDs covered") - logging.info("Waiting " + str(wait_time) + " sec") - time.sleep(wait_time) - - -def check_max_skipped_ids(skipped_ids): - if skipped_ids > config.MAX_SKIPPED_IDS: - logging.info(str(skipped_ids) + " consecutive void IDs. 
Nothing else to download") - return True - else: - return False - - -def get_output_dir(process_id): - process_number, process_year = parse_process_id(process_id) - return config.DOWNLOAD_DIR + config.BASE_FOLDER + "/" + process_year + "/" + str(process_number) - - -def check_max_retries(retries): - retries += 1 - retry_time = config.WAIT_INTERVAL * retries - logging.info("TimeoutError, retrying in" + str(retry_time) + " sec") - time.sleep(retry_time) - if retries > config.MAX_RETRIES: - logging.info("Max retries reached, terminating crawler") - return True - - -def load_page(driver, url=config.START_URL): - try: - driver.get(url) - except exceptions.WebDriverException: - logging.error('Could not reach ' + url) - - logging.info("Waiting for page to load") - wait_page_load(driver) - - return driver - - -def wait_page_load(driver, element_xpath=config.MAX_RESULTS_FIELD, delay=config.WEBDRIVER_DELAY): - try: - WebDriverWait(driver, delay).until(ec.presence_of_element_located((By.XPATH, element_xpath))) - except exceptions.TimeoutException: - logging.error("TimeoutException: Page not loaded correctly") - - -def get_process_id(process): - process_id = str(process.get_attribute('innerHTML')) - return process_id - - -def parse_process_id(process_id): - process_number, process_year = process_id.split('/') - return process_number, process_year - - -def save_html(html_text, process_id, filename): - output_dir = get_output_dir(process_id) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - logging.info("Creating directory: " + str(output_dir)) - with open(output_dir + "/" + filename + ".html", "w+") as f: - f.write(html_text) diff --git a/standalone-crawlers/processo-compras/README.md b/standalone-crawlers/processo-compras/README.md deleted file mode 100644 index 38cbf0e4..00000000 --- a/standalone-crawlers/processo-compras/README.md +++ /dev/null @@ -1,21 +0,0 @@ -Crawler for https://www1.compras.mg.gov.br/processocompra/processo/consultaProcessoCompra.html. - -As of February 28th 2020, there are 309374 processes. The crawler should run for approximately 20 days to achieve full coverage (expected: 25GB of data). 
- -### Features -- Using ``requests`` and ``wget`` -- Downloads the following: - - html for tab "visualizacaoArquivosProcesso" - - relatorioDetalhesProcessoCompra (pdf) - - Edital (pdf/zip/doc/docx/odt/odf/eml) - - PublicJornalGrandeCirculacao (pdf/zip/doc/docx/odt/eml) - - ExtratoPublicacaoEdital (pdf/zip/doc/docx/odt/eml) - - RetificacaoEdital (pdf/zip/doc/docx/odt/eml) -- Checks process IDs from 1 to 1000000 - - Stops after 5000 consecutive void IDs -- Full coverage except for files in "Visualizar dados do pregão" - - The files from "Visualizar dados do pregão" are identical to the ones obtained by ``consulta-pregoes`` (~45000 processes) - -### TODO -- [ ] Access IDs randomly to avoid blocking -- [ ] Merge crawler with ``consulta-pregoes`` \ No newline at end of file diff --git a/standalone-crawlers/processo-compras/compras/__init__.py b/standalone-crawlers/processo-compras/compras/__init__.py deleted file mode 100644 index 869a3795..00000000 --- a/standalone-crawlers/processo-compras/compras/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Processos de Compra' at https://www1.compras.mg.gov.br/processocompra/processo/consultaProcessoCompra.html -""" \ No newline at end of file diff --git a/standalone-crawlers/processo-compras/compras/config.py b/standalone-crawlers/processo-compras/compras/config.py deleted file mode 100644 index bb6d3353..00000000 --- a/standalone-crawlers/processo-compras/compras/config.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Processos de Compra' at https://www1.compras.mg.gov.br/processocompra/processo/consultaProcessoCompra.html -""" -import logging - -BASE_FOLDER = "/br.gov.mg.compras/processocompra/" -PROFILE_PATH = "/Users/work/Library/Application Support/Firefox/Profiles/yde1eaqi.mpmg" -DOWNLOAD_DIR = "/Volumes/Work HD/MPMG" -NO_PERMISSION_MESSAGE = "You don't have permission to access" -PROCESS_NOT_FOUND_MESSAGE = 'O(A) "Processo Compra" não pode ser alterado(a), pois foi excluído(a) por outro usuário, ' \ - 'em acesso concorrente, enquanto esta tela era visualizada.' 
-RELATORIO_DETALHES = 'https://www1.compras.mg.gov.br/processocompra/processo/relatoriodetalhes/' \ - 'relatorioDetalhesProcessoCompra.html?id=' -MAX_RETRIES = 10 -WAIT_INTERVAL = 30 -START_PROCESS = 3030 -MAX_PROCESSES = 1000000 -MAX_SKIPPED_IDS = 5000 -BASE = "https://www1.compras.mg.gov.br/processocompra/processo/visualizacaoArquivosProcesso.html?id=" -FORM_DATA = {"idArquivo": '', "nomeArquivo": '', "chaveAberturaArquivo": ''} -USER_AGENT_LIST = [ - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14', - 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36', - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', - 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0' -] - -STRING_PATTERN = '\(([^)]+)\)' -JS_METHOD = 'visualizarArquivo' - - -def set_logging(): - logging.basicConfig(level=logging.INFO) diff --git a/standalone-crawlers/processo-compras/compras/download.py b/standalone-crawlers/processo-compras/compras/download.py deleted file mode 100644 index 9361c429..00000000 --- a/standalone-crawlers/processo-compras/compras/download.py +++ /dev/null @@ -1,156 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Processos de Compra' at https://www1.compras.mg.gov.br/processocompra/processo/consultaProcessoCompra.html -""" -import logging -import time -import wget -import requests -import os -import re -import random - -from compras import config -from bs4 import BeautifulSoup -from random import randint - - -def progress_information(current_id, start_time): - if not current_id % 100: - wait_time = randint(1, 30) - e = int(time.time() - start_time) - elapsed_time = f'{e // 3600:02d}:{(e % 3600 // 60):02d}:{e % 60:02d}' - percentage = (current_id // config.MAX_PROCESSES) * 100 - logging.info("Elapsed time " + elapsed_time + " sec, " + str(percentage) + "% of all IDs covered") - logging.info("Waiting " + str(wait_time) + " sec") - time.sleep(wait_time) - - -def get_base_url(current_id): - return config.BASE + str(current_id) - - -def get_html_contents(current_id): - """ - Returns HTML content in binary code - - Preserves text encoding - :param current_id: current process - :return: url contents - """ - user_agent = random.choice(config.USER_AGENT_LIST) - headers = {'User-Agent': user_agent} - url = get_base_url(current_id) - response = requests.get(url, headers=headers) - return response.content - - -def get_html_text(current_id): - url = get_base_url(current_id) - user_agent = random.choice(config.USER_AGENT_LIST) - headers = {'User-Agent': user_agent} - response = requests.get(url, headers=headers) - return response.text - - -def 
check_process_exists(current_id): - text = get_html_text(current_id) - return config.PROCESS_NOT_FOUND_MESSAGE in str(text) or "Acesso negado" in str(text) - - -def check_access_forbidden(current_id): - text = get_html_text(current_id) - return config.NO_PERMISSION_MESSAGE in str(text) - - -def check_max_skipped_ids(skipped_ids): - if skipped_ids > config.MAX_SKIPPED_IDS: - logging.info(skipped_ids + " consecutive void IDs. Nothing else to download") - return True - else: - return False - - -def get_output_dir(current_id): - current_id_range = current_id // 100 - return config.DOWNLOAD_DIR + config.BASE_FOLDER + str(current_id_range) + "/" + str(current_id) - - -def download_html(current_id): - content = get_html_contents(current_id) - output_dir = get_output_dir(current_id) - if not os.path.exists(output_dir): - os.makedirs(output_dir) - logging.info("Creating directory: " + str(output_dir)) - with open(output_dir + "/" + str(current_id) + ".html", "wb") as f: - f.write(content) - - -def get_relatorio_detalhes(current_id): - output_dir = get_output_dir(current_id) - try: - relatorio_detalhes_url = config.RELATORIO_DETALHES + str(current_id) - wget.download(relatorio_detalhes_url, out=output_dir) - time.sleep(.5) - except: - logging.info('Relatorio de Detalhes not available for ID ' + str(current_id)) - - -def check_max_retries(retries): - retries += 1 - retry_time = config.WAIT_INTERVAL * retries - logging.info("TimeoutError, retrying in" + str(retry_time) + " sec") - time.sleep(retry_time) - if retries > config.MAX_RETRIES: - logging.info("Max retries reached, terminating crawler") - return True - - -def download_process_files(current_id): - html_text = get_html_text(current_id) - params_list = parse_html(html_text) - for params in params_list: - form_data = fill_form_data(params) - get_files(current_id, form_data) - - -def fill_form_data(params): - form_data = config.FORM_DATA - form_data['idArquivo'] = params[0] - form_data['nomeArquivo'] = params[1] - form_data['chaveAberturaArquivo'] = params[2] - return form_data - - -def get_files(current_id, form_data): - output_dir = get_output_dir(current_id) - try: - url = get_base_url(current_id) + '&versao=51&metodo=abrirArquivo&exibirMensagem=false&idArquivo=' \ - + form_data['idArquivo'] + '&nomeArquivo=' + form_data['nomeArquivo'] + \ - '&chaveAberturaArquivo=' + form_data['chaveAberturaArquivo'] - wget.download(url, out=output_dir) - time.sleep(1.5) - except: - logging.info(form_data['nomeArquivo'] + ' not available for ID ' + str(current_id)) - - -def parse_html(html_text): - soup = BeautifulSoup(html_text, features='lxml') - search_string = get_search_string() - results = re.findall(search_string, str(soup)) - params_list = [] - for params in results: - if 'idArquivo' in params: - continue - params = treat_params_string(params) - params_list.append(params) - return params_list - - -def treat_params_string(params): - return params.replace("'", "").replace(" ", "").split(",") - - -def get_search_string(js_method=config.JS_METHOD, string_pattern=config.STRING_PATTERN): - return js_method + string_pattern diff --git a/standalone-crawlers/processo-compras/crawler.py b/standalone-crawlers/processo-compras/crawler.py deleted file mode 100644 index c834ce8b..00000000 --- a/standalone-crawlers/processo-compras/crawler.py +++ /dev/null @@ -1,66 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Rúbia Reis Guerra -rubia-rg@github -Crawler for 'Processos de Compra' at https://www1.compras.mg.gov.br/processocompra/processo/consultaProcessoCompra.html -""" 
-import time -import logging - -from requests.exceptions import ConnectionError -from http.client import RemoteDisconnected -from compras import config -from compras import download - - -download_dir = config.DOWNLOAD_DIR -config.set_logging() - -current_id = config.START_PROCESS -start_time = time.time() -retries = 0 -skipped_ids = 0 - -while current_id < config.MAX_PROCESSES: - try: - # time information - download.progress_information(current_id, start_time) - - # process not found - if download.check_access_forbidden(current_id): - logging.info("Processo de compra ID " + str(current_id) + ": 403 Forbidden") - skipped_ids += 1 - current_id = current_id + 1 - if download.check_max_skipped_ids(skipped_ids): - break - else: - continue - elif download.check_process_exists(current_id): - logging.info("Processo de compra ID " + str(current_id) + ": nothing to download") - skipped_ids += 1 - current_id = current_id + 1 - if download.check_max_skipped_ids(skipped_ids): - break - else: - continue - else: - skipped_ids = 0 - logging.info("Downloading processo de compra ID: " + str(current_id)) - - # fetching html - download.download_html(current_id) - - # fetching relatorio_de_detalhes - download.get_relatorio_detalhes(current_id) - - # download edital files - - download.download_process_files(current_id) - - current_id = current_id + 1 - retries = 0 - - # Check for timeout and retry download - except (TimeoutError, ConnectionError, RemoteDisconnected): - if download.check_max_retries(retries): - break diff --git a/standalone-crawlers/registro-preco/crawler_registro.py b/standalone-crawlers/registro-preco/crawler_registro.py deleted file mode 100644 index 8aa35747..00000000 --- a/standalone-crawlers/registro-preco/crawler_registro.py +++ /dev/null @@ -1,94 +0,0 @@ -import os -from datetime import datetime -from tqdm import tqdm -from selenium import webdriver -from selenium.webdriver.common.keys import Keys - - -driver = webdriver.Firefox() -driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/filtrarRPs") -assert "SIRP" in driver.title -elem = driver.find_element_by_class_name("botao") -elem.click() -elem = driver.find_element_by_class_name("paginacaoTexto") -numeroRegistros=elem.text -numeroRegistros, lixo = numeroRegistros.split(' reg', 1) -lixo, numeroRegistros = numeroRegistros.split('al: ',1) -numeroRegistros = numeroRegistros.replace(',','') -numeroRegistros = int(numeroRegistros) - -print(elem.text) -print('numero: ', numeroRegistros) -pbar = tqdm(total=numeroRegistros) - - -registrosSalvos = 1 -registrosIteracao = 1 - -dataConsulta = datetime.today().strftime('%Y_%m_%d') - -if not os.path.exists(dataConsulta): - os.makedirs(dataConsulta) - -while(registrosSalvos<=numeroRegistros): - - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/selecionarRP?metodo=selecionarPub&id="+str(registrosIteracao)) - - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - pasta = os.path.join(dataConsulta, str(registrosIteracao)) - if not os.path.exists(pasta): - os.makedirs(pasta) - - # REGISTRO DE PREÇO - with open(os.path.join(pasta,'registro_de_preco_'+str(registrosIteracao)+'.html'), 'w') as f: - f.write(driver.page_source) - - # ITENS DO PLANEJAMENTO - urlBaseItens = "https://www.registrodeprecos.mg.gov.br/aasi/do/buscarPlanCons?metodo=" - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/buscarPlanCons?metodo=buscarPlanAdesao&idRP="+str(registrosIteracao)) - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - with 
open(os.path.join(pasta,'itens_planejamento_'+str(registrosIteracao)+'_1.html'), 'w') as f: - f.write(driver.page_source) - temProxima = True - while(temProxima): - try: - proximaPagina = driver.find_element_by_link_text('Próximo') - proximaPagina = proximaPagina.get_attribute('href') - driver.get(proximaPagina) - with open(os.path.join(pasta,'itens_planejamento_'+str(registrosIteracao)+"_"+str()+'.html'), 'w') as f: - f.write(driver.page_source) - except: - temProxima = False - pass - - # ITENS PRECOS REGISTROS - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/consultaInfoAtaRP?metodo=exibirAnexoAta&anexo=I&idRP="+str(registrosIteracao)) - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - with open(os.path.join(pasta,'itens_precos_registros_'+str(registrosIteracao)+'.html'), 'w') as f: - f.write(driver.page_source) - - # ORGAOS PARTICIPANTES - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/consultaInfoAtaRP?metodo=exibirAnexoAta&anexo=II&idRP="+str(registrosIteracao)) - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - with open(os.path.join(pasta,'orgaos_participantes_'+str(registrosIteracao)+'.html'), 'w') as f: - f.write(driver.page_source) - - # FORNECEDORES PARTICIPANTES - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/buscarFornecedores?metodo=buscarFornecedores&operacao=FORN_PARTICIPANTES&idRP="+str(registrosIteracao)) - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - with open(os.path.join(pasta,'fornecedores_participantes_'+str(registrosIteracao)+'.html'), 'w') as f: - f.write(driver.page_source) - - # DOCUMENTOS - driver.get("https://www.registrodeprecos.mg.gov.br/aasi/do/consultarArquivos?metodo=buscarArquivosRP&idRP="+str(registrosIteracao)) - if 'operação não pôde ser completada devido aos erros' not in driver.page_source: - with open(os.path.join(pasta,'documentos_'+str(registrosIteracao)+'.html'), 'w') as f: - f.write(driver.page_source) - - registrosSalvos+=1 - pbar.update(1) - registrosIteracao+=1 - -pbar.close() -driver.close() -driver.quit() \ No newline at end of file diff --git a/standalone-crawlers/remuneracao-servidores/README.md b/standalone-crawlers/remuneracao-servidores/README.md deleted file mode 100644 index 95319ca1..00000000 --- a/standalone-crawlers/remuneracao-servidores/README.md +++ /dev/null @@ -1,9 +0,0 @@ -

-Para baixar os arquivos de um periodo especifico:
-
-$ python remuneracao.py INICIO FINAL
-
-
-Em que INICIO e FINAL devem estar no formato MMAA.
-Nota-se que há somente dados disponiveis para periodos posteriores a janeiro de 2019 (0119).
-
-Exemplo de uso para recuperar dados do ano de 2019:
-$ python remuneracao.py 0119 1219 diff --git a/standalone-crawlers/remuneracao-servidores/remuneracao.py b/standalone-crawlers/remuneracao-servidores/remuneracao.py deleted file mode 100644 index fc750920..00000000 --- a/standalone-crawlers/remuneracao-servidores/remuneracao.py +++ /dev/null @@ -1,45 +0,0 @@ -import requests -import argparse -import numpy -import time -import os - -parser = argparse.ArgumentParser(description='Baixa os arquivos csv referentes a remuneracao dos servidores de Minas Gerais.') -parser.add_argument("inicio", metavar='Inicio', nargs='+', type=str, help='Mes de inicio da coleta') -parser.add_argument("final", metavar='Final', nargs='+', type=str, help='Mes final da coleta') - -args = vars(parser.parse_args()) -inicio_mes = args["inicio"][0][0:2] -inicio_ano = args["inicio"][0][2:4] -final_mes = args["final"][0][0:2] -final_ano = args["final"][0][2:4] - -if (int(final_ano) > int(time.strftime("%Y")[2:4]) or int(inicio_ano) < 19 or int(final_ano) < int(inicio_ano) or (int(final_ano ) == int(inicio_ano) and int(final_mes) < int(inicio_mes)) - or int(inicio_mes) > 12 or int(inicio_mes)<1 or int(final_mes) > 12 or int(final_mes)<1 or (int(final_ano) == int(time.strftime("%Y")[2:4]) and int(final_mes) > int(time.strftime("%m")) ) ): - raise Exception("Valor dos anos incorreto, tente novamente.") - -print("Baixando dados de", inicio_mes, "de", inicio_ano,"a", final_mes,"de",final_ano) - -if not os.path.exists("Data"): - os.makedirs("Data") - -for ano in range( int(inicio_ano), int(final_ano)+1, 1): - inicio = 1 - if int(inicio_ano) == ano: - inicio = int(inicio_mes) - final = 12 - if int(final_ano) == ano: - final = int(final_mes) - for mes in range(inicio, final+1,1): - url = None - if mes < 10: - url = "http://200.198.22.105/fgs-adm/remuneracao/downloadRemuneracao.php?mes=0" + str(mes) + str(ano) - r = requests.get(url) - with open("Data/" + '0' + str(mes) + str(ano)+ ".csv", 'wb') as f: - f.write(r.content) - else: - url = "http://200.198.22.105/fgs-adm/remuneracao/downloadRemuneracao.php?mes=" + str(mes) + str(ano) - r = requests.get(url) - with open("Data/" + str(mes) + str(ano)+ ".csv", 'wb') as f: - f.write(r.content) - print(url) diff --git a/standalone-crawlers/transparecia/README.md b/standalone-crawlers/transparecia/README.md deleted file mode 100644 index 151d47d2..00000000 --- a/standalone-crawlers/transparecia/README.md +++ /dev/null @@ -1,7 +0,0 @@ -
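The remuneracao.py deleted just above duplicates its whole download-and-save block only to get a leading zero on months below 10. The same month-by-month loop can be written once with zero-padded formatting; this is a sketch under those assumptions, keeping the original URL pattern and Data/ output directory, with baixar_periodo and its parameters being illustrative names rather than code from the repo.

    # Hedged sketch, not the original script: download one CSV per month in the
    # requested range, naming each file MMAA.csv as the README above describes.
    import os
    import requests

    URL_BASE = "http://200.198.22.105/fgs-adm/remuneracao/downloadRemuneracao.php?mes="

    def baixar_periodo(inicio_mes, inicio_ano, final_mes, final_ano, pasta="Data"):
        os.makedirs(pasta, exist_ok=True)
        for ano in range(inicio_ano, final_ano + 1):      # two-digit years, e.g. 19
            primeiro = inicio_mes if ano == inicio_ano else 1
            ultimo = final_mes if ano == final_ano else 12
            for mes in range(primeiro, ultimo + 1):
                mmaa = "%02d%02d" % (mes, ano)             # e.g. 0119
                resposta = requests.get(URL_BASE + mmaa)
                with open(os.path.join(pasta, mmaa + ".csv"), "wb") as f:
                    f.write(resposta.content)

    # usage, matching the README example for 2019:
    # baixar_periodo(1, 19, 12, 19)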

-O codigo pode ser usado para baixar arquivos do portal http://www.transparencia.dadosabertos.mg.gov.br/, que estao na forma dos links a seguir:
-- http://www.transparencia.dadosabertos.mg.gov.br/dataset/remuneracao-dos-servidores
-- http://www.transparencia.dadosabertos.mg.gov.br/dataset/despesas-com-diarias-2015-2016
-- http://www.transparencia.dadosabertos.mg.gov.br/dataset/despesas-com-pessoal-2015-2016
-
-Para rodar o codigo:
-python transparencia.py http://www.transparencia.dadosabertos.mg.gov.br/dataset/despesas-com-diarias-2015-2016 diff --git a/standalone-crawlers/transparecia/transparencia.py b/standalone-crawlers/transparecia/transparencia.py deleted file mode 100644 index 8852343e..00000000 --- a/standalone-crawlers/transparecia/transparencia.py +++ /dev/null @@ -1,44 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.support.ui import Select -import time -import argparse -import selenium -import os -import urllib.request - -# Abre o chromedriver com a url dada e retorna o driver -def carrega_pagina(url, path = '/mnt/ssd0/luiznery/Documentos/mpmg/remuneracao'): - path += "/chromedriver" - options = webdriver.ChromeOptions() - options.add_argument("--start-maximized") - options.add_experimental_option("excludeSwitches", ["enable-automation"]) - options.add_experimental_option('useAutomationExtension', False) - driver = webdriver.Chrome(path, options=options) - driver.get(url) - print("Pagina carregada") - return driver - -#Define argumentos - URL dos dados a serem coletados -parser = argparse.ArgumentParser(description='Baixa arquivos do portal transparencia.') -parser.add_argument("url", help="Url da qual se deseja baixar os arquivos.") -# parser.add_argument("path", help="Url da qual se deseja baixar os arquivos.") - -args = vars(parser.parse_args()) -url = args["url"] -# path = args["path"] -path = url.split("/")[-1:][0] - -driver = carrega_pagina(url) - -#cria diretorio -if not os.path.exists(path): - os.makedirs(path) - -print("Baixando arquivos:") -skillsSection = driver.find_element_by_xpath("""//*[@id="dataset-resources"]/ul""") -for child in skillsSection.find_elements_by_xpath(".//li/div/ul/li[2]/a"): - link = child.get_attribute("href") - print(link.split("/")[-1:][0]) - urllib.request.urlretrieve(link, path + "/" + link.split("/")[-1:][0]) - -driver.close() diff --git a/standalone-crawlers/transparencia-diarias/crawler.py b/standalone-crawlers/transparencia-diarias/crawler.py deleted file mode 100644 index 87a74951..00000000 --- a/standalone-crawlers/transparencia-diarias/crawler.py +++ /dev/null @@ -1,30 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.support.ui import Select -import time - -chrome_options = webdriver.ChromeOptions() - -# define o diretório no qual os arquivos baixados serão armazenados -prefs = {'download.default_directory' : '/home/lorena/Desktop/Lorena/Programming/C04/transparencia-diarias/csvs'} - -chrome_options.add_experimental_option('prefs', prefs) -driver = webdriver.Chrome(chrome_options=chrome_options) - -driver.get('http://transparencia.mg.gov.br/estado-pessoal/diarias') -time.sleep(2) - -year_button = '//*[@id="jform_ano"]' -submit_button = '//*[@id="estado_despesadiarias-form"]/div[7]/div/button' -show_all = '//*[@id="DataTables_Table_0_wrapper"]/div[1]/ul/li[5]/a' -csv_button = '//*[@id="DataTables_Table_0_wrapper"]/div[1]/a[2]' - -# relativos aos anos 2002 a 2020 -for i in range(1, 20): - driver.find_element_by_xpath(year_button).click() - year = '//*[@id="jform_ano"]/option[' + str(i) + ']' - driver.find_element_by_xpath(year).click() - - driver.find_element_by_xpath(submit_button).click() - time.sleep(2) - driver.find_element_by_xpath(show_all).click() - driver.find_element_by_xpath(csv_button).click() \ No newline at end of file diff --git "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (1).csv" "b/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (1).csv" deleted file mode 100644 index 
0181c062..00000000 --- "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (1).csv" +++ /dev/null @@ -1,72 +0,0 @@ -"Órgão ";"Valor Empenhado ";"Valor Liquidado ";"Valor Pago " -"ADVOCACIA-GERAL DO ESTADO";"22.165,66";"22.165,66";"22.673,66" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE";"3.303,48";"3.303,48";"3.303,48" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DO VALE DO ACO";"21.788,05";"21.788,05";"22.062,60" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINA";"69.140,25";"69.140,25";"74.800,45" -"ASSEMBLEIA LEGISLATIVA DO ESTADO DE MINAS GERAIS";"5.492.904,97";"5.492.904,97";"4.122.658,06" -"CONTROLADORIA-GERAL DO ESTADO";"51.416,00";"50.216,00";"51.070,20" -"CORPO DE BOMBEIROS MILITAR DO ESTADO DE MINAS GERAIS";"2.193.121,62";"2.193.121,62";"2.192.455,12" -"DEFENSORIA PUBLICA DO ESTADO DE MINAS GERAIS";"365.649,77";"365.649,77";"377.824,77" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS";"331.649,59";"331.649,59";"338.532,09" -"DEPARTAMENTO DE TRANSITO DE MINAS GERAIS";"395.280,66";"395.280,66";"395.189,45" -"EMPRESA DE ASSISTENCIA TECNICA E EXTENSAO RURAL DO ESTADO DE MINAS GERAIS";"5.202.936,18";"5.202.936,18";"5.202.936,18" -"EMPRESA DE PESQUISA AGROPECUARIA DE MINAS GERAIS";"352.874,13";"352.874,13";"352.874,13" -"EMPRESA MINEIRA DE COMUNICACAO";"1.048,50";"1.048,50";"1.048,50" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS";"31.190,25";"31.122,45";"31.122,45" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS";"140.323,16";"140.323,16";"147.534,66" -"FUNDACAO CLOVIS SALGADO";"7.300,25";"7.300,25";"7.300,25" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS";"144.834,07";"144.834,07";"146.387,84" -"FUNDACAO DE ARTE DE OURO PRETO";"7.884,91";"7.884,91";"7.884,91" -"FUNDACAO DE EDUCACAO PARA O TRABALHO DE MINAS GERAIS";"662,50";"662,50";"662,50" -"FUNDACAO EDUCACIONAL CAIO MARTINS";"15.348,75";"14.348,75";"14.348,75" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE";"45.807,80";"45.803,35";"49.049,75" -"FUNDACAO EZEQUIEL DIAS";"243.249,10";"243.249,10";"245.658,35" -"FUNDACAO HELENA ANTIPOFF";"4.735,25";"4.735,25";"4.735,25" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS";"295.134,25";"295.134,25";"301.337,00" -"FUNDACAO JOAO PINHEIRO";"34.161,90";"34.161,90";"34.682,40" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA";"21.754,30";"21.754,30";"21.743,30" -"FUNDO DE RECUPERACAO, PROTECAO E DESENVOLVIMENTO SUSTENTAVEL DAS BACIAS HIDROGRAFICAS DO ESTADO DE M";"127.700,34";"127.700,34";"128.819,74" -"FUNDO ESPECIAL DO MINISTERIO PUBLICO DO ESTADO DE MINAS GERAIS";"185.791,00";"185.791,00";"188.609,00" -"FUNDO ESPECIAL DO PODER JUDICIARIO DO ESTADO DE MINAS GERAIS";"8.484.406,57";"8.484.190,57";"8.643.981,64" -"FUNDO ESTADUAL DE ASSISTENCIA SOCIAL";"199.355,95";"199.355,95";"201.894,25" -"FUNDO ESTADUAL DE CULTURA";"4.048,10";"4.048,10";"4.048,10" -"FUNDO ESTADUAL DE DESENVOLVIMENTO DE TRANSPORTES";"112.942,95";"112.942,95";"114.130,95" -"FUNDO ESTADUAL DE PROTECAO E DEFESA DO CONSUMIDOR";"368.052,16";"368.052,16";"375.339,62" -"FUNDO ESTADUAL DE SAUDE";"6.681.296,17";"6.540.565,82";"6.648.624,97" -"GABINETE MILITAR DO GOVERNADOR DO ESTADO DE MINAS GERAIS";"1.523.078,11";"1.523.078,11";"1.535.547,25" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS";"34.727,55";"34.727,55";"36.439,00" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS";"466.353,79";"466.353,79";"520.834,44" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO 
ESTADO DE MINAS GERAIS";"783,05";"783,05";"783,05" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS";"84.043,50";"84.043,50";"89.119,75" -"INSTITUTO ESTADUAL DE FLORESTAS";"588.028,61";"581.337,11";"588.171,26" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS";"39.899,50";"39.899,50";"41.221,50" -"INSTITUTO MINEIRO DE AGROPECUARIA";"772.735,40";"772.735,40";"800.589,85" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS";"256.495,05";"251.412,22";"259.827,22" -"JUNTA COMERCIAL DO ESTADO DE MINAS GERAIS";"2.245,30";"2.245,30";"2.807,40" -"LOTERIA DO ESTADO DE MINAS GERAIS";"3.400,00";"3.400,00";"3.400,00" -"OUVIDORIA-GERAL DO ESTADO DE MINAS GERAIS";"10.867,22";"10.867,22";"11.972,32" -"POLICIA CIVIL DO ESTADO DE MINAS GERAIS";"3.404.597,63";"3.404.597,63";"3.374.068,72" -"POLICIA MILITAR DO ESTADO DE MINAS GERAIS";"8.839.338,12";"8.839.338,12";"8.880.737,26" -"PROCURADORIA GERAL DE JUSTICA";"6.834.724,26";"6.834.724,26";"6.499.511,06" -"SECRETARIA DE ESTADO DE INFRAESTRUTURA E MOBILIDADE";"14.042,85";"14.042,85";"14.115,05" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO SOCIAL";"176.927,55";"176.927,55";"183.641,58" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO";"282.079,63";"282.079,63";"292.517,83" -"SECRETARIA DE ESTADO DE CULTURA E TURISMO";"94.383,05";"94.383,05";"95.404,55" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO AGRARIO";"18.400,91";"18.400,91";"20.631,01" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO E INTEGRACAO DO NORTE E NORDESTE DE MINAS GERAIS";"18.142,00";"18.142,00";"18.142,00" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO";"133.819,31";"133.819,31";"142.511,70" -"SECRETARIA DE ESTADO DE DIREITOS HUMANOS, PARTICIPACAO SOCIAL E CIDADANIA";"41.089,60";"41.089,60";"42.755,70" -"SECRETARIA DE ESTADO DE EDUCACAO";"3.094.398,34";"3.094.398,34";"3.182.515,51" -"SECRETARIA DE ESTADO DE ESPORTES";"16.561,85";"16.561,85";"16.767,85" -"SECRETARIA DE ESTADO DE FAZENDA";"315.397,38";"315.397,38";"315.699,33" -"SECRETARIA DE ESTADO DE GOVERNO";"112.135,80";"112.135,80";"122.490,15" -"SECRETARIA DE ESTADO DE JUSTICA E SEGURANCA PUBLICA";"3.787.061,25";"3.787.061,25";"3.974.498,00" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL";"749.005,63";"749.005,63";"779.372,03" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO";"59.768,60";"59.768,60";"59.862,20" -"SECRETARIA DE ESTADO DE SEGURANCA PUBLICA";"129.434,65";"129.434,65";"132.117,20" -"SECRETARIA DE ESTADO DE TURISMO";"8.645,40";"8.645,40";"8.671,90" -"SECRETARIA-GERAL";"167.123,66";"167.123,66";"177.053,26" -"TRIBUNAL DE CONTAS DO ESTADO DE MINAS GERAIS";"1.674.882,55";"1.674.882,55";"1.686.946,04" -"TRIBUNAL DE JUSTICA MILITAR DO ESTADO DE MINAS GERAIS";"317.635,38";"317.635,38";"324.547,59" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS";"319.972,06";"319.972,06";"332.655,46" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS";"163.390,24";"163.390,24";"163.606,34" \ No newline at end of file diff --git "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (2).csv" "b/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (2).csv" deleted file mode 100644 index 04fc34b8..00000000 --- "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias (2).csv" +++ /dev/null @@ -1,73 +0,0 @@ -"Órgão ";"Valor Empenhado ";"Valor Liquidado ";"Valor Pago " -"ADVOCACIA-GERAL DO ESTADO";"23.202,20";"23.202,20";"23.202,20" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE";"3.765,30";"3.765,30";"3.862,80" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO 
METROPOLITANA DO VALE DO ACO";"11.221,15";"11.221,15";"11.221,15" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINA";"48.602,60";"48.602,60";"54.809,80" -"ASSEMBLEIA LEGISLATIVA DO ESTADO DE MINAS GERAIS";"4.934.550,49";"4.934.550,49";"3.650.454,11" -"CONTROLADORIA-GERAL DO ESTADO";"17.678,40";"17.678,40";"17.678,40" -"CORPO DE BOMBEIROS MILITAR DO ESTADO DE MINAS GERAIS";"2.418.122,62";"2.417.675,62";"2.427.453,79" -"DEFENSORIA PUBLICA DO ESTADO DE MINAS GERAIS";"162.477,31";"162.477,31";"171.407,75" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS";"241.216,09";"241.216,09";"246.598,75" -"DEPARTAMENTO DE TRANSITO DE MINAS GERAIS";"312.903,83";"312.903,83";"312.576,69" -"DEPARTAMENTO ESTADUAL DE TELECOMUNICACOES DE MINAS GERAIS";"8.996,15";"8.996,15";"8.996,15" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS";"26.948,65";"25.221,65";"25.521,65" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS";"424.844,67";"424.669,02";"446.814,65" -"FUNDACAO CLOVIS SALGADO";"1.253,10";"1.253,10";"1.253,10" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS";"311.929,13";"311.929,13";"311.929,13" -"FUNDACAO DE ARTE DE OURO PRETO";"9.552,05";"9.552,05";"9.325,40" -"FUNDACAO DE EDUCACAO PARA O TRABALHO DE MINAS GERAIS";"1.093,65";"1.093,65";"1.093,65" -"FUNDACAO EDUCACIONAL CAIO MARTINS";"58.768,75";"58.768,75";"58.356,75" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE";"53.586,36";"53.586,36";"58.350,36" -"FUNDACAO EZEQUIEL DIAS";"254.712,58";"254.712,58";"255.773,30" -"FUNDACAO HELENA ANTIPOFF";"0,00";"0,00";"0,00" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS";"253.341,93";"253.341,93";"259.405,08" -"FUNDACAO JOAO PINHEIRO";"36.486,60";"36.486,60";"36.904,10" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA";"57.651,10";"52.637,50";"43.735,80" -"FUNDO DE RECUPERACAO, PROTECAO E DESENVOLVIMENTO SUSTENTAVEL DAS BACIAS HIDROGRAFICAS DO ESTADO DE M";"139.582,98";"139.582,98";"139.582,98" -"FUNDO ESPECIAL DO MINISTERIO PUBLICO DO ESTADO DE MINAS GERAIS";"204.040,16";"204.040,16";"211.598,88" -"FUNDO ESPECIAL DO PODER JUDICIARIO DO ESTADO DE MINAS GERAIS";"8.979.214,29";"8.970.465,69";"9.221.797,77" -"FUNDO ESTADUAL DE ASSISTENCIA SOCIAL";"345.802,93";"335.384,64";"345.688,87" -"FUNDO ESTADUAL DE DEFESA DE DIREITOS DIFUSOS";"2.368,40";"2.368,40";"2.368,40" -"FUNDO ESTADUAL DE DESENVOLVIMENTO DE TRANSPORTES";"84.993,98";"84.993,98";"85.287,98" -"FUNDO ESTADUAL DE PROTECAO E DEFESA DO CONSUMIDOR";"311.551,04";"311.551,04";"325.246,40" -"FUNDO ESTADUAL DE SAUDE";"8.181.304,50";"8.176.304,50";"8.327.630,90" -"GABINETE MILITAR DO GOVERNADOR DO ESTADO DE MINAS GERAIS";"1.269.020,96";"1.269.020,96";"1.322.764,14" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS";"106.102,90";"106.102,90";"109.467,50" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS";"926.267,85";"926.267,85";"1.047.250,95" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO ESTADO DE MINAS GERAIS";"17.751,53";"17.751,53";"19.692,38" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS";"86.954,70";"86.954,70";"86.954,70" -"INSTITUTO ESTADUAL DE FLORESTAS";"937.579,33";"929.198,13";"856.079,27" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS";"49.555,05";"49.555,05";"52.329,05" -"INSTITUTO MINEIRO DE AGROPECUARIA";"698.639,44";"698.639,44";"737.979,09" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS";"370.322,79";"370.322,79";"355.650,09" -"JUNTA COMERCIAL DO ESTADO DE MINAS 
GERAIS";"11.408,80";"11.408,80";"11.311,30" -"LOTERIA DO ESTADO DE MINAS GERAIS";"1.350,40";"1.350,40";"2.103,10" -"OUVIDORIA-GERAL DO ESTADO DE MINAS GERAIS";"32.031,84";"32.031,84";"33.857,84" -"POLICIA CIVIL DO ESTADO DE MINAS GERAIS";"3.364.731,26";"3.364.731,26";"3.269.515,59" -"POLICIA MILITAR DO ESTADO DE MINAS GERAIS";"10.392.017,32";"10.391.700,34";"10.275.528,73" -"PROCURADORIA GERAL DE JUSTICA";"5.558.938,31";"5.558.938,31";"5.353.544,23" -"SECRETARIA DE ESTADO DE TRANSPORTES E OBRAS PUBLICAS";"96.050,55";"96.050,55";"97.216,15" -"SECRETARIA DE ESTADO DE ADMINISTRACAO PRISIONAL";"2.912.612,00";"2.912.612,00";"3.106.586,70" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO";"234.544,69";"234.544,69";"244.656,74" -"SECRETARIA DE ESTADO DE CASA CIVIL E DE RELACOES INSTITUCIONAIS";"3.407,50";"3.407,50";"3.923,70" -"SECRETARIA DE ESTADO DE CIDADES E DE INTEGRACAO REGIONAL";"43.825,16";"43.815,16";"44.350,16" -"SECRETARIA DE ESTADO DE CULTURA";"31.130,50";"31.130,50";"31.130,50" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO AGRARIO";"194.538,36";"188.556,39";"188.320,16" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO E INTEGRACAO DO NORTE E NORDESTE DE MINAS GERAIS";"62.168,85";"62.168,85";"63.551,65" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO, CIENCIA, TECNOLOGIA E ENSINO SUPERIOR";"10.213,77";"10.213,77";"10.213,77" -"SECRETARIA DE ESTADO DE DIREITOS HUMANOS, PARTICIPACAO SOCIAL E CIDADANIA";"132.017,42";"132.017,42";"136.087,62" -"SECRETARIA DE ESTADO DE EDUCACAO";"4.080.338,85";"4.080.338,85";"4.155.689,10" -"SECRETARIA DE ESTADO DE ESPORTES";"80.474,60";"75.400,60";"75.844,10" -"SECRETARIA DE ESTADO DE FAZENDA";"262.587,27";"261.495,27";"261.479,67" -"SECRETARIA DE ESTADO DE GOVERNO";"249.419,25";"249.419,25";"309.505,35" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL";"1.050.818,05";"1.050.818,05";"1.107.064,11" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO";"145.710,40";"145.710,40";"148.562,65" -"SECRETARIA DE ESTADO DE SEGURANCA PUBLICA";"299.860,95";"299.860,95";"308.904,10" -"SECRETARIA DE ESTADO DE TRABALHO E DESENVOLVIMENTO SOCIAL";"32.034,50";"29.722,70";"33.756,30" -"SECRETARIA DE ESTADO DE TURISMO";"127.951,07";"127.951,07";"129.576,57" -"SECRETARIA DE ESTADO EXTRAORDINARIA DE DESENVOLVIMENTO INTEGRADO E FORUNS REGIONAIS";"128.171,90";"128.171,90";"135.307,60" -"SECRETARIA-GERAL";"9.213,00";"9.213,00";"9.213,00" -"TRIBUNAL DE CONTAS DO ESTADO DE MINAS GERAIS";"878.673,09";"878.673,09";"903.186,95" -"TRIBUNAL DE JUSTICA MILITAR DO ESTADO DE MINAS GERAIS";"135.341,63";"135.341,63";"136.944,91" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS";"224.322,86";"224.322,86";"235.418,90" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS";"76.527,88";"76.527,88";"76.782,92" \ No newline at end of file diff --git "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias.csv" "b/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias.csv" deleted file mode 100644 index d7f3da61..00000000 --- "a/standalone-crawlers/transparencia-diarias/csvs/Di\303\241rias.csv" +++ /dev/null @@ -1,55 +0,0 @@ -"Órgão ";"Valor Empenhado ";"Valor Liquidado ";"Valor Pago " -"ADVOCACIA GERAL DO ESTADO";"2.380,00";"462,00";"462,00" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE";"1.500,00";"0,00";"0,00" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DO VALE DO ACO";"0,00";"321,55";"321,55" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINA";"0,00";"1.366,50";"5.486,35" -"ASSEMBLEIA 
LEGISLATIVA DO ESTADO DE MINAS GERAIS";"109.088,26";"109.088,26";"269.949,58" -"CONTROLADORIA-GERAL DO ESTADO";"500,00";"2.559,60";"3.413,80" -"CORPO DE BOMBEIROS MILITAR DO ESTADO DE MINAS GERAIS";"91.657,84";"83.568,47";"35.607,59" -"DEFENSORIA PUBLICA DO ESTADO DE MINAS GERAIS";"12.188,90";"11.266,75";"25.546,85" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS";"26.600,00";"13.783,50";"15.172,50" -"EMPRESA DE ASSISTENCIA TECNICA E EXTENSAO RURAL DO ESTADO DE MINAS GERAIS";"160.435,50";"160.435,50";"160.435,50" -"EMPRESA DE PESQUISA AGROPECUARIA DE MINAS GERAIS";"4.914,75";"4.914,75";"4.914,75" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS";"1.200,00";"0,00";"687,00" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS";"6.342,00";"2.874,50";"6.919,90" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS";"10.000,00";"2.516,47";"2.414,95" -"FUNDACAO DE ARTE DE OURO PRETO";"0,00";"503,65";"415,55" -"FUNDACAO EDUCACIONAL CAIO MARTINS";"7.900,00";"816,60";"816,60" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE";"23.850,00";"3.549,00";"5.518,50" -"FUNDACAO EZEQUIEL DIAS";"2.954,55";"2.988,10";"5.533,60" -"FUNDACAO HELENA ANTIPOFF";"10.000,00";"1.292,00";"1.292,00" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS";"3.474,20";"2.368,40";"3.310,25" -"FUNDACAO JOAO PINHEIRO";"10.325,00";"3.160,44";"1.785,74" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA";"1.750,00";"108,50";"1.986,00" -"FUNDO DE RECUPERACAO, PROTECAO E DESENVOLVIMENTO SUSTENTAVEL DAS BACIAS HIDROGRAFICAS DO ESTADO DE M";"19.000,00";"0,00";"0,00" -"FUNDO ESPECIAL DO PODER JUDICIARIO DO ESTADO DE MINAS GERAIS";"163.447,00";"142.956,50";"277.419,50" -"FUNDO ESTADUAL DE ASSISTENCIA SOCIAL";"2.000,00";"211,50";"211,50" -"FUNDO ESTADUAL DE PROTECAO E DEFESA DO CONSUMIDOR";"495,00";"495,00";"16.883,00" -"FUNDO ESTADUAL DE SAUDE";"655.374,50";"96.336,18";"145.630,54" -"GABINETE MILITAR DO GOVERNADOR DO ESTADO DE MINAS GERAIS";"24.034,53";"4.579,60";"10.231,34" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS";"5.000,00";"1.332,05";"4.556,95" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS";"0,00";"19.191,15";"28.207,65" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO ESTADO DE MINAS GERAIS";"10.000,00";"0,00";"0,00" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS";"0,00";"1.818,40";"2.395,15" -"INSTITUTO ESTADUAL DE FLORESTAS";"4.829,90";"2.131,90";"1.377,25" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS";"0,00";"553,50";"776,00" -"INSTITUTO MINEIRO DE AGROPECUARIA";"5.579,66";"5.694,90";"5.311,35" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS";"70.957,87";"23.164,95";"23.164,95" -"POLICIA MILITAR DO ESTADO DE MINAS GERAIS";"133.568,13";"136.291,38";"74.605,60" -"PROCURADORIA GERAL DE JUSTICA";"201.138,75";"197.079,00";"138.107,00" -"SECRETARIA DE ESTADO DE INFRAESTRUTURA E MOBILIDADE";"1.000,00";"1.456,05";"1.887,35" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO SOCIAL";"23.140,25";"35.477,40";"71.305,45" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO";"50.388,00";"0,00";"0,00" -"SECRETARIA DE ESTADO DE CULTURA E TURISMO";"1.987,00";"653,55";"653,55" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO";"24.000,00";"7.656,40";"15.914,05" -"SECRETARIA DE ESTADO DE EDUCACAO";"1.188.345,30";"167.184,80";"116.833,85" -"SECRETARIA DE ESTADO DE FAZENDA";"6.492,00";"8.225,98";"0,00" -"SECRETARIA DE ESTADO DE GOVERNO";"0,00";"2.832,20";"2.832,20" -"SECRETARIA DE ESTADO DE JUSTICA E SEGURANCA 
PUBLICA";"178.508,60";"133.634,70";"355.319,40" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL";"37.179,70";"21.612,80";"29.451,25" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO";"2.724,00";"2.665,65";"2.753,75" -"SECRETARIA-GERAL";"20.442,50";"1.446,00";"4.003,50" -"TRIBUNAL DE CONTAS DO ESTADO DE MINAS GERAIS";"66.491,13";"66.491,13";"102.522,09" -"TRIBUNAL DE JUSTICA MILITAR DO ESTADO DE MINAS GERAIS";"10.791,18";"10.791,18";"22.945,55" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS";"4.349,60";"620,70";"764,90" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS";"17.387,50";"0,00";"0,00" \ No newline at end of file diff --git a/standalone-crawlers/transparencia-viagens/crawler.py b/standalone-crawlers/transparencia-viagens/crawler.py deleted file mode 100644 index 72e31178..00000000 --- a/standalone-crawlers/transparencia-viagens/crawler.py +++ /dev/null @@ -1,28 +0,0 @@ -from selenium import webdriver -from selenium.webdriver.support.ui import Select -import time - -chrome_options = webdriver.ChromeOptions() -# define o diretório no qual os arquivos baixados serão armazenados -prefs = {'download.default_directory' : '/home/lorena/Desktop/Lorena/Programming/C04/transparencia-viagens/csvs'} -chrome_options.add_experimental_option('prefs', prefs) -driver = webdriver.Chrome(chrome_options=chrome_options) - -driver.get('http://transparencia.mg.gov.br/estado-pessoal/viagens') -time.sleep(2) - -year_button = '//*[@id="jform_ano"]' -submit_button = '//*[@id="estado_viagens-form"]/div[7]/div/button' -show_all = '//*[@id="DataTables_Table_0_wrapper"]/div[1]/ul/li[5]/a' -csv_button = '//*[@id="DataTables_Table_0_wrapper"]/div[1]/a[2]' - -# viagens referentes aos anos 2012 a 2020 -for i in range(1, 9): - driver.find_element_by_xpath(year_button).click() - year = '//*[@id="jform_ano"]/option[' + str(i) + ']' - driver.find_element_by_xpath(year).click() - - driver.find_element_by_xpath(submit_button).click() - time.sleep(2) - driver.find_element_by_xpath(show_all).click() - driver.find_element_by_xpath(csv_button).click() \ No newline at end of file diff --git a/standalone-crawlers/transparencia-viagens/csvs/Viagens (1).csv b/standalone-crawlers/transparencia-viagens/csvs/Viagens (1).csv deleted file mode 100644 index 202506f0..00000000 --- a/standalone-crawlers/transparencia-viagens/csvs/Viagens (1).csv +++ /dev/null @@ -1,55 +0,0 @@ -"Órgão ";"Quantidade De Viagens ";"Quantidade De Diárias ";"Valor Pago Diárias ";"Valor Pago Passagens ";"Valor Total " -"ADVOCACIA GERAL DO ESTADO DE MINAS GERAIS - AGE";"373";"239,15";"53.696,30";"83.292,37";"136.988,67" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE - ARMBH";"10";"11,15";"3.630,20";"7.164,76";"10.794,96" -"AGENCIA DE DESENVOLVIMENTO METROPOLITANO DO VALE DO ACO - ARMVA";"54";"99,90";"27.185,60";"0,00";"27.185,60" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINAS GERAIS - ARSAE";"165";"379,95";"91.140,75";"60.524,96";"151.665,71" -"CONTROLADORIA GERAL DO ESTADO DE MINAS GERAIS - CGE";"84";"249,70";"63.948,20";"57.872,29";"121.820,49" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS - DEER";"2655";"4.091,05";"732.848,22";"5.160,78";"738.009,00" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS - ESP";"62";"136,85";"28.330,85";"22.616,70";"50.947,55" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS - HEMOMINAS";"968";"1.303,70";"208.960,19";"18.231,60";"227.191,79" -"FUNDACAO CLOVIS SALGADO - 
FCS";"9";"23,15";"8.360,25";"7.857,58";"16.217,83" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS - FAPEMIG";"322";"584,60";"162.525,51";"202.937,87";"365.463,38" -"FUNDACAO DE EDUCACAO PARA O TRABALHO DE MINAS GERAIS - UTRAMIG";"4";"4,05";"850,50";"5.851,20";"6.701,70" -"FUNDACAO EDUCACIONAL CAIO MARTINS - FUCAM";"25";"94,75";"17.544,75";"0,00";"17.544,75" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE - FEAM";"220";"450,25";"81.504,55";"18.383,44";"99.887,99" -"FUNDACAO EZEQUIEL DIAS - FUNED";"346";"1.143,60";"201.389,03";"110.421,67";"311.810,70" -"FUNDACAO HELENA ANTIPOFF - FHA";"14";"28,20";"5.675,25";"8.657,60";"14.332,85" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS - FHEMIG";"1213";"1.319,35";"328.505,38";"98.498,78";"427.004,16" -"FUNDACAO JOAO PINHEIRO - FJP";"106";"233,75";"46.912,00";"38.300,59";"85.212,59" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA - REDE MINAS";"68";"137,80";"26.266,32";"0,00";"26.266,32" -"FUNDAÇÃO DE ARTE DE OURO PRETO - FAOP";"17";"32,25";"9.098,34";"0,00";"9.098,34" -"GABINETE MILITAR DO GOVERNADOR - GMG";"199";"247,45";"67.507,30";"32.727,26";"100.234,56" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS - IDENE";"132";"264,20";"50.719,62";"32.528,99";"83.248,61" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS - IPEM";"1533";"5.312,85";"860.614,93";"29.716,12";"890.331,05" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO ESTADO DE MINAS GERAIS - IPSEMG";"4";"6,40";"1.207,05";"0,00";"1.207,05" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS - IPSM";"423";"505,45";"111.171,63";"2.515,21";"113.686,84" -"INSTITUTO ESTADUAL DE FLORESTAS - IEF";"2694";"3.091,35";"563.122,85";"18.238,40";"581.361,25" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS - IEPHA";"260";"400,00";"65.045,00";"6.062,97";"71.107,97" -"INSTITUTO MINEIRO DE AGROPECUARIA - IMA";"3262";"6.565,80";"1.106.933,85";"72.056,65";"1.178.990,50" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS - IGAM";"636";"2.498,10";"454.614,25";"181.759,75";"636.374,00" -"JUNTA COMERCIAL DO ESTADO DE MINAS GERAIS - JUCEMG";"18";"18,95";"6.257,40";"17.675,48";"23.932,88" -"LOTERIA MINEIRA - LEMG";"13";"10,90";"3.367,90";"8.582,26";"11.950,16" -"OUVIDORIA GERAL DO ESTADO DE MINAS GERAIS - OGE";"37";"82,95";"16.797,20";"7.525,76";"24.322,96" -"POLICIA CIVIL DE MINAS GERAIS - PCMG";"14787";"11.947,00";"3.627.005,42";"61.102,85";"3.688.108,27" -"SECRETARIA DE ESTADO DE ADMINISTRAÇÃO PRISIONAL - SEAP";"23666";"21.838,90";"4.061.007,28";"165.294,51";"4.226.301,79" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO - SEAPA";"781";"2.230,95";"375.580,35";"58.984,35";"434.564,70" -"SECRETARIA DE ESTADO DE CIDADES E DE INTEGRACAO REGIONAL - SECIR";"4";"0,00";"0,00";"0,00";"0,00" -"SECRETARIA DE ESTADO DE CULTURA - SEC";"252";"513,30";"116.029,97";"138.996,73";"255.026,70" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO AGRARIO - SEDA";"37";"148,20";"24.600,20";"0,00";"24.600,20" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO E INTEGRACAO DO NORTE E NORDESTE DE MINAS GERAIS - SEDINOR";"61";"188,35";"28.252,50";"0,00";"28.252,50" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO, CIENCIA, TECNOLOGIA E ENSINO SUPERIOR";"284";"592,45";"128.144,13";"113.315,44";"241.459,57" -"SECRETARIA DE ESTADO DE DIREITOS HUMANOS, PARTICIPACAO SOCIAL E CIDADANIA - SEDPAC";"208";"167,95";"35.598,30";"0,00";"35.598,30" -"SECRETARIA DE ESTADO DE EDUCACAO - SEE";"20362";"23.030,10";"3.979.933,37";"114.456,38";"4.094.389,75" -"SECRETARIA DE ESTADO DE ESPORTES - 
SEESP";"36";"118,60";"22.483,87";"0,00";"22.483,87" -"SECRETARIA DE ESTADO DE FAZENDA - SEF";"854";"1.691,55";"395.152,41";"429.685,22";"824.837,63" -"SECRETARIA DE ESTADO DE GOVERNO - SEGOV";"427";"804,05";"152.339,44";"58.717,37";"211.056,81" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL - SEMAD";"3311";"6.608,75";"1.227.165,46";"105.016,85";"1.332.182,31" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO - SEPLAG";"109";"199,20";"53.085,34";"111.494,84";"164.580,18" -"SECRETARIA DE ESTADO DE SAUDE - SES";"22731";"40.635,10";"8.109.399,37";"830.623,53";"8.940.022,90" -"SECRETARIA DE ESTADO DE SEGURANCA PUBLICA - SESP";"696";"755,40";"154.234,07";"28.439,97";"182.674,04" -"SECRETARIA DE ESTADO DE TRABALHO E DESENVOLVIMENTO SOCIAL - SEDESE";"1067";"1.793,15";"389.779,87";"48.914,77";"438.694,64" -"SECRETARIA DE ESTADO DE TRANSPORTES E OBRAS PUBLICAS - SETOP";"57";"72,90";"20.745,15";"29.171,97";"49.917,12" -"SECRETARIA DE ESTADO DE TURISMO - SETUR";"27";"57,45";"11.538,90";"0,00";"11.538,90" -"SECRETARIA GERAL - SG";"436";"839,35";"168.302,72";"147.277,58";"315.580,30" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS - REITORIA";"809";"1.605,50";"373.557,83";"347.037,35";"720.595,18" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS - UNIMONTES";"464";"692,20";"164.704,84";"191.941,46";"356.646,30" \ No newline at end of file diff --git a/standalone-crawlers/transparencia-viagens/csvs/Viagens (2).csv b/standalone-crawlers/transparencia-viagens/csvs/Viagens (2).csv deleted file mode 100644 index a608986f..00000000 --- a/standalone-crawlers/transparencia-viagens/csvs/Viagens (2).csv +++ /dev/null @@ -1,57 +0,0 @@ -"Órgão ";"Quantidade De Viagens ";"Quantidade De Diárias ";"Valor Pago Diárias ";"Valor Pago Passagens ";"Valor Total " -"ADVOCACIA GERAL DO ESTADO DE MINAS GERAIS - AGE";"458";"264,55";"55.425,51";"143.756,62";"199.182,13" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE - ARMBH";"50";"22,50";"4.802,80";"363,20";"5.166,00" -"AGENCIA DE DESENVOLVIMENTO METROPOLITANO DO VALE DO ACO - ARMVA";"33";"45,55";"12.880,15";"0,00";"12.880,15" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINAS GERAIS - ARSAE";"144";"308,70";"63.934,78";"27.185,49";"91.120,27" -"CONTROLADORIA GERAL DO ESTADO DE MINAS GERAIS - CGE";"35";"72,95";"19.965,90";"29.606,20";"49.572,10" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS - DEER";"3913";"3.890,90";"643.740,10";"1.184,55";"644.924,65" -"DEPARTAMENTO ESTADUAL DE TELECOMUNICACOES DO ESTADO DE MINAS GERAIS - DETEL (EXTINTO)";"146";"71,10";"12.978,15";"6.732,66";"19.710,81" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS - ESP";"58";"132,80";"25.356,75";"17.521,05";"42.877,80" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS - HEMOMINAS";"1739";"2.949,10";"493.103,55";"50.651,69";"543.755,24" -"FUNDACAO CLOVIS SALGADO - FCS";"1";"3,35";"1.293,10";"860,22";"2.153,32" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS - FAPEMIG";"542";"1.248,55";"342.190,75";"270.469,50";"612.660,25" -"FUNDACAO DE EDUCACAO PARA O TRABALHO DE MINAS GERAIS - UTRAMIG";"3";"4,05";"1.213,65";"2.920,43";"4.134,08" -"FUNDACAO EDUCACIONAL CAIO MARTINS - FUCAM";"154";"407,20";"66.860,74";"232,57";"67.093,31" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE - FEAM";"157";"392,15";"75.593,64";"48.539,23";"124.132,87" -"FUNDACAO EZEQUIEL DIAS - FUNED";"442";"1.460,00";"260.945,18";"150.610,84";"411.556,02" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS - 
FHEMIG";"1120";"1.158,20";"283.925,66";"40.229,00";"324.154,66" -"FUNDACAO JOAO PINHEIRO - FJP";"85";"272,75";"47.909,10";"14.005,80";"61.914,90" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA - REDE MINAS";"172";"205,00";"56.431,40";"0,00";"56.431,40" -"FUNDAÇÃO DE ARTE DE OURO PRETO - FAOP";"27";"26,65";"9.139,95";"4.696,93";"13.836,88" -"GABINETE MILITAR DO GOVERNADOR - GMG";"181";"159,40";"44.731,55";"30.790,49";"75.522,04" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS - IDENE";"146";"344,20";"67.749,03";"39.595,33";"107.344,36" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS - IPEM";"2584";"9.741,95";"1.572.534,59";"55.042,76";"1.627.577,35" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO ESTADO DE MINAS GERAIS - IPSEMG";"61";"117,35";"24.338,78";"8.851,68";"33.190,46" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS - IPSM";"263";"412,40";"84.681,34";"13.985,73";"98.667,07" -"INSTITUTO ESTADUAL DE FLORESTAS - IEF";"2283";"4.796,95";"844.522,02";"41.238,54";"885.760,56" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS - IEPHA";"268";"429,35";"68.724,05";"13.174,52";"81.898,57" -"INSTITUTO MINEIRO DE AGROPECUARIA - IMA";"3409";"4.490,40";"727.675,32";"65.869,32";"793.544,64" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS - IGAM";"624";"3.006,30";"563.803,81";"290.617,21";"854.421,02" -"JUNTA COMERCIAL DO ESTADO DE MINAS GERAIS - JUCEMG";"50";"75,50";"20.489,50";"43.502,10";"63.991,60" -"LOTERIA MINEIRA - LEMG";"5";"8,75";"2.103,10";"1.559,74";"3.662,84" -"OUVIDORIA GERAL DO ESTADO DE MINAS GERAIS - OGE";"70";"160,45";"36.366,85";"17.584,52";"53.951,37" -"POLICIA CIVIL DE MINAS GERAIS - PCMG";"14203";"12.502,80";"3.712.985,56";"17.120,60";"3.730.106,16" -"SECRETARIA DE ESTADO DE ADMINISTRAÇÃO PRISIONAL - SEAP";"16492";"17.633,90";"3.209.293,27";"72.079,19";"3.281.372,46" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO - SEAPA";"530";"1.281,80";"223.772,33";"25.864,71";"249.637,04" -"SECRETARIA DE ESTADO DE CASA CIVIL E DE RELACOES INSTITUCIONAIS - SECCRI";"14";"15,90";"4.398,70";"5.078,46";"9.477,16" -"SECRETARIA DE ESTADO DE CIDADES E DE INTEGRACAO REGIONAL - SECIR";"113";"186,30";"30.499,23";"59.275,71";"89.774,94" -"SECRETARIA DE ESTADO DE CULTURA - SEC";"116";"136,55";"33.884,10";"14.705,71";"48.589,81" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO AGRARIO - SEDA";"367";"1.086,85";"198.916,41";"28.101,57";"227.017,98" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO E INTEGRACAO DO NORTE E NORDESTE DE MINAS GERAIS - SEDINOR";"108";"363,80";"76.653,06";"13.168,41";"89.821,47" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO, CIENCIA, TECNOLOGIA E ENSINO SUPERIOR";"33";"66,15";"12.243,05";"18.470,50";"30.713,55" -"SECRETARIA DE ESTADO DE DIREITOS HUMANOS, PARTICIPACAO SOCIAL E CIDADANIA - SEDPAC";"645";"593,50";"138.267,15";"41.287,94";"179.555,09" -"SECRETARIA DE ESTADO DE EDUCACAO - SEE";"19469";"24.908,70";"4.643.431,43";"196.207,90";"4.839.639,33" -"SECRETARIA DE ESTADO DE ESPORTES - SEESP";"167";"506,85";"87.897,08";"16.302,27";"104.199,35" -"SECRETARIA DE ESTADO DE FAZENDA - SEF";"749";"1.481,60";"323.346,54";"410.757,27";"734.103,81" -"SECRETARIA DE ESTADO DE GOVERNO - SEGOV";"736";"2.027,60";"348.124,87";"8.597,91";"356.722,78" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL - SEMAD";"3716";"8.431,60";"1.568.727,71";"147.766,64";"1.716.494,35" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO - SEPLAG";"385";"767,85";"167.055,63";"123.511,73";"290.567,36" -"SECRETARIA DE 
ESTADO DE SAUDE - SES";"24330";"41.375,45";"8.551.194,01";"1.990.662,38";"10.541.856,39" -"SECRETARIA DE ESTADO DE SEGURANCA PUBLICA - SESP";"1154";"1.710,60";"338.371,76";"59.170,78";"397.542,54" -"SECRETARIA DE ESTADO DE TRABALHO E DESENVOLVIMENTO SOCIAL - SEDESE";"927";"1.772,10";"396.144,65";"100.881,20";"497.025,85" -"SECRETARIA DE ESTADO DE TRANSPORTES E OBRAS PUBLICAS - SETOP";"158";"583,30";"154.074,36";"0,00";"154.074,36" -"SECRETARIA DE ESTADO DE TURISMO - SETUR";"258";"632,15";"138.694,63";"92.041,33";"230.735,96" -"SECRETARIA DE ESTADO EXTRAORDINARIA DE DESENVOLVIMENTO INTEGRADO E FORUNS REGIONAIS - SEEDIF (EXTINTA)";"310";"678,90";"130.864,86";"51.227,31";"182.092,17" -"SECRETARIA GERAL - SG";"30";"48,00";"10.174,00";"9.402,36";"19.576,36" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS - REITORIA";"602";"1.246,70";"278.691,70";"100.128,63";"378.820,33" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS - UNIMONTES";"369";"512,60";"99.865,15";"104.216,16";"204.081,31" \ No newline at end of file diff --git a/standalone-crawlers/transparencia-viagens/csvs/Viagens (3).csv b/standalone-crawlers/transparencia-viagens/csvs/Viagens (3).csv deleted file mode 100644 index 78eb07ee..00000000 --- a/standalone-crawlers/transparencia-viagens/csvs/Viagens (3).csv +++ /dev/null @@ -1,57 +0,0 @@ -"Órgão ";"Quantidade De Viagens ";"Quantidade De Diárias ";"Valor Pago Diárias ";"Valor Pago Passagens ";"Valor Total " -"ADVOCACIA GERAL DO ESTADO DE MINAS GERAIS - AGE";"427";"255,45";"54.043,52";"70.795,67";"124.839,19" -"AGENCIA DE DESENVOLVIMENTO DA REGIAO METROPOLITANA DE BELO HORIZONTE - ARMBH";"129";"47,45";"7.926,70";"3.587,80";"11.514,50" -"AGENCIA DE DESENVOLVIMENTO METROPOLITANO DO VALE DO ACO - ARMVA";"32";"42,20";"12.921,80";"0,00";"12.921,80" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINAS GERAIS - ARSAE";"172";"433,15";"75.159,80";"23.200,51";"98.360,31" -"CONTROLADORIA GERAL DO ESTADO DE MINAS GERAIS - CGE";"126";"457,85";"85.502,18";"41.048,20";"126.550,38" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS - DEER";"14084";"9.936,60";"1.616.052,63";"3.501,63";"1.619.554,26" -"DEPARTAMENTO ESTADUAL DE TELECOMUNICACOES DO ESTADO DE MINAS GERAIS - DETEL (EXTINTO)";"156";"182,60";"34.041,72";"10.672,18";"44.713,90" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS - ESP";"122";"63,20";"18.472,75";"35.925,99";"54.398,74" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS - HEMOMINAS";"2397";"2.816,30";"467.894,23";"93.916,17";"561.810,40" -"FUNDACAO CLOVIS SALGADO - FCS";"17";"23,95";"6.504,50";"3.075,65";"9.580,15" -"FUNDACAO DE AMPARO A PESQUISA DO ESTADO DE MINAS GERAIS - FAPEMIG";"654";"1.500,10";"412.310,77";"269.471,85";"681.782,62" -"FUNDACAO DE EDUCACAO PARA O TRABALHO DE MINAS GERAIS - UTRAMIG";"9";"2,80";"592,20";"2.576,75";"3.168,95" -"FUNDACAO EDUCACIONAL CAIO MARTINS - FUCAM";"126";"405,75";"63.023,64";"0,00";"63.023,64" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE - FEAM";"245";"592,10";"111.905,70";"93.249,81";"205.155,51" -"FUNDACAO EZEQUIEL DIAS - FUNED";"493";"1.658,25";"313.719,55";"180.383,52";"494.103,07" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS - FHEMIG";"1274";"1.160,80";"282.543,41";"0,00";"282.543,41" -"FUNDACAO JOAO PINHEIRO - FJP";"156";"601,65";"100.785,75";"0,00";"100.785,75" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA - REDE MINAS";"100";"141,00";"32.613,00";"0,00";"32.613,00" -"FUNDAÇÃO DE ARTE DE OURO PRETO - FAOP";"17";"27,95";"9.696,85";"0,00";"9.696,85" -"GABINETE MILITAR DO 
GOVERNADOR - GMG";"215";"338,85";"90.788,25";"57.809,59";"148.597,84" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS - IDENE";"339";"823,30";"156.055,10";"112.680,90";"268.736,00" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS - IPEM";"2913";"10.457,80";"1.704.651,03";"0,00";"1.704.651,03" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES DO ESTADO DE MINAS GERAIS - IPSEMG";"105";"181,75";"37.633,30";"42.279,62";"79.912,92" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS - IPSM";"311";"397,70";"68.248,63";"21.084,03";"89.332,66" -"INSTITUTO ESTADUAL DE FLORESTAS - IEF";"3341";"5.598,55";"1.013.222,39";"85.704,43";"1.098.926,82" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS - IEPHA";"455";"615,45";"97.507,04";"25.488,03";"122.995,07" -"INSTITUTO MINEIRO DE AGROPECUARIA - IMA";"6723";"7.053,45";"1.139.424,47";"87.587,11";"1.227.011,58" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS - IGAM";"779";"3.538,70";"667.299,77";"195.156,36";"862.456,13" -"JUNTA COMERCIAL DO ESTADO DE MINAS GERAIS - JUCEMG";"190";"565,95";"151.384,10";"178.704,93";"330.089,03" -"LOTERIA MINEIRA - LEMG";"13";"18,55";"5.315,25";"6.525,95";"11.841,20" -"OUVIDORIA GERAL DO ESTADO DE MINAS GERAIS - OGE";"161";"322,60";"68.303,27";"35.830,06";"104.133,33" -"POLICIA CIVIL DE MINAS GERAIS - PCMG";"15305";"14.202,65";"4.223.905,65";"0,00";"4.223.905,65" -"SECRETARIA DE ESTADO DE ADMINISTRAÇÃO PRISIONAL - SEAP";"12873";"14.531,05";"2.563.292,91";"66.189,20";"2.629.482,11" -"SECRETARIA DE ESTADO DE AGRICULTURA, PECUARIA E ABASTECIMENTO - SEAPA";"973";"2.679,85";"457.638,94";"0,00";"457.638,94" -"SECRETARIA DE ESTADO DE CASA CIVIL E DE RELACOES INSTITUCIONAIS - SECCRI";"26";"45,10";"8.457,70";"8.182,52";"16.640,22" -"SECRETARIA DE ESTADO DE CIDADES E DE INTEGRACAO REGIONAL - SECIR";"305";"759,85";"120.501,15";"71.778,09";"192.279,24" -"SECRETARIA DE ESTADO DE CULTURA - SEC";"235";"334,50";"68.132,90";"44.324,27";"112.457,17" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO AGRARIO - SEDA";"769";"2.529,70";"458.720,60";"66.902,58";"525.623,18" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO E INTEGRACAO DO NORTE E NORDESTE DE MINAS GERAIS - SEDINOR";"36";"115,55";"18.083,76";"7.201,35";"25.285,11" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO, CIENCIA, TECNOLOGIA E ENSINO SUPERIOR";"168";"166,10";"30.630,57";"44.170,03";"74.800,60" -"SECRETARIA DE ESTADO DE DIREITOS HUMANOS, PARTICIPACAO SOCIAL E CIDADANIA - SEDPAC";"927";"1.346,85";"272.806,89";"119.372,38";"392.179,27" -"SECRETARIA DE ESTADO DE EDUCACAO - SEE";"37433";"43.905,40";"7.738.551,10";"249.958,05";"7.988.509,15" -"SECRETARIA DE ESTADO DE ESPORTES - SEESP";"252";"650,20";"101.996,07";"20.894,47";"122.890,54" -"SECRETARIA DE ESTADO DE FAZENDA - SEF";"927";"1.961,15";"438.256,41";"323.869,29";"762.125,70" -"SECRETARIA DE ESTADO DE GOVERNO - SEGOV";"1550";"4.099,50";"690.108,81";"105.390,01";"795.498,82" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL - SEMAD";"7096";"16.131,05";"2.948.068,10";"176.854,00";"3.124.922,10" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO - SEPLAG";"679";"1.426,45";"286.494,59";"189.353,60";"475.848,19" -"SECRETARIA DE ESTADO DE SAUDE - SES";"28650";"46.908,15";"9.308.386,47";"1.414.610,44";"10.722.996,91" -"SECRETARIA DE ESTADO DE SEGURANCA PUBLICA - SESP";"848";"1.583,85";"286.971,96";"32.759,85";"319.731,81" -"SECRETARIA DE ESTADO DE TRABALHO E DESENVOLVIMENTO SOCIAL - SEDESE";"2534";"4.248,70";"894.925,84";"301.539,43";"1.196.465,27" -"SECRETARIA 
DE ESTADO DE TRANSPORTES E OBRAS PUBLICAS - SETOP";"52";"76,20";"14.453,85";"0,00";"14.453,85" -"SECRETARIA DE ESTADO DE TURISMO - SETUR";"311";"774,60";"164.849,48";"151.833,83";"316.683,31" -"SECRETARIA DE ESTADO EXTRAORDINARIA DE DESENVOLVIMENTO INTEGRADO E FORUNS REGIONAIS - SEEDIF (EXTINTA)";"497";"1.256,15";"214.298,09";"0,00";"214.298,09" -"SECRETARIA GERAL - SG";"63";"112,25";"22.085,56";"5.516,79";"27.602,35" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS - REITORIA";"702";"1.430,00";"327.443,84";"205.354,32";"532.798,16" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS - UNIMONTES";"485";"784,25";"157.118,26";"0,00";"157.118,26" \ No newline at end of file diff --git a/standalone-crawlers/transparencia-viagens/csvs/Viagens.csv b/standalone-crawlers/transparencia-viagens/csvs/Viagens.csv deleted file mode 100644 index 5f647e78..00000000 --- a/standalone-crawlers/transparencia-viagens/csvs/Viagens.csv +++ /dev/null @@ -1,32 +0,0 @@ -"Órgão ";"Quantidade De Viagens ";"Quantidade De Diárias ";"Valor Pago Diárias ";"Valor Pago Passagens ";"Valor Total " -"ADVOCACIA GERAL DO ESTADO DE MINAS GERAIS - AGE";"14";"1,75";"304,50";"1.511,70";"1.816,20" -"AGENCIA REGULADORA DE SERVICOS DE ABASTECIMENTO DE AGUA E DE ESGOTAMENTO SANITARIO DO ESTADO DE MINAS GERAIS - ARSAE";"8";"10,80";"3.619,35";"0,00";"3.619,35" -"CONTROLADORIA GERAL DO ESTADO DE MINAS GERAIS - CGE";"1";"1,35";"521,10";"0,00";"521,10" -"DEPARTAMENTO DE EDIFICACOES E ESTRADAS DE RODAGEM DO ESTADO DE MINAS GERAIS - DEER";"42";"4,20";"630,00";"0,00";"630,00" -"ESCOLA DE SAUDE PUBLICA DO ESTADO DE MINAS GERAIS - ESP";"2";"6,70";"1.005,00";"0,00";"1.005,00" -"FUNDACAO CENTRO DE HEMATOLOGIA E HEMOTERAPIA DE MINAS GERAIS - HEMOMINAS";"63";"33,50";"5.176,20";"121,15";"5.297,35" -"FUNDACAO ESTADUAL DO MEIO AMBIENTE - FEAM";"6";"1,00";"150,00";"0,00";"150,00" -"FUNDACAO EZEQUIEL DIAS - FUNED";"7";"8,75";"2.388,75";"4.481,53";"6.870,28" -"FUNDACAO HOSPITALAR DO ESTADO DE MINAS GERAIS - FHEMIG";"7";"2,45";"668,85";"0,00";"668,85" -"FUNDACAO JOAO PINHEIRO - FJP";"1";"1,35";"278,10";"0,00";"278,10" -"FUNDACAO TV MINAS CULTURAL E EDUCATIVA - REDE MINAS";"4";"11,85";"2.488,50";"0,00";"2.488,50" -"INSTITUTO DE DESENVOLVIMENTO DO NORTE E NORDESTE DE MINAS GERAIS - IDENE";"6";"12,10";"2.484,30";"443,66";"2.927,96" -"INSTITUTO DE METROLOGIA E QUALIDADE DO ESTADO DE MINAS GERAIS - IPEM";"40";"43,35";"8.062,50";"0,00";"8.062,50" -"INSTITUTO DE PREVIDENCIA DOS SERVIDORES MILITARES DO ESTADO DE MINAS GERAIS - IPSM";"4";"1,40";"382,20";"0,00";"382,20" -"INSTITUTO ESTADUAL DE FLORESTAS - IEF";"36";"3,75";"851,55";"0,00";"851,55" -"INSTITUTO ESTADUAL DO PATRIMONIO HISTORICO E ARTISTICO DE MINAS GERAIS - IEPHA";"3";"3,05";"457,50";"0,00";"457,50" -"INSTITUTO MINEIRO DE AGROPECUARIA - IMA";"18";"0,70";"105,00";"0,00";"105,00" -"INSTITUTO MINEIRO DE GESTAO DAS AGUAS - IGAM";"10";"56,75";"8.632,54";"0,00";"8.632,54" -"SECRETARIA DE ESTADO DE ADMINISTRAÇÃO PRISIONAL - SEAP";"1390";"1.164,95";"209.613,99";"0,00";"209.613,99" -"SECRETARIA DE ESTADO DE DESENVOLVIMENTO ECONOMICO, CIENCIA, TECNOLOGIA E ENSINO SUPERIOR";"15";"24,25";"5.647,65";"0,00";"5.647,65" -"SECRETARIA DE ESTADO DE EDUCACAO - SEE";"15";"0,00";"0,00";"0,00";"0,00" -"SECRETARIA DE ESTADO DE FAZENDA - SEF";"4";"0,00";"0,00";"390,08";"390,08" -"SECRETARIA DE ESTADO DE GOVERNO - SEGOV";"2";"5,70";"855,00";"0,00";"855,00" -"SECRETARIA DE ESTADO DE MEIO AMBIENTE E DESENVOLVIMENTO SUSTENTAVEL - SEMAD";"46";"4,35";"971,10";"1.980,85";"2.951,95" -"SECRETARIA DE ESTADO DE PLANEJAMENTO E GESTAO - 
SEPLAG";"4";"2,40";"926,40";"3.911,94";"4.838,34" -"SECRETARIA DE ESTADO DE SAUDE - SES";"188";"210,65";"43.966,25";"14.360,13";"58.326,38" -"SECRETARIA DE ESTADO DE TRABALHO E DESENVOLVIMENTO SOCIAL - SEDESE";"32";"64,45";"10.513,80";"1.857,96";"12.371,76" -"SECRETARIA DE ESTADO DE TRANSPORTES E OBRAS PUBLICAS - SETOP";"5";"3,40";"1.159,85";"5.607,16";"6.767,01" -"SECRETARIA GERAL - SG";"8";"31,80";"4.770,00";"0,00";"4.770,00" -"UNIVERSIDADE DO ESTADO DE MINAS GERAIS - REITORIA";"2";"0,00";"0,00";"1.339,15";"1.339,15" -"UNIVERSIDADE ESTADUAL DE MONTES CLAROS - UNIMONTES";"8";"0,00";"0,00";"0,00";"0,00" \ No newline at end of file