# ---
# jupyter:
# jupytext:
# text_representation:
# extension: .py
# format_name: light
# format_version: '1.3'
# jupytext_version: 0.8.6
# kernelspec:
# display_name: Python 3
# language: python
# name: python3
# ---
# # Scraping for the Philadelphia Bail Bond
#
# This code scrapes data from the Philadelphia Courts' New Criminal Filings listing, cleans it, and outputs a CSV file. A future improvement is to have it detect the number of pages on its own; for now the end page must be entered manually (a possible auto-detection sketch follows main() below).
# ## Import Libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import argh
from datetime import date
PAGE_URL = "https://www.courts.phila.gov/NewCriminalFilings/date/default.aspx"

@argh.arg("--record-date", help="Date of records to parse (must be within the last 7 days)")
@argh.arg("--out", help="Name of the file for the resulting CSV.")
def main(record_date=None, out=None):
    """Scrape data from the Philadelphia Courts, clean it, and output a CSV file."""
    if record_date is None:
        record_date = str(date.today())
    # This list will hold the scraped data from each page
    scraped_list_per_page = []
    # The end page as of now is 3 (this still needs to be checked manually)
    end_page = 3
    # Go through every page of the listing, starting at page 1
    for curr_page_num in range(1, end_page + 1):
        # Each page lists up to 24 criminal filings; we move through the pages by updating the page number in the query string
        params = {
            "search": record_date,
            "page": curr_page_num
        }
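        # For example, with a record_date of "2019-01-15" and page 2 (illustrative values only),
        # requests builds a URL like:
        #   https://www.courts.phila.gov/NewCriminalFilings/date/default.aspx?search=2019-01-15&page=2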
        # Then get the HTML of the page as text
        source = requests.get(PAGE_URL, params=params).text
        # Then create a BeautifulSoup object from the text; this makes pulling data out of HTML files easier.
        # To learn more about it, read here: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
        soup = BeautifulSoup(source, "html.parser")
        # After inspecting the source code I noticed the criminal filings were listed under this specific div tag.
        # The findAll call grabs each criminal filing from the page.
        list_of_criminal_filings = soup.findAll("div", {"class": "well well-sm"})
        # Pass the list of all criminal filings into the extract_attributes function.
        # It returns a list of that whole page's scraped criminal filings; we then continue
        # to the next page, so at the end we have one complete joined list.
        scraped_list_per_page = extract_attributes(list_of_criminal_filings) + scraped_list_per_page
    # The joined list is then passed into the create_csv function and converted to CSV
    create_csv(out, scraped_list_per_page)
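
# The end page is currently hard-coded above. A possible way to detect it automatically is
# sketched below: read the largest "page" value out of the pagination links on the first page.
# NOTE: this is an untested sketch that assumes the pagination anchors carry a "page" query
# parameter in their href; the court site's actual markup may differ.
def find_last_page(soup):
    page_numbers = []
    for link in soup.findAll("a", href=True):
        match = re.search(r"[?&]page=(\d+)", link["href"])
        if match:
            page_numbers.append(int(match.group(1)))
    # Fall back to a single page if no pagination links were found
    return max(page_numbers) if page_numbers else 1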

def extract_attributes(list_of_criminal_filings):
    list_of_criminal_file_scraped = []
    # Pass each criminal filing in the list into the scrape_and_store function,
    # then return everything to main; this cycle repeats for every page.
    for criminal_file in list_of_criminal_filings:
        criminal_file_scraped = scrape_and_store(criminal_file.text)
        list_of_criminal_file_scraped.append(criminal_file_scraped)
    return list_of_criminal_file_scraped

# These are just regex calls that helped me clean the data; you can read more about regex here:
# https://docs.python.org/3/library/re.html (a small worked example of the re.split behaviour
# relied on below follows this function).
def scrape_and_store(text):
    hold = text.splitlines()
    defendant_name = re.split('Name (.*?)', hold[3])[-1]
    age = re.split('Age (.*?)', hold[4])[-1]
    address = hold[6]
    city = re.split('\t ', address.split(',')[0])[1]
    state = re.split(" (.*?) ", re.split(",", address)[1])[1]
    zip_code = re.split(" (.*?) ", re.split(",", address)[1])[2]
    docket_number = re.split("Number (.*?)", hold[11])[2]
    filing = re.split(" ", hold[12])
    filing_date = filing[2]
    filing_time = " ".join(filing[3:5])
    charge = re.split("Charge ", hold[13])[1]
    represented = hold[15].strip()
    in_custody = hold[16]
    if len(in_custody) != 1:
        try:
            in_custody = re.split("Custody (.*?)", in_custody)[2]
        except IndexError:
            in_custody = ""
    bail_status = re.split("\t(.*?)", hold[-10])[-1]
    bail_datetime = re.split(" ", hold[-9])
    bail_date = bail_datetime[2]
    bail_time = " ".join(bail_datetime[3:5])
    bail_type = re.split(": (.*?)", hold[-8])[-1]
    bail_amount = re.split(": (.*?)", hold[-7])[-1]
    outstanding_bail_amt = re.split(" ", hold[-6])[-1]
    # Return a list of all the attributes
    return [defendant_name, age, city, state, zip_code, docket_number, filing_date, filing_time,
            charge, represented, in_custody, bail_status, bail_date, bail_time, bail_type,
            bail_amount, outstanding_bail_amt]
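
# A quick illustration of the re.split behaviour relied on above, using made-up input strings:
#   re.split("Charge ", "Charge Theft")   ->  ['', 'Theft']
#   re.split("Age (.*?)", "Age 34")       ->  ['', '', '34']   # the capture group is kept in the output
# Splitting on a pattern that contains a capture group keeps the (here empty) captured text in the
# result, which is why the code above indexes with [-1] or [2] to reach the value after the label.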

# This function will make the list of lists into a CSV file with Pandas
def create_csv(fname, list_of_criminal_file_scraped):
    df = pd.DataFrame(list_of_criminal_file_scraped)
    df.to_csv(fname, index=False, header=["Defendant Name", "Age", "City", "State", "Zip Code", "Docket Number", "Filing Date", "Filing Time", "Charge", "Represented", "In Custody", "Bail Status", "Bail Date", "Bail Time", "Bail Type", "Bail Amount", "Outstanding Bail Amount"])

if __name__ == "__main__":
    argh.dispatch_command(main)
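
# Example invocation from the command line (the date and file name are just illustrative values):
#   python scrape.py --record-date 2019-01-15 --out filings.csv
# With argh's dispatch_command, the --record-date and --out options map onto main()'s keyword arguments.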