Feature #2

Open · wants to merge 20 commits into main
14 changes: 14 additions & 0 deletions analyser/__init__.py
@@ -0,0 +1,14 @@
from math import radians, sin, cos, atan2, sqrt


def distance2(dlon, dlat):
    # radians() converts the coordinate differences from degrees to radians
    # Earth's radius in Poland is taken to be 6363.564 km
dlon = radians(dlon)
dlat = radians(dlat)
R = 6363.564
    # Haversine-style formula; cos(dlat) stands in for cos(lat1) * cos(lat2), so for small
    # differences east-west offsets are scaled as if the points were on the equator
a = sin(dlat / 2) ** 2 + cos(dlat) * cos(dlat) * sin(dlon / 2) ** 2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
dist = R * c
return abs(dist)
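A minimal sanity-check sketch of the simplification, assuming the package is importable as analyser: because cos(dlat) is close to 1 for small differences, a longitude offset is converted to distance as if the points sat on the equator, so east-west distances near Warsaw (about 52°N) come out roughly 1.6 times longer than the true great-circle value.

from math import cos, radians
from analyser import distance2

approx = distance2(0.01, 0.0)            # ~1.11 km from the formula above
at_warsaw = approx * cos(radians(52.2))  # ~0.68 km, the approximate true east-west distance at 52.2°N
print(round(approx, 2), round(at_warsaw, 2))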
51 changes: 51 additions & 0 deletions analyser/buses_in_districts.py
@@ -0,0 +1,51 @@
import json
from shapely.geometry import shape, Point
import pandas as pd
import os

count = 0
# load warszawa-dzielnice.geojson from the data directory
with open('data\\warszawa-dzielnice.geojson', encoding='utf-8') as f:
districts = json.load(f)


def district_of_point(point):
global count
count += 1
if count % 1000 == 0:
print(count)
for district in districts['features']:
polygon = shape(district['geometry'])
# check if point is in district
if polygon.contains(point) and district['properties']['name'] != 'Warszawa':
return district['properties']['name']
return None


# This function adds a column to the buses_locations dataframe with the name of the district
def district_of_bus(location_file_name, output_file_name='buses_locations_with_district.csv'):
if not os.path.isfile(location_file_name):
raise FileNotFoundError('Locations file not found')
buses_locations = pd.read_csv(location_file_name)
buses_locations['district'] = buses_locations.apply(lambda row: district_of_point(Point(row['dl_geo'],
row['szer_geo'])), axis=1)
buses_locations.to_csv('data\\' + output_file_name, index=False)


# This function counts the number of buses in each district
def buses_in_districts(location_file_name, sample_size=360, output_file_name='buses_in_districts.csv'):
if not os.path.isfile('data\\' + location_file_name):
raise FileNotFoundError('Locations file not found')
buses_locations = pd.read_csv('data\\' + location_file_name)
buses_locations = buses_locations.dropna()
# group by district and count number of rows
buses_locations = buses_locations.groupby('district').size().reset_index(name='count')
    buses_locations['count'] = buses_locations['count'] / sample_size
# sum count column and print
print(buses_locations['count'].sum())
buses_locations.to_csv('data\\' + output_file_name, index=False)


# district_of_bus('data\\locations20_40.csv')
# buses_in_districts('buses_locations_with_district.csv')
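As a spot check, district_of_point can be called directly with a shapely Point built in the same longitude-first order used above. The snippet below is illustrative and assumes it is run from the repository root, so that the module-level geojson load finds data\warszawa-dzielnice.geojson.

from shapely.geometry import Point
from analyser.buses_in_districts import district_of_point

# roughly central Warsaw: 21.01°E, 52.23°N (longitude first, matching Point(dl_geo, szer_geo) above)
print(district_of_point(Point(21.01, 52.23)))  # expected to print a district name such as 'Śródmieście'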
83 changes: 83 additions & 0 deletions analyser/punctuality.py
@@ -0,0 +1,83 @@
import pandas as pd
import os
from analyser import distance2
import time


# This function uses geo data from buses and bus stops to determine whether a bus is at a bus stop;
# the matching rows are saved to a csv file
def punctuality_of_line(locations_file, line, km_threshold=0.10):
point0 = time.time()
locations = pd.read_csv('data\\' + locations_file)
if not os.path.isfile('data\\schedules' + line + '.csv'):
        raise FileNotFoundError('Schedules file not found')
schedules = pd.read_csv('data\\schedules' + line + '.csv')
point1 = time.time()
print("Time to read csv: ", point1 - point0)
schedules = schedules.rename(columns={'szer_geo': 'szer_geo_stop', 'dl_geo': 'dl_geo_stop',
'time': 'time_stop'})
# narrow down locations to line
locations = locations[locations['lines'] == line]
locations = locations.rename(columns={'szer_geo': 'szer_geo_bus', 'dl_geo': 'dl_geo_bus',
'time': 'time_bus'})
locations['time_bus'] = pd.to_datetime(locations['time_bus']).dt.time

# convert time_stop to time HH:MM:SS
schedules['time_stop'] = pd.to_datetime(schedules['time_stop'], format='%H:%M:%S').dt.time

# brigade to string
schedules['brigade'] = schedules['brigade'].astype(str)
locations['brigade'] = locations['brigade'].astype(str)

# merge schedules and locations on brigade
buses_on_stops = schedules.merge(locations, on='brigade')
point2 = time.time()
print("Time to merge: ", point2 - point1)
mask = buses_on_stops.apply(lambda row: distance2(row['dl_geo_stop'] - row['dl_geo_bus'],
row['szer_geo_stop'] - row['szer_geo_bus']) < km_threshold, axis=1)
point3 = time.time()
print("Time to calculate distance: ", point3 - point2)
buses_on_stops = buses_on_stops[mask]

    # create a mask for the time difference: keep rows where the bus reading and the scheduled stop
    # fall within the same clock hour (a rough filter for differences under an hour)
mask = buses_on_stops.apply(lambda row: abs(row['time_bus'].hour - row['time_stop'].hour) < 1, axis=1)
buses_on_stops = buses_on_stops[mask]
point4 = time.time()
print("Time to calculate time difference: ", point4 - point3)

# remove duplicate rows
buses_on_stops = buses_on_stops.drop_duplicates()

# remove rows with NaN values
buses_on_stops = buses_on_stops.dropna()
# save buses_on_stops to csv in data directory
buses_on_stops.to_csv('data\\buses_on_stops' + line + '.csv', index=False)


# This function uses data from buses_on_stops to determine if the bus is late or early
def test_punctuality_of_line(line, threshold=3, output_file='buses_late_or_early.csv'):
time0 = time.time()
buses_on_stops = pd.read_csv('data\\buses_on_stops' + line + '.csv')
time1 = time.time()
print("Time to read csv: ", time1 - time0)
    # threshold is the number of minutes a bus may be late or early
buses_on_stops['time_bus'] = pd.to_datetime(buses_on_stops['time_bus'], format='%H:%M:%S')
buses_on_stops['time_stop'] = pd.to_datetime(buses_on_stops['time_stop'], format='%H:%M:%S')
buses_on_stops['time_diff'] = buses_on_stops['time_bus'] - buses_on_stops['time_stop']
buses_on_stops['is_late'] = buses_on_stops['time_diff'] > pd.Timedelta(0)
buses_on_stops['is_early'] = buses_on_stops['time_diff'] < pd.Timedelta(0)
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].dt.total_seconds() / 60
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].abs()
buses_on_stops = buses_on_stops[buses_on_stops['time_diff'] > threshold]
# discard columns brigade, szer_geo_bus, dl_geo_bus, id_ulicy
buses_on_stops = buses_on_stops.drop(columns=['brigade', 'szer_geo_bus', 'dl_geo_bus', 'id_ulicy'])
# group by zespol and slupek and take mean of time_diff
buses_on_stops = buses_on_stops.groupby(['zespol', 'slupek', 'szer_geo_stop',
'dl_geo_stop', 'nazwa_zespolu']).agg({'time_diff': 'mean'}).reset_index()
buses_on_stops.to_csv('data\\' + output_file, index=False)
time2 = time.time()
print("Time to finish: ", time2 - time1)


# punctuality_of_line('locations20_40.csv', '180')
# test_punctuality_of_line('180')
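A short sketch for inspecting the intermediate file, assuming punctuality_of_line('locations20_40.csv', '180') has already written data\buses_on_stops180.csv; it repeats the same time arithmetic as test_punctuality_of_line to count late and early readings.

import pandas as pd

df = pd.read_csv('data\\buses_on_stops180.csv')
diff_min = (pd.to_datetime(df['time_bus'], format='%H:%M:%S')
            - pd.to_datetime(df['time_stop'], format='%H:%M:%S')).dt.total_seconds() / 60
print('late readings: ', (diff_min > 0).sum())
print('early readings:', (diff_min < 0).sum())
print('mean absolute difference in minutes:', round(diff_min.abs().mean(), 2))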
119 changes: 119 additions & 0 deletions analyser/transit_time.py
@@ -0,0 +1,119 @@
import pandas as pd
import os
import time
from analyser import distance2

# This function uses geo data from buses and bus stops to determine whether a bus is at a bus stop
def buses_on_bus_stops(locations_file, line, km_threshold=0.10):
point0 = time.time()
if not os.path.isfile('data\\' + locations_file):
raise FileNotFoundError('Locations file not found')
locations = pd.read_csv('data\\' + locations_file)

    if not os.path.isfile('data\\possible_bus_stops' + line + '.csv'):
        raise FileNotFoundError('Bus stops file not found')
locations = locations[locations['lines'] == line]
# remove duplicate rows
locations = locations.drop_duplicates()
locations = locations.rename(columns={'szer_geo': 'szer_geo_bus', 'dl_geo': 'dl_geo_bus',
'time': 'time_bus'})
bus_stops = pd.read_csv('data\\possible_bus_stops' + line + '.csv')
bus_stops = bus_stops.rename(columns={'szer_geo': 'szer_geo_stop', 'dl_geo': 'dl_geo_stop',
'time': 'time_stop'})
buses_on_stops = bus_stops.merge(locations, how='cross')
mask = buses_on_stops.apply(lambda row: distance2(row['dl_geo_stop'] - row['dl_geo_bus'],
row['szer_geo_stop'] - row['szer_geo_bus']) < km_threshold, axis=1)
buses_on_stops = buses_on_stops[mask]
buses_on_stops = buses_on_stops.drop(columns=['id_ulicy', 'brigade', 'type'])
buses_on_stops = buses_on_stops.sort_values(by=['vehicle_number', 'time_bus'])
    # reorder columns so that vehicle_number and time_bus come first
buses_on_stops = buses_on_stops[['vehicle_number', 'time_bus', 'zespol', 'slupek', 'nazwa_zespolu', 'szer_geo_stop',
'dl_geo_stop', 'kierunek', 'szer_geo_bus', 'dl_geo_bus']]
# remove duplicate rows where vehicle_number and zespol are the same
buses_on_stops = buses_on_stops.drop_duplicates(subset=['vehicle_number', 'zespol'])
# remove rows with NaN values
buses_on_stops = buses_on_stops.dropna()
buses_on_stops.to_csv('data\\buses_on_stops_simplified' + line + '.csv', index=False)
return buses_on_stops


# This function calculates transit time between bus stops based on data from buses_on_stops
def calculate_transit_time(line):
buses_on_stops = pd.read_csv('data\\buses_on_stops_simplified' + line + '.csv')
buses_on_stops['time_bus'] = pd.to_datetime(buses_on_stops['time_bus'])
buses_on_stops = buses_on_stops.sort_values(by=['vehicle_number', 'time_bus'])
buses_on_stops['time_diff'] = buses_on_stops['time_bus'].shift(-1) - buses_on_stops['time_bus']
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].dt.total_seconds() / 60
buses_on_stops['vechicle_number_match'] = (buses_on_stops['vehicle_number'] ==
buses_on_stops['vehicle_number'].shift(-1))
buses_on_stops['next_stop_name'] = buses_on_stops['nazwa_zespolu'].shift(-1)
# zespol to string
buses_on_stops['zespol'] = buses_on_stops['zespol'].astype(str)
buses_on_stops['next_stop_id'] = buses_on_stops['zespol'].shift(-1)
# next_stop_id to string
buses_on_stops['next_stop_id'] = buses_on_stops['next_stop_id'].astype(str)
buses_on_stops = buses_on_stops.drop(columns=['szer_geo_bus', 'dl_geo_bus'])
# remove rows where vehicle_number_match is False
buses_on_stops = buses_on_stops[buses_on_stops['vechicle_number_match']]
# group by zespol and next_stop_id and calculate mean of time_diff and count of vehicle_number
aggr_by = ['zespol', 'nazwa_zespolu', 'next_stop_id', 'next_stop_name', 'szer_geo_stop', 'dl_geo_stop']
# group by columns in aggr_by and calculate mean of time_diff and count of vehicle_number, but leave columns
# specified in aggr_by
buses_on_stops = buses_on_stops.groupby(aggr_by).agg({'time_diff': 'mean', 'vehicle_number': 'count'}).reset_index()
# reset index is what makes columns in aggr_by stay in the dataframe
# remove rows where vehicle_number is less than 3
buses_on_stops = buses_on_stops[buses_on_stops['vehicle_number'] > 2]
buses_on_stops.to_csv('data\\transit_time' + line + '.csv', index=False)
return buses_on_stops


# This function uses data about typical route of the line and transit time to calculate transit time on the route
def fit_to_schedule(line, output_file='fit_to_schedule.csv'):
all_routes = pd.read_json('data\\all_routes.json')
# read transit_time + line + .csv
transit_time = pd.read_csv('data\\transit_time' + line + '.csv')
# zespol to string
transit_time['zespol'] = transit_time['zespol'].astype(str)
# next_stop_id to string
transit_time['next_stop_id'] = transit_time['next_stop_id'].astype(str)
line_routes = all_routes['result'][line]
for route in line_routes:
df = pd.DataFrame(line_routes[route])
        # transpose the dataframe so each row is one stop on the route
df = df.T
print(route, df.size)
# index to int
df.index = df.index.astype(int)
# sort by index
df = df.sort_index()
df['next_stop_id'] = df['nr_zespolu'].shift(-1)
# merge with transit_time by zespol and next_stop_id
df = df.merge(transit_time, left_on=['nr_zespolu', 'next_stop_id'], right_on=['zespol', 'next_stop_id'])
print(route, df.size, '\n')
max_time_diff = df['time_diff'].max()
min_time_diff = df['time_diff'].min()
# round time_diff to 2 decimal places
df['time_diff'] = df['time_diff'].round(2)
yellow = (255, 255, 0)
blue = (0, 0, 255)
# make column color and set to gradient from blue to yellow
df['color'] = df['time_diff'].apply(lambda x: gradient(min_time_diff, max_time_diff, x, blue, yellow))
# save to csv
df.to_csv('data\\' + route + output_file, index=False)

# print(line_routes)


def gradient(minimum, maximum, value, color1, color2):
ratio = (value - minimum) / (maximum - minimum)
r1, g1, b1 = color1
r2, g2, b2 = color2
r = int(r1 + (r2 - r1) * ratio)
g = int(g1 + (g2 - g1) * ratio)
b = int(b1 + (b2 - b1) * ratio)
return "rgb(" + str(r) + "," + str(g) + "," + str(b) + ")"
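# Worked example (illustrative): the midpoint of the gradient lands on grey, because each
# channel is interpolated independently and truncated by int():
#   gradient(0, 10, 5, (0, 0, 255), (255, 255, 0))
#   ratio = 0.5, so r = int(127.5) = 127, g = int(127.5) = 127, b = int(127.5) = 127 -> 'rgb(127,127,127)'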


# buses_on_bus_stops('locations20_40.csv', '180')
# calculate_transit_time('180')
# fit_to_schedule('180')
33 changes: 10 additions & 23 deletions analyser/velocity.py
@@ -1,27 +1,14 @@
import analyser
import pandas as pd
import os
from math import radians, sin, cos, sqrt, atan2
import time

from analyser import distance2


def distance(lat1, lon1, lat2, lon2):
return distance2(lon2 - lon1, lat2 - lat1)


def distance2(dlon, dlat):
# The math module contains a function named radians which converts from degrees to radians.
# The radius of the Earth in Poland is 6363.564 km
dlon = radians(dlon)
dlat = radians(dlat)
R = 6363.564
# Haversine formula
a = sin(dlat / 2) ** 2 + cos(dlat) * cos(dlat) * sin(dlon / 2) ** 2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
dist = R * c
return abs(dist)


def exceeded_limit(limit, dlon, dlat, time_diff, vehicle_number, vechicle_number_match):
distance_diff = distance2(dlon, dlat)
velocity = distance_diff / time_diff if time_diff != 0 else 0
@@ -35,12 +22,13 @@ def exceeded_limit(limit, dlon, dlat, time_diff, vehicle_number, vechicle_number
return None


def exceeded_velocity(limit, locations_file):
if not os.path.isfile(locations_file):
# This function uses geo data from buses to determine if the bus exceeded the velocity limit
def exceeded_velocity(limit, locations_file, output_file='velocity_exceeded.csv', meta_data_file='meta_data.csv'):
if not os.path.isfile("data\\" + locations_file):
print("File does not exist")
return
start = time.time()
buses_locations = pd.read_csv(locations_file)
buses_locations = pd.read_csv("data\\" + locations_file)
time1 = time.time()
print("Time to read csv: ", time1 - start)

@@ -90,8 +78,6 @@ def exceeded_velocity(limit, locations_file):
time2c = time.time()
print("Time to filter: ", time2c - time2b)

# print columns velocity_exceeded, vehicle_number_match, time, vehicle_number
print(diff_pd[['velocity_exceeded', 'vehicle_number_match', 'time', 'vehicle_number']])
meta_data = dict()
meta_data['how_many_rows'] = len(diff_pd)
meta_data['how_many_vehicles'] = len(diff_pd['vehicle_number'].unique())
@@ -105,11 +91,12 @@ def exceeded_velocity(limit, locations_file):
diff_pd = diff_pd[diff_pd['velocity_exceeded'] <= 100]

# save dataframe to csv
diff_pd.to_csv('velocity_exceeded.csv', index=False)
diff_pd.to_csv("data\\" + output_file, index=False)

# save meta data to csv
meta_data_df = pd.DataFrame([meta_data])
meta_data_df.to_csv('meta_data.csv', index=False)
meta_data_df.to_csv("data\\" + meta_data_file, index=False)


exceeded_velocity(50, '..\\locations20_40.csv')
# exceeded_velocity(50, '..\\locations20_40.csv')
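A minimal sketch for inspecting the two output files, assuming exceeded_velocity has been run with the default output names and that the column names match those used above.

import pandas as pd

exceeded = pd.read_csv('data\\velocity_exceeded.csv')
meta = pd.read_csv('data\\meta_data.csv')
print(meta.to_string(index=False))
print('distinct vehicles over the limit:', exceeded['vehicle_number'].nunique())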

20 changes: 20 additions & 0 deletions buses_in_districts_tests.py
@@ -0,0 +1,20 @@
import unittest
import analyser.buses_in_districts as buses_in_districts
import pandas as pd
import os


class BusesInDistrictsTests(unittest.TestCase):
    # check that buses_in_districts produces a well-formed csv file
def test_buses_in_districts(self):
buses_in_districts.buses_in_districts('buses_locations_with_district.csv')
self.assertTrue(os.path.isfile('data\\buses_in_districts.csv'))
buses_in_districts_df = pd.read_csv('data\\buses_in_districts.csv')
self.assertEqual(buses_in_districts_df.shape[1], 2)
# check if there are no NaN values
self.assertFalse(buses_in_districts_df.isnull().values.any())
# check if there are no duplicates
self.assertFalse(buses_in_districts_df.duplicated().any())

if __name__ == '__main__':
unittest.main()
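Running the suite presumably requires data\buses_locations_with_district.csv to exist (it is produced by district_of_bus); with that file in place, the test should be runnable from the repository root with python -m unittest buses_in_districts_tests.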