Feature #2

Open · wants to merge 20 commits into main
14 changes: 14 additions & 0 deletions analyser/__init__.py
@@ -0,0 +1,14 @@
from math import radians, sin, cos, atan2, sqrt


def distance2(dlon, dlat):
    # radians() converts the coordinate differences from degrees to radians
    # Earth's radius in Poland is taken to be 6363.564 km
dlon = radians(dlon)
dlat = radians(dlat)
R = 6363.564
    # Haversine-style formula; cos(dlat) stands in for cos(lat1) * cos(lat2), so for small
    # differences east-west offsets are scaled as if the points were on the equator
a = sin(dlat / 2) ** 2 + cos(dlat) * cos(dlat) * sin(dlon / 2) ** 2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
dist = R * c
return abs(dist)
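A minimal sanity-check sketch of the simplification, assuming the package is importable as analyser: because cos(dlat) is close to 1 for small differences, a longitude offset is converted to distance as if the points sat on the equator, so east-west distances near Warsaw (about 52°N) come out roughly 1.6 times longer than the true great-circle value.

from math import cos, radians
from analyser import distance2

approx = distance2(0.01, 0.0)            # ~1.11 km from the formula above
at_warsaw = approx * cos(radians(52.2))  # ~0.68 km, the approximate true east-west distance at 52.2°N
print(round(approx, 2), round(at_warsaw, 2))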
51 changes: 51 additions & 0 deletions analyser/buses_in_districts.py
@@ -0,0 +1,51 @@
import json
from shapely.geometry import shape, Point
import pandas as pd
import os

count = 0
# load warszawa-dzielnice.geojson from the data directory
with open('data\\warszawa-dzielnice.geojson', encoding='utf-8') as f:
districts = json.load(f)


def district_of_point(point):
global count
count += 1
if count % 1000 == 0:
print(count)
for district in districts['features']:
polygon = shape(district['geometry'])
# check if point is in district
if polygon.contains(point) and district['properties']['name'] != 'Warszawa':
return district['properties']['name']
return None


# This function adds a column to the buses_locations dataframe with the name of the district
def district_of_bus(location_file_name, output_file_name='buses_locations_with_district.csv'):
if not os.path.isfile(location_file_name):
raise FileNotFoundError('Locations file not found')
buses_locations = pd.read_csv(location_file_name)
buses_locations['district'] = buses_locations.apply(lambda row: district_of_point(Point(row['dl_geo'],
row['szer_geo'])), axis=1)
buses_locations.to_csv('data\\' + output_file_name, index=False)


# This function counts the number of buses in each district
def buses_in_districts(location_file_name, sample_size=360, output_file_name='buses_in_districts.csv'):
if not os.path.isfile('data\\' + location_file_name):
raise FileNotFoundError('Locations file not found')
buses_locations = pd.read_csv('data\\' + location_file_name)
buses_locations = buses_locations.dropna()
# group by district and count number of rows
buses_locations = buses_locations.groupby('district').size().reset_index(name='count')
    buses_locations['count'] = buses_locations['count'] / sample_size
# sum count column and print
print(buses_locations['count'].sum())
buses_locations.to_csv('data\\' + output_file_name, index=False)


# district_of_bus('data\\locations20_40.csv')
# buses_in_districts('buses_locations_with_district.csv')
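As a spot check, district_of_point can be called directly with a shapely Point built in the same longitude-first order used above. The snippet below is illustrative and assumes it is run from the repository root, so that the module-level geojson load finds data\warszawa-dzielnice.geojson.

from shapely.geometry import Point
from analyser.buses_in_districts import district_of_point

# roughly central Warsaw: 21.01°E, 52.23°N (longitude first, matching Point(dl_geo, szer_geo) above)
print(district_of_point(Point(21.01, 52.23)))  # expected to print a district name such as 'Śródmieście'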
83 changes: 83 additions & 0 deletions analyser/punctuality.py
@@ -0,0 +1,83 @@
import pandas as pd
import os
from analyser import distance2
import time


# This function uses geo data from buses and bus stops to determine whether a bus is at a bus stop;
# the matching rows are saved to a csv file
def punctuality_of_line(locations_file, line, km_threshold=0.10):
point0 = time.time()
locations = pd.read_csv('data\\' + locations_file)
if not os.path.isfile('data\\schedules' + line + '.csv'):
        raise FileNotFoundError('Schedules file not found')
schedules = pd.read_csv('data\\schedules' + line + '.csv')
point1 = time.time()
print("Time to read csv: ", point1 - point0)
schedules = schedules.rename(columns={'szer_geo': 'szer_geo_stop', 'dl_geo': 'dl_geo_stop',
'time': 'time_stop'})
# narrow down locations to line
locations = locations[locations['lines'] == line]
locations = locations.rename(columns={'szer_geo': 'szer_geo_bus', 'dl_geo': 'dl_geo_bus',
'time': 'time_bus'})
locations['time_bus'] = pd.to_datetime(locations['time_bus']).dt.time

# convert time_stop to time HH:MM:SS
schedules['time_stop'] = pd.to_datetime(schedules['time_stop'], format='%H:%M:%S').dt.time

# brigade to string
schedules['brigade'] = schedules['brigade'].astype(str)
locations['brigade'] = locations['brigade'].astype(str)

# merge schedules and locations on brigade
buses_on_stops = schedules.merge(locations, on='brigade')
point2 = time.time()
print("Time to merge: ", point2 - point1)
mask = buses_on_stops.apply(lambda row: distance2(row['dl_geo_stop'] - row['dl_geo_bus'],
row['szer_geo_stop'] - row['szer_geo_bus']) < km_threshold, axis=1)
point3 = time.time()
print("Time to calculate distance: ", point3 - point2)
buses_on_stops = buses_on_stops[mask]

    # create a mask for the time difference: keep rows where the bus reading and the scheduled stop
    # fall within the same clock hour (a rough filter for differences under an hour)
mask = buses_on_stops.apply(lambda row: abs(row['time_bus'].hour - row['time_stop'].hour) < 1, axis=1)
buses_on_stops = buses_on_stops[mask]
point4 = time.time()
print("Time to calculate time difference: ", point4 - point3)

# remove duplicate rows
buses_on_stops = buses_on_stops.drop_duplicates()

# remove rows with NaN values
buses_on_stops = buses_on_stops.dropna()
# save buses_on_stops to csv in data directory
buses_on_stops.to_csv('data\\buses_on_stops' + line + '.csv', index=False)


# This function uses data from buses_on_stops to determine if the bus is late or early
def test_punctuality_of_line(line, threshold=3, output_file='buses_late_or_early.csv'):
time0 = time.time()
buses_on_stops = pd.read_csv('data\\buses_on_stops' + line + '.csv')
time1 = time.time()
print("Time to read csv: ", time1 - time0)
    # threshold is the number of minutes a bus may be late or early
buses_on_stops['time_bus'] = pd.to_datetime(buses_on_stops['time_bus'], format='%H:%M:%S')
buses_on_stops['time_stop'] = pd.to_datetime(buses_on_stops['time_stop'], format='%H:%M:%S')
buses_on_stops['time_diff'] = buses_on_stops['time_bus'] - buses_on_stops['time_stop']
buses_on_stops['is_late'] = buses_on_stops['time_diff'] > pd.Timedelta(0)
buses_on_stops['is_early'] = buses_on_stops['time_diff'] < pd.Timedelta(0)
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].dt.total_seconds() / 60
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].abs()
buses_on_stops = buses_on_stops[buses_on_stops['time_diff'] > threshold]
# discard columns brigade, szer_geo_bus, dl_geo_bus, id_ulicy
buses_on_stops = buses_on_stops.drop(columns=['brigade', 'szer_geo_bus', 'dl_geo_bus', 'id_ulicy'])
# group by zespol and slupek and take mean of time_diff
buses_on_stops = buses_on_stops.groupby(['zespol', 'slupek', 'szer_geo_stop',
'dl_geo_stop', 'nazwa_zespolu']).agg({'time_diff': 'mean'}).reset_index()
buses_on_stops.to_csv('data\\' + output_file, index=False)
time2 = time.time()
print("Time to finish: ", time2 - time1)


# punctuality_of_line('locations20_40.csv', '180')
# test_punctuality_of_line('180')
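A short sketch for inspecting the intermediate file, assuming punctuality_of_line('locations20_40.csv', '180') has already written data\buses_on_stops180.csv; it repeats the same time arithmetic as test_punctuality_of_line to count late and early readings.

import pandas as pd

df = pd.read_csv('data\\buses_on_stops180.csv')
diff_min = (pd.to_datetime(df['time_bus'], format='%H:%M:%S')
            - pd.to_datetime(df['time_stop'], format='%H:%M:%S')).dt.total_seconds() / 60
print('late readings: ', (diff_min > 0).sum())
print('early readings:', (diff_min < 0).sum())
print('mean absolute difference in minutes:', round(diff_min.abs().mean(), 2))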
119 changes: 119 additions & 0 deletions analyser/transit_time.py
@@ -0,0 +1,119 @@
import pandas as pd
import os
import time
from analyser import distance2

# This function uses geo data from buses and bus stops to determine whether a bus is at a bus stop
def buses_on_bus_stops(locations_file, line, km_threshold=0.10):
point0 = time.time()
if not os.path.isfile('data\\' + locations_file):
raise FileNotFoundError('Locations file not found')
locations = pd.read_csv('data\\' + locations_file)

    if not os.path.isfile('data\\possible_bus_stops' + line + '.csv'):
        raise FileNotFoundError('Bus stops file not found')
locations = locations[locations['lines'] == line]
# remove duplicate rows
locations = locations.drop_duplicates()
locations = locations.rename(columns={'szer_geo': 'szer_geo_bus', 'dl_geo': 'dl_geo_bus',
'time': 'time_bus'})
bus_stops = pd.read_csv('data\\possible_bus_stops' + line + '.csv')
bus_stops = bus_stops.rename(columns={'szer_geo': 'szer_geo_stop', 'dl_geo': 'dl_geo_stop',
'time': 'time_stop'})
buses_on_stops = bus_stops.merge(locations, how='cross')
mask = buses_on_stops.apply(lambda row: distance2(row['dl_geo_stop'] - row['dl_geo_bus'],
row['szer_geo_stop'] - row['szer_geo_bus']) < km_threshold, axis=1)
buses_on_stops = buses_on_stops[mask]
buses_on_stops = buses_on_stops.drop(columns=['id_ulicy', 'brigade', 'type'])
buses_on_stops = buses_on_stops.sort_values(by=['vehicle_number', 'time_bus'])
    # reorder columns so that vehicle_number and time_bus come first
buses_on_stops = buses_on_stops[['vehicle_number', 'time_bus', 'zespol', 'slupek', 'nazwa_zespolu', 'szer_geo_stop',
'dl_geo_stop', 'kierunek', 'szer_geo_bus', 'dl_geo_bus']]
# remove duplicate rows where vehicle_number and zespol are the same
buses_on_stops = buses_on_stops.drop_duplicates(subset=['vehicle_number', 'zespol'])
# remove rows with NaN values
buses_on_stops = buses_on_stops.dropna()
buses_on_stops.to_csv('data\\buses_on_stops_simplified' + line + '.csv', index=False)
return buses_on_stops


# This function calculates transit time between bus stops based on data from buses_on_stops
def calculate_transit_time(line):
buses_on_stops = pd.read_csv('data\\buses_on_stops_simplified' + line + '.csv')
buses_on_stops['time_bus'] = pd.to_datetime(buses_on_stops['time_bus'])
buses_on_stops = buses_on_stops.sort_values(by=['vehicle_number', 'time_bus'])
buses_on_stops['time_diff'] = buses_on_stops['time_bus'].shift(-1) - buses_on_stops['time_bus']
buses_on_stops['time_diff'] = buses_on_stops['time_diff'].dt.total_seconds() / 60
buses_on_stops['vechicle_number_match'] = (buses_on_stops['vehicle_number'] ==
buses_on_stops['vehicle_number'].shift(-1))
buses_on_stops['next_stop_name'] = buses_on_stops['nazwa_zespolu'].shift(-1)
# zespol to string
buses_on_stops['zespol'] = buses_on_stops['zespol'].astype(str)
buses_on_stops['next_stop_id'] = buses_on_stops['zespol'].shift(-1)
# next_stop_id to string
buses_on_stops['next_stop_id'] = buses_on_stops['next_stop_id'].astype(str)
buses_on_stops = buses_on_stops.drop(columns=['szer_geo_bus', 'dl_geo_bus'])
# remove rows where vehicle_number_match is False
buses_on_stops = buses_on_stops[buses_on_stops['vechicle_number_match']]
# group by zespol and next_stop_id and calculate mean of time_diff and count of vehicle_number
aggr_by = ['zespol', 'nazwa_zespolu', 'next_stop_id', 'next_stop_name', 'szer_geo_stop', 'dl_geo_stop']
# group by columns in aggr_by and calculate mean of time_diff and count of vehicle_number, but leave columns
# specified in aggr_by
buses_on_stops = buses_on_stops.groupby(aggr_by).agg({'time_diff': 'mean', 'vehicle_number': 'count'}).reset_index()
# reset index is what makes columns in aggr_by stay in the dataframe
# remove rows where vehicle_number is less than 3
buses_on_stops = buses_on_stops[buses_on_stops['vehicle_number'] > 2]
buses_on_stops.to_csv('data\\transit_time' + line + '.csv', index=False)
return buses_on_stops


# This function uses data about typical route of the line and transit time to calculate transit time on the route
def fit_to_schedule(line, output_file='fit_to_schedule.csv'):
all_routes = pd.read_json('data\\all_routes.json')
# read transit_time + line + .csv
transit_time = pd.read_csv('data\\transit_time' + line + '.csv')
# zespol to string
transit_time['zespol'] = transit_time['zespol'].astype(str)
# next_stop_id to string
transit_time['next_stop_id'] = transit_time['next_stop_id'].astype(str)
line_routes = all_routes['result'][line]
for route in line_routes:
df = pd.DataFrame(line_routes[route])
        # transpose the dataframe so each row is one stop on the route
df = df.T
print(route, df.size)
# index to int
df.index = df.index.astype(int)
# sort by index
df = df.sort_index()
df['next_stop_id'] = df['nr_zespolu'].shift(-1)
# merge with transit_time by zespol and next_stop_id
df = df.merge(transit_time, left_on=['nr_zespolu', 'next_stop_id'], right_on=['zespol', 'next_stop_id'])
print(route, df.size, '\n')
max_time_diff = df['time_diff'].max()
min_time_diff = df['time_diff'].min()
# round time_diff to 2 decimal places
df['time_diff'] = df['time_diff'].round(2)
yellow = (255, 255, 0)
blue = (0, 0, 255)
# make column color and set to gradient from blue to yellow
df['color'] = df['time_diff'].apply(lambda x: gradient(min_time_diff, max_time_diff, x, blue, yellow))
# save to csv
df.to_csv('data\\' + route + output_file, index=False)

# print(line_routes)


def gradient(minimum, maximum, value, color1, color2):
ratio = (value - minimum) / (maximum - minimum)
r1, g1, b1 = color1
r2, g2, b2 = color2
r = int(r1 + (r2 - r1) * ratio)
g = int(g1 + (g2 - g1) * ratio)
b = int(b1 + (b2 - b1) * ratio)
return "rgb(" + str(r) + "," + str(g) + "," + str(b) + ")"
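# Worked example (illustrative): the midpoint of the gradient lands on grey, because each
# channel is interpolated independently and truncated by int():
#   gradient(0, 10, 5, (0, 0, 255), (255, 255, 0))
#   ratio = 0.5, so r = int(127.5) = 127, g = int(127.5) = 127, b = int(127.5) = 127 -> 'rgb(127,127,127)'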


# buses_on_bus_stops('locations20_40.csv', '180')
# calculate_transit_time('180')
# fit_to_schedule('180')
33 changes: 10 additions & 23 deletions analyser/velocity.py
@@ -1,27 +1,14 @@
import analyser
import pandas as pd
import os
from math import radians, sin, cos, sqrt, atan2
import time

from analyser import distance2


def distance(lat1, lon1, lat2, lon2):
return distance2(lon2 - lon1, lat2 - lat1)


def distance2(dlon, dlat):
# The math module contains a function named radians which converts from degrees to radians.
# The radius of the Earth in Poland is 6363.564 km
dlon = radians(dlon)
dlat = radians(dlat)
R = 6363.564
# Haversine formula
a = sin(dlat / 2) ** 2 + cos(dlat) * cos(dlat) * sin(dlon / 2) ** 2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
dist = R * c
return abs(dist)


def exceeded_limit(limit, dlon, dlat, time_diff, vehicle_number, vechicle_number_match):
distance_diff = distance2(dlon, dlat)
velocity = distance_diff / time_diff if time_diff != 0 else 0
@@ -35,12 +22,13 @@ def exceeded_limit(limit, dlon, dlat, time_diff, vehicle_number, vechicle_number
return None


def exceeded_velocity(limit, locations_file):
if not os.path.isfile(locations_file):
# This function uses geo data from buses to determine if the bus exceeded the velocity limit
def exceeded_velocity(limit, locations_file, output_file='velocity_exceeded.csv', meta_data_file='meta_data.csv'):
if not os.path.isfile("data\\" + locations_file):
print("File does not exist")
return
start = time.time()
buses_locations = pd.read_csv(locations_file)
buses_locations = pd.read_csv("data\\" + locations_file)
time1 = time.time()
print("Time to read csv: ", time1 - start)

@@ -90,8 +78,6 @@ def exceeded_velocity(limit, locations_file):
time2c = time.time()
print("Time to filter: ", time2c - time2b)

# print columns velocity_exceeded, vehicle_number_match, time, vehicle_number
print(diff_pd[['velocity_exceeded', 'vehicle_number_match', 'time', 'vehicle_number']])
meta_data = dict()
meta_data['how_many_rows'] = len(diff_pd)
meta_data['how_many_vehicles'] = len(diff_pd['vehicle_number'].unique())
@@ -105,11 +91,12 @@ def exceeded_velocity(limit, locations_file):
diff_pd = diff_pd[diff_pd['velocity_exceeded'] <= 100]

# save dataframe to csv
diff_pd.to_csv('velocity_exceeded.csv', index=False)
diff_pd.to_csv("data\\" + output_file, index=False)

# save meta data to csv
meta_data_df = pd.DataFrame([meta_data])
meta_data_df.to_csv('meta_data.csv', index=False)
meta_data_df.to_csv("data\\" + meta_data_file, index=False)


exceeded_velocity(50, '..\\locations20_40.csv')
# exceeded_velocity(50, '..\\locations20_40.csv')
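A minimal sketch for inspecting the two output files, assuming exceeded_velocity has been run with the default output names and that the column names match those used above.

import pandas as pd

exceeded = pd.read_csv('data\\velocity_exceeded.csv')
meta = pd.read_csv('data\\meta_data.csv')
print(meta.to_string(index=False))
print('distinct vehicles over the limit:', exceeded['vehicle_number'].nunique())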

20 changes: 20 additions & 0 deletions buses_in_districts_tests.py
@@ -0,0 +1,20 @@
import unittest
import analyser.buses_in_districts as buses_in_districts
import pandas as pd
import os


class BusesInDistrictsTests(unittest.TestCase):
    # check that buses_in_districts produces a well-formed csv file
def test_buses_in_districts(self):
buses_in_districts.buses_in_districts('buses_locations_with_district.csv')
self.assertTrue(os.path.isfile('data\\buses_in_districts.csv'))
buses_in_districts_df = pd.read_csv('data\\buses_in_districts.csv')
self.assertEqual(buses_in_districts_df.shape[1], 2)
# check if there are no NaN values
self.assertFalse(buses_in_districts_df.isnull().values.any())
# check if there are no duplicates
self.assertFalse(buses_in_districts_df.duplicated().any())

if __name__ == '__main__':
unittest.main()
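Running the suite presumably requires data\buses_locations_with_district.csv to exist (it is produced by district_of_bus); with that file in place, the test should be runnable from the repository root with python -m unittest buses_in_districts_tests.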