Skip to content

Commit

Permalink
opravy scrapovani
Browse files Browse the repository at this point in the history
  • Loading branch information
michalkasparek committed Nov 11, 2024
1 parent e997ad9 commit 031098f
Show file tree
Hide file tree
Showing 14 changed files with 370,406 additions and 584,005 deletions.
1 change: 1 addition & 0 deletions 001_cd_scrapovani.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ def cd(odkud, kam, pocet_dni=0):
("Praha", "Lublaň"),
("Praha", "Amsterdam"),
("Praha", "Krakow"),
# ("Praha","Vratislav")
]

random.shuffle(trasy_b)
Expand Down
1,009 changes: 127 additions & 882 deletions 002_cd_cisteni.ipynb

Large diffs are not rendered by default.

3,299 changes: 11 additions & 3,288 deletions 004_rj_cisteni.ipynb

Large diffs are not rendered by default.

358 changes: 39 additions & 319 deletions 006_le_cisteni.ipynb

Large diffs are not rendered by default.

814 changes: 39 additions & 775 deletions 008_ar_cisteni.ipynb

Large diffs are not rendered by default.

356 changes: 356 additions & 0 deletions 050_spojeni.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,356 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 95,
"id": "3c49eaba-04cd-4c05-9b78-ca43179ff542",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pandas as pd\n",
"import datetime\n",
"pd.set_option('display.max_columns', 100)\n",
"pd.set_option('display.max_rows', 500)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"id": "e2c1f968-7977-4632-82f7-3d83447ea76d",
"metadata": {},
"outputs": [],
"source": [
"# Read every file in data/ whose third character is an underscore.\n",
"# os.listdir() returns names in arbitrary order, so sort them: the later\n",
"# drop_duplicates(..., keep=\"last\") depends on row order. The x[2:3] slice\n",
"# also avoids an IndexError on names shorter than three characters.\n",
"soubory = sorted(x for x in os.listdir(\"data\") if x[2:3] == \"_\")\n",
"# Concatenate once instead of growing the frame inside a loop.\n",
"df = pd.concat([pd.read_parquet(os.path.join(\"data\", f)) for f in soubory])"
]
},
{
"cell_type": "code",
"execution_count": 97,
"id": "98e440a4-ff6c-4378-af23-bc8206a7ebec",
"metadata": {},
"outputs": [],
"source": [
"df['oscrapovano_minuty'] = df['oscrapovano'].apply(lambda x: str(x)[0:15])"
]
},
{
"cell_type": "code",
"execution_count": 98,
"id": "834a58d6-07a1-4ff3-b434-44a4d8a16b9f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2024-11-07 18:3\n",
"1 2024-11-07 18:3\n",
"2 2024-11-07 18:3\n",
"3 2024-11-07 18:3\n",
"4 2024-11-07 18:3\n",
" ... \n",
"12797 2024-11-10 00:0\n",
"12798 2024-11-10 00:0\n",
"12799 2024-11-10 14:1\n",
"12800 2024-11-10 14:1\n",
"12801 2024-11-10 14:1\n",
"Name: oscrapovano_minuty, Length: 528765, dtype: object"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['oscrapovano_minuty']"
]
},
{
"cell_type": "code",
"execution_count": 99,
"id": "b31b987f-afe9-4594-bd30-e512d08a3566",
"metadata": {},
"outputs": [],
"source": [
"df = df.drop_duplicates(subset=['odkud','kam','odjezd','oscrapovano_minuty'], keep=\"last\")"
]
},
{
"cell_type": "code",
"execution_count": 100,
"id": "de3420d1-e5c1-4720-b702-4e1840d74ae3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"prodejce oscrapovano\n",
"ARR 2024-11-07 2497\n",
" 2024-11-08 7238\n",
" 2024-11-09 3655\n",
" 2024-11-10 3242\n",
"LE 2024-11-07 332\n",
" 2024-11-08 1615\n",
" 2024-11-09 1873\n",
" 2024-11-10 1755\n",
"RJ 2024-11-06 3877\n",
" 2024-11-07 1927\n",
" 2024-11-08 12228\n",
" 2024-11-09 14652\n",
" 2024-11-10 12683\n",
"ČD 2024-10-31 1558\n",
" 2024-11-01 15717\n",
" 2024-11-02 21853\n",
" 2024-11-03 17590\n",
" 2024-11-04 23195\n",
" 2024-11-05 22982\n",
" 2024-11-06 23426\n",
" 2024-11-07 22759\n",
" 2024-11-08 23954\n",
" 2024-11-09 25076\n",
" 2024-11-10 22781\n",
"dtype: int64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby(['prodejce',pd.Grouper(key='oscrapovano',freq='D')]).size()"
]
},
{
"cell_type": "code",
"execution_count": 101,
"id": "13a6cb25-bc64-4cce-b00f-ab1a4ecf7734",
"metadata": {},
"outputs": [],
"source": [
"df = df.sort_values(by=\"oscrapovano\").reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"id": "fabaebb2-fc2e-476b-8775-208546de88b3",
"metadata": {},
"outputs": [],
"source": [
"df['cena'] = pd.to_numeric(df['cena'])"
]
},
{
"cell_type": "code",
"execution_count": 103,
"id": "1e429647-b3cf-4a38-b662-82bf58f40e68",
"metadata": {},
"outputs": [],
"source": [
"df = df.dropna(subset=['odkud','kam','odjezd','oscrapovano'],how='any')"
]
},
{
"cell_type": "code",
"execution_count": 104,
"id": "8516e283-1ec0-4fc6-92e0-5b0d9eda7db4",
"metadata": {},
"outputs": [],
"source": [
"df = df[df['prostredek'] != 'autobus']"
]
},
{
"cell_type": "code",
"execution_count": 105,
"id": "e7b3f70a-4f04-488f-99a0-e534168cdf21",
"metadata": {},
"outputs": [],
"source": [
"days = {0: 'po', 1: 'út', 2: 'st', 3: 'čt', \n",
" 4: 'pá', 5: 'so', 6: 'ne'}\n",
"df['den'] = df['odjezd'].dt.dayofweek.map(days)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"id": "50764e8d-9cd1-450b-bd8a-34e85e3d22f9",
"metadata": {},
"outputs": [],
"source": [
"df['predstih_d'] = df['predstih'].dt.days\n",
"df['predstih_h'] = df['predstih'].dt.total_seconds() / 3600"
]
},
{
"cell_type": "code",
"execution_count": 107,
"id": "94bc1ba3-932e-4df2-a7a5-4aedaeec9717",
"metadata": {},
"outputs": [],
"source": [
"df = df[df['predstih_h'] > -3]"
]
},
{
"cell_type": "code",
"execution_count": 108,
"id": "5376546f-7995-48af-a8c1-dd8c41c0a591",
"metadata": {},
"outputs": [],
"source": [
"kategoricka_data = ['odkud','kam','prodejce','den']"
]
},
{
"cell_type": "code",
"execution_count": 109,
"id": "be5f17df-fb85-4139-9b07-98b3013433ed",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Before: 19614208\n",
"After: 2411240\n",
"Before: 19526421\n",
"After: 2410297\n",
"Before: 19965698\n",
"After: 2403328\n",
"Before: 17655969\n",
"After: 2403648\n"
]
}
],
"source": [
"for k in kategoricka_data:\n",
" print(\"Before:\", df[k].memory_usage(deep=True))\n",
" df[k] = df[k].astype('category')\n",
" print(\"After: \", df[k].memory_usage(deep=True))"
]
},
{
"cell_type": "code",
"execution_count": 110,
"id": "f5e504ea-d0cb-43db-960d-574b16d96b2f",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df['cena'] = pd.to_numeric(df['cena'])"
]
},
{
"cell_type": "code",
"execution_count": 111,
"id": "031a1930-cc15-4b30-b202-ffcb492534b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(266992, 26)"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "852ca70e-3c1a-48cb-81fe-d1c51c2300f5",
"metadata": {},
"outputs": [],
"source": [
"poradi = ['oscrapovano','prodejce','odkud','kam',\n",
" 'odjezd',\n",
" 'predstih',\n",
" 'predstih_d',\n",
" 'predstih_h',\n",
" 'cena',\n",
" 'prostredek',\n",
" 'volnych_mist',\n",
" 'obsazenost',\n",
" 'jizdni_doba',\n",
" 'vzdalenost',\n",
" 'zpozdeni',\n",
" 'cena_poznamka',\n",
" 'den',\n",
" 'prestupy',\n",
"'vlaky',\n",
" 'mistenka_zdarma',\n",
" 'nahradni_bus',\n",
" 'volna_mista_economy',\n",
" 'volna_mista_economy_plus',\n",
" 'volna_mista_economy_business',\n",
" 'volna_mista_premium']"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "16e60aa6-f5da-4435-8fd0-35341d542234",
"metadata": {},
"outputs": [],
"source": [
"df[poradi].to_parquet(os.path.join(\"data\",\"jizdenky.parquet\"))"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "66d06a65-575a-486b-aeff-1fb8a3735eb0",
"metadata": {},
"outputs": [],
"source": [
"nejnovejsi = df['oscrapovano'].max()\n",
"nejnovejsi\n",
"df_tyden = df[df['oscrapovano'] > (nejnovejsi - datetime.timedelta(hours=168))]"
]
},
{
"cell_type": "code",
"execution_count": 115,
"id": "50643981-0d07-4390-a1ab-e2a4b636b358",
"metadata": {},
"outputs": [],
"source": [
"df_tyden[poradi].to_csv(os.path.join(\"data\",\"jizdenky_tyden.csv\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 031098f

Please sign in to comment.