Skip to content

Commit

Permalink
15. 11. 2024
Browse files Browse the repository at this point in the history
  • Loading branch information
michalkasparek committed Nov 15, 2024
1 parent 031098f commit 6ce1097
Show file tree
Hide file tree
Showing 15 changed files with 369,355 additions and 380,392 deletions.
105 changes: 105 additions & 0 deletions 001_cd_scrapovani_claude.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
def oscrapuj_cd(slozka, soubor):
    """Parse one saved ČD search-results HTML file into connection records.

    The file is cut on every ``<article`` occurrence; each piece after the
    first is scanned line by line for known data-binding fragments and
    turned into one dictionary.

    Args:
        slozka: Directory containing the saved HTML file.
        soubor: File name. It must contain a ``YYYY-MM-DD_HH-MM-SS``
            timestamp, which is stored as the scrape time.

    Returns:
        A list with one dict per ``<article`` piece. Every dict has the keys
        ``prodejce``, ``prostredek``, ``oscrapovano``, ``vlaky``,
        ``obsazenost``, ``zpozdeni`` and ``prestupy``. The keys ``kam``,
        ``odkud``, ``cena``, ``jizdni_doba``, ``vzdalenost``,
        ``nahradni_bus``, ``mistenka_zdarma``, ``den``, ``odjezd`` and
        ``predstih`` are present only when the matching line was found and
        could be parsed.

    Raises:
        AttributeError: If ``soubor`` contains no timestamp.
        OSError: If the file cannot be opened.
    """
    date_pattern = re.compile(r'\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}')
    time_pattern = re.compile(r'\d{1,2}:\d\d')

    with open(os.path.join(slozka, soubor), "r", encoding="utf-8") as file:
        raw_html = file.read()

    # The scrape time comes from the file name, so it is parsed only once.
    oscrapovano = date_pattern.search(soubor).group(0)
    oscrapovano_dt = datetime.strptime(oscrapovano, "%Y-%m-%d_%H-%M-%S")

    # Everything before the first "<article" is dropped.
    spojeni = raw_html.split("<article")[1:]

    # Substrings that identify the line carrying each field.
    COMMON_STRINGS = {
        'to_train': 'text: to(lastTrain($index()))',
        'from_train': 'text: from(firstTrain($index()))',
        'journey_date': 'text: journeyDateTextFrom()',
        'dep_time': 'text: depTime(firstTrain($index()))',
        'buy_button': 'text: buyButtonText()',
        'train_type': 'text: model.trainTypeAndNum',
        'occupancy': '<span aria-hidden="true" data-bind="visible: occupancyLevelFull(), text: occupancyLevelText()"',
        'time_length': '<span data-bind="text: timeLength, visible: timeLength != null &amp;&amp; timeLength != \'\'"',
        # Two apostrophes before the closing quote, as in 'time_length';
        # with a single apostrophe the marker can never match.
        'distance': '<span data-bind="text: distance, visible: distance != null &amp;&amp; distance != \'\'"',
        'delay': '<span aria-hidden="true" data-bind="text: delayText()">'
    }

    slovniky = []

    for s in spojeni:
        slovnik = {
            'prodejce': "ČD",
            'vlaky': [],
            'obsazenost': [],
            'zpozdeni': [],
            'oscrapovano': oscrapovano_dt,
            'prostredek': 'vlak'
        }

        den = None
        cas_odjezdu = None

        # The branches are mutually exclusive and their order matters:
        # a line is consumed by the first marker it contains.
        for radek in s.splitlines():
            if COMMON_STRINGS['to_train'] in radek:
                slovnik['kam'] = _inner_text(radek)
            elif COMMON_STRINGS['from_train'] in radek:
                slovnik['odkud'] = _inner_text(radek)
            elif COMMON_STRINGS['journey_date'] in radek and not den:
                # Only the first journey date in the piece is used.
                den = _inner_text(radek)
            elif COMMON_STRINGS['dep_time'] in radek:
                nalez = time_pattern.search(radek)
                if nalez:  # skip a departure line that carries no time
                    cas_odjezdu = nalez.group(0)
            elif COMMON_STRINGS['buy_button'] in radek:
                try:
                    cena = _ko_text(radek).replace("Kč", "")
                    # Drop all whitespace so the result does not depend on
                    # which space character separates the number from the
                    # currency (or groups thousands, if that occurs).
                    slovnik['cena'] = int("".join(cena.split()))
                except (ValueError, IndexError):
                    # Best effort: non-numeric button texts leave 'cena' unset.
                    pass
            elif COMMON_STRINGS['train_type'] in radek:
                vlak = _ko_text(radek)
                if vlak not in slovnik['vlaky']:
                    slovnik['vlaky'].append(vlak)
            elif COMMON_STRINGS['occupancy'] in radek:
                slovnik['obsazenost'].append(_inner_text(radek))
            elif COMMON_STRINGS['time_length'] in radek:
                jizdni_doba = _inner_text(radek).split(":")
                # First field times 60 plus the next two digits.
                slovnik['jizdni_doba'] = (int(jizdni_doba[0]) * 60) + int(jizdni_doba[1][:2])
            elif COMMON_STRINGS['distance'] in radek:
                try:
                    slovnik['vzdalenost'] = int(_inner_text(radek).replace(" km", ""))
                except ValueError:
                    pass
            elif "Náhradní autobusová doprava" in radek:
                slovnik['nahradni_bus'] = True
            elif COMMON_STRINGS['delay'] in radek:
                if "+" in radek:
                    try:
                        slovnik['zpozdeni'].append(int(radek.split('+')[1].strip().split(' ')[0]))
                    except ValueError:
                        # Keep a placeholder so list positions stay aligned.
                        slovnik['zpozdeni'].append('chyba')
                elif "><" in radek:
                    # Empty delay element is recorded as zero.
                    slovnik['zpozdeni'].append(0)
            elif "Místenka zdarma" in radek:
                slovnik['mistenka_zdarma'] = True

        if den and cas_odjezdu:
            # 'den' is split on a space: first part is stored lower-cased,
            # second part is a dotted date reversed into Y-m-d order.
            slovnik['den'] = den.split(' ')[0].lower().strip()
            date_str = '-'.join(den.split(' ')[1].split('.')[::-1])
            slovnik['odjezd'] = datetime.strptime(f"{date_str} {cas_odjezdu}", "%Y-%m-%d %H:%M")
            slovnik['predstih'] = slovnik['odjezd'] - slovnik['oscrapovano']

        # With a single train only the first occupancy entry is kept.
        if len(slovnik['vlaky']) == 1 and slovnik['obsazenost']:
            slovnik['obsazenost'] = [slovnik['obsazenost'][0]]

        # Keep every second delay entry.
        # NOTE(review): presumably each delay appears twice in the markup - confirm.
        slovnik['zpozdeni'] = slovnik['zpozdeni'][1::2]
        slovnik['prestupy'] = len(slovnik['vlaky']) - 1

        slovniky.append(slovnik)

    # Every record carries the default keys, so none is ever empty and no
    # filtering is needed here.
    return slovniky


def _inner_text(radek):
    """Return the text between the second-to-last '>' on the line and the next '<'."""
    return radek.split('>')[-2].split('<')[0]


def _ko_text(radek):
    """Return the text between the first '-->' on the line and the following '<!--'."""
    return radek.split('-->')[1].split('<!--')[0]
225 changes: 50 additions & 175 deletions 002_cd_cisteni.ipynb
Original file line number Diff line number Diff line change
@@ -1,21 +1,8 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f0149a9b-22a2-421e-984a-75c7ac37fed2",
"metadata": {},
"source": [
"Todo:\n",
"\n",
"- udělat to jako skript, co bere vše neoscrapované\n",
"\n",
"Sólo sešit:\n",
"- mazat oscrapované soubory"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 16,
"id": "e27c863b-fd97-4b32-b4b3-0825cb67f49e",
"metadata": {},
"outputs": [],
Expand All @@ -28,7 +15,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"id": "ad771ad2-9650-42bf-a22b-b9249d883ccd",
"metadata": {},
"outputs": [],
Expand All @@ -43,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 18,
"id": "4d3db07f-3600-48ac-b158-70ea455b365a",
"metadata": {},
"outputs": [],
Expand All @@ -66,50 +53,51 @@
" slovnik['oscrapovano'] = datetime.strptime(oscrapovano, \"%Y-%m-%d_%H-%M-%S\")\n",
" den = False\n",
" for radek in s:\n",
" if \"text: to(lastTrain($index()))\" in radek:\n",
" slovnik['kam'] = radek.split('>')[-2].split('<')[0]\n",
" if \"text: from(firstTrain($index()))\" in radek:\n",
" slovnik['odkud'] = radek.split('>')[-2].split('<')[0]\n",
" if (\"text: journeyDateTextFrom()\" in radek) and (den == False):\n",
" den = radek.split('>')[-2].split('<')[0]\n",
" if \"text: depTime(firstTrain($index()))\" in radek:\n",
" cas_odjezdu = re.search(r\"\\d{1,2}:\\d\\d\", radek).group(0)\n",
" if \"text: buyButtonText()\" in radek:\n",
" cena = radek.split('-->')[1].split('<!--')[0].replace(\"\",\"\")\n",
" try:\n",
" slovnik['cena'] = int(cena)\n",
" except:\n",
" pass\n",
" if \"Zjistit cenu\" in radek:\n",
" slovnik['cena_poznamka'] = \"Zjistit cenu\"\n",
" if \"Cena v dalším kroku\" in radek:\n",
" slovnik['cena_poznamka'] = \"Cena v dalším kroku\"\n",
" if \"text: model.trainTypeAndNum\" in radek:\n",
" if radek.split('-->')[1].split('<!--')[0] not in slovnik['vlaky']:\n",
" slovnik['vlaky'].append(radek.split('-->')[1].split('<!--')[0])\n",
" if \"\"\"<span aria-hidden=\"true\" data-bind=\"visible: occupancyLevelFull(), text: occupancyLevelText()\" style=\"display: none;\">\"\"\" in radek:\n",
" slovnik['obsazenost'].append(radek.split('>')[-2].split('<')[0])\n",
" if \"\"\"<span data-bind=\"text: timeLength, visible: timeLength != null &amp;&amp; timeLength != ''\">\"\"\" in radek:\n",
" jizdni_doba = radek.split('>')[-2].split('<')[0].split(\":\")\n",
" slovnik['jizdni_doba'] = (int(jizdni_doba[0]) * 60) + int(re.search(r'\\d\\d', jizdni_doba[1]).group(0))\n",
" if \"\"\"<span data-bind=\"text: distance, visible: distance != null &amp;&amp; distance != ''\" class=\"mobile-hidden\">\"\"\" in radek:\n",
" vzdalenost = radek.split('>')[-2].split('<')[0].replace(\" km\",\"\")\n",
" try:\n",
" slovnik['vzdalenost'] = int(vzdalenost)\n",
" except:\n",
" pass\n",
" if \"\"\"<span class=\"icon icon-bus\" data-bind=\"ifnot: isLegend, visible: !isLegend &amp;&amp; icoSrc, css: icoSrc, attr: {title: desc, 'aria-label': desc}\" title=\"Náhradní autobusová doprava\" aria-label=\"Náhradní autobusová doprava\"></span>\"\"\" in radek:\n",
" slovnik['nahradni_bus'] = True\n",
" if \"\"\"<span aria-hidden=\"true\" data-bind=\"text: delayText()\">\"\"\" in radek:\n",
" if \"+\" in radek:\n",
" if len(radek) < 500:\n",
" if \"text: to(lastTrain($index()))\" in radek:\n",
" slovnik['kam'] = radek.split('>')[-2].split('<')[0]\n",
" elif \"text: from(firstTrain($index()))\" in radek:\n",
" slovnik['odkud'] = radek.split('>')[-2].split('<')[0]\n",
" elif (\"text: journeyDateTextFrom()\" in radek) and (den == False):\n",
" den = radek.split('>')[-2].split('<')[0]\n",
" elif \"text: depTime(firstTrain($index()))\" in radek:\n",
" cas_odjezdu = re.search(r\"\\d{1,2}:\\d\\d\", radek).group(0)\n",
" elif \"text: buyButtonText()\" in radek:\n",
" cena = radek.split('-->')[1].split('<!--')[0].replace(\"\",\"\")\n",
" try:\n",
" slovnik['zpozdeni'].append(int(radek.split('+')[1].strip().split(' ')[0]))\n",
" slovnik['cena'] = int(cena)\n",
" except:\n",
" slovnik['zpozdeni'].append('chyba')\n",
" elif \"><\" in radek:\n",
" slovnik['zpozdeni'].append(0)\n",
" if \"Místenka zdarma\" in radek:\n",
" slovnik['mistenka_zdarma'] = True\n",
" pass\n",
" elif \"Zjistit cenu\" in radek:\n",
" slovnik['cena_poznamka'] = \"Zjistit cenu\"\n",
" elif \"Cena v dalším kroku\" in radek:\n",
" slovnik['cena_poznamka'] = \"Cena v dalším kroku\"\n",
" elif \"text: model.trainTypeAndNum\" in radek:\n",
" if radek.split('-->')[1].split('<!--')[0] not in slovnik['vlaky']:\n",
" slovnik['vlaky'].append(radek.split('-->')[1].split('<!--')[0])\n",
" elif \"\"\"<span aria-hidden=\"true\" data-bind=\"visible: occupancyLevelFull(), text: occupancyLevelText()\" style=\"display: none;\">\"\"\" in radek:\n",
" slovnik['obsazenost'].append(radek.split('>')[-2].split('<')[0])\n",
" elif \"\"\"<span data-bind=\"text: timeLength, visible: timeLength != null &amp;&amp; timeLength != ''\">\"\"\" in radek:\n",
" jizdni_doba = radek.split('>')[-2].split('<')[0].split(\":\")\n",
" slovnik['jizdni_doba'] = (int(jizdni_doba[0]) * 60) + int(re.search(r'\\d\\d', jizdni_doba[1]).group(0))\n",
" elif \"\"\"<span data-bind=\"text: distance, visible: distance != null &amp;&amp; distance != ''\" class=\"mobile-hidden\">\"\"\" in radek:\n",
" vzdalenost = radek.split('>')[-2].split('<')[0].replace(\" km\",\"\")\n",
" try:\n",
" slovnik['vzdalenost'] = int(vzdalenost)\n",
" except:\n",
" pass\n",
" elif \"\"\"<span class=\"icon icon-bus\" data-bind=\"ifnot: isLegend, visible: !isLegend &amp;&amp; icoSrc, css: icoSrc, attr: {title: desc, 'aria-label': desc}\" title=\"Náhradní autobusová doprava\" aria-label=\"Náhradní autobusová doprava\"></span>\"\"\" in radek:\n",
" slovnik['nahradni_bus'] = True\n",
" elif \"\"\"<span aria-hidden=\"true\" data-bind=\"text: delayText()\">\"\"\" in radek:\n",
" if \"+\" in radek:\n",
" try:\n",
" slovnik['zpozdeni'].append(int(radek.split('+')[1].strip().split(' ')[0]))\n",
" except:\n",
" slovnik['zpozdeni'].append('chyba')\n",
" elif \"><\" in radek:\n",
" slovnik['zpozdeni'].append(0)\n",
" elif \"Místenka zdarma\" in radek:\n",
" slovnik['mistenka_zdarma'] = True\n",
" slovnik['den'] = den.split(' ')[0].lower().strip()\n",
" slovnik['odjezd'] = datetime.strptime('-'.join(den.split(' ')[1].split('.')[::-1]) + \" \" + cas_odjezdu, \"%Y-%m-%d %H:%M\")\n",
" slovnik['predstih'] = slovnik['odjezd'] - slovnik['oscrapovano']\n",
Expand All @@ -128,121 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"id": "5b25ead6-ae07-4300-9bdb-da3074af6d8b",
"metadata": {},
"outputs": [],
"source": [
"def oscrapuj_cd(slozka, soubor):\n",
" # Pre-compile regular expressions for better performance\n",
" date_pattern = re.compile(r'\\d{4}-\\d{2}-\\d{2}_\\d{2}-\\d{2}-\\d{2}')\n",
" time_pattern = re.compile(r'\\d{1,2}:\\d\\d')\n",
" \n",
" with open(os.path.join(slozka, soubor), \"r\", encoding=\"utf-8\") as file:\n",
" raw_html = file.read()\n",
"\n",
" # Extract date once instead of per iteration\n",
" oscrapovano = date_pattern.search(soubor).group(0)\n",
" oscrapovano_dt = datetime.strptime(oscrapovano, \"%Y-%m-%d_%H-%M-%S\")\n",
" \n",
" # Split only once and store results\n",
" spojeni = raw_html.split(\"<article\")[1:]\n",
" \n",
" # Define common strings as constants to avoid string creation in loop\n",
" COMMON_STRINGS = {\n",
" 'to_train': 'text: to(lastTrain($index()))',\n",
" 'from_train': 'text: from(firstTrain($index()))',\n",
" 'journey_date': 'text: journeyDateTextFrom()',\n",
" 'dep_time': 'text: depTime(firstTrain($index()))',\n",
" 'buy_button': 'text: buyButtonText()',\n",
" 'train_type': 'text: model.trainTypeAndNum',\n",
" 'occupancy': '<span aria-hidden=\"true\" data-bind=\"visible: occupancyLevelFull(), text: occupancyLevelText()\"',\n",
" 'time_length': '<span data-bind=\"text: timeLength, visible: timeLength != null &amp;&amp; timeLength != \\'\\'\"',\n",
" 'distance': '<span data-bind=\"text: distance, visible: distance != null &amp;&amp; distance != \\'\"',\n",
" 'delay': '<span aria-hidden=\"true\" data-bind=\"text: delayText()\">'\n",
" }\n",
" \n",
" slovniky = []\n",
" \n",
" for s in spojeni:\n",
" # Initialize dictionary with default values\n",
" slovnik = {\n",
" 'prodejce': \"ČD\",\n",
" 'vlaky': [],\n",
" 'obsazenost': [],\n",
" 'zpozdeni': [],\n",
" 'oscrapovano': oscrapovano_dt,\n",
" 'prostredek': 'vlak'\n",
" }\n",
" \n",
" lines = s.splitlines()\n",
" den = None\n",
" cas_odjezdu = None\n",
" \n",
" for radek in lines:\n",
" # Use dictionary lookup instead of multiple if statements\n",
" if COMMON_STRINGS['to_train'] in radek:\n",
" slovnik['kam'] = radek.split('>')[-2].split('<')[0]\n",
" elif COMMON_STRINGS['from_train'] in radek:\n",
" slovnik['odkud'] = radek.split('>')[-2].split('<')[0]\n",
" elif COMMON_STRINGS['journey_date'] in radek and not den:\n",
" den = radek.split('>')[-2].split('<')[0]\n",
" elif COMMON_STRINGS['dep_time'] in radek:\n",
" cas_odjezdu = time_pattern.search(radek).group(0)\n",
" elif COMMON_STRINGS['buy_button'] in radek:\n",
" cena = radek.split('-->')[1].split('<!--')[0].replace(\"\",\"\")\n",
" try:\n",
" slovnik['cena'] = int(cena)\n",
" except ValueError:\n",
" pass\n",
" elif COMMON_STRINGS['train_type'] in radek:\n",
" vlak = radek.split('-->')[1].split('<!--')[0]\n",
" if vlak not in slovnik['vlaky']:\n",
" slovnik['vlaky'].append(vlak)\n",
" elif COMMON_STRINGS['occupancy'] in radek:\n",
" slovnik['obsazenost'].append(radek.split('>')[-2].split('<')[0])\n",
" elif COMMON_STRINGS['time_length'] in radek:\n",
" jizdni_doba = radek.split('>')[-2].split('<')[0].split(\":\")\n",
" slovnik['jizdni_doba'] = (int(jizdni_doba[0]) * 60) + int(jizdni_doba[1][:2])\n",
" elif COMMON_STRINGS['distance'] in radek:\n",
" try:\n",
" slovnik['vzdalenost'] = int(radek.split('>')[-2].split('<')[0].replace(\" km\",\"\"))\n",
" except ValueError:\n",
" pass\n",
" elif \"Náhradní autobusová doprava\" in radek:\n",
" slovnik['nahradni_bus'] = True\n",
" elif COMMON_STRINGS['delay'] in radek:\n",
" if \"+\" in radek:\n",
" try:\n",
" slovnik['zpozdeni'].append(int(radek.split('+')[1].strip().split(' ')[0]))\n",
" except ValueError:\n",
" slovnik['zpozdeni'].append('chyba')\n",
" elif \"><\" in radek:\n",
" slovnik['zpozdeni'].append(0)\n",
" elif \"Místenka zdarma\" in radek:\n",
" slovnik['mistenka_zdarma'] = True\n",
" \n",
" if den and cas_odjezdu:\n",
" slovnik['den'] = den.split(' ')[0].lower().strip()\n",
" date_str = '-'.join(den.split(' ')[1].split('.')[::-1])\n",
" slovnik['odjezd'] = datetime.strptime(f\"{date_str} {cas_odjezdu}\", \"%Y-%m-%d %H:%M\")\n",
" slovnik['predstih'] = slovnik['odjezd'] - slovnik['oscrapovano']\n",
" \n",
" # Post-processing\n",
" if len(slovnik['vlaky']) == 1 and slovnik['obsazenost']:\n",
" slovnik['obsazenost'] = [slovnik['obsazenost'][0]]\n",
" \n",
" slovnik['zpozdeni'] = slovnik['zpozdeni'][1::2]\n",
" slovnik['prestupy'] = len(slovnik['vlaky']) - 1\n",
" \n",
" slovniky.append(slovnik)\n",
" \n",
" return [x for x in slovniky if x]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"id": "e297e728-2198-4215-af23-6cf12ab2802f",
"metadata": {
"scrolled": true
Expand All @@ -252,15 +126,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"cd_2024-11-10.parquet\n",
"2024-11-10: 777\n"
"cd_2024-11-14.parquet\n",
"2024-11-14: 808\n"
]
}
],
"source": [
"kam = \"data\" \n",
"os.makedirs(kam, exist_ok=True)\n",
"hotove = [y for y in os.listdir(kam) if y[0:3] == \"cd_\"]\n",
"hotove = hotove\n",
"# hotove = []\n",
"for x in os.listdir(\"downloads\"):\n",
" nazev_souboru = \"cd_\" + x + \".parquet\"\n",
Expand Down
Loading

0 comments on commit 6ce1097

Please sign in to comment.