Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ecoulement des cours d'eau #10

Merged
merged 7 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions cl_hubeau/watercourses_flow/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

from .watercourses_flow_scraper import WatercoursesFlowSession
from .utils import get_all_stations, get_all_observations


# Public API of the watercourses_flow subpackage: the two bulk-retrieval
# helpers and the low-level session class imported above.
__all__ = [
    "get_all_stations",
    "get_all_observations",
    "WatercoursesFlowSession",
]
126 changes: 126 additions & 0 deletions cl_hubeau/watercourses_flow/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
from datetime import date, datetime
from itertools import product

from cl_hubeau.watercourses_flow.watercourses_flow_scraper import (
WatercoursesFlowSession,
)
from cl_hubeau import _config
from cl_hubeau.utils import get_departements


def get_all_stations(**kwargs) -> gpd.GeoDataFrame:
    """
    Retrieve all stations from France.

    The API is queried once per departement (as returned by
    ``get_departements``) and the partial results are concatenated.

    Parameters
    ----------
    **kwargs :
        kwargs passed to WatercoursesFlowSession.get_stations (hence mostly
        intended for hub'eau API's arguments). Do not use `format` or
        `code_departement` as they are set by the current function.

    Returns
    -------
    results : gpd.GeoDataFrame
        GeoDataFrame of stations, deduplicated on `code_station` when that
        column is present. An empty GeoDataFrame is returned when no
        departement yields any station.

    """

    with WatercoursesFlowSession() as session:

        deps = get_departements()
        results = [
            session.get_stations(code_departement=dep, format="geojson", **kwargs)
            for dep in tqdm(
                deps,
                desc="querying dep/dep",
                leave=_config["TQDM_LEAVE"],
                position=tqdm._get_free_pos(),
            )
        ]

    # Discard empty responses and columns holding only missing values before
    # concatenating the per-departement frames.
    results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
    if not results:
        # pd.concat raises ValueError on an empty list of objects.
        return gpd.GeoDataFrame()

    results = pd.concat(results, ignore_index=True)
    if "code_station" in results.columns:
        results = results.drop_duplicates("code_station")
    return results


def _yearly_periods(start: date, end: date) -> list:
    """Split [start, end] into one [min, max] pair of ISO dates per calendar year."""
    if start > end:
        return []
    return [
        [
            max(start, date(year, 1, 1)).isoformat(),
            min(end, date(year, 12, 31)).isoformat(),
        ]
        for year in range(start.year, end.year + 1)
    ]


def get_all_observations(**kwargs) -> gpd.GeoDataFrame:
    """
    Retrieve all observations from France.

    The API is queried once per (departement, period) pair. Periods are
    calendar years clipped to the requested date interval. When
    `date_observation_min` is not given, yearly periods start on 2016-01-01
    and one extra period starting on 1900-01-01 covers everything before.

    Parameters
    ----------
    **kwargs :
        kwargs passed to WatercoursesFlowSession.get_observations (hence
        mostly intended for hub'eau API's arguments). Do not use `format` or
        `code_departement` as they are set by the current function.
        `date_observation_min` and `date_observation_max`, if given, must be
        strings formatted as YYYY-MM-DD; `date_observation_max` defaults to
        today.

    Returns
    -------
    results : gpd.GeoDataFrame
        GeoDataFrame of observations. An empty GeoDataFrame is returned when
        no query yields any observation.
    """

    deps = get_departements()

    # Set a loop for yearly querying as dataset are big
    start_auto_determination = "date_observation_min" not in kwargs
    start = datetime.strptime(
        kwargs.pop("date_observation_min", "2016-01-01"), "%Y-%m-%d"
    ).date()
    if "date_observation_max" in kwargs:
        end = datetime.strptime(
            kwargs.pop("date_observation_max"), "%Y-%m-%d"
        ).date()
    else:
        end = date.today()

    periods = _yearly_periods(start, end)
    if start_auto_determination:
        # No lower bound requested: fetch everything before 2016 in a single
        # period, without exceeding the requested upper bound.
        historical_start = date(1900, 1, 1)
        historical_end = min(end, date(2015, 12, 31))
        if historical_start <= historical_end:
            periods.insert(
                0, [historical_start.isoformat(), historical_end.isoformat()]
            )

    args = list(product(deps, periods))

    with WatercoursesFlowSession() as session:

        results = [
            session.get_observations(
                format="geojson",
                date_observation_min=date_min,
                date_observation_max=date_max,
                **{"code_departement": chunk},
                **kwargs,
            )
            for chunk, (date_min, date_max) in tqdm(
                args,
                desc="querying station/station and year/year",
                leave=_config["TQDM_LEAVE"],
                position=tqdm._get_free_pos(),
            )
        ]

    # TODO(review): refactor using the new prepare_kwargs_loops function
    # (this loop is meant to be an example of its use).
    # NOTE(review): the number of results was reported to differ between
    # methods: 320597 when using the observation API directly on the website,
    # 316714 with the previous implementation, and 308268 with
    # prepare_kwargs_loops. To be investigated.

    # Discard empty responses and columns holding only missing values before
    # concatenating the partial frames.
    results = [x.dropna(axis=1, how="all") for x in results if not x.empty]
    if not results:
        # pd.concat raises ValueError on an empty list of objects.
        return gpd.GeoDataFrame()

    results = pd.concat(results, ignore_index=True)
    return results


# if __name__ == "__main__":
# # print(get_all_stations())
# print(get_all_observations())
Loading