feat(docker): upgrade python and packages version
ahonestla committed Apr 9, 2024
1 parent f4edcbb commit 4d155ab
Showing 6 changed files with 103 additions and 345 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-FROM python:3.6
+FROM python:3.11
 
 WORKDIR /src
97 changes: 48 additions & 49 deletions project/server/main/load_paysage.py
@@ -18,7 +18,7 @@
 from project.server.main.logger import get_logger
 from project.server.main.my_elastic import MyElastic
 from project.server.main.utils import (
-    download_insee_data,
+    city_zone_emploi_insee,
     get_alpha2_from_french,
     FRENCH_STOP,
     clean_list,
@@ -88,10 +88,10 @@ def load_paysage(index_prefix: str = "matcher") -> dict:
         es_data[criterion] = {}
 
     # Download paysage data
-    raw_data = download_data()
+    raw_records = download_data()
 
     # Transform paysage data
-    transformed_data = transform_data(raw_data)
+    transformed_data = transform_data(raw_records)
 
     # Iterate over paysage data
     logger.debug("Prepare data for elastic")
@@ -153,55 +153,58 @@ def load_paysage(index_prefix: str = "matcher") -> dict:
 
 def download_data() -> list:
     logger.debug(f"Download Paysage data from {ODS_PAYSAGE}")
-    data = (
-        pd.read_csv(
-            f"https://data.enseignementsup-recherche.gouv.fr/explore/dataset/{ODS_PAYSAGE}/download/?format=csv&apikey={ODS_KEY}",
-            sep=";",
-            low_memory=False,
-        )
-        .replace(np.nan, None)
-        .to_dict(orient="records")
+    data = pd.read_csv(
+        f"https://data.enseignementsup-recherche.gouv.fr/explore/dataset/{ODS_PAYSAGE}/download/?format=csv&apikey={ODS_KEY}",
+        sep=";",
+        low_memory=False,
     )
-    return data
+    records = data.replace(np.nan, None).to_dict(orient="records")
+    return records
 
 
-def transform_data(data: list) -> list:
-    logger.debug(f"Start transform of Paysage data ({len(data)} records)")
+def transform_data(records: list) -> list:
+    logger.debug(f"Start transform of Paysage data ({len(records)} records)")
 
     # Loading zone emploi data
-    logger.debug(f"Download insee data")
-    zone_emploi_insee = download_insee_data()
-    zone_emploi_composition = {}
-    city_zone_emploi = {}
-    for d in zone_emploi_insee:
-        city = d["LIBGEO"]
-        city_code = d["CODGEO"]
-        ze = d["LIBZE2020"]
-        if ze not in zone_emploi_composition:
-            zone_emploi_composition[ze] = []
-        zone_emploi_composition[ze].append(city)
-        if city_code not in city_zone_emploi:
-            city_zone_emploi[city_code] = []
-        city_zone_emploi[city_code].append(ze)
+    logger.debug(f"Load insee data")
+    try:
+        city_zone_emploi, zone_emploi_composition = city_zone_emploi_insee()
+    except Exception as error:
+        city_zone_emploi = {}
+        zone_emploi_composition = {}
+        logger.error(f"Error while loading insee data: {error}")
 
     # Setting a dict with all names, acronyms and cities
+    logger.debug("Get data from Paysage records")
     name_acronym_city = {}
-    for d in data:
-        current_id = d["identifiant_interne"]
+    for record in records:
+        current_id = record["identifiant_interne"]
         name_acronym_city[current_id] = {}
 
         # Acronyms
-        acronyms = [d.get("sigle")] if d.get("sigle") else []
+        acronyms, names = [], []
+        sigle = record.get("sigle")
+        name_short = record.get("nom_court")
+        if sigle:
+            acronyms.append(sigle)
+        if name_short:
+            if name_short.isalnum():
+                acronyms.append(name_short)
+            else:
+                names.append(name_short)
         # Names
-        labels = ["uo_lib", "uo_lib_officiel", "uo_lib_en", "nom_court"]
-        names = [d.get(name) for name in labels if d.get(name)]
-        names = list(set(names))
+        labels = ["uo_lib", "uo_lib_officiel", "uo_lib_en"]
+        names += [record.get(name) for name in labels if record.get(name)]
+        names = list(set(names) - set(acronyms))
 
         # Cities, country_alpha2, and zone_emploi
        cities, country_alpha2, zone_emploi = [], [], []
-        city = d.get("com_nom")
-        city_code = d.get("com_code")
-        country = d.get("pays_etranger_acheminement")
+        city = record.get("com_nom")
+        clean_city = " ".join([s for s in city.split(" ") if s.isalpha()])
+        city = clean_city if clean_city else city
+        city_code = record.get("com_code")
+        country = record.get("pays_etranger_acheminement")
         if city:
             cities.append(city)
         if city_code in city_zone_emploi:
@@ -211,7 +214,7 @@ def transform_data(data: list) -> list:
             country_alpha2.append(alpha2)
 
         name_acronym_city[current_id]["city"] = clean_list(data=cities)
-        name_acronym_city[current_id]["zone_emploi"] = clean_list(data=zone_emploi)
+        name_acronym_city[current_id]["zone_emploi"] = clean_list(zone_emploi)
         name_acronym_city[current_id]["acronym"] = clean_list(data=acronyms, ignored=ACRONYM_IGNORED, min_character=2)
         name_acronym_city[current_id]["name"] = clean_list(data=names, stopwords=FRENCH_STOP, min_token=2)
         country_alpha2 = clean_list(data=country_alpha2)
@@ -221,8 +224,8 @@ def transform_data(data: list) -> list:
 
     logger.debug("Transform records to elastic indexes")
     es_paysages = []
-    for d in data:
-        paysage_id = d.get("identifiant_interne")
+    for record in records:
+        paysage_id = record.get("identifiant_interne")
         es_paysage = {"id": paysage_id}
         # Acronyms & names
         es_paysage["acronym"] = name_acronym_city[paysage_id]["acronym"]
@@ -232,30 +235,26 @@ def transform_data(data: list) -> list:
         es_paysage["city"] = name_acronym_city[paysage_id]["city"]
         es_paysage["country_alpha2"] = name_acronym_city[paysage_id]["country_alpha2"]
         es_paysage["country_code"] = [name_acronym_city[paysage_id]["country_alpha2"]]
-        # For zone emploi, all the cities around are added, so that, eg, Bordeaux is in
-        # zone_emploi of a lab located in Talence
-        es_paysage["zone_emploi"] = []
-        for ze in name_acronym_city[paysage_id]["zone_emploi"]:
-            es_paysage["zone_emploi"] += zone_emploi_composition[ze]
-        es_paysage["zone_emploi"] = clean_list(es_paysage["zone_emploi"])
+        # Zone emploi
+        es_paysage["zone_emploi"] = name_acronym_city[paysage_id]["zone_emploi"]
         # Wikidata
-        wikidata = d.get("identifiant_wikidata")
+        wikidata = record.get("identifiant_wikidata")
         if wikidata:
             es_paysage["wikidata"] = wikidata
         # Dates
         last_year = f"{datetime.date.today().year}"
-        start_date = d.get("date_creation")
+        start_date = record.get("date_creation")
         if not start_date:
             start_date = "2010"
         start = int(start_date[0:4])
-        end_date = d.get("date_fermeture")
+        end_date = record.get("date_fermeture")
         if not end_date:
             end_date = last_year
         end = int(end_date[0:4])
         # Start date one year before official as it can be used before sometimes
         es_paysage["year"] = [str(y) for y in list(range(start - 1, end + 1))]
         # Url
-        url = d.get("url")
+        url = record.get("url")
         if isinstance(url, list):
             raise Exception("found list url", url)
         if url:
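Note: the new com_nom handling in transform_data keeps only purely alphabetic tokens and falls back to the raw value when nothing survives. A standalone sketch of that logic (hypothetical function name and inputs, not part of the commit):

    def clean_com_nom(city: str) -> str:
        # Keep only purely alphabetic tokens: "Paris 5e Arrondissement" -> "Paris Arrondissement"
        clean_city = " ".join([s for s in city.split(" ") if s.isalpha()])
        # Fall back to the raw value when no token is purely alphabetic
        return clean_city if clean_city else city

    assert clean_com_nom("Paris 5e Arrondissement") == "Paris Arrondissement"
    assert clean_com_nom("L'Haÿ-les-Roses") == "L'Haÿ-les-Roses"  # apostrophe and hyphens fail isalpha(), so the raw name is kept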
1 change: 0 additions & 1 deletion project/server/main/match_paysage.py
@@ -37,7 +37,6 @@ def match_paysage(conditions: dict) -> dict:
             equivalent_strategies_copy.append(strategy + ["paysage_year"])
         strategies_copy.append(equivalent_strategies_copy)
     strategies = strategies_copy
-    print("match_paysage:", strategies)
     matcher = Matcher()
     return matcher.match(
         field="paysages",
42 changes: 34 additions & 8 deletions project/server/main/utils.py
@@ -70,19 +70,21 @@ def remove_parenthesis(x):
     return x
 
 
-def clean_list(data: list, stopwords=[], ignored=[], remove_inside_parenthesis=True, min_token=1, min_character = 1) -> list:
+def clean_list(
+    data: list, stopwords=[], ignored=[], remove_inside_parenthesis=True, min_token=1, min_character=1
+) -> list:
     # Cast data into list if needed
     if not isinstance(data, list):
         data = [data]
     data = list(filter(None, data))
-    # Remove duplicates
+    # Remove duplicates and non str
     data = [k for k in list(set(data)) if k and isinstance(k, str)]
     for ix, e in enumerate(data):
         if remove_inside_parenthesis:
             e = remove_parenthesis(e)
         if stopwords:
             e = remove_stop(e, stopwords)
-        data[ix] = e
+        data[ix] = e.strip()
     new_data = []
     for k in data:
         k_normalized = normalize_text(k, remove_separator=False)
@@ -95,6 +97,7 @@ def clean_list(data: list, stopwords=[], ignored=[], remove_inside_parenthesis=T
             new_data.append(k)
     return new_data
 
+
 def chunks(lst: list, n: int) -> list:
     """Yield successive n-sized chunks from list."""
     for i in range(0, len(lst), n):
@@ -129,7 +132,7 @@ def strip_accents(text: str) -> str:
 
 def delete_punctuation(text: str) -> str:
     """Delete all punctuation in a string."""
-    return text.lower().translate(str.maketrans(string.punctuation, len(string.punctuation) * ' '))
+    return text.translate(str.maketrans(string.punctuation, len(string.punctuation) * " "))
 
 
 def normalize_text(text: str = None, remove_separator: bool = True, re_order: bool = False, to_lower: bool = False) -> str:
@@ -145,7 +148,7 @@ def normalize_text(text: str = None, remove_separator: bool = True, re_order: bo
     if re_order:
         text_split.sort()
     text = sep.join(text_split)
-    return text or ''
+    return text or ""
 
 
 def get_alpha2_from_french(user_input):
@@ -174,7 +177,7 @@ def get_alpha2_from_french(user_input):
     return ref.get(user_input)
 
 
-def download_insee_data() -> dict:
+def download_insee_data() -> list:
     insee_downloaded_file = 'insee_data_dump.zip'
     insee_unzipped_folder = mkdtemp()
     response = requests.get(url=ZONE_EMPLOI_INSEE_DUMP, stream=True, verify=False)
@@ -183,13 +186,36 @@ def download_insee_data() -> dict:
             file.write(chunk)
     with ZipFile(insee_downloaded_file, 'r') as file:
         file.extractall(insee_unzipped_folder)
-    data = pd.read_excel(f'{insee_unzipped_folder}/ZE2020_au_01-01-2023.xlsx', sheet_name='Composition_communale',
-                         skiprows=5).to_dict(orient='records')
+    data = pd.read_excel(
+        f"{insee_unzipped_folder}/ZE2020_au_01-01-2024.xlsx",
+        sheet_name="Composition_communale",
+        engine="calamine",
+        skiprows=5,
+    ).to_dict(orient="records")
     os.remove(path=insee_downloaded_file)
     shutil.rmtree(path=insee_unzipped_folder)
     return data
 
 
+def city_zone_emploi_insee() -> tuple[dict, dict]:
+    zone_emploi_composition = {}
+    city_zone_emploi = {}
+
+    zone_emploi_insee = download_insee_data()
+    for d in zone_emploi_insee:
+        city = d["LIBGEO"]
+        city_code = d["CODGEO"]
+        ze = d["LIBZE2020"]
+        if ze not in zone_emploi_composition:
+            zone_emploi_composition[ze] = []
+        zone_emploi_composition[ze].append(city)
+        if city_code not in city_zone_emploi:
+            city_zone_emploi[city_code] = []
+        city_zone_emploi[city_code].append(ze)
+
+    return city_zone_emploi, zone_emploi_composition
+
+
 def has_a_digit(text: str = '') -> bool:
     for char in text:
         if char.isdigit():
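Note: a minimal usage sketch of the new city_zone_emploi_insee helper (illustrative only; the INSEE code below is an assumption, not taken from the commit):

    from project.server.main.utils import city_zone_emploi_insee

    # Downloads the INSEE zone d'emploi dump and builds both lookups in one call
    city_zone_emploi, zone_emploi_composition = city_zone_emploi_insee()
    # INSEE city code -> list of zone d'emploi labels, e.g. for Talence (code assumed)
    print(city_zone_emploi.get("33522"))
    # zone d'emploi label -> list of member cities
    print(zone_emploi_composition.get("Bordeaux"))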
43 changes: 20 additions & 23 deletions requirements.txt
@@ -1,23 +1,20 @@
-# Dev dependencies
-pytest==6.2.3
-pytest-mock==3.5.1
-requests-mock==1.9.2
-
-# Dependencies
-beautifulsoup4==4.8.2
-contextvars==2.4
-elasticsearch==7.8.0
-elasticsearch-dsl==7.2.1
-Flask==1.1.1
-Flask-Bootstrap==3.3.7.1
-fuzzywuzzy==0.18.0
-geopy==2.1.0
-lxml==4.9.1
-pandas==0.25.3
-pycountry==20.7.3
-python-Levenshtein==0.21.1
-redis==3.5.3
-requests==2.25.0
-rq==1.9.0
-xlrd==1.1.0
-XlsxWriter==1.0.4
+pytest==8.1.1
+pytest-mock==3.14.0
+requests-mock==1.12.1
+beautifulsoup4==4.12.3
+contextvars==2.4
+elasticsearch==7.8.0
+elasticsearch-dsl==7.2.1
+Flask==3.0.3
+Flask-Bootstrap==3.3.7.1
+fuzzywuzzy==0.18.0
+geopy==2.4.1
+lxml==5.2.1
+pandas==2.2.1
+pycountry==23.12.11
+python-calamine==0.2.0
+python-Levenshtein==0.25.1
+redis==5.0.3
+requests==2.31.0
+rq==1.16.1
+XlsxWriter==3.2.0
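Note: the new python-calamine pin backs the engine="calamine" argument now passed to pd.read_excel in utils.py; a minimal sketch of that engine selection (file name reused from the diff above, local path assumed):

    import pandas as pd

    # pandas >= 2.2 delegates Excel parsing to the Rust-based calamine reader
    # when python-calamine is installed and engine="calamine" is requested
    df = pd.read_excel(
        "ZE2020_au_01-01-2024.xlsx",
        sheet_name="Composition_communale",
        skiprows=5,
        engine="calamine",
    )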