feat(docker): upgrade python and packages version
ahonestla committed Apr 9, 2024
1 parent f4edcbb commit 4d155ab
Showing 6 changed files with 103 additions and 345 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-FROM python:3.6
+FROM python:3.11
 
 WORKDIR /src
97 changes: 48 additions & 49 deletions project/server/main/load_paysage.py
@@ -18,7 +18,7 @@
 from project.server.main.logger import get_logger
 from project.server.main.my_elastic import MyElastic
 from project.server.main.utils import (
-    download_insee_data,
+    city_zone_emploi_insee,
     get_alpha2_from_french,
     FRENCH_STOP,
     clean_list,
@@ -88,10 +88,10 @@ def load_paysage(index_prefix: str = "matcher") -> dict:
         es_data[criterion] = {}
 
     # Download paysage data
-    raw_data = download_data()
+    raw_records = download_data()
 
     # Transform paysage data
-    transformed_data = transform_data(raw_data)
+    transformed_data = transform_data(raw_records)
 
     # Iterate over paysage data
     logger.debug("Prepare data for elastic")
@@ -153,55 +153,58 @@ def load_paysage(index_prefix: str = "matcher") -> dict:
 
 def download_data() -> list:
     logger.debug(f"Download Paysage data from {ODS_PAYSAGE}")
-    data = (
-        pd.read_csv(
-            f"https://data.enseignementsup-recherche.gouv.fr/explore/dataset/{ODS_PAYSAGE}/download/?format=csv&apikey={ODS_KEY}",
-            sep=";",
-            low_memory=False,
-        )
-        .replace(np.nan, None)
-        .to_dict(orient="records")
+    data = pd.read_csv(
+        f"https://data.enseignementsup-recherche.gouv.fr/explore/dataset/{ODS_PAYSAGE}/download/?format=csv&apikey={ODS_KEY}",
+        sep=";",
+        low_memory=False,
     )
-    return data
+    records = data.replace(np.nan, None).to_dict(orient="records")
+    return records
 
 
-def transform_data(data: list) -> list:
-    logger.debug(f"Start transform of Paysage data ({len(data)} records)")
+def transform_data(records: list) -> list:
+    logger.debug(f"Start transform of Paysage data ({len(records)} records)")
 
     # Loading zone emploi data
-    logger.debug(f"Download insee data")
-    zone_emploi_insee = download_insee_data()
-    zone_emploi_composition = {}
-    city_zone_emploi = {}
-    for d in zone_emploi_insee:
-        city = d["LIBGEO"]
-        city_code = d["CODGEO"]
-        ze = d["LIBZE2020"]
-        if ze not in zone_emploi_composition:
-            zone_emploi_composition[ze] = []
-        zone_emploi_composition[ze].append(city)
-        if city_code not in city_zone_emploi:
-            city_zone_emploi[city_code] = []
-        city_zone_emploi[city_code].append(ze)
+    logger.debug(f"Load insee data")
+    try:
+        city_zone_emploi, zone_emploi_composition = city_zone_emploi_insee()
+    except Exception as error:
+        city_zone_emploi = {}
+        zone_emploi_composition = {}
+        logger.error(f"Error while loading insee data: {error}")
 
     # Setting a dict with all names, acronyms and cities
+    logger.debug("Get data from Paysage records")
     name_acronym_city = {}
-    for d in data:
-        current_id = d["identifiant_interne"]
+    for record in records:
+        current_id = record["identifiant_interne"]
         name_acronym_city[current_id] = {}
 
         # Acronyms
-        acronyms = [d.get("sigle")] if d.get("sigle") else []
+        acronyms, names = [], []
+        sigle = record.get("sigle")
+        name_short = record.get("nom_court")
+        if sigle:
+            acronyms.append(sigle)
+        if name_short:
+            if name_short.isalnum():
+                acronyms.append(name_short)
+            else:
+                names.append(name_short)
         # Names
-        labels = ["uo_lib", "uo_lib_officiel", "uo_lib_en", "nom_court"]
-        names = [d.get(name) for name in labels if d.get(name)]
-        names = list(set(names))
+        labels = ["uo_lib", "uo_lib_officiel", "uo_lib_en"]
+        names += [record.get(name) for name in labels if record.get(name)]
+        names = list(set(names) - set(acronyms))
 
         # Cities, country_alpha2, and zone_emploi
        cities, country_alpha2, zone_emploi = [], [], []
-        city = d.get("com_nom")
-        city_code = d.get("com_code")
-        country = d.get("pays_etranger_acheminement")
+        city = record.get("com_nom")
+        clean_city = " ".join([s for s in city.split(" ") if s.isalpha()])
+        city = clean_city if clean_city else city
+        city_code = record.get("com_code")
+        country = record.get("pays_etranger_acheminement")
         if city:
             cities.append(city)
         if city_code in city_zone_emploi:
@@ -211,7 +214,7 @@ def transform_data(data: list) -> list:
             country_alpha2.append(alpha2)
 
         name_acronym_city[current_id]["city"] = clean_list(data=cities)
-        name_acronym_city[current_id]["zone_emploi"] = clean_list(data=zone_emploi)
+        name_acronym_city[current_id]["zone_emploi"] = clean_list(zone_emploi)
         name_acronym_city[current_id]["acronym"] = clean_list(data=acronyms, ignored=ACRONYM_IGNORED, min_character=2)
         name_acronym_city[current_id]["name"] = clean_list(data=names, stopwords=FRENCH_STOP, min_token=2)
         country_alpha2 = clean_list(data=country_alpha2)
@@ -221,8 +224,8 @@ def transform_data(data: list) -> list:
 
     logger.debug("Transform records to elastic indexes")
     es_paysages = []
-    for d in data:
-        paysage_id = d.get("identifiant_interne")
+    for record in records:
+        paysage_id = record.get("identifiant_interne")
         es_paysage = {"id": paysage_id}
         # Acronyms & names
         es_paysage["acronym"] = name_acronym_city[paysage_id]["acronym"]
@@ -232,30 +235,26 @@ def transform_data(data: list) -> list:
         es_paysage["city"] = name_acronym_city[paysage_id]["city"]
         es_paysage["country_alpha2"] = name_acronym_city[paysage_id]["country_alpha2"]
         es_paysage["country_code"] = [name_acronym_city[paysage_id]["country_alpha2"]]
-        # For zone emploi, all the cities around are added, so that, eg, Bordeaux is in
-        # zone_emploi of a lab located in Talence
-        es_paysage["zone_emploi"] = []
-        for ze in name_acronym_city[paysage_id]["zone_emploi"]:
-            es_paysage["zone_emploi"] += zone_emploi_composition[ze]
-        es_paysage["zone_emploi"] = clean_list(es_paysage["zone_emploi"])
+        # Zone emploi
+        es_paysage["zone_emploi"] = name_acronym_city[paysage_id]["zone_emploi"]
         # Wikidata
-        wikidata = d.get("identifiant_wikidata")
+        wikidata = record.get("identifiant_wikidata")
         if wikidata:
             es_paysage["wikidata"] = wikidata
         # Dates
         last_year = f"{datetime.date.today().year}"
-        start_date = d.get("date_creation")
+        start_date = record.get("date_creation")
         if not start_date:
             start_date = "2010"
         start = int(start_date[0:4])
-        end_date = d.get("date_fermeture")
+        end_date = record.get("date_fermeture")
         if not end_date:
             end_date = last_year
         end = int(end_date[0:4])
         # Start date one year before official as it can be used before sometimes
         es_paysage["year"] = [str(y) for y in list(range(start - 1, end + 1))]
         # Url
-        url = d.get("url")
+        url = record.get("url")
         if isinstance(url, list):
             raise Exception("found list url", url)
         if url:
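Note: the new com_nom handling in transform_data keeps only purely alphabetic tokens and falls back to the raw value when nothing survives. A standalone sketch of that logic (hypothetical function name and inputs, not part of the commit):

    def clean_com_nom(city: str) -> str:
        # Keep only purely alphabetic tokens: "Paris 5e Arrondissement" -> "Paris Arrondissement"
        clean_city = " ".join([s for s in city.split(" ") if s.isalpha()])
        # Fall back to the raw value when no token is purely alphabetic
        return clean_city if clean_city else city

    assert clean_com_nom("Paris 5e Arrondissement") == "Paris Arrondissement"
    assert clean_com_nom("L'Haÿ-les-Roses") == "L'Haÿ-les-Roses"  # apostrophe and hyphens fail isalpha(), so the raw name is kept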
1 change: 0 additions & 1 deletion project/server/main/match_paysage.py
@@ -37,7 +37,6 @@ def match_paysage(conditions: dict) -> dict:
             equivalent_strategies_copy.append(strategy + ["paysage_year"])
         strategies_copy.append(equivalent_strategies_copy)
     strategies = strategies_copy
-    print("match_paysage:", strategies)
     matcher = Matcher()
     return matcher.match(
         field="paysages",
42 changes: 34 additions & 8 deletions project/server/main/utils.py
@@ -70,19 +70,21 @@ def remove_parenthesis(x):
     return x
 
 
-def clean_list(data: list, stopwords=[], ignored=[], remove_inside_parenthesis=True, min_token=1, min_character = 1) -> list:
+def clean_list(
+    data: list, stopwords=[], ignored=[], remove_inside_parenthesis=True, min_token=1, min_character=1
+) -> list:
     # Cast data into list if needed
     if not isinstance(data, list):
         data = [data]
     data = list(filter(None, data))
-    # Remove duplicates
+    # Remove duplicates and non str
     data = [k for k in list(set(data)) if k and isinstance(k, str)]
     for ix, e in enumerate(data):
         if remove_inside_parenthesis:
             e = remove_parenthesis(e)
         if stopwords:
             e = remove_stop(e, stopwords)
-        data[ix] = e
+        data[ix] = e.strip()
     new_data = []
     for k in data:
         k_normalized = normalize_text(k, remove_separator=False)
@@ -95,6 +97,7 @@ def clean_list(data: list, stopwords=[], ignored=[], remove_inside_parenthesis=T
             new_data.append(k)
     return new_data
 
+
 def chunks(lst: list, n: int) -> list:
     """Yield successive n-sized chunks from list."""
     for i in range(0, len(lst), n):
@@ -129,7 +132,7 @@ def strip_accents(text: str) -> str:
 
 def delete_punctuation(text: str) -> str:
     """Delete all punctuation in a string."""
-    return text.lower().translate(str.maketrans(string.punctuation, len(string.punctuation) * ' '))
+    return text.translate(str.maketrans(string.punctuation, len(string.punctuation) * " "))
 
 
 def normalize_text(text: str = None, remove_separator: bool = True, re_order: bool = False, to_lower: bool = False) -> str:
@@ -145,7 +148,7 @@ def normalize_text(text: str = None, remove_separator: bool = True, re_order: bo
     if re_order:
         text_split.sort()
     text = sep.join(text_split)
-    return text or ''
+    return text or ""
 
 
 def get_alpha2_from_french(user_input):
@@ -174,7 +177,7 @@ def get_alpha2_from_french(user_input):
     return ref.get(user_input)
 
 
-def download_insee_data() -> dict:
+def download_insee_data() -> list:
     insee_downloaded_file = 'insee_data_dump.zip'
     insee_unzipped_folder = mkdtemp()
     response = requests.get(url=ZONE_EMPLOI_INSEE_DUMP, stream=True, verify=False)
@@ -183,13 +186,36 @@ def download_insee_data() -> dict:
             file.write(chunk)
     with ZipFile(insee_downloaded_file, 'r') as file:
         file.extractall(insee_unzipped_folder)
-    data = pd.read_excel(f'{insee_unzipped_folder}/ZE2020_au_01-01-2023.xlsx', sheet_name='Composition_communale',
-                         skiprows=5).to_dict(orient='records')
+    data = pd.read_excel(
+        f"{insee_unzipped_folder}/ZE2020_au_01-01-2024.xlsx",
+        sheet_name="Composition_communale",
+        engine="calamine",
+        skiprows=5,
+    ).to_dict(orient="records")
     os.remove(path=insee_downloaded_file)
     shutil.rmtree(path=insee_unzipped_folder)
     return data
 
 
+def city_zone_emploi_insee() -> tuple[dict, dict]:
+    zone_emploi_composition = {}
+    city_zone_emploi = {}
+
+    zone_emploi_insee = download_insee_data()
+    for d in zone_emploi_insee:
+        city = d["LIBGEO"]
+        city_code = d["CODGEO"]
+        ze = d["LIBZE2020"]
+        if ze not in zone_emploi_composition:
+            zone_emploi_composition[ze] = []
+        zone_emploi_composition[ze].append(city)
+        if city_code not in city_zone_emploi:
+            city_zone_emploi[city_code] = []
+        city_zone_emploi[city_code].append(ze)
+
+    return city_zone_emploi, zone_emploi_composition
+
+
 def has_a_digit(text: str = '') -> bool:
     for char in text:
         if char.isdigit():
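Note: a minimal usage sketch of the new city_zone_emploi_insee helper (illustrative only; the INSEE code below is an assumption, not taken from the commit):

    from project.server.main.utils import city_zone_emploi_insee

    # Downloads the INSEE zone d'emploi dump and builds both lookups in one call
    city_zone_emploi, zone_emploi_composition = city_zone_emploi_insee()
    # INSEE city code -> list of zone d'emploi labels, e.g. for Talence (code assumed)
    print(city_zone_emploi.get("33522"))
    # zone d'emploi label -> list of member cities
    print(zone_emploi_composition.get("Bordeaux"))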
43 changes: 20 additions & 23 deletions requirements.txt
@@ -1,23 +1,20 @@
-# Dev dependencies
-pytest==6.2.3
-pytest-mock==3.5.1
-requests-mock==1.9.2
-
-# Dependencies
-beautifulsoup4==4.8.2
-contextvars==2.4
-elasticsearch==7.8.0
-elasticsearch-dsl==7.2.1
-Flask==1.1.1
-Flask-Bootstrap==3.3.7.1
-fuzzywuzzy==0.18.0
-geopy==2.1.0
-lxml==4.9.1
-pandas==0.25.3
-pycountry==20.7.3
-python-Levenshtein==0.21.1
-redis==3.5.3
-requests==2.25.0
-rq==1.9.0
-xlrd==1.1.0
-XlsxWriter==1.0.4
+pytest==8.1.1
+pytest-mock==3.14.0
+requests-mock==1.12.1
+beautifulsoup4==4.12.3
+contextvars==2.4
+elasticsearch==7.8.0
+elasticsearch-dsl==7.2.1
+Flask==3.0.3
+Flask-Bootstrap==3.3.7.1
+fuzzywuzzy==0.18.0
+geopy==2.4.1
+lxml==5.2.1
+pandas==2.2.1
+pycountry==23.12.11
+python-calamine==0.2.0
+python-Levenshtein==0.25.1
+redis==5.0.3
+requests==2.31.0
+rq==1.16.1
+XlsxWriter==3.2.0
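Note: the new python-calamine pin backs the engine="calamine" argument now passed to pd.read_excel in utils.py; a minimal sketch of that engine selection (file name reused from the diff above, local path assumed):

    import pandas as pd

    # pandas >= 2.2 delegates Excel parsing to the Rust-based calamine reader
    # when python-calamine is installed and engine="calamine" is requested
    df = pd.read_excel(
        "ZE2020_au_01-01-2024.xlsx",
        sheet_name="Composition_communale",
        skiprows=5,
        engine="calamine",
    )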