diff --git a/bsoclinicaltrials/server/main/ctis.py b/bsoclinicaltrials/server/main/ctis.py
new file mode 100644
index 0000000..6b34cd3
--- /dev/null
+++ b/bsoclinicaltrials/server/main/ctis.py
@@ -0,0 +1,207 @@
+import datetime
+import requests
+
+from bsoclinicaltrials.server.main.logger import get_logger
+from bsoclinicaltrials.server.main.utils import my_parse_date
+from bsoclinicaltrials.server.main.utils_swift import get_objects, set_objects
+
+
+container = "clinical-trials"
+logger = get_logger(__name__)
+
+CTIS_API_URL = "https://euclinicaltrials.eu/ctis-public-api"
+# Seconds to wait on the CTIS API before failing instead of hanging forever
+REQUEST_TIMEOUT = 120
+
+
+def harvest():
+    """Download every CTIS trial involving France and store the raw records.
+
+    The search endpoint is paged through until a short page is returned, then
+    the full record of each trial whose ``trialCountries`` mentions ``fr:`` is
+    retrieved. The resulting list is written to object storage as
+    ``ctis_raw_<today>.json.gz``.
+
+    Returns:
+        The list of raw trial records (decoded JSON) that was stored.
+    """
+    # 1. Collect all clinical trials
+    cts = []
+    page = 0
+    per_page = 400
+    while True:
+        page += 1
+        # NOTE(review): verify=False disables TLS certificate verification;
+        # confirm it is still needed for this host.
+        r = requests.post(f"{CTIS_API_URL}/search", verify=False, timeout=REQUEST_TIMEOUT, json={
+            "pagination": {
+                "page": page,
+                "size": per_page
+            },
+            "sort": {
+                "property": "ctNumber",
+                "direction": "ASC"
+            },
+            "searchCriteria": {
+                "containAll": "",
+                "containAny": "",
+                "containNot": ""
+            }
+        })
+        data = r.json().get("data") or []
+        cts += data
+        # A short (or empty) page means the last page has been reached
+        if len(data) < per_page:
+            break
+    # 2. For each French clinical trial, find details metadata
+    cts_fr = []
+    for ct in cts:
+        # Filter on French clinical trials; no country information means "not French"
+        if "fr:" in (ct.get("trialCountries") or "").lower():
+            r = requests.get(
+                f"{CTIS_API_URL}/retrieve/{ct.get('ctNumber')}", verify=False, timeout=REQUEST_TIMEOUT)
+            cts_fr.append(r.json())
+    # 3. Save it in Object Storage
+    today = datetime.date.today()
+    set_objects(cts_fr, container, f"ctis_raw_{today}.json.gz")
+    return cts_fr
+
+
+def parse_ctis(ct):
+    """Convert one raw CTIS trial record into a flat dict.
+
+    Missing or null sections of the record are treated as empty, so optional
+    fields are left out (or set to ``None``) instead of raising.
+
+    Args:
+        ct: raw trial record (decoded JSON) as stored by ``harvest``.
+
+    Returns:
+        A dict holding the trial number under ``CTIS`` plus title, acronym,
+        dates, enrollment count, results flags, external ids, lead sponsor,
+        status and study type.
+    """
+    status_mapping = {
+        "Authorised, not started": "Active, not recruiting",
+        "Ongoing, recruiting": "Recruiting",
+        "Ongoing, not yet recruiting": "Not yet recruiting",
+        "Ongoing, recruitment ended": "Active, not recruiting",
+        "Ended": "Completed",
+        "Not authorised": "Not authorised",
+        "Halted": "Suspended",
+        "Revoked": "Withdrawn"
+    }
+
+    # Resolve the shared nested sections once; ``or {}`` also covers explicit nulls
+    application = ct.get("authorizedApplication") or {}
+    part_one = application.get("authorizedPartI") or {}
+    identifiers = (part_one.get("trialDetails") or {}).get("clinicalTrialIdentifiers") or {}
+    secondary_ids = identifiers.get("secondaryIdentifyingNumbers") or {}
+
+    res = {
+        "CTIS": ct.get("ctNumber"),
+    }
+    res["title"] = identifiers.get("fullTitle")
+    # NOTE(review): stored as received, unlike the other dates which go through my_parse_date
+    res["study_start_date"] = ct.get("startDateEU")
+    # A national part without a subject count contributes 0 to the total
+    countries = application.get("authorizedPartsII") or []
+    res["enrollment_count"] = sum(country.get("recruitmentSubjectCount") or 0 for country in countries)
+    # Results
+    results = ct.get("results") or {}
+    results = (results.get("summaryResults") or []) + (results.get("laypersonResults") or []) + (
+        results.get("clinicalStudyReports") or [])
+    res["has_results"] = len(results) > 0
+    res["has_results_or_publications"] = res["has_results"]
+    # Undated results are skipped so the earliest date can still be found
+    dates = sorted(result.get("createdOn") for result in results if result.get("createdOn"))
+    if dates:
+        res["results_first_submit_date"] = my_parse_date(dates[0])
+    res["acronym"] = identifiers.get("shortTitle")
+    # External ids: (key in the CTIS record, key in the result, type label)
+    other_ids = []
+    known_registries = (
+        ("nctNumber", "NCTId", "NCTId"),
+        ("whoUniversalTrialNumber", "WHO", "WHO_UTN"),
+        ("isrctnNumber", "ISRCTN", "ISRCTN_NUMBER"),
+    )
+    for registry_key, res_key, id_type in known_registries:
+        number = (secondary_ids.get(registry_key) or {}).get("number")
+        if number:
+            other_ids.append({"id": number, "source": "CTIS", "type": id_type})
+            res[res_key] = number
+    for additional_id in secondary_ids.get("additionalRegistries") or []:
+        number = additional_id.get("number")
+        if number and number != "N/A":
+            other_ids.append({"id": number, "source": "CTIS",
+                              "type": additional_id.get("otherRegistryName")})
+    if other_ids:
+        res["other_ids"] = other_ids
+    # Lead sponsor: organisation of the first public contact of the primary sponsor
+    sponsors = part_one.get("sponsors") or []
+    primary_sponsors = [s for s in sponsors if s.get("primary", False) is True]
+    if primary_sponsors:
+        contacts = primary_sponsors[0].get("publicContacts") or []
+        if contacts:
+            res["lead_sponsor"] = (contacts[0].get("organisation") or {}).get("name")
+    # First submission = submission date of the application of type INITIAL, when there is one
+    initial_applications = [
+        t for t in (application.get("applicationInfo") or []) if t.get("type") == "INITIAL"]
+    if initial_applications:
+        res["study_first_submit_date"] = my_parse_date(initial_applications[0].get("submissionDate"))
+    res["study_completion_date"] = my_parse_date(ct.get("endDateEU"))
+    res["status"] = status_mapping.get(ct.get("ctPublicStatus"), "Unknown status")
+    res["study_type"] = "Interventional"
+    return res
+
+
+def parse(harvested_data, harvest_date=None):
+    """Parse raw CTIS records and store them in object storage.
+
+    Args:
+        harvested_data: list of raw trial records, as returned by ``harvest``.
+        harvest_date: label used in the ``ctis_parsed_<harvest_date>.json.gz``
+            file name; defaults to today's date.
+
+    Returns:
+        A summary dict with the status, harvest date, source name and the
+        numbers of harvested and parsed studies.
+    """
+    if harvest_date is None:
+        harvest_date = f"{datetime.date.today()}"
+    parsed_data = [parse_ctis(ct) for ct in harvested_data]
+    set_objects(parsed_data, container, f"ctis_parsed_{harvest_date}.json.gz")
+    return {
+        "status": "ok",
+        "harvest_date": f"{harvest_date}",
+        "source": "ctis",
+        "nb_studies_harvested": len(harvested_data),
+        "nb_studies_parsed": len(parsed_data)
+    }
+
+
+def harvest_parse_ctis(to_harvest=True, to_parse=True, harvest_date=None):
+    """Harvest and/or parse the CTIS trials.
+
+    Args:
+        to_harvest: when true, download fresh data; otherwise reload the raw
+            file stored under ``harvest_date``.
+        to_parse: when true, parse the raw records and store the result.
+        harvest_date: date label of the raw and parsed files; defaults to
+            today's date.
+
+    Returns:
+        The summary dict from ``parse``, or ``None`` when ``to_parse`` is false.
+    """
+    if harvest_date is None:
+        # Avoid looking up "ctis_raw_None.json.gz" when no date is given
+        harvest_date = f"{datetime.date.today()}"
+    if to_harvest:
+        harvested_data = harvest()
+    else:
+        # NOTE(review): each stored item is unwrapped with [0]; verify against get_objects
+        harvested_data = [x[0] for x in get_objects(
+            container, f"ctis_raw_{harvest_date}.json.gz")]
+    if to_parse:
+        return parse(harvested_data, harvest_date)
+    return None
diff --git a/bsoclinicaltrials/server/main/merge_sources.py b/bsoclinicaltrials/server/main/merge_sources.py
index ba5ffb6..55095c5 100644
--- a/bsoclinicaltrials/server/main/merge_sources.py
+++ b/bsoclinicaltrials/server/main/merge_sources.py
@@ -6,7 +6,7 @@ logger = get_logger(__name__)
 
 
 
-def get_each_sources(date_ct, date_euctr):
+def get_each_sources(date_ct, date_euctr, date_ctis):
     raw_trials = {}
     logger.debug(f'getting clinicaltrials data from {date_ct}')
     df_ct = pd.DataFrame(get_objects("clinical-trials", f"clinical_trials_parsed_{date_ct}.json.gz"))
@@ -21,24 +21,33 @@ def get_each_sources(date_ct, date_euctr):
     raw_trials['eudraCT'] = df_euctr.to_dict(orient='records')
     nb_ct_euctr = len(raw_trials['eudraCT'])
     logger.debug(f"Nb CT from euctr: {nb_ct_euctr}")
+
+    logger.debug(f'getting ctis data from {date_ctis}')
+    df_ctis = pd.DataFrame(get_objects("clinical-trials", f"ctis_parsed_{date_ctis}.json.gz"))
+    df_ctis['source'] = 'ctis'
+    raw_trials['CTIS'] = df_ctis.to_dict(orient='records')
+    nb_ct_ctis = len(raw_trials['CTIS'])
+    logger.debug(f"Nb CT from ctis: {nb_ct_ctis}")
+
+
     return raw_trials
 
 
-def merge_all(date_ct, date_euctr):
+def merge_all(date_ct, date_euctr, date_ctis):
     # each field is transformed (transform_ct function) to become a list of element, each element with a source
     # after merge, the untransform_ct function turns back to a proper schema
-    raw_trials = get_each_sources(date_ct, date_euctr)
+    raw_trials = get_each_sources(date_ct, date_euctr, date_ctis)
     ct_transformed = {}
     for k in raw_trials:
         ct_transformed[k] = {}
         for ct in raw_trials[k]:
             ct_transformed[k][ct[k]] = transform_ct(ct)
     matches = {}
-    matches = update_matches(matches, raw_trials['NCTId'], 'NCTId', ['eudraCT'])
-    matches = update_matches(matches, raw_trials['eudraCT'], 'eudraCT', ['NCTId'])
+    matches = update_matches(matches, raw_trials["NCTId"], "NCTId", ["eudraCT"])
+    matches = update_matches(matches, raw_trials["eudraCT"], "eudraCT", ["NCTId"])
+    matches = update_matches(matches, raw_trials["CTIS"], "CTIS", ["NCTId"])
     known_ids = set([])
     all_ct = []
-    for current_id_type in ['NCTId', 'eudraCT']:
+    for current_id_type in ["NCTId", "eudraCT", "CTIS"]:
         for ct in raw_trials[current_id_type]:
             if ct[current_id_type] in known_ids:
                 continue
diff --git a/bsoclinicaltrials/server/main/tasks.py b/bsoclinicaltrials/server/main/tasks.py
index c702a74..43b4fa1 100644
--- a/bsoclinicaltrials/server/main/tasks.py
+++ b/bsoclinicaltrials/server/main/tasks.py
@@ -1,7 +1,8 @@
 import datetime
 
 from bsoclinicaltrials.server.main.clinical_trials import harvest_parse_clinical_trials
-from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index, update_alias
+from bsoclinicaltrials.server.main.ctis import harvest_parse_ctis
+from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index
 from bsoclinicaltrials.server.main.enrich_ct import enrich
 from bsoclinicaltrials.server.main.euctr import harvest_parse_euctr
 from bsoclinicaltrials.server.main.logger import get_logger
@@ -18,6 +19,8 @@ def create_task_harvest(args: dict) -> dict:
         return harvest_parse_clinical_trials(harvest, parse, harvest_date)
     elif source == 'euctr':
         return harvest_parse_euctr(harvest, parse, harvest_date)
+    elif source == 'ctis':
+        return harvest_parse_ctis(harvest, parse, harvest_date)
     return {}
 
 
@@ -25,12 +28,14 @@ def create_task_transform_load(args: dict) -> dict:
     today = datetime.date.today()
     harvest_date_ct = args.get('harvest_date_ct', f'{today}')
     harvest_date_euctr = args.get('harvest_date_euctr', f'{today}')
+    harvest_date_ctis = args.get('harvest_date_ctis', f'{today}')
     to_harvest = args.get('harvest', True)
     to_parse = args.get('parse', True)
     if to_harvest or to_parse:
-        res_ct = harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
-        res_euctr = harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
-    merged_ct = merge_all(harvest_date_ct, harvest_date_euctr)
+        harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
+        harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
+        harvest_parse_ctis(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ctis)
+    merged_ct = merge_all(harvest_date_ct, harvest_date_euctr, harvest_date_ctis)
     data = enrich(merged_ct)
     current_date = today.isoformat()
     index = args.get('index', f'bso-clinical-trials-{current_date}')