feat(ctis): Add new source

dataesr · Jul 11, 2024 · d867d31 · d867d31
1 parent d782784
commit d867d31
Show file tree

Hide file tree

Showing 3 changed files with 180 additions and 10 deletions.
diff --git a/bsoclinicaltrials/server/main/ctis.py b/bsoclinicaltrials/server/main/ctis.py
@@ -0,0 +1,156 @@
+import datetime
+import requests
+
+from bsoclinicaltrials.server.main.logger import get_logger
+from bsoclinicaltrials.server.main.utils import my_parse_date
+from bsoclinicaltrials.server.main.utils_swift import get_objects, set_objects
+
+
+# Object Storage container passed to get_objects / set_objects for every
+# raw and parsed CTIS file handled by this module.
+container = "clinical-trials"
+# Module-level logger named after this module.
+logger = get_logger(__name__)
+
+
+def harvest():
+    """Harvest the French clinical trials exposed by the CTIS public API.
+
+    Pages through the search endpoint to list every trial, fetches the
+    detailed record of each trial whose ``trialCountries`` contains ``fr:``
+    and stores those detailed records in Object Storage as
+    ``ctis_raw_<today>.json.gz``.
+
+    Returns:
+        The list of detailed trial records (decoded JSON payloads).
+
+    Raises:
+        requests.HTTPError: if the API answers with an error status.
+        requests.Timeout: if a request exceeds the timeout.
+    """
+    search_url = "https://euclinicaltrials.eu/ctis-public-api/search"
+    retrieve_url = "https://euclinicaltrials.eu/ctis-public-api/retrieve"
+    # Without an explicit timeout, requests waits forever on a stalled connection.
+    timeout = 60
+    per_page = 400
+    # NOTE(review): verify=False disables TLS certificate verification. It is
+    # kept as in the original calls - confirm it is still needed for this host.
+    # 1. Collect all clinical trials
+    cts = []
+    page = 0
+    while True:
+        page += 1
+        r = requests.post(search_url, verify=False, timeout=timeout, json={
+            "pagination": {
+                "page": page,
+                "size": per_page
+            },
+            "sort": {
+                "property": "ctNumber",
+                "direction": "ASC"
+            },
+            "searchCriteria": {
+                "containAll": "",
+                "containAny": "",
+                "containNot": ""
+            }
+        })
+        # Fail loudly: an error payload has no "data" key and would otherwise
+        # stop the pagination early and produce a silently truncated harvest.
+        r.raise_for_status()
+        data = r.json().get("data") or []
+        cts += data
+        # A short page is the last one.
+        if len(data) < per_page:
+            break
+    # 2. For each French clinical trial, find details metadata
+    cts_fr = []
+    for ct in cts:
+        # Filter on French clinical trials; a trial without country
+        # information is treated as not French instead of crashing.
+        if "fr:" in (ct.get("trialCountries") or "").lower():
+            r = requests.get(
+                f"{retrieve_url}/{ct.get('ctNumber')}", verify=False, timeout=timeout)
+            r.raise_for_status()
+            cts_fr.append(r.json())
+    # 3. Save it in Object Storage
+    today = datetime.date.today()
+    set_objects(cts_fr, container, f"ctis_raw_{today}.json.gz")
+    return cts_fr
+
+
+def _get_nested(data, *keys, default=None):
+    """Follow ``keys`` through nested dicts; return ``default`` when a level is missing or null."""
+    current = data
+    for key in keys:
+        if not isinstance(current, dict):
+            return default
+        current = current.get(key)
+    return default if current is None else current
+
+
+def parse_ctis(ct):
+    """Convert one detailed CTIS record into the flat dict used by this project.
+
+    Args:
+        ct: decoded JSON record of a single trial, as appended to the harvest
+            result by ``harvest``.
+
+    Returns:
+        A dict with the keys ``CTIS``, ``title``, ``study_start_date``,
+        ``enrollment_count``, ``has_results``, ``has_results_or_publications``,
+        ``acronym``, ``study_first_submit_date``, ``study_completion_date``,
+        ``status`` and ``study_type``. ``results_first_submit_date``,
+        ``NCTId``, ``WHO``, ``ISRCTN``, ``other_ids`` and ``lead_sponsor`` are
+        only present when the record provides the matching data.
+    """
+    status_mapping = {
+        "Authorised, not started": "Active, not recruiting",
+        "Ongoing, recruiting": "Recruiting",
+        "Ongoing, not yet recruiting": "Not yet recruiting",
+        "Ongoing, recruitment ended": "Active, not recruiting",
+        "Ended": "Completed",
+        "Not authorised": "Not authorised",
+        "Halted": "Suspended",
+        "Revoked": "Withdrawn"
+    }
+
+    # Resolve the shared sub-documents once; each one degrades to an empty
+    # container when the record lacks it (or holds an explicit null).
+    application = ct.get("authorizedApplication") or {}
+    part_one = application.get("authorizedPartI") or {}
+    identifiers = _get_nested(
+        part_one, "trialDetails", "clinicalTrialIdentifiers", default={})
+    secondary_ids = identifiers.get("secondaryIdentifyingNumbers") or {}
+
+    res = {
+        "CTIS": ct.get("ctNumber"),
+        "title": identifiers.get("fullTitle"),
+        # Normalised with my_parse_date, like study_completion_date below.
+        "study_start_date": my_parse_date(ct.get("startDateEU")),
+    }
+    countries = application.get("authorizedPartsII") or []
+    # A country without a subject count contributes 0 instead of breaking the sum.
+    res["enrollment_count"] = sum(
+        country.get("recruitmentSubjectCount") or 0 for country in countries)
+    # Results
+    results = ct.get("results") or {}
+    results = (
+        (results.get("summaryResults") or [])
+        + (results.get("laypersonResults") or [])
+        + (results.get("clinicalStudyReports") or [])
+    )
+    res["has_results"] = len(results) > 0
+    res["has_results_or_publications"] = res["has_results"]
+    # Ignore results without a creation date: None cannot be ordered with strings.
+    dates = [result.get("createdOn") for result in results if result.get("createdOn")]
+    if dates:
+        res["results_first_submit_date"] = my_parse_date(min(dates))
+    res["acronym"] = identifiers.get("shortTitle")
+    # External ids: (key in the CTIS payload, type stored in other_ids, key in res)
+    other_ids = []
+    for payload_key, id_type, res_key in (
+        ("nctNumber", "NCTId", "NCTId"),
+        ("whoUniversalTrialNumber", "WHO_UTN", "WHO"),
+        ("isrctnNumber", "ISRCTN_NUMBER", "ISRCTN"),
+    ):
+        number = _get_nested(secondary_ids, payload_key, "number")
+        if number:
+            other_ids.append({"id": number, "source": "CTIS", "type": id_type})
+            res[res_key] = number
+    for registry in secondary_ids.get("additionalRegistries") or []:
+        number = registry.get("number")
+        if number and number != "N/A":
+            other_ids.append({"id": number, "source": "CTIS",
+                              "type": registry.get("otherRegistryName")})
+    if other_ids:
+        res["other_ids"] = other_ids
+    # Lead sponsor: organisation of the first public contact of the first
+    # sponsor flagged as primary.
+    sponsors = part_one.get("sponsors") or []
+    primary_sponsors = [s for s in sponsors if s.get("primary", False) is True]
+    if primary_sponsors:
+        contacts = primary_sponsors[0].get("publicContacts") or []
+        if contacts:
+            res["lead_sponsor"] = _get_nested(contacts[0], "organisation", "name")
+    # First submission: submission date of the application typed "INITIAL".
+    applications = application.get("applicationInfo") or []
+    initial = next((a for a in applications if a.get("type") == "INITIAL"), None)
+    res["study_first_submit_date"] = (
+        my_parse_date(initial.get("submissionDate")) if initial else None)
+    res["study_completion_date"] = my_parse_date(ct.get("endDateEU"))
+    res["status"] = status_mapping.get(ct.get("ctPublicStatus"), "Unknown status")
+    # Hard-coded: no trial-type field is read from the CTIS payload here.
+    res["study_type"] = "Interventional"
+    return res
+
+
+def parse(harvested_data, harvest_date=None):
+    """Parse raw CTIS records and store the parsed list in Object Storage.
+
+    Args:
+        harvested_data: raw CTIS records, each one handed to ``parse_ctis``.
+        harvest_date: date used in the name of the stored file; today's date
+            when omitted.
+
+    Returns:
+        A summary dict with the status, the harvest date, the source name and
+        the number of harvested and parsed studies.
+    """
+    if harvest_date is None:
+        harvest_date = f"{datetime.date.today()}"
+    parsed_data = [parse_ctis(raw_ct) for raw_ct in harvested_data]
+    target = f"ctis_parsed_{harvest_date}.json.gz"
+    set_objects(parsed_data, container, target)
+    summary = {
+        "status": "ok",
+        "harvest_date": f"{harvest_date}",
+        "source": "ctis",
+        "nb_studies_harvested": len(harvested_data),
+        "nb_studies_parsed": len(parsed_data)
+    }
+    return summary
+
+
+def harvest_parse_ctis(to_harvest=True, to_parse=True, harvest_date=None):
+    """Harvest and/or parse the CTIS source.
+
+    Args:
+        to_harvest: when true, call ``harvest``; otherwise reload the raw file
+            ``ctis_raw_<harvest_date>.json.gz`` from Object Storage.
+        to_parse: when true, parse the raw records and return the summary
+            produced by ``parse``.
+        harvest_date: date used in the stored file names; today's date when
+            omitted.
+
+    Returns:
+        The summary dict returned by ``parse`` when ``to_parse`` is true,
+        otherwise ``None``.
+    """
+    if harvest_date is None:
+        # Avoid looking up / writing a file literally named "..._None.json.gz".
+        harvest_date = f"{datetime.date.today()}"
+    if to_harvest:
+        # NOTE(review): harvest() always names the raw file after today's date,
+        # whatever harvest_date is.
+        harvested_data = harvest()
+    else:
+        stored = get_objects(container, f"ctis_raw_{harvest_date}.json.gz")
+        # harvest() stores the trial records themselves, so a stored item that
+        # already carries "ctNumber" is used as is; anything else is unwrapped
+        # with [0] as before.
+        harvested_data = [x if "ctNumber" in x else x[0] for x in stored]
+    if to_parse:
+        return parse(harvested_data, harvest_date)
diff --git a/bsoclinicaltrials/server/main/merge_sources.py b/bsoclinicaltrials/server/main/merge_sources.py
@@ -6,7 +6,7 @@
 logger = get_logger(__name__)
 
 
-def get_each_sources(date_ct, date_euctr):
+def get_each_sources(date_ct, date_euctr, date_ctis):
     raw_trials = {}
     logger.debug(f'getting clinicaltrials data from {date_ct}')
     df_ct = pd.DataFrame(get_objects("clinical-trials", f"clinical_trials_parsed_{date_ct}.json.gz"))
@@ -21,24 +21,33 @@ def get_each_sources(date_ct, date_euctr):
     raw_trials['eudraCT'] = df_euctr.to_dict(orient='records')
     nb_ct_euctr = len(raw_trials['eudraCT'])
     logger.debug(f"Nb CT from euctr: {nb_ct_euctr}")
+
+    logger.debug(f'getting ctis data from {date_ctis}')
+    df_ctis = pd.DataFrame(get_objects("clinical-trials", f"ctis_parsed_{date_ctis}.json.gz"))
+    df_ctis['source'] = 'ctis'
+    # The records are stored under 'CTIS', so the count must read that same key
+    # (reading 'ctis' raised a KeyError).
+    raw_trials['CTIS'] = df_ctis.to_dict(orient='records')
+    nb_ct_ctis = len(raw_trials['CTIS'])
+    logger.debug(f"Nb CT from ctis: {nb_ct_ctis}")
+
     return raw_trials
 
 
-def merge_all(date_ct, date_euctr):
+def merge_all(date_ct, date_euctr, date_ctis):
     # each field is transformed (transform_ct function) to become a list of element, each element with a source
     # after merge, the untransform_ct function turns back to a proper schema
-    raw_trials = get_each_sources(date_ct, date_euctr)
+    raw_trials = get_each_sources(date_ct, date_euctr, date_ctis)
     ct_transformed = {}
     for k in raw_trials:
         ct_transformed[k] = {}
         for ct in raw_trials[k]:
             ct_transformed[k][ct[k]] = transform_ct(ct)
     matches = {}
-    matches = update_matches(matches, raw_trials['NCTId'], 'NCTId', ['eudraCT'])
-    matches = update_matches(matches, raw_trials['eudraCT'], 'eudraCT', ['NCTId'])
+    matches = update_matches(matches, raw_trials["NCTId"], "NCTId", ["eudraCT"])
+    matches = update_matches(matches, raw_trials["eudraCT"], "eudraCT", ["NCTId"])
+    matches = update_matches(matches, raw_trials["CTIS"], "CTIS", ["NCTId"])
     known_ids = set([])
     all_ct = []
-    for current_id_type in ['NCTId', 'eudraCT']:
+    for current_id_type in ["NCTId", "eudraCT", "CTIS"]:
         for ct in raw_trials[current_id_type]:
             if ct[current_id_type] in known_ids:
                 continue

diff --git a/bsoclinicaltrials/server/main/tasks.py b/bsoclinicaltrials/server/main/tasks.py
@@ -1,7 +1,8 @@
 import datetime
 
 from bsoclinicaltrials.server.main.clinical_trials import harvest_parse_clinical_trials
-from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index, update_alias
+from bsoclinicaltrials.server.main.ctis import harvest_parse_ctis
+from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index
 from bsoclinicaltrials.server.main.enrich_ct import enrich
 from bsoclinicaltrials.server.main.euctr import harvest_parse_euctr
 from bsoclinicaltrials.server.main.logger import get_logger
@@ -18,19 +19,23 @@ def create_task_harvest(args: dict) -> dict:
         return harvest_parse_clinical_trials(harvest, parse, harvest_date)
     elif source == 'euctr':
         return harvest_parse_euctr(harvest, parse, harvest_date)
+    elif source == 'ctis':
+        return harvest_parse_ctis(harvest, parse, harvest_date)
     return {}
 
 
 def create_task_transform_load(args: dict) -> dict:
     today = datetime.date.today()
     harvest_date_ct = args.get('harvest_date_ct', f'{today}')
     harvest_date_euctr = args.get('harvest_date_euctr', f'{today}')
+    harvest_date_ctis = args.get('harvest_date_ctis', f'{today}')
     to_harvest = args.get('harvest', True)
     to_parse = args.get('parse', True)
     if to_harvest or to_parse:
-        res_ct = harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
-        res_euctr = harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
-    merged_ct = merge_all(harvest_date_ct, harvest_date_euctr)
+        harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
+        harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
+        harvest_parse_ctis(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ctis)
+    merged_ct = merge_all(harvest_date_ct, harvest_date_euctr, harvest_date_ctis)
     data = enrich(merged_ct)
     current_date = today.isoformat()
     index = args.get('index', f'bso-clinical-trials-{current_date}')