Skip to content

Commit

Permalink
feat(ctis): Add new source
Browse files Browse the repository at this point in the history
  • Loading branch information
annelhote committed Jul 11, 2024
1 parent d782784 commit d867d31
Show file tree
Hide file tree
Showing 3 changed files with 180 additions and 10 deletions.
156 changes: 156 additions & 0 deletions bsoclinicaltrials/server/main/ctis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import datetime
import requests

from bsoclinicaltrials.server.main.logger import get_logger
from bsoclinicaltrials.server.main.utils import my_parse_date
from bsoclinicaltrials.server.main.utils_swift import get_objects, set_objects


# Name of the Object Storage container passed to get_objects / set_objects
# for both the raw and the parsed CTIS dumps.
container = "clinical-trials"
# Module-level logger named after this module.
logger = get_logger(__name__)


def harvest(timeout=60):
    """Harvest the French clinical trials published on the CTIS public API.

    The harvest runs in three steps:

    1. page through the CTIS search endpoint to list every clinical trial;
    2. for each trial whose ``trialCountries`` mentions ``fr:``, retrieve
       the detailed record;
    3. store the detailed records in Object Storage as
       ``ctis_raw_<today>.json.gz``.

    Args:
        timeout: Seconds to wait for each HTTP call. Without a timeout a
            stalled connection would block the harvest forever.

    Returns:
        The list of detailed records (decoded JSON) of the French trials.

    Raises:
        requests.HTTPError: If the CTIS API answers with an error status.
            Failing loudly avoids storing a silently truncated harvest.
    """
    base_url = "https://euclinicaltrials.eu/ctis-public-api"
    per_page = 400

    # A single session lets the many calls below reuse the same connection.
    with requests.Session() as session:
        # 1. Collect all clinical trials
        cts = []
        page = 0
        while True:
            page += 1
            # NOTE(review): verify=False disables TLS certificate checks. It is
            # kept to preserve the existing behaviour, but should be replaced
            # by a proper CA bundle if the endpoint allows it.
            response = session.post(
                f"{base_url}/search",
                verify=False,
                timeout=timeout,
                json={
                    "pagination": {
                        "page": page,
                        "size": per_page,
                    },
                    "sort": {
                        "property": "ctNumber",
                        "direction": "ASC",
                    },
                    "searchCriteria": {
                        "containAll": "",
                        "containAny": "",
                        "containNot": "",
                    },
                },
            )
            response.raise_for_status()
            # "data" may be missing or null on the last page.
            data = response.json().get("data") or []
            cts.extend(data)
            # A short page means there is nothing left to fetch.
            if len(data) < per_page:
                break

        # 2. For each French clinical trial, find details metadata
        cts_fr = []
        for ct in cts:
            # Filter on French clinical trials.
            # NOTE(review): assumes trialCountries is a string such as
            # "FR:5, DE:3" -- confirm against the API response.
            if "fr:" not in (ct.get("trialCountries") or "").lower():
                continue
            response = session.get(
                f"{base_url}/retrieve/{ct.get('ctNumber')}",
                verify=False,
                timeout=timeout,
            )
            response.raise_for_status()
            cts_fr.append(response.json())

    # 3. Save it in Object Storage
    today = datetime.date.today()
    set_objects(cts_fr, container, f"ctis_raw_{today}.json.gz")
    return cts_fr


# Mapping from the CTIS public status to the status vocabulary used in the
# parsed records. Statuses absent from this table become "Unknown status".
_CTIS_STATUS_MAPPING = {
    "Authorised, not started": "Active, not recruiting",
    "Ongoing, recruiting": "Recruiting",
    "Ongoing, not yet recruiting": "Not yet recruiting",
    "Ongoing, recruitment ended": "Active, not recruiting",
    "Ended": "Completed",
    "Not authorised": "Not authorised",
    "Halted": "Suspended",
    "Revoked": "Withdrawn",
}


def _dig(record, *keys):
    """Follow *keys* through nested dicts; return None if any level is missing or null."""
    current = record
    for key in keys:
        if not isinstance(current, dict):
            return None
        current = current.get(key)
    return current


def parse_ctis(ct):
    """Turn one detailed CTIS record into the flat structure used downstream.

    Missing or null sections of the CTIS record are tolerated: the matching
    output field is then ``None`` (or simply absent for the optional ones)
    instead of raising.

    Args:
        ct: Detailed CTIS record (decoded JSON dict) of one clinical trial.

    Returns:
        A dict with the keys ``CTIS``, ``title``, ``acronym``,
        ``study_start_date``, ``enrollment_count``, ``has_results``,
        ``has_results_or_publications``, ``study_first_submit_date``,
        ``study_completion_date``, ``status`` and ``study_type``, plus, when
        available, ``results_first_submit_date``, ``NCTId``, ``WHO``,
        ``ISRCTN``, ``other_ids`` and ``lead_sponsor``.
    """
    part_one = _dig(ct, "authorizedApplication", "authorizedPartI") or {}
    identifiers = _dig(part_one, "trialDetails", "clinicalTrialIdentifiers") or {}
    secondary_ids = identifiers.get("secondaryIdentifyingNumbers") or {}

    res = {
        "CTIS": ct.get("ctNumber"),
        "title": identifiers.get("fullTitle"),
        "acronym": identifiers.get("shortTitle"),
        # Parsed like the other dates of the record.
        "study_start_date": my_parse_date(ct.get("startDateEU")),
    }

    # Enrollment: sum over the country-specific parts, ignoring null counts.
    parts_two = _dig(ct, "authorizedApplication", "authorizedPartsII") or []
    res["enrollment_count"] = sum(
        part.get("recruitmentSubjectCount") or 0 for part in parts_two
    )

    # Results
    results_section = ct.get("results") or {}
    results = []
    for key in ("summaryResults", "laypersonResults", "clinicalStudyReports"):
        results.extend(results_section.get(key) or [])
    res["has_results"] = bool(results)
    res["has_results_or_publications"] = res["has_results"]
    # Null dates are dropped: they cannot be ordered against strings.
    result_dates = sorted(
        result.get("createdOn") for result in results if result.get("createdOn")
    )
    if result_dates:
        res["results_first_submit_date"] = my_parse_date(result_dates[0])

    # External ids: (key in the CTIS record, type in other_ids, key in res)
    registries = (
        ("nctNumber", "NCTId", "NCTId"),
        ("whoUniversalTrialNumber", "WHO_UTN", "WHO"),
        ("isrctnNumber", "ISRCTN_NUMBER", "ISRCTN"),
    )
    other_ids = []
    for api_key, id_type, res_key in registries:
        number = _dig(secondary_ids, api_key, "number")
        if number:
            other_ids.append({"id": number, "source": "CTIS", "type": id_type})
            res[res_key] = number
    for registry in secondary_ids.get("additionalRegistries") or []:
        number = registry.get("number")
        if number and number != "N/A":
            other_ids.append({
                "id": number,
                "source": "CTIS",
                "type": registry.get("otherRegistryName"),
            })
    if other_ids:
        res["other_ids"] = other_ids

    # Lead sponsor: name of the organisation of the first public contact of
    # the first sponsor flagged as primary.
    sponsors = part_one.get("sponsors") or []
    primary_sponsor = next(
        (s for s in sponsors if s.get("primary", False) is True), None)
    if primary_sponsor is not None:
        contacts = primary_sponsor.get("publicContacts") or []
        res["lead_sponsor"] = (
            _dig(contacts[0], "organisation", "name") if contacts else None
        )

    # First submission: date of the application of type INITIAL, if any.
    applications = _dig(ct, "authorizedApplication", "applicationInfo") or []
    initial = next((a for a in applications if a.get("type") == "INITIAL"), None)
    res["study_first_submit_date"] = (
        my_parse_date(initial.get("submissionDate")) if initial is not None else None
    )

    res["study_completion_date"] = my_parse_date(ct.get("endDateEU"))
    res["status"] = _CTIS_STATUS_MAPPING.get(ct.get("ctPublicStatus"), "Unknown status")
    # NOTE: the study type is hard-coded; the original draft left open whether
    # it should be read from a "Clinical Trial Type" field instead.
    res["study_type"] = "Interventional"
    return res


def parse(harvested_data, harvest_date=None):
    """Parse the harvested CTIS records and store the result in Object Storage.

    Args:
        harvested_data: Iterable of detailed CTIS records, each one handed
            to ``parse_ctis``.
        harvest_date: Label used in the name of the stored file
            (``ctis_parsed_<harvest_date>.json.gz``). Defaults to today.

    Returns:
        A summary dict with the status, the harvest date, the source name
        and the number of harvested and parsed studies.
    """
    date_label = f"{datetime.date.today()}" if harvest_date is None else harvest_date
    studies = [parse_ctis(record) for record in harvested_data]
    set_objects(studies, container, f"ctis_parsed_{date_label}.json.gz")
    summary = {
        "status": "ok",
        "harvest_date": f"{date_label}",
        "source": "ctis",
        "nb_studies_harvested": len(harvested_data),
        "nb_studies_parsed": len(studies),
    }
    return summary


def harvest_parse_ctis(to_harvest=True, to_parse=True, harvest_date=None):
    """Harvest and/or parse the CTIS clinical trials.

    Args:
        to_harvest: When true, call ``harvest()`` to fetch fresh data from
            CTIS. Otherwise reload the raw dump
            ``ctis_raw_<harvest_date>.json.gz`` from Object Storage.
        to_parse: When true, parse the data and store the parsed dump.
        harvest_date: Date label of the dumps. Defaults to today, so that
            reloading a raw dump without a date no longer looks for a file
            named ``ctis_raw_None.json.gz``.

    Returns:
        The summary dict returned by ``parse`` when ``to_parse`` is true,
        ``None`` otherwise.
    """
    if harvest_date is None:
        harvest_date = f"{datetime.date.today()}"
    if to_harvest:
        harvested_data = harvest()
    else:
        # NOTE(review): x[0] assumes each stored object exposes the record
        # under key/index 0 -- confirm this matches what set_objects writes
        # for the list of dicts produced by harvest().
        harvested_data = [x[0] for x in get_objects(
            container, f"ctis_raw_{harvest_date}.json.gz")]
    if to_parse:
        return parse(harvested_data, harvest_date)
    return None
21 changes: 15 additions & 6 deletions bsoclinicaltrials/server/main/merge_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
logger = get_logger(__name__)


def get_each_sources(date_ct, date_euctr):
def get_each_sources(date_ct, date_euctr, date_ctis):
raw_trials = {}
logger.debug(f'getting clinicaltrials data from {date_ct}')
df_ct = pd.DataFrame(get_objects("clinical-trials", f"clinical_trials_parsed_{date_ct}.json.gz"))
Expand All @@ -21,24 +21,33 @@ def get_each_sources(date_ct, date_euctr):
raw_trials['eudraCT'] = df_euctr.to_dict(orient='records')
nb_ct_euctr = len(raw_trials['eudraCT'])
logger.debug(f"Nb CT from euctr: {nb_ct_euctr}")

# Load the parsed CTIS trials. They are registered under the "CTIS" key, so
# the count below must read that same key (reading 'ctis' raised a KeyError).
logger.debug(f'getting ctis data from {date_ctis}')
df_ctis = pd.DataFrame(get_objects("clinical-trials", f"ctis_parsed_{date_ctis}.json.gz"))
df_ctis['source'] = 'ctis'
raw_trials['CTIS'] = df_ctis.to_dict(orient='records')
nb_ct_ctis = len(raw_trials['CTIS'])
logger.debug(f"Nb CT from ctis: {nb_ct_ctis}")

return raw_trials


def merge_all(date_ct, date_euctr):
def merge_all(date_ct, date_euctr, date_ctis):
# each field is transformed (transform_ct function) to become a list of element, each element with a source
# after merge, the untransform_ct function turns back to a proper schema
raw_trials = get_each_sources(date_ct, date_euctr)
raw_trials = get_each_sources(date_ct, date_euctr, date_ctis)
ct_transformed = {}
for k in raw_trials:
ct_transformed[k] = {}
for ct in raw_trials[k]:
ct_transformed[k][ct[k]] = transform_ct(ct)
matches = {}
matches = update_matches(matches, raw_trials['NCTId'], 'NCTId', ['eudraCT'])
matches = update_matches(matches, raw_trials['eudraCT'], 'eudraCT', ['NCTId'])
matches = update_matches(matches, raw_trials["NCTId"], "NCTId", ["eudraCT"])
matches = update_matches(matches, raw_trials["eudraCT"], "eudraCT", ["NCTId"])
matches = update_matches(matches, raw_trials["CTIS"], "CTIS", ["NCTId"])
known_ids = set([])
all_ct = []
for current_id_type in ['NCTId', 'eudraCT']:
for current_id_type in ["NCTId", "eudraCT", "CTIS"]:
for ct in raw_trials[current_id_type]:
if ct[current_id_type] in known_ids:
continue
Expand Down
13 changes: 9 additions & 4 deletions bsoclinicaltrials/server/main/tasks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import datetime

from bsoclinicaltrials.server.main.clinical_trials import harvest_parse_clinical_trials
from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index, update_alias
from bsoclinicaltrials.server.main.ctis import harvest_parse_ctis
from bsoclinicaltrials.server.main.elastic import load_in_es, reset_index
from bsoclinicaltrials.server.main.enrich_ct import enrich
from bsoclinicaltrials.server.main.euctr import harvest_parse_euctr
from bsoclinicaltrials.server.main.logger import get_logger
Expand All @@ -18,19 +19,23 @@ def create_task_harvest(args: dict) -> dict:
return harvest_parse_clinical_trials(harvest, parse, harvest_date)
elif source == 'euctr':
return harvest_parse_euctr(harvest, parse, harvest_date)
elif source == 'ctis':
return harvest_parse_ctis(harvest, parse, harvest_date)
return {}


def create_task_transform_load(args: dict) -> dict:
today = datetime.date.today()
harvest_date_ct = args.get('harvest_date_ct', f'{today}')
harvest_date_euctr = args.get('harvest_date_euctr', f'{today}')
harvest_date_ctis = args.get('harvest_date_ctis', f'{today}')
to_harvest = args.get('harvest', True)
to_parse = args.get('parse', True)
if to_harvest or to_parse:
res_ct = harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
res_euctr = harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
merged_ct = merge_all(harvest_date_ct, harvest_date_euctr)
harvest_parse_clinical_trials(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ct)
harvest_parse_euctr(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_euctr)
harvest_parse_ctis(to_harvest=to_harvest, to_parse=to_parse, harvest_date=harvest_date_ctis)
merged_ct = merge_all(harvest_date_ct, harvest_date_euctr, harvest_date_ctis)
data = enrich(merged_ct)
current_date = today.isoformat()
index = args.get('index', f'bso-clinical-trials-{current_date}')
Expand Down

0 comments on commit d867d31

Please sign in to comment.