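"""Generate model-ready datasets from raw epileptologist notes.

For each note, extracts the HPI/Interval History paragraph (capped at the
model's token limit) and writes two files: a json-lines classification
dataset asking whether the patient has had recent events, and a
SQuAD-v2-style extractive QA dataset asking the event-frequency question
('PQF') and the last-event question ('ELO').
"""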
import json

import pandas as pd
from difflib import SequenceMatcher

from timeline_utils import get_paragraph_with_max_token_length
#what are we saving the dataset(s) to?
classification_output = 'classification_dataset.json'
EQA_output = 'eqa_dataset.json'
#attending addendum boilerplate (patient/provider identifiers elided); used for fuzzy matching below
addendum_string = "I saw and evaluated/examined on and reviewed 's notes. I agree with the history, physical exam"
#regex for extracting HPI/Interval History
hpi_whitelist_regex = r"(?im)^(\bHPI\b|\bHistory of Present Illness\b|\bInterval History\b)"
hpi_blacklist_regex = r"(?im)(\b(Past |Prior )?((Social|Surgical|Family|Medical|Psychiatric|Seizure|Disease|Epilepsy) )History\b|\bSemiology\b|\bLab|\bExam|\bDiagnostic|\bImpression|\bPlan\b|\bPE\b|\bRisk Factor|\bMedications|\bAllerg)"
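#e.g., the whitelist catches section headers like "HPI" or "Interval History", while the blacklist
#rejects look-alike headers such as "Past Medical History", "Family History", or exam/plan sections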
#load the data
epileptologist_notes = pd.read_pickle('<PATH_TO_RAW_NOTES>')
#remove attending addenda: fuzzy-match the start of each note against the addendum boilerplate (>60% similarity tolerates the elided names)
attendings_phrase = epileptologist_notes['NOTE_TEXT'].apply(lambda x: SequenceMatcher(None, x[:len(addendum_string)], addendum_string).ratio() > 0.60)
attendings_explicit = epileptologist_notes['NOTE_TEXT'].apply(lambda x: 'attending addendum' in x.lower()[:500])
attendings_explicit_cased = epileptologist_notes['NOTE_TEXT'].apply(lambda x: 'Addendum:' in x[:500])
epileptologist_notes = epileptologist_notes[~(attendings_explicit | attendings_phrase | attendings_explicit_cased)].reset_index(drop=True)
#remove quick notes
quick_notes = epileptologist_notes['NOTE_TEXT'].apply(lambda x: 'quick note' in x.lower()[:20])
epileptologist_notes = epileptologist_notes[~quick_notes].reset_index(drop=True)
#what delimiter separates lines in the raw note text?
splitter = " "
max_token_length = 512-20 #512 tokens max, with 20 set aside for questions
hpi_paragraphs_CB = []
hpi_paragraphs_RB = []
#paths to the fine-tuned models; used here for their tokenizers when measuring paragraph length
CB_path = r'<PATH TO CNT-UPenn/Bio_ClinicalBERT_for_seizureFreedom_classification>'
RB_path = r'<PATH TO CNT-UPenn/RoBERTa_for_seizureFrequency_QA>'
for idx, row in epileptologist_notes.iterrows():
    #extract the hpi/interval history paragraph using the CB tokenizer
    hpi_paragraph_CB = get_paragraph_with_max_token_length(hpi_whitelist_regex, hpi_blacklist_regex, row['NOTE_TEXT'], row['NOTE_AUTHOR'], row['MRN'], str(row['VISIT_DATE']), str(row['NOTE_ID']), CB_path, splitter, max_token_length)
    if hpi_paragraph_CB is not None:
        hpi_paragraphs_CB.append(hpi_paragraph_CB)
    #extract the hpi/interval history paragraph using the RB tokenizer
    hpi_paragraph_RB = get_paragraph_with_max_token_length(hpi_whitelist_regex, hpi_blacklist_regex, row['NOTE_TEXT'], row['NOTE_AUTHOR'], row['MRN'], str(row['VISIT_DATE']), str(row['NOTE_ID']), RB_path, splitter, max_token_length)
    if hpi_paragraph_RB is not None:
        hpi_paragraphs_RB.append(hpi_paragraph_RB)
    #progress indicator
    if idx % 5000 == 0:
        print(idx)
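#each extracted datum is a dict: string keys (e.g. 'filename') hold metadata, while
#non-string keys (presumably paragraph indices) hold the extracted paragraph text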
#Initialize the eqa dataset (SQuAD-v2-style) and the classification dataset
eqa_dataset = {'version': 'v2.0', 'data': []}
classification_dataset = []
#what questions are we asking?
classification_q = "Has the patient had recent events?"
eqa_q = ["How often does the patient have events?", "When was the patient's last event?"]
eqa_identifiers = ["PQF", "ELO"]
#format the classification dataset
for datum in hpi_paragraphs_CB:
    for key in datum.keys():
        #skip the string-keyed metadata entries
        if isinstance(key, str):
            continue
        classification_dataset.append({
            'id': datum['filename'] + "_hasSz",
            'passage': datum[key],
            'question': classification_q
        })
        #take only the first paragraph, since later entries may include historical hpi or other sections
        break
#format the eqa dataset
for datum in hpi_paragraphs_RB:
    for key in datum.keys():
        #skip the string-keyed metadata entries
        if isinstance(key, str):
            continue
        #ask both eqa questions of this paragraph
        for i in range(len(eqa_q)):
            eqa_dataset['data'].append({
                'id': datum['filename'] + "_" + eqa_identifiers[i],
                'context': datum[key],
                'question': eqa_q[i]
            })
        #take only the first paragraph, since later entries may include historical hpi or other sections
        break
#save the datasets: classification as json-lines (one example per line), eqa as a single SQuAD-style json
with open(classification_output, 'w') as f:
    for datum in classification_dataset:
        json.dump(datum, f)
        f.write('\n')
with open(EQA_output, 'w') as f:
    json.dump(eqa_dataset, f)
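#the outputs are presumably the prediction inputs for the two models referenced above:
#  classification_dataset.json -> Bio_ClinicalBERT_for_seizureFreedom_classification
#  eqa_dataset.json            -> RoBERTa_for_seizureFrequency_QA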