diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cd0b5a6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.project +jcore-pipelines/detectStopWords/* +jcore-pipelines/detectUMLSentries/* diff --git a/README.md b/README.md index adf0b34..8546bc6 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,28 @@ # JuFiT: filtered dictionaries from UMLS -* Download JuFit from https://github.com/JULIELab/jufit and create the jar file by maven +* Download JuFit from https://github.com/JULIELab/jufit and create the jar file by Maven * run `java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded > UMLS_dict.txt` +* run the script request-jufit.sh for dictionaries of the different semantic groups +* run the script createDics.py to create one large dictionary (adapt the paths in the script before running it) + +# Dictionary Format + +* We use the following format in our dictionaries: + * one line per entry + * fields separated by tabs + +# JCoRe Pipeline +* unpack the *.zip files in jcore-pipelines, there are 2 pipelines: detectUMLSentries and detectStopWords +* put the UMLS dictionary file into jcore-pipelines/detectUMLSentries/resources +* put your analysis text data into data/files (subdirectories are not read, be careful with *.tar files) +* adapt the filenames of the dictionary and the stopword dictionary in the following files: +Einstellung des zu filternden Wörterbuches und des Stopwörterbuches in folgenden Dateien anpassen: + * desc/GazetteerAnnotator Template Descriptor with Configurable External Resource.xml + * descAll/GazetteerAnnotator, Template Descriptor with Configurable External Resource.xml +* open a terminal and change into one of the pipeline directories +* start the pipeline with 'java -jar ../jcore-pipeline-runner-base-0.4.1-SNAPSHOT-cli-assembly.jar run.xml' +* and have a look at the results in + * offsets.tsv + * data/outData/output-xmi diff --git a/extended_script_dictionaries/createDics.py b/extended_script_dictionaries/createDics.py new file 
"""Merge per-category dictionary files into one large tab-separated dictionary.

Every regular file directly inside the configured dictionary directories
contributes the first column of each of its rows.  Each emitted line has the
form ``<entry>\\t<category>\\n``, where the category label is derived from the
name of the file the entry was read from.
"""
import csv
import glob
import os

# Root directory that holds the dictionary sub-directories; adapt before running.
path = '/the/name/of/the/path/with/dictionary/files'

# Name of the merged dictionary, written to the current working directory.
OUTPUT_FILE = 'bic_dic.txt'

# Fragments stripped (in this order) from a file name to obtain its label.
_NAME_NOISE = ('.txt', '.dict', '2019AB-', '-GER')


def _category_name(dic_file):
    """Return the category label for *dic_file*, derived from its base name."""
    name = os.path.basename(dic_file)
    for fragment in _NAME_NOISE:
        name = name.replace(fragment, '')
    return name


def create_big_dic(dic_path, delim, encoding='utf-8'):
    """Merge all dictionary files directly inside *dic_path* into one string.

    Args:
        dic_path: Directory whose files are read; sub-directories are skipped.
        delim: One-character column delimiter of the input files.
        encoding: Text encoding used to read the input files.

    Returns:
        One ``<first column>\\t<category>\\n`` line per non-empty input row,
        concatenated in sorted file-name order.  An empty string is returned
        when the directory is missing or contains no files.
    """
    lines = []
    # Sorted so that the merged dictionary is reproducible between runs.
    for dic in sorted(glob.glob(dic_path + '/*')):
        if not os.path.isfile(dic):
            continue
        name = _category_name(dic)
        # newline='' lets the csv module do its own line-ending handling.
        with open(dic, encoding=encoding, newline='') as tsvfile:
            for row in csv.reader(tsvfile, delimiter=delim):
                # Blank lines are parsed as empty rows and carry no entry.
                if row:
                    lines.append(row[0] + '\t' + name + '\n')
    return ''.join(lines)


def main():
    """Build the merged dictionary from the UMLS and gene directories."""
    print('merge different UMLS dics')

    big_dic_umls = create_big_dic(path + '/UMLS-semantic-group', '|')
    big_dic_gene = create_big_dic(path + '/gene', '\t')

    # An earlier version also wrote a `big_dic_redlist` value that was never
    # assigned, which aborted every run with a NameError; only the two
    # dictionaries that are actually built are written.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as big_dic_file:
        big_dic_file.write(big_dic_umls)
        big_dic_file.write(big_dic_gene)


if __name__ == '__main__':
    main()
JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=DISO > dic/UMLS-2019AB-DISO-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=GENE > dic/UMLS-2019AB-GENE-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=GEOG > dic/UMLS-2019AB-GEOG-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=LIVB > dic/UMLS-2019AB-LIVB-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=OBJC > dic/UMLS-2019AB-OBJC-GER.txt + +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=OCCU > dic/UMLS-2019AB-OCCU-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=ORGA > dic/UMLS-2019AB-ORGA-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=PHEN > dic/UMLS-2019AB-PHEN-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=PHYS > dic/UMLS-2019AB-PHYS-GER.txt +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded --semanticGroup=PROC > dic/UMLS-2019AB-PROC-GER.txt + +java -jar JenaUmlsFilter-1.1-jar-with-dependencies.jar MRCONSO.RRF MRSTY.RRF GER --grounded > dic/UMLS-2019AB-GER.txt + +#Only the following semantic group names are supported: +#ACTI, ANAT, CHEM, CONC, DEVI, +#DISO, GENE, GEOG, LIVB, OBJC, +#OCCU, ORGA, PHEN, PHYS, PROC diff --git a/jcore-pipelines/.gitignore b/jcore-pipelines/.gitignore new file mode 100644 index 0000000..9ff780b --- /dev/null +++ b/jcore-pipelines/.gitignore @@ -0,0 +1,2 @@ +/detectStopWords/ +/detectUMLSentries/ diff --git a/jcore-pipelines/detectStopWords.zip 
b/jcore-pipelines/detectStopWords.zip new file mode 100644 index 0000000..dea435f Binary files /dev/null and b/jcore-pipelines/detectStopWords.zip differ diff --git a/jcore-pipelines/detectUMLSentries.zip b/jcore-pipelines/detectUMLSentries.zip new file mode 100644 index 0000000..eafcddc Binary files /dev/null and b/jcore-pipelines/detectUMLSentries.zip differ diff --git a/jcore-pipelines/jcore-pipeline-builder-cli-0.4.1-SNAPSHOT-jar-with-dependencies.jar b/jcore-pipelines/jcore-pipeline-builder-cli-0.4.1-SNAPSHOT-jar-with-dependencies.jar new file mode 100644 index 0000000..da589f5 Binary files /dev/null and b/jcore-pipelines/jcore-pipeline-builder-cli-0.4.1-SNAPSHOT-jar-with-dependencies.jar differ diff --git a/jcore-pipelines/jcore-pipeline-runner-base-0.4.1-SNAPSHOT-cli-assembly.jar b/jcore-pipelines/jcore-pipeline-runner-base-0.4.1-SNAPSHOT-cli-assembly.jar new file mode 100644 index 0000000..08325af Binary files /dev/null and b/jcore-pipelines/jcore-pipeline-runner-base-0.4.1-SNAPSHOT-cli-assembly.jar differ diff --git a/jcore-pipelines/jcore-pipeline-runner-cpe-0.4.1-SNAPSHOT-jar-with-dependencies.jar b/jcore-pipelines/jcore-pipeline-runner-cpe-0.4.1-SNAPSHOT-jar-with-dependencies.jar new file mode 100644 index 0000000..a7429d8 Binary files /dev/null and b/jcore-pipelines/jcore-pipeline-runner-cpe-0.4.1-SNAPSHOT-jar-with-dependencies.jar differ