Add OBO Foundry lexicon (#14)

biopragmatics · Feb 20, 2024 · edf0859 · edf0859
1 parent e9eadac
commit edf0859
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -899,3 +899,4 @@ FodyWeavers.xsd
 # End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio
 
 scratch/
+lexica/obo/cache
diff --git a/lexica/obo/README.md b/lexica/obo/README.md
@@ -0,0 +1,62 @@
+# OBO Foundry Lexicon
+
+This lexicon contains all the terms from the OBO Foundry ontologies
+(except the Protein Ontology, which currently fails to download).
+
+The following script can be adapted to check new ontologies against existing terms:
+
+```python
+import json
+import gilda
+from urllib.request import urlretrieve
+
+# download the URL until https://github.com/gyorilab/gilda/pull/132
+# is accepted, then the URL can be used in gilda.Grounder directly
+url = "https://github.com/biopragmatics/biolexica/raw/main/lexica/obo/terms.tsv.gz"
+path = "terms.tsv.gz"
+urlretrieve(url, path)
+
+grounder = gilda.Grounder(path)
+
+obo_prefix = ...
+obo_uri_prefix = f"http://purl.obolibrary.org/obo/{obo_prefix}_"
+path_to_obograph_json = ...
+with open(path_to_obograph_json) as file:
+    data = json.load(file)
+
+safe = []
+
+print("## Lexical matching returned results\n")
+for graph in data['graphs']:
+    for node in sorted(graph['nodes'], key=lambda n: n['id']):
+        if node['type'] == "PROPERTY":
+            continue
+        uri = node['id']
+        if not uri.startswith(obo_uri_prefix):
+            continue
+
+        identifier = uri[len(obo_uri_prefix) :]
+        name = node['lbl']
+
+        results = []
+        results.extend(grounder.ground(name))
+        results.extend(
+            scored_match
+            for synonym in node.get("meta", {}).get("synonyms", [])
+            for scored_match in grounder.ground(synonym['val'])
+        )
+
+        if not results:
+            safe.append((identifier, name)) 
+        else:
+            print(f'- f`{obo_prefix}:{identifier}`', name)
+        for res in results:
+            curie = res.term.get_curie()
+            print(f'  - [`{curie}`](https://bioregistry.io/{curie}) {res.term.entry_name} ({round(res.score, 3)})')
+
+print("\n## Lexical matching returned no results\n")
+for identifier, name in safe:
+    print(f'- `{obo_prefix}:{identifier}`', name)
+```
+
+Inspired by https://gist.github.com/cthoyt/d26df3ec12f6a15f3157546c6ebee3a2.
diff --git a/lexica/obo/generate.py b/lexica/obo/generate.py
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+import bioregistry
+from gilda import dump_terms
+from gilda.grounder import load_entries_from_terms_file
+from tqdm import tqdm
+from tqdm.contrib.logging import logging_redirect_tqdm
+
+from biolexica import iter_terms_by_prefix
+
+HERE = Path(__file__).parent.resolve()
+TERMS_OUTPUT_PATH = HERE.joinpath("terms.tsv.gz")
+CACHE = HERE.joinpath("cache")
+CACHE.mkdir(exist_ok=True, parents=True)
+
+
+def main():
+    skip = {"pr"}
+    prefixes = sorted(
+        resource.prefix
+        for resource in bioregistry.resources()
+        if resource.get_obo_preferred_prefix()
+        and not resource.is_deprecated()
+        and not resource.no_own_terms
+        and resource.prefix not in skip
+    )
+
+    all_terms = []
+    for prefix in tqdm(prefixes):
+        path = CACHE.joinpath(prefix).with_suffix(".tsv.gz")
+        if path.is_file():
+            all_terms.extend(load_entries_from_terms_file(path))
+        else:
+            local_terms = list(iter_terms_by_prefix(prefix, processor="bioontologies"))
+            with logging_redirect_tqdm():
+                dump_terms(local_terms, path)
+            all_terms.extend(local_terms)
+
+    dump_terms(all_terms, TERMS_OUTPUT_PATH)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/lexica/obo/terms.tsv.gz b/lexica/obo/terms.tsv.gz
diff --git a/src/biolexica/api.py b/src/biolexica/api.py
@@ -3,7 +3,7 @@
 import logging
 import tempfile
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, List, Literal, Optional, Union, Dict
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Literal, Optional, Union
 from urllib.request import urlretrieve
 
 import bioregistry
@@ -41,7 +41,7 @@
 GrounderHint = Union[gilda.Grounder, str, Path]
 
 
-class Input(BaseModel):
+class Input(BaseModel):  # type:ignore
     """An input towards lexicon assembly."""
 
     processor: Processor
@@ -59,7 +59,7 @@ class Configuration(BaseModel):
     )
 
 
-PREDEFINED = ["cell", "anatomy", "phenotype"]
+PREDEFINED = ["cell", "anatomy", "phenotype", "obo"]
 URL_FMT = "https://github.com/biopragmatics/biolexica/raw/main/lexica/{key}/terms.tsv.gz"
Original file line number	Diff line number	Diff line change
Expand Up		@@ -899,3 +899,4 @@ FodyWeavers.xsd
		# End of https://www.toptal.com/developers/gitignore/api/macos,linux,windows,python,jupyternotebooks,jetbrains,pycharm,vim,emacs,visualstudiocode,visualstudio

		scratch/
		lexica/obo/cache