diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index d799e8238..1eec7e11f 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -4326,6 +4326,7 @@ "prefix": "cath.superfamily", "provider_url": "http://www.cathdb.info/cathnode/$1" }, + "part_of": "cath", "prefixcommons": { "formatter": "http://identifiers.org/cath.superfamily/$1", "is_identifiers": true, @@ -4361,6 +4362,7 @@ "prefix": "cattleqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/BT/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/cattleqtldb/$1", "is_identifiers": true, @@ -5448,6 +5450,7 @@ "prefix": "chickenqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/GG/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/chickenqtldb/$1", "is_identifiers": true, @@ -9346,7 +9349,8 @@ "prefix": "dpo", "version": "2021-09-02", "version.iri": "http://purl.obolibrary.org/obo/dpo/releases/2021-09-02/dpo.owl" - } + }, + "part_of": "fbcv" }, "dpv": { "mappings": { @@ -14072,7 +14076,8 @@ "is_identifiers": false, "is_obo": true, "prefix": "GEO" - } + }, + "url": "http://purl.obolibrary.org/obo/GEO_$1" }, "gexo": { "bioportal": { @@ -14241,6 +14246,8 @@ } }, "glycomedb": { + "comment": "this is exactly the same as glytoucan. 
Idk why there are two different ones", + "has_canonical": "glytoucan", "mappings": { "miriam": "glycomedb", "n2t": "glycomedb", @@ -18250,6 +18257,7 @@ } }, "insdc.cds": { + "has_canonical": "ncbiprotein", "mappings": { "miriam": "insdc.cds", "n2t": "insdc.cds", @@ -18328,6 +18336,7 @@ }, "description": "An experimental run, served thrugh the ENA", "example": "ERR436051", + "has_canonical": "ena.embl", "homepage": "https://www.insdc.org/", "name": "International Nucleotide Sequence Database Collaboration (INSDC) Run", "pattern": "^(E|D|S)RR[0-9]{6,}$", @@ -24089,6 +24098,13 @@ "version": "1.0.1" } }, + "multicellds": { + "description": "MultiCellDS is data standard for multicellular simulation, experimental, and clinical data. A digital cell line is a hierarchical organization of quantitative phenotype data for a single biological cell line, including the microenvironmental context of the measurements and essential metadata.", + "example": "MCDS_S_0000000001", + "homepage": "http://multicellds.org/MultiCellDB.php", + "name": "MultiCellDS", + "url": "http://multicellds.org/MultiCellDB/$1" + }, "multicellds.cell_line": { "mappings": { "miriam": "multicellds.cell_line", @@ -24117,6 +24133,7 @@ "prefix": "multicellds.cell_line", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.cell_line/$1", "is_identifiers": true, @@ -24152,6 +24169,7 @@ "prefix": "multicellds.collection", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.collection/$1", "is_identifiers": true, @@ -24187,6 +24205,7 @@ "prefix": "multicellds.snapshot", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.snapshot/$1", "is_identifiers": true, @@ -28942,6 +28961,8 @@ } }, "pdb-ccd": { + "comment": 
"might be same as pdb.ligand, not sure though", + "has_canonical": "pdb.ligand", "mappings": { "miriam": "pdb-ccd", "n2t": "pdb-ccd", @@ -29207,7 +29228,8 @@ "prefix": "ped.ensemble", "provider_url": "https://proteinensemble.org/$1", "sampleId": "PED00017e001" - } + }, + "part_of": "ped" }, "peff": { "deprecated": true, @@ -30067,6 +30089,7 @@ "prefix": "pigqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/SS/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/pigqtldb/$1", "is_identifiers": true, @@ -32452,6 +32475,7 @@ "name": "RefSeq", "prefix": "RefSeq" }, + "has_canonical": "ncbiprotein", "mappings": { "go": "RefSeq", "miriam": "refseq", @@ -34463,6 +34487,7 @@ "prefix": "sheepqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/OA/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/sheepqtldb/$1", "is_identifiers": true, @@ -35063,7 +35088,8 @@ "provider_url": "https://ccg.epfl.ch/cgi-bin/snp2tfbs/snpviewer_form_parser.cgi?snpid=$1", "sampleId": "rs11603840" }, - "name": "SNP to Transcription Factor Binding Sites" + "name": "SNP to Transcription Factor Binding Sites", + "provides": "dbsnp" }, "so": { "bioportal": { @@ -36129,6 +36155,7 @@ } }, "tair.gene": { + "example": "2200934", "mappings": { "miriam": "tair.gene", "n2t": "tair.gene", @@ -36156,12 +36183,14 @@ "prefix": "tair.gene", "provider_url": "http://arabidopsis.org/servlets/TairObject?accession=$1" }, + "pattern": "^\\d{7}$", "prefixcommons": { "formatter": "http://identifiers.org/tair.gene/$1", "is_identifiers": true, "is_obo": false, "prefix": "TAIR.GENE" - } + }, + "url": "http://arabidopsis.org/servlets/TairObject?accession=Gene:$1" }, "tair.locus": { "go": { @@ -36212,6 +36241,7 @@ } }, "tair.protein": { + "example": "1009107926", "mappings": { "miriam": "tair.protein", "n2t": "tair.protein", @@ -36239,12 +36269,14 @@ "prefix": "tair.protein", "provider_url": 
"http://arabidopsis.org/servlets/TairObject?accession=$1" }, + "pattern": "^\\d{10}$", "prefixcommons": { "formatter": "http://identifiers.org/tair.protein/$1", "is_identifiers": true, "is_obo": false, "prefix": "TAIR.PROTEIN" - } + }, + "url": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:$1" }, "tao": { "bioportal": { @@ -39277,6 +39309,12 @@ "prefix": "wb", "provider_url": "https://www.wormbase.org/get?name=$1" }, + "ncbi": { + "example": "R13H7", + "homepage": "http://www.wormbase.org/", + "name": "Caenorhabditis elegans Genome Database", + "prefix": "WormBase" + }, "prefixcommons": { "formatter": "http://identifiers.org/wb/$1", "is_identifiers": true, @@ -39284,8 +39322,21 @@ "prefix": "WB" }, "synonyms": [ - "WB_REF" - ] + "WB_REF", + "wormbase" + ], + "uniprot": { + "category": "Organism-specific databases", + "formatter": "https://wormbase.org/db/seq/protein?name=%s;class=CDS", + "identifier": "110", + "link_is_explicit": "true", + "name": "WormBase", + "prefix": "WormBase" + }, + "wikidata": { + "database": "Q3570042", + "prefix": "P3860" + } }, "wb.rnai": { "mappings": { @@ -39754,48 +39805,6 @@ "prefix": "WORFDB" } }, - "wormbase": { - "example": "C05G5/12462-12364", - "go": { - "formatter": "http://www.wormbase.org/get?name=$1", - "homepage": "http://www.wormbase.org/", - "name": "WormBase database of nematode biology", - "prefix": "WB_REF" - }, - "homepage": "https://wormbase.org", - "mappings": { - "go": "WB_REF", - "ncbi": "WormBase", - "prefixcommons": "WormBase", - "uniprot": "WormBase", - "uniprot.database": "WormBase" - }, - "name": "WormBase", - "ncbi": { - "example": "R13H7", - "homepage": "http://www.wormbase.org/", - "name": "Caenorhabditis elegans Genome Database", - "prefix": "WormBase" - }, - "prefixcommons": { - "formatter": "https://www.wormbase.org/get?name=$1", - "is_identifiers": false, - "is_obo": false, - "prefix": "WormBase" - }, - "uniprot": { - "category": "Organism-specific databases", - "formatter": 
"https://wormbase.org/db/seq/protein?name=%s;class=CDS", - "identifier": "110", - "link_is_explicit": "true", - "name": "WormBase", - "prefix": "WormBase" - }, - "wikidata": { - "database": "Q3570042", - "prefix": "P3860" - } - }, "wormpep": { "mappings": { "miriam": "wormpep", diff --git a/src/bioregistry/export/rdf_export.py b/src/bioregistry/export/rdf_export.py index c30ce4008..b0c83491e 100644 --- a/src/bioregistry/export/rdf_export.py +++ b/src/bioregistry/export/rdf_export.py @@ -187,6 +187,10 @@ def _add_resource(data, *, graph: Optional[rdflib.Graph] = None) -> Tuple[rdflib if provides: graph.add((node, bioregistry_schema["providesFor"], bioregistry_resource[provides])) + canonical = data.get("has_canonical") + if canonical: + graph.add((node, bioregistry_schema["hasCanonical"], bioregistry_resource[canonical])) + # TODO add contributor if it's available graph.add( diff --git a/src/bioregistry/export/tsv_export.py b/src/bioregistry/export/tsv_export.py index 3e1fb1fa1..0878681a3 100644 --- a/src/bioregistry/export/tsv_export.py +++ b/src/bioregistry/export/tsv_export.py @@ -73,6 +73,7 @@ def export_tsv(): *METAPREFIXES, "part_of", "provides", + "has_canonical", # 'type', ] @@ -139,6 +140,7 @@ def get_registry_rows(): # '|'.join(data.get('appears_in', [])), data.part_of, data.provides, + data.has_canonical, # data.get('type'), # TODO could add more, especially mappings ) diff --git a/src/bioregistry/schema/constants.py b/src/bioregistry/schema/constants.py index c530f76d5..3de605107 100644 --- a/src/bioregistry/schema/constants.py +++ b/src/bioregistry/schema/constants.py @@ -40,6 +40,8 @@ "hasMapping": "A property whose subject is a resource and object is a mapping", "hasRegistry": "A property whose subject is a mapping and object is a metaresource.", "hasMetaidentifier": "A property whose subject is a mapping and object is an identifier string.", + "hasCanonical": "A property connecting two prefixes that share an IRI where the subject is " + "the 
non-preferred prefix and the target is the preferred prefix", } bioregistry_collection = rdflib.namespace.Namespace("https://bioregistry.io/collection/") bioregistry_resource = rdflib.namespace.Namespace("https://bioregistry.io/registry/") diff --git a/src/bioregistry/schema/schema.json b/src/bioregistry/schema/schema.json index 6c87344cf..4073ac2fb 100644 --- a/src/bioregistry/schema/schema.json +++ b/src/bioregistry/schema/schema.json @@ -181,6 +181,10 @@ "title": "Proprietary", "type": "boolean" }, + "has_canonical": { + "title": "Has Canonical", + "type": "string" + }, "miriam": { "title": "Miriam", "type": "object" diff --git a/src/bioregistry/schema/struct.py b/src/bioregistry/schema/struct.py index 83eef3c9a..512b985e3 100644 --- a/src/bioregistry/schema/struct.py +++ b/src/bioregistry/schema/struct.py @@ -116,6 +116,8 @@ class Resource(BaseModel): contributor: Optional[Author] #: Set to true if this database is proprietary. If missing, assume it's not. proprietary: Optional[bool] + #: If this shares an IRI with another entry, the prefix of the entry that should be considered canonical + has_canonical: Optional[str] # Registry-specific data miriam: Optional[Mapping[str, Any]] diff --git a/src/bioregistry/upload_ndex.py b/src/bioregistry/upload_ndex.py index cccdcef9d..0ec18afe3 100644 --- a/src/bioregistry/upload_ndex.py +++ b/src/bioregistry/upload_ndex.py @@ -53,6 +53,12 @@ def upload(): target=resource_nodes[target], interaction="provides", ) + if entry.has_canonical: + cx.add_edge( + source=resource_nodes[prefix], + target=resource_nodes[entry.has_canonical], + interaction="has_canonical", + ) # Which registries does it map to? 
for metaprefix in metaregistry: diff --git a/tests/test_data.py b/tests/test_data.py index d0257572e..252a0aaad 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -4,6 +4,7 @@ import logging import unittest +from collections import defaultdict import bioregistry from bioregistry.export.rdf_export import resource_to_rdf_str @@ -58,6 +59,7 @@ def test_keys(self): "comment", "contributor", "proprietary", + "has_canonical", } keys.update(bioregistry.read_metaregistry()) for prefix, entry in self.registry.items(): @@ -291,3 +293,59 @@ def test_get_rdf(self): """Test conversion to RDF.""" s = resource_to_rdf_str("chebi") self.assertIsInstance(s, str) + + def test_provides(self): + """Make sure all provides relations point to valid prefixes.""" + for prefix, resource in self.registry.items(): + if resource.provides is None: + continue + with self.subTest(prefix=prefix): + self.assertIn(resource.provides, self.registry) + + def test_has_canonical(self): + """Make sure all has_canonical relations point to valid prefixes.""" + for prefix, resource in self.registry.items(): + if resource.has_canonical is None: + continue + with self.subTest(prefix=prefix): + self.assertIn(resource.has_canonical, self.registry) + + def test_unique_iris(self): + """Test that all IRIs are unique, or at least there's a mapping to which one is the preferred prefix.""" + prefix_map = bioregistry.get_format_urls() + dd = defaultdict(dict) + for prefix, iri in prefix_map.items(): + resource = bioregistry.get_resource(prefix) + self.assertIsNotNone(resource) + if resource.provides is not None: + # Don't consider resources that are providing, such as `ctd.gene` + continue + dd[iri][prefix] = resource + + x = {} + for iri, resources in dd.items(): + if 1 == len(resources): + # This is a unique IRI, so no issues + continue + + # Get parts + parts = {prefix: resource.part_of for prefix, resource in resources.items()} + unmapped = [prefix for prefix, part_of in parts.items() if part_of is None] + 
if len(unmapped) <= 1: + continue + + # Get canonical + canonicals = {prefix: resource.has_canonical for prefix, resource in resources.items()} + canonical_target = [prefix for prefix, target in canonicals.items() if target is None] + all_targets = list( + {target for prefix, target in canonicals.items() if target is not None} + ) + if ( + len(canonical_target) == 1 + and len(all_targets) == 1 + and canonical_target[0] == all_targets[0] + ): + continue + + x[iri] = parts, unmapped, canonical_target, all_targets + self.assertEqual({}, x)