From 6f5249a8b50194d5b6f94c8fd0094f4fbddd4ee3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 00:51:46 +0200 Subject: [PATCH 01/14] Add data integrity tests for IRIs --- tests/test_data.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index d0257572e..9551fb39a 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -4,6 +4,7 @@ import logging import unittest +from collections import defaultdict import bioregistry from bioregistry.export.rdf_export import resource_to_rdf_str @@ -291,3 +292,26 @@ def test_get_rdf(self): """Test conversion to RDF.""" s = resource_to_rdf_str("chebi") self.assertIsInstance(s, str) + + def test_unique_iris(self): + """Test that all IRIs are unique, or at least there's a mapping to which one is the preferred prefix.""" + prefix_map = bioregistry.get_format_urls() + dd = defaultdict(list) + for prefix, iri in prefix_map.items(): + dd[iri].append(prefix) + + x = {} + for iri, prefixes in dd.items(): + if 1 == len(prefixes): + continue + resources = {prefix: bioregistry.get_resource(prefix) for prefix in prefixes} + parts = {prefix: resource.part_of for prefix, resource in resources.items()} + unmapped = [ + prefix + for prefix, part_of in parts.items() + if part_of is None + ] + if len(unmapped) <= 1: + continue + x[iri] = parts + self.assertEqual({}, x) From 7e0ba379112e95e1a06c84d9ab1ef7354062725c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 01:02:41 +0200 Subject: [PATCH 02/14] Update test --- tests/test_data.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index 9551fb39a..b334b79e9 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -296,15 +296,22 @@ def test_get_rdf(self): def test_unique_iris(self): """Test that all IRIs are unique, or at least there's a mapping to which one is the preferred prefix.""" prefix_map = 
bioregistry.get_format_urls() - dd = defaultdict(list) + dd = defaultdict(dict) for prefix, iri in prefix_map.items(): - dd[iri].append(prefix) + resource = bioregistry.get_resource(prefix) + self.assertIsNotNone(resource) + if resource.provides is not None: + # Don't consider resources that are providing, such as `ctd.gene` + continue + dd[iri][prefix] = resource x = {} - for iri, prefixes in dd.items(): - if 1 == len(prefixes): + for iri, resources in dd.items(): + if 1 == len(resources): + # This is a unique IRI, so no issues continue - resources = {prefix: bioregistry.get_resource(prefix) for prefix in prefixes} + + # Get parts parts = {prefix: resource.part_of for prefix, resource in resources.items()} unmapped = [ prefix @@ -313,5 +320,6 @@ def test_unique_iris(self): ] if len(unmapped) <= 1: continue + x[iri] = parts self.assertEqual({}, x) From 22580a069786d262acf1fd226312024ea34a548c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 01:02:48 +0200 Subject: [PATCH 03/14] Add additional curations --- src/bioregistry/data/bioregistry.json | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index d799e8238..1f851b0f1 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -4326,6 +4326,7 @@ "prefix": "cath.superfamily", "provider_url": "http://www.cathdb.info/cathnode/$1" }, + "part_of": "cath", "prefixcommons": { "formatter": "http://identifiers.org/cath.superfamily/$1", "is_identifiers": true, @@ -9346,7 +9347,8 @@ "prefix": "dpo", "version": "2021-09-02", "version.iri": "http://purl.obolibrary.org/obo/dpo/releases/2021-09-02/dpo.owl" - } + }, + "part_of": "fbcv" }, "dpv": { "mappings": { @@ -14072,7 +14074,8 @@ "is_identifiers": false, "is_obo": true, "prefix": "GEO" - } + }, + "url": "http://purl.obolibrary.org/obo/GEO_$1" }, "gexo": { "bioportal": { @@ -35063,7 +35066,8 @@ "provider_url": 
"https://ccg.epfl.ch/cgi-bin/snp2tfbs/snpviewer_form_parser.cgi?snpid=$1", "sampleId": "rs11603840" }, - "name": "SNP to Transcription Factor Binding Sites" + "name": "SNP to Transcription Factor Binding Sites", + "provides": "dbsnp" }, "so": { "bioportal": { From 8d820b29eefdef1e01414babeab3258096f797b8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 01:13:35 +0200 Subject: [PATCH 04/14] Update test_data.py --- tests/test_data.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index b334b79e9..c1f7ffc2c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -313,11 +313,7 @@ def test_unique_iris(self): # Get parts parts = {prefix: resource.part_of for prefix, resource in resources.items()} - unmapped = [ - prefix - for prefix, part_of in parts.items() - if part_of is None - ] + unmapped = [prefix for prefix, part_of in parts.items() if part_of is None] if len(unmapped) <= 1: continue From 0a46148e28099d199e9361c55eb55aa89d796185 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 01:13:38 +0200 Subject: [PATCH 05/14] Update bioregistry.json --- src/bioregistry/data/bioregistry.json | 68 +++++++++------------------ 1 file changed, 23 insertions(+), 45 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 1f851b0f1..d515a0b28 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -29210,7 +29210,8 @@ "prefix": "ped.ensemble", "provider_url": "https://proteinensemble.org/$1", "sampleId": "PED00017e001" - } + }, + "part_of": "ped" }, "peff": { "deprecated": true, @@ -39281,6 +39282,12 @@ "prefix": "wb", "provider_url": "https://www.wormbase.org/get?name=$1" }, + "ncbi": { + "example": "R13H7", + "homepage": "http://www.wormbase.org/", + "name": "Caenorhabditis elegans Genome Database", + "prefix": "WormBase" + }, "prefixcommons": { "formatter": 
"http://identifiers.org/wb/$1", "is_identifiers": true, @@ -39288,8 +39295,21 @@ "prefix": "WB" }, "synonyms": [ - "WB_REF" - ] + "WB_REF", + "wormbase" + ], + "uniprot": { + "category": "Organism-specific databases", + "formatter": "https://wormbase.org/db/seq/protein?name=%s;class=CDS", + "identifier": "110", + "link_is_explicit": "true", + "name": "WormBase", + "prefix": "WormBase" + }, + "wikidata": { + "database": "Q3570042", + "prefix": "P3860" + } }, "wb.rnai": { "mappings": { @@ -39758,48 +39778,6 @@ "prefix": "WORFDB" } }, - "wormbase": { - "example": "C05G5/12462-12364", - "go": { - "formatter": "http://www.wormbase.org/get?name=$1", - "homepage": "http://www.wormbase.org/", - "name": "WormBase database of nematode biology", - "prefix": "WB_REF" - }, - "homepage": "https://wormbase.org", - "mappings": { - "go": "WB_REF", - "ncbi": "WormBase", - "prefixcommons": "WormBase", - "uniprot": "WormBase", - "uniprot.database": "WormBase" - }, - "name": "WormBase", - "ncbi": { - "example": "R13H7", - "homepage": "http://www.wormbase.org/", - "name": "Caenorhabditis elegans Genome Database", - "prefix": "WormBase" - }, - "prefixcommons": { - "formatter": "https://www.wormbase.org/get?name=$1", - "is_identifiers": false, - "is_obo": false, - "prefix": "WormBase" - }, - "uniprot": { - "category": "Organism-specific databases", - "formatter": "https://wormbase.org/db/seq/protein?name=%s;class=CDS", - "identifier": "110", - "link_is_explicit": "true", - "name": "WormBase", - "prefix": "WormBase" - }, - "wikidata": { - "database": "Q3570042", - "prefix": "P3860" - } - }, "wormpep": { "mappings": { "miriam": "wormpep", From ab0b2730c55cd0d1e6522ef80f32c641e2df4a4c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 01:27:16 +0200 Subject: [PATCH 06/14] Add canonical mapping this needs a better name --- src/bioregistry/data/bioregistry.json | 3 +++ src/bioregistry/schema/struct.py | 2 ++ tests/test_data.py | 9 ++++++++- 3 files changed, 13 
insertions(+), 1 deletion(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index d515a0b28..0f59fae1a 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -18280,6 +18280,7 @@ "prefix": "insdc.cds", "provider_url": "https://www.ncbi.nlm.nih.gov/protein/$1" }, + "pmapto": "ncbiprotein", "prefixcommons": { "formatter": "http://identifiers.org/insdc.cds/$1", "is_identifiers": true, @@ -18334,6 +18335,7 @@ "homepage": "https://www.insdc.org/", "name": "International Nucleotide Sequence Database Collaboration (INSDC) Run", "pattern": "^(E|D|S)RR[0-9]{6,}$", + "pmapto": "ena.embl", "url": "https://www.ebi.ac.uk/ena/browser/view/$1" }, "insdc.sra": { @@ -32487,6 +32489,7 @@ "provider_url": "https://www.ncbi.nlm.nih.gov/protein/$1" }, "name": "Reference Sequence Collection", + "pmapto": "ncbiprotein", "prefixcommons": { "formatter": "http://www.ncbi.nlm.nih.gov/refseq/?term=$1", "is_identifiers": false, diff --git a/src/bioregistry/schema/struct.py b/src/bioregistry/schema/struct.py index 83eef3c9a..93300031f 100644 --- a/src/bioregistry/schema/struct.py +++ b/src/bioregistry/schema/struct.py @@ -116,6 +116,8 @@ class Resource(BaseModel): contributor: Optional[Author] #: Set to true if this database is proprietary. If missing, assume it's not. 
proprietary: Optional[bool] + #: If this shares an IRI with another entry, maps to which should be used + pmapto: Optional[str] # Registry-specific data miriam: Optional[Mapping[str, Any]] diff --git a/tests/test_data.py b/tests/test_data.py index c1f7ffc2c..08dfd1dfa 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -317,5 +317,12 @@ def test_unique_iris(self): if len(unmapped) <= 1: continue - x[iri] = parts + # Get pmaps + pmaptos = {prefix: resource.pmapto for prefix, resource in resources.items()} + canonical = [prefix for prefix, pmapto in pmaptos.items() if pmapto is None] + targets = list({pmapto for prefix, pmapto in pmaptos.items() if pmapto is not None}) + if len(canonical) == 1 and len(targets) == 1 and canonical[0] == targets[0]: + continue + + x[iri] = canonical, targets self.assertEqual({}, x) From cefe6a0d9ce457216a8f6cbe6988beb82db4a814 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 10:46:37 +0200 Subject: [PATCH 07/14] Add more curations --- src/bioregistry/data/bioregistry.json | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index 0f59fae1a..f7dc71d68 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -4362,6 +4362,7 @@ "prefix": "cattleqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/BT/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/cattleqtldb/$1", "is_identifiers": true, @@ -5449,6 +5450,7 @@ "prefix": "chickenqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/GG/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/chickenqtldb/$1", "is_identifiers": true, @@ -14244,6 +14246,7 @@ } }, "glycomedb": { + "comment": "this is exactly the same as glytoucan. 
Idk why there are two different ones", "mappings": { "miriam": "glycomedb", "n2t": "glycomedb", @@ -14271,6 +14274,7 @@ "prefix": "glycomedb", "provider_url": "https://glytoucan.org/Structures/Glycans/$1" }, + "pmapto": "glytoucan", "prefixcommons": { "formatter": "http://identifiers.org/glycomedb/$1", "is_identifiers": true, @@ -24094,6 +24098,13 @@ "version": "1.0.1" } }, + "multicellds": { + "description": "MultiCellDS is data standard for multicellular simulation, experimental, and clinical data. A digital cell line is a hierarchical organization of quantitative phenotype data for a single biological cell line, including the microenvironmental context of the measurements and essential metadata.", + "example": "MCDS_S_0000000001", + "homepage": "http://multicellds.org/MultiCellDB.php", + "name": "MultiCellDS", + "url": "http://multicellds.org/MultiCellDB/$1" + }, "multicellds.cell_line": { "mappings": { "miriam": "multicellds.cell_line", @@ -24122,6 +24133,7 @@ "prefix": "multicellds.cell_line", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.cell_line/$1", "is_identifiers": true, @@ -24157,6 +24169,7 @@ "prefix": "multicellds.collection", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.collection/$1", "is_identifiers": true, @@ -24192,6 +24205,7 @@ "prefix": "multicellds.snapshot", "provider_url": "http://multicellds.org/MultiCellDB/$1" }, + "part_of": "multicellds", "prefixcommons": { "formatter": "http://identifiers.org/multicellds.snapshot/$1", "is_identifiers": true, @@ -30073,6 +30087,7 @@ "prefix": "pigqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/SS/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/pigqtldb/$1", "is_identifiers": true, @@ -34470,6 +34485,7 @@ "prefix": 
"sheepqtldb", "provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/OA/qdetails?QTL_ID=$1" }, + "part_of": "qtldb", "prefixcommons": { "formatter": "http://identifiers.org/sheepqtldb/$1", "is_identifiers": true, @@ -36137,6 +36153,7 @@ } }, "tair.gene": { + "example": "2200934", "mappings": { "miriam": "tair.gene", "n2t": "tair.gene", @@ -36164,12 +36181,14 @@ "prefix": "tair.gene", "provider_url": "http://arabidopsis.org/servlets/TairObject?accession=$1" }, + "pattern": "^\\d{7}$", "prefixcommons": { "formatter": "http://identifiers.org/tair.gene/$1", "is_identifiers": true, "is_obo": false, "prefix": "TAIR.GENE" - } + }, + "url": "http://arabidopsis.org/servlets/TairObject?accession=Gene:$1" }, "tair.locus": { "go": { @@ -36220,6 +36239,7 @@ } }, "tair.protein": { + "example": "1009107926", "mappings": { "miriam": "tair.protein", "n2t": "tair.protein", @@ -36247,12 +36267,14 @@ "prefix": "tair.protein", "provider_url": "http://arabidopsis.org/servlets/TairObject?accession=$1" }, + "pattern": "^\\d{10}$", "prefixcommons": { "formatter": "http://identifiers.org/tair.protein/$1", "is_identifiers": true, "is_obo": false, "prefix": "TAIR.PROTEIN" - } + }, + "url": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:$1" }, "tao": { "bioportal": { From cc56d6c6333f1fb0f97820c504d5824c84560adb Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 10:50:51 +0200 Subject: [PATCH 08/14] Pass tests --- src/bioregistry/data/bioregistry.json | 2 ++ tests/test_data.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index f7dc71d68..bd3f1ed72 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -28961,6 +28961,7 @@ } }, "pdb-ccd": { + "comment": "might be same as pdb.ligand, not sure though", "mappings": { "miriam": "pdb-ccd", "n2t": "pdb-ccd", @@ -28988,6 +28989,7 @@ "prefix": "pdb-ccd", "provider_url": 
"https://www.ebi.ac.uk/pdbe-srv/pdbechem/chemicalCompound/show/$1" }, + "pmapto": "pdb.ligand", "prefixcommons": { "formatter": "http://identifiers.org/pdb-ccd/$1", "is_identifiers": true, diff --git a/tests/test_data.py b/tests/test_data.py index 08dfd1dfa..52d465dae 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -59,6 +59,7 @@ def test_keys(self): "comment", "contributor", "proprietary", + "pmapto", } keys.update(bioregistry.read_metaregistry()) for prefix, entry in self.registry.items(): From aada6ca17c6168f58739675505b292d2995d8831 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 10:57:52 +0200 Subject: [PATCH 09/14] Give better name to property --- src/bioregistry/data/bioregistry.json | 10 +++++----- src/bioregistry/schema/struct.py | 4 ++-- tests/test_data.py | 14 +++++++------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/bioregistry/data/bioregistry.json b/src/bioregistry/data/bioregistry.json index bd3f1ed72..1eec7e11f 100644 --- a/src/bioregistry/data/bioregistry.json +++ b/src/bioregistry/data/bioregistry.json @@ -14247,6 +14247,7 @@ }, "glycomedb": { "comment": "this is exactly the same as glytoucan. 
Idk why there are two different ones", + "has_canonical": "glytoucan", "mappings": { "miriam": "glycomedb", "n2t": "glycomedb", @@ -14274,7 +14275,6 @@ "prefix": "glycomedb", "provider_url": "https://glytoucan.org/Structures/Glycans/$1" }, - "pmapto": "glytoucan", "prefixcommons": { "formatter": "http://identifiers.org/glycomedb/$1", "is_identifiers": true, @@ -18257,6 +18257,7 @@ } }, "insdc.cds": { + "has_canonical": "ncbiprotein", "mappings": { "miriam": "insdc.cds", "n2t": "insdc.cds", @@ -18284,7 +18285,6 @@ "prefix": "insdc.cds", "provider_url": "https://www.ncbi.nlm.nih.gov/protein/$1" }, - "pmapto": "ncbiprotein", "prefixcommons": { "formatter": "http://identifiers.org/insdc.cds/$1", "is_identifiers": true, @@ -18336,10 +18336,10 @@ }, "description": "An experimental run, served thrugh the ENA", "example": "ERR436051", + "has_canonical": "ena.embl", "homepage": "https://www.insdc.org/", "name": "International Nucleotide Sequence Database Collaboration (INSDC) Run", "pattern": "^(E|D|S)RR[0-9]{6,}$", - "pmapto": "ena.embl", "url": "https://www.ebi.ac.uk/ena/browser/view/$1" }, "insdc.sra": { @@ -28962,6 +28962,7 @@ }, "pdb-ccd": { "comment": "might be same as pdb.ligand, not sure though", + "has_canonical": "pdb.ligand", "mappings": { "miriam": "pdb-ccd", "n2t": "pdb-ccd", @@ -28989,7 +28990,6 @@ "prefix": "pdb-ccd", "provider_url": "https://www.ebi.ac.uk/pdbe-srv/pdbechem/chemicalCompound/show/$1" }, - "pmapto": "pdb.ligand", "prefixcommons": { "formatter": "http://identifiers.org/pdb-ccd/$1", "is_identifiers": true, @@ -32475,6 +32475,7 @@ "name": "RefSeq", "prefix": "RefSeq" }, + "has_canonical": "ncbiprotein", "mappings": { "go": "RefSeq", "miriam": "refseq", @@ -32506,7 +32507,6 @@ "provider_url": "https://www.ncbi.nlm.nih.gov/protein/$1" }, "name": "Reference Sequence Collection", - "pmapto": "ncbiprotein", "prefixcommons": { "formatter": "http://www.ncbi.nlm.nih.gov/refseq/?term=$1", "is_identifiers": false, diff --git 
a/src/bioregistry/schema/struct.py b/src/bioregistry/schema/struct.py index 93300031f..512b985e3 100644 --- a/src/bioregistry/schema/struct.py +++ b/src/bioregistry/schema/struct.py @@ -116,8 +116,8 @@ class Resource(BaseModel): contributor: Optional[Author] #: Set to true if this database is proprietary. If missing, assume it's not. proprietary: Optional[bool] - #: If this shares an IRI with another entry, maps to which should be used - pmapto: Optional[str] + #: If this shares an IRI with another entry, maps to which should be considered as canonical + has_canonical: Optional[str] # Registry-specific data miriam: Optional[Mapping[str, Any]] diff --git a/tests/test_data.py b/tests/test_data.py index 52d465dae..d86ecbd04 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -59,7 +59,7 @@ def test_keys(self): "comment", "contributor", "proprietary", - "pmapto", + "has_canonical", } keys.update(bioregistry.read_metaregistry()) for prefix, entry in self.registry.items(): @@ -318,12 +318,12 @@ def test_unique_iris(self): if len(unmapped) <= 1: continue - # Get pmaps - pmaptos = {prefix: resource.pmapto for prefix, resource in resources.items()} - canonical = [prefix for prefix, pmapto in pmaptos.items() if pmapto is None] - targets = list({pmapto for prefix, pmapto in pmaptos.items() if pmapto is not None}) - if len(canonical) == 1 and len(targets) == 1 and canonical[0] == targets[0]: + # Get canonical + canonicals = {prefix: resource.has_canonical for prefix, resource in resources.items()} + canonical_target = [prefix for prefix, target in canonicals.items() if target is None] + all_targets = list({target for prefix, target in canonicals.items() if target is not None}) + if len(canonical_target) == 1 and len(all_targets) == 1 and canonical_target[0] == all_targets[0]: continue - x[iri] = canonical, targets + x[iri] = parts, unmapped, canonical_target, all_targets self.assertEqual({}, x) From c926e3552773110f06775eccea1360fae2dfc1b8 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 11:02:02 +0200 Subject: [PATCH 10/14] Add RDF schema annotations --- src/bioregistry/export/rdf_export.py | 4 ++++ src/bioregistry/schema/constants.py | 2 ++ src/bioregistry/schema/schema.json | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/src/bioregistry/export/rdf_export.py b/src/bioregistry/export/rdf_export.py index c30ce4008..b0c83491e 100644 --- a/src/bioregistry/export/rdf_export.py +++ b/src/bioregistry/export/rdf_export.py @@ -187,6 +187,10 @@ def _add_resource(data, *, graph: Optional[rdflib.Graph] = None) -> Tuple[rdflib if provides: graph.add((node, bioregistry_schema["providesFor"], bioregistry_resource[provides])) + canonical = data.get("has_canonical") + if canonical: + graph.add((node, bioregistry_schema["hasCanonical"], bioregistry_resource[canonical])) + # TODO add contributor if it's available graph.add( diff --git a/src/bioregistry/schema/constants.py b/src/bioregistry/schema/constants.py index c530f76d5..3de605107 100644 --- a/src/bioregistry/schema/constants.py +++ b/src/bioregistry/schema/constants.py @@ -40,6 +40,8 @@ "hasMapping": "A property whose subject is a resource and object is a mapping", "hasRegistry": "A property whose subject is a mapping and object is a metaresource.", "hasMetaidentifier": "A property whose subject is a mapping and object is an identifier string.", + "hasCanonical": "A property connecting two prefixes that share an IRI where the subject is " + "the non-preferred prefix and the target is the preferred prefix", } bioregistry_collection = rdflib.namespace.Namespace("https://bioregistry.io/collection/") bioregistry_resource = rdflib.namespace.Namespace("https://bioregistry.io/registry/") diff --git a/src/bioregistry/schema/schema.json b/src/bioregistry/schema/schema.json index 6c87344cf..4073ac2fb 100644 --- a/src/bioregistry/schema/schema.json +++ b/src/bioregistry/schema/schema.json @@ -181,6 +181,10 @@ "title": "Proprietary", "type": "boolean" }, + 
"has_canonical": { + "title": "Has Canonical", + "type": "string" + }, "miriam": { "title": "Miriam", "type": "object" From 9d62fc00b71ef13005976322154704565abc4dc4 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 11:02:05 +0200 Subject: [PATCH 11/14] Lint --- tests/test_data.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index d86ecbd04..7e792b111 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -321,8 +321,14 @@ def test_unique_iris(self): # Get canonical canonicals = {prefix: resource.has_canonical for prefix, resource in resources.items()} canonical_target = [prefix for prefix, target in canonicals.items() if target is None] - all_targets = list({target for prefix, target in canonicals.items() if target is not None}) - if len(canonical_target) == 1 and len(all_targets) == 1 and canonical_target[0] == all_targets[0]: + all_targets = list( + {target for prefix, target in canonicals.items() if target is not None} + ) + if ( + len(canonical_target) == 1 + and len(all_targets) == 1 + and canonical_target[0] == all_targets[0] + ): continue x[iri] = parts, unmapped, canonical_target, all_targets From 8d2fa6b0225fdbbce42331213c287b7ff6097484 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 11:16:20 +0200 Subject: [PATCH 12/14] Update tsv_export.py --- src/bioregistry/export/tsv_export.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/bioregistry/export/tsv_export.py b/src/bioregistry/export/tsv_export.py index 3e1fb1fa1..0878681a3 100644 --- a/src/bioregistry/export/tsv_export.py +++ b/src/bioregistry/export/tsv_export.py @@ -73,6 +73,7 @@ def export_tsv(): *METAPREFIXES, "part_of", "provides", + "has_canonical", # 'type', ] @@ -139,6 +140,7 @@ def get_registry_rows(): # '|'.join(data.get('appears_in', [])), data.part_of, data.provides, + data.has_canonical, # data.get('type'), # TODO could add more, especially mappings ) From 
53e8ca6f1c2f489f5d8cf244a313019dd913c671 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 11:16:46 +0200 Subject: [PATCH 13/14] Update upload_ndex.py --- src/bioregistry/upload_ndex.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/bioregistry/upload_ndex.py b/src/bioregistry/upload_ndex.py index cccdcef9d..0ec18afe3 100644 --- a/src/bioregistry/upload_ndex.py +++ b/src/bioregistry/upload_ndex.py @@ -53,6 +53,12 @@ def upload(): target=resource_nodes[target], interaction="provides", ) + if entry.has_canonical: + cx.add_edge( + source=resource_nodes[prefix], + target=resource_nodes[entry.has_canonical], + interaction="has_canonical", + ) # Which registries does it map to? for metaprefix in metaregistry: From d7546ccb12e0dd607a1c18a85e85c84371d3e0cd Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Fri, 24 Sep 2021 11:17:17 +0200 Subject: [PATCH 14/14] Update test_data.py --- tests/test_data.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index 7e792b111..252a0aaad 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -294,6 +294,22 @@ def test_get_rdf(self): s = resource_to_rdf_str("chebi") self.assertIsInstance(s, str) + def test_provides(self): + """Make sure all provides relations point to valid prefixes.""" + for prefix, resource in self.registry.items(): + if resource.provides is None: + continue + with self.subTest(prefix=prefix): + self.assertIn(resource.provides, self.registry) + + def test_has_canonical(self): + """Make sure all has_canonical relations point to valid prefixes.""" + for prefix, resource in self.registry.items(): + if resource.has_canonical is None: + continue + with self.subTest(prefix=prefix): + self.assertIn(resource.has_canonical, self.registry) + def test_unique_iris(self): """Test that all IRIs are unique, or at least there's a mapping to which one is the preferred prefix.""" prefix_map = bioregistry.get_format_urls()