Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add data integrity tests for unique IRIs #164

Merged
merged 14 commits into from
Sep 24, 2021
109 changes: 59 additions & 50 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -4326,6 +4326,7 @@
"prefix": "cath.superfamily",
"provider_url": "http://www.cathdb.info/cathnode/$1"
},
"part_of": "cath",
"prefixcommons": {
"formatter": "http://identifiers.org/cath.superfamily/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -4361,6 +4362,7 @@
"prefix": "cattleqtldb",
"provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/BT/qdetails?QTL_ID=$1"
},
"part_of": "qtldb",
"prefixcommons": {
"formatter": "http://identifiers.org/cattleqtldb/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -5448,6 +5450,7 @@
"prefix": "chickenqtldb",
"provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/GG/qdetails?QTL_ID=$1"
},
"part_of": "qtldb",
"prefixcommons": {
"formatter": "http://identifiers.org/chickenqtldb/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -9346,7 +9349,8 @@
"prefix": "dpo",
"version": "2021-09-02",
"version.iri": "http://purl.obolibrary.org/obo/dpo/releases/2021-09-02/dpo.owl"
}
},
"part_of": "fbcv"
},
"dpv": {
"mappings": {
Expand Down Expand Up @@ -14072,7 +14076,8 @@
"is_identifiers": false,
"is_obo": true,
"prefix": "GEO"
}
},
"url": "http://purl.obolibrary.org/obo/GEO_$1"
},
"gexo": {
"bioportal": {
Expand Down Expand Up @@ -14241,6 +14246,8 @@
}
},
"glycomedb": {
"comment": "this is exactly the same as glytoucan. Idk why there are two different ones",
"has_canonical": "glytoucan",
"mappings": {
"miriam": "glycomedb",
"n2t": "glycomedb",
Expand Down Expand Up @@ -18250,6 +18257,7 @@
}
},
"insdc.cds": {
"has_canonical": "ncbiprotein",
"mappings": {
"miriam": "insdc.cds",
"n2t": "insdc.cds",
Expand Down Expand Up @@ -18328,6 +18336,7 @@
},
"description": "An experimental run, served through the ENA",
"example": "ERR436051",
"has_canonical": "ena.embl",
"homepage": "https://www.insdc.org/",
"name": "International Nucleotide Sequence Database Collaboration (INSDC) Run",
"pattern": "^(E|D|S)RR[0-9]{6,}$",
Expand Down Expand Up @@ -24089,6 +24098,13 @@
"version": "1.0.1"
}
},
"multicellds": {
"description": "MultiCellDS is a data standard for multicellular simulation, experimental, and clinical data. A digital cell line is a hierarchical organization of quantitative phenotype data for a single biological cell line, including the microenvironmental context of the measurements and essential metadata.",
"example": "MCDS_S_0000000001",
"homepage": "http://multicellds.org/MultiCellDB.php",
"name": "MultiCellDS",
"url": "http://multicellds.org/MultiCellDB/$1"
},
"multicellds.cell_line": {
"mappings": {
"miriam": "multicellds.cell_line",
Expand Down Expand Up @@ -24117,6 +24133,7 @@
"prefix": "multicellds.cell_line",
"provider_url": "http://multicellds.org/MultiCellDB/$1"
},
"part_of": "multicellds",
"prefixcommons": {
"formatter": "http://identifiers.org/multicellds.cell_line/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -24152,6 +24169,7 @@
"prefix": "multicellds.collection",
"provider_url": "http://multicellds.org/MultiCellDB/$1"
},
"part_of": "multicellds",
"prefixcommons": {
"formatter": "http://identifiers.org/multicellds.collection/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -24187,6 +24205,7 @@
"prefix": "multicellds.snapshot",
"provider_url": "http://multicellds.org/MultiCellDB/$1"
},
"part_of": "multicellds",
"prefixcommons": {
"formatter": "http://identifiers.org/multicellds.snapshot/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -28942,6 +28961,8 @@
}
},
"pdb-ccd": {
"comment": "might be same as pdb.ligand, not sure though",
"has_canonical": "pdb.ligand",
"mappings": {
"miriam": "pdb-ccd",
"n2t": "pdb-ccd",
Expand Down Expand Up @@ -29207,7 +29228,8 @@
"prefix": "ped.ensemble",
"provider_url": "https://proteinensemble.org/$1",
"sampleId": "PED00017e001"
}
},
"part_of": "ped"
},
"peff": {
"deprecated": true,
Expand Down Expand Up @@ -30067,6 +30089,7 @@
"prefix": "pigqtldb",
"provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/SS/qdetails?QTL_ID=$1"
},
"part_of": "qtldb",
"prefixcommons": {
"formatter": "http://identifiers.org/pigqtldb/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -32452,6 +32475,7 @@
"name": "RefSeq",
"prefix": "RefSeq"
},
"has_canonical": "ncbiprotein",
"mappings": {
"go": "RefSeq",
"miriam": "refseq",
Expand Down Expand Up @@ -34463,6 +34487,7 @@
"prefix": "sheepqtldb",
"provider_url": "http://www.animalgenome.org/cgi-bin/QTLdb/OA/qdetails?QTL_ID=$1"
},
"part_of": "qtldb",
"prefixcommons": {
"formatter": "http://identifiers.org/sheepqtldb/$1",
"is_identifiers": true,
Expand Down Expand Up @@ -35063,7 +35088,8 @@
"provider_url": "https://ccg.epfl.ch/cgi-bin/snp2tfbs/snpviewer_form_parser.cgi?snpid=$1",
"sampleId": "rs11603840"
},
"name": "SNP to Transcription Factor Binding Sites"
"name": "SNP to Transcription Factor Binding Sites",
"provides": "dbsnp"
},
"so": {
"bioportal": {
Expand Down Expand Up @@ -36129,6 +36155,7 @@
}
},
"tair.gene": {
"example": "2200934",
"mappings": {
"miriam": "tair.gene",
"n2t": "tair.gene",
Expand Down Expand Up @@ -36156,12 +36183,14 @@
"prefix": "tair.gene",
"provider_url": "http://arabidopsis.org/servlets/TairObject?accession=$1"
},
"pattern": "^\\d{7}$",
"prefixcommons": {
"formatter": "http://identifiers.org/tair.gene/$1",
"is_identifiers": true,
"is_obo": false,
"prefix": "TAIR.GENE"
}
},
"url": "http://arabidopsis.org/servlets/TairObject?accession=Gene:$1"
},
"tair.locus": {
"go": {
Expand Down Expand Up @@ -36212,6 +36241,7 @@
}
},
"tair.protein": {
"example": "1009107926",
"mappings": {
"miriam": "tair.protein",
"n2t": "tair.protein",
Expand Down Expand Up @@ -36239,12 +36269,14 @@
"prefix": "tair.protein",
"provider_url": "http://arabidopsis.org/servlets/TairObject?accession=$1"
},
"pattern": "^\\d{10}$",
"prefixcommons": {
"formatter": "http://identifiers.org/tair.protein/$1",
"is_identifiers": true,
"is_obo": false,
"prefix": "TAIR.PROTEIN"
}
},
"url": "http://arabidopsis.org/servlets/TairObject?accession=AASequence:$1"
},
"tao": {
"bioportal": {
Expand Down Expand Up @@ -39277,15 +39309,34 @@
"prefix": "wb",
"provider_url": "https://www.wormbase.org/get?name=$1"
},
"ncbi": {
"example": "R13H7",
"homepage": "http://www.wormbase.org/",
"name": "Caenorhabditis elegans Genome Database",
"prefix": "WormBase"
},
"prefixcommons": {
"formatter": "http://identifiers.org/wb/$1",
"is_identifiers": true,
"is_obo": false,
"prefix": "WB"
},
"synonyms": [
"WB_REF"
]
"WB_REF",
"wormbase"
],
"uniprot": {
"category": "Organism-specific databases",
"formatter": "https://wormbase.org/db/seq/protein?name=%s;class=CDS",
"identifier": "110",
"link_is_explicit": "true",
"name": "WormBase",
"prefix": "WormBase"
},
"wikidata": {
"database": "Q3570042",
"prefix": "P3860"
}
},
"wb.rnai": {
"mappings": {
Expand Down Expand Up @@ -39754,48 +39805,6 @@
"prefix": "WORFDB"
}
},
"wormbase": {
"example": "C05G5/12462-12364",
"go": {
"formatter": "http://www.wormbase.org/get?name=$1",
"homepage": "http://www.wormbase.org/",
"name": "WormBase database of nematode biology",
"prefix": "WB_REF"
},
"homepage": "https://wormbase.org",
"mappings": {
"go": "WB_REF",
"ncbi": "WormBase",
"prefixcommons": "WormBase",
"uniprot": "WormBase",
"uniprot.database": "WormBase"
},
"name": "WormBase",
"ncbi": {
"example": "R13H7",
"homepage": "http://www.wormbase.org/",
"name": "Caenorhabditis elegans Genome Database",
"prefix": "WormBase"
},
"prefixcommons": {
"formatter": "https://www.wormbase.org/get?name=$1",
"is_identifiers": false,
"is_obo": false,
"prefix": "WormBase"
},
"uniprot": {
"category": "Organism-specific databases",
"formatter": "https://wormbase.org/db/seq/protein?name=%s;class=CDS",
"identifier": "110",
"link_is_explicit": "true",
"name": "WormBase",
"prefix": "WormBase"
},
"wikidata": {
"database": "Q3570042",
"prefix": "P3860"
}
},
"wormpep": {
"mappings": {
"miriam": "wormpep",
Expand Down
4 changes: 4 additions & 0 deletions src/bioregistry/export/rdf_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ def _add_resource(data, *, graph: Optional[rdflib.Graph] = None) -> Tuple[rdflib
if provides:
graph.add((node, bioregistry_schema["providesFor"], bioregistry_resource[provides]))

canonical = data.get("has_canonical")
if canonical:
graph.add((node, bioregistry_schema["hasCanonical"], bioregistry_resource[canonical]))

# TODO add contributor if it's available

graph.add(
Expand Down
2 changes: 2 additions & 0 deletions src/bioregistry/export/tsv_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ def export_tsv():
*METAPREFIXES,
"part_of",
"provides",
"has_canonical",
# 'type',
]

Expand Down Expand Up @@ -139,6 +140,7 @@ def get_registry_rows():
# '|'.join(data.get('appears_in', [])),
data.part_of,
data.provides,
data.has_canonical,
# data.get('type'),
# TODO could add more, especially mappings
)
Expand Down
2 changes: 2 additions & 0 deletions src/bioregistry/schema/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
"hasMapping": "A property whose subject is a resource and object is a mapping",
"hasRegistry": "A property whose subject is a mapping and object is a metaresource.",
"hasMetaidentifier": "A property whose subject is a mapping and object is an identifier string.",
"hasCanonical": "A property connecting two prefixes that share an IRI where the subject is "
"the non-preferred prefix and the target is the preferred prefix",
}
bioregistry_collection = rdflib.namespace.Namespace("https://bioregistry.io/collection/")
bioregistry_resource = rdflib.namespace.Namespace("https://bioregistry.io/registry/")
Expand Down
4 changes: 4 additions & 0 deletions src/bioregistry/schema/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@
"title": "Proprietary",
"type": "boolean"
},
"has_canonical": {
"title": "Has Canonical",
"type": "string"
},
"miriam": {
"title": "Miriam",
"type": "object"
Expand Down
2 changes: 2 additions & 0 deletions src/bioregistry/schema/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ class Resource(BaseModel):
contributor: Optional[Author]
#: Set to true if this database is proprietary. If missing, assume it's not.
proprietary: Optional[bool]
#: If this shares an IRI with another entry, maps to the entry that should be considered canonical
has_canonical: Optional[str]

# Registry-specific data
miriam: Optional[Mapping[str, Any]]
Expand Down
6 changes: 6 additions & 0 deletions src/bioregistry/upload_ndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ def upload():
target=resource_nodes[target],
interaction="provides",
)
if entry.has_canonical:
cx.add_edge(
source=resource_nodes[prefix],
target=resource_nodes[entry.has_canonical],
interaction="has_canonical",
)

# Which registries does it map to?
for metaprefix in metaregistry:
Expand Down
Loading