Skip to content

Commit

Permalink
Fix issues with MIRIAM alignment (#707)
Browse files Browse the repository at this point in the history
Two issues came up related to recent changes:

1. `miriam:gramene.growthstage` was renamed to `miriam:gro`. This now
conflicts with `gro`, which is reserved for the Gene Regulation Ontology,
so a mismatch was curated and the metaregistry links in the
`bioregistry:gramene.growthstage` record were updated.
2. `miriam:tair.name` was added; it is equivalent to
`bioregistry:araport`. This is the first time (I think) that
Identifiers.org has added a prefix that duplicates something
novel in the Bioregistry without using the same prefix. I'm not sure
how often this will happen in the future, so I'm not sure whether this
kind of problem needs addressing in the alignment code.
  • Loading branch information
cthoyt authored Jan 12, 2023
1 parent 4d01add commit 5834b3b
Show file tree
Hide file tree
Showing 6 changed files with 22,454 additions and 22,302 deletions.
4 changes: 3 additions & 1 deletion src/bioregistry/align/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,12 @@ def get_curation_row(self, external_id, external_entry) -> Sequence[str]:
rv.append("")
elif isinstance(value, str):
rv.append(value.strip())
elif isinstance(value, bool):
rv.append("true" if value else "false")
elif isinstance(value, (list, tuple, set)):
rv.append("|".join(sorted(v.strip() for v in value)))
else:
raise TypeError
raise TypeError(f"unexpected type in curation header: {value}")
return rv

def _iter_curation_rows(self) -> Iterable[Sequence[str]]:
Expand Down
106 changes: 98 additions & 8 deletions src/bioregistry/data/bioregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -2959,9 +2959,22 @@
"description": "Website with general information about Arabidopsis and functionalities such as a genomic viewer",
"homepage": "https://www.araport.org/",
"mappings": {
"miriam": "tair.name",
"ncbi": "Araport",
"uniprot": "DB-0221"
},
"miriam": {
"deprecated": false,
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. The name of a Locus is unique and used by TAIR, TIGR, and MIPS.",
"homepage": "https://www.arabidopsis.org/index.jsp",
"id": "00000976",
"name": "TAIR gene name",
"namespaceEmbeddedInLui": false,
"pattern": "^AT.G[0-9]{5}$",
"prefix": "tair.name",
"sampleId": "AT5G05330",
"uri_format": "https://www.arabidopsis.org/servlets/TairObject?type=locus&name=$1"
},
"name": "Arabidopsis Information Portal",
"ncbi": {
"example": "AT1G01010",
Expand Down Expand Up @@ -3151,7 +3164,7 @@
"namespaceEmbeddedInLui": true,
"pattern": "^(ark\\:)/*[0-9A-Za-z]+(?:/[\\w/.=*+@\\$-]*)?(?:\\?.*)?$",
"prefix": "ark",
"sampleId": "/12345/fk1234",
"sampleId": "/87924/r4154jq8h",
"uri_format": "http://n2t.net/ark:$1"
},
"n2t": {
Expand Down Expand Up @@ -7892,7 +7905,7 @@
"id": "00000694",
"name": "BioStudies database",
"namespaceEmbeddedInLui": false,
"pattern": "^S-[A-Z]{4}[A-Z\\d\\-]+$",
"pattern": "^S-[A-Z]{4}[\\-\\_A-Z\\d]+$",
"prefix": "biostudies",
"sampleId": "S-EPMC6266652",
"uri_format": "https://www.ebi.ac.uk/biostudies/studies/$1"
Expand Down Expand Up @@ -10575,7 +10588,7 @@
"id": "00000234",
"name": "Cell Cycle Ontology",
"namespaceEmbeddedInLui": true,
"pattern": "^CCO\\:\\w+$",
"pattern": "^CCO:\\w+$",
"prefix": "cco",
"sampleId": "0000003",
"uri_format": "https://www.ebi.ac.uk/ols/ontologies/cco/terms?obo_id=CCO:$1"
Expand Down Expand Up @@ -13915,6 +13928,23 @@
],
"uri_format": "http://classyfire.wishartlab.com/tax_nodes/C$1"
},
"clb": {
"mappings": {
"miriam": "clb"
},
"miriam": {
"deprecated": false,
"description": "ChecklistBank is an index and repository for taxonomic and nomenclatural datasets",
"homepage": "https://www.checklistbank.org",
"id": "00000964",
"name": "ChecklistBank",
"namespaceEmbeddedInLui": false,
"pattern": "^[0-9]+(LR)?$",
"prefix": "clb",
"sampleId": "1010",
"uri_format": "https://www.checklistbank.org/dataset/$1"
}
},
"cldb": {
"biocontext": {
"is_identifiers": true,
Expand Down Expand Up @@ -16841,6 +16871,32 @@
"no_own_terms": true,
"repository": "https://github.com/MIT-LCP/mimic-omop"
},
"col": {
"mappings": {
"miriam": "col"
},
"miriam": {
"deprecated": false,
"description": "Identifier of a taxon or synonym in the Catalogue of Life",
"homepage": "https://www.checklistbank.org",
"id": "00000969",
"name": "Catalogue of Life",
"namespaceEmbeddedInLui": false,
"pattern": "^[23456789BCDFGHJKLMNPQRSTVWXYZ]{1,6}$",
"prefix": "col",
"providers": [
{
"code": "col",
"description": "The Catalogue of Life website providing a view onto the latest release of the COL Checklist.",
"homepage": "https://www.catalogueoflife.org",
"name": "Catalogue of Life (COL)",
"uri_format": "https://www.catalogueoflife.org/data/taxon/$1"
}
],
"sampleId": "4QHKG",
"uri_format": "https://www.checklistbank.org/dataset/3LR/taxon/$1"
}
},
"col.taiwan": {
"contributor": {
"email": "[email protected]",
Expand Down Expand Up @@ -38105,7 +38161,7 @@
"mappings": {
"biocontext": "GRAMENE.GROWTHSTAGE",
"bioportal": "GRO-CPGA",
"miriam": "gramene.growthstage",
"miriam": "gro",
"n2t": "gramene.growthstage",
"obofoundry": "gro",
"prefixcommons": "gramene.po"
Expand All @@ -38117,8 +38173,8 @@
"id": "00000508",
"name": "Gramene Growth Stage Ontology",
"namespaceEmbeddedInLui": true,
"pattern": "^GRO\\:\\d+$",
"prefix": "gramene.growthstage",
"pattern": "^GRO:\\d+$",
"prefix": "gro",
"sampleId": "0007133",
"uri_format": "http://www.gramene.org/db/ontology/search?id=GRO:$1"
},
Expand Down Expand Up @@ -53377,7 +53433,7 @@
"prefixcommons": "aclame"
},
"miriam": {
"deprecated": false,
"deprecated": true,
"description": "ACLAME is a database dedicated to the collection and classification of mobile genetic elements (MGEs) from various sources, comprising all known phage genomes, plasmids and transposons.",
"homepage": "http://aclame.ulb.ac.be/",
"id": "00000063",
Expand Down Expand Up @@ -64624,7 +64680,7 @@
"namespaceEmbeddedInLui": true,
"pattern": "^HOG:[0-9]{7}(\\.[0-9a-z.]+)?(_[0-9]+)?$",
"prefix": "oma.hog",
"sampleId": "HOG:0459895",
"sampleId": "0459895",
"uri_format": "https://omabrowser.org/oma/hog/resolve/HOG:$1/"
},
"pattern": "^[0-9]{7}(\\.[0-9a-z.]+)?(_[0-9]+)?$",
Expand Down Expand Up @@ -81363,6 +81419,23 @@
"name": "Scholia Registry",
"uri_format": "https://bioregistry.io/metaregistry/scholia/$1"
},
"sciflection": {
"mappings": {
"miriam": "sciflection"
},
"miriam": {
"deprecated": false,
"description": "Sciflection is a public repository for experiments and associated spectra, usually uploaded from Electronic Lab Notebooks, shared under FAIR conditions",
"homepage": "https://sciformation.com/sciflection.html",
"id": "00000973",
"name": "Sciflection",
"namespaceEmbeddedInLui": false,
"pattern": "^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
"prefix": "sciflection",
"sampleId": "5ede4273-b26c-4ea4-adb7-3ce294ab3397",
"uri_format": "https://sciflection.com/$1"
}
},
"scomp": {
"contributor": {
"email": "[email protected]",
Expand Down Expand Up @@ -83656,6 +83729,23 @@
"pattern": "^SKIP\\d+$",
"uri_format": "https://skip.stemcellinformatics.org/SKIPSearch/cell_line_detail?accession=$1"
},
"skm": {
"mappings": {
"miriam": "skm"
},
"miriam": {
"deprecated": false,
"description": "Stress Knowledge Map (SKM, available at https://skm.nib.si) is a knowledge graph resulting from the integration of dispersed published information on plant molecular responses to biotic and abiotic stressors. ",
"homepage": "http://www.nib.si/eng/",
"id": "00000968",
"name": "Stress Knowledge Map",
"namespaceEmbeddedInLui": false,
"pattern": "^rx[0-9]{5}$",
"prefix": "skm",
"sampleId": "rx00408",
"uri_format": "https://skm.nib.si/api/pss/reactions?reaction_id=$1&return_field=summary"
}
},
"skos": {
"aberowl": {
"description": "This file is imported by vivo-core-1.5.owl. It contains terms relating to concepts from the http://www.w3.org/2004/02/skos/core# namespace of the SKOS (Simple Knowledge Organization System) ontology that are included in the vivo ontology.",
Expand Down
46 changes: 29 additions & 17 deletions src/bioregistry/data/external/miriam/processed.json
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@
"namespaceEmbeddedInLui": true,
"pattern": "^(ark\\:)/*[0-9A-Za-z]+(?:/[\\w/.=*+@\\$-]*)?(?:\\?.*)?$",
"prefix": "ark",
"sampleId": "/12345/fk1234",
"sampleId": "/87924/r4154jq8h",
"uri_format": "http://n2t.net/ark:$1"
},
"arrayexpress": {
Expand Down Expand Up @@ -960,7 +960,7 @@
"id": "00000694",
"name": "BioStudies database",
"namespaceEmbeddedInLui": false,
"pattern": "^S-[A-Z]{4}[A-Z\\d\\-]+$",
"pattern": "^S-[A-Z]{4}[\\-\\_A-Z\\d]+$",
"prefix": "biostudies",
"sampleId": "S-EPMC6266652",
"uri_format": "https://www.ebi.ac.uk/biostudies/studies/$1"
Expand Down Expand Up @@ -1302,7 +1302,7 @@
"id": "00000234",
"name": "Cell Cycle Ontology",
"namespaceEmbeddedInLui": true,
"pattern": "^CCO\\:\\w+$",
"pattern": "^CCO:\\w+$",
"prefix": "cco",
"sampleId": "0000003",
"uri_format": "https://www.ebi.ac.uk/ols/ontologies/cco/terms?obo_id=CCO:$1"
Expand Down Expand Up @@ -3804,18 +3804,6 @@
"sampleId": "GR:0080039",
"uri_format": "http://www.gramene.org/db/genes/search_gene?acc=$1"
},
"gramene.growthstage": {
"deprecated": false,
"description": "Gramene is a comparative genome mapping database for grasses and crop plants. It combines a semi-automatically generated database of cereal genomic and expressed sequence tag sequences, genetic maps, map relations, quantitative trait loci (QTL), and publications, with a curated database of mutants (genes and alleles), molecular markers, and proteins. This collection refers to growth stage ontology information in Gramene.",
"homepage": "http://www.gramene.org/",
"id": "00000508",
"name": "Gramene Growth Stage Ontology",
"namespaceEmbeddedInLui": true,
"pattern": "^GRO\\:\\d+$",
"prefix": "gramene.growthstage",
"sampleId": "0007133",
"uri_format": "http://www.gramene.org/db/ontology/search?id=GRO:$1"
},
"gramene.protein": {
"deprecated": false,
"description": "Gramene is a comparative genome mapping database for grasses and crop plants. It combines a semi-automatically generated database of cereal genomic and expressed sequence tag sequences, genetic maps, map relations, quantitative trait loci (QTL), and publications, with a curated database of mutants (genes and alleles), molecular markers, and proteins. This datatype refers to proteins in Gramene.",
Expand Down Expand Up @@ -3888,6 +3876,18 @@
"sampleId": "19333",
"uri_format": "http://www.ars-grin.gov/cgi-bin/npgs/html/taxon.pl?$1"
},
"gro": {
"deprecated": false,
"description": "Gramene is a comparative genome mapping database for grasses and crop plants. It combines a semi-automatically generated database of cereal genomic and expressed sequence tag sequences, genetic maps, map relations, quantitative trait loci (QTL), and publications, with a curated database of mutants (genes and alleles), molecular markers, and proteins. This collection refers to growth stage ontology information in Gramene.",
"homepage": "http://www.gramene.org/",
"id": "00000508",
"name": "Gramene Growth Stage Ontology",
"namespaceEmbeddedInLui": true,
"pattern": "^GRO:\\d+$",
"prefix": "gro",
"sampleId": "0007133",
"uri_format": "http://www.gramene.org/db/ontology/search?id=GRO:$1"
},
"grsdb": {
"deprecated": false,
"description": "GRSDB is a database of G-quadruplexes and contains information on composition and distribution of putative Quadruplex-forming G-Rich Sequences (QGRS) mapped in the eukaryotic pre-mRNA sequences, including those that are alternatively processed (alternatively spliced or alternatively polyadenylated). The data stored in the GRSDB is based on computational analysis of NCBI Entrez Gene entries and their corresponding annotated genomic nucleotide sequences of RefSeq/GenBank.",
Expand Down Expand Up @@ -5762,7 +5762,7 @@
"uri_format": "https://www.metabolome-express.org/datasetview.php?datasetid=$1"
},
"mge": {
"deprecated": false,
"deprecated": true,
"description": "ACLAME is a database dedicated to the collection and classification of mobile genetic elements (MGEs) from various sources, comprising all known phage genomes, plasmids and transposons.",
"homepage": "http://aclame.ulb.ac.be/",
"id": "00000063",
Expand Down Expand Up @@ -6953,7 +6953,7 @@
"namespaceEmbeddedInLui": true,
"pattern": "^HOG:[0-9]{7}(\\.[0-9a-z.]+)?(_[0-9]+)?$",
"prefix": "oma.hog",
"sampleId": "HOG:0459895",
"sampleId": "0459895",
"uri_format": "https://omabrowser.org/oma/hog/resolve/HOG:$1/"
},
"oma.protein": {
Expand Down Expand Up @@ -9243,6 +9243,18 @@
"sampleId": "2200950",
"uri_format": "http://www.arabidopsis.org/servlets/TairObject?accession=Locus:$1"
},
"tair.name": {
"deprecated": false,
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. The name of a Locus is unique and used by TAIR, TIGR, and MIPS.",
"homepage": "https://www.arabidopsis.org/index.jsp",
"id": "00000976",
"name": "TAIR gene name",
"namespaceEmbeddedInLui": false,
"pattern": "^AT.G[0-9]{5}$",
"prefix": "tair.name",
"sampleId": "AT5G05330",
"uri_format": "https://www.arabidopsis.org/servlets/TairObject?type=locus&name=$1"
},
"tair.protein": {
"deprecated": false,
"description": "The Arabidopsis Information Resource (TAIR) maintains a database of genetic and molecular biology data for the model higher plant Arabidopsis thaliana. This provides protein information for a given gene model and provides links to other sources such as UniProtKB and GenPept",
Expand Down
Loading

0 comments on commit 5834b3b

Please sign in to comment.