Skip to content

Commit

Permalink
Simplify parsing SSSOM w/ curies.chain (#431)
Browse files Browse the repository at this point in the history
Closes #363 (final nail in the coffin)

This provides an alternative to
#429 that makes more
explicit the chaining operations done on the metadata and prefix maps.
This is also a good change to carefully document the way that this is
handled, since I might not have captured it accurately. As it is, The
priority order for combining prefix maps are:

1. Internal prefix map inside the document
2. Prefix map passed through this function inside the ``meta``
3. Prefix map passed through this function to ``prefix_map``
4. Default prefix map (handled with ensure_converter)
  • Loading branch information
cthoyt authored Oct 2, 2023
1 parent 69237d5 commit 41bcdca
Show file tree
Hide file tree
Showing 6 changed files with 158 additions and 48 deletions.
28 changes: 25 additions & 3 deletions src/sssom/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,32 @@ def _get_built_in_prefix_map() -> Converter:
HINT = Union[None, PrefixMap, Converter]


def ensure_converter(prefix_map: HINT = None) -> Converter:
"""Ensure a converter is available."""
def ensure_converter(prefix_map: HINT = None, *, use_defaults: bool = True) -> Converter:
"""Ensure a converter is available.
:param prefix_map: One of the following:
1. An empty dictionary or ``None``. This results in using the default
extended prefix map (currently based on a variant of the Bioregistry)
if ``use_defaults`` is set to true, otherwise just the builtin prefix
map including the prefixes in :data:`SSSOM_BUILT_IN_PREFIXES`
2. A non-empty dictionary representing a prefix map. This is loaded as a
converter with :meth:`Converter.from_prefix_map`. It is chained
behind the builtin prefix map to ensure none of the
:data:`SSSOM_BUILT_IN_PREFIXES` are overwritten with non-default values
3. A pre-instantiated :class:`curies.Converter`. Similarly to a prefix
map passed into this function, this is chained behind the builtin prefix
map
:param use_defaults: If an empty dictionary or None is passed to this function,
this parameter chooses if the extended prefix map (currently based on a
variant of the Bioregistry) gets loaded.
:returns: A re-usable converter
"""
if not prefix_map:
return get_converter()
if use_defaults:
return get_converter()
else:
return _get_built_in_prefix_map()
if not isinstance(prefix_map, Converter):
prefix_map = Converter.from_prefix_map(prefix_map)
return curies.chain([_get_built_in_prefix_map(), prefix_map])
73 changes: 33 additions & 40 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import logging as _logging
import re
import typing
from collections import Counter
from collections import ChainMap, Counter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import curies
import numpy as np
import pandas as pd
import requests
Expand Down Expand Up @@ -53,9 +54,9 @@
SSSOMSchemaView,
)

from .context import HINT, ensure_converter
from .context import HINT, _get_built_in_prefix_map, ensure_converter
from .sssom_document import MappingSetDocument
from .typehints import Metadata, MetadataType, generate_mapping_set_id
from .typehints import Metadata, MetadataType, generate_mapping_set_id, get_default_metadata
from .util import (
PREFIX_MAP_KEY,
SSSOM_DEFAULT_RDF_SERIALISATION,
Expand Down Expand Up @@ -193,44 +194,36 @@ def parse_sssom_table(
stream: io.StringIO = _open_input(file_path)
sep_new = _get_seperator_symbol_from_file_path(file_path)
df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
# if mapping_predicates:
# # Filter rows based on presence of predicate_id list provided.
# df = df[df["predicate_id"].isin(mapping_predicates)]

# If SSSOM external metadata is provided, merge it with the internal metadata

if sssom_metadata:
if meta:
for k, v in meta.items():
if k in sssom_metadata:
if sssom_metadata[k] != v:
logging.warning(
f"SSSOM internal metadata {k} ({sssom_metadata[k]}) "
f"conflicts with provided ({meta[k]})."
)
else:
logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
sssom_metadata[k] = v
meta = sssom_metadata

if PREFIX_MAP_KEY in sssom_metadata:
if prefix_map:
for k, v in prefix_map.items():
if k in sssom_metadata[CURIE_MAP]:
if sssom_metadata[CURIE_MAP][k] != v:
logging.warning(
f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
f"conflicts with provided ({prefix_map[k]})."
)
else:
logging.info(
f"Externally provided metadata {k}:{v} is added to metadata set."
)
sssom_metadata[CURIE_MAP][k] = v
prefix_map = sssom_metadata[CURIE_MAP]
if meta is None:
meta = {}

# The priority order for combining prefix maps are:
# 1. Built-in prefix map
# 2. Internal prefix map inside the document
# 3. Prefix map passed through this function inside the ``meta``
# 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
converter = curies.chain(
[
_get_built_in_prefix_map(),
Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
ensure_converter(prefix_map, use_defaults=False),
]
)

# The priority order for combining metadata is:
# 1. Metadata appearing in the SSSOM document
# 2. Metadata passed through ``meta`` to this function
# 3. Default metadata
combine_meta = dict(
ChainMap(
sssom_metadata,
meta,
get_default_metadata(),
)
)

meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
msdf = from_sssom_dataframe(df, prefix_map=meta_all.prefix_map, meta=meta_all.metadata)
msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
return msdf


Expand Down
2 changes: 1 addition & 1 deletion src/sssom/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def to_fhir_json(msdf: MappingSetDataFrame) -> Dict:
def to_json(msdf: MappingSetDataFrame) -> JsonObj:
"""Convert a mapping set dataframe to a JSON object."""
doc = to_mapping_set_document(msdf)
data = JSONDumper().dumps(doc.mapping_set, contexts=doc.prefix_map)
data = JSONDumper().dumps(doc.mapping_set, contexts={"@context": doc.prefix_map})
json_obj = json.loads(data)
return json_obj

Expand Down
7 changes: 4 additions & 3 deletions tests/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,10 @@ def _test_to_dataframe(self, mdoc, test):
test.ct_data_frame_rows,
f"The pandas data frame has less elements than the orginal one for {test.filename}",
)
df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
# data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
path = test.get_out_file("roundtrip.tsv")
with open(path, "w") as file:
write_table(msdf, file=file)
data = parse_sssom_table(path).df
self.assertEqual(
len(data),
test.ct_data_frame_rows,
Expand Down
80 changes: 79 additions & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,22 @@
import json
import math
import os
import tempfile
import unittest
from pathlib import Path
from xml.dom import minidom

import numpy as np
import pandas as pd
import yaml
from rdflib import Graph

from sssom.constants import CURIE_MAP, DEFAULT_LICENSE, SSSOM_URI_PREFIX
from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
from sssom.io import parse_file
from sssom.parsers import (
_open_input,
_read_pandas_and_metadata,
from_alignment_minidom,
from_obographs,
from_sssom_dataframe,
Expand All @@ -22,7 +28,7 @@
parse_sssom_table,
)
from sssom.typehints import Metadata
from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns
from sssom.util import PREFIX_MAP_KEY, MappingSetDataFrame, sort_df_rows_columns
from sssom.writers import write_table
from tests.test_data import data_dir as test_data_dir
from tests.test_data import test_out_dir
Expand Down Expand Up @@ -245,3 +251,75 @@ def test_parse_obographs_merged(self):
)
msdf = parse_sssom_table(outfile)
self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())


class TestParseExplicit(unittest.TestCase):
"""This test case contains explicit tests for parsing."""

def test_round_trip(self):
"""Explicitly test round tripping."""
rows = [
(
"DOID:0050601",
"ADULT syndrome",
"skos:exactMatch",
"UMLS:C1863204",
"ADULT SYNDROME",
"semapv:ManualMappingCuration",
"orcid:0000-0003-4423-4370",
)
]
columns = [
"subject_id",
"subject_label",
"predicate_id",
"object_id",
"object_label",
"mapping_justification",
"creator_id",
]
df = pd.DataFrame(rows, columns=columns)
msdf = MappingSetDataFrame(df=df, converter=ensure_converter())
msdf.clean_prefix_map(strict=True)
#: This is a set of the prefixes that explicitly are used in this
#: example. SSSOM-py also adds the remaining builtin prefixes from
#: :data:`sssom.context.SSSOM_BUILT_IN_PREFIXES`, which is reflected
#: in the formulation of the test expectation below
explicit_prefixes = {"DOID", "semapv", "orcid", "skos", "UMLS"}
self.assertEqual(
explicit_prefixes.union(SSSOM_BUILT_IN_PREFIXES),
set(msdf.prefix_map),
)

with tempfile.TemporaryDirectory() as directory:
directory = Path(directory)
path = directory.joinpath("test.sssom.tsv")
with path.open("w") as file:
write_table(msdf, file)

_, read_metadata = _read_pandas_and_metadata(_open_input(path))
reconsitited_msdf = parse_sssom_table(path)

# This tests what's actually in the file after it's written out
self.assertEqual({CURIE_MAP, "license", "mapping_set_id"}, set(read_metadata))
self.assertEqual(DEFAULT_LICENSE, read_metadata["license"])
self.assertTrue(read_metadata["mapping_set_id"].startswith(f"{SSSOM_URI_PREFIX}mappings/"))

expected_prefix_map = {
"DOID": "http://purl.obolibrary.org/obo/DOID_",
"UMLS": "http://linkedlifedata.com/resource/umls/id/",
"orcid": "https://orcid.org/",
"owl": "http://www.w3.org/2002/07/owl#",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"semapv": "https://w3id.org/semapv/vocab/",
"skos": "http://www.w3.org/2004/02/skos/core#",
"sssom": "https://w3id.org/sssom/",
}
self.assertEqual(
expected_prefix_map,
read_metadata[CURIE_MAP],
)

# This checks that nothing funny gets added unexpectedly
self.assertEqual(expected_prefix_map, reconsitited_msdf.prefix_map)
16 changes: 16 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,22 @@ def test_validate_json(self):
"""
self.assertIsNone(validate(self.correct_msdf1, self.validation_types))

@unittest.skip(
reason="""\
This test did not previously do what was expected. It was raising a validation error
not because of the text below suggesting the validator was able to identify an issue
with the `mapping_justification` slot, but because `orcid` was missing from the prefix map.
The error actually thrown was::
jsonschema.exceptions.ValidationError: The prefixes in {'orcid'} are missing from 'curie_map'.
With updates in https://github.com/mapping-commons/sssom-py/pull/431, the default prefix map
which includes `orcid` is added on parse, and this error goes away. Therefore, this test
now fails, but again, this is a sporadic failure since the test was not correct in the first
place. Therefore, this test is now skipped and marked for FIXME.
"""
)
def test_validate_json_fail(self):
"""
Test if JSONSchemaValidation fail is as expected.
Expand Down

0 comments on commit 41bcdca

Please sign in to comment.