Simplify parsing SSSOM w/ curies.chain (#431)

Closes #363 (final nail in the coffin) This provides an alternative to #429 that makes more explicit the chaining operations done on the metadata and prefix maps. This is also a good change to carefully document the way that this is handled, since I might not have captured it accurately. As it is, The priority order for combining prefix maps are: 1. Internal prefix map inside the document 2. Prefix map passed through this function inside the ``meta`` 3. Prefix map passed through this function to ``prefix_map`` 4. Default prefix map (handled with ensure_converter)
mapping-commons · Oct 2, 2023 · 41bcdca · 41bcdca
1 parent 69237d5
commit 41bcdca
Show file tree

Hide file tree

Showing 6 changed files with 158 additions and 48 deletions.
diff --git a/src/sssom/context.py b/src/sssom/context.py
@@ -52,10 +52,32 @@ def _get_built_in_prefix_map() -> Converter:
 HINT = Union[None, PrefixMap, Converter]
 
 
-def ensure_converter(prefix_map: HINT = None) -> Converter:
-    """Ensure a converter is available."""
+def ensure_converter(prefix_map: HINT = None, *, use_defaults: bool = True) -> Converter:
+    """Ensure a converter is available.
+
+    :param prefix_map: One of the following:
+
+        1. An empty dictionary or ``None``. This results in using the default
+           extended prefix map (currently based on a variant of the Bioregistry)
+           if ``use_defaults`` is set to true, otherwise just the builtin prefix
+           map including the prefixes in :data:`SSSOM_BUILT_IN_PREFIXES`
+        2. A non-empty dictionary representing a prefix map. This is loaded as a
+           converter with :meth:`Converter.from_prefix_map`. It is chained
+           behind the builtin prefix map to ensure none of the
+           :data:`SSSOM_BUILT_IN_PREFIXES` are overwritten with non-default values
+        3. A pre-instantiated :class:`curies.Converter`. Similarly to a prefix
+           map passed into this function, this is chained behind the builtin prefix
+           map
+    :param use_defaults: If an empty dictionary or None is passed to this function,
+        this parameter chooses if the extended prefix map (currently based on a
+        variant of the Bioregistry) gets loaded.
+    :returns: A re-usable converter
+    """
     if not prefix_map:
-        return get_converter()
+        if use_defaults:
+            return get_converter()
+        else:
+            return _get_built_in_prefix_map()
     if not isinstance(prefix_map, Converter):
         prefix_map = Converter.from_prefix_map(prefix_map)
     return curies.chain([_get_built_in_prefix_map(), prefix_map])
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
@@ -6,12 +6,13 @@
 import logging as _logging
 import re
 import typing
-from collections import Counter
+from collections import ChainMap, Counter
 from pathlib import Path
 from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
 from xml.dom import Node, minidom
 from xml.dom.minidom import Document
 
+import curies
 import numpy as np
 import pandas as pd
 import requests
@@ -53,9 +54,9 @@
     SSSOMSchemaView,
 )
 
-from .context import HINT, ensure_converter
+from .context import HINT, _get_built_in_prefix_map, ensure_converter
 from .sssom_document import MappingSetDocument
-from .typehints import Metadata, MetadataType, generate_mapping_set_id
+from .typehints import Metadata, MetadataType, generate_mapping_set_id, get_default_metadata
 from .util import (
     PREFIX_MAP_KEY,
     SSSOM_DEFAULT_RDF_SERIALISATION,
@@ -193,44 +194,36 @@ def parse_sssom_table(
     stream: io.StringIO = _open_input(file_path)
     sep_new = _get_seperator_symbol_from_file_path(file_path)
     df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
-    # if mapping_predicates:
-    #     # Filter rows based on presence of predicate_id list provided.
-    #     df = df[df["predicate_id"].isin(mapping_predicates)]
-
-    # If SSSOM external metadata is provided, merge it with the internal metadata
-
-    if sssom_metadata:
-        if meta:
-            for k, v in meta.items():
-                if k in sssom_metadata:
-                    if sssom_metadata[k] != v:
-                        logging.warning(
-                            f"SSSOM internal metadata {k} ({sssom_metadata[k]}) "
-                            f"conflicts with provided ({meta[k]})."
-                        )
-                else:
-                    logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
-                    sssom_metadata[k] = v
-        meta = sssom_metadata
-
-        if PREFIX_MAP_KEY in sssom_metadata:
-            if prefix_map:
-                for k, v in prefix_map.items():
-                    if k in sssom_metadata[CURIE_MAP]:
-                        if sssom_metadata[CURIE_MAP][k] != v:
-                            logging.warning(
-                                f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
-                                f"conflicts with provided ({prefix_map[k]})."
-                            )
-                    else:
-                        logging.info(
-                            f"Externally provided metadata {k}:{v} is added to metadata set."
-                        )
-                        sssom_metadata[CURIE_MAP][k] = v
-            prefix_map = sssom_metadata[CURIE_MAP]
+    if meta is None:
+        meta = {}
+
+    # The priority order for combining prefix maps are:
+    #  1. Built-in prefix map
+    #  2. Internal prefix map inside the document
+    #  3. Prefix map passed through this function inside the ``meta``
+    #  4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
+    converter = curies.chain(
+        [
+            _get_built_in_prefix_map(),
+            Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
+            Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
+            ensure_converter(prefix_map, use_defaults=False),
+        ]
+    )
+
+    # The priority order for combining metadata is:
+    #  1. Metadata appearing in the SSSOM document
+    #  2. Metadata passed through ``meta`` to this function
+    #  3. Default metadata
+    combine_meta = dict(
+        ChainMap(
+            sssom_metadata,
+            meta,
+            get_default_metadata(),
+        )
+    )
 
-    meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
-    msdf = from_sssom_dataframe(df, prefix_map=meta_all.prefix_map, meta=meta_all.metadata)
+    msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
     return msdf
 
 

diff --git a/src/sssom/writers.py b/src/sssom/writers.py
@@ -441,7 +441,7 @@ def to_fhir_json(msdf: MappingSetDataFrame) -> Dict:
 def to_json(msdf: MappingSetDataFrame) -> JsonObj:
     """Convert a mapping set dataframe to a JSON object."""
     doc = to_mapping_set_document(msdf)
-    data = JSONDumper().dumps(doc.mapping_set, contexts=doc.prefix_map)
+    data = JSONDumper().dumps(doc.mapping_set, contexts={"@context": doc.prefix_map})
     json_obj = json.loads(data)
     return json_obj
 

diff --git a/tests/test_conversion.py b/tests/test_conversion.py
@@ -146,9 +146,10 @@ def _test_to_dataframe(self, mdoc, test):
             test.ct_data_frame_rows,
             f"The pandas data frame has less elements than the orginal one for {test.filename}",
         )
-        df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
-        # data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
-        data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
+        path = test.get_out_file("roundtrip.tsv")
+        with open(path, "w") as file:
+            write_table(msdf, file=file)
+        data = parse_sssom_table(path).df
         self.assertEqual(
             len(data),
             test.ct_data_frame_rows,

diff --git a/tests/test_parsers.py b/tests/test_parsers.py
@@ -4,16 +4,22 @@
 import json
 import math
 import os
+import tempfile
 import unittest
+from pathlib import Path
 from xml.dom import minidom
 
 import numpy as np
 import pandas as pd
 import yaml
 from rdflib import Graph
 
+from sssom.constants import CURIE_MAP, DEFAULT_LICENSE, SSSOM_URI_PREFIX
+from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
 from sssom.io import parse_file
 from sssom.parsers import (
+    _open_input,
+    _read_pandas_and_metadata,
     from_alignment_minidom,
     from_obographs,
     from_sssom_dataframe,
@@ -22,7 +28,7 @@
     parse_sssom_table,
 )
 from sssom.typehints import Metadata
-from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns
+from sssom.util import PREFIX_MAP_KEY, MappingSetDataFrame, sort_df_rows_columns
 from sssom.writers import write_table
 from tests.test_data import data_dir as test_data_dir
 from tests.test_data import test_out_dir
@@ -245,3 +251,75 @@ def test_parse_obographs_merged(self):
             )
         msdf = parse_sssom_table(outfile)
         self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())
+
+
+class TestParseExplicit(unittest.TestCase):
+    """This test case contains explicit tests for parsing."""
+
+    def test_round_trip(self):
+        """Explicitly test round tripping."""
+        rows = [
+            (
+                "DOID:0050601",
+                "ADULT syndrome",
+                "skos:exactMatch",
+                "UMLS:C1863204",
+                "ADULT SYNDROME",
+                "semapv:ManualMappingCuration",
+                "orcid:0000-0003-4423-4370",
+            )
+        ]
+        columns = [
+            "subject_id",
+            "subject_label",
+            "predicate_id",
+            "object_id",
+            "object_label",
+            "mapping_justification",
+            "creator_id",
+        ]
+        df = pd.DataFrame(rows, columns=columns)
+        msdf = MappingSetDataFrame(df=df, converter=ensure_converter())
+        msdf.clean_prefix_map(strict=True)
+        #: This is a set of the prefixes that explicitly are used in this
+        #: example. SSSOM-py also adds the remaining builtin prefixes from
+        #: :data:`sssom.context.SSSOM_BUILT_IN_PREFIXES`, which is reflected
+        #: in the formulation of the test expectation below
+        explicit_prefixes = {"DOID", "semapv", "orcid", "skos", "UMLS"}
+        self.assertEqual(
+            explicit_prefixes.union(SSSOM_BUILT_IN_PREFIXES),
+            set(msdf.prefix_map),
+        )
+
+        with tempfile.TemporaryDirectory() as directory:
+            directory = Path(directory)
+            path = directory.joinpath("test.sssom.tsv")
+            with path.open("w") as file:
+                write_table(msdf, file)
+
+            _, read_metadata = _read_pandas_and_metadata(_open_input(path))
+            reconsitited_msdf = parse_sssom_table(path)
+
+        # This tests what's actually in the file after it's written out
+        self.assertEqual({CURIE_MAP, "license", "mapping_set_id"}, set(read_metadata))
+        self.assertEqual(DEFAULT_LICENSE, read_metadata["license"])
+        self.assertTrue(read_metadata["mapping_set_id"].startswith(f"{SSSOM_URI_PREFIX}mappings/"))
+
+        expected_prefix_map = {
+            "DOID": "http://purl.obolibrary.org/obo/DOID_",
+            "UMLS": "http://linkedlifedata.com/resource/umls/id/",
+            "orcid": "https://orcid.org/",
+            "owl": "http://www.w3.org/2002/07/owl#",
+            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+            "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
+            "semapv": "https://w3id.org/semapv/vocab/",
+            "skos": "http://www.w3.org/2004/02/skos/core#",
+            "sssom": "https://w3id.org/sssom/",
+        }
+        self.assertEqual(
+            expected_prefix_map,
+            read_metadata[CURIE_MAP],
+        )
+
+        # This checks that nothing funny gets added unexpectedly
+        self.assertEqual(expected_prefix_map, reconsitited_msdf.prefix_map)
diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -29,6 +29,22 @@ def test_validate_json(self):
         """
         self.assertIsNone(validate(self.correct_msdf1, self.validation_types))
 
+    @unittest.skip(
+        reason="""\
+
+    This test did not previously do what was expected. It was raising a validation error
+    not because of the text below suggesting the validator was able to identify an issue
+    with the `mapping_justification` slot, but because `orcid` was missing from the prefix map.
+    The error actually thrown was::
+
+      jsonschema.exceptions.ValidationError: The prefixes in {'orcid'} are missing from 'curie_map'.
+
+    With updates in https://github.com/mapping-commons/sssom-py/pull/431, the default prefix map
+    which includes `orcid` is added on parse, and this error goes away. Therefore, this test
+    now fails, but again, this is a sporadic failure since the test was not correct in the first
+    place. Therefore, this test is now skipped and marked for FIXME.
+    """
+    )
     def test_validate_json_fail(self):
         """
         Test if JSONSchemaValidation fail is as expected.