Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify parsing SSSOM w/ curies.chain #431

Merged
merged 23 commits into from
Oct 2, 2023
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/sssom/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,13 @@ def _get_built_in_prefix_map() -> Converter:
HINT = Union[None, PrefixMap, Converter]


def ensure_converter(prefix_map: HINT = None) -> Converter:
def ensure_converter(prefix_map: HINT = None, *, use_bioregistry: bool = True) -> Converter:
matentzn marked this conversation as resolved.
Show resolved Hide resolved
"""Ensure a converter is available."""
if not prefix_map:
return get_converter()
if use_bioregistry:
matentzn marked this conversation as resolved.
Show resolved Hide resolved
return get_converter()
matentzn marked this conversation as resolved.
Show resolved Hide resolved
else:
return _get_built_in_prefix_map()
if not isinstance(prefix_map, Converter):
prefix_map = Converter.from_prefix_map(prefix_map)
return curies.chain([_get_built_in_prefix_map(), prefix_map])
73 changes: 33 additions & 40 deletions src/sssom/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import logging as _logging
import re
import typing
from collections import Counter
from collections import ChainMap, Counter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import curies
import numpy as np
import pandas as pd
import requests
Expand Down Expand Up @@ -53,9 +54,9 @@
SSSOMSchemaView,
)

from .context import HINT, ensure_converter
from .context import HINT, _get_built_in_prefix_map, ensure_converter
from .sssom_document import MappingSetDocument
from .typehints import Metadata, MetadataType, generate_mapping_set_id
from .typehints import Metadata, MetadataType, generate_mapping_set_id, get_default_metadata
from .util import (
PREFIX_MAP_KEY,
SSSOM_DEFAULT_RDF_SERIALISATION,
Expand Down Expand Up @@ -193,44 +194,36 @@ def parse_sssom_table(
stream: io.StringIO = _open_input(file_path)
sep_new = _get_seperator_symbol_from_file_path(file_path)
df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
# if mapping_predicates:
# # Filter rows based on presence of predicate_id list provided.
# df = df[df["predicate_id"].isin(mapping_predicates)]

# If SSSOM external metadata is provided, merge it with the internal metadata

if sssom_metadata:
if meta:
for k, v in meta.items():
if k in sssom_metadata:
if sssom_metadata[k] != v:
logging.warning(
f"SSSOM internal metadata {k} ({sssom_metadata[k]}) "
f"conflicts with provided ({meta[k]})."
matentzn marked this conversation as resolved.
Show resolved Hide resolved
)
else:
logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
sssom_metadata[k] = v
meta = sssom_metadata

if PREFIX_MAP_KEY in sssom_metadata:
if prefix_map:
for k, v in prefix_map.items():
if k in sssom_metadata[CURIE_MAP]:
if sssom_metadata[CURIE_MAP][k] != v:
logging.warning(
f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
f"conflicts with provided ({prefix_map[k]})."
matentzn marked this conversation as resolved.
Show resolved Hide resolved
)
else:
logging.info(
f"Externally provided metadata {k}:{v} is added to metadata set."
)
sssom_metadata[CURIE_MAP][k] = v
prefix_map = sssom_metadata[CURIE_MAP]
if meta is None:
meta = {}

# The priority order for combining prefix maps are:
# 1. Built-in prefix map
# 2. Internal prefix map inside the document
# 3. Prefix map passed through this function inside the ``meta``
# 4. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
converter = curies.chain(
[
_get_built_in_prefix_map(),
Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
matentzn marked this conversation as resolved.
Show resolved Hide resolved
Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
ensure_converter(prefix_map, use_bioregistry=False),
matentzn marked this conversation as resolved.
Show resolved Hide resolved
]
)

# The priority order for combining metadata is:
# 1. Metadata appearing in the SSSOM document
# 2. Metadata passed through ``meta`` to this function
# 3. Default metadata
matentzn marked this conversation as resolved.
Show resolved Hide resolved
combine_meta = dict(
ChainMap(
sssom_metadata,
meta,
get_default_metadata(),
)
)
matentzn marked this conversation as resolved.
Show resolved Hide resolved

meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
msdf = from_sssom_dataframe(df, prefix_map=meta_all.prefix_map, meta=meta_all.metadata)
msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
return msdf


Expand Down
2 changes: 1 addition & 1 deletion src/sssom/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ def to_fhir_json(msdf: MappingSetDataFrame) -> Dict:
def to_json(msdf: MappingSetDataFrame) -> JsonObj:
"""Convert a mapping set dataframe to a JSON object."""
doc = to_mapping_set_document(msdf)
data = JSONDumper().dumps(doc.mapping_set, contexts=doc.prefix_map)
data = JSONDumper().dumps(doc.mapping_set, contexts={"@context": doc.prefix_map})
matentzn marked this conversation as resolved.
Show resolved Hide resolved
json_obj = json.loads(data)
return json_obj

Expand Down
7 changes: 4 additions & 3 deletions tests/test_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,10 @@ def _test_to_dataframe(self, mdoc, test):
test.ct_data_frame_rows,
f"The pandas data frame has less elements than the orginal one for {test.filename}",
)
df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
# data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
path = test.get_out_file("roundtrip.tsv")
matentzn marked this conversation as resolved.
Show resolved Hide resolved
with open(path, "w") as file:
write_table(msdf, file=file)
data = parse_sssom_table(path).df
self.assertEqual(
len(data),
test.ct_data_frame_rows,
Expand Down
75 changes: 74 additions & 1 deletion tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,22 @@
import json
import math
import os
import tempfile
import unittest
from pathlib import Path
from xml.dom import minidom

import numpy as np
import pandas as pd
import yaml
from rdflib import Graph

from sssom.constants import CURIE_MAP, DEFAULT_LICENSE, SSSOM_URI_PREFIX
from sssom.context import SSSOM_BUILT_IN_PREFIXES, ensure_converter
from sssom.io import parse_file
from sssom.parsers import (
_open_input,
_read_pandas_and_metadata,
from_alignment_minidom,
from_obographs,
from_sssom_dataframe,
Expand All @@ -22,7 +28,7 @@
parse_sssom_table,
)
from sssom.typehints import Metadata
from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns
from sssom.util import PREFIX_MAP_KEY, MappingSetDataFrame, sort_df_rows_columns
from sssom.writers import write_table
from tests.test_data import data_dir as test_data_dir
from tests.test_data import test_out_dir
Expand Down Expand Up @@ -245,3 +251,70 @@ def test_parse_obographs_merged(self):
)
msdf = parse_sssom_table(outfile)
self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items())


class TestParseExplicit(unittest.TestCase):
"""This test case contains explicit tests for parsing."""

def test_round_trip(self):
"""Explicitly test round tripping."""
rows = [
(
"DOID:0050601",
"ADULT syndrome",
"skos:exactMatch",
"UMLS:C1863204",
"ADULT SYNDROME",
"semapv:ManualMappingCuration",
"orcid:0000-0003-4423-4370",
)
]
columns = [
"subject_id",
"subject_label",
"predicate_id",
"object_id",
"object_label",
"mapping_justification",
"creator_id",
]
df = pd.DataFrame(rows, columns=columns)
msdf = MappingSetDataFrame(df=df, converter=ensure_converter())
msdf.clean_prefix_map(strict=True)
self.assertEqual(
{"DOID", "semapv", "orcid", "skos", "UMLS"}.union(SSSOM_BUILT_IN_PREFIXES),
matentzn marked this conversation as resolved.
Show resolved Hide resolved
set(msdf.prefix_map),
)

with tempfile.TemporaryDirectory() as directory:
directory = Path(directory)
path = directory.joinpath("test.sssom.tsv")
with path.open("w") as file:
write_table(msdf, file)

_, read_metadata = _read_pandas_and_metadata(_open_input(path))
reconsitited_msdf = parse_sssom_table(path)

# This tests what's actually in the file after it's written out
self.assertEqual({CURIE_MAP, "license", "mapping_set_id"}, set(read_metadata))
self.assertEqual(DEFAULT_LICENSE, read_metadata["license"])
self.assertTrue(read_metadata["mapping_set_id"].startswith(f"{SSSOM_URI_PREFIX}mappings/"))

expected_prefix_map = {
"DOID": "http://purl.obolibrary.org/obo/DOID_",
"UMLS": "http://linkedlifedata.com/resource/umls/id/",
"orcid": "https://orcid.org/",
"owl": "http://www.w3.org/2002/07/owl#",
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"semapv": "https://w3id.org/semapv/vocab/",
"skos": "http://www.w3.org/2004/02/skos/core#",
"sssom": "https://w3id.org/sssom/",
}
self.assertEqual(
expected_prefix_map,
read_metadata[CURIE_MAP],
)

# This checks that nothing funny gets added unexpectedly
self.assertEqual(expected_prefix_map, reconsitited_msdf.prefix_map)
16 changes: 16 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,22 @@ def test_validate_json(self):
"""
self.assertIsNone(validate(self.correct_msdf1, self.validation_types))

@unittest.skip(
reason="""\

This test did not previously do what was expected. It was raising a validation error
not because of the text below suggesting the validator was able to identify an issue
with the `mapping_justification` slot, but because `orcid` was missing from the prefix map.
The error actually thrown was::

jsonschema.exceptions.ValidationError: The prefixes in {'orcid'} are missing from 'curie_map'.

With updates in https://github.com/mapping-commons/sssom-py/pull/431, the default prefix map
which includes `orcid` is added on parse, and this error goes away. Therefore, this test
now fails, but again, this is a sporadic failure since the test was not correct in the first
place. Therefore, this test is now skipped and marked for FIXME.
"""
)
def test_validate_json_fail(self):
"""
Test if JSONSchemaValidation fail is as expected.
Expand Down