Simplify parsing SSSOM w/ curies.chain #431

Merged (23 commits), Oct 2, 2023
Changes from 14 commits
70 changes: 31 additions & 39 deletions src/sssom/parsers.py
@@ -6,12 +6,13 @@
import logging as _logging
import re
import typing
from collections import Counter
from collections import ChainMap, Counter
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, TextIO, Tuple, Union, cast
from xml.dom import Node, minidom
from xml.dom.minidom import Document

import curies
import numpy as np
import pandas as pd
import requests
@@ -55,7 +56,7 @@

from .context import HINT, ensure_converter
from .sssom_document import MappingSetDocument
from .typehints import Metadata, MetadataType, generate_mapping_set_id
from .typehints import Metadata, MetadataType, generate_mapping_set_id, get_default_metadata
from .util import (
PREFIX_MAP_KEY,
SSSOM_DEFAULT_RDF_SERIALISATION,
@@ -193,44 +194,35 @@ def parse_sssom_table(
stream: io.StringIO = _open_input(file_path)
sep_new = _get_seperator_symbol_from_file_path(file_path)
df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
# if mapping_predicates:
# # Filter rows based on presence of predicate_id list provided.
# df = df[df["predicate_id"].isin(mapping_predicates)]

# If SSSOM external metadata is provided, merge it with the internal metadata

if sssom_metadata:
if meta:
for k, v in meta.items():
if k in sssom_metadata:
if sssom_metadata[k] != v:
logging.warning(
f"SSSOM internal metadata {k} ({sssom_metadata[k]}) "
f"conflicts with provided ({meta[k]})."
)
else:
logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.")
sssom_metadata[k] = v
meta = sssom_metadata

if PREFIX_MAP_KEY in sssom_metadata:
if prefix_map:
for k, v in prefix_map.items():
if k in sssom_metadata[CURIE_MAP]:
if sssom_metadata[CURIE_MAP][k] != v:
logging.warning(
f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) "
f"conflicts with provided ({prefix_map[k]})."
)
else:
logging.info(
f"Externally provided metadata {k}:{v} is added to metadata set."
)
sssom_metadata[CURIE_MAP][k] = v
prefix_map = sssom_metadata[CURIE_MAP]
if meta is None:
meta = {}

# The priority order for combining prefix maps are:
# 1. Internal prefix map inside the document
# 2. Prefix map passed through this function inside the ``meta``
# 3. Prefix map passed through this function to ``prefix_map`` (handled with ensure_converter)
# 4. Default prefix map (handled with ensure_converter)
converter = curies.chain(
[
Converter.from_prefix_map(sssom_metadata.pop(CURIE_MAP, {})),
Converter.from_prefix_map(meta.pop(CURIE_MAP, {})),
ensure_converter(prefix_map),
]
)

# The priority order for combining metadata is:
# 1. Metadata appearing in the SSSOM document
# 2. Metadata passed through ``meta`` to this function
# 3. Default metadata
combine_meta = dict(
ChainMap(
sssom_metadata,
meta,
get_default_metadata(),
)
)

meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
msdf = from_sssom_dataframe(df, prefix_map=meta_all.prefix_map, meta=meta_all.metadata)
msdf = from_sssom_dataframe(df, prefix_map=converter, meta=combine_meta)
return msdf


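A side note on the priority semantics both constructs above rely on: curies.chain keeps the first converter's binding when the same prefix occurs in more than one converter, and dict(ChainMap(...)) resolves duplicate keys in favor of the leftmost mapping. A minimal sketch with made-up prefixes and metadata values:

from collections import ChainMap

import curies
from curies import Converter

# Hypothetical prefix maps: "ex" is bound differently in the document and externally.
internal = Converter.from_prefix_map({"ex": "https://example.org/internal/"})
external = Converter.from_prefix_map(
    {"ex": "https://example.org/external/", "other": "https://example.org/other/"}
)

converter = curies.chain([internal, external])
assert converter.expand("ex:1") == "https://example.org/internal/1"  # first converter wins on conflict
assert converter.expand("other:1") == "https://example.org/other/1"  # non-conflicting prefixes are merged

# Same idea for metadata: the leftmost mapping in the ChainMap wins on duplicate keys.
combined = dict(ChainMap({"license": "internal"}, {"license": "external", "mapping_set_id": "x"}))
assert combined == {"license": "internal", "mapping_set_id": "x"}
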
2 changes: 1 addition & 1 deletion src/sssom/writers.py
@@ -441,7 +441,7 @@ def to_fhir_json(msdf: MappingSetDataFrame) -> Dict:
def to_json(msdf: MappingSetDataFrame) -> JsonObj:
"""Convert a mapping set dataframe to a JSON object."""
doc = to_mapping_set_document(msdf)
data = JSONDumper().dumps(doc.mapping_set, contexts=doc.prefix_map)
data = JSONDumper().dumps(doc.mapping_set, contexts={"@context": doc.prefix_map})
json_obj = json.loads(data)
return json_obj

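For context, the contexts argument now receives a JSON-LD-style wrapper around the prefix map rather than the bare map, presumably so that JSONDumper is handed a well-formed JSON-LD context object. A hypothetical shape (example prefixes only):

contexts = {
    "@context": {
        "EX": "https://example.org/ex_",
        "skos": "http://www.w3.org/2004/02/skos/core#",
    }
}
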
1 change: 1 addition & 0 deletions tests/test_collapse.py
@@ -73,6 +73,7 @@ def test_diff(self):
def test_reconcile_prefix(self):
"""Test curie reconciliation is performing as expected."""
msdf = parse_sssom_table(data_dir / "basic3.tsv")
msdf.clean_prefix_map()

self.assertEqual(
{
7 changes: 4 additions & 3 deletions tests/test_conversion.py
@@ -146,9 +146,10 @@ def _test_to_dataframe(self, mdoc, test):
test.ct_data_frame_rows,
f"The pandas data frame has less elements than the orginal one for {test.filename}",
)
df.to_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
# data = pd.read_csv(test.get_out_file("roundtrip.tsv"), sep="\t")
data = parse_sssom_table(test.get_out_file("roundtrip.tsv")).df
path = test.get_out_file("roundtrip.tsv")
with open(path, "w") as file:
write_table(msdf, file=file)
data = parse_sssom_table(path).df
self.assertEqual(
len(data),
test.ct_data_frame_rows,
8 changes: 1 addition & 7 deletions tests/test_utils.py
@@ -129,16 +129,10 @@ def test_clean_prefix_map_strict(self):
def test_clean_prefix_map_not_strict(self):
"""Test clean prefix map with 'strict'=False."""
msdf = parse_sssom_table(f"{data_dir}/test_clean_prefix.tsv")
original_curie_map = msdf.prefix_map
self.assertEqual(
{"a", "b", "c", "d", "orcid"}.union(SSSOM_BUILT_IN_PREFIXES),
set(original_curie_map),
)
msdf.clean_prefix_map(strict=False)
new_curie_map = msdf.prefix_map
self.assertEqual(
{"a", "b", "c", "d", "orcid", "x1", "y1", "z1"}.union(SSSOM_BUILT_IN_PREFIXES),
set(new_curie_map),
set(msdf.prefix_map),
)

def test_invert_nodes(self):
16 changes: 16 additions & 0 deletions tests/test_validate.py
@@ -29,6 +29,22 @@ def test_validate_json(self):
"""
self.assertIsNone(validate(self.correct_msdf1, self.validation_types))

@unittest.skip(
reason="""\

This test did not previously do what was expected. It was raising a validation error
not because of the text below suggesting the validator was able to identify an issue
with the `mapping_justification` slot, but because `orcid` was missing from the prefix map.
The error actually thrown was::

jsonschema.exceptions.ValidationError: The prefixes in {'orcid'} are missing from 'curie_map'.

With updates in https://github.com/mapping-commons/sssom-py/pull/431, the default prefix map
which includes `orcid` is added on parse, and this error goes away. Therefore, this test
now fails, but again, this is a sporadic failure since the test was not correct in the first
place. Therefore, this test is now skipped and marked for FIXME.
"""
)
def test_validate_json_fail(self):
"""
Test if JSONSchemaValidation fail is as expected.