From 831a58eb637eacba38355c32558ab5eaf7f34961 Mon Sep 17 00:00:00 2001 From: Harshad Hegde Date: Mon, 25 Sep 2023 19:31:47 -0500 Subject: [PATCH 1/2] Convert SSSOM TSV => JSON => TSV and confirm the MappingSetDataFrame remains consistent. --- src/sssom/parsers.py | 43 ++++++++++++++++++++++++++++++------ src/sssom/util.py | 8 ++++++- tests/data/sample1.sssom.tsv | 8 +++++++ tests/test_parsers.py | 20 ++++++++++++++++- 4 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 tests/data/sample1.sssom.tsv diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py index 16fc4d17..0dfaaeab 100644 --- a/src/sssom/parsers.py +++ b/src/sssom/parsers.py @@ -212,21 +212,28 @@ def parse_sssom_table( logging.info(f"Externally provided metadata {k}:{v} is added to metadata set.") sssom_metadata[k] = v meta = sssom_metadata - - if "curie_map" in sssom_metadata: + if CURIE_MAP in sssom_metadata: if prefix_map: + # Convert sssom_metadata[CURIE_MAP] keys to lowercase for case-insensitive comparison + curie_map_lower = {k.lower(): k for k in sssom_metadata[CURIE_MAP].keys()} + for k, v in prefix_map.items(): - if k in sssom_metadata[CURIE_MAP]: - if sssom_metadata[CURIE_MAP][k] != v: + k_lower = k.lower() + + if k_lower in curie_map_lower: + original_key = curie_map_lower[k_lower] + if sssom_metadata[CURIE_MAP][original_key] != v: logging.warning( - f"SSSOM prefix map {k} ({sssom_metadata[CURIE_MAP][k]}) " - f"conflicts with provided ({prefix_map[k]})." + f"SSSOM prefix map {original_key} ({sssom_metadata[CURIE_MAP][original_key]}) " + f"conflicts with provided ({v})." ) + sssom_metadata[CURIE_MAP][original_key] = v else: logging.info( f"Externally provided metadata {k}:{v} is added to metadata set." ) sssom_metadata[CURIE_MAP][k] = v + prefix_map = sssom_metadata[CURIE_MAP] meta_all = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta) @@ -264,10 +271,32 @@ def parse_sssom_json( ) -> MappingSetDataFrame: """Parse a TSV to a :class:`MappingSetDocument` to a :class`MappingSetDataFrame`.""" raise_for_bad_path(file_path) - metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta) with open(file_path) as json_file: jsondoc = json.load(json_file) + + # Get prefix map from jsondoc and update metadata. + # This takes priority over default prefix_map in case of a tie. + jsondoc_prefix_map = jsondoc["@context"] + + # Convert keys in both maps to lower case for comparison + if prefix_map: + lowercase_prefix_map = {k.lower(): v for k, v in prefix_map.items()} + + # Iterate over jsondoc_prefix_map + for key, value in jsondoc_prefix_map.items(): + # If lowercase key exists in lowercase_prefix_map, update the value and key + if key.lower() in lowercase_prefix_map: + # Remove the old key-value pair + if key in prefix_map: + del prefix_map[key] + elif key.lower() in prefix_map: + del prefix_map[key.lower()] + # Add the new key-value pair + prefix_map[key] = value + + metadata = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta) + msdf = from_sssom_json(jsondoc=jsondoc, prefix_map=metadata.prefix_map, meta=metadata.metadata) # df: pd.DataFrame = msdf.df # if mapping_predicates and not df.empty(): diff --git a/src/sssom/util.py b/src/sssom/util.py index 5406e8ff..17f72e3f 100644 --- a/src/sssom/util.py +++ b/src/sssom/util.py @@ -155,7 +155,13 @@ def clean_prefix_map(self, strict: bool = True) -> None: if self.metadata: prefixes_in_table.update(get_prefixes_used_in_metadata(self.metadata)) - missing_prefixes = prefixes_in_table - self.converter.get_prefixes() + # Convert prefixes_in_table to lowercase + prefixes_in_table_lower = {prefix.lower() for prefix in prefixes_in_table} + # Convert self.converter.get_prefixes() to lowercase + converter_prefixes_lower = {prefix.lower() for prefix in self.converter.get_prefixes()} + missing_prefixes = prefixes_in_table_lower - converter_prefixes_lower + # missing_prefixes = prefixes_in_table - self.converter.get_prefixes() + if missing_prefixes and strict: raise ValueError( f"{missing_prefixes} are used in the SSSOM mapping set but it does not exist in the prefix map" diff --git a/tests/data/sample1.sssom.tsv b/tests/data/sample1.sssom.tsv new file mode 100644 index 00000000..fc256028 --- /dev/null +++ b/tests/data/sample1.sssom.tsv @@ -0,0 +1,8 @@ +#curie_map: +# FBbt: "http://purl.obolibrary.org/obo/FBbt_" +# ORCID: "https://orcid.org/" +# UBERON: "http://purl.obolibrary.org/obo/UBERON_" +#creator_id: +# - "ORCID:0000-0002-6095-8718" +subject_id subject_label predicate_id object_id mapping_justification +FBbt:00000001 organism semapv:crossSpeciesExactMatch UBERON:0000468 semapv:ManualMappingCuration \ No newline at end of file diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 63dd505f..262d713a 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -19,11 +19,12 @@ from_sssom_dataframe, from_sssom_json, from_sssom_rdf, + parse_sssom_json, parse_sssom_table, ) from sssom.typehints import Metadata from sssom.util import PREFIX_MAP_KEY, sort_df_rows_columns -from sssom.writers import write_table +from sssom.writers import to_json, write_json, write_table from tests.test_data import data_dir as test_data_dir from tests.test_data import test_out_dir @@ -245,3 +246,20 @@ def test_parse_obographs_merged(self): ) msdf = parse_sssom_table(outfile) self.assertTrue(custom_curie_map.items() <= msdf.prefix_map.items()) + + def test_tsv_to_json_and_back(self): + """Test converting SSSOM TSV => JSON => SSSOM TSV such that it is reproducible.""" + sample_tsv = f"{test_data_dir}/sample1.sssom.tsv" + json_outfile = f"{test_out_dir}/sample1.json" + msdf1 = parse_sssom_table(sample_tsv) + msdf1.clean_prefix_map() + json_doc = to_json(msdf1) + + self.assertEqual(msdf1.prefix_map, json_doc["@context"]) + + with open(json_outfile, "w") as file: + write_json(msdf1, file) + + msdf2 = parse_sssom_json(json_outfile) + msdf2.clean_prefix_map() + self.assertEqual(msdf1.prefix_map, msdf2.prefix_map) From 1854d29efd387b4056946f4dff7e56a2df79c306 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 26 Sep 2023 08:56:00 +0200 Subject: [PATCH 2/2] Add more explicit tests --- tests/test_parsers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_parsers.py b/tests/test_parsers.py index 262d713a..f39762d2 100644 --- a/tests/test_parsers.py +++ b/tests/test_parsers.py @@ -249,10 +249,12 @@ def test_parse_obographs_merged(self): def test_tsv_to_json_and_back(self): """Test converting SSSOM TSV => JSON => SSSOM TSV such that it is reproducible.""" - sample_tsv = f"{test_data_dir}/sample1.sssom.tsv" - json_outfile = f"{test_out_dir}/sample1.json" + sample_tsv = test_data_dir / "sample1.sssom.tsv" + json_outfile = test_out_dir / "sample1.json" msdf1 = parse_sssom_table(sample_tsv) + self.assertIn("ORCID", msdf1.prefix_map) msdf1.clean_prefix_map() + self.assertIn("ORCID", msdf1.prefix_map) json_doc = to_json(msdf1) self.assertEqual(msdf1.prefix_map, json_doc["@context"]) @@ -261,5 +263,7 @@ def test_tsv_to_json_and_back(self): write_json(msdf1, file) msdf2 = parse_sssom_json(json_outfile) + self.assertIn("ORCID", msdf2.prefix_map) msdf2.clean_prefix_map() + self.assertIn("ORCID", msdf2.prefix_map) self.assertEqual(msdf1.prefix_map, msdf2.prefix_map)