From 3a0d45d8bd92b4cce76e32986108a4dddf8d00d6 Mon Sep 17 00:00:00 2001 From: Scott Collins Date: Thu, 11 Nov 2021 11:16:40 -0800 Subject: [PATCH] Modified DOIDataBase class to use LIKE, rather than GLOB, for performing case-insensitive wildcard searches for queries --- src/pds_doi_service/core/db/doi_database.py | 32 +++++++++++++++---- .../core/db/test/doi_database_test.py | 16 ++++++++++ 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/pds_doi_service/core/db/doi_database.py b/src/pds_doi_service/core/db/doi_database.py index 85581af5..c883151d 100644 --- a/src/pds_doi_service/core/db/doi_database.py +++ b/src/pds_doi_service/core/db/doi_database.py @@ -530,7 +530,7 @@ def _form_query_with_wildcards(column_name, search_tokens): """ # Partition the tokens containing wildcards from the fully specified ones - wildcard_tokens = list(filter(lambda token: "*" in token, search_tokens)) + wildcard_tokens = list(filter(lambda token: "*" in token or "?" in token, search_tokens)) full_tokens = list(set(search_tokens) - set(wildcard_tokens)) # Clean up the column name provided so it can be used as a suitable @@ -546,14 +546,31 @@ def _form_query_with_wildcards(column_name, search_tokens): named_parameters = ",".join([f":{named_param_id}_{i}" for i in range(len(full_tokens))]) named_parameter_values = {f"{named_param_id}_{i}": full_tokens[i] for i in range(len(full_tokens))} - # Set up the named parameters for the GLOB portion of the WHERE used + # Next, because we actually use LIKE and not GLOB (for the case-insensitivity), + # we need to convert wildcards from Unix style (*,?) 
to SQLite style (%,_), + # but we first need to escape any existing characters reserved by LIKE (% and _) + like_chars = ["%", "_"] + glob_chars = ["*", "?"] + + for index, wildcard_token in enumerate(wildcard_tokens): + for like_char, glob_char in zip(like_chars, glob_chars): + # Escape reserved wildcards used by LIKE + wildcard_token = wildcard_token.replace(like_char, f"\\{like_char}") + + # Replace wildcards used by GLOB with equivalent for LIKE + wildcard_token = wildcard_token.replace(glob_char, like_char) + + # Update the list of wildcards + wildcard_tokens[index] = wildcard_token + + # Set up the named parameters for the LIKE portion of the WHERE used # find tokens containing wildcards - glob_parameters = " OR ".join( - [f"{column_name} GLOB :{named_param_id}_glob_{i}" for i in range(len(wildcard_tokens))] + like_parameters = " OR ".join( + [f"{column_name} LIKE :{named_param_id}_like_{i}" for i in range(len(wildcard_tokens))] ) named_parameter_values.update( - {f"{named_param_id}_glob_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))} + {f"{named_param_id}_like_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))} ) # Build the portion of the WHERE clause combining the necessary @@ -567,7 +584,10 @@ def _form_query_with_wildcards(column_name, search_tokens): where_subclause += " OR " if wildcard_tokens: - where_subclause += f"{glob_parameters}" + where_subclause += f"{like_parameters}" + + # Make sure SQLite knows we're using backslash for escaped chars + where_subclause += " ESCAPE '\\'" where_subclause += ")" diff --git a/src/pds_doi_service/core/db/test/doi_database_test.py b/src/pds_doi_service/core/db/test/doi_database_test.py index 41e2a113..cf635fe9 100644 --- a/src/pds_doi_service/core/db/test/doi_database_test.py +++ b/src/pds_doi_service/core/db/test/doi_database_test.py @@ -280,6 +280,22 @@ def test_query_by_wildcard(self): self.assertEqual(len(o_query_result[-1]), 2) + # Test case-insensitive search of titles + 
o_query_result = self._doi_database.select_latest_rows( + query_criterias={"title": ["*shocked feldspars bundle ?"]} + ) + + # Should get all rows back + self.assertEqual(len(o_query_result[-1]), num_rows) + + # Test combination of wildcard tokens on a DOI search + o_query_result = self._doi_database.select_latest_rows( + query_criterias={"doi": ["10.17189/?0001", "10.1718*/20003"]} + ) + + # Should only match two DOI's + self.assertEqual(len(o_query_result[-1]), 2) + if __name__ == "__main__": unittest.main()