Skip to content

Commit

Permalink
Merge pull request #297 from NASA-PDS/223_case_insensitve_wildcards
Browse files Browse the repository at this point in the history
Case-Insensitive Wildcard Matching for DB Queries
  • Loading branch information
nutjob4life authored Nov 12, 2021
2 parents 15ec887 + 3a0d45d commit ccde498
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 6 deletions.
32 changes: 26 additions & 6 deletions src/pds_doi_service/core/db/doi_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def _form_query_with_wildcards(column_name, search_tokens):
"""
# Partition the tokens containing wildcards from the fully specified ones
wildcard_tokens = list(filter(lambda token: "*" in token, search_tokens))
wildcard_tokens = list(filter(lambda token: "*" in token or "?" in token, search_tokens))
full_tokens = list(set(search_tokens) - set(wildcard_tokens))

# Clean up the column name provided so it can be used as a suitable
Expand All @@ -546,14 +546,31 @@ def _form_query_with_wildcards(column_name, search_tokens):
named_parameters = ",".join([f":{named_param_id}_{i}" for i in range(len(full_tokens))])
named_parameter_values = {f"{named_param_id}_{i}": full_tokens[i] for i in range(len(full_tokens))}

# Set up the named parameters for the GLOB portion of the WHERE used
# Next, because we actually use LIKE and not GLOB (for case-insensitivity),
# we need to convert wildcards from Unix style (*,?) to SQLite style (%,_),
# but we first need to escape any existing characters reserved by LIKE (% and _)
like_chars = ["%", "_"]
glob_chars = ["*", "?"]

for index, wildcard_token in enumerate(wildcard_tokens):
for like_char, glob_char in zip(like_chars, glob_chars):
# Escape reserved wildcards used by LIKE
wildcard_token = wildcard_token.replace(like_char, f"\\{like_char}")

# Replace wildcards used by GLOB with equivalent for LIKE
wildcard_token = wildcard_token.replace(glob_char, like_char)

# Update the list of wildcards
wildcard_tokens[index] = wildcard_token

# Set up the named parameters for the LIKE portion of the WHERE used
# find tokens containing wildcards
glob_parameters = " OR ".join(
[f"{column_name} GLOB :{named_param_id}_glob_{i}" for i in range(len(wildcard_tokens))]
like_parameters = " OR ".join(
[f"{column_name} LIKE :{named_param_id}_like_{i}" for i in range(len(wildcard_tokens))]
)

named_parameter_values.update(
{f"{named_param_id}_glob_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))}
{f"{named_param_id}_like_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))}
)

# Build the portion of the WHERE clause combining the necessary
Expand All @@ -567,7 +584,10 @@ def _form_query_with_wildcards(column_name, search_tokens):
where_subclause += " OR "

if wildcard_tokens:
where_subclause += f"{glob_parameters}"
where_subclause += f"{like_parameters}"

# Make sure SQLite knows we're using backslash for escaped chars
where_subclause += " ESCAPE '\\'"

where_subclause += ")"

Expand Down
16 changes: 16 additions & 0 deletions src/pds_doi_service/core/db/test/doi_database_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,22 @@ def test_query_by_wildcard(self):

self.assertEqual(len(o_query_result[-1]), 2)

# Test case-insensitive search of titles
o_query_result = self._doi_database.select_latest_rows(
query_criterias={"title": ["*shocked feldspars bundle ?"]}
)

# Should get all rows back
self.assertEqual(len(o_query_result[-1]), num_rows)

# Test combination of wildcard tokens on a DOI search
o_query_result = self._doi_database.select_latest_rows(
query_criterias={"doi": ["10.17189/?0001", "10.1718*/20003"]}
)

# Should only match two DOI's
self.assertEqual(len(o_query_result[-1]), 2)


if __name__ == "__main__":
unittest.main()

0 comments on commit ccde498

Please sign in to comment.