Skip to content

Commit

Permalink
Merge pull request #297 from NASA-PDS/223_case_insensitve_wildcards
Browse files Browse the repository at this point in the history
Case-Insensitive Wildcard Matching for DB Queries
  • Loading branch information
nutjob4life authored Nov 12, 2021
2 parents 15ec887 + 3a0d45d commit ccde498
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 6 deletions.
32 changes: 26 additions & 6 deletions src/pds_doi_service/core/db/doi_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def _form_query_with_wildcards(column_name, search_tokens):
"""
# Partition the tokens containing wildcards from the fully specified ones
wildcard_tokens = list(filter(lambda token: "*" in token, search_tokens))
wildcard_tokens = list(filter(lambda token: "*" in token or "?" in token, search_tokens))
full_tokens = list(set(search_tokens) - set(wildcard_tokens))

# Clean up the column name provided so it can be used as a suitable
Expand All @@ -546,14 +546,31 @@ def _form_query_with_wildcards(column_name, search_tokens):
named_parameters = ",".join([f":{named_param_id}_{i}" for i in range(len(full_tokens))])
named_parameter_values = {f"{named_param_id}_{i}": full_tokens[i] for i in range(len(full_tokens))}

# Set up the named parameters for the GLOB portion of the WHERE used
# Next, because we actually use LIKE and not GLOB (for case-insensitivity),
# we need to convert wildcards from Unix style (*,?) to SQLite style (%,_),
# but we first need to escape any existing characters reserved by LIKE (% and _)
like_chars = ["%", "_"]
glob_chars = ["*", "?"]

for index, wildcard_token in enumerate(wildcard_tokens):
for like_char, glob_char in zip(like_chars, glob_chars):
# Escape reserved wildcards used by LIKE
wildcard_token = wildcard_token.replace(like_char, f"\\{like_char}")

# Replace wildcards used by GLOB with equivalent for LIKE
wildcard_token = wildcard_token.replace(glob_char, like_char)

# Update the list of wildcards
wildcard_tokens[index] = wildcard_token

# Set up the named parameters for the LIKE portion of the WHERE used
# find tokens containing wildcards
glob_parameters = " OR ".join(
[f"{column_name} GLOB :{named_param_id}_glob_{i}" for i in range(len(wildcard_tokens))]
like_parameters = " OR ".join(
[f"{column_name} LIKE :{named_param_id}_like_{i}" for i in range(len(wildcard_tokens))]
)

named_parameter_values.update(
{f"{named_param_id}_glob_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))}
{f"{named_param_id}_like_{i}": wildcard_tokens[i] for i in range(len(wildcard_tokens))}
)

# Build the portion of the WHERE clause combining the necessary
Expand All @@ -567,7 +584,10 @@ def _form_query_with_wildcards(column_name, search_tokens):
where_subclause += " OR "

if wildcard_tokens:
where_subclause += f"{glob_parameters}"
where_subclause += f"{like_parameters}"

# Make sure SQLite knows we're using backslash for escaped chars
where_subclause += " ESCAPE '\\'"

where_subclause += ")"

Expand Down
16 changes: 16 additions & 0 deletions src/pds_doi_service/core/db/test/doi_database_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,22 @@ def test_query_by_wildcard(self):

self.assertEqual(len(o_query_result[-1]), 2)

# Test case-insensitive search of titles
o_query_result = self._doi_database.select_latest_rows(
query_criterias={"title": ["*shocked feldspars bundle ?"]}
)

# Should get all rows back
self.assertEqual(len(o_query_result[-1]), num_rows)

# Test combination of wildcard tokens on a DOI search
o_query_result = self._doi_database.select_latest_rows(
query_criterias={"doi": ["10.17189/?0001", "10.1718*/20003"]}
)

# Should only match two DOI's
self.assertEqual(len(o_query_result[-1]), 2)


if __name__ == "__main__":
unittest.main()

0 comments on commit ccde498

Please sign in to comment.