Skip to content

Commit

Permalink
[FIX] Support for text extraction independent for indexing status. (#141)
Browse files Browse the repository at this point in the history

* Exception handling for Prompt Service

* Fix/handling extraction for duplicate documents

* Adding validation before extraction
  • Loading branch information
harini-venkataraman authored Jan 8, 2025
1 parent 7758532 commit eaccd55
Showing 1 changed file with 16 additions and 3 deletions.
19 changes: 16 additions & 3 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ def query_index(

try:
self.tool.stream_log(
f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
)
f">>> Querying '{vector_db_instance_id}' for {doc_id}..."
)
try:
doc_id_eq_filter = MetadataFilter.from_dict(
{
Expand Down Expand Up @@ -287,6 +287,20 @@ def index(

if doc_id_found and not reindex:
self.tool.stream_log(f"File was indexed already under {doc_id}")

if not fs.exists(output_file_path):
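# Workaround: the document is already indexed, but no extracted
# output file exists at output_file_path (e.g. the same document
# was uploaded twice in different projects), so run extraction here.
# To be reconsidered once a permanent fix is in place.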
extracted_text = self.extract_text(
x2text_instance_id=x2text_instance_id,
file_path=file_path,
output_file_path=output_file_path,
enable_highlight=enable_highlight,
usage_kwargs=usage_kwargs,
process_text=process_text,
fs=fs,
)
return doc_id

extracted_text = self.extract_text(
Expand All @@ -298,7 +312,6 @@ def index(
process_text=process_text,
fs=fs,
)

if not extracted_text:
raise IndexingError("No text available to index")

Expand Down

0 comments on commit eaccd55

Please sign in to comment.