From c48e21f45b92a158ab22d770a67afb5abdc0fe60 Mon Sep 17 00:00:00 2001 From: Chandrasekharan M <117059509+chandrasekharan-zipstack@users.noreply.github.com> Date: Tue, 1 Oct 2024 13:57:46 +0530 Subject: [PATCH] feat: Measure indexing time separately (#107) * Measure indexing time separately * Measure time for answer prompt API * README update on development with SDK, minor PR comment addressed --- README.md | 44 ++++++++++ src/unstract/sdk/index.py | 159 +++++++++++++++++++++---------------- src/unstract/sdk/prompt.py | 7 +- 3 files changed, 136 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index abe6a318..ed3b1162 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,50 @@ Supported commands: Unstract SDK 0.3.2 uses the following version of Llama Index Version **0.9.28** as on January 14th, 2024 +### Developing with the SDK + +Ensure that you have all the required dependencies and pre-commit hooks installed +```shell +pdm install +pre-commit install +``` + +Once the changes have been made, it can be tested with [Unstract](https://github.com/Zipstack/unstract) through the following means. + +#### With PDM +Specify the SDK as a dependency to a project using a tool like `pdm` by adding the following to your `pyproject.toml` + +```toml +[tool.pdm.dev-dependencies] +local_copies = [ + "-e unstract-adapters @ file:///${UNSTRACT_ADAPTERS_PATH}", + "-e unstract-sdk @ file:///${UNSTRACT_SDK_PATH}", +] +``` +Or by running the below command +```shell +pdm add -e /path/to/unstract-sdk --dev +``` + +#### With pip +- If the project is using `pip` it might be possible to add it as a dependency in `requirements.txt` +``` +-e /path/to/unstract-sdk +``` +NOTE: Building locally might require the below section to be replaced in the `unstract-sdk`'s build system configuration +``` +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" +``` +- Another option is to provide a git URL in `requirements.txt`, this can come in handy while building tool +docker images. Don't forget to run `apt install git` within the `Dockerfile` for this +```shell +unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch +``` + +- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and upload / download your package from this server + ### Environment variables required for various LLMs (deprecated) - Azure OpenAI diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py index a8b99c48..368e440b 100644 --- a/src/unstract/sdk/index.py +++ b/src/unstract/sdk/index.py @@ -178,7 +178,7 @@ def extract_text( logger.error(f"Error occured inside function 'process_text': {e}") return extracted_text - @log_elapsed(operation="INDEXING(might include EXTRACTION)") + @log_elapsed(operation="CHECK_AND_INDEX(overall)") def index( self, tool_id: str, @@ -293,82 +293,101 @@ def index( if not extracted_text: raise IndexingError("No text available to index") - full_text = [ - { - "section": "full", - "text_contents": extracted_text, - } - ] - - # Check if chunking is required - documents = [] - for item in full_text: - text = item["text_contents"] - self.tool.stream_log("Indexing file...") - document = Document( - text=text, - doc_id=doc_id, - metadata={"section": item["section"]}, - ) - document.id_ = doc_id - documents.append(document) - self.tool.stream_log(f"Number of documents: {len(documents)}") - - if doc_id_found: - # Delete the nodes for the doc_id - try: - vector_db.delete(ref_doc_id=doc_id) - self.tool.stream_log(f"Deleted nodes for {doc_id}") - except Exception as e: - self.tool.stream_log( - f"Error deleting nodes for {doc_id}: {e}", - level=LogLevel.ERROR, - ) - raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e + self.index_to_vector_db( + vector_db=vector_db, + embedding=embedding, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + doc_id=doc_id, + text_to_idx=extracted_text, + doc_id_found=doc_id_found, + ) + return doc_id + finally: + vector_db.close() + + @log_elapsed(operation="INDEXING") + def index_to_vector_db( + self, + vector_db: VectorDB, + embedding: Embedding, + chunk_size: int, + chunk_overlap: int, + text_to_idx: str, + doc_id: str, + doc_id_found: bool, + ): + self.tool.stream_log("Indexing file...") + full_text = [ + { + "section": "full", + "text_contents": text_to_idx, + } + ] + # Check if chunking is required + documents = [] + for item in full_text: + text = item["text_contents"] + document = Document( + text=text, + doc_id=doc_id, + metadata={"section": item["section"]}, + ) + document.id_ = doc_id + documents.append(document) + self.tool.stream_log(f"Number of documents: {len(documents)}") + if doc_id_found: + # Delete the nodes for the doc_id try: - if chunk_size == 0: - parser = SentenceSplitter.from_defaults( - chunk_size=len(documents[0].text) + 10, - chunk_overlap=0, - callback_manager=embedding.get_callback_manager(), - ) - nodes = parser.get_nodes_from_documents( - documents, show_progress=True - ) - node = nodes[0] - node.embedding = embedding.get_query_embedding(" ") - vector_db.add(doc_id, nodes=[node]) - self.tool.stream_log("Added node to vector db") - else: - self.tool.stream_log("Adding nodes to vector db...") - # TODO: Phase 2: - # Post insertion to VDB, use query using doc_id and - # store all the VDB ids to a table against the doc_id - # During deletion for cases where metadata filtering - # does not work, these ids can be used for direct deletion - # This new table will also act like an audit trail for - # all nodes that were added to the VDB by Unstract - # Once this is in place, the overridden implementation - # of prefixing ids with doc_id before adding to VDB - # can be removed - vector_db.index_document( - documents, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - show_progress=True, - ) + vector_db.delete(ref_doc_id=doc_id) + self.tool.stream_log(f"Deleted nodes for {doc_id}") except Exception as e: self.tool.stream_log( - f"Error adding nodes to vector db: {e}", + f"Error deleting nodes for {doc_id}: {e}", level=LogLevel.ERROR, ) - raise IndexingError(str(e)) from e + raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e - self.tool.stream_log("File has been indexed successfully") - return doc_id - finally: - vector_db.close() + try: + if chunk_size == 0: + parser = SentenceSplitter.from_defaults( + chunk_size=len(documents[0].text) + 10, + chunk_overlap=0, + callback_manager=embedding.get_callback_manager(), + ) + nodes = parser.get_nodes_from_documents(documents, show_progress=True) + node = nodes[0] + node.embedding = embedding.get_query_embedding(" ") + vector_db.add(doc_id, nodes=[node]) + self.tool.stream_log("Added node to vector db") + else: + self.tool.stream_log("Adding nodes to vector db...") + # TODO: Phase 2: + # Post insertion to VDB, use query using doc_id and + # store all the VDB ids to a table against the doc_id + # During deletion for cases where metadata filtering + # does not work, these ids can be used for direct deletion + # This new table will also act like an audit trail for + # all nodes that were added to the VDB by Unstract + # Once this is in place, the overridden implementation + # of prefixing ids with doc_id before adding to VDB + # can be removed + vector_db.index_document( + documents, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + show_progress=True, + ) + except Exception as e: + self.tool.stream_log( + f"Error adding nodes to vector db: {e}", + level=LogLevel.ERROR, + ) + raise IndexingError(str(e)) from e + + self.tool.stream_log("File has been indexed successfully") + return def generate_index_key( self, diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py index d6f72354..71ec5404 100644 --- a/src/unstract/sdk/prompt.py +++ b/src/unstract/sdk/prompt.py @@ -7,6 +7,7 @@ from unstract.sdk.constants import LogLevel, PromptStudioKeys, ToolEnv from unstract.sdk.helper import SdkHelper from unstract.sdk.tool.base import BaseTool +from unstract.sdk.utils.common_utils import log_elapsed logger = logging.getLogger(__name__) @@ -33,6 +34,7 @@ def __init__( if not is_public_call: self.bearer_token = tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY) + @log_elapsed(operation="ANSWER_PROMPTS") def answer_prompt( self, payload: dict[str, Any], params: Optional[dict[str, str]] = None ) -> dict[str, Any]: @@ -97,10 +99,7 @@ def _post_call( response: Response = Response() try: response = requests.post( - url=url, - json=payload, - params=params, - headers=headers + url=url, json=payload, params=params, headers=headers ) response.raise_for_status() result["status"] = "OK"