From c48e21f45b92a158ab22d770a67afb5abdc0fe60 Mon Sep 17 00:00:00 2001
From: Chandrasekharan M
 <117059509+chandrasekharan-zipstack@users.noreply.github.com>
Date: Tue, 1 Oct 2024 13:57:46 +0530
Subject: [PATCH] feat: Measure indexing time separately (#107)

* Measure indexing time separately

* Measure time for answer prompt API

* README update on development with SDK, minor PR comment addressed
---
 README.md                  |  44 ++++++++++
 src/unstract/sdk/index.py  | 159 +++++++++++++++++++++----------------
 src/unstract/sdk/prompt.py |   7 +-
 3 files changed, 136 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index abe6a318..ed3b1162 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,50 @@ Supported commands:
 Unstract SDK 0.3.2 uses the following version of Llama
 Index Version **0.9.28** as on January 14th, 2024
 
+### Developing with the SDK
+
+Ensure that you have all the required dependencies and pre-commit hooks installed
+```shell
+pdm install
+pre-commit install
+```
+
+Once the changes have been made, they can be tested with [Unstract](https://github.com/Zipstack/unstract) in any of the following ways.
+
+#### With PDM
+Specify the SDK as a dependency to a project using a tool like `pdm` by adding the following to your `pyproject.toml`
+
+```toml
+[tool.pdm.dev-dependencies]
+local_copies = [
+    "-e unstract-adapters @ file:///${UNSTRACT_ADAPTERS_PATH}",
+    "-e unstract-sdk @ file:///${UNSTRACT_SDK_PATH}",
+]
+```
+Alternatively, run the following command:
+```shell
+pdm add -e /path/to/unstract-sdk --dev
+```
+
+#### With pip
+- If the project uses `pip`, it might be possible to add the SDK as a dependency in `requirements.txt`:
+```
+-e /path/to/unstract-sdk
+```
+NOTE: Building locally might require the below section to be replaced in the `unstract-sdk`'s build system configuration
+```
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+```
+- Another option is to provide a git URL in `requirements.txt`; this can come in handy while building tool
+Docker images. Don't forget to run `apt install git` within the `Dockerfile` for this.
+```shell
+unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch
+```
+
+- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and uploading / downloading your package from this server.
+
 ### Environment variables required for various LLMs (deprecated)
 
 - Azure OpenAI
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
index a8b99c48..368e440b 100644
--- a/src/unstract/sdk/index.py
+++ b/src/unstract/sdk/index.py
@@ -178,7 +178,7 @@ def extract_text(
                 logger.error(f"Error occured inside function 'process_text': {e}")
         return extracted_text
 
-    @log_elapsed(operation="INDEXING(might include EXTRACTION)")
+    @log_elapsed(operation="CHECK_AND_INDEX(overall)")
     def index(
         self,
         tool_id: str,
@@ -293,82 +293,101 @@ def index(
             if not extracted_text:
                 raise IndexingError("No text available to index")
 
-            full_text = [
-                {
-                    "section": "full",
-                    "text_contents": extracted_text,
-                }
-            ]
-
-            # Check if chunking is required
-            documents = []
-            for item in full_text:
-                text = item["text_contents"]
-                self.tool.stream_log("Indexing file...")
-                document = Document(
-                    text=text,
-                    doc_id=doc_id,
-                    metadata={"section": item["section"]},
-                )
-                document.id_ = doc_id
-                documents.append(document)
-            self.tool.stream_log(f"Number of documents: {len(documents)}")
-
-            if doc_id_found:
-                # Delete the nodes for the doc_id
-                try:
-                    vector_db.delete(ref_doc_id=doc_id)
-                    self.tool.stream_log(f"Deleted nodes for {doc_id}")
-                except Exception as e:
-                    self.tool.stream_log(
-                        f"Error deleting nodes for {doc_id}: {e}",
-                        level=LogLevel.ERROR,
-                    )
-                    raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
+            self.index_to_vector_db(
+                vector_db=vector_db,
+                embedding=embedding,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap,
+                doc_id=doc_id,
+                text_to_idx=extracted_text,
+                doc_id_found=doc_id_found,
+            )
+            return doc_id
+        finally:
+            vector_db.close()
+
+    @log_elapsed(operation="INDEXING")
+    def index_to_vector_db(
+        self,
+        vector_db: VectorDB,
+        embedding: Embedding,
+        chunk_size: int,
+        chunk_overlap: int,
+        text_to_idx: str,
+        doc_id: str,
+        doc_id_found: bool,
+    ):
+        self.tool.stream_log("Indexing file...")
+        full_text = [
+            {
+                "section": "full",
+                "text_contents": text_to_idx,
+            }
+        ]
+        # Check if chunking is required
+        documents = []
+        for item in full_text:
+            text = item["text_contents"]
+            document = Document(
+                text=text,
+                doc_id=doc_id,
+                metadata={"section": item["section"]},
+            )
+            document.id_ = doc_id
+            documents.append(document)
+        self.tool.stream_log(f"Number of documents: {len(documents)}")
 
+        if doc_id_found:
+            # Delete the nodes for the doc_id
             try:
-                if chunk_size == 0:
-                    parser = SentenceSplitter.from_defaults(
-                        chunk_size=len(documents[0].text) + 10,
-                        chunk_overlap=0,
-                        callback_manager=embedding.get_callback_manager(),
-                    )
-                    nodes = parser.get_nodes_from_documents(
-                        documents, show_progress=True
-                    )
-                    node = nodes[0]
-                    node.embedding = embedding.get_query_embedding(" ")
-                    vector_db.add(doc_id, nodes=[node])
-                    self.tool.stream_log("Added node to vector db")
-                else:
-                    self.tool.stream_log("Adding nodes to vector db...")
-                    # TODO: Phase 2:
-                    # Post insertion to VDB, use query using doc_id and
-                    # store all the VDB ids to a table against the doc_id
-                    # During deletion for cases where metadata filtering
-                    # does not work, these ids can be used for direct deletion
-                    # This new table will also act like an audit trail for
-                    # all nodes that were added to the VDB by Unstract
-                    # Once this is in place, the overridden implementation
-                    # of prefixing ids with doc_id before adding to VDB
-                    # can be removed
-                    vector_db.index_document(
-                        documents,
-                        chunk_size=chunk_size,
-                        chunk_overlap=chunk_overlap,
-                        show_progress=True,
-                    )
+                vector_db.delete(ref_doc_id=doc_id)
+                self.tool.stream_log(f"Deleted nodes for {doc_id}")
             except Exception as e:
                 self.tool.stream_log(
-                    f"Error adding nodes to vector db: {e}",
+                    f"Error deleting nodes for {doc_id}: {e}",
                     level=LogLevel.ERROR,
                 )
-                raise IndexingError(str(e)) from e
+                raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
 
-            self.tool.stream_log("File has been indexed successfully")
-            return doc_id
-        finally:
-            vector_db.close()
+        try:
+            if chunk_size == 0:
+                parser = SentenceSplitter.from_defaults(
+                    chunk_size=len(documents[0].text) + 10,
+                    chunk_overlap=0,
+                    callback_manager=embedding.get_callback_manager(),
+                )
+                nodes = parser.get_nodes_from_documents(documents, show_progress=True)
+                node = nodes[0]
+                node.embedding = embedding.get_query_embedding(" ")
+                vector_db.add(doc_id, nodes=[node])
+                self.tool.stream_log("Added node to vector db")
+            else:
+                self.tool.stream_log("Adding nodes to vector db...")
+                # TODO: Phase 2:
+                # Post insertion to VDB, use query using doc_id and
+                # store all the VDB ids to a table against the doc_id
+                # During deletion for cases where metadata filtering
+                # does not work, these ids can be used for direct deletion
+                # This new table will also act like an audit trail for
+                # all nodes that were added to the VDB by Unstract
+                # Once this is in place, the overridden implementation
+                # of prefixing ids with doc_id before adding to VDB
+                # can be removed
+                vector_db.index_document(
+                    documents,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    show_progress=True,
+                )
+        except Exception as e:
+            self.tool.stream_log(
+                f"Error adding nodes to vector db: {e}",
+                level=LogLevel.ERROR,
+            )
+            raise IndexingError(str(e)) from e
+
+        self.tool.stream_log("File has been indexed successfully")
+        return
 
     def generate_index_key(
         self,
diff --git a/src/unstract/sdk/prompt.py b/src/unstract/sdk/prompt.py
index d6f72354..71ec5404 100644
--- a/src/unstract/sdk/prompt.py
+++ b/src/unstract/sdk/prompt.py
@@ -7,6 +7,7 @@
 from unstract.sdk.constants import LogLevel, PromptStudioKeys, ToolEnv
 from unstract.sdk.helper import SdkHelper
 from unstract.sdk.tool.base import BaseTool
+from unstract.sdk.utils.common_utils import log_elapsed
 
 logger = logging.getLogger(__name__)
 
@@ -33,6 +34,7 @@ def __init__(
         if not is_public_call:
             self.bearer_token = tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY)
 
+    @log_elapsed(operation="ANSWER_PROMPTS")
     def answer_prompt(
         self, payload: dict[str, Any], params: Optional[dict[str, str]] = None
     ) -> dict[str, Any]:
@@ -97,10 +99,7 @@ def _post_call(
         response: Response = Response()
         try:
             response = requests.post(
-                url=url,
-                json=payload,
-                params=params,
-                headers=headers
+                url=url, json=payload, params=params, headers=headers
             )
             response.raise_for_status()
             result["status"] = "OK"