Feat/usage reporting page (#80)

* Pushing page usage * Pushing page usage * Updating pdf plumber dependency * Adding run id * Bumped up the version * Bumped up the patch version * Handled page calculation for LLM Whisperer * Moving usage push post processing * Added condition for empty page string * Expecting file name to be passed via usage kwargs * updated the version to 0.42.1 * updated the version to 0.44.0 --------- Signed-off-by: Rahul Johny <[email protected]>
Zipstack · Aug 12, 2024 · f5fd84f · f5fd84f
1 parent 28434ca
commit f5fd84f
Show file tree

Hide file tree

Showing 8 changed files with 256 additions and 32 deletions.
diff --git a/pdm.lock b/pdm.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,6 +57,7 @@ dependencies = [
     # For singleton classes
     "singleton-decorator~=1.0.0",
     "httpx>=0.25.2",
+    "pdfplumber>=0.11.2",
 ]
 readme = "README.md"
 urls = { Homepage = "https://unstract.com", "Release notes" = "https://github.com/Zipstack/unstract-sdk/releases", Source = "https://github.com/Zipstack/unstract-sdk" }

diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.43.0"
+__version__ = "0.44.0"
 
 
 def get_sdk_version():

diff --git a/src/unstract/sdk/audit.py b/src/unstract/sdk/audit.py
@@ -113,3 +113,49 @@ def push_usage_data(
         finally:
             if isinstance(token_counter, TokenCountingHandler):
                 token_counter.reset_counts()
+
+    def push_page_usage_data(
+        self,
+        platform_api_key: str,
+        page_count: int,
+        file_size: int,
+        file_type: str,
+        kwargs: dict[Any, Any] = None,
+    ) -> None:
+        """Pushes page usage details for a processed file to the platform.
+
+        POSTs a JSON payload to the ``/page-usage`` endpoint of the platform
+        service, authenticating with ``platform_api_key`` as a bearer token.
+        The outcome is reported through ``stream_log``; a non-200 response or
+        a ``requests.RequestException`` is logged as an error and not
+        re-raised.
+
+        Args:
+            platform_api_key (str): API key sent as the bearer token
+            page_count (int): Number of pages to report for the file
+            file_size (int): Size of the file
+            file_type (str): Type of the file (callers in the SDK pass the
+                MIME type)
+            kwargs (dict[Any, Any]): Optional extra details. The keys
+                ``run_id`` and ``file_name`` are read, each defaulting to an
+                empty string. ``None`` is treated as an empty dict.
+
+        Returns:
+            None
+        """
+        # The default is None, so normalise before calling .get() on it.
+        usage_kwargs = kwargs or {}
+        platform_host = self.get_env_or_die(ToolEnv.PLATFORM_HOST)
+        platform_port = self.get_env_or_die(ToolEnv.PLATFORM_PORT)
+        run_id = usage_kwargs.get("run_id", "")
+        file_name = usage_kwargs.get("file_name", "")
+        base_url = SdkHelper.get_platform_base_url(
+            platform_host=platform_host, platform_port=platform_port
+        )
+        url = f"{base_url}/page-usage"
+        headers = {"Authorization": f"Bearer {platform_api_key}"}
+
+        data = {
+            "page_count": page_count,
+            "file_name": file_name,
+            "file_size": file_size,
+            "file_type": file_type,
+            "run_id": run_id,
+        }
+
+        try:
+            response = requests.post(url, headers=headers, json=data, timeout=30)
+        except requests.RequestException as e:
+            self.stream_log(
+                log=f"Error while pushing page usage details: {e}",
+                level=LogLevel.ERROR,
+            )
+        else:
+            if response.status_code != 200:
+                # No trailing comma inside the parentheses: the message must
+                # be a single string, not a 1-tuple.
+                self.stream_log(
+                    log=(
+                        "Error while pushing page usage details: "
+                        f"{response.status_code} {response.reason}"
+                    ),
+                    level=LogLevel.ERROR,
+                )
+            else:
+                self.stream_log("Successfully pushed page usage details")
diff --git a/src/unstract/sdk/constants.py b/src/unstract/sdk/constants.py
@@ -158,3 +158,8 @@ class PublicAdapterKeys:
     PUBLIC_EMBEDDING_CONFIG = "PUBLIC_EMBEDDING_CONFIG"
     PUBLIC_VECTOR_DB_CONFIG = "PUBLIC_VECTOR_DB_CONFIG"
     PUBLIC_X2TEXT_CONFIG = "PUBLIC_X2TEXT_CONFIG"
+
+
+class MimeType:
+    """MIME type string constants."""
+
+    # PDF documents
+    PDF = "application/pdf"
+    # Plain text files
+    TEXT = "text/plain"
diff --git a/src/unstract/sdk/index.py b/src/unstract/sdk/index.py
@@ -225,35 +225,32 @@ def index(
             full_text = []
             extracted_text = ""
             try:
-                mime_type = ToolUtils.get_file_mime_type(file_path)
-                if mime_type == "text/plain":
-                    with open(file_path, encoding="utf-8") as file:
-                        extracted_text = file.read()
+                x2text = X2Text(
+                    tool=self.tool,
+                    adapter_instance_id=x2text_instance_id,
+                    usage_kwargs=usage_kwargs,
+                )
+                if enable_highlight and isinstance(
+                    x2text._x2text_instance, LLMWhisperer
+                ):
+                    process_response: TextExtractionResult = x2text.process(
+                        input_file_path=file_path,
+                        output_file_path=output_file_path,
+                        enable_highlight=enable_highlight,
+                    )
+                    whisper_hash_value = (
+                        process_response.extraction_metadata.whisper_hash
+                    )
+
+                    metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
+
+                    self.tool.update_exec_metadata(metadata)
+
                 else:
-                    x2text = X2Text(
-                        tool=self.tool, adapter_instance_id=x2text_instance_id
+                    process_response: TextExtractionResult = x2text.process(
+                        input_file_path=file_path,
+                        output_file_path=output_file_path,
                     )
-                    if enable_highlight and isinstance(
-                        x2text._x2text_instance, LLMWhisperer
-                    ):
-                        process_response: TextExtractionResult = x2text.process(
-                            input_file_path=file_path,
-                            output_file_path=output_file_path,
-                            enable_highlight=enable_highlight,
-                        )
-                        whisper_hash_value = (
-                            process_response.extraction_metadata.whisper_hash
-                        )
-
-                        metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}
-
-                        self.tool.update_exec_metadata(metadata)
-
-                    else:
-                        process_response: TextExtractionResult = x2text.process(
-                            input_file_path=file_path,
-                            output_file_path=output_file_path,
-                        )
 
                     extracted_text = process_response.extracted_text
             except AdapterError as e:

diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py
@@ -101,6 +101,23 @@ def get_file_mime_type(input_file: Path) -> str:
             input_file_obj.seek(0)
         return input_file_mime
 
+    @staticmethod
+    def get_file_size(input_file: Path) -> int:
+        """Gets the file size in bytes for an input file.
+
+        Args:
+            input_file (Path): Path object of the input file
+
+        Returns:
+            int: Size of the file in bytes
+        """
+        with open(input_file, mode="rb") as input_file_obj:
+            # whence=2 seeks relative to the end of the file; seek() returns
+            # the resulting absolute offset, which is the file length.
+            return input_file_obj.seek(0, 2)
+
     @staticmethod
     def str_to_bool(string: str) -> bool:
         """String value of boolean to boolean.
@@ -114,3 +131,57 @@ def str_to_bool(string: str) -> bool:
             bool
         """
         return string.lower() == "true"
+
+    # Used the same function from LLM Whisperer
+    @staticmethod
+    def calculate_page_count(
+        pages_string: str, max_page: int = 0, min_page: int = 1
+    ) -> int:
+        """Calculates the total number of pages based on the input string of
+        page numbers or ranges.
+
+        Parses the input 'pages_string' to extract individual page numbers or
+        ranges separated by commas.
+        Supports ranges like '1-5' and open-ended ranges like '4-' or '-5'.
+        Range ends are capped at 'max_page'; '-5' style ranges start at
+        'min_page'. Empty segments (e.g. a trailing comma) are ignored.
+
+        NOTE(review): counting is kept identical to the original so it stays
+        in step with LLM Whisperer: overlapping ranges are counted twice, and
+        single page numbers are not checked against 'max_page' or 'min_page'.
+        Confirm whether that matches the extraction behaviour.
+
+        Args:
+            pages_string (str): String containing page numbers or ranges
+            separated by commas. An empty or whitespace-only string means
+            "all pages".
+            max_page (int): Upper limit for page numbers (default is 0)
+            min_page (int): Lower limit for page numbers (default is 1)
+
+        Returns:
+            int: Total count of individual pages extracted from the input
+            string, or 'max_page' when no pages are specified
+
+        Raises:
+            ValueError: If a segment is not a valid page number or range, or
+            if 'max_page' is None for an open-ended range like '4-'
+        """
+        if not pages_string or not pages_string.strip():
+            return max_page
+        total = 0
+        for raw_part in pages_string.split(","):
+            part = raw_part.strip()
+            if not part:
+                # Tolerate stray separators such as "1,,3" or "1-3,"
+                continue
+            if "-" not in part:
+                int(part)  # Validation only: raises ValueError if not numeric
+                total += 1
+                continue
+            if part.startswith("-"):  # e.g., "-5"
+                start = min_page
+                end = min(int(part[1:]), max_page)
+            elif part.endswith("-"):  # e.g., "4-"
+                start = int(part[:-1])
+                if max_page is None:
+                    raise ValueError(
+                        "max_page must be defined for open-ended ranges like '4-'"
+                    )
+                end = max_page
+            else:  # e.g., "1-5"
+                start, end = map(int, part.split("-"))
+                end = min(end, max_page)
+            # An inverted range (start > end) contributes no pages
+            total += max(0, end - start + 1)
+        return total
diff --git a/src/unstract/sdk/x2txt.py b/src/unstract/sdk/x2txt.py
@@ -1,30 +1,37 @@
 from abc import ABCMeta
 from typing import Any, Optional
 
+import pdfplumber
 from typing_extensions import deprecated
 
 from unstract.sdk.adapter import ToolAdapter
 from unstract.sdk.adapters.constants import Common
 from unstract.sdk.adapters.x2text import adapters
 from unstract.sdk.adapters.x2text.constants import X2TextConstants
 from unstract.sdk.adapters.x2text.dto import TextExtractionResult
+from unstract.sdk.adapters.x2text.llm_whisperer.src import LLMWhisperer
+from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import WhispererConfig
 from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
-from unstract.sdk.constants import LogLevel
+from unstract.sdk.audit import Audit
+from unstract.sdk.constants import LogLevel, MimeType, ToolEnv
 from unstract.sdk.exceptions import X2TextError
 from unstract.sdk.helper import SdkHelper
 from unstract.sdk.tool.base import BaseTool
+from unstract.sdk.utils import ToolUtils
 
 
 class X2Text(metaclass=ABCMeta):
     def __init__(
         self,
         tool: BaseTool,
-        adapter_instance_id: Optional[str] = None
+        adapter_instance_id: Optional[str] = None,
+        usage_kwargs: Optional[dict[Any, Any]] = None,
     ):
+        """Stores the tool, adapter instance id and usage kwargs, then
+        initialises the adapter.
+
+        Args:
+            tool (BaseTool): Tool instance used by this class
+            adapter_instance_id (Optional[str]): Id of the x2text adapter
+                instance to use
+            usage_kwargs (Optional[dict[Any, Any]]): Extra details passed on
+                when page usage is pushed. Defaults to an empty dict.
+        """
         self._tool = tool
         self._x2text_adapters = adapters
         self._adapter_instance_id = adapter_instance_id
         self._x2text_instance: X2TextAdapter = None
+        # A None default avoids one dict object being shared by all instances
+        self._usage_kwargs = {} if usage_kwargs is None else usage_kwargs
         self._initialise()
 
     def _initialise(self):
@@ -82,13 +89,57 @@ def process(
         output_file_path: Optional[str] = None,
         **kwargs: dict[Any, Any],
     ) -> TextExtractionResult:
-        return self._x2text_instance.process(
+        mime_type = ToolUtils.get_file_mime_type(input_file_path)
+        text_extraction_result: TextExtractionResult = None
+        if mime_type == MimeType.TEXT:
+            with open(input_file_path, encoding="utf-8") as file:
+                extracted_text = file.read()
+                text_extraction_result = TextExtractionResult(
+                    extracted_text=extracted_text, extraction_metadata=None
+                )
+        text_extraction_result = self._x2text_instance.process(
             input_file_path, output_file_path, **kwargs
         )
+        # The will be executed each and every time text extraction takes place
+        self.push_usage_details(input_file_path, mime_type)
+        return text_extraction_result
 
     @deprecated("Instantiate X2Text and call process() instead")
     def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter:
         """Returns the x2text adapter instance, initialising it if unset.
 
         Args:
             adapter_instance_id (str): Adapter instance id to initialise
                 with; only used when no adapter instance is set yet
 
         Returns:
             X2TextAdapter: The x2text adapter instance held by this object
         """
         if self._x2text_instance:
             return self._x2text_instance
         self._adapter_instance_id = adapter_instance_id
         self._initialise()
         return self._x2text_instance
+
+    def push_usage_details(self, input_file_path: str, mime_type: str) -> None:
+        """Works out the page count for a file and pushes its usage details.
+
+        For PDFs the page count is read with pdfplumber; when the adapter is
+        LLM Whisperer it is then narrowed by the adapter's configured
+        pages-to-extract setting. Every other MIME type is reported as a
+        single page.
+
+        Args:
+            input_file_path (str): Path of the file that was processed
+            mime_type (str): MIME type of the file
+
+        Returns:
+            None
+        """
+        file_size = ToolUtils.get_file_size(input_file_path)
+
+        # We are allowing certain image types and raw texts. We will consider
+        # them as single page documents as there is no concept of page numbers.
+        page_count = 1
+        if mime_type == MimeType.PDF:
+            with pdfplumber.open(input_file_path) as pdf:
+                # calculate the number of pages
+                page_count = len(pdf.pages)
+            if isinstance(self._x2text_instance, LLMWhisperer):
+                # Only the pages selected in the adapter config are counted
+                page_count = ToolUtils.calculate_page_count(
+                    self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
+                    page_count,
+                )
+
+        Audit().push_page_usage_data(
+            platform_api_key=self._tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
+            file_size=file_size,
+            file_type=mime_type,
+            page_count=page_count,
+            kwargs=self._usage_kwargs,
+        )