diff --git a/pdm.lock b/pdm.lock index 3e16eaba..29bd37e8 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "docs", "lint", "test"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.1" -content_hash = "sha256:78f89c18fa5f446783efe32514a77ab4a8e9c778128fd89330da9972d62f2b0d" +content_hash = "sha256:4d41c155852fe8a3377ada09498839c9c20105dfb4adfd43ae12aa0d934a5eba" [[package]] name = "aiohappyeyeballs" @@ -2439,6 +2439,37 @@ files = [ {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, ] +[[package]] +name = "pdfminer-six" +version = "20231228" +requires_python = ">=3.6" +summary = "PDF parser and analyzer" +groups = ["default"] +dependencies = [ + "charset-normalizer>=2.0.0", + "cryptography>=36.0.0", +] +files = [ + {file = "pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f"}, + {file = "pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4"}, +] + +[[package]] +name = "pdfplumber" +version = "0.11.3" +requires_python = ">=3.8" +summary = "Plumb a PDF for detailed information about each char, rectangle, and line." 
+groups = ["default"] +dependencies = [ + "Pillow>=9.1", + "pdfminer-six==20231228", + "pypdfium2>=4.18.0", +] +files = [ + {file = "pdfplumber-0.11.3-py3-none-any.whl", hash = "sha256:4f3e13795d18b2e53dfc4cd667a3bc2478cd6975fc9a188881376265d599c5a6"}, + {file = "pdfplumber-0.11.3.tar.gz", hash = "sha256:43a3cac33d2135ce00ac59ad5bc3813a33afe0f513d9284c0e8cb6e447ed6e53"}, +] + [[package]] name = "pgvector" version = "0.2.5" @@ -2893,6 +2924,28 @@ files = [ {file = "pypdf-4.3.1.tar.gz", hash = "sha256:b2f37fe9a3030aa97ca86067a56ba3f9d3565f9a791b305c7355d8392c30d91b"}, ] +[[package]] +name = "pypdfium2" +version = "4.30.0" +requires_python = ">=3.6" +summary = "Python bindings to PDFium" +groups = ["default"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = 
def push_page_usage_data(
    self,
    platform_api_key: str,
    page_count: int,
    file_size: int,
    file_type: str,
    kwargs: dict[Any, Any] = None,
) -> None:
    """Report page usage for a processed file to the platform service.

    Sends a POST request to the ``/page-usage`` endpoint of the platform base
    URL. A non-200 response or a ``requests.RequestException`` is logged via
    ``stream_log`` at ERROR level instead of being raised.

    Args:
        platform_api_key (str): API key sent as the bearer token.
        page_count (int): Number of pages to report.
        file_size (int): Size of the file to report.
        file_type (str): Type of the file to report (callers in this change
            pass a MIME type).
        kwargs (dict[Any, Any], optional): Extra usage details. The keys
            ``run_id`` and ``file_name`` are read; each defaults to an empty
            string. ``None`` is treated as an empty dict.
    """
    platform_host = self.get_env_or_die(ToolEnv.PLATFORM_HOST)
    platform_port = self.get_env_or_die(ToolEnv.PLATFORM_PORT)

    # ``kwargs`` defaults to None; fall back to an empty dict so the lookups
    # below cannot fail with AttributeError when the argument is omitted.
    usage_kwargs = kwargs or {}
    run_id = usage_kwargs.get("run_id", "")
    file_name = usage_kwargs.get("file_name", "")

    base_url = SdkHelper.get_platform_base_url(
        platform_host=platform_host, platform_port=platform_port
    )
    url = f"{base_url}/page-usage"
    headers = {"Authorization": f"Bearer {platform_api_key}"}
    data = {
        "page_count": page_count,
        "file_name": file_name,
        "file_size": file_size,
        "file_type": file_type,
        "run_id": run_id,
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
    except requests.RequestException as e:
        self.stream_log(
            log=f"Error while pushing page usage details: {e}",
            level=LogLevel.ERROR,
        )
        return

    if response.status_code != 200:
        # The message must be a plain string: a trailing comma inside the
        # parentheses would turn it into a 1-tuple.
        self.stream_log(
            log=(
                "Error while pushing page usage details: "
                f"{response.status_code} {response.reason}"
            ),
            level=LogLevel.ERROR,
        )
    else:
        self.stream_log("Successfully pushed page usage details")
LLMWhisperer + ): + process_response: TextExtractionResult = x2text.process( + input_file_path=file_path, + output_file_path=output_file_path, + enable_highlight=enable_highlight, + ) + whisper_hash_value = ( + process_response.extraction_metadata.whisper_hash + ) + + metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value} + + self.tool.update_exec_metadata(metadata) + else: - x2text = X2Text( - tool=self.tool, adapter_instance_id=x2text_instance_id + process_response: TextExtractionResult = x2text.process( + input_file_path=file_path, + output_file_path=output_file_path, ) - if enable_highlight and isinstance( - x2text._x2text_instance, LLMWhisperer - ): - process_response: TextExtractionResult = x2text.process( - input_file_path=file_path, - output_file_path=output_file_path, - enable_highlight=enable_highlight, - ) - whisper_hash_value = ( - process_response.extraction_metadata.whisper_hash - ) - - metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value} - - self.tool.update_exec_metadata(metadata) - - else: - process_response: TextExtractionResult = x2text.process( - input_file_path=file_path, - output_file_path=output_file_path, - ) extracted_text = process_response.extracted_text except AdapterError as e: diff --git a/src/unstract/sdk/utils/tool_utils.py b/src/unstract/sdk/utils/tool_utils.py index 7981d2ee..5e72d578 100644 --- a/src/unstract/sdk/utils/tool_utils.py +++ b/src/unstract/sdk/utils/tool_utils.py @@ -101,6 +101,23 @@ def get_file_mime_type(input_file: Path) -> str: input_file_obj.seek(0) return input_file_mime + @staticmethod + def get_file_size(input_file: Path) -> int: + """Gets the file size in bytes for an input file. 
+ Args: + input_file (Path): Path object of the input file + + Returns: + str: MIME type of the file + """ + with open(input_file, mode="rb") as input_file_obj: + input_file_obj.seek(0, 2) # Move the cursor to the end of the file + file_length = ( + input_file_obj.tell() + ) # Get the current position of the cursor, which is the file length + input_file_obj.seek(0) + return file_length + @staticmethod def str_to_bool(string: str) -> bool: """String value of boolean to boolean. @@ -114,3 +131,57 @@ def str_to_bool(string: str) -> bool: bool """ return string.lower() == "true" + + # Used the same function from LLM Whisperer + @staticmethod + def calculate_page_count( + pages_string: str, max_page: int = 0, min_page: int = 1 + ) -> int: + """Calculates the total number of pages based on the input string of + page numbers or ranges. + + Parses the input 'pages_string' to extract individual page numbers or + ranges separated by commas. + Supports ranges like '1-5' or open-ended ranges like '4-'. + The 'max_page' parameter defines the upper limit for page numbers. + The 'min_page' parameter defines the lower limit for page numbers. 
+ + Args: + pages_string (str): String containing page numbers or ranges + separated by commas + max_page (int): Upper limit for page numbers (default is 0) + min_page (int): Lower limit for page numbers (default is 1) + + Returns: + int: Total count of individual pages extracted from the input string + """ + if not pages_string: + return max_page + pages_list: list[int] = [] + parts = pages_string.split(",") + for part in parts: + part = part.strip() + if "-" in part: + if part.startswith("-"): # e.g., "-5" + end = int(part[1:]) + end = min(end, max_page) + pages_list.extend(range(min_page, end + 1)) + elif part.endswith("-"): # e.g., "4-" + start = int(part[:-1]) + if start < 0: + start = 0 + if max_page is None: + raise ValueError( + "max_page must be defined for open-ended ranges like '4-'" + ) + pages_list.extend(range(start, max_page + 1)) + else: # e.g., "1-5" + start, end = map(int, part.split("-")) + if start < 0: + start = 0 + if end > max_page: + end = max_page + pages_list.extend(range(start, end + 1)) + else: + pages_list.append(int(part)) + return len(pages_list) diff --git a/src/unstract/sdk/x2txt.py b/src/unstract/sdk/x2txt.py index 00fcb625..25645079 100644 --- a/src/unstract/sdk/x2txt.py +++ b/src/unstract/sdk/x2txt.py @@ -1,6 +1,7 @@ from abc import ABCMeta from typing import Any, Optional +import pdfplumber from typing_extensions import deprecated from unstract.sdk.adapter import ToolAdapter @@ -8,23 +9,29 @@ from unstract.sdk.adapters.x2text import adapters from unstract.sdk.adapters.x2text.constants import X2TextConstants from unstract.sdk.adapters.x2text.dto import TextExtractionResult +from unstract.sdk.adapters.x2text.llm_whisperer.src import LLMWhisperer +from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import WhispererConfig from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter -from unstract.sdk.constants import LogLevel +from unstract.sdk.audit import Audit +from unstract.sdk.constants import LogLevel, 
def __init__(
    self,
    tool: BaseTool,
    adapter_instance_id: Optional[str] = None,
    usage_kwargs: Optional[dict[Any, Any]] = None,
):
    """Set up the X2Text wrapper and initialise its adapter.

    Args:
        tool (BaseTool): Tool instance this wrapper works on behalf of.
        adapter_instance_id (Optional[str]): ID of the x2text adapter
            instance to use.
        usage_kwargs (Optional[dict[Any, Any]]): Extra details forwarded as
            ``kwargs`` when page usage is pushed. Defaults to an empty dict.
    """
    self._tool = tool
    self._x2text_adapters = adapters
    self._adapter_instance_id = adapter_instance_id
    self._x2text_instance: X2TextAdapter = None
    # Create a fresh dict per instance instead of sharing one mutable
    # default object across every X2Text that omits the argument.
    self._usage_kwargs = {} if usage_kwargs is None else usage_kwargs
    self._initialise()


def process(
    self,
    input_file_path: str,
    output_file_path: Optional[str] = None,
    **kwargs: dict[Any, Any],
) -> TextExtractionResult:
    """Extract text from a file and record page usage for it.

    Plain-text files are read directly; every other MIME type is handed to
    the configured x2text adapter.

    Args:
        input_file_path (str): Path of the file to extract text from.
        output_file_path (Optional[str]): Path passed through to the
            adapter's ``process`` call.
        **kwargs (dict[Any, Any]): Extra arguments passed through to the
            adapter's ``process`` call.

    Returns:
        TextExtractionResult: The extracted text. For plain-text input
            ``extraction_metadata`` is None.
    """
    mime_type = ToolUtils.get_file_mime_type(input_file_path)
    if mime_type == MimeType.TEXT:
        with open(input_file_path, encoding="utf-8") as file:
            extracted_text = file.read()
        text_extraction_result = TextExtractionResult(
            extracted_text=extracted_text, extraction_metadata=None
        )
    else:
        # Only non-text input goes through the adapter; otherwise the
        # plain-text result built above would be discarded.
        text_extraction_result = self._x2text_instance.process(
            input_file_path, output_file_path, **kwargs
        )
    # This will be executed each and every time text extraction takes place
    self.push_usage_details(input_file_path, mime_type)
    return text_extraction_result


def push_usage_details(self, input_file_path: str, mime_type: str) -> None:
    """Push the page count and size of a processed file as usage data.

    Args:
        input_file_path (str): Path of the file that was processed.
        mime_type (str): MIME type of the file.
    """
    file_size = ToolUtils.get_file_size(input_file_path)

    # Certain image types and raw text are allowed too. They are treated as
    # single page documents because they have no concept of page numbers.
    page_count = 1
    if mime_type == MimeType.PDF:
        with pdfplumber.open(input_file_path) as pdf:
            page_count = len(pdf.pages)
        if isinstance(self._x2text_instance, LLMWhisperer):
            # LLMWhisperer may be configured to extract only some pages, so
            # count the configured selection, bounded by the real page count.
            page_count = ToolUtils.calculate_page_count(
                self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
                page_count,
            )

    Audit().push_page_usage_data(
        platform_api_key=self._tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
        file_size=file_size,
        file_type=mime_type,
        page_count=page_count,
        kwargs=self._usage_kwargs,
    )