Skip to content

Commit

Permalink
Feat/usage reporting page (#80)
Browse files Browse the repository at this point in the history
* Pushing page usage

* Pushing page usage

* Updating pdfplumber dependency

* Adding run id

* Bumped up the version

* Bumped up the patch version

* Handled page calculation for LLM Whisperer

* Moving usage push post processing

* Added condition for empty page string

* Expecting file name to be passed via usage kwargs

* updated the version to 0.42.1

* updated the version to 0.44.0

---------

Signed-off-by: Rahul Johny <[email protected]>
  • Loading branch information
johnyrahul authored Aug 12, 2024
1 parent 28434ca commit f5fd84f
Show file tree
Hide file tree
Showing 8 changed files with 256 additions and 32 deletions.
55 changes: 54 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ dependencies = [
# For singleton classes
"singleton-decorator~=1.0.0",
"httpx>=0.25.2",
"pdfplumber>=0.11.2",
]
readme = "README.md"
urls = { Homepage = "https://unstract.com", "Release notes" = "https://github.com/Zipstack/unstract-sdk/releases", Source = "https://github.com/Zipstack/unstract-sdk" }
Expand Down
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.43.0"
__version__ = "0.44.0"


def get_sdk_version():
Expand Down
46 changes: 46 additions & 0 deletions src/unstract/sdk/audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,49 @@ def push_usage_data(
finally:
if isinstance(token_counter, TokenCountingHandler):
token_counter.reset_counts()

def push_page_usage_data(
    self,
    platform_api_key: str,
    page_count: int,
    file_size: int,
    file_type: str,
    kwargs: dict[Any, Any] = None,
) -> None:
    """Push page usage details for a processed file to the platform.

    Sends a POST request to the ``/page-usage`` endpoint of the platform
    service, whose host and port are read from the environment via
    ``get_env_or_die``. A non-200 response or a ``requests`` error is
    reported through ``stream_log`` at ERROR level; it is not re-raised.

    Args:
        platform_api_key (str): Key sent as the bearer token.
        page_count (int): Number of pages to report.
        file_size (int): Size of the file to report.
        file_type (str): Type of the file to report (callers in this
            change pass its MIME type).
        kwargs (dict[Any, Any]): Optional usage details. The keys
            ``run_id`` and ``file_name`` are read; both default to "".
            ``None`` (the default) is treated as an empty dict.

    Returns:
        None
    """
    # `kwargs` defaults to None, so guard before calling `.get()` on it.
    usage_kwargs = kwargs or {}

    platform_host = self.get_env_or_die(ToolEnv.PLATFORM_HOST)
    platform_port = self.get_env_or_die(ToolEnv.PLATFORM_PORT)
    run_id = usage_kwargs.get("run_id", "")
    file_name = usage_kwargs.get("file_name", "")
    base_url = SdkHelper.get_platform_base_url(
        platform_host=platform_host, platform_port=platform_port
    )
    bearer_token = platform_api_key
    url = f"{base_url}/page-usage"
    headers = {"Authorization": f"Bearer {bearer_token}"}

    data = {
        "page_count": page_count,
        "file_name": file_name,
        "file_size": file_size,
        "file_type": file_type,
        "run_id": run_id,
    }

    try:
        response = requests.post(url, headers=headers, json=data, timeout=30)
        if response.status_code != 200:
            # NOTE: no trailing comma inside the parentheses; a trailing
            # comma would turn the message into a one-element tuple.
            self.stream_log(
                log=(
                    "Error while pushing page usage details: "
                    f"{response.status_code} {response.reason}"
                ),
                level=LogLevel.ERROR,
            )
        else:
            self.stream_log("Successfully pushed page usage details")

    except requests.RequestException as e:
        self.stream_log(
            log=f"Error while pushing page usage details: {e}",
            level=LogLevel.ERROR,
        )
5 changes: 5 additions & 0 deletions src/unstract/sdk/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,8 @@ class PublicAdapterKeys:
PUBLIC_EMBEDDING_CONFIG = "PUBLIC_EMBEDDING_CONFIG"
PUBLIC_VECTOR_DB_CONFIG = "PUBLIC_VECTOR_DB_CONFIG"
PUBLIC_X2TEXT_CONFIG = "PUBLIC_X2TEXT_CONFIG"


class MimeType:
    """MIME type strings compared against a file's detected MIME type."""

    # Portable Document Format files
    PDF = "application/pdf"
    # Plain text files
    TEXT = "text/plain"
51 changes: 24 additions & 27 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,35 +225,32 @@ def index(
full_text = []
extracted_text = ""
try:
mime_type = ToolUtils.get_file_mime_type(file_path)
if mime_type == "text/plain":
with open(file_path, encoding="utf-8") as file:
extracted_text = file.read()
x2text = X2Text(
tool=self.tool,
adapter_instance_id=x2text_instance_id,
usage_kwargs=usage_kwargs,
)
if enable_highlight and isinstance(
x2text._x2text_instance, LLMWhisperer
):
process_response: TextExtractionResult = x2text.process(
input_file_path=file_path,
output_file_path=output_file_path,
enable_highlight=enable_highlight,
)
whisper_hash_value = (
process_response.extraction_metadata.whisper_hash
)

metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}

self.tool.update_exec_metadata(metadata)

else:
x2text = X2Text(
tool=self.tool, adapter_instance_id=x2text_instance_id
process_response: TextExtractionResult = x2text.process(
input_file_path=file_path,
output_file_path=output_file_path,
)
if enable_highlight and isinstance(
x2text._x2text_instance, LLMWhisperer
):
process_response: TextExtractionResult = x2text.process(
input_file_path=file_path,
output_file_path=output_file_path,
enable_highlight=enable_highlight,
)
whisper_hash_value = (
process_response.extraction_metadata.whisper_hash
)

metadata = {X2TextConstants.WHISPER_HASH: whisper_hash_value}

self.tool.update_exec_metadata(metadata)

else:
process_response: TextExtractionResult = x2text.process(
input_file_path=file_path,
output_file_path=output_file_path,
)

extracted_text = process_response.extracted_text
except AdapterError as e:
Expand Down
71 changes: 71 additions & 0 deletions src/unstract/sdk/utils/tool_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,23 @@ def get_file_mime_type(input_file: Path) -> str:
input_file_obj.seek(0)
return input_file_mime

@staticmethod
def get_file_size(input_file: Path) -> int:
    """Gets the file size in bytes for an input file.

    Args:
        input_file (Path): Path object of the input file

    Returns:
        int: Size of the file in bytes
    """
    # Local import keeps this helper self-contained; asking the file
    # system for the size avoids opening the file and seeking to its end.
    import os

    return os.path.getsize(input_file)

@staticmethod
def str_to_bool(string: str) -> bool:
"""String value of boolean to boolean.
Expand All @@ -114,3 +131,57 @@ def str_to_bool(string: str) -> bool:
bool
"""
return string.lower() == "true"

# Used the same function from LLM Whisperer
@staticmethod
def calculate_page_count(
    pages_string: str, max_page: int = 0, min_page: int = 1
) -> int:
    """Calculates the total number of pages based on the input string of
    page numbers or ranges.

    Parses the input 'pages_string' to extract individual page numbers or
    ranges separated by commas. Supports closed ranges like '1-5' and
    open-ended ranges like '4-' or '-5'.

    Counting rules:
        * A single page number counts as one page; it is not checked
          against 'max_page' or 'min_page'.
        * '-N' counts pages from 'min_page' to min(N, max_page).
        * 'N-' counts pages from max(N, 0) to 'max_page'.
        * 'A-B' counts pages from max(A, 0) to min(B, max_page).
        * Pages listed more than once (e.g. overlapping ranges) are
          counted each time they appear.
        * Empty segments (e.g. the trailing comma in '1,2,') are ignored.

    Args:
        pages_string (str): String containing page numbers or ranges
            separated by commas
        max_page (int): Upper limit for page numbers (default is 0)
        min_page (int): Lower limit used for ranges like '-5'
            (default is 1)

    Returns:
        int: Total count of pages described by the input string, or
            'max_page' when the string is empty or only whitespace

    Raises:
        ValueError: If a segment is not a valid number or range, or if
            'max_page' is None for an open-ended range like '4-'
    """
    # An empty or blank selection means "all pages".
    if not pages_string or not pages_string.strip():
        return max_page

    total = 0
    for part in pages_string.split(","):
        part = part.strip()
        if not part:
            # Tolerate stray commas instead of failing on int("").
            continue

        if "-" not in part:
            int(part)  # Validates the page number; raises ValueError if bad
            total += 1
        elif part.startswith("-"):  # e.g., "-5"
            end = min(int(part[1:]), max_page)
            total += len(range(min_page, end + 1))
        elif part.endswith("-"):  # e.g., "4-"
            start = max(int(part[:-1]), 0)
            if max_page is None:
                raise ValueError(
                    "max_page must be defined for open-ended ranges like '4-'"
                )
            total += len(range(start, max_page + 1))
        else:  # e.g., "1-5"
            start, end = map(int, part.split("-"))
            start = max(start, 0)
            end = min(end, max_page)
            total += len(range(start, end + 1))
    return total
57 changes: 54 additions & 3 deletions src/unstract/sdk/x2txt.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,37 @@
from abc import ABCMeta
from typing import Any, Optional

import pdfplumber
from typing_extensions import deprecated

from unstract.sdk.adapter import ToolAdapter
from unstract.sdk.adapters.constants import Common
from unstract.sdk.adapters.x2text import adapters
from unstract.sdk.adapters.x2text.constants import X2TextConstants
from unstract.sdk.adapters.x2text.dto import TextExtractionResult
from unstract.sdk.adapters.x2text.llm_whisperer.src import LLMWhisperer
from unstract.sdk.adapters.x2text.llm_whisperer.src.constants import WhispererConfig
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
from unstract.sdk.constants import LogLevel
from unstract.sdk.audit import Audit
from unstract.sdk.constants import LogLevel, MimeType, ToolEnv
from unstract.sdk.exceptions import X2TextError
from unstract.sdk.helper import SdkHelper
from unstract.sdk.tool.base import BaseTool
from unstract.sdk.utils import ToolUtils


class X2Text(metaclass=ABCMeta):
def __init__(
    self,
    tool: BaseTool,
    adapter_instance_id: Optional[str] = None,
    usage_kwargs: Optional[dict[Any, Any]] = None,
):
    """Set up the X2Text wrapper and initialise its adapter.

    Args:
        tool (BaseTool): Tool instance stored on the wrapper.
        adapter_instance_id (Optional[str]): ID of the x2text adapter
            instance to use.
        usage_kwargs (Optional[dict[Any, Any]]): Extra details forwarded
            when page usage is reported. Defaults to an empty dict.
    """
    self._tool = tool
    self._x2text_adapters = adapters
    self._adapter_instance_id = adapter_instance_id
    self._x2text_instance: X2TextAdapter = None
    # Create a fresh dict per instance; a `{}` default in the signature
    # would be shared by every X2Text created without usage_kwargs.
    self._usage_kwargs = {} if usage_kwargs is None else usage_kwargs
    self._initialise()

def _initialise(self):
Expand Down Expand Up @@ -82,13 +89,57 @@ def process(
output_file_path: Optional[str] = None,
**kwargs: dict[Any, Any],
) -> TextExtractionResult:
return self._x2text_instance.process(
mime_type = ToolUtils.get_file_mime_type(input_file_path)
text_extraction_result: TextExtractionResult = None
if mime_type == MimeType.TEXT:
with open(input_file_path, encoding="utf-8") as file:
extracted_text = file.read()
text_extraction_result = TextExtractionResult(
extracted_text=extracted_text, extraction_metadata=None
)
text_extraction_result = self._x2text_instance.process(
input_file_path, output_file_path, **kwargs
)
# This will be executed each and every time text extraction takes place
self.push_usage_details(input_file_path, mime_type)
return text_extraction_result

@deprecated("Instantiate X2Text and call process() instead")
def get_x2text(self, adapter_instance_id: str) -> X2TextAdapter:
    """Return the x2text adapter instance held by this object.

    When an instance is already set it is returned unchanged and
    `adapter_instance_id` is ignored. Otherwise the given ID is stored,
    `_initialise()` is run, and the resulting instance is returned.

    Args:
        adapter_instance_id (str): ID to initialise the adapter with when
            no instance is set yet.

    Returns:
        X2TextAdapter: The current value of `_x2text_instance`.
    """
    if self._x2text_instance:
        return self._x2text_instance

    self._adapter_instance_id = adapter_instance_id
    self._initialise()
    return self._x2text_instance

def push_usage_details(self, input_file_path: str, mime_type: str) -> None:
    """Work out the page count of a file and push its page usage.

    PDFs are opened with pdfplumber to count their pages. When the adapter
    is an LLMWhisperer, that count is passed as the upper limit to
    `ToolUtils.calculate_page_count` together with the adapter's
    `PAGES_TO_EXTRACT` setting. Every other file type is reported as a
    single page.

    Args:
        input_file_path (str): Path of the file that was processed.
        mime_type (str): MIME type of the file; reported as the file type.

    Returns:
        None
    """
    file_size = ToolUtils.get_file_size(input_file_path)

    # We are allowing certain image types and raw texts. We will consider
    # them as single page documents as there is no concept of page numbers.
    page_count = 1

    if mime_type == MimeType.PDF:
        with pdfplumber.open(input_file_path) as pdf:
            # calculate the number of pages
            page_count = len(pdf.pages)
        if isinstance(self._x2text_instance, LLMWhisperer):
            page_count = ToolUtils.calculate_page_count(
                self._x2text_instance.config.get(WhispererConfig.PAGES_TO_EXTRACT),
                page_count,
            )

    Audit().push_page_usage_data(
        platform_api_key=self._tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY),
        file_size=file_size,
        file_type=mime_type,
        page_count=page_count,
        kwargs=self._usage_kwargs,
    )

0 comments on commit f5fd84f

Please sign in to comment.