Skip to content

Commit

Permalink
Index and LLM fixes (#36)
Browse files (browse the repository at this point in the history)
* Bypass x2text for text files

* Regex fix for JSON

* Version bump
  • Loading branch information
Deepak-Kesavan authored Apr 16, 2024
1 parent f2c9bf1 commit 983eef5
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 16 deletions.
8 changes: 4 additions & 4 deletions pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ dependencies = [
"python-magic~=0.4.27",
"python-dotenv==1.0.0",
# LLM Triad
-"unstract-adapters~=0.9.0",
+"unstract-adapters~=0.10.0",
"llama-index==0.10.28",
"tiktoken~=0.4.0",
"transformers==4.37.0",
Expand Down
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-__version__ = "0.19.0"
+__version__ = "0.20.0"


def get_sdk_version():
Expand Down
18 changes: 11 additions & 7 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,13 +236,17 @@ def index_file(
full_text = []
extracted_text = ""
try:
-x2text = X2Text(tool=self.tool)
-x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
-    adapter_instance_id=x2text_adapter
-)
-extracted_text = x2text_adapter_inst.process(
-    input_file_path=file_path, output_file_path=output_file_path
-)
+if not output_file_path:
+    with open(file_path, encoding="utf-8") as file:
+        extracted_text = file.read()
+else:
+    x2text = X2Text(tool=self.tool)
+    x2text_adapter_inst: X2TextAdapter = x2text.get_x2text(
+        adapter_instance_id=x2text_adapter
+    )
+    extracted_text = x2text_adapter_inst.process(
+        input_file_path=file_path, output_file_path=output_file_path
+    )
except AdapterError as e:
# Wrapping AdapterErrors with SdkError
raise IndexingError(str(e)) from e
Expand Down
6 changes: 3 additions & 3 deletions src/unstract/sdk/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
class ToolLLM:
"""Class to handle LLMs for Unstract Tools."""

-code_block_regex = re.compile(r"```.*?\n(.*?)\n```", re.DOTALL)
+json_regex = re.compile(r"\{(?:.|\n)*\}")

def __init__(
self,
Expand Down Expand Up @@ -62,9 +62,9 @@ def run_completion(
for i in range(retries):
try:
response: CompletionResponse = llm.complete(prompt, **kwargs)
-match = cls.code_block_regex.search(response.text)
+match = cls.json_regex.search(response.text)
if match:
-response.text = match.group(1)
+response.text = match.group(0)

usage = {}
llm_token_counts = llm.callback_manager.handlers[
Expand Down

0 comments on commit 983eef5

Please sign in to comment.