Skip to content

Commit

Permalink
add xlsx support (#472)
Browse files Browse the repository at this point in the history
  • Loading branch information
hexapode authored Nov 1, 2024
1 parent 3ab2ce2 commit 89348aa
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 5 deletions.
68 changes: 64 additions & 4 deletions llama_parse/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile
from llama_index.core.async_utils import run_jobs
from llama_index.core.async_utils import asyncio_run, run_jobs
from llama_index.core.bridge.pydantic import Field, field_validator
from llama_index.core.constants import DEFAULT_BASE_URL
from llama_index.core.readers.base import BasePydanticReader
Expand Down Expand Up @@ -123,6 +123,10 @@ class LlamaParse(BasePydanticReader):
default=None,
description="The API key for the GPT-4o API. Lowers the cost of parsing.",
)
guess_xlsx_sheet_names: Optional[bool] = Field(
default=False,
description="Whether to guess the sheet names of the xlsx file.",
)
bounding_box: Optional[str] = Field(
default=None,
description="The bounding box to use to extract text from documents describe as a string containing the bounding box margins",
Expand Down Expand Up @@ -273,8 +277,10 @@ async def _create_job(
"vendor_multimodal_model_name": self.vendor_multimodal_model_name,
"take_screenshot": self.take_screenshot,
"disable_ocr": self.disable_ocr,
"guess_xlsx_sheet_names": self.guess_xlsx_sheet_names,
"is_formatting_instruction": self.is_formatting_instruction,
"annotate_links": self.annotate_links,
"from_python_package": True,
}

# only send page separator to server if it is not None
Expand Down Expand Up @@ -467,7 +473,7 @@ def load_data(
) -> List[Document]:
"""Load data from the input path."""
try:
return asyncio.run(self.aload_data(file_path, extra_info, fs=fs))
return asyncio_run(self.aload_data(file_path, extra_info, fs=fs))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
Expand Down Expand Up @@ -534,7 +540,7 @@ def get_json_result(
) -> List[dict]:
"""Parse the input path."""
try:
return asyncio.run(self.aget_json(file_path, extra_info))
return asyncio_run(self.aget_json(file_path, extra_info))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
Expand Down Expand Up @@ -597,7 +603,61 @@ async def aget_images(
def get_images(self, json_result: List[dict], download_path: str) -> List[dict]:
"""Download images from the parsed result."""
try:
return asyncio.run(self.aget_images(json_result, download_path))
return asyncio_run(self.aget_images(json_result, download_path))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
else:
raise e

async def aget_xlsx(
self, json_result: List[dict], download_path: str
) -> List[dict]:
"""Download images from the parsed result."""
headers = {"Authorization": f"Bearer {self.api_key}"}

# make the download path
if not os.path.exists(download_path):
os.makedirs(download_path)
try:
xlsx_list = []
for result in json_result:
job_id = result["job_id"]
if self.verbose:
print("> XLSX")

xlsx_path = os.path.join(download_path, f"{job_id}.xlsx")

xlsx = {}

xlsx["path"] = xlsx_path
xlsx["job_id"] = job_id
xlsx["original_file_path"] = result.get("file_path", None)

with open(xlsx_path, "wb") as f:
xlsx_url = (
f"{self.base_url}/api/parsing/job/{job_id}/result/raw/xlsx"
)
async with self.client_context() as client:
res = await client.get(
xlsx_url, headers=headers, timeout=self.max_timeout
)
res.raise_for_status()
f.write(res.content)
xlsx_list.append(xlsx)
return xlsx_list

except Exception as e:
print("Error while downloading xlsx:", e)
if self.ignore_errors:
return []
else:
raise e

def get_xlsx(self, json_result: List[dict], download_path: str) -> List[dict]:
"""Download xlsx from the parsed result."""
try:
return asyncio_run(self.aget_xlsx(json_result, download_path))
except RuntimeError as e:
if nest_asyncio_err in str(e):
raise RuntimeError(nest_asyncio_msg)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "llama-parse"
version = "0.5.12"
version = "0.5.13"
description = "Parse files into RAG-Optimized formats."
authors = ["Logan Markewich <[email protected]>"]
license = "MIT"
Expand Down

0 comments on commit 89348aa

Please sign in to comment.