From eeabf48d29b1778643a30fdc8b3ee0a6d2fa81dc Mon Sep 17 00:00:00 2001 From: Pierre-Loic Doulcet Date: Tue, 12 Nov 2024 19:56:58 +0100 Subject: [PATCH] add input url and http_proxy (#475) --- llama_parse/base.py | 45 +++++++++++++++++++++++++------ pyproject.toml | 2 +- tests/test_reader.py | 63 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 94 insertions(+), 16 deletions(-) diff --git a/llama_parse/base.py b/llama_parse/base.py index 4f7fa59..451f50f 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -1,6 +1,6 @@ import os import asyncio -from io import TextIOWrapper +from urllib.parse import urlparse import httpx import mimetypes @@ -11,7 +11,6 @@ from io import BufferedIOBase from fsspec import AbstractFileSystem -from fsspec.spec import AbstractBufferedFile from llama_index.core.async_utils import asyncio_run, run_jobs from llama_index.core.bridge.pydantic import Field, field_validator from llama_index.core.constants import DEFAULT_BASE_URL @@ -190,6 +189,10 @@ class LlamaParse(BasePydanticReader): azure_openai_key: Optional[str] = Field( default=None, description="Azure Openai Key" ) + http_proxy: Optional[str] = Field( + default=None, + description="(optional) If set with input_url will use the specified http proxy to download the file.", + ) @field_validator("api_key", mode="before", check_fields=True) @classmethod @@ -221,6 +224,28 @@ async def client_context(self) -> AsyncGenerator[httpx.AsyncClient, None]: async with httpx.AsyncClient(timeout=self.max_timeout) as client: yield client + def _is_input_url(self, file_path: FileInput) -> bool: + """Check if the input is a valid URL. + + This method checks for: + - Proper URL scheme (http/https) + - Valid URL structure + - Network location (domain) + """ + if not isinstance(file_path, str): + return False + try: + result = urlparse(file_path) + return all( + [ + result.scheme in ("http", "https"), + result.netloc, # Has domain + result.scheme, # Has scheme + ] + ) + except Exception: + return False + # upload a document and get back a job_id async def _create_job( self, @@ -232,6 +257,7 @@ async def _create_job( url = f"{self.base_url}/api/parsing/upload" files = None file_handle = None + input_url = file_input if self._is_input_url(file_input) else None if isinstance(file_input, (bytes, BufferedIOBase)): if not extra_info or "file_name" not in extra_info: @@ -241,6 +267,8 @@ async def _create_job( file_name = extra_info["file_name"] mime_type = mimetypes.guess_type(file_name)[0] files = {"file": (file_name, file_input, mime_type)} + elif input_url is not None: + files = None elif isinstance(file_input, (str, Path, PurePosixPath, PurePath)): file_path = str(file_input) file_ext = os.path.splitext(file_path)[1].lower() @@ -316,6 +344,13 @@ async def _create_job( if self.azure_openai_key is not None: data["azure_openai_key"] = self.azure_openai_key + if input_url is not None: + files = None + data["input_url"] = str(input_url) + + if self.http_proxy is not None: + data["http_proxy"] = self.http_proxy + try: async with self.client_context() as client: response = await client.post( @@ -332,12 +367,6 @@ async def _create_job( if file_handle is not None: file_handle.close() - @staticmethod - def __get_filename(f: Union[TextIOWrapper, AbstractBufferedFile]) -> str: - if isinstance(f, TextIOWrapper): - return f.name - return f.full_name - async def _get_job_result( self, job_id: str, result_type: str, verbose: bool = False ) -> Dict[str, Any]: diff --git a/pyproject.toml b/pyproject.toml index 323af8f..012937e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "llama-parse" -version = "0.5.13" +version = "0.5.14" description = "Parse files into RAG-Optimized formats." authors = ["Logan Markewich "] license = "MIT" diff --git a/tests/test_reader.py b/tests/test_reader.py index 70da8aa..2c8e9ba 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -76,13 +76,14 @@ def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_with_custom_fs() -> None: +@pytest.mark.asyncio +async def test_simple_page_with_custom_fs() -> None: parser = LlamaParse(result_type="markdown") fs = LocalFileSystem() filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath, fs=fs) + result = await parser.aload_data(filepath, fs=fs) assert len(result) == 1 @@ -90,13 +91,14 @@ def test_simple_page_with_custom_fs() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_progress_workers() -> None: +@pytest.mark.asyncio +async def test_simple_page_progress_workers() -> None: parser = LlamaParse(result_type="markdown", show_progress=True, verbose=True) filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -107,7 +109,7 @@ def test_simple_page_progress_workers() -> None: filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -116,12 +118,59 @@ def test_simple_page_progress_workers() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_custom_client() -> None: +@pytest.mark.asyncio +async def test_custom_client() -> None: custom_client = AsyncClient(verify=False, timeout=10) parser = LlamaParse(result_type="markdown", custom_client=custom_client) filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath) + result = await parser.aload_data(filepath) assert len(result) == 1 assert len(result[0].text) > 0 + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url() -> None: + parser = LlamaParse(result_type="markdown") + + # links to a resume example + input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "your name" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url_with_website_input() -> None: + parser = LlamaParse(result_type="markdown") + input_url = "https://www.google.com" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "google" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_mixing_input_types() -> None: + parser = LlamaParse(result_type="markdown") + filepath = os.path.join( + os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" + ) + input_url = "https://www.google.com" + result = await parser.aload_data([filepath, input_url]) + + assert len(result) == 2 + assert "table 2" in result[0].text.lower() + assert "google" in result[1].text.lower()