From 931dae6caaa16ef8edc2b43550429c531a262edb Mon Sep 17 00:00:00 2001 From: Rahul Johny <116638720+johnyrahul@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:59:04 +0530 Subject: [PATCH] Passing file name in usage Kwargs (#571) * Passing file name in usage Kwargs * updated the SDK version to 0.44.0 * Update pdm.lock for prompt-service * Update pdm.lock for backend * Update pdm.lock for root * Added comment to avoid confusion in promptsudio helper * Updated text extractor version --------- Co-authored-by: github-actions[bot] --- backend/pdm.lock | 86 +++++++++++++--- .../prompt_studio_helper.py | 2 + backend/pyproject.toml | 2 +- pdm.lock | 92 +++++++++++++---- prompt-service/pdm.lock | 98 ++++++++++++++----- prompt-service/pyproject.toml | 2 +- pyproject.toml | 2 +- tools/classifier/requirements.txt | 2 +- tools/classifier/src/config/properties.json | 2 +- tools/structure/src/config/properties.json | 2 +- tools/structure/src/main.py | 4 + tools/text_extractor/requirements.txt | 2 +- .../text_extractor/src/config/properties.json | 2 +- unstract/tool-registry/pyproject.toml | 2 +- .../tool_registry_config/public_tools.json | 12 +-- 15 files changed, 240 insertions(+), 72 deletions(-) diff --git a/backend/pdm.lock b/backend/pdm.lock index 431f57f0b..136d9eb71 100644 --- a/backend/pdm.lock +++ b/backend/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "deploy", "dev", "test"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.2" -content_hash = "sha256:1014feed12deba4d5f163f013b79599187a7f6d54b7bd3df590e27b9265675c7" +content_hash = "sha256:00e0cde5d7b98e51171b22f703249f26104a820e4d5fd367d08296706e4600d9" [[package]] name = "adlfs" @@ -2294,8 +2294,8 @@ files = [ [[package]] name = "llama-index-legacy" -version = "0.9.48" -requires_python = ">=3.8.1,<4.0" +version = "0.9.48.post1" +requires_python = "<4.0,>=3.8.1" summary = "Interface between LLMs and your data" groups = ["default", "dev"] dependencies = [ @@ -2308,7 +2308,7 @@ dependencies = [ 
"httpx", "nest-asyncio<2.0.0,>=1.5.8", "networkx>=3.0", - "nltk<4.0.0,>=3.8.1", + "nltk>=3.8.2", "numpy", "openai>=1.1.0", "pandas", @@ -2319,8 +2319,8 @@ dependencies = [ "typing-inspect>=0.8.0", ] files = [ - {file = "llama_index_legacy-0.9.48-py3-none-any.whl", hash = "sha256:714ada95beac179b4acefa4d2deff74bb7b2f22b0f699ac247d4cb67738d16d4"}, - {file = "llama_index_legacy-0.9.48.tar.gz", hash = "sha256:82ddc4691edbf49533d65582c249ba22c03fe96fbd3e92f7758dccef28e43834"}, + {file = "llama_index_legacy-0.9.48.post1-py3-none-any.whl", hash = "sha256:583296162385010ebf92d2a612dd0a504575c04dc1638323bb455b1521aabe57"}, + {file = "llama_index_legacy-0.9.48.post1.tar.gz", hash = "sha256:e8b1603929433fd0cf3287ed700714078534dd202c97bcdbcc83ec3741bb0868"}, ] [[package]] @@ -2930,7 +2930,7 @@ files = [ [[package]] name = "openai" -version = "1.40.3" +version = "1.40.6" requires_python = ">=3.7.1" summary = "The official Python library for the openai API" groups = ["default", "dev"] @@ -2945,8 +2945,8 @@ dependencies = [ "typing-extensions<5,>=4.11", ] files = [ - {file = "openai-1.40.3-py3-none-any.whl", hash = "sha256:09396cb6e2e15c921a5d872bf92841a60a9425da10dcd962b45fe7c4f48f8395"}, - {file = "openai-1.40.3.tar.gz", hash = "sha256:f2ffe907618240938c59d7ccc67dd01dc8c50be203c0077240db6758d2f02480"}, + {file = "openai-1.40.6-py3-none-any.whl", hash = "sha256:b36372124a779381a420a34dd96f762baa748b6bdfaf83a6b9f2745f72ccc1c5"}, + {file = "openai-1.40.6.tar.gz", hash = "sha256:2239232bcb7f4bd4ce8e02544b5769618582411cf399816d96686d1b6c1e5c8d"}, ] [[package]] @@ -3035,6 +3035,37 @@ files = [ {file = "pandas-2.1.4.tar.gz", hash = "sha256:fcb68203c833cc735321512e13861358079a96c174a61f5116a1de89c58c0ef7"}, ] +[[package]] +name = "pdfminer-six" +version = "20231228" +requires_python = ">=3.6" +summary = "PDF parser and analyzer" +groups = ["default", "dev"] +dependencies = [ + "charset-normalizer>=2.0.0", + "cryptography>=36.0.0", +] +files = [ + {file = 
"pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f"}, + {file = "pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4"}, +] + +[[package]] +name = "pdfplumber" +version = "0.11.3" +requires_python = ">=3.8" +summary = "Plumb a PDF for detailed information about each char, rectangle, and line." +groups = ["default", "dev"] +dependencies = [ + "Pillow>=9.1", + "pdfminer-six==20231228", + "pypdfium2>=4.18.0", +] +files = [ + {file = "pdfplumber-0.11.3-py3-none-any.whl", hash = "sha256:4f3e13795d18b2e53dfc4cd667a3bc2478cd6975fc9a188881376265d599c5a6"}, + {file = "pdfplumber-0.11.3.tar.gz", hash = "sha256:43a3cac33d2135ce00ac59ad5bc3813a33afe0f513d9284c0e8cb6e447ed6e53"}, +] + [[package]] name = "pgvector" version = "0.2.5" @@ -3580,6 +3611,28 @@ files = [ {file = "pypdf-4.3.1.tar.gz", hash = "sha256:b2f37fe9a3030aa97ca86067a56ba3f9d3565f9a791b305c7355d8392c30d91b"}, ] +[[package]] +name = "pypdfium2" +version = "4.30.0" +requires_python = ">=3.6" +summary = "Python bindings to PDFium" +groups = ["default", "dev"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = 
"pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, + {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, + {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, + {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, + {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, +] + [[package]] name = "pytest" version = "8.3.2" @@ -3769,7 +3822,7 @@ files = [ [[package]] name = "qdrant-client" -version = "1.10.1" +version = "1.11.0" requires_python = ">=3.8" summary = "Client library for the Qdrant vector search engine" groups = ["default", "dev"] @@ -3783,8 +3836,8 @@ dependencies = [ "urllib3<3,>=1.26.14", ] files = [ - {file = "qdrant_client-1.10.1-py3-none-any.whl", hash = "sha256:b9fb8fe50dd168d92b2998be7c6135d5a229b3a3258ad158cc69c8adf9ff1810"}, - {file = "qdrant_client-1.10.1.tar.gz", hash = "sha256:2284c8c5bb1defb0d9dbacb07d16f344972f395f4f2ed062318476a7951fd84c"}, + {file = "qdrant_client-1.11.0-py3-none-any.whl", hash = "sha256:1f574ccebb91c0bc8a620c9a41a5a010084fbc4d8c6f1cd0ab7b2eeb97336fc0"}, + {file = "qdrant_client-1.11.0.tar.gz", hash = "sha256:7c1d4d7a96cfd1ee0cde2a21c607e9df86bcca795ad8d1fd274d295ab64b8458"}, ] 
[[package]] @@ -4811,7 +4864,7 @@ dependencies = [ [[package]] name = "unstract-sdk" -version = "0.42.0" +version = "0.44.0" requires_python = "<3.11.1,>=3.9" summary = "A framework for writing Unstract Tools/Apps" groups = ["default", "dev"] @@ -4842,6 +4895,7 @@ dependencies = [ "llama-index==0.10.58", "llama-parse==0.4.9", "mistralai==0.4.2", + "pdfplumber>=0.11.2", "python-dotenv==1.0.0", "python-magic~=0.4.27", "singleton-decorator~=1.0.0", @@ -4849,8 +4903,8 @@ dependencies = [ "transformers==4.37.0", ] files = [ - {file = "unstract_sdk-0.42.0-py3-none-any.whl", hash = "sha256:9a78f1144d80ab9b6f5bfe477b3a044847cb61d96267b886eaa15721e816b3b7"}, - {file = "unstract_sdk-0.42.0.tar.gz", hash = "sha256:38fa29eed9c714fb5c5e2dfef9ee4dd6b0a806f6042dda4b6f5c25b42670fe12"}, + {file = "unstract_sdk-0.44.0-py3-none-any.whl", hash = "sha256:fb72087261a855750282f2d47381ed802d954a23ef6b63cffa4a2bfe6ba6d63f"}, + {file = "unstract_sdk-0.44.0.tar.gz", hash = "sha256:d4cb9ac3a76d3d67e6d5371c848f5b41217e76a7bbc82b213d0913c60a15419b"}, ] [[package]] @@ -4865,7 +4919,7 @@ dependencies = [ "PyYAML~=6.0.1", "docker~=6.1.3", "jsonschema~=4.18.2", - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", "unstract-tool-sandbox", ] diff --git a/backend/prompt_studio/prompt_studio_core/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core/prompt_studio_helper.py index c5513b7e6..f45412118 100644 --- a/backend/prompt_studio/prompt_studio_core/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core/prompt_studio_helper.py @@ -795,6 +795,8 @@ def dynamic_indexer( try: usage_kwargs = {"run_id": run_id} + # Original file name with which the file was uploaded in Prompt Studio + usage_kwargs["file_name"] = filename util = PromptIdeBaseTool(log_level=LogLevel.INFO, org_id=org_id) tool_index = Index(tool=util) doc_id_key = tool_index.generate_file_id( diff --git a/backend/pyproject.toml b/backend/pyproject.toml index f3b1af285..531e56e4c 100644 --- a/backend/pyproject.toml +++ 
b/backend/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "python-socketio==5.9.0", # For log_events "social-auth-app-django==5.3.0", # For OAuth "social-auth-core==4.4.2", # For OAuth - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", # ! IMPORTANT! # Indirect local dependencies usually need to be added in their own projects # as: https://pdm-project.org/latest/usage/dependency/#local-dependencies. diff --git a/pdm.lock b/pdm.lock index 67c498887..8500e1931 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "hook-check-django-migrations", "lint"] strategy = ["cross_platform", "inherit_metadata"] lock_version = "4.4.2" -content_hash = "sha256:1bf7067a04ab646d5866e527eedce3d6f5e6e28e72d2804265928bc65ab89c14" +content_hash = "sha256:ef65d6805b5b45154e99dfc05d5f76202d713d2b8b7874f1b7bd05fb8c9d5b5a" [[package]] name = "adlfs" @@ -2279,8 +2279,8 @@ files = [ [[package]] name = "llama-index-legacy" -version = "0.9.48" -requires_python = ">=3.8.1,<4.0" +version = "0.9.48.post1" +requires_python = "<4.0,>=3.8.1" summary = "Interface between LLMs and your data" groups = ["hook-check-django-migrations"] dependencies = [ @@ -2293,7 +2293,7 @@ dependencies = [ "httpx", "nest-asyncio<2.0.0,>=1.5.8", "networkx>=3.0", - "nltk<4.0.0,>=3.8.1", + "nltk>=3.8.2", "numpy", "openai>=1.1.0", "pandas", @@ -2304,8 +2304,8 @@ dependencies = [ "typing-inspect>=0.8.0", ] files = [ - {file = "llama_index_legacy-0.9.48-py3-none-any.whl", hash = "sha256:714ada95beac179b4acefa4d2deff74bb7b2f22b0f699ac247d4cb67738d16d4"}, - {file = "llama_index_legacy-0.9.48.tar.gz", hash = "sha256:82ddc4691edbf49533d65582c249ba22c03fe96fbd3e92f7758dccef28e43834"}, + {file = "llama_index_legacy-0.9.48.post1-py3-none-any.whl", hash = "sha256:583296162385010ebf92d2a612dd0a504575c04dc1638323bb455b1521aabe57"}, + {file = "llama_index_legacy-0.9.48.post1.tar.gz", hash = "sha256:e8b1603929433fd0cf3287ed700714078534dd202c97bcdbcc83ec3741bb0868"}, ] [[package]] @@ -2950,7 +2950,7 @@ files = [ 
[[package]] name = "openai" -version = "1.40.3" +version = "1.40.6" requires_python = ">=3.7.1" summary = "The official Python library for the openai API" groups = ["hook-check-django-migrations"] @@ -2965,8 +2965,8 @@ dependencies = [ "typing-extensions<5,>=4.11", ] files = [ - {file = "openai-1.40.3-py3-none-any.whl", hash = "sha256:09396cb6e2e15c921a5d872bf92841a60a9425da10dcd962b45fe7c4f48f8395"}, - {file = "openai-1.40.3.tar.gz", hash = "sha256:f2ffe907618240938c59d7ccc67dd01dc8c50be203c0077240db6758d2f02480"}, + {file = "openai-1.40.6-py3-none-any.whl", hash = "sha256:b36372124a779381a420a34dd96f762baa748b6bdfaf83a6b9f2745f72ccc1c5"}, + {file = "openai-1.40.6.tar.gz", hash = "sha256:2239232bcb7f4bd4ce8e02544b5769618582411cf399816d96686d1b6c1e5c8d"}, ] [[package]] @@ -3066,6 +3066,37 @@ files = [ {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, ] +[[package]] +name = "pdfminer-six" +version = "20231228" +requires_python = ">=3.6" +summary = "PDF parser and analyzer" +groups = ["hook-check-django-migrations"] +dependencies = [ + "charset-normalizer>=2.0.0", + "cryptography>=36.0.0", +] +files = [ + {file = "pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f"}, + {file = "pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4"}, +] + +[[package]] +name = "pdfplumber" +version = "0.11.3" +requires_python = ">=3.8" +summary = "Plumb a PDF for detailed information about each char, rectangle, and line." 
+groups = ["hook-check-django-migrations"] +dependencies = [ + "Pillow>=9.1", + "pdfminer-six==20231228", + "pypdfium2>=4.18.0", +] +files = [ + {file = "pdfplumber-0.11.3-py3-none-any.whl", hash = "sha256:4f3e13795d18b2e53dfc4cd667a3bc2478cd6975fc9a188881376265d599c5a6"}, + {file = "pdfplumber-0.11.3.tar.gz", hash = "sha256:43a3cac33d2135ce00ac59ad5bc3813a33afe0f513d9284c0e8cb6e447ed6e53"}, +] + [[package]] name = "pgvector" version = "0.2.5" @@ -3629,6 +3660,28 @@ files = [ {file = "pypdf-4.3.1.tar.gz", hash = "sha256:b2f37fe9a3030aa97ca86067a56ba3f9d3565f9a791b305c7355d8392c30d91b"}, ] +[[package]] +name = "pypdfium2" +version = "4.30.0" +requires_python = ">=3.6" +summary = "Python bindings to PDFium" +groups = ["hook-check-django-migrations"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = 
"sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, + {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, + {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, + {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, + {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, +] + [[package]] name = "python-crontab" version = "3.2.0" @@ -3756,7 +3809,7 @@ files = [ [[package]] name = "qdrant-client" -version = "1.10.1" +version = "1.11.0" requires_python = ">=3.8" summary = "Client library for the Qdrant vector search engine" groups = ["hook-check-django-migrations"] @@ -3770,8 +3823,8 @@ dependencies = [ "urllib3<3,>=1.26.14", ] files = [ - {file = "qdrant_client-1.10.1-py3-none-any.whl", hash = "sha256:b9fb8fe50dd168d92b2998be7c6135d5a229b3a3258ad158cc69c8adf9ff1810"}, - {file = "qdrant_client-1.10.1.tar.gz", hash = "sha256:2284c8c5bb1defb0d9dbacb07d16f344972f395f4f2ed062318476a7951fd84c"}, + {file = "qdrant_client-1.11.0-py3-none-any.whl", hash = "sha256:1f574ccebb91c0bc8a620c9a41a5a010084fbc4d8c6f1cd0ab7b2eeb97336fc0"}, + {file = "qdrant_client-1.11.0.tar.gz", hash = "sha256:7c1d4d7a96cfd1ee0cde2a21c607e9df86bcca795ad8d1fd274d295ab64b8458"}, ] [[package]] @@ -4705,13 +4758,13 @@ files = [ [[package]] name = "types-setuptools" -version = "71.1.0.20240806" +version = "71.1.0.20240813" requires_python = ">=3.8" summary = "Typing stubs for setuptools" groups = ["lint"] files = [ - {file = "types-setuptools-71.1.0.20240806.tar.gz", hash = 
"sha256:ae5e7b4d643ab9e99fc00ac00041804118cabe72a56183c30d524fb064897ad6"}, - {file = "types_setuptools-71.1.0.20240806-py3-none-any.whl", hash = "sha256:3bd8dd02039be0bb79ad880d8893b8eefcb022fabbeeb61245c61b20c9ab1ed0"}, + {file = "types-setuptools-71.1.0.20240813.tar.gz", hash = "sha256:94ff4f0af18c7c24ac88932bcb0f5655fb7187a001b7c61e53a1bfdaf9877b54"}, + {file = "types_setuptools-71.1.0.20240813-py3-none-any.whl", hash = "sha256:d9d9ba2936f5d3b47b59ae9bf65942a60063ac1d6bbee180a8a79fbb43f22ce5"}, ] [[package]] @@ -4890,7 +4943,7 @@ dependencies = [ [[package]] name = "unstract-sdk" -version = "0.42.0" +version = "0.44.0" requires_python = "<3.11.1,>=3.9" summary = "A framework for writing Unstract Tools/Apps" groups = ["hook-check-django-migrations"] @@ -4921,6 +4974,7 @@ dependencies = [ "llama-index==0.10.58", "llama-parse==0.4.9", "mistralai==0.4.2", + "pdfplumber>=0.11.2", "python-dotenv==1.0.0", "python-magic~=0.4.27", "singleton-decorator~=1.0.0", @@ -4928,8 +4982,8 @@ dependencies = [ "transformers==4.37.0", ] files = [ - {file = "unstract_sdk-0.42.0-py3-none-any.whl", hash = "sha256:9a78f1144d80ab9b6f5bfe477b3a044847cb61d96267b886eaa15721e816b3b7"}, - {file = "unstract_sdk-0.42.0.tar.gz", hash = "sha256:38fa29eed9c714fb5c5e2dfef9ee4dd6b0a806f6042dda4b6f5c25b42670fe12"}, + {file = "unstract_sdk-0.44.0-py3-none-any.whl", hash = "sha256:fb72087261a855750282f2d47381ed802d954a23ef6b63cffa4a2bfe6ba6d63f"}, + {file = "unstract_sdk-0.44.0.tar.gz", hash = "sha256:d4cb9ac3a76d3d67e6d5371c848f5b41217e76a7bbc82b213d0913c60a15419b"}, ] [[package]] @@ -4944,7 +4998,7 @@ dependencies = [ "PyYAML~=6.0.1", "docker~=6.1.3", "jsonschema~=4.18.2", - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", "unstract-tool-sandbox", ] diff --git a/prompt-service/pdm.lock b/prompt-service/pdm.lock index e65c819fb..1eef8040f 100644 --- a/prompt-service/pdm.lock +++ b/prompt-service/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "deploy"] strategy = ["cross_platform", "inherit_metadata"] 
lock_version = "4.4.2" -content_hash = "sha256:6b2cbd558a9659c9e7e234479e629ffd5034daec3f35e77d2357bad8c117440c" +content_hash = "sha256:42289642033b05e0330d12a9f44ad5a785bdf2cc5ccace9572996ee05a650bc1" [[package]] name = "aiohappyeyeballs" @@ -294,23 +294,23 @@ files = [ [[package]] name = "boto3" -version = "1.34.158" +version = "1.34.159" requires_python = ">=3.8" summary = "The AWS SDK for Python" groups = ["default"] dependencies = [ - "botocore<1.35.0,>=1.34.158", + "botocore<1.35.0,>=1.34.159", "jmespath<2.0.0,>=0.7.1", "s3transfer<0.11.0,>=0.10.0", ] files = [ - {file = "boto3-1.34.158-py3-none-any.whl", hash = "sha256:c29e9b7e1034e8734ccaffb9f2b3f3df2268022fd8a93d836604019f8759ce27"}, - {file = "boto3-1.34.158.tar.gz", hash = "sha256:5b7b2ce0ec1e498933f600d29f3e1c641f8c44dd7e468c26795359d23d81fa39"}, + {file = "boto3-1.34.159-py3-none-any.whl", hash = "sha256:21120d23cc37c0e80dc4f64434bc5664d2a5645dcd9bf8a8fa97ed5c82164ca0"}, + {file = "boto3-1.34.159.tar.gz", hash = "sha256:ffe7bbb88ba81b5d54bc8fa0cfb2f3b7fe63a6cffa0f9207df2ef5c22a1c0587"}, ] [[package]] name = "botocore" -version = "1.34.158" +version = "1.34.159" requires_python = ">=3.8" summary = "Low-level, data-driven core of boto 3." 
groups = ["default"] @@ -321,8 +321,8 @@ dependencies = [ "urllib3<1.27,>=1.25.4; python_version < \"3.10\"", ] files = [ - {file = "botocore-1.34.158-py3-none-any.whl", hash = "sha256:0e6fceba1e39bfa8feeba70ba3ac2af958b3387df4bd3b5f2db3f64c1754c756"}, - {file = "botocore-1.34.158.tar.gz", hash = "sha256:5934082e25ad726673afbf466092fb1223dafa250e6e756c819430ba6b1b3da5"}, + {file = "botocore-1.34.159-py3-none-any.whl", hash = "sha256:7633062491457419a49f5860c014251ae85689f78266a3ce020c2c8688a76b97"}, + {file = "botocore-1.34.159.tar.gz", hash = "sha256:dc28806eb21e3c8d690c422530dff8b4b242ac033cbe98f160a9d37796c09cb1"}, ] [[package]] @@ -1641,8 +1641,8 @@ files = [ [[package]] name = "llama-index-legacy" -version = "0.9.48" -requires_python = ">=3.8.1,<4.0" +version = "0.9.48.post1" +requires_python = "<4.0,>=3.8.1" summary = "Interface between LLMs and your data" groups = ["default"] dependencies = [ @@ -1655,7 +1655,7 @@ dependencies = [ "httpx", "nest-asyncio<2.0.0,>=1.5.8", "networkx>=3.0", - "nltk<4.0.0,>=3.8.1", + "nltk>=3.8.2", "numpy", "openai>=1.1.0", "pandas", @@ -1666,8 +1666,8 @@ dependencies = [ "typing-inspect>=0.8.0", ] files = [ - {file = "llama_index_legacy-0.9.48-py3-none-any.whl", hash = "sha256:714ada95beac179b4acefa4d2deff74bb7b2f22b0f699ac247d4cb67738d16d4"}, - {file = "llama_index_legacy-0.9.48.tar.gz", hash = "sha256:82ddc4691edbf49533d65582c249ba22c03fe96fbd3e92f7758dccef28e43834"}, + {file = "llama_index_legacy-0.9.48.post1-py3-none-any.whl", hash = "sha256:583296162385010ebf92d2a612dd0a504575c04dc1638323bb455b1521aabe57"}, + {file = "llama_index_legacy-0.9.48.post1.tar.gz", hash = "sha256:e8b1603929433fd0cf3287ed700714078534dd202c97bcdbcc83ec3741bb0868"}, ] [[package]] @@ -2271,7 +2271,7 @@ files = [ [[package]] name = "openai" -version = "1.40.3" +version = "1.40.6" requires_python = ">=3.7.1" summary = "The official Python library for the openai API" groups = ["default"] @@ -2286,8 +2286,8 @@ dependencies = [ "typing-extensions<5,>=4.11", 
] files = [ - {file = "openai-1.40.3-py3-none-any.whl", hash = "sha256:09396cb6e2e15c921a5d872bf92841a60a9425da10dcd962b45fe7c4f48f8395"}, - {file = "openai-1.40.3.tar.gz", hash = "sha256:f2ffe907618240938c59d7ccc67dd01dc8c50be203c0077240db6758d2f02480"}, + {file = "openai-1.40.6-py3-none-any.whl", hash = "sha256:b36372124a779381a420a34dd96f762baa748b6bdfaf83a6b9f2745f72ccc1c5"}, + {file = "openai-1.40.6.tar.gz", hash = "sha256:2239232bcb7f4bd4ce8e02544b5769618582411cf399816d96686d1b6c1e5c8d"}, ] [[package]] @@ -2379,6 +2379,37 @@ files = [ {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, ] +[[package]] +name = "pdfminer-six" +version = "20231228" +requires_python = ">=3.6" +summary = "PDF parser and analyzer" +groups = ["default"] +dependencies = [ + "charset-normalizer>=2.0.0", + "cryptography>=36.0.0", +] +files = [ + {file = "pdfminer.six-20231228-py3-none-any.whl", hash = "sha256:e8d3c3310e6fbc1fe414090123ab01351634b4ecb021232206c4c9a8ca3e3b8f"}, + {file = "pdfminer.six-20231228.tar.gz", hash = "sha256:6004da3ad1a7a4d45930cb950393df89b068e73be365a6ff64a838d37bcb08c4"}, +] + +[[package]] +name = "pdfplumber" +version = "0.11.3" +requires_python = ">=3.8" +summary = "Plumb a PDF for detailed information about each char, rectangle, and line." 
+groups = ["default"] +dependencies = [ + "Pillow>=9.1", + "pdfminer-six==20231228", + "pypdfium2>=4.18.0", +] +files = [ + {file = "pdfplumber-0.11.3-py3-none-any.whl", hash = "sha256:4f3e13795d18b2e53dfc4cd667a3bc2478cd6975fc9a188881376265d599c5a6"}, + {file = "pdfplumber-0.11.3.tar.gz", hash = "sha256:43a3cac33d2135ce00ac59ad5bc3813a33afe0f513d9284c0e8cb6e447ed6e53"}, +] + [[package]] name = "peewee" version = "3.17.6" @@ -2780,6 +2811,28 @@ files = [ {file = "pypdf-4.3.1.tar.gz", hash = "sha256:b2f37fe9a3030aa97ca86067a56ba3f9d3565f9a791b305c7355d8392c30d91b"}, ] +[[package]] +name = "pypdfium2" +version = "4.30.0" +requires_python = ">=3.6" +summary = "Python bindings to PDFium" +groups = ["default"] +files = [ + {file = "pypdfium2-4.30.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:b33ceded0b6ff5b2b93bc1fe0ad4b71aa6b7e7bd5875f1ca0cdfb6ba6ac01aab"}, + {file = "pypdfium2-4.30.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4e55689f4b06e2d2406203e771f78789bd4f190731b5d57383d05cf611d829de"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e6e50f5ce7f65a40a33d7c9edc39f23140c57e37144c2d6d9e9262a2a854854"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3d0dd3ecaffd0b6dbda3da663220e705cb563918249bda26058c6036752ba3a2"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc3bf29b0db8c76cdfaac1ec1cde8edf211a7de7390fbf8934ad2aa9b4d6dfad"}, + {file = "pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1f78d2189e0ddf9ac2b7a9b9bd4f0c66f54d1389ff6c17e9fd9dc034d06eb3f"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_aarch64.whl", hash = "sha256:5eda3641a2da7a7a0b2f4dbd71d706401a656fea521b6b6faa0675b15d31a163"}, + {file = "pypdfium2-4.30.0-py3-none-musllinux_1_1_i686.whl", hash = "sha256:0dfa61421b5eb68e1188b0b2231e7ba35735aef2d867d86e48ee6cab6975195e"}, + {file = 
"pypdfium2-4.30.0-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:f33bd79e7a09d5f7acca3b0b69ff6c8a488869a7fab48fdf400fec6e20b9c8be"}, + {file = "pypdfium2-4.30.0-py3-none-win32.whl", hash = "sha256:ee2410f15d576d976c2ab2558c93d392a25fb9f6635e8dd0a8a3a5241b275e0e"}, + {file = "pypdfium2-4.30.0-py3-none-win_amd64.whl", hash = "sha256:90dbb2ac07be53219f56be09961eb95cf2473f834d01a42d901d13ccfad64b4c"}, + {file = "pypdfium2-4.30.0-py3-none-win_arm64.whl", hash = "sha256:119b2969a6d6b1e8d55e99caaf05290294f2d0fe49c12a3f17102d01c441bd29"}, + {file = "pypdfium2-4.30.0.tar.gz", hash = "sha256:48b5b7e5566665bc1015b9d69c1ebabe21f6aee468b509531c3c8318eeee2e16"}, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2881,7 +2934,7 @@ files = [ [[package]] name = "qdrant-client" -version = "1.10.1" +version = "1.11.0" requires_python = ">=3.8" summary = "Client library for the Qdrant vector search engine" groups = ["default"] @@ -2895,8 +2948,8 @@ dependencies = [ "urllib3<3,>=1.26.14", ] files = [ - {file = "qdrant_client-1.10.1-py3-none-any.whl", hash = "sha256:b9fb8fe50dd168d92b2998be7c6135d5a229b3a3258ad158cc69c8adf9ff1810"}, - {file = "qdrant_client-1.10.1.tar.gz", hash = "sha256:2284c8c5bb1defb0d9dbacb07d16f344972f395f4f2ed062318476a7951fd84c"}, + {file = "qdrant_client-1.11.0-py3-none-any.whl", hash = "sha256:1f574ccebb91c0bc8a620c9a41a5a010084fbc4d8c6f1cd0ab7b2eeb97336fc0"}, + {file = "qdrant_client-1.11.0.tar.gz", hash = "sha256:7c1d4d7a96cfd1ee0cde2a21c607e9df86bcca795ad8d1fd274d295ab64b8458"}, ] [[package]] @@ -3632,7 +3685,7 @@ dependencies = [ [[package]] name = "unstract-sdk" -version = "0.42.0" +version = "0.44.0" requires_python = "<3.11.1,>=3.9" summary = "A framework for writing Unstract Tools/Apps" groups = ["default"] @@ -3663,6 +3716,7 @@ dependencies = [ "llama-index==0.10.58", "llama-parse==0.4.9", "mistralai==0.4.2", + "pdfplumber>=0.11.2", "python-dotenv==1.0.0", "python-magic~=0.4.27", "singleton-decorator~=1.0.0", @@ -3670,8 +3724,8 
@@ dependencies = [ "transformers==4.37.0", ] files = [ - {file = "unstract_sdk-0.42.0-py3-none-any.whl", hash = "sha256:9a78f1144d80ab9b6f5bfe477b3a044847cb61d96267b886eaa15721e816b3b7"}, - {file = "unstract_sdk-0.42.0.tar.gz", hash = "sha256:38fa29eed9c714fb5c5e2dfef9ee4dd6b0a806f6042dda4b6f5c25b42670fe12"}, + {file = "unstract_sdk-0.44.0-py3-none-any.whl", hash = "sha256:fb72087261a855750282f2d47381ed802d954a23ef6b63cffa4a2bfe6ba6d63f"}, + {file = "unstract_sdk-0.44.0.tar.gz", hash = "sha256:d4cb9ac3a76d3d67e6d5371c848f5b41217e76a7bbc82b213d0913c60a15419b"}, ] [[package]] diff --git a/prompt-service/pyproject.toml b/prompt-service/pyproject.toml index 383a8e942..4400879aa 100644 --- a/prompt-service/pyproject.toml +++ b/prompt-service/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "flask~=3.0", "llama-index==0.10.58", "python-dotenv==1.0.0", - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", "redis>=5.0.3", "unstract-core @ file:///${PROJECT_ROOT}/../unstract/core", "unstract-flags @ file:///${PROJECT_ROOT}/../unstract/flags", diff --git a/pyproject.toml b/pyproject.toml index 6ae699be1..f1c1892cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,7 @@ hook-check-django-migrations = [ "psycopg2-binary==2.9.9", "python-dotenv==1.0.0", "python-magic==0.4.27", - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", "-e unstract-connectors @ file:///${PROJECT_ROOT}/unstract/connectors", "-e unstract-core @ file:///${PROJECT_ROOT}/unstract/core", "-e unstract-flags @ file:///${PROJECT_ROOT}/unstract/flags", diff --git a/tools/classifier/requirements.txt b/tools/classifier/requirements.txt index 7ae9501be..795b0e727 100644 --- a/tools/classifier/requirements.txt +++ b/tools/classifier/requirements.txt @@ -1,4 +1,4 @@ # Add your dependencies here # Required for all unstract tools -unstract-sdk~=0.42.0 +unstract-sdk~=0.44.0 diff --git a/tools/classifier/src/config/properties.json b/tools/classifier/src/config/properties.json index 4d0eb891d..57caac599 100644 
--- a/tools/classifier/src/config/properties.json +++ b/tools/classifier/src/config/properties.json @@ -2,7 +2,7 @@ "schemaVersion": "0.0.1", "displayName": "File Classifier", "functionName": "classify", - "toolVersion": "0.0.31", + "toolVersion": "0.0.32", "description": "Classifies a file into a bin based on its contents", "input": { "description": "File to be classified" diff --git a/tools/structure/src/config/properties.json b/tools/structure/src/config/properties.json index bc83cff86..b60516be7 100644 --- a/tools/structure/src/config/properties.json +++ b/tools/structure/src/config/properties.json @@ -2,7 +2,7 @@ "schemaVersion": "0.0.1", "displayName": "Structure Tool", "functionName": "structure_tool", - "toolVersion": "0.0.36", + "toolVersion": "0.0.37", "description": "This is a template tool which can answer set of input prompts designed in the Prompt Studio", "input": { "description": "File that needs to be indexed and parsed for answers" diff --git a/tools/structure/src/main.py b/tools/structure/src/main.py index e807e8c92..36678e74e 100644 --- a/tools/structure/src/main.py +++ b/tools/structure/src/main.py @@ -86,6 +86,10 @@ def run( self.stream_log("Indexing document") usage_kwargs: dict[Any, Any] = dict() usage_kwargs[SettingsKeys.RUN_ID] = run_id + usage_kwargs[SettingsKeys.FILE_NAME] = ( + self.get_exec_metadata.get(MetadataKey.SOURCE_NAME), + ) + if tool_settings[SettingsKeys.ENABLE_SINGLE_PASS_EXTRACTION]: index.index( tool_id=tool_id, diff --git a/tools/text_extractor/requirements.txt b/tools/text_extractor/requirements.txt index 7ae9501be..795b0e727 100644 --- a/tools/text_extractor/requirements.txt +++ b/tools/text_extractor/requirements.txt @@ -1,4 +1,4 @@ # Add your dependencies here # Required for all unstract tools -unstract-sdk~=0.42.0 +unstract-sdk~=0.44.0 diff --git a/tools/text_extractor/src/config/properties.json b/tools/text_extractor/src/config/properties.json index e03ba8d50..2f241d06c 100644 --- 
a/tools/text_extractor/src/config/properties.json +++ b/tools/text_extractor/src/config/properties.json @@ -2,7 +2,7 @@ "schemaVersion": "0.0.1", "displayName": "Text Extractor", "functionName": "text_extractor", - "toolVersion": "0.0.29", + "toolVersion": "0.0.30", "description": "The Text Extractor is a powerful tool designed to convert documents to its text form or Extract texts from documents", "input": { "description": "Document" diff --git a/unstract/tool-registry/pyproject.toml b/unstract/tool-registry/pyproject.toml index 95976a7b5..60651dd6e 100644 --- a/unstract/tool-registry/pyproject.toml +++ b/unstract/tool-registry/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ "docker~=6.1.3", "jsonschema~=4.18.2", "PyYAML~=6.0.1", - "unstract-sdk~=0.42.0", + "unstract-sdk~=0.44.0", # ! IMPORTANT! # Local dependencies usually need to be added as: # https://pdm-project.org/latest/usage/dependency/#local-dependencies diff --git a/unstract/tool-registry/tool_registry_config/public_tools.json b/unstract/tool-registry/tool_registry_config/public_tools.json index 9f05f9ed0..3ae37a28f 100644 --- a/unstract/tool-registry/tool_registry_config/public_tools.json +++ b/unstract/tool-registry/tool_registry_config/public_tools.json @@ -5,7 +5,7 @@ "schemaVersion": "0.0.1", "displayName": "File Classifier", "functionName": "classify", - "toolVersion": "0.0.31", + "toolVersion": "0.0.32", "description": "Classifies a file into a bin based on its contents", "input": { "description": "File to be classified" @@ -106,9 +106,9 @@ "properties": {} }, "icon": "\n\n \n \n \n \n \n \n \n \n \n \n \n\n", - "image_url": "docker:unstract/tool-classifier:0.0.31", + "image_url": "docker:unstract/tool-classifier:0.0.32", "image_name": "unstract/tool-classifier", - "image_tag": "0.0.31" + "image_tag": "0.0.32" }, "text_extractor": { "tool_uid": "text_extractor", @@ -116,7 +116,7 @@ "schemaVersion": "0.0.1", "displayName": "Text Extractor", "functionName": "text_extractor", - "toolVersion": 
"0.0.29", + "toolVersion": "0.0.30", "description": "The Text Extractor is a powerful tool designed to convert documents to its text form or Extract texts from documents", "input": { "description": "Document" @@ -191,8 +191,8 @@ } }, "icon": "\n\n \n \n \n \n \n \n \n \n \n \n \n\n", - "image_url": "docker:unstract/tool-text-extractor:0.0.29", + "image_url": "docker:unstract/tool-text-extractor:0.0.30", "image_name": "unstract/tool-text-extractor", - "image_tag": "0.0.29" + "image_tag": "0.0.30" } }