Skip to content

Commit

Permalink
File Storage interface and implementation (#112)
Browse files Browse the repository at this point in the history
* contextSizeChanges

* contextSizeChanges

* Version roll and test folder check in

* Fix enum values

* Fix test cases, address review comments

* Address review comments

* Update pyproject.toml

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Address mypy issues

* Change class design and implementation

* Remove unused definitions

* Add cp() and function refactoring

* Check-in sample env

* Default value of dict changed to None

* Add size()

* Refactor for using FileStorage

* Refactor to use FileStorage

* Fix issues

* Add mime_type, download functions

* change comments

* Refactor get_hash_from_file

* Add return types

* Remove permanent file storage from sdk

* Fix SDK functional issues

* Support minio

* Test cases for Minio

* Bring file variants back to sdk

* Fix copy_on_write

* Add new test cases for upload/download

* Add new functions to support platform-service

* Change modification_time return type to datetime

* Refactor env pick-up logic

* Sample env

* contextSizeChanges

* Remove commented-out code and make some improvements

* contextSizeChanges

* Add right JSON formatted string

* Update src/unstract/sdk/file_storage/fs_permanent.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Address review comments

* Address review comments

* Update src/unstract/sdk/file_storage/fs_shared_temporary.py

Co-authored-by: ali <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Refactor for change in enum value

* Add return type

---------

Signed-off-by: Gayathri <[email protected]>
Co-authored-by: Chandrasekharan M <[email protected]>
Co-authored-by: ali <[email protected]>
  • Loading branch information
3 people authored Nov 13, 2024
1 parent 8c4610f commit ef00f81
Show file tree
Hide file tree
Showing 30 changed files with 2,903 additions and 1,169 deletions.
2,323 changes: 1,268 additions & 1,055 deletions pdm.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@ scripts = { unstract-tool-gen = "unstract.sdk.scripts.tool_gen:main" }

[tool.pdm.dev-dependencies]
docs = [ "lazydocs~=0.4.8" ]
test = [ "parameterized==0.9.0" ]
test = [
"parameterized==0.9.0",
"pytest==8.3.3",
"pytest-mock==3.14.0",
"gcsfs==2024.10.0",
"s3fs==2024.10.0"
]
lint = [
"autopep8~=2.0.2",
"black~=23.3.0",
Expand Down
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.53.0"
__version__ = "0.54.0rc1"


def get_sdk_version():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from unstract.sdk.adapters.exceptions import AdapterError
from unstract.sdk.adapters.ocr.constants import FileType
from unstract.sdk.adapters.ocr.ocr_adapter import OCRAdapter
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -95,10 +96,13 @@ def _get_request_headers(self) -> dict[str, Any]:

""" Detect the mime type from the file content """

def _get_input_file_type_mime(self, input_file_path: str) -> str:
with open(input_file_path, mode="rb") as file_obj:
sample_contents = file_obj.read(100)
file_type = filetype.guess(sample_contents)
def _get_input_file_type_mime(
self,
input_file_path: str,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:
sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
file_type = filetype.guess(sample_contents)

file_type_mime: str = file_type.MIME if file_type else FileType.TEXT_PLAIN

Expand All @@ -110,13 +114,15 @@ def _get_input_file_type_mime(self, input_file_path: str) -> str:
return file_type_mime

def process(
self, input_file_path: str, output_file_path: Optional[str] = None
self,
input_file_path: str,
output_file_path: Optional[str] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:
try:
file_type_mime = self._get_input_file_type_mime(input_file_path)
if os.path.isfile(input_file_path):
with open(input_file_path, "rb") as fop:
file_content_in_bytes: bytes = fop.read()
if fs.exists(input_file_path):
file_content_in_bytes = fs.read(path=input_file_path, mode="rb")
else:
raise AdapterError(f"File not found {input_file_path}")
processor_url = self.config.get(Constants.URL, "") + ":process"
Expand All @@ -131,19 +137,14 @@ def process(
response_json: dict[str, Any] = response.json()
result_text: str = response_json["document"]["text"]
if output_file_path is not None:
with open(output_file_path, "w", encoding="utf-8") as f:
f.write(result_text)
f.close()
fs.write(path=output_file_path, mode="w", encoding="utf-8")
return result_text
except Exception as e:
logger.error(f"Error while processing document {e}")
if not isinstance(e, AdapterError):
raise AdapterError(str(e))
else:
raise e
finally:
if fop is not None:
fop.close()

def test_connection(self) -> bool:
try:
Expand Down
24 changes: 15 additions & 9 deletions src/unstract/sdk/adapters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from requests.exceptions import RequestException

from unstract.sdk.adapters.constants import Common
from unstract.sdk.file_storage import FileStorage, FileStorageProvider


class AdapterUtils:
Expand Down Expand Up @@ -34,8 +35,13 @@ def get_msg_from_request_exc(
return err_response.text # type: ignore
return default_err

# ToDo: get_file_mime_type() to be removed once migrated to FileStorage
# FileStorage has mime_type() which could be used instead.
@staticmethod
def get_file_mime_type(input_file: Path) -> str:
def get_file_mime_type(
input_file: Path,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:
"""Gets the file MIME type for an input file. Uses libmagic to perform
the same.
Expand All @@ -45,15 +51,15 @@ def get_file_mime_type(input_file: Path) -> str:
Returns:
str: MIME type of the file
"""
input_file_mime = ""
with open(input_file, mode="rb") as input_file_obj:
sample_contents = input_file_obj.read(100)
input_file_mime = magic.from_buffer(sample_contents, mime=True)
input_file_obj.seek(0)
sample_contents = fs.read(path=input_file, mode="rb", length=100)
input_file_mime = magic.from_buffer(sample_contents, mime=True)
return input_file_mime

@staticmethod
def guess_extention(input_file_path: str) -> str:
def guess_extention(
input_file_path: str,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:
"""Returns the extension of the file passed.
Args:
Expand All @@ -63,8 +69,8 @@ def guess_extention(input_file_path: str) -> str:
str: File extension
"""
input_file_extention = ""
with open(input_file_path, mode="rb") as file_obj:
sample_contents = file_obj.read(100)
sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
if sample_contents:
file_type = filetype.guess(sample_contents)
input_file_extention = file_type.EXTENSION
return input_file_extention
28 changes: 15 additions & 13 deletions src/unstract/sdk/adapters/x2text/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from unstract.sdk.adapters.exceptions import AdapterError
from unstract.sdk.adapters.utils import AdapterUtils
from unstract.sdk.adapters.x2text.constants import X2TextConstants
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)

Expand All @@ -17,7 +18,9 @@ class X2TextHelper:

@staticmethod
def parse_response(
response: Response, out_file_path: Optional[str] = None
response: Response,
out_file_path: Optional[str] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> tuple[str, bool]:
"""Parses the response from a request.
Expand All @@ -27,6 +30,8 @@ def parse_response(
response (Response): Response to parse
out_file_path (Optional[str], optional): Output file path to write
to, skipped if None or empty. Defaults to None.
fs (FileStorage): file storage object to perform file operations
Returns:
tuple[str, bool]: Response's content and status of parsing
"""
Expand All @@ -35,8 +40,7 @@ def parse_response(
if isinstance(response.content, bytes):
output = response.content.decode("utf-8")
if out_file_path:
with open(out_file_path, "w", encoding="utf-8") as f:
f.write(output)
fs.write(path=out_file_path, mode="w", encoding="utf-8", data=output)
return output, True


Expand All @@ -49,9 +53,7 @@ class UnstructuredHelper:
PROCESS = "process"

@staticmethod
def test_server_connection(
unstructured_adapter_config: dict[str, Any]
) -> bool:
def test_server_connection(unstructured_adapter_config: dict[str, Any]) -> bool:
UnstructuredHelper.make_request(
unstructured_adapter_config, UnstructuredHelper.TEST_CONNECTION
)
Expand All @@ -62,21 +64,23 @@ def process_document(
unstructured_adapter_config: dict[str, Any],
input_file_path: str,
output_file_path: Optional[str] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:
try:
response: Response
local_storage = FileStorage(FileStorageProvider.LOCAL)
if not local_storage.exists(input_file_path):
fs.download(from_path=input_file_path, to_path=input_file_path)
with open(input_file_path, "rb") as input_f:
mime_type = AdapterUtils.get_file_mime_type(
input_file=input_file_path
)
mime_type = AdapterUtils.get_file_mime_type(input_file=input_file_path)
files = {"file": (input_file_path, input_f, mime_type)}
response = UnstructuredHelper.make_request(
unstructured_adapter_config=unstructured_adapter_config,
request_type=UnstructuredHelper.PROCESS,
files=files,
)
output, is_success = X2TextHelper.parse_response(
response=response, out_file_path=output_file_path
response=response, out_file_path=output_file_path, fs=fs
)
if not is_success:
raise AdapterError("Couldn't extract text from file")
Expand All @@ -95,9 +99,7 @@ def make_request(
request_type: str,
**kwargs: dict[Any, Any],
) -> Response:
unstructured_url = unstructured_adapter_config.get(
UnstructuredHelper.URL
)
unstructured_url = unstructured_adapter_config.get(UnstructuredHelper.URL)

x2text_service_url = unstructured_adapter_config.get(
X2TextConstants.X2TEXT_HOST
Expand Down
22 changes: 15 additions & 7 deletions src/unstract/sdk/adapters/x2text/llama_parse/src/llama_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from unstract.sdk.adapters.x2text.dto import TextExtractionResult
from unstract.sdk.adapters.x2text.llama_parse.src.constants import LlamaParseConfig
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -46,8 +47,8 @@ def get_json_schema() -> str:
def _call_parser(
self,
input_file_path: str,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
) -> str:

parser = LlamaParse(
api_key=self.config.get(LlamaParseConfig.API_KEY),
base_url=self.config.get(LlamaParseConfig.BASE_URL),
Expand All @@ -61,7 +62,9 @@ def _call_parser(
file_extension = pathlib.Path(input_file_path).suffix
if not file_extension:
try:
input_file_extension = AdapterUtils.guess_extention(input_file_path)
input_file_extension = AdapterUtils.guess_extention(
input_file_path, fs
)
input_file_path_copy = input_file_path
input_file_path = ".".join(
(input_file_path_copy, input_file_extension)
Expand All @@ -70,7 +73,8 @@ def _call_parser(
logger.error("Exception raised while handling input file.")
raise AdapterError(str(os_err))

documents = parser.load_data(input_file_path)
file_bytes = fs.read(path=input_file_path, mode="rb")
documents = parser.load_data(file_bytes)

except ConnectError as connec_err:
logger.error(f"Invalid Base URL given. : {connec_err}")
Expand All @@ -91,13 +95,17 @@ def process(
self,
input_file_path: str,
output_file_path: Optional[str] = None,
fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
**kwargs: dict[Any, Any],
) -> TextExtractionResult:

response_text = self._call_parser(input_file_path=input_file_path)
response_text = self._call_parser(input_file_path=input_file_path, fs=fs)
if output_file_path:
with open(output_file_path, "w", encoding="utf-8") as f:
f.write(response_text)
fs.write(
path=output_file_path,
mode="w",
encoding="utf-8",
data=response_text,
)

return TextExtractionResult(extracted_text=response_text)

Expand Down
Loading

0 comments on commit ef00f81

Please sign in to comment.