Skip to content

Commit

Permalink
Support length to be passed while identifying mime type (#134)
Browse files Browse the repository at this point in the history
* Refactoring changed file names

* Roll version

* Update src/unstract/sdk/utils/tool_utils.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Update tests/test_fs_permanent.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Update tests/test_fs_permanent.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Update src/unstract/sdk/utils/tool_utils.py

Co-authored-by: Chandrasekharan M <[email protected]>
Signed-off-by: Gayathri <[email protected]>

* Address review comments

* Add support for passing length to mime_type

* Add recursive and fix mypy issue

* CHange test case with new behavior to return FileNotFound in read()

* Remove typing kwargs.

* Resolve mypy issues

* Resolve mypy issues

* Remove unwanted conditionals/vars

* Remove pandoc and tessaract.

* Details of provider added to error message

---------

Signed-off-by: Gayathri <[email protected]>
Co-authored-by: Chandrasekharan M <[email protected]>
  • Loading branch information
gaya3-zipstack and chandrasekharan-zipstack authored Dec 11, 2024
1 parent e2cf928 commit c1a6ed0
Show file tree
Hide file tree
Showing 9 changed files with 101 additions and 38 deletions.
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ They also contain helper methods/classes to aid with other tasks such as indexin
- Linux

```
sudo apt install build-essential pkg-config libmagic-dev tesseract-ocr pandoc
sudo apt install build-essential pkg-config libmagic-dev
```
- Mac
Expand Down Expand Up @@ -101,6 +101,16 @@ unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch

- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and upload / download your package from this server

#### Additonal dependencies for tool
Tools may need to be backed up by a file storage. unstract.sdk.file_storage contains the required interfaces for the
same. fssepc is being used underneath to implement these interfaces. Hence, one can choose to use a file_system
supported by fsspec for this. However, the required dependencies need to be added in the tool dependency manager.
Eg. If the tool is using Minio as the underlying file storage, then s3fs can be added to support it.
Similarly, for Google Cloud Storage, gcsfs is to be added.
The following versions are tested in the SDK using unit test cases for the above package.
gcsfs==2024.10.0
s3fs==2024.10.0


### Documentation generation

Expand Down
1 change: 1 addition & 0 deletions src/unstract/sdk/file_storage/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

class FileOperationParams:
READ_ENTIRE_LENGTH = -1
MIME_TYPE_DEFAULT_READ_LENGTH = 100
DEFAULT_ENCODING = "utf-8"


Expand Down
4 changes: 2 additions & 2 deletions src/unstract/sdk/file_storage/env_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,11 @@ def get_storage(storage_type: StorageType, env_name: str) -> FileStorage:
and credentials configured in the env
"""
try:
file_storage_creds = json.loads(os.environ.get(env_name))
file_storage_creds = json.loads(os.environ.get(env_name, ""))
provider = FileStorageProvider(
file_storage_creds[CredentialKeyword.PROVIDER]
)
credentials = file_storage_creds.get(CredentialKeyword.CREDENTIALS, {})
credentials = file_storage_creds.get(CredentialKeyword.CREDENTIALS, "{}")
if storage_type == StorageType.PERMANENT.value:
file_storage = PermanentFileStorage(provider=provider, **credentials)
elif storage_type == StorageType.TEMPORARY.value:
Expand Down
3 changes: 2 additions & 1 deletion src/unstract/sdk/file_storage/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def file_storage_init(
try:
protocol = provider.value
if provider == FileStorageProvider.LOCAL:
storage_config.update({"auto_mkdir": True})
# Hard set auto_mkdir to True as default
storage_config.update({"auto_mkdir": True}) # type: ignore
elif provider in [FileStorageProvider.MINIO]:
# Initialise using s3 for Minio
protocol = FileStorageProvider.S3.value
Expand Down
30 changes: 22 additions & 8 deletions src/unstract/sdk/file_storage/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,17 +161,25 @@ def rm(self, path: str, recursive: bool = True):
return self.fs.rm(path=path, recursive=recursive)

@skip_local_cache
def cp(self, src: str, dest: str, overwrite: bool = True):
def cp(
self,
src: str,
dest: str,
recursive: bool = False,
overwrite: bool = True,
):
"""Copies files from source(lpath) path to the destination(rpath) path.
Args:
src (str): Path to the source
dest (str): Path to the destination
recursive (bool): Copy recursively when set to True
overwrite (bool): Overwrite existing path with same name
Returns:
NA
"""
return self.fs.cp(src, dest, overwrite=overwrite)
return self.fs.cp(src, dest, recursive=recursive, overwrite=overwrite)

@skip_local_cache
def size(self, path: str) -> int:
Expand Down Expand Up @@ -203,17 +211,23 @@ def modification_time(self, path: str) -> datetime:
return file_mtime

@skip_local_cache
def mime_type(self, path: str) -> str:
def mime_type(
self,
path: str,
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
) -> str:
"""Gets the file MIME type for an input file. Uses libmagic to perform
the same.
Args:
path (str): Path of the input file
read_length (int): Length(bytes) to be read from the file for in
order to identify the mime type
Returns:
str: MIME type of the file
"""
sample_contents = self.read(path=path, mode="rb", length=100)
sample_contents = self.read(path=path, mode="rb", length=read_length)
mime_type = magic.from_buffer(sample_contents, mime=True)
return mime_type

Expand Down Expand Up @@ -291,7 +305,7 @@ def json_dump(
self,
path: str,
data: dict[str, Any],
**kwargs: dict[Any, Any],
**kwargs: dict[Any, Any], # type: ignore
):
"""Dumps data into the given file specified by path.
Expand All @@ -302,15 +316,15 @@ def json_dump(
"""
try:
with self.fs.open(path=path, mode="w", encoding="utf-8") as f:
json.dump(data, f, **kwargs)
json.dump(obj=data, fp=f, **kwargs) # type: ignore
except Exception as e:
raise FileOperationError(str(e)) from e

def yaml_dump(
self,
path: str,
data: dict[str, Any],
**kwargs: dict[Any, Any],
**kwargs: dict[Any, Any], # type: ignore
):
"""Dumps data into the given file specified by path.
Expand All @@ -321,7 +335,7 @@ def yaml_dump(
"""
try:
with self.fs.open(path=path, mode="w", encoding="utf-8") as f:
yaml.dump(data, f, **kwargs)
yaml.dump(data=data, stream=f, **kwargs) # type: ignore
except Exception as e:
raise FileOperationError(str(e)) from e

Expand Down
14 changes: 12 additions & 2 deletions src/unstract/sdk/file_storage/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,13 @@ def rm(self, path: str, recursive: bool = True):
pass

@abstractmethod
def cp(self, lpath: str, rpath: str):
def cp(
self,
lpath: str,
rpath: str,
recursive: bool = False,
overwrite: bool = True,
):
pass

@abstractmethod
Expand All @@ -68,7 +74,11 @@ def modification_time(self, path: str) -> datetime:
pass

@abstractmethod
def mime_type(self, path: str) -> str:
def mime_type(
self,
path: str,
read_length: int = FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
) -> str:
pass

@abstractmethod
Expand Down
11 changes: 5 additions & 6 deletions src/unstract/sdk/file_storage/permanent.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,10 @@ def __init__(
f"supported in Permanent mode. "
f"Supported providers: {self.SUPPORTED_FILE_STORAGE_TYPES}"
)
if provider == FileStorageProvider.GCS:
if provider == FileStorageProvider.GCS or provider == FileStorageProvider.LOCAL:
super().__init__(provider, **storage_config)
elif provider == FileStorageProvider.LOCAL:
super().__init__(provider)
else:
raise NotImplementedError
raise NotImplementedError(f"Provider {provider.value} is not implemented")

def _copy_on_read(self, path: str, legacy_storage_path: str):
"""Copies the file to the remote storage lazily if not present already.
Expand Down Expand Up @@ -76,6 +74,7 @@ def read(
seek_position (int): Position to start reading from
length (int): Number of bytes to be read. Default is full
file content.
legacy_storage_path (str): Legacy path to the same file
Returns:
Union[bytes, str] - File contents in bytes/string based on the opened mode
Expand All @@ -85,7 +84,7 @@ def read(
if legacy_storage_path:
self._copy_on_read(path, legacy_storage_path)
return super().read(path, mode, encoding, seek_position, length)
except FileNotFoundError:
logger.warning(f"File {path} not found. Ignoring.")
except Exception as e:
if isinstance(e, FileNotFoundError) or isinstance(e, FileOperationError):
raise e
raise FileOperationError(str(e)) from e
48 changes: 40 additions & 8 deletions tests/test_file_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from unstract.sdk.constants import MimeType
from unstract.sdk.exceptions import FileOperationError
from unstract.sdk.file_storage.constants import StorageType
from unstract.sdk.file_storage.constants import FileOperationParams, StorageType
from unstract.sdk.file_storage.env_helper import EnvHelper
from unstract.sdk.file_storage.impl import FileStorage
from unstract.sdk.file_storage.provider import FileStorageProvider
Expand Down Expand Up @@ -488,29 +488,51 @@ def test_file(provider):


@pytest.mark.parametrize(
"file_storage, lpath, rpath",
"file_storage, lpath, rpath, recursive, expected_result",
[
(
file_storage(provider=FileStorageProvider.GCS),
TEST_CONSTANTS.READ_TEXT_FILE,
TEST_CONSTANTS.TEST_FOLDER,
True,
True,
),
(
file_storage(provider=FileStorageProvider.LOCAL),
TEST_CONSTANTS.READ_TEXT_FILE,
TEST_CONSTANTS.TEST_FOLDER,
True,
True,
),
(
file_storage(provider=FileStorageProvider.LOCAL),
TEST_CONSTANTS.READ_FOLDER_PATH,
TEST_CONSTANTS.TEST_FOLDER,
True,
True,
),
(
file_storage(provider=FileStorageProvider.LOCAL),
TEST_CONSTANTS.READ_FOLDER_PATH,
TEST_CONSTANTS.TEST_FOLDER,
False,
False,
),
(
file_storage(provider=FileStorageProvider.MINIO),
TEST_CONSTANTS.READ_TEXT_FILE,
TEST_CONSTANTS.TEST_FOLDER,
True,
True,
),
],
)
def test_cp(file_storage, lpath, rpath):
file_storage.cp(lpath, rpath, overwrite=True)
assert file_storage.exists(rpath) is True
file_storage.rm(rpath, recursive=True)
def test_cp(file_storage, lpath, rpath, recursive, expected_result):
file_storage.cp(lpath, rpath, recursive=recursive, overwrite=True)
actual_result = file_storage.exists(rpath)
assert actual_result == expected_result
if actual_result:
file_storage.rm(rpath, recursive=True)
assert file_storage.exists(rpath) is False


Expand Down Expand Up @@ -566,49 +588,59 @@ def test_file_size(file_storage, path, expected_size):


@pytest.mark.parametrize(
"file_storage, path, expected_mime_type",
"file_storage, path, read_length, expected_mime_type",
[
(
file_storage(provider=FileStorageProvider.GCS),
TEST_CONSTANTS.READ_PDF_FILE,
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
MimeType.PDF,
),
(
file_storage(provider=FileStorageProvider.GCS),
TEST_CONSTANTS.READ_TEXT_FILE,
FileOperationParams.READ_ENTIRE_LENGTH,
MimeType.TEXT,
),
(
file_storage(provider=FileStorageProvider.GCS),
TEST_CONSTANTS.READ_PDF_FILE,
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
MimeType.PDF,
),
(
file_storage(provider=FileStorageProvider.LOCAL),
TEST_CONSTANTS.READ_TEXT_FILE,
50,
MimeType.TEXT,
),
(
file_storage(provider=FileStorageProvider.MINIO),
TEST_CONSTANTS.READ_PDF_FILE,
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
MimeType.PDF,
),
(
file_storage(provider=FileStorageProvider.MINIO),
TEST_CONSTANTS.READ_TEXT_FILE,
FileOperationParams.READ_ENTIRE_LENGTH,
MimeType.TEXT,
),
(
file_storage(provider=FileStorageProvider.MINIO),
TEST_CONSTANTS.READ_PDF_FILE,
FileOperationParams.MIME_TYPE_DEFAULT_READ_LENGTH,
MimeType.PDF,
),
],
)
def test_file_mime_type(file_storage, path, expected_mime_type):
def test_file_mime_type(file_storage, path, read_length, expected_mime_type):
mime_type = file_storage.mime_type(path=path)
file_storage.mkdir(path=TEST_CONSTANTS.READ_FOLDER_PATH)
assert mime_type == expected_mime_type
mime_type_read_length = file_storage.mime_type(path=path, read_length=read_length)
file_storage.mkdir(path=TEST_CONSTANTS.READ_FOLDER_PATH)
assert mime_type_read_length == expected_mime_type


@pytest.mark.parametrize(
Expand Down
16 changes: 6 additions & 10 deletions tests/test_fs_permanent.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ def permanent_file_storage(provider: FileStorageProvider):
creds = json.loads(os.environ.get(TEST_CONSTANTS.FILE_STORAGE_LOCAL, "{}"))
except JSONDecodeError:
creds = {}
file_storage = PermanentFileStorage(
provider=provider, legacy_storage_path="./prompt_studio_data", **creds
)
file_storage = PermanentFileStorage(provider=provider, **creds)
assert file_storage is not None
return file_storage

Expand All @@ -53,13 +51,11 @@ def permanent_file_storage(provider: FileStorageProvider):
def test_permanent_fs_copy_on_read(file_storage, file_read_path, read_mode):
if file_storage.exists(file_read_path):
file_storage.rm(file_read_path)
file_read_contents = file_storage.read(
file_read_path,
read_mode,
)
print(file_read_contents)
# File in the path does not exist. So no contents can be read
assert file_read_contents is None
with pytest.raises(FileNotFoundError):
file_storage.read(
file_read_path,
read_mode,
)


@pytest.mark.parametrize(
Expand Down

0 comments on commit c1a6ed0

Please sign in to comment.