File Storage interface and implementation (#112)

* contextSizeChanges * contextSizeChanges * Version roll and test folder check in * Fix enum values * Fix test cases, address review comments * Address review comments * Update pyproject.toml Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Address mypy issues * Change class design and implementation * Remove unused definitions * Add cp() and function refactoring * Check-in sample env * Default value of dict changed to None * Add size() * Refactor for using FileStorage * Refactor to use FileStorage * Fix issues * Add mime_type, download functions * change comments * Refactor get_hash_from_file * Add return types * Remove permanent file storage from sdk * Fix SDK functional issues * Support minio * Test cases for Minio * Bring file variants back to sdk * Fix copy_on_write * Add new test cases for upload/download * Add new functions to support platform-service * Change modification_time return type to datetime * Refactor env pick-up logic * Sample env * contextSizeChanges * Remove commented code and some improvisations * contextSizeChanges * Add right JSON formatted string * Update src/unstract/sdk/file_storage/fs_permanent.py Co-authored-by: Chandrasekharan M <[email protected]> Signed-off-by: Gayathri <[email protected]> * Address review comments * Address review comments * Update src/unstract/sdk/file_storage/fs_shared_temporary.py Co-authored-by: ali <[email protected]> Signed-off-by: Gayathri <[email protected]> * Refactor for change in enum value * Add return type --------- Signed-off-by: Gayathri <[email protected]> Co-authored-by: Chandrasekharan M <[email protected]> Co-authored-by: ali <[email protected]>
Zipstack · Nov 13, 2024 · ef00f81 · ef00f81
1 parent 8c4610f
commit ef00f81
Show file tree

Hide file tree

Showing 30 changed files with 2,903 additions and 1,169 deletions.
diff --git a/pdm.lock b/pdm.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -82,7 +82,13 @@ scripts = { unstract-tool-gen = "unstract.sdk.scripts.tool_gen:main" }
 
 [tool.pdm.dev-dependencies]
 docs = [ "lazydocs~=0.4.8" ]
-test = [ "parameterized==0.9.0" ]
+test = [
+    "parameterized==0.9.0",
+    "pytest==8.3.3",
+    "pytest-mock==3.14.0",
+    "gcsfs==2024.10.0",
+    "s3fs==2024.10.0"
+]
 lint = [
     "autopep8~=2.0.2",
     "black~=23.3.0",

diff --git a/src/unstract/sdk/__init__.py b/src/unstract/sdk/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.53.0"
+__version__ = "0.54.0rc1"
 
 
 def get_sdk_version():

diff --git a/src/unstract/sdk/adapters/ocr/google_document_ai/src/google_document_ai.py b/src/unstract/sdk/adapters/ocr/google_document_ai/src/google_document_ai.py
@@ -12,6 +12,7 @@
 from unstract.sdk.adapters.exceptions import AdapterError
 from unstract.sdk.adapters.ocr.constants import FileType
 from unstract.sdk.adapters.ocr.ocr_adapter import OCRAdapter
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
 
 logger = logging.getLogger(__name__)
 
@@ -95,10 +96,13 @@ def _get_request_headers(self) -> dict[str, Any]:
 
     """ Detect the mime type from the file content """
 
-    def _get_input_file_type_mime(self, input_file_path: str) -> str:
-        with open(input_file_path, mode="rb") as file_obj:
-            sample_contents = file_obj.read(100)
-            file_type = filetype.guess(sample_contents)
+    def _get_input_file_type_mime(
+        self,
+        input_file_path: str,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
+    ) -> str:
+        sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
+        file_type = filetype.guess(sample_contents)
 
         file_type_mime: str = file_type.MIME if file_type else FileType.TEXT_PLAIN
 
@@ -110,13 +114,15 @@ def _get_input_file_type_mime(self, input_file_path: str) -> str:
         return file_type_mime
 
     def process(
-        self, input_file_path: str, output_file_path: Optional[str] = None
+        self,
+        input_file_path: str,
+        output_file_path: Optional[str] = None,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> str:
         try:
             file_type_mime = self._get_input_file_type_mime(input_file_path)
-            if os.path.isfile(input_file_path):
-                with open(input_file_path, "rb") as fop:
-                    file_content_in_bytes: bytes = fop.read()
+            if fs.exists(input_file_path):
+                file_content_in_bytes = fs.read(path=input_file_path, mode="rb")
             else:
                 raise AdapterError(f"File not found {input_file_path}")
             processor_url = self.config.get(Constants.URL, "") + ":process"
@@ -131,19 +137,14 @@ def process(
             response_json: dict[str, Any] = response.json()
             result_text: str = response_json["document"]["text"]
             if output_file_path is not None:
-                with open(output_file_path, "w", encoding="utf-8") as f:
-                    f.write(result_text)
-                    f.close()
+                fs.write(path=output_file_path, mode="w", encoding="utf-8")
             return result_text
         except Exception as e:
             logger.error(f"Error while processing document {e}")
             if not isinstance(e, AdapterError):
                 raise AdapterError(str(e))
             else:
                 raise e
-        finally:
-            if fop is not None:
-                fop.close()
 
     def test_connection(self) -> bool:
         try:

diff --git a/src/unstract/sdk/adapters/utils.py b/src/unstract/sdk/adapters/utils.py
@@ -6,6 +6,7 @@
 from requests.exceptions import RequestException
 
 from unstract.sdk.adapters.constants import Common
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
 
 
 class AdapterUtils:
@@ -34,8 +35,13 @@ def get_msg_from_request_exc(
                 return err_response.text  # type: ignore
         return default_err
 
+    # ToDo: get_file_mime_type() to be removed once migrated to FileStorage
+    # FileStorage has mime_type() which could be used instead.
     @staticmethod
-    def get_file_mime_type(input_file: Path) -> str:
+    def get_file_mime_type(
+        input_file: Path,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
+    ) -> str:
         """Gets the file MIME type for an input file. Uses libmagic to perform
         the same.
 
@@ -45,15 +51,15 @@ def get_file_mime_type(input_file: Path) -> str:
         Returns:
             str: MIME type of the file
         """
-        input_file_mime = ""
-        with open(input_file, mode="rb") as input_file_obj:
-            sample_contents = input_file_obj.read(100)
-            input_file_mime = magic.from_buffer(sample_contents, mime=True)
-            input_file_obj.seek(0)
+        sample_contents = fs.read(path=input_file, mode="rb", length=100)
+        input_file_mime = magic.from_buffer(sample_contents, mime=True)
         return input_file_mime
 
     @staticmethod
-    def guess_extention(input_file_path: str) -> str:
+    def guess_extention(
+        input_file_path: str,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
+    ) -> str:
         """Returns the extention of the file passed.
 
         Args:
@@ -63,8 +69,8 @@ def guess_extention(input_file_path: str) -> str:
             str: File extention
         """
         input_file_extention = ""
-        with open(input_file_path, mode="rb") as file_obj:
-            sample_contents = file_obj.read(100)
+        sample_contents = fs.read(path=input_file_path, mode="rb", length=100)
+        if sample_contents:
             file_type = filetype.guess(sample_contents)
             input_file_extention = file_type.EXTENSION
         return input_file_extention
diff --git a/src/unstract/sdk/adapters/x2text/helper.py b/src/unstract/sdk/adapters/x2text/helper.py
@@ -8,6 +8,7 @@
 from unstract.sdk.adapters.exceptions import AdapterError
 from unstract.sdk.adapters.utils import AdapterUtils
 from unstract.sdk.adapters.x2text.constants import X2TextConstants
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
 
 logger = logging.getLogger(__name__)
 
@@ -17,7 +18,9 @@ class X2TextHelper:
 
     @staticmethod
     def parse_response(
-        response: Response, out_file_path: Optional[str] = None
+        response: Response,
+        out_file_path: Optional[str] = None,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> tuple[str, bool]:
         """Parses the response from a request.
 
@@ -27,6 +30,8 @@ def parse_response(
             response (Response): Response to parse
             out_file_path (Optional[str], optional): Output file path to write
                  to, skipped if None or empty. Defaults to None.
+            fs (FileStorage): file storage object to perform file operations
+
         Returns:
             tuple[str, bool]: Response's content and status of parsing
         """
@@ -35,8 +40,7 @@ def parse_response(
         if isinstance(response.content, bytes):
             output = response.content.decode("utf-8")
         if out_file_path:
-            with open(out_file_path, "w", encoding="utf-8") as f:
-                f.write(output)
+            fs.write(path=out_file_path, mode="w", encoding="utf-8", data=output)
         return output, True
 
 
@@ -49,9 +53,7 @@ class UnstructuredHelper:
     PROCESS = "process"
 
     @staticmethod
-    def test_server_connection(
-        unstructured_adapter_config: dict[str, Any]
-    ) -> bool:
+    def test_server_connection(unstructured_adapter_config: dict[str, Any]) -> bool:
         UnstructuredHelper.make_request(
             unstructured_adapter_config, UnstructuredHelper.TEST_CONNECTION
         )
@@ -62,21 +64,23 @@ def process_document(
         unstructured_adapter_config: dict[str, Any],
         input_file_path: str,
         output_file_path: Optional[str] = None,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> str:
         try:
             response: Response
+            local_storage = FileStorage(FileStorageProvider.LOCAL)
+            if not local_storage.exists(input_file_path):
+                fs.download(from_path=input_file_path, to_path=input_file_path)
             with open(input_file_path, "rb") as input_f:
-                mime_type = AdapterUtils.get_file_mime_type(
-                    input_file=input_file_path
-                )
+                mime_type = AdapterUtils.get_file_mime_type(input_file=input_file_path)
                 files = {"file": (input_file_path, input_f, mime_type)}
                 response = UnstructuredHelper.make_request(
                     unstructured_adapter_config=unstructured_adapter_config,
                     request_type=UnstructuredHelper.PROCESS,
                     files=files,
                 )
             output, is_success = X2TextHelper.parse_response(
-                response=response, out_file_path=output_file_path
+                response=response, out_file_path=output_file_path, fs=fs
             )
             if not is_success:
                 raise AdapterError("Couldn't extract text from file")
@@ -95,9 +99,7 @@ def make_request(
         request_type: str,
         **kwargs: dict[Any, Any],
     ) -> Response:
-        unstructured_url = unstructured_adapter_config.get(
-            UnstructuredHelper.URL
-        )
+        unstructured_url = unstructured_adapter_config.get(UnstructuredHelper.URL)
 
         x2text_service_url = unstructured_adapter_config.get(
             X2TextConstants.X2TEXT_HOST

diff --git a/src/unstract/sdk/adapters/x2text/llama_parse/src/llama_parse.py b/src/unstract/sdk/adapters/x2text/llama_parse/src/llama_parse.py
@@ -11,6 +11,7 @@
 from unstract.sdk.adapters.x2text.dto import TextExtractionResult
 from unstract.sdk.adapters.x2text.llama_parse.src.constants import LlamaParseConfig
 from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
+from unstract.sdk.file_storage import FileStorage, FileStorageProvider
 
 logger = logging.getLogger(__name__)
 
@@ -46,8 +47,8 @@ def get_json_schema() -> str:
     def _call_parser(
         self,
         input_file_path: str,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
     ) -> str:
-
         parser = LlamaParse(
             api_key=self.config.get(LlamaParseConfig.API_KEY),
             base_url=self.config.get(LlamaParseConfig.BASE_URL),
@@ -61,7 +62,9 @@ def _call_parser(
             file_extension = pathlib.Path(input_file_path).suffix
             if not file_extension:
                 try:
-                    input_file_extension = AdapterUtils.guess_extention(input_file_path)
+                    input_file_extension = AdapterUtils.guess_extention(
+                        input_file_path, fs
+                    )
                     input_file_path_copy = input_file_path
                     input_file_path = ".".join(
                         (input_file_path_copy, input_file_extension)
@@ -70,7 +73,8 @@ def _call_parser(
                     logger.error("Exception raised while handling input file.")
                     raise AdapterError(str(os_err))
 
-            documents = parser.load_data(input_file_path)
+            file_bytes = fs.read(path=input_file_path, mode="rb")
+            documents = parser.load_data(file_bytes)
 
         except ConnectError as connec_err:
             logger.error(f"Invalid Base URL given. : {connec_err}")
@@ -91,13 +95,17 @@ def process(
         self,
         input_file_path: str,
         output_file_path: Optional[str] = None,
+        fs: FileStorage = FileStorage(provider=FileStorageProvider.LOCAL),
         **kwargs: dict[Any, Any],
     ) -> TextExtractionResult:
-
-        response_text = self._call_parser(input_file_path=input_file_path)
+        response_text = self._call_parser(input_file_path=input_file_path, fs=fs)
         if output_file_path:
-            with open(output_file_path, "w", encoding="utf-8") as f:
-                f.write(response_text)
+            fs.write(
+                path=output_file_path,
+                mode="w",
+                encoding="utf-8",
+                data=response_text,
+            )
 
         return TextExtractionResult(extracted_text=response_text)