From 46e90eb3f071d259d0f812dec12740692cbd1765 Mon Sep 17 00:00:00 2001
From: Jonas Maison
Date: Mon, 2 Oct 2023 08:36:25 +0200
Subject: [PATCH] fix: add warnings in `DataConnectionComputeDifferencesPayload`
 for sync cloud storage (#1499)

---
 src/kili/services/data_connection/__init__.py | 24 +++--
 src/kili/services/data_connection/azure.py    | 92 +++++++++++++------
 tests/e2e/test_cloud_storage.py               |  1 -
 3 files changed, 82 insertions(+), 35 deletions(-)

diff --git a/src/kili/services/data_connection/__init__.py b/src/kili/services/data_connection/__init__.py
index 70d21411d..ddcf84e55 100644
--- a/src/kili/services/data_connection/__init__.py
+++ b/src/kili/services/data_connection/__init__.py
@@ -20,6 +20,7 @@
     GQL_COMPUTE_DATA_CONNECTION_DIFFERENCES,
     GQL_VALIDATE_DATA_DIFFERENCES,
 )
+from kili.services.project import get_project_field
 
 LOGGER = None
 
@@ -107,7 +108,7 @@ def compute_differences(kili, data_connection_id: str) -> Dict:
 
     data_integration = data_connection["dataIntegration"]
 
-    blob_paths = None
+    blob_paths = warnings = None
 
     # for azure using credentials, it is required to provide the blob paths to compute the diffs
     if (
@@ -124,22 +125,31 @@
         try:
             # pylint: disable=import-outside-toplevel
-            from .azure import (
-                get_blob_paths_azure_data_connection_with_service_credentials,
-            )
+            from .azure import AzureBucket
         except ImportError as err:
             raise ImportError(
                 "The azure-storage-blob package is required to use Azure buckets. "
                 " Run `pip install kili[azure]` to install it."
             ) from err
 
-        blob_paths = get_blob_paths_azure_data_connection_with_service_credentials(
-            data_connection=data_connection, data_integration=data_integration
+        blob_paths, warnings = AzureBucket(
+            sas_token=data_integration["azureSASToken"],
+            connection_url=data_integration["azureConnectionURL"],
+        ).get_blob_paths_azure_data_connection_with_service_credentials(
+            data_connection["selectedFolders"],
+            input_type=get_project_field(
+                kili,
+                project_id=get_data_connection(
+                    kili, data_connection_id=data_connection_id, fields=("projectId",)
+                )["projectId"],
+                field="inputType",
+            ),
         )
 
     variables: Dict[str, Any] = {"where": {"id": data_connection_id}}
     if blob_paths is not None:
-        variables["data"] = {"blobPaths": blob_paths}
+        variables["data"] = {"blobPaths": blob_paths, "warnings": warnings}
+
     result = kili.graphql_client.execute(GQL_COMPUTE_DATA_CONNECTION_DIFFERENCES, variables)
     return format_result("data", result, None, kili.http_client)
 
 
diff --git a/src/kili/services/data_connection/azure.py b/src/kili/services/data_connection/azure.py
index 93d47ab05..5abef095c 100644
--- a/src/kili/services/data_connection/azure.py
+++ b/src/kili/services/data_connection/azure.py
@@ -1,11 +1,13 @@
 """Code specific to Azure blob storage."""
 
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 
 from azure.storage.blob import BlobServiceClient
 
+from kili.domain.project import InputType
+
 
 class AzureBucket:
     """Class for Azure blob storage buckets."""
@@ -33,10 +35,6 @@ def _split_connection_url_into_storage_account_and_container_name(
         container_name = url_connection.path.lstrip("/")
         return storage_account, container_name
 
-    def get_blob_paths(self) -> List[str]:
-        """List files in the Azure bucket."""
-        return list(self.storage_bucket.list_blob_names())
-
     def get_blob_paths_as_tree(self) -> Dict:
         """Get a tree representation of the Azure bucket.
 
@@ -58,28 +56,68 @@ def get_blob_paths_as_tree(self) -> Dict:
 
         return filetree
 
+    def get_blob_paths_azure_data_connection_with_service_credentials(
+        self, selected_folders: Optional[List[str]], input_type: InputType
+    ) -> Tuple[List[str], List[Optional[str]]]:
+        """Get the blob paths for an Azure data connection using service credentials."""
+        blob_paths = []
+        warnings = set()
+        for blob in self.storage_bucket.list_blobs():
+            if not hasattr(blob, "name") or not isinstance(blob.name, str):
+                continue
+
+            # blob_paths_in_bucket contains all blob paths in the bucket, we need to filter them
+            # to keep only the ones in the data connection selected folders
+            if isinstance(selected_folders, List) and not any(
+                blob.name.startswith(selected_folder) for selected_folder in selected_folders
+            ):
+                continue
+
+            has_content_type_field = (
+                hasattr(blob, "content_settings")
+                and hasattr(blob.content_settings, "content_type")
+                and isinstance(blob.content_settings.content_type, str)
+            )
+            if not has_content_type_field:
+                warnings.add("Objects with missing content-type were ignored")
+
+            elif not self._is_content_type_compatible_with_input_type(
+                blob.content_settings.content_type,  # pyright: ignore[reportGeneralTypeIssues]
+                input_type,
+            ):
+                warnings.add(
+                    "Objects with unsupported content-type for this type of project were ignored"
+                )
 
-def get_blob_paths_azure_data_connection_with_service_credentials(
-    data_integration: Dict, data_connection: Dict
-) -> List[str]:
-    """Get the blob paths for an Azure data connection using service credentials."""
-    azure_client = AzureBucket(
-        sas_token=data_integration["azureSASToken"],
-        connection_url=data_integration["azureConnectionURL"],
-    )
-
-    blob_paths = azure_client.get_blob_paths()
-
-    # blob_paths_in_bucket contains all blob paths in the bucket, we need to filter them
-    # to keep only the ones in the data connection selected folders
-    if isinstance(data_connection["selectedFolders"], List):
-        blob_paths = [
-            blob_path
-            for blob_path in blob_paths
-            if any(
-                blob_path.startswith(selected_folder)
-                for selected_folder in data_connection["selectedFolders"]
+            else:
+                blob_paths.append(blob.name)
+
+        return blob_paths, list(warnings)
+
+    @staticmethod
+    def _is_content_type_compatible_with_input_type(
+        content_type: str, input_type: InputType
+    ) -> bool:
+        """Check if the content type is compatible with the input type."""
+        if input_type == "IMAGE":
+            return content_type.startswith("image")
+
+        if input_type == "VIDEO":
+            return content_type.startswith("video")
+
+        if input_type == "PDF":
+            return content_type.startswith("application/pdf")
+
+        if input_type == "TEXT":
+            return any(
+                content_type.startswith(text_type)
+                for text_type in (
+                    "application/json",
+                    "text/plain",
+                    "text/html",
+                    "text/csv",
+                    "text/xml",
+                )
             )
-        ]
-
-    return blob_paths
+
+        raise ValueError(f"Unknown project input type: {input_type}")
diff --git a/tests/e2e/test_cloud_storage.py b/tests/e2e/test_cloud_storage.py
index 9208e282d..c52db6e33 100644
--- a/tests/e2e/test_cloud_storage.py
+++ b/tests/e2e/test_cloud_storage.py
@@ -81,7 +81,6 @@ def is_same_endpoint(endpoint_short_name: str, endpoint_url: str) -> bool:
         ("STAGING", "GCP", "f474c0170c8daa09ec2e368ce4720c73", None, 5),
     ],
 )
-@pytest.mark.skip("to fix")
 def test_e2e_synchronize_cloud_storage_connection(
     kili: Kili,
     src_project: Dict,
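
Note on the new filtering behaviour: the sketch below reproduces the content-type check that get_blob_paths_azure_data_connection_with_service_credentials now performs, but against stubbed blob objects so it runs without an Azure account. FakeBlob, FakeContentSettings, is_compatible, and filter_blobs are hypothetical stand-ins written for illustration; they are not part of the patch or of azure-storage-blob.

# Minimal, self-contained sketch of the patch's content-type filtering.
# FakeBlob/FakeContentSettings model only the two attributes the new
# method reads: .name and .content_settings.content_type.
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple


@dataclass
class FakeContentSettings:
    content_type: Optional[str]


@dataclass
class FakeBlob:
    name: str
    content_settings: FakeContentSettings


def is_compatible(content_type: str, input_type: str) -> bool:
    """Mirror of the patch's _is_content_type_compatible_with_input_type."""
    prefixes = {
        "IMAGE": ("image",),
        "VIDEO": ("video",),
        "PDF": ("application/pdf",),
        "TEXT": ("application/json", "text/plain", "text/html", "text/csv", "text/xml"),
    }
    return any(content_type.startswith(p) for p in prefixes[input_type])


def filter_blobs(blobs: List[FakeBlob], input_type: str) -> Tuple[List[str], List[str]]:
    """Keep compatible blobs; collect one warning per class of ignored object."""
    blob_paths: List[str] = []
    warnings: Set[str] = set()
    for blob in blobs:
        content_type = blob.content_settings.content_type
        if not isinstance(content_type, str):
            warnings.add("Objects with missing content-type were ignored")
        elif not is_compatible(content_type, input_type):
            warnings.add(
                "Objects with unsupported content-type for this type of project were ignored"
            )
        else:
            blob_paths.append(blob.name)
    return blob_paths, list(warnings)


paths, warns = filter_blobs(
    [
        FakeBlob("img/a.jpg", FakeContentSettings("image/jpeg")),
        FakeBlob("vid/b.mp4", FakeContentSettings("video/mp4")),
        FakeBlob("raw/c.bin", FakeContentSettings(None)),
    ],
    input_type="IMAGE",
)
assert paths == ["img/a.jpg"]
assert len(warns) == 2  # one warning per reason; the set deduplicates repeats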
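
A design note on the selectedFolders filter: the patch keeps it client-side (str.startswith over every listed blob), which needs one pass over the whole container. azure-storage-blob can also push a prefix filter to the service via list_blobs(name_starts_with=...), which may matter on large buckets. A sketch under stated assumptions — the account URL, SAS token, and container name below are hypothetical placeholders, not values from this patch:

# Sketch: server-side prefix filtering with azure-storage-blob.
# All endpoint/credential values are hypothetical placeholders.
from azure.storage.blob import BlobServiceClient

service = BlobServiceClient(
    account_url="https://myaccount.blob.core.windows.net",  # hypothetical
    credential="<sas_token>",  # hypothetical; supply a real SAS token
)
container = service.get_container_client("my-container")  # hypothetical

# One name_starts_with call per selected folder replaces the client-side
# startswith loop; BlobProperties exposes both fields the patch relies on.
for blob in container.list_blobs(name_starts_with="img/"):
    print(blob.name, blob.content_settings.content_type)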