From 46e90eb3f071d259d0f812dec12740692cbd1765 Mon Sep 17 00:00:00 2001
From: Jonas Maison
Date: Mon, 2 Oct 2023 08:36:25 +0200
Subject: [PATCH] fix: add warnings in `DataConnectionComputeDifferencesPayload`
 for sync cloud storage (#1499)

---
 src/kili/services/data_connection/__init__.py | 24 +++--
 src/kili/services/data_connection/azure.py    | 92 +++++++++++++------
 tests/e2e/test_cloud_storage.py               |  1 -
 3 files changed, 82 insertions(+), 35 deletions(-)

diff --git a/src/kili/services/data_connection/__init__.py b/src/kili/services/data_connection/__init__.py
index 70d21411d..ddcf84e55 100644
--- a/src/kili/services/data_connection/__init__.py
+++ b/src/kili/services/data_connection/__init__.py
@@ -20,6 +20,7 @@
     GQL_COMPUTE_DATA_CONNECTION_DIFFERENCES,
     GQL_VALIDATE_DATA_DIFFERENCES,
 )
+from kili.services.project import get_project_field
 
 LOGGER = None
 
@@ -107,7 +108,7 @@ def compute_differences(kili, data_connection_id: str) -> Dict:
 
     data_integration = data_connection["dataIntegration"]
 
-    blob_paths = None
+    blob_paths = warnings = None
 
     # for azure using credentials, it is required to provide the blob paths to compute the diffs
     if (
@@ -124,22 +125,31 @@
         try:
             # pylint: disable=import-outside-toplevel
-            from .azure import (
-                get_blob_paths_azure_data_connection_with_service_credentials,
-            )
+            from .azure import AzureBucket
         except ImportError as err:
             raise ImportError(
                 "The azure-storage-blob package is required to use Azure buckets. "
                 " Run `pip install kili[azure]` to install it."
             ) from err
 
-        blob_paths = get_blob_paths_azure_data_connection_with_service_credentials(
-            data_connection=data_connection, data_integration=data_integration
+        blob_paths, warnings = AzureBucket(
+            sas_token=data_integration["azureSASToken"],
+            connection_url=data_integration["azureConnectionURL"],
+        ).get_blob_paths_azure_data_connection_with_service_credentials(
+            data_connection["selectedFolders"],
+            input_type=get_project_field(
+                kili,
+                project_id=get_data_connection(
+                    kili, data_connection_id=data_connection_id, fields=("projectId",)
+                )["projectId"],
+                field="inputType",
+            ),
         )
 
     variables: Dict[str, Any] = {"where": {"id": data_connection_id}}
     if blob_paths is not None:
-        variables["data"] = {"blobPaths": blob_paths}
+        variables["data"] = {"blobPaths": blob_paths, "warnings": warnings}
+
     result = kili.graphql_client.execute(GQL_COMPUTE_DATA_CONNECTION_DIFFERENCES, variables)
     return format_result("data", result, None, kili.http_client)
 
 
diff --git a/src/kili/services/data_connection/azure.py b/src/kili/services/data_connection/azure.py
index 93d47ab05..5abef095c 100644
--- a/src/kili/services/data_connection/azure.py
+++ b/src/kili/services/data_connection/azure.py
@@ -1,11 +1,13 @@
 """Code specific to Azure blob storage."""
 
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 
 from azure.storage.blob import BlobServiceClient
 
+from kili.domain.project import InputType
+
 
 class AzureBucket:
     """Class for Azure blob storage buckets."""
@@ -33,10 +35,6 @@ def _split_connection_url_into_storage_account_and_container_name(
         container_name = url_connection.path.lstrip("/")
         return storage_account, container_name
 
-    def get_blob_paths(self) -> List[str]:
-        """List files in the Azure bucket."""
-        return list(self.storage_bucket.list_blob_names())
-
     def get_blob_paths_as_tree(self) -> Dict:
         """Get a tree representation of the Azure bucket.
 
@@ -58,28 +56,68 @@ def get_blob_paths_as_tree(self) -> Dict:
 
         return filetree
 
+    def get_blob_paths_azure_data_connection_with_service_credentials(
+        self, selected_folders: Optional[List[str]], input_type: InputType
+    ) -> Tuple[List[str], List[Optional[str]]]:
+        """Get the blob paths for an Azure data connection using service credentials."""
+        blob_paths = []
+        warnings = set()
+        for blob in self.storage_bucket.list_blobs():
+            if not hasattr(blob, "name") or not isinstance(blob.name, str):
+                continue
+
+            # blob_paths_in_bucket contains all blob paths in the bucket, we need to filter them
+            # to keep only the ones in the data connection selected folders
+            if isinstance(selected_folders, List) and not any(
+                blob.name.startswith(selected_folder) for selected_folder in selected_folders
+            ):
+                continue
+
+            has_content_type_field = (
+                hasattr(blob, "content_settings")
+                and hasattr(blob.content_settings, "content_type")
+                and isinstance(blob.content_settings.content_type, str)
+            )
+            if not has_content_type_field:
+                warnings.add("Objects with missing content-type were ignored")
+
+            elif not self._is_content_type_compatible_with_input_type(
+                blob.content_settings.content_type,  # pyright: ignore[reportGeneralTypeIssues]
+                input_type,
+            ):
+                warnings.add(
+                    "Objects with unsupported content-type for this type of project were ignored"
+                )
 
-def get_blob_paths_azure_data_connection_with_service_credentials(
-    data_integration: Dict, data_connection: Dict
-) -> List[str]:
-    """Get the blob paths for an Azure data connection using service credentials."""
-    azure_client = AzureBucket(
-        sas_token=data_integration["azureSASToken"],
-        connection_url=data_integration["azureConnectionURL"],
-    )
-
-    blob_paths = azure_client.get_blob_paths()
-
-    # blob_paths_in_bucket contains all blob paths in the bucket, we need to filter them
-    # to keep only the ones in the data connection selected folders
-    if isinstance(data_connection["selectedFolders"], List):
-        blob_paths = [
-            blob_path
-            for blob_path in blob_paths
-            if any(
-                blob_path.startswith(selected_folder)
-                for selected_folder in data_connection["selectedFolders"]
+            else:
+                blob_paths.append(blob.name)
+
+        return blob_paths, list(warnings)
+
+    @staticmethod
+    def _is_content_type_compatible_with_input_type(
+        content_type: str, input_type: InputType
+    ) -> bool:
+        """Check if the content type is compatible with the input type."""
+        if input_type == "IMAGE":
+            return content_type.startswith("image")
+
+        if input_type == "VIDEO":
+            return content_type.startswith("video")
+
+        if input_type == "PDF":
+            return content_type.startswith("application/pdf")
+
+        if input_type == "TEXT":
+            return any(
+                content_type.startswith(text_type)
+                for text_type in (
+                    "application/json",
+                    "text/plain",
+                    "text/html",
+                    "text/csv",
+                    "text/xml",
+                )
             )
-        ]
-
-    return blob_paths
+
+        raise ValueError(f"Unknown project input type: {input_type}")
diff --git a/tests/e2e/test_cloud_storage.py b/tests/e2e/test_cloud_storage.py
index 9208e282d..c52db6e33 100644
--- a/tests/e2e/test_cloud_storage.py
+++ b/tests/e2e/test_cloud_storage.py
@@ -81,7 +81,6 @@ def is_same_endpoint(endpoint_short_name: str, endpoint_url: str) -> bool:
         ("STAGING", "GCP", "f474c0170c8daa09ec2e368ce4720c73", None, 5),
     ],
 )
-@pytest.mark.skip("to fix")
 def test_e2e_synchronize_cloud_storage_connection(
     kili: Kili,
     src_project: Dict,
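
Note on the new filtering behaviour: the sketch below reproduces the content-type check that get_blob_paths_azure_data_connection_with_service_credentials now performs, but against stubbed blob objects so it runs without an Azure account. FakeBlob, FakeContentSettings, is_compatible, and filter_blobs are hypothetical stand-ins written for illustration; they are not part of the patch or of azure-storage-blob.

# Minimal, self-contained sketch of the patch's content-type filtering.
# FakeBlob/FakeContentSettings model only the two attributes the new
# method reads: .name and .content_settings.content_type.
from dataclasses import dataclass
from typing import List, Optional, Set, Tuple


@dataclass
class FakeContentSettings:
    content_type: Optional[str]


@dataclass
class FakeBlob:
    name: str
    content_settings: FakeContentSettings


def is_compatible(content_type: str, input_type: str) -> bool:
    """Mirror of the patch's _is_content_type_compatible_with_input_type."""
    prefixes = {
        "IMAGE": ("image",),
        "VIDEO": ("video",),
        "PDF": ("application/pdf",),
        "TEXT": ("application/json", "text/plain", "text/html", "text/csv", "text/xml"),
    }
    return any(content_type.startswith(p) for p in prefixes[input_type])


def filter_blobs(blobs: List[FakeBlob], input_type: str) -> Tuple[List[str], List[str]]:
    """Keep compatible blobs; collect one warning per class of ignored object."""
    blob_paths: List[str] = []
    warnings: Set[str] = set()
    for blob in blobs:
        content_type = blob.content_settings.content_type
        if not isinstance(content_type, str):
            warnings.add("Objects with missing content-type were ignored")
        elif not is_compatible(content_type, input_type):
            warnings.add(
                "Objects with unsupported content-type for this type of project were ignored"
            )
        else:
            blob_paths.append(blob.name)
    return blob_paths, list(warnings)


paths, warns = filter_blobs(
    [
        FakeBlob("img/a.jpg", FakeContentSettings("image/jpeg")),
        FakeBlob("vid/b.mp4", FakeContentSettings("video/mp4")),
        FakeBlob("raw/c.bin", FakeContentSettings(None)),
    ],
    input_type="IMAGE",
)
assert paths == ["img/a.jpg"]
assert len(warns) == 2  # one warning per reason; the set deduplicates repeats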
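
A design note on the selectedFolders filter: the patch keeps it client-side (str.startswith over every listed blob), which needs one pass over the whole container. azure-storage-blob can also push a prefix filter to the service via list_blobs(name_starts_with=...), which may matter on large buckets. A sketch under stated assumptions — the account URL, SAS token, and container name below are hypothetical placeholders, not values from this patch:

# Sketch: server-side prefix filtering with azure-storage-blob.
# All endpoint/credential values are hypothetical placeholders.
from azure.storage.blob import BlobServiceClient

service = BlobServiceClient(
    account_url="https://myaccount.blob.core.windows.net",  # hypothetical
    credential="<sas_token>",  # hypothetical; supply a real SAS token
)
container = service.get_container_client("my-container")  # hypothetical

# One name_starts_with call per selected folder replaces the client-side
# startswith loop; BlobProperties exposes both fields the patch relies on.
for blob in container.list_blobs(name_starts_with="img/"):
    print(blob.name, blob.content_settings.content_type)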