[bugfix] Handle deployment failure by deleting deployed replicas and …

…releasing GPU
FedML-AI · Feb 11, 2025 · a912c57 · a912c57
1 parent 46d766a
commit a912c57
Showing 1 changed file with 4 additions and 0 deletions.
diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py
@@ -11,6 +11,7 @@
 import fedml
 from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs
 from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter
+from .device_model_msg_object import FedMLModelMsgObject
 from .device_client_constants import ClientConstants
 from .device_model_cache import FedMLModelCache
 from .device_server_constants import ServerConstants
@@ -278,6 +279,9 @@ def process_deployment_result_message(self, topic=None, payload=None):
                     end_point_id, end_point_name, payload_json["model_name"], "",
                     ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED,
                     message_center=self.message_center)
+                # When reporting the failure to MLOps, also delete the replicas that were already deployed successfully so that their GPUs are released.
+                model_msg_object = FedMLModelMsgObject(topic, payload)
+                self.send_deployment_delete_request_to_edges(payload, model_msg_object, message_center=self.message_center)
                 return
 
             # Failure handler, send the rollback message to the worker devices only if it has not been rollback