From 922f7b5a7dd71a7c1a5aba16883dd589a5115ba0 Mon Sep 17 00:00:00 2001 From: charlieyl Date: Tue, 11 Feb 2025 11:57:18 +0800 Subject: [PATCH] [bugfix] Add deployment delete request to edges when deployment fails --- .../computing/scheduler/model_scheduler/master_job_runner.py | 5 +++++ python/fedml/core/mlops/mlops_device_perfs.py | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py index 00b08acfb..7efda82dd 100755 --- a/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py +++ b/python/fedml/computing/scheduler/model_scheduler/master_job_runner.py @@ -9,6 +9,7 @@ from multiprocessing import Queue import fedml +from .device_model_msg_object import FedMLModelMsgObject from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter from .device_client_constants import ClientConstants @@ -274,10 +275,14 @@ def process_deployment_result_message(self, topic=None, payload=None): # Avoid endless loop, if the rollback also failed, we should report the failure to the MLOps if self.replica_controller.under_rollback or self.is_fresh_endpoint: + logging.info(f"process deploy result, under_rollback {self.replica_controller.under_rollback}, is_fresh_endpoint {self.is_fresh_endpoint}") self.send_deployment_status( end_point_id, end_point_name, payload_json["model_name"], "", ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED, message_center=self.message_center) + # After reporting the failure to MLOps, also ask the edges to delete any replicas that were already deployed successfully, so their GPUs are released + model_msg_object = FedMLModelMsgObject(topic, payload) + self.send_deployment_delete_request_to_edges(payload, model_msg_object, message_center=self.message_center) return # Failure handler, send the rollback message to the worker devices only if it has not been rollback diff --git 
a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py index 945732e30..b7e3bb5ed 100644 --- a/python/fedml/core/mlops/mlops_device_perfs.py +++ b/python/fedml/core/mlops/mlops_device_perfs.py @@ -128,8 +128,7 @@ def setup_realtime_stats_process(self, sys_args): self.monitor_run_master_process.start() def report_device_realtime_stats_entry(self, sys_event, role, is_client=False): - logging.info(f"Report device realtime stats, role {role}, is_client {is_client}, process id {os.getpid()}") - + # logging.info(f"Report device realtime stats, role {role}, process id {os.getpid()}") self.device_realtime_stats_event = sys_event mqtt_mgr = MqttManager( self.args.mqtt_config_path["BROKER_HOST"],