Skip to content

Commit

Permalink
[bugfix] Handle deployment failure by deleting deployed replicas and …
Browse files Browse the repository at this point in the history
…releasing GPU
  • Loading branch information
charlieyl committed Feb 11, 2025
1 parent 46d766a commit a912c57
Showing 1 changed file with 4 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import fedml
from fedml.core.mlops import MLOpsRuntimeLog, MLOpsConfigs
from fedml.core.mlops.mlops_runtime_log import MLOpsFormatter
from .device_model_msg_object import FedMLModelMsgObject
from .device_client_constants import ClientConstants
from .device_model_cache import FedMLModelCache
from .device_server_constants import ServerConstants
Expand Down Expand Up @@ -278,6 +279,9 @@ def process_deployment_result_message(self, topic=None, payload=None):
end_point_id, end_point_name, payload_json["model_name"], "",
ServerConstants.MSG_MODELOPS_DEPLOYMENT_STATUS_FAILED,
message_center=self.message_center)
# After reporting the failure to MLOps, delete any replicas that were already deployed successfully and release their GPUs.
model_msg_object = FedMLModelMsgObject(topic, payload)
self.send_deployment_delete_request_to_edges(payload, model_msg_object, message_center=self.message_center)
return

# Failure handler, send the rollback message to the worker devices only if it has not been rollback
Expand Down

0 comments on commit a912c57

Please sign in to comment.