From 9c776ff76a23526cc14ea50afcf159d76a305b76 Mon Sep 17 00:00:00 2001
From: charlieyl <charlie@tensoropera.com>
Date: Mon, 10 Feb 2025 16:12:03 +0800
Subject: [PATCH] Add debug logs for device realtime stats and GPU device info reporting

---
 python/fedml/core/mlops/mlops_device_perfs.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/fedml/core/mlops/mlops_device_perfs.py b/python/fedml/core/mlops/mlops_device_perfs.py
index 0c2bde678..945732e30 100644
--- a/python/fedml/core/mlops/mlops_device_perfs.py
+++ b/python/fedml/core/mlops/mlops_device_perfs.py
@@ -10,6 +10,7 @@
 import psutil
 
 from fedml.computing.scheduler.comm_utils import sys_utils
+from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
 from .device_info_report_protocol import FedMLDeviceInfoReportProtocol
 from .mlops_utils import MLOpsUtils
 from .system_stats import SysStats
@@ -127,7 +128,7 @@ def setup_realtime_stats_process(self, sys_args):
                 self.monitor_run_master_process.start()
 
     def report_device_realtime_stats_entry(self, sys_event, role, is_client=False):
-        # print(f"Report device realtime stats, process id {os.getpid()}")
+        logging.info(f"Report device realtime stats, role {role}, is_client {is_client}, process id {os.getpid()}")
 
         self.device_realtime_stats_event = sys_event
         mqtt_mgr = MqttManager(
@@ -225,14 +226,17 @@ def report_device_realtime_stats_entry(self, sys_event, role, is_client=False):
     def report_gpu_device_info(edge_id, mqtt_mgr=None):
         total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, gpu_cores_total, \
             gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats()
+        logging.info(f"report_gpu_device_info gpu_available_ids{gpu_available_ids} from realtime stats ")
 
         topic_name = "ml_client/mlops/gpu_device_info"
 
         # We should report realtime available gpu count to MLOps, not from local redis cache.
         # Use gpu_available_ids from sys_utils.get_sys_realtime_stats()
         # Do not use the following two lines as the realtime available gpu ids.
-        # gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id)
-        # gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
+        gpu_available_ids_from_cache = JobRunnerUtils.get_available_gpu_id_list(edge_id)
+        gpu_available_ids_from_cache_trimmed = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids_from_cache)
+        logging.info(f"report_gpu_device_info gpu_available_ids_from_cache {gpu_available_ids_from_cache}, gpu_available_ids_from_cache_trimmed {gpu_available_ids_from_cache_trimmed}")
+
         gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0
         deploy_worker_id_list = list()
         try: