Commit

add debug logs
charlieyl committed Feb 10, 2025
1 parent 52ce93f commit 9c776ff
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions python/fedml/core/mlops/mlops_device_perfs.py
@@ -10,6 +10,7 @@
import psutil

from fedml.computing.scheduler.comm_utils import sys_utils
+from fedml.computing.scheduler.comm_utils.job_utils import JobRunnerUtils
from .device_info_report_protocol import FedMLDeviceInfoReportProtocol
from .mlops_utils import MLOpsUtils
from .system_stats import SysStats
@@ -127,7 +128,7 @@ def setup_realtime_stats_process(self, sys_args):
        self.monitor_run_master_process.start()

    def report_device_realtime_stats_entry(self, sys_event, role, is_client=False):
-        # print(f"Report device realtime stats, process id {os.getpid()}")
+        logging.info(f"Report device realtime stats, role {role}, is_client {is_client}, process id {os.getpid()}")

        self.device_realtime_stats_event = sys_event
        mqtt_mgr = MqttManager(
@@ -225,14 +226,17 @@ def report_device_realtime_stats_entry(self, sys_event, role, is_client=False):
    def report_gpu_device_info(edge_id, mqtt_mgr=None):
        total_mem, free_mem, total_disk_size, free_disk_size, cup_utilization, cpu_cores, gpu_cores_total, \
            gpu_cores_available, sent_bytes, recv_bytes, gpu_available_ids = sys_utils.get_sys_realtime_stats()
+        logging.info(f"report_gpu_device_info gpu_available_ids {gpu_available_ids} from realtime stats")

        topic_name = "ml_client/mlops/gpu_device_info"

        # We should report realtime available gpu count to MLOps, not from local redis cache.
+        # Use gpu_available_ids from sys_utils.get_sys_realtime_stats()
        # Do not use the following two lines as the realtime available gpu ids.
-        # gpu_available_ids = JobRunnerUtils.get_available_gpu_id_list(edge_id)
-        # gpu_available_ids = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids)
+        gpu_available_ids_from_cache = JobRunnerUtils.get_available_gpu_id_list(edge_id)
+        gpu_available_ids_from_cache_trimmed = JobRunnerUtils.trim_unavailable_gpu_ids(gpu_available_ids_from_cache)
+        logging.info(f"report_gpu_device_info gpu_available_ids_from_cache {gpu_available_ids_from_cache}, gpu_available_ids_from_cache_trimmed {gpu_available_ids_from_cache_trimmed}")

        gpu_cores_available = len(gpu_available_ids) if gpu_available_ids is not None else 0
        deploy_worker_id_list = list()
        try:
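
The added debug lines go through Python's standard logging module at INFO level rather than print, so they stay under the normal log-level and handler controls of the monitoring processes. Below is a minimal sketch, assuming a local run of the monitor, of a logging setup that would make these messages visible on the console; the handler, format, and sample values are illustrative assumptions, not FedML's production logging configuration.

import logging
import os

# Assumed local setup for inspecting the new debug output: send INFO-level
# records to the console with a timestamp and the emitting process id.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [pid %(process)d] %(levelname)s %(message)s",
)

# With this in place, the added calls produce lines such as:
#   2025-02-10 12:00:00,000 [pid 4242] INFO Report device realtime stats, role client, ...
role, is_client = "client", True  # hypothetical values for illustration
logging.info(f"Report device realtime stats, role {role}, is_client {is_client}, process id {os.getpid()}")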
