diff --git a/rqd/rqd/rqconstants.py b/rqd/rqd/rqconstants.py index 95c5d8c44..34ea71d0a 100644 --- a/rqd/rqd/rqconstants.py +++ b/rqd/rqd/rqconstants.py @@ -251,7 +251,8 @@ if config.has_option(__override_section, "BACKUP_CACHE_PATH"): BACKUP_CACHE_PATH = config.get(__override_section, "BACKUP_CACHE_PATH") if config.has_option(__override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS"): - BACKUP_CACHE_TIME_TO_LIVE_SECONDS = config.getint(__override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS") + BACKUP_CACHE_TIME_TO_LIVE_SECONDS = config.getint( + __override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS") __docker_mounts = "docker.mounts" __docker_config = "docker.config" diff --git a/rqd/rqd/rqcore.py b/rqd/rqd/rqcore.py index c65948e9b..b49319e5a 100644 --- a/rqd/rqd/rqcore.py +++ b/rqd/rqd/rqcore.py @@ -244,7 +244,8 @@ def recoverCache(self): # pylint: enable=no-member running_frame.frameAttendantThread.start() - except: + # pylint: disable=broad-except + except Exception: pass # Ignore frames that got corrupted @@ -1409,8 +1410,8 @@ def setup(self): self.rqlog.waitForFile() # pylint: disable=broad-except except Exception as e: - err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) - raise RuntimeError(err) + err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e) + raise RuntimeError(err) finally: rqd.rqutil.permissionsLow() @@ -1422,7 +1423,7 @@ def run(self): """Thread initialization""" if self.recovery_mode: self.runRecovery() - return; + return log.info("Monitor frame started for frameId=%s", self.frameId) @@ -1459,6 +1460,7 @@ def run(self): self.postFrameAction() def postFrameAction(self): + """Action to be executed after a frame completes its execution""" self.rqCore.releaseCores(self.runFrame.num_cores, self.runFrame.attributes.get('CPU_LIST'), self.runFrame.attributes.get('GPU_LIST') @@ -1496,6 +1498,11 @@ def recoverDocker(self): self.__createEnvVariables() self.__writeHeader() + tempStatFile = "%srqd-stat-%s-%s" % 
(self.rqCore.machine.getTempPath(), + frameInfo.frameId, + time.time()) + self._tempLocations.append(tempStatFile) + try: log_stream = None with self.rqCore.docker_lock: @@ -1551,7 +1558,7 @@ def recoverDocker(self): except Exception as e: returncode = -1 msg = "Failed to recover frame container" - logging.exception(msg) + logging.warning(msg) self.rqlog.write("%s - The frame might have finishes during rqd's reinitialization " "- %s" % (msg, e), prependTimestamp=rqd.rqconstants.RQD_PREPEND_TIMESTAMP) @@ -1573,7 +1580,8 @@ def recoverDocker(self): frameInfo.exitSignal = 0 # Log frame start info - log.warning("Frame %s.%s(%s) with pid %s finished on container %s with exitStatus %s %s ", + log.warning( + "Frame %s.%s(%s) with pid %s finished on container %s with exitStatus %s %s", runFrame.job_name, runFrame.frame_name, frameInfo.frameId, @@ -1596,43 +1604,43 @@ def recoverDocker(self): self.__cleanup() def runRecovery(self): - """Recover a frame that was running before this instance started""" - if not self.recovery_mode: - return; - - log.info("Monitor recovered frame started for frameId=%s", self.frameId) - - runFrame = self.runFrame - run_on_docker = self.rqCore.docker is not None - - # pylint: disable=too-many-nested-blocks - try: - self.setup() - # Store frame in cache and register servant - self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) - - if run_on_docker: - self.recoverDocker() - elif platform.system() == "Linux": - # TODO - pass - elif platform.system() == "Windows": - # TODO - pass - elif platform.system() == "Darwin": - # TODO - pass - else: - self.runUnknown() - - # pylint: disable=broad-except - except Exception: - log.critical( - "Failed launchFrame: For %s due to: \n%s", - runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) - # Notifies the cuebot that there was an error launching - self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH - # Delay keeps the cuebot from spamming failing booking requests - 
time.sleep(10) - finally: - self.postFrameAction() + """Recover a frame that was running before this instance started""" + if not self.recovery_mode: + return + + log.info("Monitor recovered frame started for frameId=%s", self.frameId) + + runFrame = self.runFrame + run_on_docker = self.rqCore.docker is not None + + # pylint: disable=too-many-nested-blocks + try: + self.setup() + # Store frame in cache and register servant + self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo) + + if run_on_docker: + self.recoverDocker() + elif platform.system() == "Linux": + # TODO + pass + elif platform.system() == "Windows": + # TODO + pass + elif platform.system() == "Darwin": + # TODO + pass + else: + self.runUnknown() + + # pylint: disable=broad-except + except Exception: + log.critical( + "Failed launchFrame: For %s due to: \n%s", + runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info()))) + # Notifies the cuebot that there was an error launching + self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH + # Delay keeps the cuebot from spamming failing booking requests + time.sleep(10) + finally: + self.postFrameAction() diff --git a/rqd/tests/rqcore_test.py b/rqd/tests/rqcore_test.py index 000ac49b0..8372f0f50 100644 --- a/rqd/tests/rqcore_test.py +++ b/rqd/tests/rqcore_test.py @@ -666,19 +666,19 @@ def test_recoverCache_validBackup(self, attendant_patch): num_cores = 4 ) running_frame = rqd.rqnetwork.RunningFrame(self.rqcore, frame) - self.rqcore.cores.idle_cores = 8 self.rqcore.storeFrame(frameId, running_frame) + self.rqcore.cores.idle_cores = 8 + self.rqcore.cores.booked_cores = 0 self.rqcore.backupCache() - self.__cache = {} + self.rqcore._RqCore__cache = {} self.rqcore.recoverCache() - self.assertIn('frame123', self.rqcore._RqCore__cache) self.assertEqual(4, self.rqcore.cores.idle_cores) self.assertEqual(4, self.rqcore.cores.booked_cores) def test_recoverCache_invalidFrame(self): """Test recoverCache loads frame data from valid 
 backup file""" self.rqcore.backup_cache_path = 'cache.dat' - with open(self.rqcore.backup_cache_path, "w") as f: + with open(self.rqcore.backup_cache_path, "w", encoding='utf-8') as f: f.write("this is not a run frame") self.rqcore.recoverCache()