Skip to content

Commit

Permalink
Revert "Update check to consider number of restarts" (unskript#1080)
Browse files Browse the repository at this point in the history
  • Loading branch information
jayasimha-raghavan-unskript authored Jun 24, 2024
1 parent 821ad59 commit 433e7d2
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 76 deletions.
56 changes: 19 additions & 37 deletions Kubernetes/legos/k8s_get_oomkilled_pods/k8s_get_oomkilled_pods.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
# All rights reserved.
#
import pprint
from datetime import datetime, timedelta, timezone
import datetime
from datetime import timezone
from typing import Tuple, Optional
from pydantic import BaseModel, Field
from kubernetes import client
Expand All @@ -21,11 +22,6 @@ class InputSchema(BaseModel):
description='Time interval in hours. This time window is used to check if POD good OOMKilled. Default is 24 hours.',
title="Time Interval"
)
restart_threshold: int = Field(
10,
description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
title='Restart Threshold'
)



Expand All @@ -37,23 +33,9 @@ def k8s_get_oomkilled_pods_printer(output):
def format_datetime(dt):
    """Format a datetime as the string 'YYYY-MM-DD HH:MM:SS UTC'.

    :type dt: datetime.datetime
    :param dt: Timestamp to render. Timezone-aware values are converted to
        UTC before formatting; naive values are formatted as-is (they are
        assumed to already hold UTC clock fields).

    :rtype: str
    """
    # The trailing "UTC" is a fixed literal, so it is only truthful when the
    # clock fields really are UTC. Normalize aware datetimes first.
    if dt.tzinfo is not None and dt.utcoffset() is not None:
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')

def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events for a pod within the given time interval.

    :type v1: object
    :param v1: Client object exposing ``list_namespaced_event(namespace,
        field_selector=...)`` and returning an object with an ``items`` list.

    :type pod_name: str
    :param pod_name: Name of the pod the events must refer to.

    :type namespace: str
    :param namespace: Namespace of the pod; also the namespace that is listed.

    :type time_interval: int
    :param time_interval: Look-back window in hours, measured from now (UTC).

    :rtype: int, the number of matching event objects (not the sum of any
        per-event repeat counter).
    """
    current_time = datetime.now(timezone.utc)
    start_time = current_time - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    def _as_utc(timestamp):
        # Naive timestamps are taken to be UTC; aware ones are *converted*.
        # Relabeling an aware value with replace(tzinfo=...) would shift the
        # instant it represents by its UTC offset.
        if timestamp.tzinfo is None or timestamp.utcoffset() is None:
            return timestamp.replace(tzinfo=timezone.utc)
        return timestamp.astimezone(timezone.utc)

    restart_reasons = ("BackOff", "CrashLoopBackOff")
    # NOTE(review): each event object is counted once; if the API aggregates
    # repeated occurrences into a single event, this undercounts restarts --
    # confirm against the Event API before relying on it as a restart count.
    return sum(
        1
        for event in (event_list.items or [])
        if event.reason in restart_reasons
        and event.last_timestamp
        and start_time <= _as_utc(event.last_timestamp) <= current_time
    )


def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24, restart_threshold: int = 10) -> Tuple:
def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=24) -> Tuple:
"""k8s_get_oomkilled_pods This function returns the pods that have OOMKilled event in the container last states
:type handle: Object
Expand Down Expand Up @@ -94,12 +76,16 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
# Get Current Time in UTC
current_time = datetime.datetime.now(timezone.utc)
# Get time interval to check (or 24 hour) reference and convert to UTC
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)


for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

# Ensure container_statuses is not None before iterating
container_statuses = pod.status.container_statuses
Expand All @@ -111,18 +97,14 @@ def k8s_get_oomkilled_pods(handle, namespace: str = "", time_interval_to_check=2
container_name = container_status.name
last_state = container_status.last_state
if last_state and last_state.terminated and last_state.terminated.reason == "OOMKilled":
oom_time = last_state.terminated.finished_at
termination_time = last_state.terminated.finished_at
termination_time = termination_time.replace(tzinfo=timezone.utc)
# If termination time is greater than interval_time_to_check meaning
# the POD has gotten OOMKilled in the last 24 hours and the number of restarts for
# that pod is greater than 10, so lets flag it!
if oom_time and oom_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_oom_time = format_datetime(oom_time)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"termination_time": formatted_oom_time,
"restarts": recent_restarts
})
return (False, result) if result else (True, None)
# the POD has gotten OOMKilled within the configured time interval, so let's flag it!
if termination_time and termination_time >= interval_time_to_check:
formatted_termination_time = format_datetime(termination_time)
formatted_interval_time_to_check = format_datetime(interval_time_to_check)
result.append({"pod": pod_name, "namespace": namespace, "container": container_name, "termination_time":formatted_termination_time,"interval_time_to_check": formatted_interval_time_to_check})

return (False, result) if result else (True, None)

Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from kubernetes import client
from kubernetes.client.rest import ApiException
from tabulate import tabulate
from datetime import datetime, timedelta, timezone
import datetime
from datetime import timezone


class InputSchema(BaseModel):
Expand All @@ -19,13 +20,7 @@ class InputSchema(BaseModel):
time_interval_to_check: int = Field(
24,
description='Time interval in hours. This time window is used to check if POD was in Crashloopback. Default is 24 hours.',
title=
"Time Interval"
)
restart_threshold: int = Field(
10,
description='The threshold for the number of restarts within the specified time interval. Default is 10 restarts.',
title='Restart Threshold'
title="Time Interval"
)


Expand All @@ -43,21 +38,7 @@ def format_datetime(dt):
# Format datetime to a string 'YYYY-MM-DD HH:MM:SS UTC'
return dt.strftime('%Y-%m-%d %H:%M:%S UTC')

def fetch_restart_events(v1, pod_name, namespace, time_interval):
    """Count restart-related events for a pod within the given time interval.

    :type v1: object
    :param v1: Client object exposing ``list_namespaced_event(namespace,
        field_selector=...)`` and returning an object with an ``items`` list.

    :type pod_name: str
    :param pod_name: Name of the pod the events must refer to.

    :type namespace: str
    :param namespace: Namespace of the pod; also the namespace that is listed.

    :type time_interval: int
    :param time_interval: Look-back window in hours, measured from now (UTC).

    :rtype: int, the number of matching event objects (not the sum of any
        per-event repeat counter).
    """
    current_time = datetime.now(timezone.utc)
    start_time = current_time - timedelta(hours=time_interval)
    field_selector = f"involvedObject.name={pod_name},involvedObject.namespace={namespace}"
    event_list = v1.list_namespaced_event(namespace, field_selector=field_selector)

    def _as_utc(timestamp):
        # Naive timestamps are taken to be UTC; aware ones are *converted*.
        # Relabeling an aware value with replace(tzinfo=...) would shift the
        # instant it represents by its UTC offset.
        if timestamp.tzinfo is None or timestamp.utcoffset() is None:
            return timestamp.replace(tzinfo=timezone.utc)
        return timestamp.astimezone(timezone.utc)

    restart_reasons = ("BackOff", "CrashLoopBackOff")
    # NOTE(review): each event object is counted once; if the API aggregates
    # repeated occurrences into a single event, this undercounts restarts --
    # confirm against the Event API before relying on it as a restart count.
    return sum(
        1
        for event in (event_list.items or [])
        if event.reason in restart_reasons
        and event.last_timestamp
        and start_time <= _as_utc(event.last_timestamp) <= current_time
    )

def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24, restart_threshold=10) -> Tuple:
def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_interval_to_check=24) -> Tuple:
"""
k8s_get_pods_in_crashloopbackoff_state returns the pods that have CrashLoopBackOff state in their container statuses within the specified time interval.
Expand All @@ -71,9 +52,6 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
:param time_interval_to_check: (Optional) Integer, in hours, the interval within which the
state of the POD should be checked.
:type restart_threshold: int
:param restart_threshold: (Optional) Integer, the threshold of restarts to check against.
:rtype: Status, List of objects of pods, namespaces, and containers that are in CrashLoopBackOff state
"""
result = []
Expand All @@ -99,33 +77,33 @@ def k8s_get_pods_in_crashloopbackoff_state(handle, namespace: str = '', time_int
if pods is None:
raise ApiException("No pods returned from the Kubernetes API.")

interval_time_to_check = datetime.now(timezone.utc) - timedelta(hours=time_interval_to_check)
current_time = datetime.datetime.now(timezone.utc)
interval_time_to_check = current_time - datetime.timedelta(hours=time_interval_to_check)
interval_time_to_check = interval_time_to_check.replace(tzinfo=timezone.utc)

for pod in pods:
pod_name = pod.metadata.name
namespace = pod.metadata.namespace
container_statuses = pod.status.container_statuses
if container_statuses is None:
continue
recent_restarts = fetch_restart_events(v1, pod_name, namespace, time_interval_to_check)

for container_status in container_statuses:
container_name = container_status.name
if container_status.state and container_status.state.waiting and container_status.state.waiting.reason == "CrashLoopBackOff":
last_state = container_status.last_state
if last_state and last_state.terminated:
last_transition_time = last_state.terminated.finished_at
# Check if the last transition time to CrashLoopBackOff is within the specified interval
# and the number of restarts are greater threshold in the last 24 hours
if last_transition_time and last_transition_time.replace(tzinfo=timezone.utc) >= interval_time_to_check:
if recent_restarts > restart_threshold:
formatted_last_transition_time = format_datetime(last_transition_time)
# Check if the last transition time to CrashLoopBackOff is within the specified interval
if container_status.last_state and container_status.last_state.terminated:
last_transition_time = container_status.last_state.terminated.finished_at
if last_transition_time:
last_transition_time = last_transition_time.replace(tzinfo=timezone.utc)
if last_transition_time >= interval_time_to_check:
formatted_transition_time = format_datetime(last_transition_time)
formatted_interval_time_to_check = format_datetime(interval_time_to_check)
result.append({
"pod": pod_name,
"namespace": namespace,
"container": container_name,
"termination_time": formatted_last_transition_time,
"restarts": recent_restarts
"last_transition_time": formatted_transition_time,
"interval_time_to_check": formatted_interval_time_to_check
})

return (False, result) if result else (True, None)

0 comments on commit 433e7d2

Please sign in to comment.