diff --git a/tests/python_client/chaos/chaos_test.sh b/tests/python_client/chaos/chaos_test.sh
index b2776e88776c9..caa26041d1c3b 100644
--- a/tests/python_client/chaos/chaos_test.sh
+++ b/tests/python_client/chaos/chaos_test.sh
@@ -1,104 +1,139 @@
 #!/bin/bash
-set -e
-set -x
-
-
-echo "check os env"
-platform='Linux'
-unamestr=$(uname)
-if [[ "$unamestr" == 'Linux' ]]; then
-    platform='Linux'
-elif [[ "$unamestr" == 'Darwin' ]]; then
-    platform='Mac'
-fi
-echo "platform: $platform"
+# Set PS4 prompt to display line number, function name and timestamp
+export PS4='+(${BASH_SOURCE}:${LINENO}):${FUNCNAME[0]:+${FUNCNAME[0]}(): }'
+
+set -e # Exit immediately if a command exits with a non-zero status
+set -x # Print commands and their arguments as they are executed
+
+# Store the initial directory
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Function to log important steps
+declare -A colors=(
+    ["INFO"]=$'\033[32m'  # Green
+    ["BOLD"]=$'\033[1m'   # Bold
+    ["TIME"]=$'\033[36m'  # Cyan
+    ["RESET"]=$'\033[0m'  # Reset
+)
+
+log_step() {
+    # Check if stdout is a terminal
+    if [ -t 1 ]; then
+        local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+        echo -e "${colors[INFO]}${colors[BOLD]}===> STEP [${colors[TIME]}${timestamp}${colors[INFO]}]: $1${colors[RESET]}"
+    else
+        # If not terminal (e.g., redirected to file), don't use colors
+        local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
+        echo "===> STEP [${timestamp}]: $1"
+    fi
+}
+
+# Cleanup function to ensure resources are properly released
+cleanup() {
+    local exit_code=$?
+    log_step "Performing cleanup (exit code: $exit_code)"
+
+    # Make sure we're in the correct directory for cleanup
+    cd "${SCRIPT_DIR}" || true
+
+    # Export logs
+    cur_time=$(date +%Y-%m-%d-%H-%M-%S)
+    if [ -f "../../scripts/export_log_k8s.sh" ]; then
+        bash ../../scripts/export_log_k8s.sh ${ns} ${release} k8s_log/${target_component}-${chaos_type}-${cur_time} || true
+    else
+        echo "Warning: export_log_k8s.sh not found in expected location"
+    fi
+
+    # Uninstall Milvus
+    if [ -f "./scripts/uninstall_milvus.sh" ]; then
+        bash ./scripts/uninstall_milvus.sh ${release} ${ns} || true
+    else
+        echo "Warning: uninstall_milvus.sh not found in expected location"
+    fi
+
+    exit $exit_code
+}
+
+# Set up trap to catch exits
+trap cleanup EXIT
+
+# Initialize basic variables
 ns="chaos-testing"
+cur_time=$(date +%H-%M-%S)
+target_component=${1:-"standalone"}
+chaos_type=${2:-"pod_kill"}
+node_num=${3:-1}
 
-# switch namespace
-# kubectl config set-context --current --namespace=${ns}
-# kubectl get pod
-
-# set parameters
-pod=${1:-"querynode"}
-chaos_type=${2:-"pod_kill"} #pod_kill or pod_failure
-chaos_task=${3:-"chaos-test"} # chaos-test or data-consist-test
-node_num=${4:-1} # cluster_1_node or cluster_n_nodes
+log_step "Initializing with parameters: target_component=${target_component}, chaos_type=${chaos_type}, node_num=${node_num}"
 
-cur_time=$(date +%H-%M-%S)
-release_name="test"-${pod}-${chaos_type/_/-}-${cur_time} # replace pod_kill to pod-kill
+# Generate release name
+release_name="test"-${target_component}-${chaos_type/_/-}-${cur_time}
 release=${RELEASE_NAME:-"${release_name}"}
-# replace separator to default
-chaos_type=${chaos_type/-/_} # default separator of chaos_type is _
-chaos_task=${chaos_task/_/-} # default separator of chaos_task is -
-echo "chaos_type: ${chaos_type}"
-# install milvus cluster for chaos testing
-pushd ./scripts
-echo "uninstall milvus if exist"
-bash uninstall_milvus.sh ${release} ${ns}|| true
-
-declare -A pod_map=(["querynode"]="queryNode" ["indexnode"]="indexNode" ["datanode"]="dataNode" ["proxy"]="proxy")
-echo "install milvus"
-if [[ ${pod} != *"standalone"* ]];
-then
-    echo "insatll cluster"
-    helm install --wait --timeout 360s ${release} milvus/milvus --set ${pod_map[${pod}]}.replicas=$node_num -f ../cluster-values.yaml -n=${ns}
+# Normalize chaos type format
+chaos_type=${chaos_type/-/_}
+log_step "Configured chaos_type: ${chaos_type}"
+
+# Change to scripts directory
+pushd ./scripts || exit 1
+log_step "Uninstalling existing Milvus instance if any"
+bash uninstall_milvus.sh ${release} ${ns} || true
+
+# Map component names
+declare -A target_component_map=(["querynode"]="queryNode" ["indexnode"]="indexNode" ["datanode"]="dataNode" ["proxy"]="proxy")
+log_step "Installing Milvus"
+
+# Install cluster configuration if not standalone
+if [[ ${target_component} != *"standalone"* ]]; then
+    log_step "Installing cluster configuration"
+    helm repo add milvus https://zilliztech.github.io/milvus-helm/
+    helm repo update milvus
+    helm install --wait --debug --timeout 360s ${release} milvus/milvus \
+        --set ${target_component_map[${target_component}]}.replicas=$node_num \
+        -f ../cluster-values.yaml -n=${ns}
 fi
-if [[ ${pod} == *"standalone"* ]];
-then
-    echo "install standalone"
-    helm install --wait --timeout 360s ${release} milvus/milvus -f ../standalone-values.yaml -n=${ns}
+# Install standalone configuration
+if [[ ${target_component} == *"standalone"* ]]; then
+    log_step "Installing standalone configuration"
+    helm install --wait --debug --timeout 360s ${release} milvus/milvus \
+        -f ../standalone-values.yaml -n=${ns}
 fi
-# wait all pod ready
+# Wait for all pods to be ready
+log_step "Waiting for pods to be ready"
 kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${release} -n ${ns} --timeout=360s
 kubectl wait --for=condition=Ready pod -l release=${release} -n ${ns} --timeout=360s
+kubectl get pod -o wide -l app.kubernetes.io/instance=${release} -n ${ns}
+popd || exit 1
 
-popd
-
-# replace chaos object as defined
-if [ "$platform" == "Mac" ];
-then
-    sed -i "" "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/${chaos_type}\/'/g" constants.py
-    sed -i "" "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${pod}_${chaos_type}.yaml\'/g" constants.py
-    sed -i "" "s/RELEASE_NAME =.*/RELEASE_NAME = \'${release}\'/g" constants.py
-else
-    sed -i "s/TESTS_CONFIG_LOCATION =.*/TESTS_CONFIG_LOCATION = \'chaos_objects\/${chaos_type}\/'/g" constants.py
-    sed -i "s/ALL_CHAOS_YAMLS =.*/ALL_CHAOS_YAMLS = \'chaos_${pod}_${chaos_type}.yaml\'/g" constants.py
-    sed -i "s/RELEASE_NAME =.*/RELEASE_NAME = \'${release}\'/g" constants.py
-fi
+# Configure service and get LoadBalancer IP
+log_step "Starting chaos testing"
+kubectl patch svc ${release}-milvus -p='{"spec":{"type":"LoadBalancer"}}' -n ${ns}
+loadbalancer_ip=$(kubectl get svc ${release}-milvus -n ${ns} -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+host=${loadbalancer_ip}
 
-# run chaos testing
-echo "start running testcase ${pod}"
-if [[ $release =~ "milvus" ]]
-then
-    host=$(kubectl get svc/${release} -o jsonpath="{.spec.clusterIP}")
-else
-    host=$(kubectl get svc/${release}-milvus -o jsonpath="{.spec.clusterIP}")
-fi
+# Run initial e2e tests
+log_step "Running initial e2e tests"
 pytest -s -v ../testcases/test_e2e.py --host "$host" --log-cli-level=INFO --capture=no
 python3 scripts/hello_milvus.py --host "$host"
 
-# chaos test
-if [ "$chaos_task" == "chaos-test" ];
-then
-    pytest -s -v test_chaos.py --host "$host" --log-cli-level=INFO --capture=no || echo "chaos test fail"
-fi
-# data consist test
-if [ "$chaos_task" == "data-consist-test" ];
-then
-    pytest -s -v test_chaos_data_consist.py --host "$host" --log-cli-level=INFO --capture=no || echo "chaos test fail"
-fi
-sleep 30
-echo "start running e2e test"
+# Run parallel chaos and request tests
+log_step "Starting parallel chaos and request tests"
+pytest test_chaos_apply.py --milvus_ns ${ns} --chaos_type ${chaos_type} \
+    --target_component ${target_component} --host "$host" \
+    --log-cli-level=INFO --capture=no &
+pytest testcases/test_single_request_operation.py --host "$host" \
+    --request_duration 15m --log-cli-level=INFO --capture=no &
+wait
+
+# Wait for system recovery after chaos tests
+log_step "Waiting for pods to be ready after chaos tests"
 kubectl wait --for=condition=Ready pod -l app.kubernetes.io/instance=${release} -n ${ns} --timeout=360s
 kubectl wait --for=condition=Ready pod -l release=${release} -n ${ns} --timeout=360s
 
+# Run final verification tests
+log_step "Running final e2e tests"
 pytest -s -v ../testcases/test_e2e.py --host "$host" --log-cli-level=INFO --capture=no || echo "e2e test fail"
-python3 scripts/hello_milvus.py --host "$host" || echo "e2e test fail"
-
-# save logs
-cur_time=$(date +%Y-%m-%d-%H-%M-%S)
-bash ../../scripts/export_log_k8s.sh ${ns} ${release} k8s_log/${pod}-${chaos_type}-${chaos_task}-${cur_time}
\ No newline at end of file
+python3 scripts/hello_milvus.py --host "$host" || echo "e2e test fail"
\ No newline at end of file
diff --git a/tests/python_client/chaos/cluster-values.yaml b/tests/python_client/chaos/cluster-values.yaml
index 316abd7d92ffd..fe09f9033d003 100644
--- a/tests/python_client/chaos/cluster-values.yaml
+++ b/tests/python_client/chaos/cluster-values.yaml
@@ -6,7 +6,7 @@ image:
   all:
     repository: milvusdb/milvus
     tag: master-latest
-    pullPolicy: IfNotPresent
+    pullPolicy: Always
 
 indexNode:
   resources:
@@ -14,6 +14,12 @@ indexNode:
       cpu: 2
     limits:
       cpu: 8
+proxy:
+  resources:
+    requests:
+      cpu: 2
+    limits:
+      cpu: 8
 
 etcd:
   replicaCount: 3
@@ -27,119 +33,11 @@ minio:
       memory: 256Mi
 
 kafka:
-  enabled: false
+  enabled: true
   name: kafka
   replicaCount: 3
   defaultReplicationFactor: 2
-
 pulsar:
-  enabled: true
-  extra:
-    bastion: no
-    wsproxy: no
-
-  autorecovery:
-    resources:
-      requests:
-        cpu: 0.1
-        memory: 256Mi
-  proxy:
-    replicaCount: 1
-    resources:
-      requests:
-        cpu: 0.1
-        memory: 256Mi
-    wsResources:
-      requests:
-        memory: 256Mi
-        cpu: 0.1
-    configData:
-      PULSAR_MEM: >
-        -Xms256m -Xmx256m
-      PULSAR_GC: >
-        -XX:MaxDirectMemorySize=512m
-      httpNumThreads: "50"
-
-  bookkeeper:
-    replicaCount: 2
-    resources:
-      requests:
-        cpu: 0.1
-        memory: 512Mi
-    configData:
-      PULSAR_MEM: >
-        -Xms512m
-        -Xmx512m
-        -XX:MaxDirectMemorySize=1024m
-      PULSAR_GC: >
-        -Dio.netty.leakDetectionLevel=disabled
-        -Dio.netty.recycler.linkCapacity=1024
-        -XX:+UseG1GC -XX:MaxGCPauseMillis=10
-        -XX:+ParallelRefProcEnabled
-        -XX:+UnlockExperimentalVMOptions
-        -XX:+DoEscapeAnalysis
-        -XX:ParallelGCThreads=32
-        -XX:ConcGCThreads=32
-        -XX:G1NewSizePercent=50
-        -XX:+DisableExplicitGC
-        -XX:-ResizePLAB
-        -XX:+ExitOnOutOfMemoryError
-        -XX:+PerfDisableSharedMem
-        -XX:+PrintGCDetails
-      nettyMaxFrameSizeBytes: "104867840"
-  zookeeper:
-    replicaCount: 1
-    resources:
-      requests:
-        cpu: 0.1
-        memory: 256Mi
-    configData:
-      PULSAR_MEM: >
-        -Xms512m
-        -Xmx512m
-      PULSAR_GC: >
-        -Dcom.sun.management.jmxremote
-        -Djute.maxbuffer=10485760
-        -XX:+ParallelRefProcEnabled
-        -XX:+UnlockExperimentalVMOptions
-        -XX:+DoEscapeAnalysis
-        -XX:+DisableExplicitGC
-        -XX:+PerfDisableSharedMem
-        -Dzookeeper.forceSync=no
-  broker:
-    replicaCount: 1
-    resources:
-      requests:
-        cpu: 0.1
-        memory: 512Mi
-    configData:
-      PULSAR_MEM: >
-        -Xms512m
-        -Xmx512m
-        -XX:MaxDirectMemorySize=1024m
-      PULSAR_GC: >
-        -Dio.netty.leakDetectionLevel=disabled
-        -Dio.netty.recycler.linkCapacity=1024
-        -XX:+ParallelRefProcEnabled
-        -XX:+UnlockExperimentalVMOptions
-        -XX:+DoEscapeAnalysis
-        -XX:ParallelGCThreads=32
-        -XX:ConcGCThreads=32
-        -XX:G1NewSizePercent=50
-        -XX:+DisableExplicitGC
-        -XX:-ResizePLAB
-        -XX:+ExitOnOutOfMemoryError
-      maxMessageSize: "104857600"
-      defaultRetentionTimeInMinutes: "10080"
-      defaultRetentionSizeInMB: "8192"
-      backlogQuotaDefaultLimitGB: "8"
-      backlogQuotaDefaultRetentionPolicy: producer_exception
-
-extraConfigFiles:
-  user.yaml: |+
-    dataCoord:
-      compaction:
-        indexBasedCompaction: false
-    indexCoord:
-      scheduler:
-        interval: 100
\ No newline at end of file
+  enabled: false
+pulsarv3:
+  enabled: false
\ No newline at end of file
diff --git a/tests/python_client/chaos/standalone-values.yaml b/tests/python_client/chaos/standalone-values.yaml
index 8a08f058d44b7..180bd3feb0c11 100644
--- a/tests/python_client/chaos/standalone-values.yaml
+++ b/tests/python_client/chaos/standalone-values.yaml
@@ -8,7 +8,7 @@ image:
   all:
     repository: milvusdb/milvus
     tag: master-latest
-    pullPolicy: IfNotPresent
+    pullPolicy: Always
 standalone:
   resources:
     limits:
@@ -17,6 +17,11 @@ standalone:
     requests:
       cpu: 4
       memory: 8Gi
+pulsarv3:
+  enabled: false
+
+pulsar:
+  enabled: false
 kafka:
   enabled: false
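
For reference, the reworked chaos_test.sh now takes three positional arguments with defaults: target_component (default "standalone"), chaos_type (default "pod_kill") and node_num (default 1), replacing the old pod/chaos_task parameters. A minimal invocation sketch, assuming kubectl and helm point at a cluster that can assign LoadBalancer IPs, the chaos-testing namespace exists, and the chaos operator used by test_chaos_apply.py is already installed:

    # run from the chaos test directory so the relative values files and scripts resolve
    cd tests/python_client/chaos
    # cluster mode: 2 querynode replicas, pod_kill chaos
    bash chaos_test.sh querynode pod_kill 2
    # standalone mode with pod_failure chaos; node_num falls back to 1
    bash chaos_test.sh standalone pod_failure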