Skip to content

Associate ssh key pair to dataplane node to help debug (#473) #59

Associate ssh key pair to dataplane node to help debug (#473)

Associate ssh key pair to dataplane node to help debug (#473) #59

Workflow file for this run

# End-to-end test workflow for KIT; triggered by pushes to main.
name: kit-e2e-test
on:
  push:
    branches:
      - main
# Token scopes granted to every job in this workflow.
permissions:
  id-token: write
  pull-requests: write
  contents: read
jobs:
# Single job; it is instantiated once per entry of the `os` matrix below.
kit:
strategy:
matrix:
# Runner images to test on. Later steps compare against these exact labels
# to choose between the linux and darwin code paths.
os: [ ubuntu-latest,macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
# Local action from this repository (path-relative `uses`); its contents are not visible here.
- uses: ./.github/actions/install-go-and-dependencies
# Assumes the IAM role below for the rest of the job.
# NOTE(review): no static keys are passed and `id-token: write` is granted, so this
# presumably authenticates via OIDC - confirm.
- uses: aws-actions/configure-aws-credentials@v1
with:
role-to-assume: arn:aws:iam::638256630554:role/kit-github-workflow
role-session-name: githubworkflowsession
aws-region: us-west-2
- name: Install kubectl
  run: |
    # Downloads the latest stable kubectl for the current runner and puts it on PATH.
    # Map the matrix runner label to the platform directory used in the release URL.
    case "${{ matrix.os }}" in
      ubuntu-latest) kubectl_platform="linux/amd64" ;;
      macos-latest) kubectl_platform="darwin/amd64" ;;
      *)
        echo "Unsupported runner for kubectl install: ${{ matrix.os }}"
        exit 1
        ;;
    esac
    release_base="https://storage.googleapis.com/kubernetes-release/release"
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page, which would otherwise be installed as a broken `kubectl`.
    stable_version="$(curl -fsSL "${release_base}/stable.txt")"
    curl -fLO "${release_base}/${stable_version}/bin/${kubectl_platform}/kubectl"
    chmod +x kubectl
    sudo mv kubectl /usr/local/bin/
- name: Build kitctl, provision Kit Environment and KIT guest cluster.
  run: |
    # Builds kitctl, bootstraps a KIT environment, then for every listed Kubernetes
    # version creates a guest control plane plus a two-node data plane and
    # smoke-tests it with an nginx deployment reached through a ClusterIP service.
    allVersions=("1-21" "1-22" "1-23" "1-24" "1-25" "1-26" "1-27" "1-28" "1-29")
    cd ${HOME}/work/kubernetes-iteration-toolkit/kubernetes-iteration-toolkit/substrate/cmd/kitctl
    go build .
    if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
      os_type="linux"
    elif [[ "${{ matrix.os }}" == "macos-latest" ]]; then
      os_type="darwin"
    fi
    # Run id and attempt make the environment name unique per workflow run.
    ENV_NAME="kit-github-${os_type}-${{ github.run_id }}-${{github.run_attempt}}"
    ./kitctl bootstrap "${ENV_NAME}"
    export KUBECONFIG="${HOME}/.kit/env/${ENV_NAME}/etc/kubernetes/admin.conf"
    kubectl get pods -A
    for version in "${allVersions[@]}"; do
      # "1-21" -> "1.21": dashes are used in resource names, dots in the version field.
      K8S_VERSION=${version//-/.}
      GUEST_CLUSTER_NAME="guest-${os_type}-${version}-${{ github.run_id }}-${{github.run_attempt}}"
      # `<<-` strips leading tabs only, so the manifest and its EOF terminator
      # are kept flush with the start of the script instead of the loop indent.
    cat <<-EOF | kubectl apply -f -
    apiVersion: kit.k8s.sh/v1alpha1
    kind: ControlPlane
    metadata:
      name: ${GUEST_CLUSTER_NAME} # Desired Cluster name
    spec:
      kubernetesVersion: "${K8S_VERSION}"
      etcd:
        replicas: 3
      master:
        apiServer:
          replicas: 1
    EOF
      # Poll every 5 seconds until the ControlPlane reports Ready.
      for (( time=0; time<=600; time+=5 )); do
        # `|| true`: under `bash -e` a failing command substitution in an assignment
        # would abort the step; a transient kubectl error should just be polled again.
        controlplane_object=$(kubectl get controlplane "${GUEST_CLUSTER_NAME}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
        if [[ "$controlplane_object" == "True" ]]; then
          echo "Control plane objects are ready"
          break;
        else
          echo "Control plane objects are not ready"
        fi
        if [[ "$time" -eq 590 ]]; then
          echo "Timeout: Control plane objects are not ready after 10 minutes"
          # Dump controller logs to help diagnose why the control plane never became Ready.
          kubectl logs deploy/kit-controller -n kit
          exit 1
        fi
        sleep 5
      done
      kubectl wait --for=condition=Ready pods --all --timeout=5m
      # Extract the guest cluster's admin kubeconfig from its secret.
      GUESTKUBECONFIG=/tmp/kubeconfig
      kubectl get secret "${GUEST_CLUSTER_NAME}-kube-admin-config" -ojsonpath='{.data.config}' | base64 -d >"$GUESTKUBECONFIG"
    cat <<-EOF | kubectl apply -f -
    apiVersion: kit.k8s.sh/v1alpha1
    kind: DataPlane
    metadata:
      name: ${GUEST_CLUSTER_NAME}-nodes
    spec:
      clusterName: ${GUEST_CLUSTER_NAME} # Associated Cluster name
      nodeCount: 2
    EOF
      # Poll every 5 seconds until the DataPlane reports Ready.
      for (( time=0; time<=600; time+=5 )); do
        # `|| true` for the same reason as the control plane poll above.
        dataplane_object=$(kubectl get dataplane "${GUEST_CLUSTER_NAME}-nodes" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)
        if [[ "$dataplane_object" == "True" ]]; then
          echo "Dataplane objects are ready"
          break;
        else
          echo "Dataplane objects are not ready"
        fi
        if [[ "$time" -eq 590 ]]; then
          echo "Timeout: Dataplane objects are not ready after 10 minutes"
          kubectl logs deploy/kit-controller -n kit
          exit 1
        fi
        sleep 5
      done
      # Install the VPC CNI in the guest cluster and wait for its daemonset.
      kubectl --kubeconfig="$GUESTKUBECONFIG" apply -f https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/release-1.10/config/master/aws-k8s-cni.yaml
      kubectl --kubeconfig="$GUESTKUBECONFIG" rollout status daemonset/aws-node -n kube-system --timeout 5m
      # Wait until both data plane nodes (nodeCount: 2 above) report Ready.
      for (( time=0; time<=220; time+=5 )); do
        ready_node=$(kubectl --kubeconfig="$GUESTKUBECONFIG" get nodes 2>/dev/null | grep -w Ready | wc -l)
        if [[ "$ready_node" -eq 2 ]]; then
          echo "Dataplane nodes are ready"
          break;
        else
          echo "Dataplane nodes are not ready"
        fi
        if [[ "$time" -eq 210 ]]; then
          echo "Timeout: Dataplane nodes are not ready after 4 minutes"
          exit 1
        fi
        sleep 5
      done
      # Smoke test: deploy nginx, expose it, and curl it from a throwaway pod.
      kubectl --kubeconfig="$GUESTKUBECONFIG" create deployment nginx-deployment --image=nginx
      kubectl --kubeconfig="$GUESTKUBECONFIG" rollout status deployment/nginx-deployment --timeout 5m
      kubectl --kubeconfig="$GUESTKUBECONFIG" expose deployment nginx-deployment --name=nginx-service --port=80 --target-port=80 --type=ClusterIP
      # Approve pending CSRs, but only when there are any: `kubectl certificate approve`
      # with no names exits non-zero and would fail the whole step.
      pending_csrs=$(kubectl --kubeconfig="$GUESTKUBECONFIG" get csr | grep Pending | awk '{print $1}')
      if [[ -n "$pending_csrs" ]]; then
        # Intentionally unquoted so each CSR name becomes its own argument.
        kubectl --kubeconfig="$GUESTKUBECONFIG" certificate approve $pending_csrs
      else
        echo "No pending CSRs to approve"
      fi
      kubectl --kubeconfig="$GUESTKUBECONFIG" run -i --rm --restart=Never curly-pod --image=curlimages/curl --command -- curl -s 'http://nginx-service:80'
    done
  working-directory: substrate
- name: Delete the Guest Control plane and Dataplane
  if: always()
  run: |
    # Best-effort cleanup of every guest cluster this run may have created.
    # Runs even when earlier steps failed, so any subset of the objects may exist.
    allVersions=("1-21" "1-22" "1-23" "1-24" "1-25" "1-26" "1-27" "1-28" "1-29")
    if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
      os_type="linux"
    elif [[ "${{ matrix.os }}" == "macos-latest" ]]; then
      os_type="darwin"
    fi
    export KUBECONFIG="${HOME}/.kit/env/kit-github-${os_type}-${{ github.run_id }}-${{github.run_attempt}}/etc/kubernetes/admin.conf"
    # Diagnostic only: it must not abort the cleanup under `bash -e`.
    kubectl config current-context || echo "Unable to read the current kubectl context."
    for version in "${allVersions[@]}"; do
      GUEST_CLUSTER_NAME="guest-${os_type}-${version}-${{ github.run_id }}-${{github.run_attempt}}"
      # Each delete is wrapped in `if` so that one failure does not stop the loop
      # and leave the remaining guest clusters behind.
      if kubectl get dataplane "${GUEST_CLUSTER_NAME}-nodes" > /dev/null 2>&1; then
        if kubectl delete dataplane "${GUEST_CLUSTER_NAME}-nodes"; then
          echo "dataplane deleted."
        else
          echo "Failed to delete dataplane ${GUEST_CLUSTER_NAME}-nodes."
        fi
      else
        echo "No dataplane found ${GUEST_CLUSTER_NAME}-nodes."
      fi
      if kubectl get controlplane "${GUEST_CLUSTER_NAME}" > /dev/null 2>&1; then
        if kubectl delete controlplane "${GUEST_CLUSTER_NAME}"; then
          echo "Controlplane deleted."
        else
          echo "Failed to delete controlplane ${GUEST_CLUSTER_NAME}."
        fi
      else
        echo "No controlplane found ${GUEST_CLUSTER_NAME}."
      fi
    done
    # Pause so that resources behind the deleted objects (e.g. load balancers)
    # have time to be removed before the environment itself is deleted.
    sleep 120
  continue-on-error: true
  working-directory: substrate
- name: Delete the kit environment
  if: always()
  run: |
    # `kitctl delete` sporadically hangs or leaves a few resources behind, so it is
    # run under a per-attempt timeout, up to MAX_ATTEMPTS times, with a pause between
    # attempts. A second run after a successful first one is intentional: it sweeps
    # up resources that are occasionally left over.
    # TODO: we should fix `kitctl delete` to make it more robust and not hang forever.
    cd ${HOME}/work/kubernetes-iteration-toolkit/kubernetes-iteration-toolkit/substrate/cmd/kitctl
    timeout_cmd="timeout"
    if [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
      os_type="linux"
    elif [[ "${{ matrix.os }}" == "macos-latest" ]]; then
      os_type="darwin"
      # macOS has no `timeout` by default; coreutils provides it as `gtimeout`.
      # Only run the Homebrew installer when brew is not already available.
      if ! command -v brew > /dev/null 2>&1; then
        /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
      fi
      brew install coreutils
      timeout_cmd="gtimeout"
    fi
    ENV_NAME="kit-github-${os_type}-${{ github.run_id }}-${{github.run_attempt}}"
    export KUBECONFIG="${HOME}/.kit/env/${ENV_NAME}/etc/kubernetes/admin.conf"
    # Maximum number of delete attempts.
    MAX_ATTEMPTS=2
    # Delay between attempts, in seconds.
    RETRY_DELAY=30
    # Timeout for a single `kitctl delete` attempt, in seconds. The total run time
    # is bounded by MAX_ATTEMPTS * MAX_DURATION plus the delays in between.
    MAX_DURATION=280
    # Whether at least one attempt exited successfully.
    COMMAND_COMPLETED=false
    for i in $(seq 1 $MAX_ATTEMPTS); do
      # `|| status=$?` captures the exit code without tripping `bash -e`, which
      # would otherwise end the script on the first failure or timeout and make
      # the retry below unreachable.
      status=0
      "$timeout_cmd" "$MAX_DURATION" ./kitctl delete "${ENV_NAME}" || status=$?
      if [ $status -eq 0 ]; then
        echo "Command executed successfully for $i iteration."
        COMMAND_COMPLETED=true
      elif [ $status -eq 124 ]; then
        # 124 is the exit code timeout/gtimeout use when the time limit is hit.
        echo "Command timed out."
      elif [ $status -eq 143 ]; then
        echo "Command terminated by SIGTERM signal."
      else
        echo "Command failed with exit code $status."
      fi
      # Pause only when another attempt follows.
      if [ "$i" -lt "$MAX_ATTEMPTS" ]; then
        echo "Retrying in $RETRY_DELAY seconds."
        sleep "$RETRY_DELAY"
      fi
    done
    # Report when no attempt succeeded.
    if [ "$COMMAND_COMPLETED" = false ]; then
      echo "Failed to delete resource after $MAX_ATTEMPTS attempts."
    fi
  continue-on-error: true
  working-directory: substrate