Skip to content

Commit bdc05b2

Browse files
committed
Automate Zero Downtime Scale tests
Problem: We want our NFR tests to be fully automated to save developer time in each release cycle and to have a repeatable way of running the tests. Solution: Automate the zero downtime scaling test. Logs are no longer collected as they were previously, because error logs cannot be reliably collected from pods that are scaling down (we don't have persistence that the automation can easily use to gather historic logs). Ultimately we still ensure that traffic is flowing and status updates occur, which are the important pieces here.
1 parent 40784ac commit bdc05b2

File tree

139 files changed

+483
-88555
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

139 files changed

+483
-88555
lines changed

tests/Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ run-tests-on-vm: ## Run the functional tests on a GCP VM
9797

9898
.PHONY: nfr-test
9999
nfr-test: ## Run the NFR tests on a GCP VM
100-
NFR=true bash scripts/run-tests-gcp-vm.sh
100+
NFR=true CI=$(CI) bash scripts/run-tests-gcp-vm.sh
101101

102102
.PHONY: start-longevity-test
103103
start-longevity-test: export START_LONGEVITY=true
@@ -110,7 +110,7 @@ stop-longevity-test: nfr-test ## Stop the longevity test and collects results
110110
.PHONY: .vm-nfr-test
111111
.vm-nfr-test: ## Runs the NFR tests on the GCP VM (called by `nfr-test`)
112112
go run github.com/onsi/ginkgo/v2/ginkgo --randomize-all --randomize-suites --keep-going --fail-on-pending --trace -r -v \
113-
--label-filter "nfr" $(GINKGO_FLAGS) ./suite -- --gateway-api-version=$(GW_API_VERSION) \
113+
--label-filter "nfr" $(GINKGO_FLAGS) --timeout 3h ./suite -- --gateway-api-version=$(GW_API_VERSION) \
114114
--gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \
115115
--plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) --nginx-plus-image-repo=$(NGINX_PLUS_PREFIX) \
116116
--pull-policy=$(PULL_POLICY) --service-type=$(GW_SERVICE_TYPE) \

tests/README.md

-2

tests/framework/resourcemanager.go

+29
Original file line numberDiff line numberDiff line change
@@ -744,3 +744,32 @@ func (rm *ResourceManager) WaitForPodsToBeReadyWithCount(ctx context.Context, na
744744
},
745745
)
746746
}
747+
748+
// WaitForGatewayObservedGeneration polls the named Gateway until any of its status conditions reports the expected ObservedGeneration.
749+
func (rm *ResourceManager) WaitForGatewayObservedGeneration(
750+
ctx context.Context,
751+
namespace,
752+
name string,
753+
generation int,
754+
) error {
755+
return wait.PollUntilContextCancel(
756+
ctx,
757+
500*time.Millisecond,
758+
true, /* poll immediately */
759+
func(ctx context.Context) (bool, error) {
760+
var gw v1.Gateway
761+
key := types.NamespacedName{Namespace: namespace, Name: name}
762+
if err := rm.K8sClient.Get(ctx, key, &gw); err != nil {
763+
return false, err
764+
}
765+
766+
for _, cond := range gw.Status.Conditions {
767+
if cond.ObservedGeneration == int64(generation) {
768+
return true, nil
769+
}
770+
}
771+
772+
return false, nil
773+
},
774+
)
775+
}

tests/scripts/run-tests-gcp-vm.sh

+7-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,13 @@ gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@
1717
bash -s" < ${SCRIPT_DIR}/remote-scripts/${SCRIPT}
1818

1919
if [ "${NFR}" = "true" ]; then
20-
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
20+
## Use rsync when running locally (faster); in the pipeline, use gcloud scp instead so that no SSH config has to be generated
21+
if [ "${CI}" = "false" ]; then
22+
gcloud compute config-ssh --ssh-config-file ngf-gcp.ssh > /dev/null
23+
rsync -ave 'ssh -F ngf-gcp.ssh' username@${RESOURCE_NAME}.${GKE_CLUSTER_ZONE}.${GKE_PROJECT}:~/nginx-gateway-fabric/tests/results .
24+
else
25+
gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
26+
fi
2127
fi
2228

2329
## If tearing down the longevity test, we need to collect logs from gcloud and add to the results

tests/suite/manifests/ngf-upgrade/values.yaml

-4
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,3 @@ affinity:
2525
labelSelector:
2626
matchLabels:
2727
app.kubernetes.io/name: nginx-gateway
28-
29-
service:
30-
annotations:
31-
networking.gke.io/load-balancer-type: "Internal"
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,11 @@
11
nginxGateway:
2-
image:
3-
repository: ghcr.io/nginxinc/nginx-gateway-fabric
4-
tag: edge # change this tag if you are testing a different version
5-
pullPolicy: IfNotPresent
62
lifecycle:
73
preStop:
84
exec:
95
command:
106
- /usr/bin/gateway
117
- sleep
128
- --duration=40s
13-
config:
14-
logging:
15-
level: debug
169

1710
nginx:
1811
lifecycle:
@@ -31,7 +24,3 @@ affinity:
3124
labelSelector:
3225
matchLabels:
3326
app.kubernetes.io/name: nginx-gateway
34-
35-
service:
36-
annotations:
37-
networking.gke.io/load-balancer-type: "Internal"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
nginxGateway:
2+
lifecycle:
3+
preStop:
4+
exec:
5+
command:
6+
- /usr/bin/gateway
7+
- sleep
8+
- --duration=40s
9+
10+
nginx:
11+
lifecycle:
12+
preStop:
13+
exec:
14+
command:
15+
- /bin/sleep
16+
- "40"
17+
18+
terminationGracePeriodSeconds: 50

0 commit comments

Comments
 (0)