From 0f0e4e2672043991b0e5f3abcfa087ea663747ce Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Mon, 3 Oct 2022 23:04:10 -0700 Subject: [PATCH 1/6] load test --- .gitignore | 1 + hack/loadtest/main.go | 109 ++++ hack/loadtest/manifests/endpoint-slice.yaml | 17 + hack/loadtest/manifests/endpoints.yaml | 12 + hack/loadtest/manifests/test-cloneset.yaml | 25 + hack/loadtest/manifests/test-configmap-2.yaml | 11 + hack/loadtest/manifests/test-configmap.yaml | 9 + hack/loadtest/manifests/test-role.yaml | 9 + hack/loadtest/manifests/test-rolebinding.yaml | 17 + hack/loadtest/manifests/test-secret.yaml | 8 + hack/loadtest/manifests/test-service.yaml | 27 + .../manifests/test_clonesets_crd.yaml | 506 ++++++++++++++++++ hack/loadtest/manifests/test_clusterrole.yaml | 16 + hack/loadtest/manifests/test_namespace.yaml | 6 + hack/loadtest/manifests/test_pdb.yaml | 10 + hack/loadtest/util/help.go | 110 ++++ hack/loadtest/util/placement.go | 206 +++++++ 17 files changed, 1099 insertions(+) create mode 100644 hack/loadtest/main.go create mode 100644 hack/loadtest/manifests/endpoint-slice.yaml create mode 100644 hack/loadtest/manifests/endpoints.yaml create mode 100644 hack/loadtest/manifests/test-cloneset.yaml create mode 100644 hack/loadtest/manifests/test-configmap-2.yaml create mode 100644 hack/loadtest/manifests/test-configmap.yaml create mode 100644 hack/loadtest/manifests/test-role.yaml create mode 100644 hack/loadtest/manifests/test-rolebinding.yaml create mode 100644 hack/loadtest/manifests/test-secret.yaml create mode 100644 hack/loadtest/manifests/test-service.yaml create mode 100644 hack/loadtest/manifests/test_clonesets_crd.yaml create mode 100644 hack/loadtest/manifests/test_clusterrole.yaml create mode 100644 hack/loadtest/manifests/test_namespace.yaml create mode 100644 hack/loadtest/manifests/test_pdb.yaml create mode 100644 hack/loadtest/util/help.go create mode 100644 hack/loadtest/util/placement.go diff --git a/.gitignore b/.gitignore index c6c0a57bb..b8e3541cc 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ ut-coverage.xml # editor and IDE paraphernalia .idea +.DS_Store *.swp *.swo *~ diff --git a/hack/loadtest/main.go b/hack/loadtest/main.go new file mode 100644 index 000000000..b0087c9f4 --- /dev/null +++ b/hack/loadtest/main.go @@ -0,0 +1,109 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. 
+*/ + +package main + +import ( + "context" + "flag" + "net/http" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/apimachinery/pkg/runtime" + utilrand "k8s.io/apimachinery/pkg/util/rand" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" + workv1alpha1 "sigs.k8s.io/work-api/pkg/apis/v1alpha1" + + fleetv1alpha1 "go.goms.io/fleet/apis/v1alpha1" + "go.goms.io/fleet/hack/loadtest/util" +) + +var ( + scheme = runtime.NewScheme() +) + +var ( + placementDeadline = flag.Int("placement-deadline-second", 60, "The deadline for a placement to be applied (in seconds)") + maxCurrentPlacement = flag.Int("max-current-placement", 10, "The number of current placement load.") + clusterNames util.ClusterNames +) + +func init() { + klog.InitFlags(nil) + utilrand.Seed(time.Now().UnixNano()) + + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(fleetv1alpha1.AddToScheme(scheme)) + utilruntime.Must(workv1alpha1.AddToScheme(scheme)) + //+kubebuilder:scaffold:scheme +} + +func main() { + flag.Var(&clusterNames, "cluster", "The name of a member cluster") + flag.Parse() + defer klog.Flush() + + klog.InfoS("start to run placement load test", "placementDeadline", *placementDeadline, "maxCurrentPlacement", *maxCurrentPlacement, "clusterNames", clusterNames) + config := config.GetConfigOrDie() + config.QPS, config.Burst = float32(100), 500 + hubClient, err := client.New(config, client.Options{ + Scheme: scheme, + }) + if err != nil { + panic(err) + } + ctx := ctrl.SetupSignalHandler() + if err = util.ApplyClusterScopeManifests(ctx, hubClient); err != nil { + panic(err) + } + + // run the loadtest in the background + go runLoadTest(ctx, config) + + // setup prometheus server + http.Handle("/metrics", promhttp.Handler()) + if err = http.ListenAndServe(":4848", nil); err != nil { + panic(err) + } +} + +func runLoadTest(ctx context.Context, config *rest.Config) { + var wg sync.WaitGroup + wg.Add(*maxCurrentPlacement) + for i := 0; i < *maxCurrentPlacement; i++ { + go func() { + // each use a separate client to avoid client side throttling + time.Sleep(time.Millisecond * time.Duration(utilrand.Intn(1000))) + hubClient, err := client.New(config, client.Options{ + Scheme: scheme, + }) + if err != nil { + panic(err) + } + defer wg.Done() + // continuously apply and delete resources + for { + select { + case <-ctx.Done(): + return + default: + if err := util.MeasureOnePlacement(ctx, hubClient, time.Duration(*placementDeadline)*time.Second, *maxCurrentPlacement, clusterNames); err != nil { + klog.ErrorS(err, "placement load test failed") + } + } + } + }() + } + wg.Wait() + klog.InfoS(" placement load test finished") +} diff --git a/hack/loadtest/manifests/endpoint-slice.yaml b/hack/loadtest/manifests/endpoint-slice.yaml new file mode 100644 index 000000000..ee2e8e9a3 --- /dev/null +++ b/hack/loadtest/manifests/endpoint-slice.yaml @@ -0,0 +1,17 @@ +apiVersion: discovery.k8s.io/v1 +kind: EndpointSlice +metadata: + labels: + service-name: test-nginx-export + name: test-user-created-endpointslice + namespace: app +addressType: IPv4 +ports: + - name: https + port: 443 + protocol: TCP +endpoints: + - addresses: + - 20.106.105.216 + conditions: + ready: true diff --git a/hack/loadtest/manifests/endpoints.yaml 
b/hack/loadtest/manifests/endpoints.yaml new file mode 100644 index 000000000..4e5c0d4ab --- /dev/null +++ b/hack/loadtest/manifests/endpoints.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Endpoints +metadata: + name: user-created-endpoint + namespace: app +subsets: + - addresses: + - ip: 20.106.105.216 + ports: + - name: https + port: 443 + protocol: TCP diff --git a/hack/loadtest/manifests/test-cloneset.yaml b/hack/loadtest/manifests/test-cloneset.yaml new file mode 100644 index 000000000..ef6c5efc5 --- /dev/null +++ b/hack/loadtest/manifests/test-cloneset.yaml @@ -0,0 +1,25 @@ +apiVersion: apps.kruise.io/v1alpha1 +kind: CloneSet +metadata: + name: guestbook-clone + namespace: app +spec: + replicas: 20 + selector: + matchLabels: + app.kubernetes.io/name: guestbook-clone + template: + metadata: + labels: + app.kubernetes.io/name: guestbook-clone + spec: + containers: + - name: guestbook + image: openkruise/guestbook:v1 + imagePullPolicy: Always + ports: + - name: http-server + containerPort: 3000 + updateStrategy: + type: InPlaceIfPossible + maxUnavailable: 3 \ No newline at end of file diff --git a/hack/loadtest/manifests/test-configmap-2.yaml b/hack/loadtest/manifests/test-configmap-2.yaml new file mode 100644 index 000000000..a4cd86c00 --- /dev/null +++ b/hack/loadtest/manifests/test-configmap-2.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: test-configmap-2 + namespace: app + labels: + fleet.azure.com/name: app +data: + field1: one + field2: two + field3: three \ No newline at end of file diff --git a/hack/loadtest/manifests/test-configmap.yaml b/hack/loadtest/manifests/test-configmap.yaml new file mode 100644 index 000000000..98396fe81 --- /dev/null +++ b/hack/loadtest/manifests/test-configmap.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: test-configmap + namespace: app +data: + fielda: one + fieldb: two + fieldc: three \ No newline at end of file diff --git a/hack/loadtest/manifests/test-role.yaml b/hack/loadtest/manifests/test-role.yaml new file mode 100644 index 000000000..481850432 --- /dev/null +++ b/hack/loadtest/manifests/test-role.yaml @@ -0,0 +1,9 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + namespace: app + name: test-pod-reader +rules: + - apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "watch", "list"] \ No newline at end of file diff --git a/hack/loadtest/manifests/test-rolebinding.yaml b/hack/loadtest/manifests/test-rolebinding.yaml new file mode 100644 index 000000000..f1993e8d8 --- /dev/null +++ b/hack/loadtest/manifests/test-rolebinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +# This role binding allows "jane" to read pods in the "default" namespace. +# You need to already have a Role named "pod-reader" in that namespace. 
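+# In this load test manifest the Role actually bound is test-pod-reader (defined in test-role.yaml).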
+kind: RoleBinding +metadata: + name: read-pods + namespace: app +subjects: + # You can specify more than one "subject" + - kind: User + name: jane # "name" is case sensitive + apiGroup: rbac.authorization.k8s.io +roleRef: + # "roleRef" specifies the binding to a Role / ClusterRole + kind: Role #this must be Role or ClusterRole + apiGroup: rbac.authorization.k8s.io + name: test-pod-reader # this must match the name of the Role or ClusterRole you wish to bind to \ No newline at end of file diff --git a/hack/loadtest/manifests/test-secret.yaml b/hack/loadtest/manifests/test-secret.yaml new file mode 100644 index 000000000..8dfabd51d --- /dev/null +++ b/hack/loadtest/manifests/test-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: test-secret + namespace: app +data: + somekey: Q2xpZW50SWQ6IDUxOTEwNTY4LTM0YzktNGQ0ZS1iODA1LTNmNTY3NWQyMDdiYwpDbGllbnRTZWNyZXQ6IDZSLThRfkJvSDNNYm1+eGJpaDhmNVZibHBkWGxzeGQyRnp+WXhjWjYKVGVuYW50SWQ6IDcyZjk4OGJmLTg2ZjEtNDFhZi05MWFiLTJkN2NkMDExZGI0NwpTdWJzY3JpcHRpb25JZDogMmIwM2JmYjgtZTg4NS00NTY2LWE2MmEtOTA5YTExZDcxNjkyClJlc291cmNlR3JvdXA6IGNhcmF2ZWwtZGVtbw== +type: generic \ No newline at end of file diff --git a/hack/loadtest/manifests/test-service.yaml b/hack/loadtest/manifests/test-service.yaml new file mode 100644 index 000000000..abed7143a --- /dev/null +++ b/hack/loadtest/manifests/test-service.yaml @@ -0,0 +1,27 @@ +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + name: test-nginx + namespace: app + labels: + run: test-nginx +spec: + ports: + - port: 80 + protocol: TCP + selector: + run: test-nginx diff --git a/hack/loadtest/manifests/test_clonesets_crd.yaml b/hack/loadtest/manifests/test_clonesets_crd.yaml new file mode 100644 index 000000000..747fbc045 --- /dev/null +++ b/hack/loadtest/manifests/test_clonesets_crd.yaml @@ -0,0 +1,506 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.7.0 + labels: + fleet.azure.com/name: test-placement + creationTimestamp: null + name: clonesets.apps.kruise.io +spec: + group: apps.kruise.io + names: + kind: CloneSet + listKind: CloneSetList + plural: clonesets + shortNames: + - clone + singular: cloneset + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: The desired number of pods. + jsonPath: .spec.replicas + name: DESIRED + type: integer + - description: The number of pods updated. + jsonPath: .status.updatedReplicas + name: UPDATED + type: integer + - description: The number of pods updated and ready. + jsonPath: .status.updatedReadyReplicas + name: UPDATED_READY + type: integer + - description: The number of pods ready. + jsonPath: .status.readyReplicas + name: READY + type: integer + - description: The number of currently all pods. + jsonPath: .status.replicas + name: TOTAL + type: integer + - description: CreationTimestamp is a timestamp representing the server time when + this object was created. 
It is not guaranteed to be set in happens-before + order across separate operations. Clients may not set this value. It is represented + in RFC3339 form and is in UTC. + jsonPath: .metadata.creationTimestamp + name: AGE + type: date + - description: The containers of currently cloneset. + jsonPath: .spec.template.spec.containers[*].name + name: CONTAINERS + priority: 1 + type: string + - description: The images of currently cloneset. + jsonPath: .spec.template.spec.containers[*].image + name: IMAGES + priority: 1 + type: string + - description: The selector of currently cloneset. + jsonPath: .status.labelSelector + name: SELECTOR + priority: 1 + type: string + name: v1alpha1 + schema: + openAPIV3Schema: + description: CloneSet is the Schema for the clonesets API + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: CloneSetSpec defines the desired state of CloneSet + properties: + lifecycle: + description: Lifecycle defines the lifecycle hooks for Pods pre-delete, + in-place update. + properties: + inPlaceUpdate: + description: InPlaceUpdate is the hook before Pod to update and + after Pod has been updated. + properties: + finalizersHandler: + items: + type: string + type: array + labelsHandler: + additionalProperties: + type: string + type: object + markPodNotReady: + description: 'MarkPodNotReady = true means: - Pod will be + set to ''NotReady'' at preparingDelete/preparingUpdate state. + - Pod will be restored to ''Ready'' at Updated state if + it was set to ''NotReady'' at preparingUpdate state. Default + to false.' + type: boolean + type: object + preDelete: + description: PreDelete is the hook before Pod to be deleted. + properties: + finalizersHandler: + items: + type: string + type: array + labelsHandler: + additionalProperties: + type: string + type: object + markPodNotReady: + description: 'MarkPodNotReady = true means: - Pod will be + set to ''NotReady'' at preparingDelete/preparingUpdate state. + - Pod will be restored to ''Ready'' at Updated state if + it was set to ''NotReady'' at preparingUpdate state. Default + to false.' + type: boolean + type: object + type: object + minReadySeconds: + description: Minimum number of seconds for which a newly created pod + should be ready without any of its container crashing, for it to + be considered available. Defaults to 0 (pod will be considered available + as soon as it is ready) + format: int32 + type: integer + replicas: + description: Replicas is the desired number of replicas of the given + Template. These are replicas in the sense that they are instantiations + of the same Template. If unspecified, defaults to 1. + format: int32 + type: integer + revisionHistoryLimit: + description: RevisionHistoryLimit is the maximum number of revisions + that will be maintained in the CloneSet's revision history. 
The + revision history consists of all revisions not represented by a + currently applied CloneSetSpec version. The default value is 10. + format: int32 + type: integer + scaleStrategy: + description: ScaleStrategy indicates the ScaleStrategy that will be + employed to create and delete Pods in the CloneSet. + properties: + maxUnavailable: + anyOf: + - type: integer + - type: string + description: The maximum number of pods that can be unavailable + for scaled pods. This field can control the changes rate of + replicas for CloneSet so as to minimize the impact for users' + service. The scale will fail if the number of unavailable pods + were greater than this MaxUnavailable at scaling up. MaxUnavailable + works only when scaling up. + x-kubernetes-int-or-string: true + podsToDelete: + description: PodsToDelete is the names of Pod should be deleted. + Note that this list will be truncated for non-existing pod names. + items: + type: string + type: array + type: object + selector: + description: 'Selector is a label query over pods that should match + the replica count. It must match the pod template''s labels. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors' + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + template: + description: Template describes the pods that will be created. + x-kubernetes-preserve-unknown-fields: true + updateStrategy: + description: UpdateStrategy indicates the UpdateStrategy that will + be employed to update Pods in the CloneSet when a revision is made + to Template. + properties: + inPlaceUpdateStrategy: + description: InPlaceUpdateStrategy contains strategies for in-place + update. + properties: + gracePeriodSeconds: + description: GracePeriodSeconds is the timespan between set + Pod status to not-ready and update images in Pod spec when + in-place update a Pod. + format: int32 + type: integer + type: object + maxSurge: + anyOf: + - type: integer + - type: string + description: 'The maximum number of pods that can be scheduled + above the desired replicas during update or specified delete. + Value can be an absolute number (ex: 5) or a percentage of desired + pods (ex: 10%). Absolute number is calculated from percentage + by rounding up. Defaults to 0.' 
+ x-kubernetes-int-or-string: true + maxUnavailable: + anyOf: + - type: integer + - type: string + description: 'The maximum number of pods that can be unavailable + during update or scale. Value can be an absolute number (ex: + 5) or a percentage of desired pods (ex: 10%). Absolute number + is calculated from percentage by rounding up by default. When + maxSurge > 0, absolute number is calculated from percentage + by rounding down. Defaults to 20%.' + x-kubernetes-int-or-string: true + partition: + anyOf: + - type: integer + - type: string + description: 'Partition is the desired number of pods in old revisions. + Value can be an absolute number (ex: 5) or a percentage of desired + pods (ex: 10%). Absolute number is calculated from percentage + by rounding up by default. It means when partition is set during + pods updating, (replicas - partition value) number of pods will + be updated. Default value is 0.' + x-kubernetes-int-or-string: true + paused: + description: Paused indicates that the CloneSet is paused. Default + value is false + type: boolean + priorityStrategy: + description: Priorities are the rules for calculating the priority + of updating pods. Each pod to be updated, will pass through + these terms and get a sum of weights. + properties: + orderPriority: + description: 'Order priority terms, pods will be sorted by + the value of orderedKey. For example: ``` orderPriority: + - orderedKey: key1 - orderedKey: key2 ``` First, all pods + which have key1 in labels will be sorted by the value of + key1. Then, the left pods which have no key1 but have key2 + in labels will be sorted by the value of key2 and put behind + those pods have key1.' + items: + description: UpdatePriorityOrder defines order priority. + properties: + orderedKey: + description: Calculate priority by value of this key. + Values of this key, will be sorted by GetInt(val). + GetInt method will find the last int in value, such + as getting 5 in value '5', getting 10 in value 'sts-10'. + type: string + required: + - orderedKey + type: object + type: array + weightPriority: + description: Weight priority terms, pods will be sorted by + the sum of all terms weight. + items: + description: UpdatePriorityWeightTerm defines weight priority. + properties: + matchSelector: + description: MatchSelector is used to select by pod's + labels. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are ANDed. + items: + description: A label selector requirement is a + selector that contains values, a key, and an + operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are + In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If the + operator is Exists or DoesNotExist, the + values array must be empty. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} + pairs. 
A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". The + requirements are ANDed. + type: object + type: object + weight: + description: Weight associated with matching the corresponding + matchExpressions, in the range 1-100. + format: int32 + type: integer + required: + - matchSelector + - weight + type: object + type: array + type: object + scatterStrategy: + description: ScatterStrategy defines the scatter rules to make + pods been scattered when update. This will avoid pods with the + same key-value to be updated in one batch. - Note that pods + will be scattered after priority sort. So, although priority + strategy and scatter strategy can be applied together, we suggest + to use either one of them. - If scatterStrategy is used, we + suggest to just use one term. Otherwise, the update order can + be hard to understand. + items: + properties: + key: + type: string + value: + type: string + required: + - key + - value + type: object + type: array + type: + description: Type indicates the type of the CloneSetUpdateStrategy. + Default is ReCreate. + type: string + type: object + volumeClaimTemplates: + description: VolumeClaimTemplates is a list of claims that pods are + allowed to reference. Note that PVC will be deleted when its pod + has been deleted. + x-kubernetes-preserve-unknown-fields: true + required: + - selector + - template + type: object + status: + description: CloneSetStatus defines the observed state of CloneSet + properties: + availableReplicas: + description: AvailableReplicas is the number of Pods created by the + CloneSet controller that have a Ready Condition for at least minReadySeconds. + format: int32 + type: integer + collisionCount: + description: CollisionCount is the count of hash collisions for the + CloneSet. The CloneSet controller uses this field as a collision + avoidance mechanism when it needs to create the name for the newest + ControllerRevision. + format: int32 + type: integer + conditions: + description: Conditions represents the latest available observations + of a CloneSet's current state. + items: + description: CloneSetCondition describes the state of a CloneSet + at a certain point. + properties: + lastTransitionTime: + description: Last time the condition transitioned from one status + to another. + format: date-time + type: string + message: + description: A human readable message indicating details about + the transition. + type: string + reason: + description: The reason for the condition's last transition. + type: string + status: + description: Status of the condition, one of True, False, Unknown. + type: string + type: + description: Type of CloneSet condition. + type: string + required: + - status + - type + type: object + type: array + currentRevision: + description: currentRevision, if not empty, indicates the current + revision version of the CloneSet. + type: string + expectedUpdatedReplicas: + description: ExpectedUpdatedReplicas is the number of Pods that should + be updated by CloneSet controller. This field is calculated via + Replicas - Partition. + format: int32 + type: integer + labelSelector: + description: LabelSelector is label selectors for query over pods + that should match the replica count used by HPA. + type: string + observedGeneration: + description: ObservedGeneration is the most recent generation observed + for this CloneSet. 
It corresponds to the CloneSet's generation, + which is updated on mutation by the API Server. + format: int64 + type: integer + readyReplicas: + description: ReadyReplicas is the number of Pods created by the CloneSet + controller that have a Ready Condition. + format: int32 + type: integer + replicas: + description: Replicas is the number of Pods created by the CloneSet + controller. + format: int32 + type: integer + updateRevision: + description: UpdateRevision, if not empty, indicates the latest revision + of the CloneSet. + type: string + updatedReadyReplicas: + description: UpdatedReadyReplicas is the number of Pods created by + the CloneSet controller from the CloneSet version indicated by updateRevision + and have a Ready Condition. + format: int32 + type: integer + updatedReplicas: + description: UpdatedReplicas is the number of Pods created by the + CloneSet controller from the CloneSet version indicated by updateRevision. + format: int32 + type: integer + required: + - availableReplicas + - readyReplicas + - replicas + - updatedReadyReplicas + - updatedReplicas + type: object + type: object + served: true + storage: true + subresources: + scale: + labelSelectorPath: .status.labelSelector + specReplicasPath: .spec.replicas + statusReplicasPath: .status.replicas + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/hack/loadtest/manifests/test_clusterrole.yaml b/hack/loadtest/manifests/test_clusterrole.yaml new file mode 100644 index 000000000..c336515e6 --- /dev/null +++ b/hack/loadtest/manifests/test_clusterrole.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: test-cluster-role + labels: + fleet.azure.com/name: test +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch"] + - apiGroups: [ "" ] + resources: [ "events" ] + verbs: [ "get", "list", "watch", "create", "patch" ] + - apiGroups: [ "" ] + resources: ["nodes"] + verbs: [ "get", "list", "watch"] \ No newline at end of file diff --git a/hack/loadtest/manifests/test_namespace.yaml b/hack/loadtest/manifests/test_namespace.yaml new file mode 100644 index 000000000..e69aa5528 --- /dev/null +++ b/hack/loadtest/manifests/test_namespace.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: app + labels: + fleet.azure.com/name: test diff --git a/hack/loadtest/manifests/test_pdb.yaml b/hack/loadtest/manifests/test_pdb.yaml new file mode 100644 index 000000000..3795cd35e --- /dev/null +++ b/hack/loadtest/manifests/test_pdb.yaml @@ -0,0 +1,10 @@ +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: zk-pdb + namespace: app +spec: + minAvailable: 2 + selector: + matchLabels: + app: zookeeper \ No newline at end of file diff --git a/hack/loadtest/util/help.go b/hack/loadtest/util/help.go new file mode 100644 index 000000000..e677b5d47 --- /dev/null +++ b/hack/loadtest/util/help.go @@ -0,0 +1,110 @@ +package util + +import ( + "context" + "os" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +type ClusterNames []string + +func (i *ClusterNames) String() string { + return "member cluster names: " + strings.Join(*i, ",") +} + +func (i *ClusterNames) Set(value string) error { + *i = append(*i, value) + return nil +} + 
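+// The helpers below decode the YAML manifests under hack/loadtest/manifests into
+// unstructured objects and create them on the hub cluster; namespaced test resources
+// are placed into a per-run namespace so that concurrent placements do not collide.
+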
+// applyObjectFromManifest returns a runtime object decoded from the file +func applyObjectFromManifest(ctx context.Context, hubClient client.Client, namespaceName string, relativeFilePath string) error { + // Read files, create manifest + rawByte, err := os.ReadFile(relativeFilePath) + if err != nil { + return err + } + json, err := yaml.ToJSON(rawByte) + if err != nil { + return err + } + obj := &unstructured.Unstructured{} + err = obj.UnmarshalJSON(json) + if err != nil { + return err + } + obj.SetNamespace(namespaceName) + return hubClient.Create(ctx, obj) +} + +func ApplyClusterScopeManifests(ctx context.Context, hubClient client.Client) error { + if err := applyObjectFromManifest(ctx, hubClient, "", "hack/loadtest/manifests/test_clonesets_crd.yaml"); err != nil { + if !apierrors.IsAlreadyExists(err) { + return err + } + } + if err := applyObjectFromManifest(ctx, hubClient, "", "hack/loadtest/manifests/test_clusterrole.yaml"); err != nil { + if !apierrors.IsAlreadyExists(err) { + return err + } + } + return nil +} + +// applyTestManifests creates the test manifests in the hub cluster under a namespace +func applyTestManifests(ctx context.Context, hubClient client.Client, namespaceName string) error { + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + Labels: map[string]string{ + labelKey: namespaceName, + }, + }, + } + if err := hubClient.Create(ctx, ns); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test_pdb.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-configmap.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-secret.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-service.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-cloneset.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-configmap-2.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-role.yaml"); err != nil { + return err + } + if err := applyObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-rolebinding.yaml"); err != nil { + return err + } + return nil +} + +func deleteNamespace(ctx context.Context, hubClient client.Client, namespaceName string) error { + klog.InfoS("delete the namespace", "namespace", namespaceName) + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: namespaceName, + }, + } + return hubClient.Delete(ctx, ns) +} diff --git a/hack/loadtest/util/placement.go b/hack/loadtest/util/placement.go new file mode 100644 index 000000000..64f05a0b4 --- /dev/null +++ b/hack/loadtest/util/placement.go @@ -0,0 +1,206 @@ +/* +Copyright (c) Microsoft Corporation. +Licensed under the MIT license. 
+*/ +package util + +import ( + "context" + "fmt" + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + utilrand "k8s.io/apimachinery/pkg/util/rand" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + workv1alpha1 "sigs.k8s.io/work-api/pkg/apis/v1alpha1" + + "go.goms.io/fleet/apis/v1alpha1" + "go.goms.io/fleet/pkg/utils" +) + +const interval = 50 * time.Millisecond + +var ( + labelKey = "workload.azure.com/load" +) + +var ( + LoadTestApplyErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "workload_apply_errors_total", + Help: "Total number of placement errors", + }, []string{"currency", "fleetSize", "mode"}) + + LoadTestApplySuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "workload_apply_total", + Help: "Total number of placement", + }, []string{"currency", "fleetSize"}) + + LoadTestApplyLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "workload_apply_latency", + Help: "Length of time from placement change to it is applied to the member cluster", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + }, []string{"currency", "fleetSize"}) + + LoadTestDeleteErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "workload_delete_errors_total", + Help: "Total number of placement delete errors", + }, []string{"currency", "fleetSize"}) + + LoadTestDeleteSuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "workload_delete_total", + Help: "Total number of placement deleted", + }, []string{"currency", "fleetSize"}) + + LoadTestDeleteLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "workload_delete_latency", + Help: "Length of time from resource deletion to it is deleted from the member cluster", + Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + }, []string{"currency", "fleetSize"}) +) + +func MeasureOnePlacement(ctx context.Context, hubClient client.Client, deadline time.Duration, maxCurrentPlacement int, clusterNames ClusterNames) error { + crpName := "load-test-placement-" + utilrand.String(10) + nsName := "load-test-ns-" + utilrand.String(10) + fleetSize := strconv.Itoa(len(clusterNames)) + currency := strconv.Itoa(maxCurrentPlacement) + + defer klog.Flush() + defer deleteNamespace(context.Background(), hubClient, nsName) + klog.Infof("create the resources in namespace `%s` in the hub cluster", nsName) + if err := applyTestManifests(ctx, hubClient, nsName); err != nil { + klog.ErrorS(err, "failed to apply namespaced resources", "namespace", nsName) + return err + } + + klog.Infof("create the cluster resource placement `%s` in the hub cluster", crpName) + crp := &v1alpha1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: crpName, + }, + Spec: v1alpha1.ClusterResourcePlacementSpec{ + ResourceSelectors: []v1alpha1.ClusterResourceSelector{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{labelKey: nsName}, + }, + }, + { + Group: 
apiextensionsv1.GroupName, + Version: "v1", + Kind: "CustomResourceDefinition", + Name: "clonesets.apps.kruise.io", + }, + }, + }, + } + defer hubClient.Delete(context.Background(), crp) + if err := hubClient.Create(ctx, crp); err != nil { + klog.ErrorS(err, "failed to apply crp", "namespace", nsName, "crp", crpName) + return err + } + + klog.Infof("verify that the cluster resource placement `%s` is applied", crpName) + collectApplyMetrics(ctx, hubClient, deadline, crpName, currency, fleetSize) + + klog.Infof("remove the namespace referred by the placement `%s`", crpName) + deleteNamespace(ctx, hubClient, nsName) + collectDeleteMetrics(ctx, hubClient, deadline, crpName, clusterNames, currency, fleetSize) + + return hubClient.Delete(ctx, crp) +} + +// collect the crp apply metrics +func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline time.Duration, crpName string, currency string, fleetSize string) time.Time { + startTime := time.Now() + applyDeadline := startTime.Add(deadline) + var crp v1alpha1.ClusterResourcePlacement + for { + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { + time.Sleep(interval) + continue + } + // check if the condition is true + cond := crp.GetCondition(string(v1alpha1.ResourcePlacementStatusConditionTypeApplied)) + if cond == nil || cond.Status == metav1.ConditionUnknown { + // wait + klog.V(5).Infof("the cluster resource placement `%s` is pending", crpName) + time.Sleep(interval) + } else if cond != nil && cond.Status == metav1.ConditionTrue { + // succeeded + klog.V(3).Infof("the cluster resource placement `%s` succeeded", crpName) + LoadTestApplySuccessCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestApplyLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) + break + } + if time.Now().After(applyDeadline) { + if cond != nil && cond.Status == metav1.ConditionFalse { + // failed + klog.V(3).Infof("the cluster resource placement `%s` failed", crpName) + LoadTestApplyErrorCount.WithLabelValues(currency, fleetSize, "failed").Inc() + } else { + // timeout + klog.V(3).Infof("the cluster resource placement `%s` timeout", crpName) + LoadTestApplyErrorCount.WithLabelValues(currency, fleetSize, "timeout").Inc() + } + break + } + } + return startTime +} + +func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline time.Duration, crpName string, clusterNames ClusterNames, currency string, fleetSize string) { + var crp v1alpha1.ClusterResourcePlacement + startTime := time.Now() + deleteDeadline := startTime.Add(deadline * 3) + klog.Infof("verify that the cluster resource placement `%s` is deleted", crpName) + for { + if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { + time.Sleep(interval) + continue + } + if len(crp.Status.SelectedResources) != 1 { + klog.V(3).Infof("the crp `%s` has not picked up the namespaced resource deleted change", crpName) + // wait + time.Sleep(interval) + continue + } + // check if the change is picked up by the member agent + var clusterWork workv1alpha1.Work + removed := true + for _, clusterName := range clusterNames { + err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: fmt.Sprintf(utils.NamespaceNameFormat, clusterName)}, &clusterWork) + if err != nil || (err == nil && len(clusterWork.Status.ManifestConditions) != 1) { + klog.V(3).Infof("the resources `%s` in cluster namespace `%s` is not removed by the member agent yet", crpName, 
clusterName) + removed = false + break + } + } + if !removed { + // wait + time.Sleep(interval) + } else { + // succeeded + klog.V(3).Infof("the cluster resource placement `%s` delete succeeded", crpName) + LoadTestDeleteSuccessCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestDeleteLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) + break + } + if time.Now().After(deleteDeadline) { + // timeout + klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) + LoadTestDeleteErrorCount.WithLabelValues(currency, fleetSize).Inc() + break + } + } +} From 00f40495c52428468b00b998c27341ee12f4061f Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 5 Oct 2022 13:33:53 -0700 Subject: [PATCH 2/6] add some details --- hack/loadtest/README.md | 22 ++++++++ hack/loadtest/main.go | 14 +++-- hack/loadtest/prometheus.yml | 19 +++++++ hack/loadtest/util/help.go | 89 +++++++++++++++++++++++++++++--- hack/loadtest/util/placement.go | 91 +++++++++++++++++---------------- 5 files changed, 182 insertions(+), 53 deletions(-) create mode 100644 hack/loadtest/README.md create mode 100644 hack/loadtest/prometheus.yml diff --git a/hack/loadtest/README.md b/hack/loadtest/README.md new file mode 100644 index 000000000..321d2da36 --- /dev/null +++ b/hack/loadtest/README.md @@ -0,0 +1,22 @@ +Here is how to run the load test locally. Make sure that you have installed go and git clone the repo + +1. Build a fleet. + +You can use any Kubernetes clusters you have and install the fleet agents on those clusters. In this example, we built a fleet with four member clusters, namely, cluster-1 to cluster-4. +Please remember to save the kubeconfig file pointing to the hub cluster of the fleet. + +3. Run the load test binary locally. +```shell +export KUBECONFIG=xxxxx +go run hack/loadtest/main.go -max-current-placement 10 --cluster cluster-1 --cluster cluster-2 --cluster cluster-3 --cluster cluster-4 +``` + +3. Manually check the metrics against the load test. +```shell +curl http://localhost:4848/metrics | grep workload +``` + +4. Use a local prometheus to draw graphs. Download prometheus binary for your local machine. Start the prometheus. 
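+The config below points Prometheus at the load test's metrics endpoint on `localhost:4848`. Once it is scraping, a useful graph is the apply-latency p95, for example `histogram_quantile(0.95, sum(rate(workload_apply_latency_bucket[5m])) by (le))` (metric names as defined in `hack/loadtest/util/placement.go`).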
+```shell +./prometheus --config.file=hack/loadtest/prometheus.yml +``` diff --git a/hack/loadtest/main.go b/hack/loadtest/main.go index b0087c9f4..7e01d6fd3 100644 --- a/hack/loadtest/main.go +++ b/hack/loadtest/main.go @@ -33,7 +33,8 @@ var ( ) var ( - placementDeadline = flag.Int("placement-deadline-second", 60, "The deadline for a placement to be applied (in seconds)") + placementDeadline = flag.Int("placement-deadline-second", 300, "The deadline for a placement to be applied (in seconds)") + pollInterval = flag.Int("poll-interval-millisecond", 250, "The poll interval for verification (in milli-second)") maxCurrentPlacement = flag.Int("max-current-placement", 10, "The number of current placement load.") clusterNames util.ClusterNames ) @@ -69,7 +70,6 @@ func main() { // run the loadtest in the background go runLoadTest(ctx, config) - // setup prometheus server http.Handle("/metrics", promhttp.Handler()) if err = http.ListenAndServe(":4848", nil); err != nil { @@ -84,7 +84,7 @@ func runLoadTest(ctx context.Context, config *rest.Config) { go func() { // each use a separate client to avoid client side throttling time.Sleep(time.Millisecond * time.Duration(utilrand.Intn(1000))) - hubClient, err := client.New(config, client.Options{ + hubClient, err := client.NewWithWatch(config, client.Options{ Scheme: scheme, }) if err != nil { @@ -97,7 +97,7 @@ func runLoadTest(ctx context.Context, config *rest.Config) { case <-ctx.Done(): return default: - if err := util.MeasureOnePlacement(ctx, hubClient, time.Duration(*placementDeadline)*time.Second, *maxCurrentPlacement, clusterNames); err != nil { + if err = util.MeasureOnePlacement(ctx, hubClient, time.Duration(*placementDeadline)*time.Second, time.Duration(*pollInterval)*time.Millisecond, *maxCurrentPlacement, clusterNames); err != nil { klog.ErrorS(err, "placement load test failed") } } @@ -105,5 +105,11 @@ func runLoadTest(ctx context.Context, config *rest.Config) { }() } wg.Wait() + hubClient, _ := client.New(config, client.Options{ + Scheme: scheme, + }) + if err := util.CleanupAll(hubClient); err != nil { + klog.ErrorS(err, "clean up placement load test hit an error") + } klog.InfoS(" placement load test finished") } diff --git a/hack/loadtest/prometheus.yml b/hack/loadtest/prometheus.yml new file mode 100644 index 000000000..21296f837 --- /dev/null +++ b/hack/loadtest/prometheus.yml @@ -0,0 +1,19 @@ +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'fleet-monitor' + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's fleet load test. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'fleet-load-test' + + # Override the global default and scrape targets from this job every 5 seconds. 
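+    # The scrape target below must match the metrics address served by the load test binary (:4848, see hack/loadtest/main.go).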
+ scrape_interval: 5s + + static_configs: + - targets: ['localhost:4848'] diff --git a/hack/loadtest/util/help.go b/hack/loadtest/util/help.go index e677b5d47..670724551 100644 --- a/hack/loadtest/util/help.go +++ b/hack/loadtest/util/help.go @@ -12,6 +12,8 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" + + "go.goms.io/fleet/apis/v1alpha1" ) type ClusterNames []string @@ -25,24 +27,44 @@ func (i *ClusterNames) Set(value string) error { return nil } -// applyObjectFromManifest returns a runtime object decoded from the file +// applyObjectFromManifest apply an object from manifest func applyObjectFromManifest(ctx context.Context, hubClient client.Client, namespaceName string, relativeFilePath string) error { + obj, err := readObjFromFile(relativeFilePath, namespaceName) + if err != nil { + return err + } + return hubClient.Create(ctx, obj) +} + +// deleteObjectFromManifest delete an object from manifest +func deleteObjectFromManifest(ctx context.Context, hubClient client.Client, namespaceName string, relativeFilePath string) error { + obj, err := readObjFromFile(relativeFilePath, namespaceName) + if err != nil { + return err + } + return hubClient.Delete(ctx, obj) +} + +// readObjFromFile returns a runtime object decoded from the file +func readObjFromFile(relativeFilePath string, namespaceName string) (*unstructured.Unstructured, error) { // Read files, create manifest rawByte, err := os.ReadFile(relativeFilePath) if err != nil { - return err + return nil, err } json, err := yaml.ToJSON(rawByte) if err != nil { - return err + return nil, err } obj := &unstructured.Unstructured{} err = obj.UnmarshalJSON(json) if err != nil { - return err + return nil, err } - obj.SetNamespace(namespaceName) - return hubClient.Create(ctx, obj) + if len(namespaceName) != 0 { + obj.SetNamespace(namespaceName) + } + return obj, nil } func ApplyClusterScopeManifests(ctx context.Context, hubClient client.Client) error { @@ -99,6 +121,35 @@ func applyTestManifests(ctx context.Context, hubClient client.Client, namespaceN return nil } +// deleteTestManifests deletes the test manifests in the hub cluster under a namespace +func deleteTestManifests(ctx context.Context, hubClient client.Client, namespaceName string) error { + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test_pdb.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-configmap.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-secret.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-service.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-cloneset.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-configmap-2.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-role.yaml"); err != nil { + return err + } + if err := deleteObjectFromManifest(ctx, hubClient, namespaceName, "hack/loadtest/manifests/test-rolebinding.yaml"); err != nil { + return err + } + return nil +} + func deleteNamespace(ctx context.Context, hubClient client.Client, namespaceName 
string) error { klog.InfoS("delete the namespace", "namespace", namespaceName) ns := &corev1.Namespace{ @@ -108,3 +159,29 @@ func deleteNamespace(ctx context.Context, hubClient client.Client, namespaceName } return hubClient.Delete(ctx, ns) } + +func CleanupAll(hubClient client.Client) error { + var namespaces corev1.NamespaceList + if err := hubClient.List(context.Background(), &namespaces); err != nil { + klog.ErrorS(err, "failed to list namespace") + return err + } + for index, ns := range namespaces.Items { + if strings.HasPrefix(ns.Name, nsPrefix) { + if err := hubClient.Delete(context.Background(), &namespaces.Items[index]); err != nil { + klog.ErrorS(err, "failed to delete namespace", "namespace", ns.Name) + } + } + } + var crps v1alpha1.ClusterResourcePlacementList + if err := hubClient.List(context.Background(), &crps); err != nil { + klog.ErrorS(err, "failed to list namespace") + return err + } + for index, crp := range crps.Items { + if err := hubClient.Delete(context.Background(), &crps.Items[index]); err != nil { + klog.ErrorS(err, "failed to delete crp", "crp", crp.Name) + } + } + return nil +} diff --git a/hack/loadtest/util/placement.go b/hack/loadtest/util/placement.go index 64f05a0b4..ac1c0efa5 100644 --- a/hack/loadtest/util/placement.go +++ b/hack/loadtest/util/placement.go @@ -24,7 +24,10 @@ import ( "go.goms.io/fleet/pkg/utils" ) -const interval = 50 * time.Millisecond +const ( + crpPrefix = "load-test-placement-" + nsPrefix = "load-test-ns-" +) var ( labelKey = "workload.azure.com/load" @@ -42,10 +45,9 @@ var ( }, []string{"currency", "fleetSize"}) LoadTestApplyLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "workload_apply_latency", - Help: "Length of time from placement change to it is applied to the member cluster", - Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + Name: "workload_apply_latency", + Help: "Length of time from placement change to it is applied to the member cluster", + Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, }, []string{"currency", "fleetSize"}) LoadTestDeleteErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ @@ -59,21 +61,21 @@ var ( }, []string{"currency", "fleetSize"}) LoadTestDeleteLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "workload_delete_latency", - Help: "Length of time from resource deletion to it is deleted from the member cluster", - Buckets: []float64{0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, - 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + Name: "workload_delete_latency", + Help: "Length of time from resource deletion to it is deleted from the member cluster", + Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, }, []string{"currency", "fleetSize"}) ) -func MeasureOnePlacement(ctx context.Context, hubClient client.Client, deadline time.Duration, maxCurrentPlacement int, clusterNames ClusterNames) error { - crpName := "load-test-placement-" + utilrand.String(10) - nsName := "load-test-ns-" + utilrand.String(10) +func MeasureOnePlacement(ctx context.Context, hubClient client.WithWatch, deadline, 
interval time.Duration, maxCurrentPlacement int, clusterNames ClusterNames) error { + crpName := crpPrefix + utilrand.String(10) + nsName := nsPrefix + utilrand.String(10) fleetSize := strconv.Itoa(len(clusterNames)) currency := strconv.Itoa(maxCurrentPlacement) defer klog.Flush() - defer deleteNamespace(context.Background(), hubClient, nsName) + defer deleteNamespace(context.Background(), hubClient, nsName) //nolint + klog.Infof("create the resources in namespace `%s` in the hub cluster", nsName) if err := applyTestManifests(ctx, hubClient, nsName); err != nil { klog.ErrorS(err, "failed to apply namespaced resources", "namespace", nsName) @@ -104,38 +106,40 @@ func MeasureOnePlacement(ctx context.Context, hubClient client.Client, deadline }, }, } - defer hubClient.Delete(context.Background(), crp) + defer hubClient.Delete(context.Background(), crp) //nolint if err := hubClient.Create(ctx, crp); err != nil { klog.ErrorS(err, "failed to apply crp", "namespace", nsName, "crp", crpName) return err } klog.Infof("verify that the cluster resource placement `%s` is applied", crpName) - collectApplyMetrics(ctx, hubClient, deadline, crpName, currency, fleetSize) + collectApplyMetrics(ctx, hubClient, deadline, interval, crpName, currency, fleetSize) - klog.Infof("remove the namespace referred by the placement `%s`", crpName) - deleteNamespace(ctx, hubClient, nsName) - collectDeleteMetrics(ctx, hubClient, deadline, crpName, clusterNames, currency, fleetSize) + klog.Infof("remove the namespaced resources referred by the placement `%s`", crpName) + if err := deleteTestManifests(ctx, hubClient, nsName); err != nil { + klog.ErrorS(err, "failed to delete test manifests", "namespace", nsName, "crp", crpName) + return err + } + collectDeleteMetrics(ctx, hubClient, deadline, interval, crpName, clusterNames, currency, fleetSize) return hubClient.Delete(ctx, crp) } // collect the crp apply metrics -func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline time.Duration, crpName string, currency string, fleetSize string) time.Time { +func collectApplyMetrics(ctx context.Context, hubClient client.WithWatch, deadline, pollInterval time.Duration, crpName string, currency string, fleetSize string) { startTime := time.Now() applyDeadline := startTime.Add(deadline) var crp v1alpha1.ClusterResourcePlacement - for { - if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { - time.Sleep(interval) + var err error + for ; ctx.Err() == nil; time.Sleep(pollInterval) { + if err = hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { + klog.ErrorS(err, "failed to get crp", "crp", crpName) continue } // check if the condition is true cond := crp.GetCondition(string(v1alpha1.ResourcePlacementStatusConditionTypeApplied)) if cond == nil || cond.Status == metav1.ConditionUnknown { - // wait klog.V(5).Infof("the cluster resource placement `%s` is pending", crpName) - time.Sleep(interval) } else if cond != nil && cond.Status == metav1.ConditionTrue { // succeeded klog.V(3).Infof("the cluster resource placement `%s` succeeded", crpName) @@ -146,50 +150,51 @@ func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline if time.Now().After(applyDeadline) { if cond != nil && cond.Status == metav1.ConditionFalse { // failed - klog.V(3).Infof("the cluster resource placement `%s` failed", crpName) + klog.V(2).Infof("the cluster resource placement `%s` failed", crpName) LoadTestApplyErrorCount.WithLabelValues(currency, 
fleetSize, "failed").Inc() } else { // timeout - klog.V(3).Infof("the cluster resource placement `%s` timeout", crpName) + klog.V(2).Infof("the cluster resource placement `%s` timeout", crpName) LoadTestApplyErrorCount.WithLabelValues(currency, fleetSize, "timeout").Inc() } break } } - return startTime } -func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline time.Duration, crpName string, clusterNames ClusterNames, currency string, fleetSize string) { +func collectDeleteMetrics(ctx context.Context, hubClient client.WithWatch, deadline, pollInterval time.Duration, crpName string, clusterNames ClusterNames, currency string, fleetSize string) { var crp v1alpha1.ClusterResourcePlacement startTime := time.Now() - deleteDeadline := startTime.Add(deadline * 3) + deleteDeadline := startTime.Add(deadline) klog.Infof("verify that the cluster resource placement `%s` is deleted", crpName) - for { + for ; ctx.Err() == nil; time.Sleep(pollInterval) { if err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { - time.Sleep(interval) + klog.ErrorS(err, "failed to get crp", "crp", crpName) continue } - if len(crp.Status.SelectedResources) != 1 { - klog.V(3).Infof("the crp `%s` has not picked up the namespaced resource deleted change", crpName) - // wait - time.Sleep(interval) + // the only thing it still selects are namespace and crd + if len(crp.Status.SelectedResources) != 2 { + klog.V(4).Infof("the crp `%s` has not picked up the namespaced resource deleted change", crpName) + if time.Now().After(deleteDeadline) { + // timeout + klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) + LoadTestDeleteErrorCount.WithLabelValues(currency, fleetSize).Inc() + break + } continue } // check if the change is picked up by the member agent var clusterWork workv1alpha1.Work - removed := true + allRemoved := true for _, clusterName := range clusterNames { err := hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: fmt.Sprintf(utils.NamespaceNameFormat, clusterName)}, &clusterWork) - if err != nil || (err == nil && len(clusterWork.Status.ManifestConditions) != 1) { - klog.V(3).Infof("the resources `%s` in cluster namespace `%s` is not removed by the member agent yet", crpName, clusterName) - removed = false + if err != nil || (err == nil && len(clusterWork.Status.ManifestConditions) != 2) { + klog.V(4).Infof("the resources `%s` in cluster namespace `%s` is not removed by the member agent yet", crpName, clusterName) + allRemoved = false break } } - if !removed { - // wait - time.Sleep(interval) - } else { + if allRemoved { // succeeded klog.V(3).Infof("the cluster resource placement `%s` delete succeeded", crpName) LoadTestDeleteSuccessCount.WithLabelValues(currency, fleetSize).Inc() From 3e01bb39f05819a2b88ee36907ff40efe507c6a1 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 12 Oct 2022 14:35:57 -0700 Subject: [PATCH 3/6] refine --- hack/loadtest/main.go | 9 +++++---- hack/loadtest/util/placement.go | 18 +++++++++--------- test/e2e/README.md | 10 +++++++--- test/e2e/prometheus.yml | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 16 deletions(-) create mode 100644 test/e2e/prometheus.yml diff --git a/hack/loadtest/main.go b/hack/loadtest/main.go index 7e01d6fd3..b84fdd50c 100644 --- a/hack/loadtest/main.go +++ b/hack/loadtest/main.go @@ -54,7 +54,7 @@ func main() { flag.Parse() defer klog.Flush() - klog.InfoS("start to run placement load test", "placementDeadline", *placementDeadline, 
"maxCurrentPlacement", *maxCurrentPlacement, "clusterNames", clusterNames) + klog.InfoS("start to run placement load test", "pollInterval", *pollInterval, "placementDeadline", *placementDeadline, "maxCurrentPlacement", *maxCurrentPlacement, "clusterNames", clusterNames) config := config.GetConfigOrDie() config.QPS, config.Burst = float32(100), 500 hubClient, err := client.New(config, client.Options{ @@ -82,9 +82,10 @@ func runLoadTest(ctx context.Context, config *rest.Config) { wg.Add(*maxCurrentPlacement) for i := 0; i < *maxCurrentPlacement; i++ { go func() { - // each use a separate client to avoid client side throttling - time.Sleep(time.Millisecond * time.Duration(utilrand.Intn(1000))) - hubClient, err := client.NewWithWatch(config, client.Options{ + // each use a separate client to avoid client side throttling, start each client side with a jitter + // to avoid creating too many clients at the same time. + time.Sleep(time.Millisecond * time.Duration(utilrand.Intn(100**maxCurrentPlacement))) + hubClient, err := client.New(config, client.Options{ Scheme: scheme, }) if err != nil { diff --git a/hack/loadtest/util/placement.go b/hack/loadtest/util/placement.go index ac1c0efa5..3781d2f92 100644 --- a/hack/loadtest/util/placement.go +++ b/hack/loadtest/util/placement.go @@ -37,37 +37,37 @@ var ( LoadTestApplyErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_apply_errors_total", Help: "Total number of placement errors", - }, []string{"currency", "fleetSize", "mode"}) + }, []string{"concurrency", "fleetSize", "mode"}) LoadTestApplySuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_apply_total", Help: "Total number of placement", - }, []string{"currency", "fleetSize"}) + }, []string{"concurrency", "fleetSize"}) LoadTestApplyLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "workload_apply_latency", Help: "Length of time from placement change to it is applied to the member cluster", Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, - }, []string{"currency", "fleetSize"}) + }, []string{"concurrency", "fleetSize"}) LoadTestDeleteErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_delete_errors_total", Help: "Total number of placement delete errors", - }, []string{"currency", "fleetSize"}) + }, []string{"concurrency", "fleetSize"}) LoadTestDeleteSuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_delete_total", Help: "Total number of placement deleted", - }, []string{"currency", "fleetSize"}) + }, []string{"concurrency", "fleetSize"}) LoadTestDeleteLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "workload_delete_latency", Help: "Length of time from resource deletion to it is deleted from the member cluster", Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, - }, []string{"currency", "fleetSize"}) + }, []string{"concurrency", "fleetSize"}) ) -func MeasureOnePlacement(ctx context.Context, hubClient client.WithWatch, deadline, interval time.Duration, maxCurrentPlacement int, clusterNames ClusterNames) error { +func MeasureOnePlacement(ctx context.Context, hubClient client.Client, deadline, interval time.Duration, maxCurrentPlacement int, clusterNames ClusterNames) error { crpName := crpPrefix + utilrand.String(10) nsName := nsPrefix + 
utilrand.String(10) fleetSize := strconv.Itoa(len(clusterNames)) @@ -126,7 +126,7 @@ func MeasureOnePlacement(ctx context.Context, hubClient client.WithWatch, deadli } // collect the crp apply metrics -func collectApplyMetrics(ctx context.Context, hubClient client.WithWatch, deadline, pollInterval time.Duration, crpName string, currency string, fleetSize string) { +func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline, pollInterval time.Duration, crpName string, currency string, fleetSize string) { startTime := time.Now() applyDeadline := startTime.Add(deadline) var crp v1alpha1.ClusterResourcePlacement @@ -162,7 +162,7 @@ func collectApplyMetrics(ctx context.Context, hubClient client.WithWatch, deadli } } -func collectDeleteMetrics(ctx context.Context, hubClient client.WithWatch, deadline, pollInterval time.Duration, crpName string, clusterNames ClusterNames, currency string, fleetSize string) { +func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline, pollInterval time.Duration, crpName string, clusterNames ClusterNames, currency string, fleetSize string) { var crp v1alpha1.ClusterResourcePlacement startTime := time.Now() deleteDeadline := startTime.Add(deadline) diff --git a/test/e2e/README.md b/test/e2e/README.md index badfd114a..fbdfaed21 100644 --- a/test/e2e/README.md +++ b/test/e2e/README.md @@ -48,9 +48,9 @@ check the logs of the member cluster controller kubectl --context=kind-member-testing -n fleet-system get pod ``` -check the hub metrics +5. check the hub metrics ```shell -kubectl --context=kind-hub-testing -n fleet-system port-forward hub-agent-xxxx-xxx 13622:8080 +kubectl --context=kind-hub-testing -n fleet-system port-forward hub-agent-8bb6d658-6jj7n 13622:8080 Forwarding from 127.0.0.1:13622 -> 8080 Forwarding from [::1]:13622 -> 8080 @@ -58,8 +58,12 @@ Forwarding from [::1]:13622 -> 8080 curl http://127.0.0.1:13622/metrics ``` +Use a local prometheus to draw graphs. Download prometheus binary for your local machine. Start the prometheus. +```shell +prometheus --config.file=test/e2e/prometheus.yml +``` -5.uninstall the resources +6.uninstall the resources ```shell make uninstall-helm make clean-e2e-tests diff --git a/test/e2e/prometheus.yml b/test/e2e/prometheus.yml new file mode 100644 index 000000000..d307b5543 --- /dev/null +++ b/test/e2e/prometheus.yml @@ -0,0 +1,17 @@ +global: + scrape_interval: 15s # By default, scrape targets every 15 seconds. + + # Attach these labels to any time series or alerts when communicating with + # external systems (federation, remote storage, Alertmanager). + external_labels: + monitor: 'fleet-monitor' + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's fleet load test. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'fleet-e2e-test' + # Override the global default and scrape targets from this job every 5 seconds. 
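    # note: the target below must match the local port chosen for
    # `kubectl port-forward ... 13622:8080` in test/e2e/README.md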
+ scrape_interval: 5s + static_configs: + - targets: ['localhost:13622'] From 3660d77cd42392a8d49dc3ab778336222cf6271f Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Thu, 13 Oct 2022 14:44:05 -0700 Subject: [PATCH 4/6] add test time length --- hack/loadtest/main.go | 7 ++++--- hack/loadtest/util/placement.go | 32 +++++++++++--------------------- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/hack/loadtest/main.go b/hack/loadtest/main.go index b84fdd50c..0d94a232b 100644 --- a/hack/loadtest/main.go +++ b/hack/loadtest/main.go @@ -36,6 +36,7 @@ var ( placementDeadline = flag.Int("placement-deadline-second", 300, "The deadline for a placement to be applied (in seconds)") pollInterval = flag.Int("poll-interval-millisecond", 250, "The poll interval for verification (in milli-second)") maxCurrentPlacement = flag.Int("max-current-placement", 10, "The number of current placement load.") + loadTestLength = flag.Int("load-test-length-minute", 15, "The length of the load test in miniutes.") clusterNames util.ClusterNames ) @@ -67,9 +68,9 @@ func main() { if err = util.ApplyClusterScopeManifests(ctx, hubClient); err != nil { panic(err) } - + loadTestCtx, _ := context.WithDeadline(ctx, time.Now().Add(time.Minute*time.Duration(*loadTestLength))) // run the loadtest in the background - go runLoadTest(ctx, config) + go runLoadTest(loadTestCtx, config) // setup prometheus server http.Handle("/metrics", promhttp.Handler()) if err = http.ListenAndServe(":4848", nil); err != nil { @@ -112,5 +113,5 @@ func runLoadTest(ctx context.Context, config *rest.Config) { if err := util.CleanupAll(hubClient); err != nil { klog.ErrorS(err, "clean up placement load test hit an error") } - klog.InfoS(" placement load test finished") + klog.InfoS(" placement load test finished, please check the metrics") } diff --git a/hack/loadtest/util/placement.go b/hack/loadtest/util/placement.go index 3781d2f92..01c511243 100644 --- a/hack/loadtest/util/placement.go +++ b/hack/loadtest/util/placement.go @@ -34,15 +34,10 @@ var ( ) var ( - LoadTestApplyErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "workload_apply_errors_total", - Help: "Total number of placement errors", - }, []string{"concurrency", "fleetSize", "mode"}) - - LoadTestApplySuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ + LoadTestApplyCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_apply_total", Help: "Total number of placement", - }, []string{"concurrency", "fleetSize"}) + }, []string{"concurrency", "fleetSize", "result"}) LoadTestApplyLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "workload_apply_latency", @@ -50,15 +45,10 @@ var ( Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, }, []string{"concurrency", "fleetSize"}) - LoadTestDeleteErrorCount = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "workload_delete_errors_total", - Help: "Total number of placement delete errors", - }, []string{"concurrency", "fleetSize"}) - - LoadTestDeleteSuccessCount = promauto.NewCounterVec(prometheus.CounterOpts{ + LoadTestDeleteCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_delete_total", - Help: "Total number of placement deleted", - }, []string{"concurrency", "fleetSize"}) + Help: "Total number of placement delete", + }, []string{"concurrency", "fleetSize", "result"}) LoadTestDeleteLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: 
"workload_delete_latency", @@ -143,7 +133,7 @@ func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline, } else if cond != nil && cond.Status == metav1.ConditionTrue { // succeeded klog.V(3).Infof("the cluster resource placement `%s` succeeded", crpName) - LoadTestApplySuccessCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestApplyCount.WithLabelValues(currency, fleetSize, "succeed").Inc() LoadTestApplyLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) break } @@ -151,11 +141,11 @@ func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline, if cond != nil && cond.Status == metav1.ConditionFalse { // failed klog.V(2).Infof("the cluster resource placement `%s` failed", crpName) - LoadTestApplyErrorCount.WithLabelValues(currency, fleetSize, "failed").Inc() + LoadTestApplyCount.WithLabelValues(currency, fleetSize, "failed").Inc() } else { // timeout klog.V(2).Infof("the cluster resource placement `%s` timeout", crpName) - LoadTestApplyErrorCount.WithLabelValues(currency, fleetSize, "timeout").Inc() + LoadTestApplyCount.WithLabelValues(currency, fleetSize, "timeout").Inc() } break } @@ -178,7 +168,7 @@ func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline if time.Now().After(deleteDeadline) { // timeout klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) - LoadTestDeleteErrorCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "timeout").Inc() break } continue @@ -197,14 +187,14 @@ func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline if allRemoved { // succeeded klog.V(3).Infof("the cluster resource placement `%s` delete succeeded", crpName) - LoadTestDeleteSuccessCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "succeed").Inc() LoadTestDeleteLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) break } if time.Now().After(deleteDeadline) { // timeout klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) - LoadTestDeleteErrorCount.WithLabelValues(currency, fleetSize).Inc() + LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "timeout").Inc() break } } From be07cd076c1ce2d36c11825ac87136185fb5cb7e Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Thu, 13 Oct 2022 16:14:28 -0700 Subject: [PATCH 5/6] print out the count at the end --- hack/loadtest/main.go | 6 ++-- hack/loadtest/util/placement.go | 53 ++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/hack/loadtest/main.go b/hack/loadtest/main.go index 0d94a232b..4b8040c9a 100644 --- a/hack/loadtest/main.go +++ b/hack/loadtest/main.go @@ -68,7 +68,8 @@ func main() { if err = util.ApplyClusterScopeManifests(ctx, hubClient); err != nil { panic(err) } - loadTestCtx, _ := context.WithDeadline(ctx, time.Now().Add(time.Minute*time.Duration(*loadTestLength))) + loadTestCtx, canFunc := context.WithDeadline(ctx, time.Now().Add(time.Minute*time.Duration(*loadTestLength))) + defer canFunc() // run the loadtest in the background go runLoadTest(loadTestCtx, config) // setup prometheus server @@ -113,5 +114,6 @@ func runLoadTest(ctx context.Context, config *rest.Config) { if err := util.CleanupAll(hubClient); err != nil { klog.ErrorS(err, "clean up placement load test hit an error") } - klog.InfoS(" placement load test finished, please check the metrics") + 
util.PrintTestMetrics() + klog.InfoS(" placement load test finished. For more metrics, please use prometheus") } diff --git a/hack/loadtest/util/placement.go b/hack/loadtest/util/placement.go index 01c511243..54f3acf21 100644 --- a/hack/loadtest/util/placement.go +++ b/hack/loadtest/util/placement.go @@ -12,6 +12,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "go.uber.org/atomic" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -34,26 +35,31 @@ var ( ) var ( - LoadTestApplyCount = promauto.NewCounterVec(prometheus.CounterOpts{ + applySuccessCount atomic.Int32 + applyFailCount atomic.Int32 + applyTimeoutCount atomic.Int32 + LoadTestApplyCountMetric = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_apply_total", Help: "Total number of placement", }, []string{"concurrency", "fleetSize", "result"}) - LoadTestApplyLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + LoadTestApplyLatencyMetric = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "workload_apply_latency", Help: "Length of time from placement change to it is applied to the member cluster", - Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, + Buckets: []float64{0.1, 0.5, 1.0, 2.0, 3, 4, 6, 8, 10, 13, 16, 20, 23, 26, 30, 37, 45, 60, 90, 120, 150, 180, 300, 600, 1200, 1500, 3000}, }, []string{"concurrency", "fleetSize"}) - LoadTestDeleteCount = promauto.NewCounterVec(prometheus.CounterOpts{ + deleteSuccessCount atomic.Int32 + deleteTimeoutCount atomic.Int32 + LoadTestDeleteCountMetric = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "workload_delete_total", Help: "Total number of placement delete", }, []string{"concurrency", "fleetSize", "result"}) - LoadTestDeleteLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{ + LoadTestDeleteLatencyMetric = promauto.NewHistogramVec(prometheus.HistogramOpts{ Name: "workload_delete_latency", Help: "Length of time from resource deletion to it is deleted from the member cluster", - Buckets: []float64{0.1, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0, 1.25, 1.5, 1.75, 2.0, 3.0, 4.0, 5, 7, 9, 10, 13, 17, 20, 23, 27, 30, 36, 45, 60, 90, 120, 150, 180}, + Buckets: []float64{0.1, 0.5, 1.0, 1.25, 1.5, 1.75, 2.0, 3, 4, 6, 8, 10, 13, 16, 20, 23, 26, 30, 37, 45, 60, 90, 120, 150, 180, 300, 600, 1200, 1500, 3000}, }, []string{"concurrency", "fleetSize"}) ) @@ -124,6 +130,13 @@ func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline, for ; ctx.Err() == nil; time.Sleep(pollInterval) { if err = hubClient.Get(ctx, types.NamespacedName{Name: crpName, Namespace: ""}, &crp); err != nil { klog.ErrorS(err, "failed to get crp", "crp", crpName) + if time.Now().After(applyDeadline) { + // timeout + klog.V(2).Infof("the cluster resource placement `%s` timeout", crpName) + LoadTestApplyCountMetric.WithLabelValues(currency, fleetSize, "timeout").Inc() + applyTimeoutCount.Inc() + break + } continue } // check if the condition is true @@ -133,19 +146,22 @@ func collectApplyMetrics(ctx context.Context, hubClient client.Client, deadline, } else if cond != nil && cond.Status == metav1.ConditionTrue { // succeeded klog.V(3).Infof("the cluster resource placement `%s` succeeded", crpName) - LoadTestApplyCount.WithLabelValues(currency, fleetSize, "succeed").Inc() - 
LoadTestApplyLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) + LoadTestApplyCountMetric.WithLabelValues(currency, fleetSize, "succeed").Inc() + applySuccessCount.Inc() + LoadTestApplyLatencyMetric.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) break } if time.Now().After(applyDeadline) { if cond != nil && cond.Status == metav1.ConditionFalse { // failed klog.V(2).Infof("the cluster resource placement `%s` failed", crpName) - LoadTestApplyCount.WithLabelValues(currency, fleetSize, "failed").Inc() + LoadTestApplyCountMetric.WithLabelValues(currency, fleetSize, "failed").Inc() + applyFailCount.Inc() } else { // timeout klog.V(2).Infof("the cluster resource placement `%s` timeout", crpName) - LoadTestApplyCount.WithLabelValues(currency, fleetSize, "timeout").Inc() + LoadTestApplyCountMetric.WithLabelValues(currency, fleetSize, "timeout").Inc() + applyTimeoutCount.Inc() } break } @@ -168,7 +184,8 @@ func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline if time.Now().After(deleteDeadline) { // timeout klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) - LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "timeout").Inc() + LoadTestDeleteCountMetric.WithLabelValues(currency, fleetSize, "timeout").Inc() + deleteTimeoutCount.Inc() break } continue @@ -187,15 +204,23 @@ func collectDeleteMetrics(ctx context.Context, hubClient client.Client, deadline if allRemoved { // succeeded klog.V(3).Infof("the cluster resource placement `%s` delete succeeded", crpName) - LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "succeed").Inc() - LoadTestDeleteLatency.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) + LoadTestDeleteCountMetric.WithLabelValues(currency, fleetSize, "succeed").Inc() + deleteSuccessCount.Inc() + LoadTestDeleteLatencyMetric.WithLabelValues(currency, fleetSize).Observe(time.Since(startTime).Seconds()) break } if time.Now().After(deleteDeadline) { // timeout klog.V(3).Infof("the cluster resource placement `%s` delete timeout", crpName) - LoadTestDeleteCount.WithLabelValues(currency, fleetSize, "timeout").Inc() + LoadTestDeleteCountMetric.WithLabelValues(currency, fleetSize, "timeout").Inc() + deleteTimeoutCount.Inc() break } } } + +func PrintTestMetrics() { + klog.InfoS("Placement apply result", "total applySuccessCount", applySuccessCount.Load(), "applyFailCount", applyFailCount.Load(), "applyTimeoutCount", applyTimeoutCount.Load()) + + klog.InfoS("Placement delete result", "total deleteSuccessCount", deleteSuccessCount.Load(), "deleteTimeoutCount", deleteTimeoutCount.Load()) +} From db07eb03ce34a6ee1585bd8824132e33daa44e9f Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Thu, 13 Oct 2022 18:24:35 -0700 Subject: [PATCH 6/6] fix flaky test --- .../work/apply_controller_integration_test.go | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pkg/controllers/work/apply_controller_integration_test.go b/pkg/controllers/work/apply_controller_integration_test.go index 09ce74f79..eb8c167c9 100644 --- a/pkg/controllers/work/apply_controller_integration_test.go +++ b/pkg/controllers/work/apply_controller_integration_test.go @@ -588,17 +588,7 @@ var _ = Describe("Work Controller", func() { for i := 0; i < numWork; i++ { var resultWork workv1alpha1.Work Expect(k8sClient.Get(ctx, types.NamespacedName{Name: works[i].GetName(), Namespace: workNamespace}, &resultWork)).Should(Succeed()) - 
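			// flake fix: the assertion now checks the freshly fetched resultWork instead of the
			// shared work variable, and the conditional config-map verification is dropped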
Expect(controllerutil.ContainsFinalizer(work, workFinalizer)).Should(BeFalse()) - applyCond := meta.FindStatusCondition(resultWork.Status.Conditions, ConditionTypeApplied) - if applyCond != nil && applyCond.Status == metav1.ConditionTrue && applyCond.ObservedGeneration == resultWork.Generation { - By("the work is applied, check if the applied config map is still there") - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: cmNames[i], Namespace: cmNamespace}, &configMap)).Should(Succeed()) - Expect(cmp.Diff(configMap.Data, data)).Should(BeEmpty()) - } else { - By("the work is not applied, verify that the applied config map is not there") - err := k8sClient.Get(ctx, types.NamespacedName{Name: cmNames[i], Namespace: cmNamespace}, &configMap) - Expect(apierrors.IsNotFound(err)).Should(BeTrue()) - } + Expect(controllerutil.ContainsFinalizer(&resultWork, workFinalizer)).Should(BeFalse()) // make sure that leave can be called as many times as possible Expect(workController.Leave(ctx)).Should(Succeed()) By(fmt.Sprintf("change the work = %s", work.GetName()))
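
Taken together, the apply and delete verification loops in these patches share one shape: poll the hub at a fixed interval, stop on success or deadline, and record the outcome on a single counter vector keyed by a result label. The sketch below distills that pattern under assumed names (`loadtest_operation_total`, `pollWithDeadline`, `checkFn`); it is an illustration of the technique, not the fleet load test code itself.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// resultCount mirrors the consolidated counter shape used by the load test:
// one counter vector with a "result" label instead of separate success/error metrics.
var resultCount = promauto.NewCounterVec(prometheus.CounterOpts{
	Name: "loadtest_operation_total",
	Help: "Total number of load test operations by result",
}, []string{"concurrency", "fleetSize", "result"})

// pollWithDeadline polls checkFn every pollInterval until it reports success,
// the deadline passes, or ctx is cancelled, then records the outcome.
func pollWithDeadline(ctx context.Context, deadline, pollInterval time.Duration,
	concurrency, fleetSize string, checkFn func(context.Context) (bool, error)) {
	start := time.Now()
	for ; ctx.Err() == nil; time.Sleep(pollInterval) {
		ok, err := checkFn(ctx)
		if err == nil && ok {
			resultCount.WithLabelValues(concurrency, fleetSize, "succeed").Inc()
			return
		}
		// transient errors and "not yet" results are simply retried until the deadline
		if time.Since(start) > deadline {
			resultCount.WithLabelValues(concurrency, fleetSize, "timeout").Inc()
			return
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()
	pollWithDeadline(ctx, 2*time.Second, 250*time.Millisecond, "10", "3",
		func(context.Context) (bool, error) {
			fmt.Println("condition not met yet")
			return false, nil // never succeeds, so this run ends with result="timeout"
		})
}
```

Folding success, failure, and timeout into one counter keeps the PromQL side simple: the success ratio is the `result="succeed"` series divided by the sum over all result values for the same concurrency and fleetSize labels.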