diff --git a/.github/ISSUE_TEMPLATE/blank_issue.md b/.github/ISSUE_TEMPLATE/blank_issue.md new file mode 100644 index 000000000..dd6ebabf8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/blank_issue.md @@ -0,0 +1,8 @@ +--- +name: Blank Issue +about: Create a new issue from scratch +title: '' +labels: needs-triage +assignees: '' + +--- \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug_request.md b/.github/ISSUE_TEMPLATE/bug_request.md index c2597eb32..15ed35e12 100644 --- a/.github/ISSUE_TEMPLATE/bug_request.md +++ b/.github/ISSUE_TEMPLATE/bug_request.md @@ -1,7 +1,9 @@ --- name: Bug Report about: Report a bug you encountered -labels: kind/bug +title: '' +labels: kind/bug, needs-triage +assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 53a885c7c..1eee5871b 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,7 +2,7 @@ name: Feature request about: Suggest an idea for this project title: '' -labels: '' +labels: needs-triage assignees: '' --- @@ -12,4 +12,3 @@ assignees: '' **What would you like to be added**: **Why is this needed**: - diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md index be5698441..16ff164b6 100644 --- a/.github/ISSUE_TEMPLATE/new-release.md +++ b/.github/ISSUE_TEMPLATE/new-release.md @@ -4,6 +4,7 @@ about: Propose a new release title: Release v0.x.0 labels: '' assignees: '' + --- - [Introduction](#introduction) @@ -34,10 +35,14 @@ This document defines the process for releasing Gateway API Inference Extension. export RC=1 ``` -4. The vLLM image tag defaults to `v0.7.2` for a release. Set the `VLLM` environment variable if a newer [tag][vllm-tag] has been published. For example: +4. Refer to the [release-quickstart script][release-quickstart] for the default image tags used + by the vLLM deployment manifests. If a newer [GPU][vllm-gpu-tag], [CPU][vllm-cpu-tag], or [Simulator][vllm-sim-tag] + tag has been published, set the appropriate environment variable or update the script. For example: ```shell - export VLLM=0.7.3 + export VLLM_GPU=0.9.2 + export VLLM_CPU=0.9.3 + export VLLM_SIM=0.1.2 ``` ## Release Process @@ -158,4 +163,7 @@ Use the following steps to announce the release. 
[k8s.io]: https://github.com/kubernetes/k8s.io
[yaml]: https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-gateway-api-inference-extension/images.yaml
[issue]: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/new/choose
-[vllm-tag]: https://hub.docker.com/r/vllm/vllm-openai/tags
+[vllm-gpu-tag]: https://hub.docker.com/r/vllm/vllm-openai/tags
+[vllm-cpu-tag]: https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo
+[vllm-sim-tag]: https://github.com/llm-d/llm-d-inference-sim/pkgs/container/llm-d-inference-sim
+[release-quickstart]: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/hack/release-quickstart.sh
diff --git a/Dockerfile b/Dockerfile
index 8fb00dfbf..551251807 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,6 +8,8 @@ FROM ${BUILDER_IMAGE} AS builder
 ENV CGO_ENABLED=0
 ENV GOOS=linux
 ENV GOARCH=amd64
+ARG COMMIT_SHA=unknown
+ARG BUILD_REF

 # Dependencies
 WORKDIR /src
@@ -15,12 +17,13 @@ COPY go.mod go.sum ./
 RUN go mod download

 # Sources
-COPY cmd ./cmd
-COPY pkg ./pkg
+COPY cmd/epp ./cmd/epp
+COPY pkg/epp ./pkg/epp
+COPY conformance/testing-epp ./conformance/testing-epp
 COPY internal ./internal
 COPY api ./api
 WORKDIR /src/cmd/epp
-RUN go build -o /epp
+RUN go build -ldflags="-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.CommitSHA=${COMMIT_SHA} -X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=${BUILD_REF}" -o /epp

 ## Multistage deploy
 FROM ${BASE_IMAGE}
diff --git a/Makefile b/Makefile
index 66fe89d4a..3e80ac4ed 100644
--- a/Makefile
+++ b/Makefile
@@ -21,6 +21,7 @@ CONTAINER_TOOL ?= docker
 SHELL = /usr/bin/env bash -o pipefail
 .SHELLFLAGS = -ec

+GIT_COMMIT_SHA ?= "$(shell git rev-parse HEAD 2>/dev/null)"
 GIT_TAG ?= $(shell git describe --tags --dirty --always)
 PLATFORMS ?= linux/amd64
 DOCKER_BUILDX_CMD ?= docker buildx
@@ -34,7 +35,16 @@ IMAGE_NAME := epp
 IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
 IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
 PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
-E2E_MANIFEST_PATH ?= config/manifests/vllm/gpu-deployment.yaml
+# The path to the E2E manifest file. It can be overridden by setting the
+# E2E_MANIFEST_PATH environment variable. Note that HF_TOKEN must be set when using the GPU-based manifest.
+E2E_MANIFEST_PATH ?= config/manifests/vllm/sim-deployment.yaml
+# E2E_IMAGE specifies the image to be used when running e2e tests using make test-e2e.
+# It defaults to the current image tag, but can be overridden to test specific tags, releases, etc.
+E2E_IMAGE ?= $(IMAGE_TAG)
+# E2E_USE_KIND is a flag used by the test-e2e target. When set to true, it loads the e2e image into the kind cluster.
+# It is also possible to run e2e tests against clusters other than kind; in that case, it is the user's responsibility to load
+# the image into the cluster.
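+# An illustrative invocation (the registry and tag below are placeholders, not defaults):
+#   HF_TOKEN=<token> make test-e2e E2E_MANIFEST_PATH=config/manifests/vllm/gpu-deployment.yaml \
+#     E2E_IMAGE=<registry>/epp:<tag> E2E_USE_KIND=false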
+E2E_USE_KIND ?= true SYNCER_IMAGE_NAME := lora-syncer SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME) @@ -50,10 +60,12 @@ ifdef GO_VERSION BUILDER_IMAGE = golang:$(GO_VERSION) endif +BUILD_REF ?= $(shell git describe --abbrev=0 2>/dev/null) ifdef EXTRA_TAG IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG) SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG) BBR_IMAGE_EXTRA_TAG ?= $(BBR_IMAGE_REPO):$(EXTRA_TAG) +BUILD_REF = $(EXTRA_TAG) endif ifdef IMAGE_EXTRA_TAG IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG) @@ -89,7 +101,7 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust .PHONY: generate generate: controller-gen code-generator manifests ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. - $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." + $(CONTROLLER_GEN) object:headerFile="hack/boilerplate/boilerplate.generatego.txt" paths="./..." ./hack/update-codegen.sh # Use same code-generator version as k8s.io/api @@ -121,15 +133,19 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest image-build ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out + CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e | grep -v /conformance) -race -coverprofile cover.out + +.PHONY: test-unit +test-unit: ## Run unit tests. + CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -race -coverprofile cover.out .PHONY: test-integration -test-integration: ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/... -race -coverprofile cover.out +test-integration: ## Run integration tests. + CGO_ENABLED=1 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/... -race -coverprofile cover.out .PHONY: test-e2e -test-e2e: ## Run end-to-end tests against an existing Kubernetes cluster. When using default configuration, the tests need at least 3 available GPUs. - MANIFEST_PATH=$(PROJECT_DIR)/$(E2E_MANIFEST_PATH) go test ./test/e2e/epp/ -v -ginkgo.v +test-e2e: ## Run end-to-end tests against an existing Kubernetes cluster. + MANIFEST_PATH=$(PROJECT_DIR)/$(E2E_MANIFEST_PATH) E2E_IMAGE=$(E2E_IMAGE) USE_KIND=$(E2E_USE_KIND) ./hack/test-e2e.sh .PHONY: lint lint: golangci-lint ## Run golangci-lint linter @@ -144,9 +160,14 @@ ci-lint: golangci-lint $(GOLANGCI_LINT) run --timeout 15m0s .PHONY: verify -verify: vet fmt-verify manifests generate ci-lint +verify: vet fmt-verify manifests generate ci-lint verify-all git --no-pager diff --exit-code config api client-go +# Run static analysis. +.PHONY: verify-all +verify-all: + hack/verify-all.sh -v + ##@ Build # Build the container image @@ -171,6 +192,8 @@ image-build: ## Build the EPP image using Docker Buildx. 
--platform=$(PLATFORMS) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ + --build-arg COMMIT_SHA=${GIT_COMMIT_SHA} \ + --build-arg BUILD_REF=${BUILD_REF} \ $(PUSH) \ $(LOAD) \ $(IMAGE_BUILD_EXTRA_OPTS) ./ @@ -232,7 +255,7 @@ bbr-image-local-load: bbr-image-local-build .PHONY: bbr-image-build bbr-image-build: ## Build the image using Docker Buildx. - $(IMAGE_BUILD_CMD) -f body-based-routing.Dockerfile -t $(BBR_IMAGE_TAG) \ + $(IMAGE_BUILD_CMD) -f bbr.Dockerfile -t $(BBR_IMAGE_TAG) \ --platform=$(PLATFORMS) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ @@ -291,7 +314,6 @@ install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - - ##@ Helm PHONY: inferencepool-helm-chart-push inferencepool-helm-chart-push: yq helm @@ -322,7 +344,7 @@ release: artifacts release-quickstart verify test # Create a release. ## Location to install dependencies to LOCALBIN ?= $(shell pwd)/bin $(LOCALBIN): - mkdir -p $(LOCALBIN) + [ -d $@ ] || mkdir -p $@ ## Tool Binaries KUBECTL ?= kubectl diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES index 6e8e0c5dc..32a04934c 100644 --- a/OWNERS_ALIASES +++ b/OWNERS_ALIASES @@ -5,12 +5,14 @@ aliases: gateway-api-inference-extension-maintainers: - ahg-g - danehans - - jeffwan + - nirrozenbaum - kfswain gateway-api-inference-extension-reviewers: - liu-cong - robscott + - shaneutt + wg-serving-leads: - ArangoGutierrez diff --git a/PROJECT b/PROJECT index 75c9c9cca..c049fc8e1 100644 --- a/PROJECT +++ b/PROJECT @@ -24,4 +24,12 @@ resources: kind: InferenceModel path: sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1 version: v1alpha1 +- api: + crdVersion: v1 + namespaced: true + domain: x-k8s.io + group: inference + kind: EndpointPickerConfig + path: sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1 + version: v1alpha1 version: "3" diff --git a/README.md b/README.md index 2ff00581a..8aa8630d5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,67 @@ -# Gateway API Inference Extension +[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension) +[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension) +[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE) -This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee. 
+# Gateway API Inference Extension
-The inference gateway:
+Gateway API Inference Extension optimizes self-hosting Generative Models on Kubernetes.
+This is achieved by leveraging Envoy's [External Processing] (ext-proc) to extend any gateway that supports both ext-proc and [Gateway API] into an **[inference gateway]**.
+
+[Inference Gateway]:#concepts-and-definitions
+
+## New!
+Inference Gateway has partnered with vLLM to accelerate LLM serving optimizations with [llm-d](https://llm-d.ai/blog/llm-d-announce)!
+
+## Concepts and Definitions
+
+The following terms are specific to this project:
+
+- **Inference Gateway (IGW)**: A proxy/load-balancer which has been coupled with an
+  `Endpoint Picker`. It provides optimized routing and load balancing for
+  serving Kubernetes self-hosted generative Artificial Intelligence (AI)
+  workloads. It simplifies the deployment, management, and observability of AI
+  inference workloads.
+- **Inference Scheduler**: An extendable component that makes decisions about which endpoint is optimal (best cost /
+  best performance) for an inference request based on `Metrics and Capabilities`
+  from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
+- **Metrics and Capabilities**: Data provided by model serving platforms about
+  performance, availability and capabilities to optimize routing. Includes
+  things like [Prefix Cache] status or [LoRA Adapters] availability.
+- **Endpoint Picker (EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP is available [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).
+
+
+The following are key industry terms that are important to understand for
+this project:
+
+- **Model**: A generative AI model that has learned patterns from data and is
+  used for inference. Models vary in size and architecture, from smaller
+  domain-specific models to massive multi-billion parameter neural networks that
+  are optimized for diverse language tasks.
+- **Inference**: The process of running a generative AI model, such as a large
+  language model or diffusion model, to generate text, embeddings, or other
+  outputs from input data.
+- **Model server**: A service (in our case, containerized) responsible for
+  receiving inference requests and returning predictions from a model.
+- **Accelerator**: Specialized hardware, such as Graphics Processing Units
+  (GPUs), that can be attached to Kubernetes nodes to speed up computations,
+  particularly for training and inference tasks.
+
+
+For deeper insights and more advanced concepts, refer to our [proposals](/docs/proposals).
+
+[Inference]:https://www.digitalocean.com/community/tutorials/llm-inference-optimization
+[Gateway API]:https://github.com/kubernetes-sigs/gateway-api
+[Prefix Cache]:https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html
+[LoRA Adapters]:https://docs.vllm.ai/en/stable/features/lora.html
+[External Processing]:https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter
+
+
+
+## Technical Overview
+
+This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **[inference gateway]** - supporting inference platform teams self-hosting Generative Models (with a current focus on large language models) on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.
+
+The Inference Gateway:

 * Improves the tail latency and throughput of LLM completion requests against Kubernetes-hosted model servers using an extensible request scheduling algorithm that is kv-cache and request cost aware, avoiding evictions or queueing as load increases
 * Provides [Kubernetes-native declarative APIs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/api-overview/) to route client model names to use-case specific LoRA adapters and control incremental rollout of new adapter versions, A/B traffic splitting, and safe blue-green base model and model server upgrades
@@ -11,23 +70,33 @@ The inference gateway:

 ![Architecture Diagram](./docs/inference-gateway-architecture.svg)

-It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon.
+### Model Server Integration
+
+IGW’s pluggable architecture was leveraged to enable the [llm-d Inference Scheduler](https://github.com/llm-d/llm-d-inference-scheduler).
+
+llm-d customizes vLLM and IGW to create a disaggregated serving solution. We've worked closely with this team to enable this integration. IGW will continue to work closely with llm-d to generalize the disaggregated serving plugin(s) and set a standard for disaggregated serving to be used across any [protocol-adherent](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) model server.
+
+IGW has enhanced support for vLLM via llm-d, and broad support for any model server implementing the protocol. More details can be found in [model server integration](https://gateway-api-inference-extension.sigs.k8s.io/implementations/model-servers/).

 ## Status

-This project is [alpha (0.2 release)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.2.0). It should not be used in production yet.
+![Latest Release](https://img.shields.io/github/v/release/kubernetes-sigs/gateway-api-inference-extension?)
+
+This project is in alpha.
+The latest release can be found [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest).
+It should not be used in production yet.

 ## Getting Started

 Follow our [Getting Started Guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/) to get the inference-extension up and running on your cluster!

-See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for detailed API documentation on leveraging our Kubernetes-native declarative APIs
+See [our website](https://gateway-api-inference-extension.sigs.k8s.io/) for detailed API documentation on leveraging our Kubernetes-native declarative APIs.

 ## Roadmap

 As Inference Gateway builds towards a GA release. We will continue to expand our capabilities, namely:
-1. Prefix-cache aware load balancing with interfaces for remote caches
-1. Recommended LoRA adapter pipeline for automated rollout
+
+1. Prefix-cache aware load balancing with interfaces for remote caches
+1. Recommended LoRA adapter pipeline for automated rollout
 1. Fairness and priority between workloads within the same criticality band
 1. HPA support for autoscaling on aggregate metrics derived from the load balancer
 1. Support for large multi-modal inputs and outputs
@@ -35,7 +104,6 @@ As Inference Gateway builds towards a GA release. We will continue to expand our
 1. Heterogeneous accelerators - serve workloads on multiple types of accelerator using latency and request cost-aware load balancing
 1. Disaggregated serving support with independently scaling pools

-
 ## End-to-End Tests

 Follow this [README](./test/e2e/epp/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster.

@@ -44,7 +112,7 @@ Follow this [README](./test/e2e/epp/README.md) to learn more about running the i
 Our community meeting is weekly at Thursday 10AM PDT ([Zoom](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://www.google.com/url?q=https://docs.google.com/document/d/1frfPE5L1sI3737rdQV04IcDGeOcGJj2ItjMg6z2SRH0/edit?usp%3Dsharing&sa=D&source=calendar&usd=2&usg=AOvVaw1pUVy7UN_2PMj8qJJcFm1U)).

-We currently utilize the [#wg-serving](https://kubernetes.slack.com/?redir=%2Fmessages%2Fwg-serving) slack channel for communications.
+We currently utilize the [#gateway-api-inference-extension](https://kubernetes.slack.com/?redir=%2Fmessages%2Fgateway-api-inference-extension) channel in the Kubernetes Slack workspace for communications.

 Contributions are readily welcomed, follow the [dev guide](./docs/dev.md) to start contributing!

diff --git a/api/config/v1alpha1/defaults.go b/api/config/v1alpha1/defaults.go
new file mode 100644
index 000000000..07f1f9a7c
--- /dev/null
+++ b/api/config/v1alpha1/defaults.go
@@ -0,0 +1,29 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+// SetDefaults_EndpointPickerConfig sets default values in a
+// EndpointPickerConfig struct.
+//
+// This naming convention is required by the defaulter-gen code.
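+//
+// For example, given a config whose Plugins list contains an entry with
+// Type "queue-scorer" (a hypothetical plugin type used for illustration)
+// and an empty Name, the entry's Name defaults to "queue-scorer", so
+// SchedulingProfile pluginRef entries can reference it by that name.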
+func SetDefaults_EndpointPickerConfig(cfg *EndpointPickerConfig) {
+	for idx, pluginConfig := range cfg.Plugins {
+		if pluginConfig.Name == "" {
+			cfg.Plugins[idx].Name = pluginConfig.Type
+		}
+	}
+}
diff --git a/api/config/v1alpha1/doc.go b/api/config/v1alpha1/doc.go
new file mode 100644
index 000000000..122c3b952
--- /dev/null
+++ b/api/config/v1alpha1/doc.go
@@ -0,0 +1,22 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package v1alpha1 contains API Schema definitions for the
+// inference.networking.x-k8s.io API group.
+//
+// +kubebuilder:object:generate=true
+// +groupName=inference.networking.x-k8s.io
+package v1alpha1
diff --git a/api/config/v1alpha1/endpointpickerconfig_types.go b/api/config/v1alpha1/endpointpickerconfig_types.go
new file mode 100644
index 000000000..92baa6c61
--- /dev/null
+++ b/api/config/v1alpha1/endpointpickerconfig_types.go
@@ -0,0 +1,92 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package v1alpha1
+
+import (
+	"encoding/json"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// +k8s:defaulter-gen=true
+// +kubebuilder:object:root=true
+
+// EndpointPickerConfig is the Schema for the endpointpickerconfigs API
+type EndpointPickerConfig struct {
+	metav1.TypeMeta `json:",inline"`
+
+	// +required
+	// +kubebuilder:validation:Required
+	// Plugins is the list of plugins that will be instantiated.
+	Plugins []PluginSpec `json:"plugins"`
+
+	// +required
+	// +kubebuilder:validation:Required
+	// SchedulingProfiles is the list of named SchedulingProfiles
+	// that will be created.
+	SchedulingProfiles []SchedulingProfile `json:"schedulingProfiles"`
+}
+
+// PluginSpec contains the information that describes a plugin that
+// will be instantiated.
+type PluginSpec struct {
+	// +optional
+	// Name provides a name for plugin entries to reference. If
+	// omitted, the value of the Plugin's Type field will be used.
+	Name string `json:"name"`
+
+	// +required
+	// +kubebuilder:validation:Required
+	// Type specifies the plugin type to be instantiated.
+	Type string `json:"type"`
+
+	// +optional
+	// Parameters are the set of parameters to be passed to the plugin's
+	// factory function. The factory function is responsible
+	// for parsing the parameters.
+	Parameters json.RawMessage `json:"parameters"`
+}
+
+// SchedulingProfile contains the information to create a SchedulingProfile
+// entry to be used by the scheduler.
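+//
+// An illustrative configuration sketch (the plugin type names below are
+// hypothetical, for documentation only):
+//
+//	plugins:
+//	- type: queue-scorer
+//	- name: profile-picker
+//	  type: max-score-picker
+//	schedulingProfiles:
+//	- name: default
+//	  plugins:
+//	  - pluginRef: queue-scorer
+//	    weight: 1
+//	  - pluginRef: profile-picker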
+type SchedulingProfile struct {
+	// +kubebuilder:validation:Required
+	// Name specifies the name of this SchedulingProfile.
+	Name string `json:"name"`
+
+	// +required
+	// +kubebuilder:validation:Required
+	// Plugins is the list of plugins for this SchedulingProfile. They are assigned
+	// to the appropriate "slots" based on their type.
+	Plugins []SchedulingPlugin `json:"plugins"`
+}
+
+// SchedulingPlugin describes a plugin that will be associated with a
+// SchedulingProfile entry.
+type SchedulingPlugin struct {
+	// +required
+	// +kubebuilder:validation:Required
+	// PluginRef specifies a particular Plugin instance to be associated with
+	// this SchedulingProfile. The reference is to the name of an
+	// entry of the Plugins defined in the configuration's Plugins
+	// section.
+	PluginRef string `json:"pluginRef"`
+
+	// +optional
+	// Weight is the weight to be used if this plugin is a Scorer.
+	Weight *int `json:"weight"`
+}
diff --git a/api/config/v1alpha1/zz_generated.deepcopy.go b/api/config/v1alpha1/zz_generated.deepcopy.go
new file mode 100644
index 000000000..1326f357b
--- /dev/null
+++ b/api/config/v1alpha1/zz_generated.deepcopy.go
@@ -0,0 +1,126 @@
+//go:build !ignore_autogenerated
+
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Code generated by controller-gen. DO NOT EDIT.
+
+package v1alpha1
+
+import (
+	"encoding/json"
+	runtime "k8s.io/apimachinery/pkg/runtime"
+)
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *EndpointPickerConfig) DeepCopyInto(out *EndpointPickerConfig) {
+	*out = *in
+	out.TypeMeta = in.TypeMeta
+	if in.Plugins != nil {
+		in, out := &in.Plugins, &out.Plugins
+		*out = make([]PluginSpec, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+	if in.SchedulingProfiles != nil {
+		in, out := &in.SchedulingProfiles, &out.SchedulingProfiles
+		*out = make([]SchedulingProfile, len(*in))
+		for i := range *in {
+			(*in)[i].DeepCopyInto(&(*out)[i])
+		}
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EndpointPickerConfig.
+func (in *EndpointPickerConfig) DeepCopy() *EndpointPickerConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(EndpointPickerConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
+// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
+func (in *EndpointPickerConfig) DeepCopyObject() runtime.Object {
+	if c := in.DeepCopy(); c != nil {
+		return c
+	}
+	return nil
+}
+
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *PluginSpec) DeepCopyInto(out *PluginSpec) {
+	*out = *in
+	if in.Parameters != nil {
+		in, out := &in.Parameters, &out.Parameters
+		*out = make(json.RawMessage, len(*in))
+		copy(*out, *in)
+	}
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PluginSpec.
+func (in *PluginSpec) DeepCopy() *PluginSpec { + if in == nil { + return nil + } + out := new(PluginSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SchedulingPlugin) DeepCopyInto(out *SchedulingPlugin) { + *out = *in + if in.Weight != nil { + in, out := &in.Weight, &out.Weight + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingPlugin. +func (in *SchedulingPlugin) DeepCopy() *SchedulingPlugin { + if in == nil { + return nil + } + out := new(SchedulingPlugin) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SchedulingProfile) DeepCopyInto(out *SchedulingProfile) { + *out = *in + if in.Plugins != nil { + in, out := &in.Plugins, &out.Plugins + *out = make([]SchedulingPlugin, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SchedulingProfile. +func (in *SchedulingProfile) DeepCopy() *SchedulingProfile { + if in == nil { + return nil + } + out := new(SchedulingProfile) + in.DeepCopyInto(out) + return out +} diff --git a/api/config/v1alpha1/zz_generated.defaults.go b/api/config/v1alpha1/zz_generated.defaults.go new file mode 100644 index 000000000..1cf0c4275 --- /dev/null +++ b/api/config/v1alpha1/zz_generated.defaults.go @@ -0,0 +1,38 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by defaulter-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// RegisterDefaults adds defaulters functions to the given scheme. +// Public to allow building arbitrary schemes. +// All generated defaulters are covering - they call all nested defaulters. +func RegisterDefaults(scheme *runtime.Scheme) error { + scheme.AddTypeDefaultingFunc(&EndpointPickerConfig{}, func(obj interface{}) { SetObjectDefaults_EndpointPickerConfig(obj.(*EndpointPickerConfig)) }) + return nil +} + +func SetObjectDefaults_EndpointPickerConfig(in *EndpointPickerConfig) { + SetDefaults_EndpointPickerConfig(in) +} diff --git a/api/config/v1alpha1/zz_generated.register.go b/api/config/v1alpha1/zz_generated.register.go new file mode 100644 index 000000000..3c48d0ce2 --- /dev/null +++ b/api/config/v1alpha1/zz_generated.register.go @@ -0,0 +1,69 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by register-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + schema "k8s.io/apimachinery/pkg/runtime/schema" +) + +// GroupName specifies the group name used to register the objects. +const GroupName = "inference.networking.x-k8s.io" + +// GroupVersion specifies the group and the version used to register the objects. +var GroupVersion = v1.GroupVersion{Group: GroupName, Version: "v1alpha1"} + +// SchemeGroupVersion is group version used to register these objects +// Deprecated: use GroupVersion instead. +var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: "v1alpha1"} + +// Resource takes an unqualified resource and returns a Group qualified GroupResource +func Resource(resource string) schema.GroupResource { + return SchemeGroupVersion.WithResource(resource).GroupResource() +} + +var ( + // localSchemeBuilder and AddToScheme will stay in k8s.io/kubernetes. + SchemeBuilder runtime.SchemeBuilder + localSchemeBuilder = &SchemeBuilder + // Deprecated: use Install instead + AddToScheme = localSchemeBuilder.AddToScheme + Install = localSchemeBuilder.AddToScheme +) + +func init() { + // We only register manually written functions here. The registration of the + // generated functions takes place in the generated files. The separation + // makes the code compile even when the generated files are missing. + localSchemeBuilder.Register(addKnownTypes) +} + +// Adds the list of known types to Scheme. +func addKnownTypes(scheme *runtime.Scheme) error { + scheme.AddKnownTypes(SchemeGroupVersion, + &EndpointPickerConfig{}, + ) + // AddToGroupVersion allows the serialization of client types like ListOptions. + v1.AddToGroupVersion(scheme, SchemeGroupVersion) + return nil +} diff --git a/api/v1alpha2/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go index 052683d88..28d2b5861 100644 --- a/api/v1alpha2/inferencemodel_types.go +++ b/api/v1alpha2/inferencemodel_types.go @@ -67,7 +67,7 @@ type InferenceModelSpec struct { // ModelNames must be unique for a referencing InferencePool // (names can be reused for a different pool in the same cluster). // The modelName with the oldest creation timestamp is retained, and the incoming - // InferenceModel is sets the Ready status to false with a corresponding reason. + // InferenceModel's Ready status is set to false with a corresponding reason. // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. // Names can be reserved without an underlying model configured in the pool. // This can be done by specifying a target model and setting the weight to zero, @@ -126,7 +126,7 @@ type PoolObjectReference struct { } // Criticality defines how important it is to serve the model compared to other models. -// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default. 
+// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default. // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. // +kubebuilder:validation:Enum=Critical;Standard;Sheddable type Criticality string diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index 7018ba21f..214e36dbc 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -31,7 +31,11 @@ type InferencePool struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec InferencePoolSpec `json:"spec,omitempty"` + Spec InferencePoolSpec `json:"spec,omitempty"` + + // Status defines the observed state of InferencePool. + // + // +kubebuilder:default={parent: {{parentRef: {kind: "Status", name: "default"}, conditions: {{type: "Accepted", status: "Unknown", reason: "Pending", message: "Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}}}}} Status InferencePoolStatus `json:"status,omitempty"` } @@ -80,14 +84,20 @@ type EndpointPickerConfig struct { // Extension specifies how to configure an extension that runs the endpoint picker. type Extension struct { - // Reference is a reference to a service extension. + // Reference is a reference to a service extension. When ExtensionReference is invalid, + // a 5XX status code MUST be returned for the request that would have otherwise been routed + // to the invalid backend. ExtensionReference `json:",inline"` // ExtensionConnection configures the connection between the gateway and the extension. ExtensionConnection `json:",inline"` } -// ExtensionReference is a reference to the extension deployment. +// ExtensionReference is a reference to the extension. +// +// If a reference is invalid, the implementation MUST update the `ResolvedRefs` +// Condition on the InferencePool's status to `status: False`. A 5XX status code MUST be returned +// for the request that would have otherwise been routed to the invalid backend. type ExtensionReference struct { // Group is the group of the referent. // The default value is "", representing the Core API group. @@ -146,14 +156,19 @@ const ( FailClose ExtensionFailureMode = "FailClose" ) -// InferencePoolStatus defines the observed state of InferencePool +// InferencePoolStatus defines the observed state of InferencePool. type InferencePoolStatus struct { // Parents is a list of parent resources (usually Gateways) that are - // associated with the route, and the status of the InferencePool with respect to + // associated with the InferencePool, and the status of the InferencePool with respect to // each parent. // - // A maximum of 32 Gateways will be represented in this list. An empty list - // means the route has not been attached to any Gateway. + // A maximum of 32 Gateways will be represented in this list. When the list contains + // `kind: Status, name: default`, it indicates that the InferencePool is not + // associated with any Gateway and a controller must perform the following: + // + // - Remove the parent when setting the "Accepted" condition. + // - Add the parent when the controller will no longer manage the InferencePool + // and no other parents exist. 
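+	//
+	// For example, a newly created InferencePool that no Gateway references yet
+	// would report (an illustrative rendering of the kubebuilder default above):
+	//
+	//   parent:
+	//   - parentRef:
+	//       kind: Status
+	//       name: default
+	//     conditions:
+	//     - type: Accepted
+	//       status: "Unknown"
+	//       reason: Pending
+	//       message: Waiting for controller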
// // +kubebuilder:validation:MaxItems=32 Parents []PoolStatus `json:"parent,omitempty"` @@ -186,7 +201,7 @@ type InferencePoolConditionType string type InferencePoolReason string const ( - // This condition indicates whether the route has been accepted or rejected + // This condition indicates whether the InferencePool has been accepted or rejected // by a Gateway, and why. // // Possible reasons for this condition to be True are: @@ -196,6 +211,7 @@ const ( // Possible reasons for this condition to be False are: // // * "NotSupportedByGateway" + // * "HTTPRouteNotAccepted" // // Possible reasons for this condition to be Unknown are: // @@ -205,7 +221,7 @@ const ( // prefer to use the reasons listed above to improve interoperability. InferencePoolConditionAccepted InferencePoolConditionType = "Accepted" - // This reason is used with the "Accepted" condition when the Route has been + // This reason is used with the "Accepted" condition when the InferencePool has been // accepted by the Gateway. InferencePoolReasonAccepted InferencePoolReason = "Accepted" @@ -214,8 +230,13 @@ const ( // InferencePool as a backend. InferencePoolReasonNotSupportedByGateway InferencePoolReason = "NotSupportedByGateway" + // This reason is used with the "Accepted" condition when the InferencePool is + // referenced by an HTTPRoute that has been rejected by the Gateway. The user + // should inspect the status of the referring HTTPRoute for the specific reason. + InferencePoolReasonHTTPRouteNotAccepted InferencePoolReason = "HTTPRouteNotAccepted" + // This reason is used with the "Accepted" when a controller has not yet - // reconciled the route. + // reconciled the InferencePool. InferencePoolReasonPending InferencePoolReason = "Pending" ) @@ -223,13 +244,13 @@ const ( // This condition indicates whether the controller was able to resolve all // the object references for the InferencePool. // - // Possible reasons for this condition to be true are: + // Possible reasons for this condition to be True are: // // * "ResolvedRefs" // // Possible reasons for this condition to be False are: // - // * "InvalidExtnesionRef" + // * "InvalidExtensionRef" // // Controllers MAY raise this condition with other reasons, but should // prefer to use the reasons listed above to improve interoperability. diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index 3070cdcb6..218df1640 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -1,7 +1,7 @@ //go:build !ignore_autogenerated /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/api/v1alpha2/zz_generated.register.go b/api/v1alpha2/zz_generated.register.go index 07dbf92bd..75560c11f 100644 --- a/api/v1alpha2/zz_generated.register.go +++ b/api/v1alpha2/zz_generated.register.go @@ -2,7 +2,7 @@ // +build !ignore_autogenerated /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by register-gen. DO NOT EDIT. 
package v1alpha2 diff --git a/body-based-routing.Dockerfile b/bbr.Dockerfile similarity index 74% rename from body-based-routing.Dockerfile rename to bbr.Dockerfile index e0afcf207..8f21eeaaf 100644 --- a/body-based-routing.Dockerfile +++ b/bbr.Dockerfile @@ -15,16 +15,16 @@ COPY go.mod go.sum ./ RUN go mod download # Sources -COPY cmd ./cmd +COPY cmd/bbr ./cmd COPY pkg ./pkg COPY internal ./internal -WORKDIR /src/cmd/body-based-routing -RUN go build -o /body-based-routing +WORKDIR /src/cmd +RUN go build -o /bbr ## Multistage deploy FROM ${BASE_IMAGE} WORKDIR / -COPY --from=builder /body-based-routing /body-based-routing +COPY --from=builder /bbr /bbr -ENTRYPOINT ["/body-based-routing"] +ENTRYPOINT ["/bbr"] diff --git a/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go index 679cdba8f..39d4d8de6 100644 --- a/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go +++ b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/extension.go b/client-go/applyconfiguration/api/v1alpha2/extension.go index 731467b74..87c982e07 100644 --- a/client-go/applyconfiguration/api/v1alpha2/extension.go +++ b/client-go/applyconfiguration/api/v1alpha2/extension.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go index bd968ec6d..25d703efd 100644 --- a/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go +++ b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/extensionreference.go b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go index 4db2dae18..d9226a201 100644 --- a/client-go/applyconfiguration/api/v1alpha2/extensionreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go index 8c810170b..e901cd329 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go index f9b453a45..2fea0a354 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go index 4c9e10a95..bc5866751 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepool.go b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go index 15649a60b..8cbd4e522 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencepool.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go index ba0fe3c33..fd6683102 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go index daf3be204..831059e2f 100644 --- a/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go index 7227560ee..289458866 100644 --- a/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/poolstatus.go b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go index 9d7d72947..cfa651eca 100644 --- a/client-go/applyconfiguration/api/v1alpha2/poolstatus.go +++ b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. 
package v1alpha2 diff --git a/client-go/applyconfiguration/api/v1alpha2/targetmodel.go b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go index 1c9277fa0..643587d6f 100644 --- a/client-go/applyconfiguration/api/v1alpha2/targetmodel.go +++ b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/applyconfiguration/internal/internal.go b/client-go/applyconfiguration/internal/internal.go index e1bbb864a..1064e227e 100644 --- a/client-go/applyconfiguration/internal/internal.go +++ b/client-go/applyconfiguration/internal/internal.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package internal diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index cec3969aa..eafadbed4 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by applyconfiguration-gen. DO NOT EDIT. package applyconfiguration diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index 9ed7187b7..653aa0461 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package versioned diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index f2f421107..a78153b43 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -13,11 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package fake import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" @@ -49,9 +51,13 @@ func NewSimpleClientset(objects ...runtime.Object) *Clientset { cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} cs.AddReactor("*", "*", testing.ObjectReaction(o)) cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { + var opts metav1.ListOptions + if watchActcion, ok := action.(testing.WatchActionImpl); ok { + opts = watchActcion.ListOptions + } gvr := action.GetResource() ns := action.GetNamespace() - watch, err := o.Watch(gvr, ns) + watch, err := o.Watch(gvr, ns, opts) if err != nil { return false, nil, err } @@ -98,9 +104,13 @@ func NewClientset(objects ...runtime.Object) *Clientset { cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} cs.AddReactor("*", "*", testing.ObjectReaction(o)) cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { + var opts metav1.ListOptions + if watchActcion, ok := action.(testing.WatchActionImpl); ok { + opts = watchActcion.ListOptions + } gvr := action.GetResource() ns := action.GetNamespace() - watch, err := o.Watch(gvr, ns) + watch, err := o.Watch(gvr, ns, opts) if err != nil { return false, nil, err } diff --git a/client-go/clientset/versioned/fake/doc.go b/client-go/clientset/versioned/fake/doc.go index 0f3cdf288..9b99e7167 100644 --- a/client-go/clientset/versioned/fake/doc.go +++ b/client-go/clientset/versioned/fake/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated fake clientset. diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index 0966faea5..c9b917d62 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package fake diff --git a/client-go/clientset/versioned/scheme/doc.go b/client-go/clientset/versioned/scheme/doc.go index a3e95ed2c..7dc375616 100644 --- a/client-go/clientset/versioned/scheme/doc.go +++ b/client-go/clientset/versioned/scheme/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. // This package contains the scheme of the automatically generated clientset. diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 1e4975e5e..1566a730b 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package scheme diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go index 16c144535..80b3ecfd8 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package v1alpha2 @@ -49,9 +50,7 @@ func (c *InferenceV1alpha2Client) InferencePools(namespace string) InferencePool // where httpClient was generated with rest.HTTPClientFor(c). func NewForConfig(c *rest.Config) (*InferenceV1alpha2Client, error) { config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } + setConfigDefaults(&config) httpClient, err := rest.HTTPClientFor(&config) if err != nil { return nil, err @@ -63,9 +62,7 @@ func NewForConfig(c *rest.Config) (*InferenceV1alpha2Client, error) { // Note the http client provided takes precedence over the configured transport values. 
func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha2Client, error) { config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } + setConfigDefaults(&config) client, err := rest.RESTClientForConfigAndClient(&config, h) if err != nil { return nil, err @@ -88,7 +85,7 @@ func New(c rest.Interface) *InferenceV1alpha2Client { return &InferenceV1alpha2Client{c} } -func setConfigDefaults(config *rest.Config) error { +func setConfigDefaults(config *rest.Config) { gv := apiv1alpha2.SchemeGroupVersion config.GroupVersion = &gv config.APIPath = "/apis" @@ -97,8 +94,6 @@ func setConfigDefaults(config *rest.Config) error { if config.UserAgent == "" { config.UserAgent = rest.DefaultKubernetesUserAgent() } - - return nil } // RESTClient returns a RESTClient that is used to communicate diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go index 0240168ec..baaf2d985 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/doc.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated typed clients. diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go index 01839331f..16f443990 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. // Package fake has the automatically generated clients. diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go index 5bd7fd40f..14fcc20cf 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. 
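Since `setConfigDefaults` can no longer fail, the only remaining error paths in `NewForConfig` are HTTP and REST client construction. A minimal sketch of constructing the typed client, assuming a local kubeconfig and a `default` namespace purely for illustration:

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/clientcmd"

	inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2"
)

func main() {
	// Load a kubeconfig; in-cluster, rest.InClusterConfig() works the same way.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	// Config defaulting is now infallible; only client construction can fail.
	client, err := inferencev1alpha2.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}
	pools, err := client.InferencePools("default").List(context.Background(), metav1.ListOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Printf("found %d InferencePools\n", len(pools.Items))
}
```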
package fake diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go index 50f78c524..93c11b5e8 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package fake diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go index a7f6a185b..df8222c4f 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package fake diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go index 1b9be99f6..5295d9b49 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go index c5fb5c3de..5671668d8 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. 
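Because both fake constructors now forward `ListOptions` into the tracker's `Watch`, options such as `ResourceVersion` set by a test are no longer silently dropped. A rough sketch of a test exercising that path (test and namespace names are illustrative):

```go
package epp_test

import (
	"context"
	"testing"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/fake"
)

func TestWatchInferenceModels(t *testing.T) {
	cs := fake.NewClientset() // NewSimpleClientset works too, via the same reactor
	// The watch reactor now hands our ListOptions to the tracker.
	w, err := cs.InferenceV1alpha2().InferenceModels("default").
		Watch(context.Background(), metav1.ListOptions{ResourceVersion: "0"})
	if err != nil {
		t.Fatalf("watch failed: %v", err)
	}
	defer w.Stop()
}
```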
package v1alpha2 diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go index 6cbfb5462..ac0616c7c 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by client-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 572f5230c..d7890db51 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. package api diff --git a/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go index d21f9cdaa..0bf9ca43e 100644 --- a/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go +++ b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. 
package v1alpha2 @@ -61,13 +62,25 @@ func NewFilteredInferenceModelInformer(client versioned.Interface, namespace str if tweakListOptions != nil { tweakListOptions(&options) } - return client.InferenceV1alpha2().InferenceModels(namespace).List(context.TODO(), options) + return client.InferenceV1alpha2().InferenceModels(namespace).List(context.Background(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.InferenceV1alpha2().InferenceModels(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha2().InferenceModels(namespace).Watch(context.Background(), options) + }, + ListWithContextFunc: func(ctx context.Context, options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferenceModels(namespace).List(ctx, options) + }, + WatchFuncWithContext: func(ctx context.Context, options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferenceModels(namespace).Watch(ctx, options) }, }, &gatewayapiinferenceextensionapiv1alpha2.InferenceModel{}, diff --git a/client-go/informers/externalversions/api/v1alpha2/inferencepool.go b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go index 4d042db79..c52a0f349 100644 --- a/client-go/informers/externalversions/api/v1alpha2/inferencepool.go +++ b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. 
package v1alpha2 @@ -61,13 +62,25 @@ func NewFilteredInferencePoolInformer(client versioned.Interface, namespace stri if tweakListOptions != nil { tweakListOptions(&options) } - return client.InferenceV1alpha2().InferencePools(namespace).List(context.TODO(), options) + return client.InferenceV1alpha2().InferencePools(namespace).List(context.Background(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.InferenceV1alpha2().InferencePools(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha2().InferencePools(namespace).Watch(context.Background(), options) + }, + ListWithContextFunc: func(ctx context.Context, options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferencePools(namespace).List(ctx, options) + }, + WatchFuncWithContext: func(ctx context.Context, options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferencePools(namespace).Watch(ctx, options) }, }, &gatewayapiinferenceextensionapiv1alpha2.InferencePool{}, diff --git a/client-go/informers/externalversions/api/v1alpha2/interface.go b/client-go/informers/externalversions/api/v1alpha2/interface.go index 6db5619e1..e6a58a8df 100644 --- a/client-go/informers/externalversions/api/v1alpha2/interface.go +++ b/client-go/informers/externalversions/api/v1alpha2/interface.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/informers/externalversions/factory.go b/client-go/informers/externalversions/factory.go index 9b52e8144..7593e5174 100644 --- a/client-go/informers/externalversions/factory.go +++ b/client-go/informers/externalversions/factory.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. package externalversions diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index 143f9289c..b044039b2 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. 
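With `ListWithContextFunc` and `WatchFuncWithContext` populated alongside the legacy fields, cancellation of the caller's context can now reach the underlying list and watch calls. A small sketch of running the generated `InferencePool` informer; the resync period and event handler are placeholders:

```go
package main

import (
	"context"
	"time"

	"k8s.io/client-go/tools/cache"

	"sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned"
	informersv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha2"
)

func watchPools(ctx context.Context, cs versioned.Interface) {
	informer := informersv1alpha2.NewInferencePoolInformer(cs, "default", 30*time.Second, cache.Indexers{})
	_, _ = informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj any) { /* react to a new InferencePool */ },
	})
	// The context-aware funcs let the reflector propagate cancellation into
	// in-flight list/watch calls rather than only between iterations.
	informer.Run(ctx.Done())
}
```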
package externalversions diff --git a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index b11099a08..940f83e79 100644 --- a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by informer-gen. DO NOT EDIT. package internalinterfaces diff --git a/client-go/listers/api/v1alpha2/expansion_generated.go b/client-go/listers/api/v1alpha2/expansion_generated.go index 6abe0b372..e799c304f 100644 --- a/client-go/listers/api/v1alpha2/expansion_generated.go +++ b/client-go/listers/api/v1alpha2/expansion_generated.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by lister-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/listers/api/v1alpha2/inferencemodel.go b/client-go/listers/api/v1alpha2/inferencemodel.go index 22ca6a16e..075ba77ce 100644 --- a/client-go/listers/api/v1alpha2/inferencemodel.go +++ b/client-go/listers/api/v1alpha2/inferencemodel.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by lister-gen. DO NOT EDIT. package v1alpha2 diff --git a/client-go/listers/api/v1alpha2/inferencepool.go b/client-go/listers/api/v1alpha2/inferencepool.go index 488795605..0ddea52a1 100644 --- a/client-go/listers/api/v1alpha2/inferencepool.go +++ b/client-go/listers/api/v1alpha2/inferencepool.go @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + // Code generated by lister-gen. DO NOT EDIT. 
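The listers are only regenerated here, but for completeness this is roughly how they are consumed against an informer-fed cache (the indexer would normally come from `informer.GetIndexer()`):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/client-go/tools/cache"

	listersv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha2"
)

func listFromCache(indexer cache.Indexer) error {
	lister := listersv1alpha2.NewInferenceModelLister(indexer)
	// Reads come from the local cache, so no API-server round trip is made.
	models, err := lister.InferenceModels("default").List(labels.Everything())
	if err != nil {
		return err
	}
	fmt.Printf("cached InferenceModels: %d\n", len(models))
	return nil
}
```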
package v1alpha2 diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 6043d225a..366667489 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,6 +12,7 @@ steps: - GIT_TAG=$_GIT_TAG - EXTRA_TAG=$_PULL_BASE_REF - DOCKER_BUILDX_CMD=/buildx-entrypoint + - GIT_COMMIT_SHA=$_PULL_BASE_SHA - name: gcr.io/k8s-staging-test-infra/gcb-docker-gcloud:v20240718-5ef92b5c36 entrypoint: make args: @@ -43,5 +44,7 @@ substitutions: # _PULL_BASE_REF will contain the ref that was pushed to trigger this build - # a branch like 'main' or 'release-0.2', or a tag like 'v0.2'. _PULL_BASE_REF: 'main' + # _PULL_BASE_SHA will contain the Git SHA of the commit that was pushed to trigger this build. + _PULL_BASE_SHA: 'abcdef' options: substitution_option: ALLOW_LOOSE diff --git a/cmd/body-based-routing/health.go b/cmd/bbr/health.go similarity index 53% rename from cmd/body-based-routing/health.go rename to cmd/bbr/health.go index 7d1b5fd53..f42ca2b41 100644 --- a/cmd/body-based-routing/health.go +++ b/cmd/bbr/health.go @@ -19,6 +19,7 @@ package main import ( "context" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" @@ -31,10 +32,31 @@ type healthServer struct { } func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { + // TODO: we're accepting ANY service name for now as a temporary hack in alignment with + // upstream issues. See https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/788 + // if in.Service != extProcPb.ExternalProcessor_ServiceDesc.ServiceName { + // s.logger.V(logutil.DEFAULT).Info("gRPC health check requested unknown service", "available-services", []string{extProcPb.ExternalProcessor_ServiceDesc.ServiceName}, "requested-service", in.Service) + // return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVICE_UNKNOWN}, nil + // } + s.logger.V(logutil.VERBOSE).Info("gRPC health check serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil } +func (s *healthServer) List(ctx context.Context, _ *healthPb.HealthListRequest) (*healthPb.HealthListResponse, error) { + // currently only the ext_proc service is provided + serviceHealthResponse, err := s.Check(ctx, &healthPb.HealthCheckRequest{Service: extProcPb.ExternalProcessor_ServiceDesc.ServiceName}) + if err != nil { + return nil, err + } + + return &healthPb.HealthListResponse{ + Statuses: map[string]*healthPb.HealthCheckResponse{ + extProcPb.ExternalProcessor_ServiceDesc.ServiceName: serviceHealthResponse, + }, + }, nil +} + func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { return status.Error(codes.Unimplemented, "Watch is not implemented") } diff --git a/cmd/body-based-routing/main.go b/cmd/bbr/main.go similarity index 67% rename from cmd/body-based-routing/main.go rename to cmd/bbr/main.go index cfc584ce7..0dffa74d5 100644 --- a/cmd/body-based-routing/main.go +++ b/cmd/bbr/main.go @@ -18,26 +18,23 @@ package main import ( "flag" - "net" - "net/http" + "fmt" "os" - "strconv" "github.com/go-logr/logr" - "github.com/prometheus/client_golang/prometheus/promhttp" uberzap "go.uber.org/zap" "go.uber.org/zap/zapcore" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" - "k8s.io/client-go/rest" - "k8s.io/component-base/metrics/legacyregistry" ctrl 
"sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -85,7 +82,18 @@ func run() error { return err } - mgr, err := ctrl.NewManager(cfg, ctrl.Options{}) + metrics.Register() + + // Register metrics handler. + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: fmt.Sprintf(":%d", *metricsPort), + FilterProvider: filters.WithAuthenticationAndAuthorization, + } + mgr, err := ctrl.NewManager(cfg, ctrl.Options{Metrics: metricsServerOptions}) if err != nil { setupLog.Error(err, "Failed to create manager", "config", cfg) return err @@ -107,11 +115,6 @@ func run() error { return err } - // Register metrics handler. - if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { - return err - } - // Start the manager. This blocks until a signal is received. setupLog.Info("Manager starting") if err := mgr.Start(ctx); err != nil { @@ -152,58 +155,3 @@ func initLogging(opts *zap.Options) { logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) ctrl.SetLogger(logger) } - -const metricsEndpoint = "/metrics" - -// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. -func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { - metrics.Register() - - // Init HTTP server. 
- h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) - if err != nil { - return err - } - - mux := http.NewServeMux() - mux.Handle(metricsEndpoint, h) - - srv := &http.Server{ - Addr: net.JoinHostPort("", strconv.Itoa(port)), - Handler: mux, - } - - if err := mgr.Add(&manager.Server{ - Name: "metrics", - Server: srv, - }); err != nil { - setupLog.Error(err, "Failed to register metrics HTTP handler") - return err - } - return nil -} - -func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { - h := promhttp.HandlerFor( - legacyregistry.DefaultGatherer, - promhttp.HandlerOpts{}, - ) - httpClient, err := rest.HTTPClientFor(cfg) - if err != nil { - setupLog.Error(err, "Failed to create http client for metrics auth") - return nil, err - } - - filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) - if err != nil { - setupLog.Error(err, "Failed to create metrics filter for auth") - return nil, err - } - metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", metricsEndpoint) - metricsAuthHandler, err := filter(metricsLogger, h) - if err != nil { - setupLog.Error(err, "Failed to create metrics auth handler") - return nil, err - } - return metricsAuthHandler, nil -} diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 39baf18b1..b5e06177b 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -17,304 +17,20 @@ limitations under the License. package main import ( - "flag" - "fmt" - "net" - "net/http" "os" - "strconv" - "github.com/go-logr/logr" - "github.com/prometheus/client_golang/prometheus/promhttp" - uberzap "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "google.golang.org/grpc" - healthPb "google.golang.org/grpc/health/grpc_health_v1" - "k8s.io/client-go/rest" - "k8s.io/component-base/metrics/legacyregistry" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/metrics/filters" - "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - defaultMetricsEndpoint = "/metrics" -) - -var ( - grpcPort = flag.Int( - "grpcPort", - runserver.DefaultGrpcPort, - "The gRPC port used for communicating with Envoy proxy") - grpcHealthPort = flag.Int( - "grpcHealthPort", - 9003, - "The port used for gRPC liveness and readiness probes") - metricsPort = flag.Int( - "metricsPort", 9090, "The metrics port") - destinationEndpointHintKey = flag.String( - "destinationEndpointHintKey", - runserver.DefaultDestinationEndpointHintKey, - "Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.") - destinationEndpointHintMetadataNamespace = flag.String( - "DestinationEndpointHintMetadataNamespace", - runserver.DefaultDestinationEndpointHintMetadataNamespace, - "The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+ - "target endpoint. 
If not set, then an outer namespace struct should not be created.") - poolName = flag.String( - "poolName", - runserver.DefaultPoolName, - "Name of the InferencePool this Endpoint Picker is associated with.") - poolNamespace = flag.String( - "poolNamespace", - runserver.DefaultPoolNamespace, - "Namespace of the InferencePool this Endpoint Picker is associated with.") - refreshMetricsInterval = flag.Duration( - "refreshMetricsInterval", - runserver.DefaultRefreshMetricsInterval, - "interval to refresh metrics") - refreshPrometheusMetricsInterval = flag.Duration( - "refreshPrometheusMetricsInterval", - runserver.DefaultRefreshPrometheusMetricsInterval, - "interval to flush prometheus metrics") - logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") - secureServing = flag.Bool( - "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") - certPath = flag.String( - "certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+ - "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ - "then a self-signed certificate is used.") - // metric flags - totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric", - "vllm:num_requests_waiting", - "Prometheus metric for the number of queued requests.") - kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", - "vllm:gpu_cache_usage_perc", - "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") - // LoRA metrics - loraInfoMetric = flag.String("loraInfoMetric", - "vllm:lora_requests_info", - "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") - setupLog = ctrl.Log.WithName("setup") + "sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner" ) func main() { - if err := run(); err != nil { - os.Exit(1) - } -} - -func run() error { - opts := zap.Options{ - Development: true, - } - opts.BindFlags(flag.CommandLine) - flag.Parse() - initLogging(&opts) - - useStreamingServer, err := strconv.ParseBool(os.Getenv("USE_STREAMING")) - if err != nil { - setupLog.Error(err, "Failed to parse env var USE_STREAMING, defaulting to false") - } - - // Validate flags - if err := validateFlags(); err != nil { - setupLog.Error(err, "Failed to validate flags") - return err - } - - // Print all flag values - flags := make(map[string]any) - flag.VisitAll(func(f *flag.Flag) { - flags[f.Name] = f.Value - }) - setupLog.Info("Flags processed", "flags", flags) - - // Init runtime. - cfg, err := ctrl.GetConfig() - if err != nil { - setupLog.Error(err, "Failed to get rest config") - return err - } - - mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg) - if err != nil { - setupLog.Error(err, "Failed to create controller manager") - return err - } - - ctx := ctrl.SetupSignalHandler() - - // Set up mapper for metric scraping. - mapping, err := backendmetrics.NewMetricMapping( - *totalQueuedRequestsMetric, - *kvCacheUsagePercentageMetric, - *loraInfoMetric, - ) - if err != nil { - setupLog.Error(err, "Failed to create metric mapping from flags.") - return err - } - verifyMetricMapping(*mapping, setupLog) - - pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) - // Setup runner. 
- datastore := datastore.NewDatastore(ctx, pmf) - - serverRunner := &runserver.ExtProcServerRunner{ - GrpcPort: *grpcPort, - DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, - DestinationEndpointHintKey: *destinationEndpointHintKey, - PoolName: *poolName, - PoolNamespace: *poolNamespace, - Datastore: datastore, - SecureServing: *secureServing, - CertPath: *certPath, - UseStreaming: useStreamingServer, - RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, - } - if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { - setupLog.Error(err, "Failed to setup ext-proc controllers") - return err - } - - // Register health server. - if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil { - return err - } - - // Register ext-proc server. - if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { - setupLog.Error(err, "Failed to register ext-proc gRPC server") - return err - } - - // Register metrics handler. - if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { - return err - } - - // Start the manager. This blocks until a signal is received. - setupLog.Info("Controller manager starting") - if err := mgr.Start(ctx); err != nil { - setupLog.Error(err, "Error starting controller manager") - return err - } - setupLog.Info("Controller manager terminated") - return nil -} - -func initLogging(opts *zap.Options) { - // Unless -zap-log-level is explicitly set, use -v - useV := true - flag.Visit(func(f *flag.Flag) { - if f.Name == "zap-log-level" { - useV = false - } - }) - if useV { - // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level - lvl := -1 * (*logVerbosity) - opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) - } - - logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) - ctrl.SetLogger(logger) -} - -// registerHealthServer adds the Health gRPC server as a Runnable to the given manager. -func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error { - srv := grpc.NewServer() - healthPb.RegisterHealthServer(srv, &healthServer{ - logger: logger, - datastore: ds, - }) - if err := mgr.Add( - runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { - setupLog.Error(err, "Failed to register health server") - return err - } - return nil -} - -// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. -func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { - metrics.Register() - - // Init HTTP server. 
- h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) - if err != nil { - return err - } - - mux := http.NewServeMux() - mux.Handle(defaultMetricsEndpoint, h) - - srv := &http.Server{ - Addr: net.JoinHostPort("", strconv.Itoa(port)), - Handler: mux, - } - - if err := mgr.Add(&manager.Server{ - Name: "metrics", - Server: srv, - }); err != nil { - setupLog.Error(err, "Failed to register metrics HTTP handler") - return err - } - return nil -} - -func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { - h := promhttp.HandlerFor( - legacyregistry.DefaultGatherer, - promhttp.HandlerOpts{}, - ) - httpClient, err := rest.HTTPClientFor(cfg) - if err != nil { - setupLog.Error(err, "Failed to create http client for metrics auth") - return nil, err - } - - filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) - if err != nil { - setupLog.Error(err, "Failed to create metrics filter for auth") - return nil, err - } - metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", defaultMetricsEndpoint) - metricsAuthHandler, err := filter(metricsLogger, h) - if err != nil { - setupLog.Error(err, "Failed to create metrics auth handler") - return nil, err - } - return metricsAuthHandler, nil -} + // Register all known plugin factories + runner.RegisterAllPlugins() + // For adding out-of-tree plugins to the plugins registry, use the following: + // plugins.Register(my-out-of-tree-plugin-name, my-out-of-tree-plugin-factory-function) -func validateFlags() error { - if *poolName == "" { - return fmt.Errorf("required %q flag not set", "poolName") - } - - return nil -} - -func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logger) { - if mapping.TotalQueuedRequests == nil { - logger.Info("Not scraping metric: TotalQueuedRequests") - } - if mapping.KVCacheUtilization == nil { - logger.Info("Not scraping metric: KVCacheUtilization") - } - if mapping.LoraRequestInfo == nil { - logger.Info("Not scraping metric: LoraRequestInfo") + if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil { + os.Exit(1) } - } diff --git a/cmd/epp/health.go b/cmd/epp/runner/health.go similarity index 58% rename from cmd/epp/health.go rename to cmd/epp/runner/health.go index 936970021..283541e95 100644 --- a/cmd/epp/health.go +++ b/cmd/epp/runner/health.go @@ -14,11 +14,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package main +package runner import ( "context" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" @@ -33,6 +34,13 @@ type healthServer struct { } func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { + // TODO: we're accepting ANY service name for now as a temporary hack in alignment with + // upstream issues. 
See https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/788 + // if in.Service != extProcPb.ExternalProcessor_ServiceDesc.ServiceName { + // s.logger.V(logutil.DEFAULT).Info("gRPC health check requested unknown service", "available-services", []string{extProcPb.ExternalProcessor_ServiceDesc.ServiceName}, "requested-service", in.Service) + // return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVICE_UNKNOWN}, nil + // } + if !s.datastore.PoolHasSynced() { s.logger.V(logutil.DEFAULT).Info("gRPC health check not serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil @@ -41,6 +49,20 @@ func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckReques return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil } +func (s *healthServer) List(ctx context.Context, _ *healthPb.HealthListRequest) (*healthPb.HealthListResponse, error) { + // currently only the ext_proc service is provided + serviceHealthResponse, err := s.Check(ctx, &healthPb.HealthCheckRequest{Service: extProcPb.ExternalProcessor_ServiceDesc.ServiceName}) + if err != nil { + return nil, err + } + + return &healthPb.HealthListResponse{ + Statuses: map[string]*healthPb.HealthCheckResponse{ + extProcPb.ExternalProcessor_ServiceDesc.ServiceName: serviceHealthResponse, + }, + }, nil +} + func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { return status.Error(codes.Unimplemented, "Watch is not implemented") } diff --git a/cmd/epp/runner/register.go b/cmd/epp/runner/register.go new file mode 100644 index 000000000..3a741d5d0 --- /dev/null +++ b/cmd/epp/runner/register.go @@ -0,0 +1,97 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
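The new `List` RPC surfaces the health of every registered service in a single call, which for this server is just ext_proc. A hedged sketch of querying it with the stock gRPC health client, assuming the default plaintext health port of 9003:

```go
package main

import (
	"context"
	"fmt"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	healthPb "google.golang.org/grpc/health/grpc_health_v1"
)

func main() {
	// The health port serves plain gRPC even when ext-proc uses TLS.
	conn, err := grpc.NewClient("localhost:9003", grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	resp, err := healthPb.NewHealthClient(conn).List(context.Background(), &healthPb.HealthListRequest{})
	if err != nil {
		panic(err)
	}
	for svc, st := range resp.GetStatuses() {
		fmt.Printf("%s: %s\n", svc, st.GetStatus())
	}
}
```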
+*/
+
+package runner
+
+import (
+	"context"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
+)
+
+// RegisterAllPlugins registers the factory functions of all known plugins
+func RegisterAllPlugins() {
+	plugins.Register(filter.DecisionTreeFilterType, filter.DecisionTreeFilterFactory)
+	plugins.Register(filter.LeastKVCacheFilterType, filter.LeastKVCacheFilterFactory)
+	plugins.Register(filter.LeastQueueFilterType, filter.LeastQueueFilterFactory)
+	plugins.Register(filter.LoraAffinityFilterType, filter.LoraAffinityFilterFactory)
+	plugins.Register(filter.LowQueueFilterType, filter.LowQueueFilterFactory)
+	plugins.Register(prefix.PrefixCachePluginType, prefix.PrefixCachePluginFactory)
+	plugins.Register(picker.MaxScorePickerType, picker.MaxScorePickerFactory)
+	plugins.Register(picker.RandomPickerType, picker.RandomPickerFactory)
+	plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory)
+	plugins.Register(scorer.KvCacheScorerType, scorer.KvCacheScorerFactory)
+	plugins.Register(scorer.QueueScorerType, scorer.QueueScorerFactory)
+}
+
+// eppHandle is an implementation of the interface plugins.Handle
+type eppHandle struct {
+	ctx     context.Context
+	plugins plugins.HandlePlugins
+}
+
+// Context returns a context the plugins can use, if they need one
+func (h *eppHandle) Context() context.Context {
+	return h.ctx
+}
+
+// Plugins returns the sub-handle for working with instantiated plugins
+func (h *eppHandle) Plugins() plugins.HandlePlugins {
+	return h.plugins
+}
+
+// eppHandlePlugins implements the set of APIs to work with instantiated plugins
+type eppHandlePlugins struct {
+	thePlugins map[string]plugins.Plugin
+}
+
+// Plugin returns the named plugin instance
+func (h *eppHandlePlugins) Plugin(name string) plugins.Plugin {
+	return h.thePlugins[name]
+}
+
+// AddPlugin adds a plugin to the set of known plugin instances
+func (h *eppHandlePlugins) AddPlugin(name string, plugin plugins.Plugin) {
+	h.thePlugins[name] = plugin
+}
+
+// GetAllPlugins returns all of the known plugins
+func (h *eppHandlePlugins) GetAllPlugins() []plugins.Plugin {
+	result := make([]plugins.Plugin, 0)
+	for _, plugin := range h.thePlugins {
+		result = append(result, plugin)
+	}
+	return result
+}
+
+// GetAllPluginsWithNames returns all of the known plugins with their names
+func (h *eppHandlePlugins) GetAllPluginsWithNames() map[string]plugins.Plugin {
+	return h.thePlugins
+}
+
+func newEppHandle(ctx context.Context) *eppHandle {
+	return &eppHandle{
+		ctx: ctx,
+		plugins: &eppHandlePlugins{
+			thePlugins: map[string]plugins.Plugin{},
+		},
+	}
+}
diff --git a/cmd/epp/runner/runner.go b/cmd/epp/runner/runner.go
new file mode 100644
index 000000000..81ec37347
--- /dev/null
+++ b/cmd/epp/runner/runner.go
@@ -0,0 +1,436 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package runner
+
+import (
+	"context"
+	"flag"
+	"fmt"
+	"net/http/pprof"
+
+	"github.com/go-logr/logr"
+	"github.com/prometheus/client_golang/prometheus"
+	uberzap "go.uber.org/zap"
+	"go.uber.org/zap/zapcore"
+	"google.golang.org/grpc"
+	healthPb "google.golang.org/grpc/health/grpc_health_v1"
+	"k8s.io/apimachinery/pkg/types"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/metrics/filters"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+
+	conformance_epp "sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp"
+	"sigs.k8s.io/gateway-api-inference-extension/internal/runnable"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config/loader"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics/collectors"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
+	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
+	envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+var (
+	grpcPort = flag.Int(
+		"grpcPort",
+		runserver.DefaultGrpcPort,
+		"The gRPC port used for communicating with Envoy proxy")
+	grpcHealthPort = flag.Int(
+		"grpcHealthPort",
+		9003,
+		"The port used for gRPC liveness and readiness probes")
+	metricsPort = flag.Int(
+		"metricsPort", 9090, "The metrics port")
+	destinationEndpointHintKey = flag.String(
+		"destinationEndpointHintKey",
+		runserver.DefaultDestinationEndpointHintKey,
+		"Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.")
+	destinationEndpointHintMetadataNamespace = flag.String(
+		"DestinationEndpointHintMetadataNamespace",
+		runserver.DefaultDestinationEndpointHintMetadataNamespace,
+		"The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the "+
+			"target endpoint. 
If not set, then an outer namespace struct should not be created.") + poolName = flag.String( + "poolName", + runserver.DefaultPoolName, + "Name of the InferencePool this Endpoint Picker is associated with.") + poolNamespace = flag.String( + "poolNamespace", + runserver.DefaultPoolNamespace, + "Namespace of the InferencePool this Endpoint Picker is associated with.") + refreshMetricsInterval = flag.Duration( + "refreshMetricsInterval", + runserver.DefaultRefreshMetricsInterval, + "interval to refresh metrics") + refreshPrometheusMetricsInterval = flag.Duration( + "refreshPrometheusMetricsInterval", + runserver.DefaultRefreshPrometheusMetricsInterval, + "interval to flush prometheus metrics") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + secureServing = flag.Bool( + "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") + healthChecking = flag.Bool("healthChecking", runserver.DefaultHealthChecking, "Enables health checking") + certPath = flag.String( + "certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+ + "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ + "then a self-signed certificate is used.") + // metric flags + totalQueuedRequestsMetric = flag.String("totalQueuedRequestsMetric", + "vllm:num_requests_waiting", + "Prometheus metric for the number of queued requests.") + kvCacheUsagePercentageMetric = flag.String("kvCacheUsagePercentageMetric", + "vllm:gpu_cache_usage_perc", + "Prometheus metric for the fraction of KV-cache blocks currently in use (from 0 to 1).") + // LoRA metrics + loraInfoMetric = flag.String("loraInfoMetric", + "vllm:lora_requests_info", + "Prometheus metric for the LoRA info metrics (must be in vLLM label format).") + // configuration flags + configFile = flag.String("configFile", "", "The path to the configuration file") + configText = flag.String("configText", "", "The configuration specified as text, in lieu of a file") + + setupLog = ctrl.Log.WithName("setup") + + // Environment variables + schedulerV2 = envutil.GetEnvBool("EXPERIMENTAL_USE_SCHEDULER_V2", false, setupLog) + prefixCacheScheduling = envutil.GetEnvBool("ENABLE_PREFIX_CACHE_SCHEDULING", false, setupLog) + reqHeaderBasedSchedulerForTesting = envutil.GetEnvBool("ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING", false, setupLog) +) + +// NewRunner initializes a new EPP Runner and returns its pointer. 
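The flags above all feed the builder-style entry point defined next. For orientation, this is roughly how an out-of-tree binary would consume it, mirroring the new `cmd/epp/main.go`; the out-of-tree factory mentioned in its comment is hypothetical:

```go
package main

import (
	"os"

	ctrl "sigs.k8s.io/controller-runtime"

	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
)

func main() {
	// Register the in-tree plugin factories before any configuration is parsed.
	runner.RegisterAllPlugins()
	// An out-of-tree build would also call
	// plugins.Register("my-plugin", myPluginFactory) here (hypothetical names).

	if err := runner.NewRunner().Run(ctrl.SetupSignalHandler()); err != nil {
		os.Exit(1)
	}
}
```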
+func NewRunner() *Runner { + return &Runner{ + requestControlConfig: requestcontrol.NewConfig(), // default requestcontrol config has empty plugin list + } +} + +// Runner is used to run epp with its plugins +type Runner struct { + requestControlConfig *requestcontrol.Config + schedulerConfig *scheduling.SchedulerConfig +} + +func (r *Runner) WithRequestControlConfig(requestControlConfig *requestcontrol.Config) *Runner { + r.requestControlConfig = requestControlConfig + return r +} + +func (r *Runner) WithSchedulerConfig(schedulerConfig *scheduling.SchedulerConfig) *Runner { + r.schedulerConfig = schedulerConfig + return r +} + +func (r *Runner) Run(ctx context.Context) error { + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + initLogging(&opts) + + // Validate flags + if err := validateFlags(); err != nil { + setupLog.Error(err, "Failed to validate flags") + return err + } + + // Print all flag values + flags := make(map[string]any) + flag.VisitAll(func(f *flag.Flag) { + flags[f.Name] = f.Value + }) + setupLog.Info("Flags processed", "flags", flags) + + // --- Load Configurations from Environment Variables --- + sdConfig := saturationdetector.LoadConfigFromEnv() + + // --- Get Kubernetes Config --- + cfg, err := ctrl.GetConfig() + if err != nil { + setupLog.Error(err, "Failed to get Kubernetes rest config") + return err + } + + // --- Setup Datastore --- + mapping, err := backendmetrics.NewMetricMapping( + *totalQueuedRequestsMetric, + *kvCacheUsagePercentageMetric, + *loraInfoMetric, + ) + if err != nil { + setupLog.Error(err, "Failed to create metric mapping from flags.") + return err + } + verifyMetricMapping(*mapping, setupLog) + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) + + datastore := datastore.NewDatastore(ctx, pmf) + + // --- Setup Metrics Server --- + customCollectors := []prometheus.Collector{collectors.NewInferencePoolMetricsCollector(datastore)} + metrics.Register(customCollectors...) + metrics.RecordInferenceExtensionInfo() + // Register metrics handler. + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. 
+	// More info:
+	// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server
+	// - https://book.kubebuilder.io/reference/metrics.html
+	metricsServerOptions := metricsserver.Options{
+		BindAddress:    fmt.Sprintf(":%d", *metricsPort),
+		FilterProvider: filters.WithAuthenticationAndAuthorization,
+	}
+
+	poolNamespacedName := types.NamespacedName{
+		Name:      *poolName,
+		Namespace: *poolNamespace,
+	}
+	mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg, metricsServerOptions)
+	if err != nil {
+		setupLog.Error(err, "Failed to create controller manager")
+		return err
+	}
+	err = setupPprofHandlers(mgr)
+	if err != nil {
+		setupLog.Error(err, "Failed to setup pprof handlers")
+		return err
+	}
+
+	err = r.parseConfiguration(ctx)
+	if err != nil {
+		setupLog.Error(err, "Failed to parse the configuration")
+		return err
+	}
+
+	// --- Initialize Core EPP Components ---
+	scheduler, err := r.initializeScheduler()
+	if err != nil {
+		setupLog.Error(err, "Failed to create scheduler")
+		return err
+	}
+
+	saturationDetector := saturationdetector.NewDetector(sdConfig, datastore, ctrl.Log)
+
+	director := requestcontrol.NewDirectorWithConfig(datastore, scheduler, saturationDetector, r.requestControlConfig)
+
+	// --- Setup ExtProc Server Runner ---
+	serverRunner := &runserver.ExtProcServerRunner{
+		GrpcPort:                                 *grpcPort,
+		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
+		DestinationEndpointHintKey:               *destinationEndpointHintKey,
+		PoolNamespacedName:                       poolNamespacedName,
+		Datastore:                                datastore,
+		SecureServing:                            *secureServing,
+		HealthChecking:                           *healthChecking,
+		CertPath:                                 *certPath,
+		RefreshPrometheusMetricsInterval:         *refreshPrometheusMetricsInterval,
+		Director:                                 director,
+		SaturationDetector:                       saturationDetector,
+	}
+	if err := serverRunner.SetupWithManager(ctx, mgr); err != nil {
+		setupLog.Error(err, "Failed to setup EPP controllers")
+		return err
+	}
+
+	// --- Add Runnables to Manager ---
+	// Register health server.
+	if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil {
+		return err
+	}
+
+	// Register ext-proc server.
+	if err := registerExtProcServer(mgr, serverRunner, ctrl.Log.WithName("ext-proc")); err != nil {
+		return err
+	}
+
+	// --- Start Manager ---
+	// This blocks until a signal is received.
+	setupLog.Info("Controller manager starting")
+	if err := mgr.Start(ctx); err != nil {
+		setupLog.Error(err, "Error starting controller manager")
+		return err
+	}
+	setupLog.Info("Controller manager terminated")
+	return nil
+}
+
+func (r *Runner) initializeScheduler() (*scheduling.Scheduler, error) {
+	if r.schedulerConfig != nil {
+		return scheduling.NewSchedulerWithConfig(r.schedulerConfig), nil
+	}
+
+	// otherwise, no scheduler config was provided from the outside; use the existing configuration
+	scheduler := scheduling.NewScheduler()
+	if schedulerV2 {
+		queueScorerWeight := envutil.GetEnvInt("QUEUE_SCORE_WEIGHT", scorer.DefaultQueueScorerWeight, setupLog)
+		kvCacheScorerWeight := envutil.GetEnvInt("KV_CACHE_SCORE_WEIGHT", scorer.DefaultKVCacheScorerWeight, setupLog)
+
+		schedulerProfile := framework.NewSchedulerProfile().
+			WithScorers(framework.NewWeightedScorer(scorer.NewQueueScorer(), queueScorerWeight),
+				framework.NewWeightedScorer(scorer.NewKVCacheScorer(), kvCacheScorerWeight)).
+			WithPicker(picker.NewMaxScorePicker())
+
+		if prefixCacheScheduling {
+			prefixScorerWeight := envutil.GetEnvInt("PREFIX_CACHE_SCORE_WEIGHT", prefix.DefaultScorerWeight, setupLog)
+			if err := schedulerProfile.AddPlugins(framework.NewWeightedScorer(prefix.New(loadPrefixCacheConfig()), prefixScorerWeight)); err != nil {
+				return nil, fmt.Errorf("failed to register scheduler plugins - %w", err)
+			}
+		}
+
+		schedulerConfig := scheduling.NewSchedulerConfig(profile.NewSingleProfileHandler(), map[string]*framework.SchedulerProfile{"schedulerv2": schedulerProfile})
+		scheduler = scheduling.NewSchedulerWithConfig(schedulerConfig)
+	}
+
+	if reqHeaderBasedSchedulerForTesting {
+		scheduler = conformance_epp.NewReqHeaderBasedScheduler()
+	}
+
+	return scheduler, nil
+}
+
+func (r *Runner) parseConfiguration(ctx context.Context) error {
+	if len(*configText) != 0 || len(*configFile) != 0 {
+		theConfig, err := loader.LoadConfig([]byte(*configText), *configFile)
+		if err != nil {
+			return fmt.Errorf("failed to load the configuration - %w", err)
+		}
+
+		epp := newEppHandle(ctx)
+
+		err = loader.LoadPluginReferences(theConfig.Plugins, epp)
+		if err != nil {
+			return fmt.Errorf("failed to instantiate the plugins - %w", err)
+		}
+
+		r.schedulerConfig, err = loader.LoadSchedulerConfig(theConfig.SchedulingProfiles, epp)
+		if err != nil {
+			return fmt.Errorf("failed to create Scheduler configuration - %w", err)
+		}
+
+		// Add requestControl plugins
+		r.requestControlConfig.AddPlugins(epp.Plugins().GetAllPlugins()...)
+	}
+	return nil
+}
+
+func initLogging(opts *zap.Options) {
+	// Unless -zap-log-level is explicitly set, use -v
+	useV := true
+	flag.Visit(func(f *flag.Flag) {
+		if f.Name == "zap-log-level" {
+			useV = false
+		}
+	})
+	if useV {
+		// See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level
+		lvl := -1 * (*logVerbosity)
+		opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl)))
+	}
+
+	logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller()))
+	ctrl.SetLogger(logger)
+}
+
+func loadPrefixCacheConfig() prefix.Config {
+	baseLogger := log.Log.WithName("env-config")
+
+	return prefix.Config{
+		HashBlockSize:          envutil.GetEnvInt("PREFIX_CACHE_HASH_BLOCK_SIZE", prefix.DefaultHashBlockSize, baseLogger),
+		MaxPrefixBlocksToMatch: envutil.GetEnvInt("PREFIX_CACHE_MAX_PREFIX_BLOCKS", prefix.DefaultMaxPrefixBlocks, baseLogger),
+		LRUCapacityPerServer:   envutil.GetEnvInt("PREFIX_CACHE_LRU_CAPACITY_PER_SERVER", prefix.DefaultLRUCapacityPerServer, baseLogger),
+	}
+}
+
+// registerExtProcServer adds the ExtProcServerRunner as a Runnable to the manager.
+func registerExtProcServer(mgr manager.Manager, runner *runserver.ExtProcServerRunner, logger logr.Logger) error {
+	if err := mgr.Add(runner.AsRunnable(logger)); err != nil {
+		setupLog.Error(err, "Failed to register ext-proc gRPC server runnable")
+		return err
+	}
+	setupLog.Info("ExtProc server runner added to manager.")
+	return nil
+}
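For embedders that bypass the config file, `WithSchedulerConfig` accepts a scheduler configuration assembled with the same framework calls used in `initializeScheduler` above. A sketch under the assumption of a single profile named "custom"; the 2:1 weighting is illustrative, not a recommendation:

```go
package main

import (
	"os"

	ctrl "sigs.k8s.io/controller-runtime"

	"sigs.k8s.io/gateway-api-inference-extension/cmd/epp/runner"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer"
)

func main() {
	runner.RegisterAllPlugins()

	// Score on queue depth and KV-cache utilization, pick the max-score pod.
	schedulerProfile := framework.NewSchedulerProfile().
		WithScorers(
			framework.NewWeightedScorer(scorer.NewQueueScorer(), 2),
			framework.NewWeightedScorer(scorer.NewKVCacheScorer(), 1)).
		WithPicker(picker.NewMaxScorePicker())

	schedulerConfig := scheduling.NewSchedulerConfig(
		profile.NewSingleProfileHandler(),
		map[string]*framework.SchedulerProfile{"custom": schedulerProfile})

	if err := runner.NewRunner().
		WithSchedulerConfig(schedulerConfig).
		Run(ctrl.SetupSignalHandler()); err != nil {
		os.Exit(1)
	}
}
```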
+
+// registerHealthServer adds the Health gRPC server as a Runnable to the given manager.
+func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error {
+	srv := grpc.NewServer()
+	healthPb.RegisterHealthServer(srv, &healthServer{
+		logger:    logger,
+		datastore: ds,
+	})
+	if err := mgr.Add(
+		runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil {
+		setupLog.Error(err, "Failed to register health server")
+		return err
+	}
+	return nil
+}
+
+func validateFlags() error {
+	if *poolName == "" {
+		return fmt.Errorf("required %q flag not set", "poolName")
+	}
+	if len(*configText) != 0 && len(*configFile) != 0 {
+		return fmt.Errorf("the %s and %s flags cannot both be set at the same time", "configText", "configFile")
+	}
+
+	return nil
+}
+
+func verifyMetricMapping(mapping backendmetrics.MetricMapping, logger logr.Logger) {
+	if mapping.TotalQueuedRequests == nil {
+		logger.Info("Not scraping metric: TotalQueuedRequests")
+	}
+	if mapping.KVCacheUtilization == nil {
+		logger.Info("Not scraping metric: KVCacheUtilization")
+	}
+	if mapping.LoraRequestInfo == nil {
+		logger.Info("Not scraping metric: LoraRequestInfo")
+	}
+}
+
+// setupPprofHandlers only implements the pre-defined profiles:
+// https://cs.opensource.google/go/go/+/refs/tags/go1.24.4:src/runtime/pprof/pprof.go;l=108
+func setupPprofHandlers(mgr ctrl.Manager) error {
+	var err error
+	profiles := []string{
+		"heap",
+		"goroutine",
+		"allocs",
+		"threadcreate",
+		"block",
+		"mutex",
+	}
+	for _, p := range profiles {
+		err = mgr.AddMetricsServerExtraHandler("/debug/pprof/"+p, pprof.Handler(p))
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/config/charts/body-based-routing/README.md b/config/charts/body-based-routing/README.md
index 062f2b5c6..d311b8c39 100644
--- a/config/charts/body-based-routing/README.md
+++ b/config/charts/body-based-routing/README.md
@@ -10,7 +10,7 @@ To install a body-based router named `body-based-router`, you can run the following command:
 ```txt
 $ helm install body-based-router ./config/charts/body-based-routing \
   --set provider.name=[gke|istio] \
-  --set inference-gateway.name=inference-gateway
+  --set inferenceGateway.name=inference-gateway
 ```
 
 Note that the provider name is needed to ensure provider-specific manifests are also applied. If no provider is specified, then only
@@ -19,7 +19,7 @@ the deployment and service are deployed.
 To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:
 
 ```txt
-$ helm install body-based-router oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-router \
+$ helm install body-based-router oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-routing \
   --version v0 --set provider.name=[gke|istio]
 ```
 
@@ -47,8 +47,8 @@ The following table lists the configurable parameters of the chart.
 | `bbr.image.tag` | Image tag. |
 | `bbr.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. |
 | `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `istio`, `gke`. Defaults to `none`. |
-| `inference-gateway.name` | The name of the Gateway. Defaults to `inference-gateway`. |
+| `inferenceGateway.name` | The name of the Gateway. Defaults to `inference-gateway`. |
 
 ## Notes
 
-This chart should only be deployed once per Gateway.
\ No newline at end of file
+This chart should only be deployed once per Gateway.
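+
+For example (illustrative values only), the renamed `inferenceGateway` key is set in a values file as follows:
+
+```yaml
+# my-values.yaml (hypothetical file name)
+inferenceGateway:
+  name: inference-gateway
+```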
diff --git a/config/charts/body-based-routing/templates/gke.yaml b/config/charts/body-based-routing/templates/gke.yaml
index 937bfa0bd..77b776a4c 100644
--- a/config/charts/body-based-routing/templates/gke.yaml
+++ b/config/charts/body-based-routing/templates/gke.yaml
@@ -9,7 +9,7 @@ spec:
   targetRefs:
   - group: "gateway.networking.k8s.io"
     kind: Gateway
-    name: {{ .Values.inference-gateway.name }}
+    name: {{ .Values.inferenceGateway.name }}
   extensionChains:
   - name: chain1
     extensions:
diff --git a/config/charts/body-based-routing/templates/istio.yaml b/config/charts/body-based-routing/templates/istio.yaml
index c4c1444fd..2c744e23d 100644
--- a/config/charts/body-based-routing/templates/istio.yaml
+++ b/config/charts/body-based-routing/templates/istio.yaml
@@ -25,13 +25,13 @@ spec:
           processing_mode:
             request_header_mode: "SEND"
             response_header_mode: "SKIP"
-            request_body_mode: "BUFFERED"
+            request_body_mode: "FULL_DUPLEX_STREAMED"
             response_body_mode: "NONE"
-            request_trailer_mode: "SKIP"
+            request_trailer_mode: "SEND"
             response_trailer_mode: "SKIP"
           grpc_service:
             envoy_grpc:
-              cluster_name: outbound|{{ .Values.bbr.port }}||{{ .Values.bbr.name }}.default.svc.cluster.local
+              cluster_name: outbound|{{ .Values.bbr.port }}||{{ .Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local
 ---
 apiVersion: networking.istio.io/v1
 kind: DestinationRule
@@ -39,7 +39,7 @@ metadata:
   name: {{ .Values.bbr.name }}
   namespace: {{ .Release.Namespace }}
 spec:
-  host: {{ .Values.bbr.name }}.default.svc.cluster.local
+  host: {{ .Values.bbr.name }}.{{ .Release.Namespace }}.svc.cluster.local
   trafficPolicy:
     tls:
       mode: SIMPLE
diff --git a/config/charts/body-based-routing/values.yaml b/config/charts/body-based-routing/values.yaml
index b77d75427..0b88dc432 100644
--- a/config/charts/body-based-routing/values.yaml
+++ b/config/charts/body-based-routing/values.yaml
@@ -12,5 +12,5 @@ bbr:
 provider:
   name: none
 
-inference-gateway:
+inferenceGateway:
   name: inference-gateway
diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md
index 681fc7836..bed4f33c2 100644
--- a/config/charts/inferencepool/README.md
+++ b/config/charts/inferencepool/README.md
@@ -2,7 +2,6 @@
 
 A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment.
 
-
 ## Install
 
 To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:
@@ -17,6 +16,48 @@ To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command:
 ```txt
 $ helm install vllm-llama3-8b-instruct \
   --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
+Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.
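+
+To preview the resources a given provider adds before installing, you can render the chart locally (an illustrative command; output will vary by provider):
+
+```txt
+$ helm template vllm-llama3-8b-instruct ./config/charts/inferencepool \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set provider.name=gke | grep '^kind:'
+```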
+ +### Install with Custom Environment Variables + +To set custom environment variables for the EndpointPicker deployment: + +```txt +$ helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=[none|gke] \ + --set inferenceExtension.env.FEATURE_FLAG_ENABLED=true \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 +``` + +Alternatively, you can define environment variables in a values file: + +```yaml +# values.yaml +inferenceExtension: + env: + FEATURE_FLAG_ENABLED: "true" +``` + +And apply it with: + +```txt +$ helm install vllm-llama3-8b-instruct ./config/charts/inferencepool -f values.yaml +``` + +### Install for Triton TensorRT-LLM + +Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g., + +```txt +$ helm install triton-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \ + --set inferencePool.modelServerType=triton-tensorrt-llm \ + --set provider.name=[none|gke] \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` @@ -34,8 +75,8 @@ The following table list the configurable parameters of the chart. | **Parameter Name** | **Description** | |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------| -| `inferencePool.name` | Name for the InferencePool, and endpoint picker deployment and service will be named as `{.Release.name}-epp`. | | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. | +| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. | | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. | | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. | | `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. | @@ -43,6 +84,8 @@ The following table list the configurable parameters of the chart. | `inferenceExtension.image.tag` | Image tag of the endpoint picker. | | `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. | | `inferenceExtension.extProcPort` | Port where the endpoint picker service is served for external processing. Defaults to `9002`. | +| `inferenceExtension.env` | Map of environment variables to set in the endpoint picker container. Defaults to `{}`. | +| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. 
| ## Notes diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index d925a38e9..0fcab4932 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -35,9 +35,14 @@ spec: - "9003" - -metricsPort - "9090" - env: - - name: USE_STREAMING - value: "true" + {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }} + - -totalQueuedRequestsMetric + - "nv_trt_llm_request_metrics{request_type=waiting}" + - -kvCacheUsagePercentageMetric + - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" + - -loraInfoMetric + - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. + {{- end }} ports: - name: grpc containerPort: 9002 @@ -57,4 +62,8 @@ spec: service: inference-extension initialDelaySeconds: 5 periodSeconds: 10 - + env: + {{- range $key, $value := .Values.inferenceExtension.env }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml index 220b3beaa..70e05b568 100644 --- a/config/charts/inferencepool/templates/gke.yaml +++ b/config/charts/inferencepool/templates/gke.yaml @@ -33,6 +33,8 @@ spec: name: {{ .Release.Name }} default: timeoutSec: 300 # 5-minute timeout (adjust as needed) + logging: + enabled: true # log all requests by default --- apiVersion: monitoring.googleapis.com/v1 kind: ClusterPodMonitoring diff --git a/config/charts/inferencepool/templates/rbac.yaml b/config/charts/inferencepool/templates/rbac.yaml index cdd50c6a2..4cb5d7be0 100644 --- a/config/charts/inferencepool/templates/rbac.yaml +++ b/config/charts/inferencepool/templates/rbac.yaml @@ -33,6 +33,7 @@ subjects: name: {{ include "gateway-api-inference-extension.name" . }} namespace: {{ .Release.Namespace }} roleRef: + apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: {{ include "gateway-api-inference-extension.name" . }} --- diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 766ee087b..f793d865d 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -6,9 +6,14 @@ inferenceExtension: tag: main pullPolicy: Always extProcPort: 9002 + env: {} + # Example environment variables: + # env: + # KV_CACHE_SCORE_WEIGHT: "1" inferencePool: targetPortNumber: 8000 + modelServerType: vllm # vllm, triton-tensorrt-llm # modelServers: # REQUIRED # matchLabels: # app: vllm-llama3-8b-instruct diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index 288050962..d76c93f9f 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -88,7 +88,7 @@ spec: ModelNames must be unique for a referencing InferencePool (names can be reused for a different pool in the same cluster). The modelName with the oldest creation timestamp is retained, and the incoming - InferenceModel is sets the Ready status to false with a corresponding reason. + InferenceModel's Ready status is set to false with a corresponding reason. In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. 
Names can be reserved without an underlying model configured in the pool. This can be done by specifying a target model and setting the weight to zero, diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 8386db82c..db342d034 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -134,16 +134,32 @@ spec: - targetPortNumber type: object status: - description: InferencePoolStatus defines the observed state of InferencePool + default: + parent: + - conditions: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted + parentRef: + kind: Status + name: default + description: Status defines the observed state of InferencePool. properties: parent: description: |- Parents is a list of parent resources (usually Gateways) that are - associated with the route, and the status of the InferencePool with respect to + associated with the InferencePool, and the status of the InferencePool with respect to each parent. - A maximum of 32 Gateways will be represented in this list. An empty list - means the route has not been attached to any Gateway. + A maximum of 32 Gateways will be represented in this list. When the list contains + `kind: Status, name: default`, it indicates that the InferencePool is not + associated with any Gateway and a controller must perform the following: + + - Remove the parent when setting the "Accepted" condition. + - Add the parent when the controller will no longer manage the InferencePool + and no other parents exist. items: description: PoolStatus defines the observed state of InferencePool from a Gateway. diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml deleted file mode 100644 index 1fd9939f9..000000000 --- a/config/default/kustomization.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Adds namespace to all resources. -namespace: api-system - -# Value of this field is prepended to the -# names of all resources, e.g. a deployment named -# "wordpress" becomes "alices-wordpress". -# Note that it should also match with the prefix (text before '-') of the namespace -# field above. -namePrefix: api- - -# Labels to add to all resources and selectors. -#labels: -#- includeSelectors: true -# pairs: -# someName: someValue - -resources: -- ../crd -- ../rbac -- ../manager -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- ../webhook -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -#- ../certmanager -# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. -#- ../prometheus -# [METRICS] Expose the controller manager metrics service. -- metrics_service.yaml -# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. -# Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. -# Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will -# be able to communicate with the Webhook Server. 
-#- ../network-policy - -# Uncomment the patches line if you enable Metrics, and/or are using webhooks and cert-manager -patches: -# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. -# More info: https://book.kubebuilder.io/reference/metrics -- path: manager_metrics_patch.yaml - target: - kind: Deployment - -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- path: manager_webhook_patch.yaml - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. -# Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. -# 'CERTMANAGER' needs to be enabled to use ca injection -#- path: webhookcainjection_patch.yaml - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. -# Uncomment the following replacements to add the cert-manager CA injection annotations -#replacements: -# - source: # Add cert-manager annotation to ValidatingWebhookConfiguration, MutatingWebhookConfiguration and CRDs -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # namespace of the certificate CR -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - source: # Add cert-manager annotation to the webhook Service -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.name # namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 0 -# create: true -# - source: -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.namespace # namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' 
-# index: 1 -# create: true diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml deleted file mode 100644 index 2aaef6536..000000000 --- a/config/default/manager_metrics_patch.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# This patch adds the args to allow exposing the metrics endpoint using HTTPS -- op: add - path: /spec/template/spec/containers/0/args/0 - value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml deleted file mode 100644 index 140d49433..000000000 --- a/config/default/metrics_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: 8443 - selector: - control-plane: controller-manager diff --git a/config/manifests/benchmark/benchmark.yaml b/config/manifests/benchmark/benchmark.yaml index c784730e8..abf9ae5f6 100644 --- a/config/manifests/benchmark/benchmark.yaml +++ b/config/manifests/benchmark/benchmark.yaml @@ -37,7 +37,7 @@ spec: - name: BACKEND value: vllm - name: PORT - value: "8081" + value: "80" - name: INPUT_LENGTH value: "1024" - name: OUTPUT_LENGTH diff --git a/config/manifests/gateway/gke/gcp-backend-policy.yaml b/config/manifests/gateway/gke/gcp-backend-policy.yaml index 519a5a930..7b294304e 100644 --- a/config/manifests/gateway/gke/gcp-backend-policy.yaml +++ b/config/manifests/gateway/gke/gcp-backend-policy.yaml @@ -9,3 +9,5 @@ spec: name: vllm-llama3-8b-instruct default: timeoutSec: 300 + logging: + enabled: true diff --git a/config/manifests/gateway/gke/healthcheck.yaml b/config/manifests/gateway/gke/healthcheck.yaml index 95f4f2d2b..93b6cd7fa 100644 --- a/config/manifests/gateway/gke/healthcheck.yaml +++ b/config/manifests/gateway/gke/healthcheck.yaml @@ -7,7 +7,7 @@ spec: targetRef: group: "inference.networking.x-k8s.io" kind: InferencePool - name: vllm-llama2-7b + name: vllm-llama3-8b-instruct default: config: type: HTTP diff --git a/config/manifests/gateway/istio/destination-rule.yaml b/config/manifests/gateway/istio/destination-rule.yaml index f9cd0c3c5..12ca982bc 100644 --- a/config/manifests/gateway/istio/destination-rule.yaml +++ b/config/manifests/gateway/istio/destination-rule.yaml @@ -3,7 +3,7 @@ kind: DestinationRule metadata: name: epp-insecure-tls spec: - host: vllm-llama2-7b-epp + host: vllm-llama3-8b-instruct-epp trafficPolicy: tls: mode: SIMPLE diff --git a/config/manifests/gateway/kgateway/httproute.yaml b/config/manifests/gateway/kgateway/httproute.yaml index 03967729d..18e90ced6 100644 --- a/config/manifests/gateway/kgateway/httproute.yaml +++ b/config/manifests/gateway/kgateway/httproute.yaml @@ -12,7 +12,6 @@ spec: - group: inference.networking.x-k8s.io kind: InferencePool name: vllm-llama3-8b-instruct - port: 8000 # Remove when https://github.com/kgateway-dev/kgateway/issues/10987 is fixed. 
matches: - path: type: PathPrefix diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 75c9bb173..67c91d0e5 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -8,9 +8,8 @@ spec: poolRef: name: vllm-llama3-8b-instruct targetModels: - - name: food-review + - name: food-review-1 weight: 100 - --- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel @@ -21,7 +20,6 @@ spec: criticality: Critical poolRef: name: vllm-llama3-8b-instruct - --- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml index cef70d7ff..34aacfa49 100644 --- a/config/manifests/inferencepool-resources.yaml +++ b/config/manifests/inferencepool-resources.yaml @@ -1,7 +1,9 @@ +# Note: If you change this file, please also change the file used for e2e tests! +# +# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: - labels: name: vllm-llama3-8b-instruct spec: targetPortNumber: 8000 @@ -51,6 +53,8 @@ spec: args: - -poolName - "vllm-llama3-8b-instruct" + - "-poolNamespace" + - "default" - -v - "4" - --zap-encoder @@ -59,9 +63,6 @@ spec: - "9002" - -grpcHealthPort - "9003" - env: - - name: USE_STREAMING - value: "true" ports: - containerPort: 9002 - containerPort: 9003 @@ -85,18 +86,15 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: pod-read rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] - apiGroups: ["inference.networking.x-k8s.io"] resources: ["inferencemodels"] verbs: ["get", "watch", "list"] - apiGroups: [""] resources: ["pods"] verbs: ["get", "watch", "list"] -- apiGroups: ["inference.networking.x-k8s.io"] - resources: ["inferencepools"] - verbs: ["get", "watch", "list"] -- apiGroups: ["discovery.k8s.io"] - resources: ["endpointslices"] - verbs: ["get", "watch", "list"] - apiGroups: - authentication.k8s.io resources: @@ -119,5 +117,6 @@ subjects: name: default namespace: default roleRef: + apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: pod-read diff --git a/config/manifests/regression-testing/inferencemodel.yaml b/config/manifests/regression-testing/inferencemodel.yaml new file mode 100644 index 000000000..d8eada95a --- /dev/null +++ b/config/manifests/regression-testing/inferencemodel.yaml @@ -0,0 +1,237 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-0 +spec: + modelName: adapter-0 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-0 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-1 +spec: + modelName: adapter-1 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-1 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-2 +spec: + modelName: adapter-2 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-2 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-3 +spec: + modelName: adapter-3 + criticality: Critical + poolRef: + name: 
vllm-llama3-8b-instruct + targetModels: + - name: adapter-3 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-4 +spec: + modelName: adapter-4 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-4 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-5 +spec: + modelName: adapter-5 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-5 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-6 +spec: + modelName: adapter-6 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-6 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-7 +spec: + modelName: adapter-7 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-7 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-8 +spec: + modelName: adapter-8 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-8 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-9 +spec: + modelName: adapter-9 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-9 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-10 +spec: + modelName: adapter-10 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-10 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-11 +spec: + modelName: adapter-11 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-11 + weight: 100 + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-12 +spec: + modelName: adapter-12 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-12 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-13 +spec: + modelName: adapter-13 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-13 + weight: 100 + + +--- + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: adapter-14 +spec: + modelName: adapter-14 + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: adapter-14 + weight: 100 + +--- + + +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: base-model +spec: + modelName: meta-llama/Llama-3.1-8B-Instruct + criticality: Critical + poolRef: + name: vllm-llama3-8b-instruct \ No newline at end of file diff --git a/config/manifests/regression-testing/multi-lora-regression.yaml b/config/manifests/regression-testing/multi-lora-regression.yaml new file mode 100644 index 000000000..00b5d7d50 --- /dev/null +++ b/config/manifests/regression-testing/multi-lora-regression.yaml @@ -0,0 +1,62 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: 
+ app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: '20,40,60,80,100,120,140,160,180,200' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'adapter-0,adapter-1,adapter-2,adapter-3,adapter-4,adapter-5,adapter-6,adapter-7,adapter-8,adapter-9,adapter-10,adapter-11,adapter-12,adapter-13,adapter-14' + - name: TRAFFIC_SPLIT + value: '0.12,0.12,0.12,0.12,0.12,0.06,0.06,0.06,0.06,0.06,0.02,0.02,0.02,0.02,0.02' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: Infinity-Instruct_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/single-workload-regression.yaml b/config/manifests/regression-testing/single-workload-regression.yaml new file mode 100644 index 000000000..b13b7eed8 --- /dev/null +++ b/config/manifests/regression-testing/single-workload-regression.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: benchmark-tool + name: benchmark-tool +spec: + replicas: 1 + selector: + matchLabels: + app: benchmark-tool + template: + metadata: + labels: + app: benchmark-tool + spec: + containers: + # Build image from this source https://github.com/AI-Hypercomputer/inference-benchmark/tree/46d638262650a1928e47699d78ab2da062d4422d + - image: '' + imagePullPolicy: Always + name: benchmark-tool + command: + - bash + - -c + - ./latency_throughput_curve.sh + env: + - name: IP + value: '' + - name: REQUEST_RATES + value: '300,310,320,330,340,350' + - name: BENCHMARK_TIME_SECONDS + value: '300' + - name: TOKENIZER + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: MODELS + value: 'meta-llama/Llama-3.1-8B-Instruct' + - name: BACKEND + value: vllm + - name: PORT + value: "80" + - name: INPUT_LENGTH + value: "1024" + - name: OUTPUT_LENGTH + value: '1024' + - name: FILE_PREFIX + value: benchmark + - name: PROMPT_DATASET_FILE + value: billsum_conversations.json + - name: HF_TOKEN + valueFrom: + secretKeyRef: + key: token + name: hf-token + resources: + limits: + cpu: "2" + memory: 20Gi + requests: + cpu: "2" + memory: 20Gi \ No newline at end of file diff --git a/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml new file mode 100644 index 000000000..114cd9922 --- /dev/null +++ b/config/manifests/regression-testing/vllm/multi-lora-deployment.yaml @@ -0,0 +1,289 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct +spec: + replicas: 10 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: vllm + image: "vllm/vllm-openai:latest" + imagePullPolicy: 
Always
+        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+        args:
+        - "--model"
+        - "meta-llama/Llama-3.1-8B-Instruct"
+        - "--tensor-parallel-size"
+        - "1"
+        - "--port"
+        - "8000"
+        - "--enable-lora"
+        - "--max-loras"
+        - "15"
+        - "--max-cpu-loras"
+        - "15"
+        - "--compilation-config"
+        - "3"
+        - "--max-lora-rank"
+        - "8"
+        - "--max-num-seqs"
+        - "2048"
+        - "--max-model-len"
+        - "2048"
+        - "--no-enable-prefix-caching"
+        env:
+        - name: PORT
+          value: "8000"
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token
+              key: token
+        - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+          value: "false"
+        ports:
+        - containerPort: 8000
+          name: http
+          protocol: TCP
+        lifecycle:
+          preStop:
+            # vLLM stops accepting connections when it receives SIGTERM, so we need to sleep
+            # to give upstream gateways a chance to take us out of rotation. The time we wait
+            # is dependent on the time it takes for all upstreams to completely remove us from
+            # rotation. Older or simpler load balancers might take upwards of 30s, but we expect
+            # our deployment to run behind a modern gateway like Envoy which is designed to
+            # probe for readiness aggressively.
+            sleep:
+              # Upstream gateway probers for health should be set on a low period, such as 5s,
+              # and the shorter we can tighten that bound the faster that we release
+              # accelerators during controlled shutdowns. However, we should expect variance,
+              # as load balancers may have internal delays, and we don't want to drop requests
+              # normally, so we're often aiming to set this value to a p99 propagation latency
+              # of readiness -> load balancer taking backend out of rotation, not the average.
+              #
+              # This value is generally stable and must often be experimentally determined
+              # for a given load balancer and health check period. We set the value here to
+              # the highest value we observe on a supported load balancer, and we recommend
+              # tuning this value down and verifying no requests are dropped.
+              #
+              # If this value is updated, be sure to update terminationGracePeriodSeconds.
+              #
+              seconds: 30
+              #
+              # IMPORTANT: preStop.sleep is beta as of Kubernetes 1.30 - for older versions
+              # replace with this exec action.
+              #exec:
+              #  command:
+              #  - /usr/bin/sleep
+              #  - "30"
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: http
+            scheme: HTTP
+          # vLLM's health check is simple, so we can more aggressively probe it. Liveness
+          # check endpoints should always be suitable for aggressive probing.
+          periodSeconds: 1
+          successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant. However, any liveness triggered restart requires the very
+          # large core model to be reloaded, and so we should bias towards ensuring the
+          # server is definitely unhealthy vs immediately restarting. Use 5 attempts as
+          # evidence of a serious problem.
+          failureThreshold: 5
+          timeoutSeconds: 1
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: http
+            scheme: HTTP
+          # vLLM's health check is simple, so we can more aggressively probe it. Readiness
+          # check endpoints should always be suitable for aggressive probing, but may be
+          # slightly more expensive than liveness probes.
+          periodSeconds: 1
+          successThreshold: 1
+          # vLLM has a very simple health implementation, which means that any failure is
+          # likely significant.
+          failureThreshold: 1
+          timeoutSeconds: 1
+        # We set a startup probe so that we don't begin directing traffic or checking
+        # liveness to this instance until the model is loaded.
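+        # (With the failureThreshold of 600 and periodSeconds of 1 configured below, the
+        # probe allows roughly 600 x 1s = 10 minutes of startup before the pod is restarted.)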
+        startupProbe:
+          # Failure threshold is when we believe startup will not happen at all, and is set
+          # to the maximum possible time we believe loading a model will take. In our
+          # default configuration we are downloading a model from HuggingFace, which may
+          # take a long time, then the model must load into the accelerator. We choose
+          # 10 minutes as a reasonable maximum startup time before giving up and attempting
+          # to restart the pod.
+          #
+          # IMPORTANT: If the core model takes more than 10 minutes to load, pods will crash
+          # loop forever. Be sure to set this appropriately.
+          failureThreshold: 600
+          # Set delay to start low so that if the base model changes to something smaller
+          # or an optimization is deployed, we don't wait unnecessarily.
+          initialDelaySeconds: 2
+          # As a startup probe, this stops running and so we can more aggressively probe
+          # even a moderately complex startup - this is a very important workload.
+          periodSeconds: 1
+          httpGet:
+            # vLLM does not start the OpenAI server (and hence make /health available)
+            # until models are loaded. This may not be true for all model servers.
+            path: /health
+            port: http
+            scheme: HTTP
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
+        volumeMounts:
+        - mountPath: /data
+          name: data
+        - mountPath: /dev/shm
+          name: shm
+        - name: adapters
+          mountPath: "/adapters"
+      initContainers:
+      - name: lora-adapter-syncer
+        tty: true
+        stdin: true
+        image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
+        restartPolicy: Always
+        imagePullPolicy: Always
+        env:
+        - name: DYNAMIC_LORA_ROLLOUT_CONFIG
+          value: "/config/configmap.yaml"
+        volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths
+        - name: config-volume
+          mountPath: /config
+      restartPolicy: Always
+
+      # vLLM allows VLLM_PORT to be specified as an environment variable, but a user might
+      # create a 'vllm' service in their namespace. That auto-injects VLLM_PORT in docker
+      # compatible form as `tcp://<ip>:<port>` instead of the numeric value vLLM accepts,
+      # causing CrashLoopBackoff. Set service environment injection off by default.
+      enableServiceLinks: false
+
+      # Generally, the termination grace period needs to last longer than the slowest request
+      # we expect to serve plus any extra time spent waiting for load balancers to take the
+      # model server out of rotation.
+      #
+      # An easy starting point is the p99 or max request latency measured for your workload,
+      # although LLM request latencies vary significantly if clients send longer inputs or
+      # trigger longer outputs. Since steady state p99 will be higher than the latency
+      # to drain a server, you may wish to slightly lower this value either experimentally or
+      # via the calculation below.
+      #
+      # For most models you can derive an upper bound for the maximum drain latency as
+      # follows:
+      #
+      # 1. Identify the maximum context length the model was trained on, or the maximum
+      #    allowed length of output tokens configured on vLLM (llama2-7b was trained to
+      #    4k context length, while llama3-8b was trained to 128k).
+      # 2. Output tokens are the more compute intensive to calculate and the accelerator
+      #    will have a maximum concurrency (batch size) - the time per output token at
+      #    maximum batch with no prompt tokens being processed is the slowest an output
+      #    token can be generated (for this model it would be about 100ms TPOT at a max
+      #    batch size around 50)
+      # 3. Calculate the worst case request duration if a request starts immediately
+      #    before the server stops accepting new connections - generally when it receives
+      #    SIGTERM (for this model that is about 4096 / 10 ~ 40s)
+      # 4. Any requests still generating prompt tokens will delay when those output tokens
+      #    start; prompt token generation is roughly 6x faster than compute-bound output
+      #    token generation, so add 20% to the time from above (40s + 16s ~ 55s)
+      #
+      # Thus we think it will take us at worst about 55s to complete the longest possible
+      # request the model is likely to receive at maximum concurrency (highest latency)
+      # once requests stop being sent.
+      #
+      # NOTE: This number will be lower than steady state p99 latency since we stop receiving
+      #       new requests which require continuous prompt token computation.
+      # NOTE: The max timeout for backend connections from gateway to model servers should
+      #       be configured based on steady state p99 latency, not drain p99 latency
+      #
+      # 5. Add the time the pod takes in its preStop hook to allow the load balancers to
+      #    stop sending us new requests (55s + 30s ~ 85s)
+      #
+      # Because termination grace period controls when the Kubelet forcibly terminates a
+      # stuck or hung process (a possibility due to a GPU crash), there is operational safety
+      # in keeping the value roughly proportional to the time to finish serving. There is also
+      # value in adding a bit of extra time to deal with unexpectedly long workloads.
+      #
+      # 6. Add a 50% safety buffer to this time since the operational impact should be low
+      #    (85s * 1.5 ~ 130s)
+      #
+      # One additional source of drain latency is that some workloads may run close to
+      # saturation and have queued requests on each server. Since traffic in excess of the
+      # max sustainable QPS will result in timeouts as the queues grow, we assume that failure
+      # to drain in time due to excess queues at the time of shutdown is an expected failure
+      # mode of server overload. If your workload occasionally experiences high queue depths
+      # due to periodic traffic, consider increasing the safety margin above to account for
+      # time to drain queued requests.
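+      #
+      # In short, the arithmetic above gives: ~55s worst-case drain + 30s preStop sleep
+      # ~ 85s, times the 1.5 safety buffer ~ 130s, which is the value set below.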
+ terminationGracePeriodSeconds: 130 + + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama3-8b-instruct-adapters +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3.1-8b-instruct + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: adapter-0 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-1 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-2 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-3 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-4 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-5 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-6 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-7 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-8 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-9 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-10 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-11 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-12 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-13 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + - id: adapter-14 + source: nvidia/llama-3.1-nemoguard-8b-topic-control + + + + diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml index 6fb409500..485d44a19 100644 --- a/config/manifests/vllm/cpu-deployment.yaml +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: lora - image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.8.0" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo + image: "public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v0.9.1" # formal images can be found in https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo imagePullPolicy: Always command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: @@ -113,5 +113,8 @@ data: ensureExist: models: - base-model: Qwen/Qwen2.5-1.5B - id: food-review-1 + id: food-review + source: SriSanth2345/Qwen-1.5B-Tweet-Generations + - base-model: Qwen/Qwen2.5-1.5B + id: cad-fabricator source: SriSanth2345/Qwen-1.5B-Tweet-Generations \ No newline at end of file diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index 4f13736d9..16f938826 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -24,9 +24,15 @@ spec: - "1" - "--port" - "8000" + - "--max-num-seq" + - "1024" + - "--compilation-config" + - "3" - "--enable-lora" - "--max-loras" - "2" + - "--max-lora-rank" + - "8" - "--max-cpu-loras" - "12" env: @@ -77,7 +83,7 @@ spec: #exec: # command: # - /usr/bin/sleep - # - 30 + # - "30" livenessProbe: httpGet: path: /health @@ -133,7 +139,6 @@ spec: path: /health port: http scheme: HTTP - resources: limits: nvidia.com/gpu: 1 @@ -244,12 +249,10 @@ metadata: data: configmap.yaml: | vLLMLoRAConfig: - name: vllm-llama3.1-8b-instruct + name: vllm-llama3-8b-instruct-adapters port: 8000 defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct ensureExist: models: - - id: food-review + - id: food-review-1 source: Kawon/llama3.1-food-finetune_v14_r8 
- - id: cad-fabricator - source: redcathode/fabricator diff --git a/config/manifests/vllm/sim-deployment.yaml b/config/manifests/vllm/sim-deployment.yaml new file mode 100644 index 000000000..1862efe48 --- /dev/null +++ b/config/manifests/vllm/sim-deployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct +spec: + replicas: 3 + selector: + matchLabels: + app: vllm-llama3-8b-instruct + template: + metadata: + labels: + app: vllm-llama3-8b-instruct + spec: + containers: + - name: vllm-sim + image: ghcr.io/llm-d/llm-d-inference-sim:latest + imagePullPolicy: Always + args: + - --model + - meta-llama/Llama-3.1-8B-Instruct + - --port + - "8000" + - --max-loras + - "2" + - --lora + - food-review-1 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - containerPort: 8000 + name: http + protocol: TCP + resources: + requests: + cpu: 10m diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml deleted file mode 100644 index aae53668e..000000000 --- a/config/network-policy/allow-metrics-traffic.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This NetworkPolicy allows ingress traffic -# with Pods running on namespaces labeled with 'metrics: enabled'. Only Pods on those -# namespaces are able to gathering data from the metrics endpoint. -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: allow-metrics-traffic - namespace: system -spec: - podSelector: - matchLabels: - control-plane: controller-manager - policyTypes: - - Ingress - ingress: - # This allows ingress traffic from any namespace with the label metrics: enabled - - from: - - namespaceSelector: - matchLabels: - metrics: enabled # Only from namespaces with this label - ports: - - port: 8443 - protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml deleted file mode 100644 index ec0fb5e57..000000000 --- a/config/network-policy/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- allow-metrics-traffic.yaml diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml deleted file mode 100644 index ed137168a..000000000 --- a/config/prometheus/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- monitor.yaml diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml deleted file mode 100644 index aac24ef33..000000000 --- a/config/prometheus/monitor.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-monitor - namespace: system -spec: - endpoints: - - path: /metrics - port: https # Ensure this is the name of the port that exposes HTTPS metrics - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables - # certificate verification. 
This poses a significant security risk by making the system vulnerable to - # man-in-the-middle attacks, where an attacker could intercept and manipulate the communication between - # Prometheus and the monitored services. This could lead to unauthorized access to sensitive metrics data, - # compromising the integrity and confidentiality of the information. - # Please use the following options for secure configurations: - # caFile: /etc/metrics-certs/ca.crt - # certFile: /etc/metrics-certs/tls.crt - # keyFile: /etc/metrics-certs/tls.key - insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager diff --git a/config/rbac/inferencemodel_editor_role.yaml b/config/rbac/inferencemodel_editor_role.yaml deleted file mode 100644 index b175a9a39..000000000 --- a/config/rbac/inferencemodel_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to edit inferencemodels. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencemodel-editor-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels/status - verbs: - - get diff --git a/config/rbac/inferencemodel_viewer_role.yaml b/config/rbac/inferencemodel_viewer_role.yaml deleted file mode 100644 index 3b3e67f66..000000000 --- a/config/rbac/inferencemodel_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view inferencemodels. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencemodel-viewer-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - verbs: - - get - - list - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels/status - verbs: - - get diff --git a/config/rbac/inferencepool_editor_role.yaml b/config/rbac/inferencepool_editor_role.yaml deleted file mode 100644 index cc1f7c353..000000000 --- a/config/rbac/inferencepool_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to edit inferencepools. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencepool-editor-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools/status - verbs: - - get diff --git a/config/rbac/inferencepool_viewer_role.yaml b/config/rbac/inferencepool_viewer_role.yaml deleted file mode 100644 index 828e0022c..000000000 --- a/config/rbac/inferencepool_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view inferencepools. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencepool-viewer-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools - verbs: - - get - - list - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools/status - verbs: - - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml deleted file mode 100644 index c3a521375..000000000 --- a/config/rbac/kustomization.yaml +++ /dev/null @@ -1,29 +0,0 @@ -resources: -# All RBAC will be applied under this service account in -# the deployment namespace. You may comment out this resource -# if your manager will use a service account that exists at -# runtime. Be sure to update RoleBinding and ClusterRoleBinding -# subjects if changing service account names. -- service_account.yaml -- role.yaml -- role_binding.yaml -- leader_election_role.yaml -- leader_election_role_binding.yaml -# The following RBAC configurations are used to protect -# the metrics endpoint with authn/authz. These configurations -# ensure that only authorized users and service accounts -# can access the metrics endpoint. Comment the following -# permissions if you want to disable this protection. -# More info: https://book.kubebuilder.io/reference/metrics.html -- metrics_auth_role.yaml -- metrics_auth_role_binding.yaml -- metrics_reader_role.yaml -# For each CRD, "Editor" and "Viewer" roles are scaffolded by -# default, aiding admins in cluster management. Those roles are -# not used by the Project itself. You can comment the following lines -# if you do not want those helpers be installed with your Project. -- inferencemodel_editor_role.yaml -- inferencemodel_viewer_role.yaml -- inferencepool_editor_role.yaml -- inferencepool_viewer_role.yaml - diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml deleted file mode 100644 index e2f8551b5..000000000 --- a/config/rbac/leader_election_role.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# permissions to do leader election. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml deleted file mode 100644 index fb71a1222..000000000 --- a/config/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml deleted file mode 100644 index 32d2e4ec6..000000000 --- a/config/rbac/metrics_auth_role.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-auth-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml deleted file mode 100644 index e775d67ff..000000000 --- a/config/rbac/metrics_auth_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: metrics-auth-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: metrics-auth-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml deleted file mode 100644 index 51a75db47..000000000 --- a/config/rbac/metrics_reader_role.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml deleted file mode 100644 index 9d6247ebe..000000000 --- a/config/rbac/role.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: manager-role -rules: -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml deleted file mode 100644 index c66b66bf3..000000000 --- a/config/rbac/role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: manager-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: manager-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/service_account.yaml 
b/config/rbac/service_account.yaml deleted file mode 100644 index 9286120f8..000000000 --- a/config/rbac/service_account.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager - namespace: system diff --git a/config/samples/gateway_v1alpha1_inferencemodel.yaml b/config/samples/gateway_v1alpha1_inferencemodel.yaml deleted file mode 100644 index 34ea06803..000000000 --- a/config/samples/gateway_v1alpha1_inferencemodel.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferenceModel -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: sample-sql-assist -spec: - criticality: Critical - modelName: sql-code-assist - poolRef: - name: vllm-llama-31-8b-sample-pool - targetModels: - - name: npc-bot-v1 - weight: 50 - - name: npc-bot-v2 - weight: 50 diff --git a/config/samples/gateway_v1alpha1_inferencepool.yaml b/config/samples/gateway_v1alpha1_inferencepool.yaml deleted file mode 100644 index 4993d7864..000000000 --- a/config/samples/gateway_v1alpha1_inferencepool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferencePool -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: vllm-llama-31-8b-sample-pool -spec: - selector: - app: npc-bot - targetPortNumber: 8000 diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml deleted file mode 100644 index e4b9f2e83..000000000 --- a/config/samples/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -## Append samples of your project ## -resources: -- gateway_v1alpha1_inferencepool.yaml -- gateway_v1alpha1_inferencemodel.yaml -# +kubebuilder:scaffold:manifestskustomizesamples diff --git a/conformance/conformance.go b/conformance/conformance.go new file mode 100644 index 000000000..2fbcc31a5 --- /dev/null +++ b/conformance/conformance.go @@ -0,0 +1,369 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package conformance contains the core setup and execution logic +// for the Gateway API Inference Extension conformance test suite. 
+package conformance
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"slices"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
+	clientset "k8s.io/client-go/kubernetes"
+	clientsetscheme "k8s.io/client-go/kubernetes/scheme"
+
+	// Import runtime package for scheme creation
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	k8sconfig "sigs.k8s.io/controller-runtime/pkg/client/config"
+	"sigs.k8s.io/yaml"
+
+	// Import necessary types and utilities from the core Gateway API conformance suite.
+	gatewayv1 "sigs.k8s.io/gateway-api/apis/v1"            // Import core Gateway API types
+	confapis "sigs.k8s.io/gateway-api/conformance/apis/v1" // Report struct definition
+	confflags "sigs.k8s.io/gateway-api/conformance/utils/flags"
+	apikubernetes "sigs.k8s.io/gateway-api/conformance/utils/kubernetes"
+	confsuite "sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/conformance/utils/tlog"
+	"sigs.k8s.io/gateway-api/pkg/features"
+
+	// Import the test definitions package to access the ConformanceTests slice
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+
+	// Import test packages using blank identifier
+	// This triggers the init() functions in these packages, which register the tests
+	// by appending them to the tests.ConformanceTests slice.
+	_ "sigs.k8s.io/gateway-api-inference-extension/conformance/tests/basic"
+	// TODO: Add blank imports for other test categories as they are created.
+	// _ "sigs.k8s.io/gateway-api-inference-extension/conformance/tests/model_routing"
+
+	// Import the Inference Extension API types
+	inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	inferenceconfig "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config"
+)
+
+const (
+	infraNameSpace       = "gateway-conformance-infra"
+	appBackendNameSpace  = "gateway-conformance-app-backend"
+	primaryGatewayName   = "conformance-primary-gateway"
+	secondaryGatewayName = "conformance-secondary-gateway"
+)
+
+var (
+	primaryGatewayNN   = types.NamespacedName{Name: primaryGatewayName, Namespace: infraNameSpace}
+	secondaryGatewayNN = types.NamespacedName{Name: secondaryGatewayName, Namespace: infraNameSpace}
+)
+
+// GatewayLayerProfileName defines the name for the conformance profile that tests
+// the Gateway API layer aspects of the Inference Extension (e.g., InferencePool, InferenceModel CRDs).
+// Future profiles will cover EPP and ModelServer layers.
+const GatewayLayerProfileName confsuite.ConformanceProfileName = "Gateway"
+
+// TODO(#863) Create a dedicated shared location for feature names, similar to
+// sigs.k8s.io/gateway-api/pkg/features, and change the tests from
+// string-casting the feature name to referencing the shared feature names.
+
+// Conformance-specific features
+const SupportInferencePool features.FeatureName = "SupportInferencePool"
+
+// InferenceCoreFeatures defines the core features that implementations
+// of the "Gateway" profile for the Inference Extension MUST support.
+var InferenceCoreFeatures = sets.New(
+	features.SupportGateway, // This is needed to ensure the manifest gets applied during setup.
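+	// SupportHTTPRoute is required because the conformance tests attach HTTPRoutes
+	// that reference InferencePool backends to the shared conformance Gateways.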
+ features.SupportHTTPRoute, + SupportInferencePool, +) + +var GatewayLayerProfile = confsuite.ConformanceProfile{ + Name: GatewayLayerProfileName, + CoreFeatures: InferenceCoreFeatures, +} + +// logDebugf conditionally logs a debug message if debug mode is enabled. +func logDebugf(t *testing.T, debug bool, format string, args ...any) { + if debug { + t.Helper() + t.Logf(format, args...) + } +} + +// DefaultOptions parses command line flags and sets up the suite options. +// Adapted from the core Gateway API conformance suite. +func DefaultOptions(t *testing.T) confsuite.ConformanceOptions { + t.Helper() + + cfg, err := k8sconfig.GetConfig() + require.NoError(t, err, "error loading Kubernetes config") + + scheme := runtime.NewScheme() + + t.Log("Registering API types with scheme...") + // Register core K8s types (like v1.Secret for certs) to scheme, needed by client to create/manage these resources. + require.NoError(t, clientsetscheme.AddToScheme(scheme), "failed to add core Kubernetes types to scheme") + // Add Gateway API types + require.NoError(t, gatewayv1.Install(scheme), "failed to install gatewayv1 types into scheme") + // Add APIExtensions types (for CRDs) + require.NoError(t, apiextensionsv1.AddToScheme(scheme), "failed to add apiextensionsv1 types to scheme") + + // Register Inference Extension API types + t.Logf("Attempting to install inferencev1alpha2 types into scheme from package: %s", inferencev1alpha2.GroupName) + require.NoError(t, inferencev1alpha2.Install(scheme), "failed to install inferencev1alpha2 types into scheme") + + clientOptions := client.Options{Scheme: scheme} + c, err := client.New(cfg, clientOptions) + require.NoError(t, err, "error initializing Kubernetes client") + cs, err := clientset.NewForConfig(cfg) + require.NoError(t, err, "error initializing Kubernetes clientset") + + exemptFeatures := confsuite.ParseSupportedFeatures(*confflags.ExemptFeatures) + skipTests := confsuite.ParseSkipTests(*confflags.SkipTests) + namespaceLabels := confsuite.ParseKeyValuePairs(*confflags.NamespaceLabels) + namespaceAnnotations := confsuite.ParseKeyValuePairs(*confflags.NamespaceAnnotations) + + // Initially, run the GatewayLayerProfile. This will expand as other profiles + // (EPP, ModelServer) are added and can be selected via flags in future iterations. 
+ conformanceProfiles := sets.New(GatewayLayerProfileName) + + // Implementation details from flags + implementation := confsuite.ParseImplementation( + *confflags.ImplementationOrganization, + *confflags.ImplementationProject, + *confflags.ImplementationURL, + *confflags.ImplementationVersion, + *confflags.ImplementationContact, + ) + + // Inference Extension Specific Report Fields + inferenceExtensionVersion := "v0.3.0" + _ = inferenceExtensionVersion // Avoid unused variable error until implemented + + baseManifestsValue := "resources/manifests/manifests.yaml" + + opts := confsuite.ConformanceOptions{ + Client: c, + ClientOptions: clientOptions, + Clientset: cs, + RestConfig: cfg, + GatewayClassName: *confflags.GatewayClassName, + BaseManifests: baseManifestsValue, + Debug: *confflags.ShowDebug, + CleanupBaseResources: *confflags.CleanupBaseResources, + SupportedFeatures: sets.New[features.FeatureName](), + TimeoutConfig: inferenceconfig.DefaultInferenceExtensionTimeoutConfig().TimeoutConfig, + SkipTests: skipTests, + ExemptFeatures: exemptFeatures, + RunTest: *confflags.RunTest, + Mode: *confflags.Mode, + Implementation: implementation, + ConformanceProfiles: conformanceProfiles, + ManifestFS: []fs.FS{&Manifests}, + ReportOutputPath: *confflags.ReportOutput, + SkipProvisionalTests: *confflags.SkipProvisionalTests, + AllowCRDsMismatch: *confflags.AllowCRDsMismatch, + NamespaceLabels: namespaceLabels, + NamespaceAnnotations: namespaceAnnotations, + // TODO: Add the inference extension specific fields to ConformanceOptions struct if needed, + // or handle them during report generation. + // GatewayAPIInferenceExtensionChannel: inferenceExtensionChannel, + // GatewayAPIInferenceExtensionVersion: inferenceExtensionVersion, + } + + // Populate SupportedFeatures based on the GatewayLayerProfile. + // Since all features are mandatory for this profile, add all defined core features. + if opts.ConformanceProfiles.Has(GatewayLayerProfileName) { + logDebugf(t, opts.Debug, "Populating SupportedFeatures with GatewayLayerProfile.CoreFeatures: %v", GatewayLayerProfile.CoreFeatures.UnsortedList()) + if GatewayLayerProfile.CoreFeatures.Len() > 0 { + opts.SupportedFeatures = opts.SupportedFeatures.Insert(GatewayLayerProfile.CoreFeatures.UnsortedList()...) + } + } + + // Remove any features explicitly exempted via flags. + if opts.ExemptFeatures.Len() > 0 { + logDebugf(t, opts.Debug, "Removing ExemptFeatures from SupportedFeatures: %v", opts.ExemptFeatures.UnsortedList()) + opts.SupportedFeatures = opts.SupportedFeatures.Delete(opts.ExemptFeatures.UnsortedList()...) + } + + logDebugf(t, opts.Debug, "Final opts.SupportedFeatures: %v", opts.SupportedFeatures.UnsortedList()) + + return opts +} + +// RunConformance runs the Inference Extension conformance tests using default options. +func RunConformance(t *testing.T) { + RunConformanceWithOptions(t, DefaultOptions(t)) +} + +// RunConformanceWithOptions runs the Inference Extension conformance tests with specific options. +func RunConformanceWithOptions(t *testing.T, opts confsuite.ConformanceOptions) { + t.Helper() + t.Logf("Running Inference Extension conformance tests with GatewayClass %s", opts.GatewayClassName) + logDebugf(t, opts.Debug, "RunConformanceWithOptions: BaseManifests path being used by opts: %q", opts.BaseManifests) + + // Register the GatewayLayerProfile with the suite runner. + // In the future, other profiles (EPP, ModelServer) will also be registered here, + // and the suite runner will execute tests based on the selected profiles. 
+ confsuite.RegisterConformanceProfile(GatewayLayerProfile) + + // Initialize the test suite. + cSuite, err := confsuite.NewConformanceTestSuite(opts) + require.NoError(t, err, "error initializing conformance suite") + + SetupConformanceTestSuite(t, cSuite, opts, tests.ConformanceTests) + + t.Log("Running Inference Extension conformance tests against all registered tests") + err = cSuite.Run(t, tests.ConformanceTests) + require.NoError(t, err, "error running conformance tests") + + // Generate and write the report if requested. + if opts.ReportOutputPath != "" { + t.Log("Generating Inference Extension conformance report") + report, err := cSuite.Report() // Use the existing report generation logic. + require.NoError(t, err, "error generating conformance report") + + // TODO: Modify the report struct here if channel, version need to be modified. + // Example (requires adding fields to confapis.ConformanceReport): + // report.GatewayAPIInferenceExtensionChannel = opts.GatewayAPIInferenceExtensionChannel + // report.GatewayAPIInferenceExtensionVersion = opts.GatewayAPIInferenceExtensionVersion + + err = writeReport(t.Logf, *report, opts.ReportOutputPath) + require.NoError(t, err, "error writing conformance report") + } +} + +func SetupConformanceTestSuite(t *testing.T, suite *confsuite.ConformanceTestSuite, opts confsuite.ConformanceOptions, tests []confsuite.ConformanceTest) { + suite.Applier.ManifestFS = suite.ManifestFS + if suite.RunTest != "" { + idx := slices.IndexFunc(tests, func(t confsuite.ConformanceTest) bool { + return t.ShortName == suite.RunTest + }) + + if idx == -1 { + require.FailNow(t, fmt.Sprintf("Test %q does not exist", suite.RunTest)) + } + } + + tlog.Logf(t, "Test Setup: Ensuring GatewayClass has been accepted") + suite.ControllerName = apikubernetes.GWCMustHaveAcceptedConditionTrue(t, suite.Client, suite.TimeoutConfig, suite.GatewayClassName) + + suite.Applier.GatewayClass = suite.GatewayClassName + suite.Applier.ControllerName = suite.ControllerName + + tlog.Logf(t, "Test Setup: Applying base manifests") + suite.Applier.MustApplyWithCleanup(t, suite.Client, suite.TimeoutConfig, suite.BaseManifests, suite.Cleanup) + + tlog.Logf(t, "Test Setup: Ensuring Gateways and Pods from base manifests are ready") + namespaces := []string{ + infraNameSpace, + appBackendNameSpace, + } + apikubernetes.NamespacesMustBeReady(t, suite.Client, suite.TimeoutConfig, namespaces) + + ensureGatewayAvailableAndReady(t, suite.Client, opts, primaryGatewayNN) + ensureGatewayAvailableAndReady(t, suite.Client, opts, secondaryGatewayNN) +} + +// ensureGatewayAvailableAndReady polls for the specified Gateway to exist and become ready +// with an address and programmed condition. +func ensureGatewayAvailableAndReady(t *testing.T, k8sClient client.Client, opts confsuite.ConformanceOptions, gatewayNN types.NamespacedName) { + t.Helper() + + t.Logf("Attempting to fetch Gateway %s/%s.", gatewayNN.Namespace, gatewayNN.Name) + gw := &gatewayv1.Gateway{} // This gw instance will be populated by the poll function + + // Use extension-specific config for the polling interval defined in timeout.go. + extTimeoutConf := inferenceconfig.DefaultInferenceExtensionTimeoutConfig() + + // Use the GatewayMustHaveAddress timeout from the suite's base TimeoutConfig for the Gateway object to appear. 
+ waitForGatewayCreationTimeout := extTimeoutConf.TimeoutConfig.GatewayMustHaveAddress + + logDebugf(t, opts.Debug, "Waiting up to %v for Gateway object %s/%s to appear after manifest application...", waitForGatewayCreationTimeout, gatewayNN.Namespace, gatewayNN.Name) + + ctx := context.TODO() + pollErr := wait.PollUntilContextTimeout(ctx, extTimeoutConf.GatewayObjectPollInterval, waitForGatewayCreationTimeout, true, func(pollCtx context.Context) (bool, error) { + fetchErr := k8sClient.Get(pollCtx, gatewayNN, gw) + if fetchErr == nil { + t.Logf("Successfully fetched Gateway %s/%s. Spec.GatewayClassName: %s", + gw.Namespace, gw.Name, gw.Spec.GatewayClassName) + return true, nil + } + if apierrors.IsNotFound(fetchErr) { + logDebugf(t, opts.Debug, "Gateway %s/%s not found, still waiting...", gatewayNN.Namespace, gatewayNN.Name) + return false, nil // Not found, continue polling + } + // For any other error, stop polling and return this error + t.Logf("Error fetching Gateway %s/%s: %v. Halting polling for this attempt.", gatewayNN.Namespace, gatewayNN.Name, fetchErr) + return false, fetchErr + }) + + // Check if polling timed out or an error occurred during polling + if pollErr != nil { + var failureMessage string + if errors.Is(pollErr, context.DeadlineExceeded) { + failureMessage = fmt.Sprintf("Timed out after %v waiting for Gateway object %s/%s to appear in the API server.", + waitForGatewayCreationTimeout, gatewayNN.Namespace, gatewayNN.Name) + } else { + failureMessage = fmt.Sprintf("Error while waiting for Gateway object %s/%s to appear: %v.", + gatewayNN.Namespace, gatewayNN.Name, pollErr) + } + finalMessage := failureMessage + " The Gateway object should have been created by the base manifest application." + require.FailNow(t, finalMessage) // Use FailNow to stop if the Gateway isn't found. + } + + logDebugf(t, opts.Debug, "Waiting for shared Gateway %s/%s to be ready", gatewayNN.Namespace, gatewayNN.Name) + apikubernetes.GatewayMustHaveCondition(t, k8sClient, opts.TimeoutConfig, gatewayNN, metav1.Condition{ + Type: string(gatewayv1.GatewayConditionAccepted), + Status: metav1.ConditionTrue, + }) + apikubernetes.GatewayMustHaveCondition(t, k8sClient, opts.TimeoutConfig, gatewayNN, metav1.Condition{ + Type: string(gatewayv1.GatewayConditionProgrammed), + Status: metav1.ConditionTrue, + }) + _, err := apikubernetes.WaitForGatewayAddress(t, k8sClient, opts.TimeoutConfig, apikubernetes.NewGatewayRef(gatewayNN)) + require.NoErrorf(t, err, "shared gateway %s/%s did not get an address", gatewayNN.Namespace, gatewayNN.Name) + t.Logf("Shared Gateway %s/%s is ready.", gatewayNN.Namespace, gatewayNN.Name) +} + +// writeReport writes the generated conformance report to the specified output file or logs it. +// Adapted from the core Gateway API suite. +func writeReport(logf func(string, ...any), report confapis.ConformanceReport, output string) error { + rawReport, err := yaml.Marshal(report) + if err != nil { + return fmt.Errorf("error marshaling report: %w", err) + } + + if output != "" { + if err = os.WriteFile(output, rawReport, 0o600); err != nil { + return fmt.Errorf("error writing report file %s: %w", output, err) + } + logf("Conformance report written to %s", output) + } else { + // Log the report YAML to stdout if no output file is specified. 
+		logf("Conformance report:\n%s", string(rawReport))
+	}
+	return nil
+}
diff --git a/pkg/epp/scheduling/types.go b/conformance/conformance_test.go
similarity index 60%
rename from pkg/epp/scheduling/types.go
rename to conformance/conformance_test.go
index 29e6648da..de82d5ecf 100644
--- a/pkg/epp/scheduling/types.go
+++ b/conformance/conformance_test.go
@@ -14,14 +14,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package scheduling
+package conformance
 
-// LLMRequest is a structured representation of the fields we parse out of the LLMRequest body.
-type LLMRequest struct {
-	Model string
-	// Target models is a map of target model name to weight.
-	TargetModels map[string]int
-	// Resolved target model is the final target model after traffic split.
-	ResolvedTargetModel string
-	Critical bool
+import (
+	"testing"
+)
+
+// TestConformance is the top-level function that runs the conformance tests.
+// It calls the RunConformance function which sets up the suite and executes
+// the registered tests.
+func TestConformance(t *testing.T) {
+	// RunConformance is defined in conformance.go
+	RunConformance(t)
 }
diff --git a/conformance/embed.go b/conformance/embed.go
new file mode 100644
index 000000000..c9175db1d
--- /dev/null
+++ b/conformance/embed.go
@@ -0,0 +1,25 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package conformance
+
+import "embed"
+
+// Manifests embeds the contents of the conformance/resources and conformance/tests
+// directories, making the YAML files within them available to the test suite at runtime.
+//
+//go:embed resources tests/*
+var Manifests embed.FS
diff --git a/conformance/reports/README.md b/conformance/reports/README.md
new file mode 100644
index 000000000..ba0278d0c
--- /dev/null
+++ b/conformance/reports/README.md
@@ -0,0 +1,95 @@
+# Conformance Reports for Gateway API Inference Extension
+
+This directory stores conformance reports submitted by various implementations of the Gateway API Inference Extension. This structure closely follows the [kubernetes-sigs/gateway-api/conformance/reports](https://github.com/kubernetes-sigs/gateway-api/blob/main/conformance/reports/README.md).
+
+## How this folder is structured
+
+This folder stores conformance reports organized first by the version of the Gateway API Inference Extension specification they were tested against, and then by the specific conformance profile (e.g., Gateway, EPP, Model Server):
+
+```
+|-- conformance/reports
+|   |-- v0.3.0 # Example extension version
+|   |   |-- gateway # Conformance profile/category
+|   |   |   |-- my-inference-gateway
+|   |   |   |   |-- README.md
+|   |   |   |   |-- experimental-v1.2.3-default-gateway-report.yaml # Example report file
+|   |   |   |-- another-implementation
+|   |   |   |   |-- README.md
+|   |   |   |   |-- ...
+|   |   |-- epp # Future conformance profile/category
+|   |   |   |-- my-epp-implementation
+|   |   |   |   |-- ...
+|   |   |-- model-server # Future conformance profile/category
+|   |   |   |-- ...
+|   |-- v0.4.0 # Future extension version
+|   |   |-- ...
+```
+
+## Implementation Submissions
+
+Each implementation conformant with a specific profile of a specific version of the Gateway API Inference Extension should have its own folder within the corresponding version and profile directory (e.g., `/conformance/reports/v0.3.0/Gateway/my-implementation/`).
+
+The implementation is the owner of its folder and is responsible for:
+
+1. Uploading one or more conformance reports (YAML files).
+2. Maintaining a mandatory `README.md` file within its folder, structured as follows:
+
+   # My Inference Gateway Implementation (Gateway Profile Conformance)
+
+   General information about the My/Implementation project.
+
+   ## Table of Contents
+
+| Extension Version Tested | Profile Tested | Implementation Version | Mode    | Report                                                                      |
+|--------------------------|----------------|------------------------|---------|-----------------------------------------------------------------------------|
+| v0.3.0                   | Gateway        | v1.2.3                 | default | [v1.2.3 Gateway report](./experimental-v1.2.3-default-gateway-report.yaml)  |
+| ...                      | ...            | ...                    | ...     | ...                                                                          |
+
+   ## Reproduce
+
+   Instructions on how to reproduce the claimed report(s).
+
+### Table of Contents (within Implementation README)
+
+The table of contents within an implementation's `README.md` should contain one row for each submitted report and include the following columns:
+
+* **Extension Version Tested**: The version of the Gateway API Inference Extension specification tested against (e.g., `v0.3.0`). Must correspond to the `gatewayAPIInferenceExtensionVersion` field in the report.
+* **Profile Tested**: The specific conformance profile tested (e.g., `Gateway`, `EPP`, `ModelServer`). Must correspond to the `name` of the profile in the `profiles` list within the report.
+* **Implementation Version**: A link to the GitHub/website page for the specific release/commit of the implementation tested. The version value MUST correspond to the `implementation.version` field in the report.
+* **Mode**: The operating mode of the implementation used for the test run (default is `default`). Must correspond to the `mode` field in the report. If a mode other than `default` is used, the "Reproduce" section must explain how to configure it.
+* **Report**: A link to the corresponding report YAML file. Reports MUST be named according to the pattern: `<API channel>-<implementation version>-<mode>-<profile>-report.yaml` (e.g., `experimental-v1.2.3-default-gateway-report.yaml`).
+
+### Reproduce Section (within Implementation README)
+
+This section MUST exist and contain the manual or automatic steps required to reproduce the results claimed by the uploaded conformance reports for that specific implementation. If reproduction steps differ significantly between implementation versions, use sub-sections.
+
+## Report Files
+
+Conformance reports MUST be uploaded exactly as generated by the official Gateway API Inference Extension conformance test suite, without any modifications. The "Reproduce" section allows for verification of the submitted report against a fresh run.
+
+### Report Rules
+
+To be accepted, submitted conformance reports must comply with the following rules:
+
+1. **Implementation Details:** All fields within the `implementation` block must have meaningful values:
+   * `organization`: The entity maintaining the implementation (company, open source org, individual).
+   * `project`: The name of the implementation project, unique within the organization.
+   * `url`: A valid URL for the project (e.g., GitHub repository, product page).
+   * `version`: A specific, reproducible snapshot of the implementation (e.g., tag, commit hash, release version). Branch names are not acceptable.
+   * `contact`: A list of contact points (GitHub handles like `@maintainer`, team handles like `@org/team`, email addresses, or support URLs like an issue tracker).
+2. **Inference Extension Versioning:** The report MUST include:
+   * `gatewayAPIInferenceExtensionVersion`: The specific version of the Gateway API Inference Extension specification tested against (e.g., `v0.3.0`).
+3. **Mode:** The `mode` field indicates the implementation's operating mode during the test run.
+4. **Test Profile & Result:**
+   * The report MUST contain exactly one profile result under the `profiles` list for the specific conformance category being submitted (e.g., a report for "Gateway" conformance should only contain the "Gateway" profile result).
+   * The profile's `name` MUST match the conformance category (e.g., `Gateway`, `EPP`, `ModelServer`).
+   * The profile's `result` field MUST be `success`. A `success` result indicates that **all** tests defined within the Gateway API Inference Extension conformance suite for that specific profile and version passed.
+
+## Submission Process
+
+Conformance reports demonstrating a `success` result for a specific profile (e.g., `Gateway`) should be submitted via Pull Request directly to this repository (`kubernetes-sigs/gateway-api-inference-extension`).
+
+1. Create a new folder structure under `/conformance/reports/<version>/<profile>/<implementation>/` named after your implementation (e.g., `/conformance/reports/v0.3.0/Gateway/my-implementation/`).
+2. Add your implementation's `README.md` to this folder, following the structure described above.
+3. Add your generated conformance report YAML file(s) to this folder, ensuring they follow the naming convention `<API channel>-<implementation version>-<mode>-<profile>-report.yaml`.
+4. Submit the Pull Request.
diff --git a/conformance/resources/manifests/manifests.yaml b/conformance/resources/manifests/manifests.yaml
new file mode 100644
index 000000000..5b1f2d409
--- /dev/null
+++ b/conformance/resources/manifests/manifests.yaml
@@ -0,0 +1,353 @@
+# Base Kubernetes resources for the Gateway API Inference Extension conformance tests.
+# This includes namespaces and a minimal set of resources (Gateway, Backend)
+# required by many tests. More specific resources should be defined within
+# individual test files or other resource directories (e.g., sample_backends).
+
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gateway-conformance-infra
+  labels:
+    gateway-conformance: infra
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gateway-conformance-app-backend
+  labels:
+    gateway-conformance: backend
+---
+# A basic Gateway resource that allows HTTPRoutes from all namespaces.
+# Tests can use this as a parent reference for routes that target InferencePools.
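+# The "{GATEWAY_CLASS_NAME}" placeholder below is substituted with the GatewayClass
+# under test by the suite's manifest applier at apply time (conformance.go sets
+# Applier.GatewayClass from the suite options).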
+apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: conformance-primary-gateway + namespace: gateway-conformance-infra +spec: + gatewayClassName: "{GATEWAY_CLASS_NAME}" + listeners: + - name: http + port: 80 + protocol: HTTP + allowedRoutes: + namespaces: + from: All + kinds: + - group: gateway.networking.k8s.io + kind: HTTPRoute +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: Gateway +metadata: + name: conformance-secondary-gateway + namespace: gateway-conformance-infra +spec: + gatewayClassName: "{GATEWAY_CLASS_NAME}" + listeners: + - name: http + port: 80 + protocol: HTTP + hostname: "secondary.example.com" + allowedRoutes: + namespaces: + from: All + +### The following defines the essential resources for the gateway conformance test. +### All resources are created in the 'gateway-conformance-app-backend' namespace. +--- +# Deploys a mock backend service to act as a model server. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: primary-inference-model-server-deployment + namespace: gateway-conformance-app-backend + labels: + app: primary-inference-model-server +spec: + replicas: 3 + selector: + matchLabels: + app: primary-inference-model-server + template: + metadata: + labels: + app: primary-inference-model-server + spec: + containers: + - name: echoserver + image: gcr.io/k8s-staging-gateway-api/echo-basic:v20240412-v1.0.0-394-g40c666fd + ports: + - containerPort: 3000 + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 3 + periodSeconds: 5 + failureThreshold: 2 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP +--- +# Deploys a secondary mock backend service to act as a model server. 
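+# The echo-basic image returns the serving pod's metadata in its response body,
+# which is how the tests below verify that traffic reached exactly the pods
+# selected by the EPP.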
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: secondary-inference-model-server-deployment + namespace: gateway-conformance-app-backend + labels: + app: secondary-inference-model-server +spec: + replicas: 3 + selector: + matchLabels: + app: secondary-inference-model-server + template: + metadata: + labels: + app: secondary-inference-model-server + spec: + containers: + - name: echoserver + image: gcr.io/k8s-staging-gateway-api/echo-basic:v20240412-v1.0.0-394-g40c666fd + ports: + - containerPort: 3000 + readinessProbe: + httpGet: + path: / + port: 3000 + initialDelaySeconds: 3 + periodSeconds: 5 + failureThreshold: 2 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP +--- +# --- Primary InferencePool Definition --- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: primary-inference-pool + namespace: gateway-conformance-app-backend +spec: + selector: + app: primary-inference-model-server + targetPortNumber: 3000 + extensionRef: + name: primary-endpoint-picker-svc +--- +# --- Primary Conformance EPP service Definition --- +apiVersion: v1 +kind: Service +metadata: + name: primary-endpoint-picker-svc + namespace: gateway-conformance-app-backend +spec: + selector: + app: primary-app-backend-epp + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + appProtocol: http2 + type: ClusterIP +--- +# --- Primary Conformance EPP Deployment --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: primary-app-endpoint-picker + namespace: gateway-conformance-app-backend + labels: + app: primary-app-backend-epp +spec: + replicas: 1 + selector: + matchLabels: + app: primary-app-backend-epp + template: + metadata: + labels: + app: primary-app-backend-epp + spec: + # Conservatively, this timeout should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + - name: epp + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - "primary-inference-pool" + - -poolNamespace + - "gateway-conformance-app-backend" + - -v + - "4" + - --zap-encoder + - "json" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: USE_STREAMING + value: "true" + - name: ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING # Used for conformance test. 
+ value: "true" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +# --- Secondary InferencePool Definition --- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: secondary-inference-pool + namespace: gateway-conformance-app-backend +spec: + selector: + app: secondary-inference-model-server + targetPortNumber: 3000 + extensionRef: + name: secondary-endpoint-picker-svc + failureMode: FailOpen +--- +# --- Secondary Conformance EPP service Definition --- +apiVersion: v1 +kind: Service +metadata: + name: secondary-endpoint-picker-svc + namespace: gateway-conformance-app-backend +spec: + selector: + app: secondary-app-backend-epp + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + appProtocol: http2 + type: ClusterIP +--- +# --- Secondary Conformance EPP Deployment --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: secondary-app-endpoint-picker + namespace: gateway-conformance-app-backend + labels: + app: secondary-app-backend-epp +spec: + replicas: 1 + selector: + matchLabels: + app: secondary-app-backend-epp + template: + metadata: + labels: + app: secondary-app-backend-epp + spec: + # Conservatively, this timeout should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + - name: epp + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - "secondary-inference-pool" + - -poolNamespace + - "gateway-conformance-app-backend" + - -v + - "4" + - --zap-encoder + - "json" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: USE_STREAMING + value: "true" + - name: ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING # Used for conformance test. + value: "true" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +# --- Required Role and RoleBinding for Conformance Test for EPP --- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: inference-model-reader + namespace: gateway-conformance-app-backend +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels", "inferencepools"] + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: epp-to-inference-model-reader + namespace: gateway-conformance-app-backend +subjects: +- kind: ServiceAccount + name: default + namespace: gateway-conformance-app-backend +roleRef: + kind: Role + name: inference-model-reader + apiGroup: rbac.authorization.k8s.io diff --git a/conformance/testing-epp/plugins/filter/filter_test.go b/conformance/testing-epp/plugins/filter/filter_test.go new file mode 100644 index 000000000..2c0082189 --- /dev/null +++ b/conformance/testing-epp/plugins/filter/filter_test.go @@ -0,0 +1,127 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+func TestFilter(t *testing.T) {
+	tests := []struct {
+		name   string
+		req    *types.LLMRequest
+		filter framework.Filter
+		input  []types.Pod
+		output []types.Pod
+	}{
+		{
+			name:   "TestHeaderBasedFilter, header endpoint unset in request",
+			req:    &types.LLMRequest{}, // Deliberately leave the header unset.
+			filter: &HeaderBasedTestingFilter{},
+			input: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+			output: []types.Pod{},
+		},
+		{
+			name:   "TestHeaderBasedFilter, header endpoint set in request but no match",
+			req:    &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
+			filter: &HeaderBasedTestingFilter{},
+			input: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint-unmatch",
+					},
+				},
+			},
+			output: []types.Pod{},
+		},
+		{
+			name:   "TestHeaderBasedFilter, header endpoint set",
+			req:    &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint"}},
+			filter: &HeaderBasedTestingFilter{},
+			input: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+			output: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint",
+					},
+				},
+			},
+		},
+		{
+			name:   "TestHeaderBasedFilter, multiple header endpoints set and multiple matches",
+			req:    &types.LLMRequest{Headers: map[string]string{headerTestEppEndPointSelectionKey: "test-endpoint3,test-endpoint2"}},
+			filter: &HeaderBasedTestingFilter{},
+			input: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint1",
+					},
+				},
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint2",
+					},
+				},
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint3",
+					},
+				},
+			},
+			output: []types.Pod{
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint3",
+					},
+				},
+				&types.PodMetrics{
+					Pod: &backend.Pod{
+						Address: "test-endpoint2",
+					},
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got := test.filter.Filter(context.Background(), types.NewCycleState(), test.req, test.input)
+
+			if diff := cmp.Diff(test.output, got); diff != "" {
+				t.Errorf("Unexpected output (-want +got): %v", diff)
+			}
+		})
+	}
+}
diff --git a/conformance/testing-epp/plugins/filter/request_header_based_filter.go b/conformance/testing-epp/plugins/filter/request_header_based_filter.go
new file mode 100644
index 000000000..ee191111e
--- /dev/null
+++ b/conformance/testing-epp/plugins/filter/request_header_based_filter.go
@@ -0,0 +1,78 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"context"
+	"strings"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	// headerTestEppEndPointSelectionKey is the header used for testing purposes to make EPP behavior controllable.
+	// The header value should be a comma-separated list of endpoint IP addresses.
+	// E.g., "test-epp-endpoint-selection": "10.0.0.7,10.0.0.8"
+	// The returned order is the same as the order provided in the header.
+	headerTestEppEndPointSelectionKey = "test-epp-endpoint-selection"
+)
+
+// compile-time type assertion
+var _ framework.Filter = &HeaderBasedTestingFilter{}
+
+// NewHeaderBasedTestingFilter initializes a new HeaderBasedTestingFilter.
+// This should only be used for testing purposes.
+func NewHeaderBasedTestingFilter() *HeaderBasedTestingFilter {
+	return &HeaderBasedTestingFilter{}
+}
+
+// HeaderBasedTestingFilter filters Pods based on an address specified in the "test-epp-endpoint-selection" request header.
+type HeaderBasedTestingFilter struct{}
+
+// Type returns the type of the filter.
+func (f *HeaderBasedTestingFilter) Type() string {
+	return "header-based-testing"
+}
+
+// Name returns the name of the filter.
+func (f *HeaderBasedTestingFilter) Name() string {
+	return "header-based-testing-filter"
+}
+
+// Filter selects pods that match the IP addresses specified in the request header.
+func (f *HeaderBasedTestingFilter) Filter(_ context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) []types.Pod {
+	headerValue, ok := request.Headers[headerTestEppEndPointSelectionKey]
+	if !ok || headerValue == "" {
+		return []types.Pod{}
+	}
+
+	podAddressMap := make(map[string]types.Pod, len(pods))
+	for _, pod := range pods {
+		podAddressMap[pod.GetPod().Address] = pod
+	}
+
+	endpoints := strings.Split(headerValue, ",")
+	filteredPods := make([]types.Pod, 0, len(endpoints))
+	for _, endpoint := range endpoints {
+		trimmedEndpoint := strings.TrimSpace(endpoint)
+		if pod, found := podAddressMap[trimmedEndpoint]; found {
+			filteredPods = append(filteredPods, pod)
+		}
+	}
+	return filteredPods
+}
diff --git a/conformance/testing-epp/scheduler.go b/conformance/testing-epp/scheduler.go
new file mode 100644
index 000000000..94f9ee5bb
--- /dev/null
+++ b/conformance/testing-epp/scheduler.go
@@ -0,0 +1,37 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+import (
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/testing-epp/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+)
+
+// NewReqHeaderBasedScheduler creates a scheduler for conformance tests that selects
+// an endpoint based on the "test-epp-endpoint-selection" request header. If the
+// header is missing or the specified endpoint doesn't exist, no endpoint is returned.
+func NewReqHeaderBasedScheduler() *scheduling.Scheduler {
+	predictableSchedulerProfile := framework.NewSchedulerProfile().
+		WithFilters(filter.NewHeaderBasedTestingFilter()).
+		WithPicker(picker.NewMaxScorePicker())
+
+	return scheduling.NewSchedulerWithConfig(scheduling.NewSchedulerConfig(
+		profile.NewSingleProfileHandler(), map[string]*framework.SchedulerProfile{"req-header-based-profile": predictableSchedulerProfile}))
+}
diff --git a/conformance/testing-epp/scheduler_test.go b/conformance/testing-epp/scheduler_test.go
new file mode 100644
index 000000000..4901e0380
--- /dev/null
+++ b/conformance/testing-epp/scheduler_test.go
@@ -0,0 +1,113 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/uuid"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// TestSchedule exercises the request-header-based scheduler used by the conformance tests.
+func TestSchedule(t *testing.T) {
+	tests := []struct {
+		name    string
+		input   []backendmetrics.PodMetrics
+		req     *types.LLMRequest
+		wantRes *types.SchedulingResult
+		err     bool
+	}{
+		{
+			name: "no candidate pods and req header is set",
+			req: &types.LLMRequest{
+				Headers:   map[string]string{"test-epp-endpoint-selection": "random-endpoint"},
+				RequestId: uuid.NewString(),
+			},
+			wantRes: nil,
+			err:     true,
+		},
+		{
+			name: "req header not set",
+			input: []backendmetrics.PodMetrics{
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{Address: "random-endpoint"}},
+			},
+			req: &types.LLMRequest{
+				Headers:   map[string]string{}, // Deliberately set an empty header.
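+				// With no header set, the testing filter returns no candidate pods,
+				// so the scheduler is expected to fail with an error.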
+ RequestId: uuid.NewString(), + }, + wantRes: nil, + err: true, + }, + { + name: "no pods address from the candidate pods matches req header address", + input: []backendmetrics.PodMetrics{ + &backendmetrics.FakePodMetrics{Pod: &backend.Pod{Address: "nonmatched-endpoint"}}, + }, + req: &types.LLMRequest{ + Headers: map[string]string{"test-epp-endpoint-selection": "matched-endpoint"}, + RequestId: uuid.NewString(), + }, + wantRes: nil, + err: true, + }, + { + name: "one pod address from the candidate pods matches req header address", + input: []backendmetrics.PodMetrics{ + &backendmetrics.FakePodMetrics{Pod: &backend.Pod{Address: "nonmatched-endpoint"}}, + &backendmetrics.FakePodMetrics{Pod: &backend.Pod{Address: "matched-endpoint"}}, + }, + req: &types.LLMRequest{ + Headers: map[string]string{"test-epp-endpoint-selection": "matched-endpoint"}, + RequestId: uuid.NewString(), + }, + wantRes: &types.SchedulingResult{ + ProfileResults: map[string]*types.ProfileRunResult{ + "req-header-based-profile": { + TargetPod: &types.ScoredPod{ + Pod: &types.PodMetrics{ + Pod: &backend.Pod{ + Address: "matched-endpoint", + Labels: map[string]string{}, + }, + }, + }, + }, + }, + PrimaryProfileName: "req-header-based-profile", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scheduler := NewReqHeaderBasedScheduler() + got, err := scheduler.Schedule(context.Background(), test.req, types.ToSchedulerPodMetrics(test.input)) + if test.err != (err != nil) { + t.Errorf("Unexpected error, got %v, want %v", err, test.err) + } + + if diff := cmp.Diff(test.wantRes, got); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} diff --git a/conformance/tests/basic/epp_unavailable_fail_open.go b/conformance/tests/basic/epp_unavailable_fail_open.go new file mode 100644 index 000000000..05c0290f7 --- /dev/null +++ b/conformance/tests/basic/epp_unavailable_fail_open.go @@ -0,0 +1,117 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package basic
+
+import (
+	"net/http"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/pkg/features"
+
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+	k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
+	trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic"
+)
+
+func init() {
+	tests.ConformanceTests = append(tests.ConformanceTests, EppUnAvailableFailOpen)
+}
+
+var EppUnAvailableFailOpen = suite.ConformanceTest{
+	ShortName:   "EppUnAvailableFailOpen",
+	Description: "Inference gateway should send traffic to backends even when the EPP is unavailable (fail-open)",
+	Manifests:   []string{"tests/basic/epp_unavailable_fail_open.yaml"},
+	Features: []features.FeatureName{
+		features.FeatureName("SupportInferencePool"),
+		features.SupportGateway,
+	},
+	Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
+		const (
+			appBackendNamespace    = "gateway-conformance-app-backend"
+			infraNamespace         = "gateway-conformance-infra"
+			hostname               = "secondary.example.com"
+			path                   = "/failopen-pool-test"
+			expectedPodReplicas    = 3
+			eppSelectionHeaderName = "test-epp-endpoint-selection"
+			appPodBackendPrefix    = "secondary-inference-model-server"
+			requestBody            = `{
+    "model": "conformance-fake-model",
+    "prompt": "Write as if you were a critic: San Francisco"
+}`
+		)
+
+		httpRouteNN := types.NamespacedName{Name: "httproute-for-failopen-pool-gw", Namespace: appBackendNamespace}
+		gatewayNN := types.NamespacedName{Name: "conformance-secondary-gateway", Namespace: infraNamespace}
+		poolNN := types.NamespacedName{Name: "secondary-inference-pool", Namespace: appBackendNamespace}
+		eppDeploymentNN := types.NamespacedName{Name: "secondary-app-endpoint-picker", Namespace: appBackendNamespace}
+		backendPodLabels := map[string]string{"app": "secondary-inference-model-server"}
+
+		k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRouteNN, gatewayNN)
+		k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN)
+		gwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN)
+
+		pods, err := k8sutils.GetPodsWithLabel(t, s.Client, appBackendNamespace, backendPodLabels, s.TimeoutConfig)
+		require.NoError(t, err, "Failed to get backend pods")
+		require.Len(t, pods, expectedPodReplicas, "Expected to find %d backend pods, but found %d.", expectedPodReplicas, len(pods))
+
+		targetPodIP := pods[0].Status.PodIP
+		t.Run("Phase 1: Verify baseline connectivity with EPP available", func(t *testing.T) {
+			t.Log("Sending request to ensure the Gateway and EPP are working correctly...")
+			trafficutils.MakeRequestAndExpectSuccess(
+				t,
+				s.RoundTripper,
+				s.TimeoutConfig,
+				gwAddr,
+				trafficutils.Request{
+					Host:    hostname,
+					Path:    path,
+					Headers: map[string]string{eppSelectionHeaderName: targetPodIP},
+					Method:  http.MethodPost,
+					Body:    requestBody,
+					Backend: pods[0].Name, // Ensure the request is served by the target pod while the EPP is still alive.
+					Namespace: appBackendNamespace,
+				},
+			)
+		})
+
+		t.Run("Phase 2: Verify fail-open behavior after EPP becomes unavailable", func(t *testing.T) {
+			t.Log("Simulating an EPP failure by deleting its deployment...")
+			deleteErr := k8sutils.DeleteDeployment(t, s.Client, s.TimeoutConfig, eppDeploymentNN)
+			require.NoError(t, deleteErr, "Failed to delete the EPP deployment")
+
+			t.Log("Sending request again, expecting success to verify fail-open...")
+			trafficutils.MakeRequestAndExpectSuccess(
+				t,
+				s.RoundTripper,
+				s.TimeoutConfig,
+				gwAddr,
+				trafficutils.Request{
+					Host:      hostname,
+					Path:      path,
+					Headers:   map[string]string{eppSelectionHeaderName: targetPodIP},
+					Method:    http.MethodPost,
+					Body:      requestBody,
+					Backend:   appPodBackendPrefix, // Only check the prefix, since the EPP is down and the response can come from any pod.
+					Namespace: appBackendNamespace,
+				},
+			)
+		})
+	},
+}
diff --git a/conformance/tests/basic/epp_unavailable_fail_open.yaml b/conformance/tests/basic/epp_unavailable_fail_open.yaml
new file mode 100644
index 000000000..cd681cb74
--- /dev/null
+++ b/conformance/tests/basic/epp_unavailable_fail_open.yaml
@@ -0,0 +1,36 @@
+# --- InferenceModel Definition ---
+# TODO: remove inferenceModel dependency https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1002
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: conformance-fake-model-server
+  namespace: gateway-conformance-app-backend
+spec:
+  modelName: conformance-fake-model
+  criticality: Critical # Mark it as critical to bypass the saturation check, since the model server is fake and doesn't expose such metrics.
+  poolRef:
+    name: secondary-inference-pool
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-for-failopen-pool-gw
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: conformance-secondary-gateway
+    namespace: gateway-conformance-infra
+    sectionName: http
+  hostnames:
+  - "secondary.example.com"
+  rules:
+  - backendRefs:
+    - group: inference.networking.x-k8s.io
+      kind: InferencePool
+      name: secondary-inference-pool # Use the secondary InferencePool because its failureMode is set to FailOpen.
+    matches:
+    - path:
+        type: PathPrefix
+        value: /failopen-pool-test
diff --git a/conformance/tests/basic/gateway_following_epp_routing.go b/conformance/tests/basic/gateway_following_epp_routing.go
new file mode 100644
index 000000000..30a5a6f0b
--- /dev/null
+++ b/conformance/tests/basic/gateway_following_epp_routing.go
@@ -0,0 +1,194 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package basic
+
+import (
+	"fmt"
+	"net/http"
+	"slices"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+	"golang.org/x/sync/errgroup"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/pkg/features"
+
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+	k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
+	trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic"
+	gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http"
+)
+
+func init() {
+	// Register the GatewayFollowingEPPRouting test case with the conformance suite.
+	// This ensures it will be discovered and run by the test runner.
+	tests.ConformanceTests = append(tests.ConformanceTests, GatewayFollowingEPPRouting)
+}
+
+// GatewayFollowingEPPRouting defines the test case verifying that the gateway sends
+// traffic only to endpoints in the list returned by the EPP.
+var GatewayFollowingEPPRouting = suite.ConformanceTest{
+	ShortName:   "GatewayFollowingEPPRouting",
+	Description: "Inference gateway should send traffic to an endpoint in the list returned by EPP",
+	Manifests:   []string{"tests/basic/gateway_following_epp_routing.yaml"},
+	Features: []features.FeatureName{
+		features.FeatureName("SupportInferencePool"),
+		features.SupportGateway,
+	},
+	Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
+		const (
+			appBackendNamespace = "gateway-conformance-app-backend"
+			infraNamespace      = "gateway-conformance-infra"
+			hostname            = "primary.example.com"
+			path                = "/primary-gateway-test"
+			expectedPodReplicas = 3
+			// eppSelectionHeaderName is the custom header used by the testing-EPP service
+			// to determine which endpoint to select.
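+			// Its value is a comma-separated list of pod IPs; the testing EPP narrows the
+			// candidate pods to exactly those addresses (see request_header_based_filter.go).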
+ eppSelectionHeaderName = "test-epp-endpoint-selection" + appPodBackendPrefix = "primary-inference-model-server" + ) + + httpRouteNN := types.NamespacedName{Name: "httproute-for-primary-gw", Namespace: appBackendNamespace} + gatewayNN := types.NamespacedName{Name: "conformance-primary-gateway", Namespace: infraNamespace} + poolNN := types.NamespacedName{Name: "primary-inference-pool", Namespace: appBackendNamespace} + backendPodLabels := map[string]string{"app": "primary-inference-model-server"} + + t.Log("Verifying HTTPRoute and InferencePool are accepted and the Gateway has an address.") + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRouteNN, gatewayNN) + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN) + gwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN) + + t.Logf("Fetching backend pods with labels: %v", backendPodLabels) + pods, err := k8sutils.GetPodsWithLabel(t, s.Client, appBackendNamespace, backendPodLabels, s.TimeoutConfig) + require.NoError(t, err, "Failed to get backend pods") + require.Len(t, pods, expectedPodReplicas, "Expected to find %d backend pods, but found %d.", expectedPodReplicas, len(pods)) + + podIPs := make([]string, len(pods)) + podNames := make([]string, len(pods)) + for i, pod := range pods { + podIPs[i] = pod.Status.PodIP + podNames[i] = pod.Name + } + + requestBody := `{ + "model": "conformance-fake-model", + "prompt": "Write as if you were a critic: San Francisco" + }` + + for i := 0; i < len(pods); i++ { + // Send an initial request targeting a single pod and wait for it to be successful to ensure the Gateway and EPP + // are functioning correctly before running the main test cases. + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gwAddr, + trafficutils.Request{ + Host: hostname, + Path: path, + Headers: map[string]string{eppSelectionHeaderName: podIPs[i]}, + Method: http.MethodPost, + Body: requestBody, + Backend: podNames[i], + Namespace: appBackendNamespace, + }, + ) + } + + testCases := []struct { + name string + podIPsToBeReturnedByEPP []string + expectAllRequestsRoutedWithinPodNames []string + }{ + { + name: "should route traffic to a single designated pod", + podIPsToBeReturnedByEPP: []string{podIPs[2]}, + expectAllRequestsRoutedWithinPodNames: []string{podNames[2]}, + }, + { + name: "should route traffic to two designated pods", + podIPsToBeReturnedByEPP: []string{podIPs[0], podIPs[1]}, + expectAllRequestsRoutedWithinPodNames: []string{podNames[0], podNames[1]}, + }, + { + name: "should route traffic to all available pods", + podIPsToBeReturnedByEPP: []string{podIPs[0], podIPs[1], podIPs[2]}, + expectAllRequestsRoutedWithinPodNames: []string{podNames[0], podNames[1], podNames[2]}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + eppHeaderValue := strings.Join(tc.podIPsToBeReturnedByEPP, ",") + headers := map[string]string{eppSelectionHeaderName: eppHeaderValue} + + t.Logf("Sending request to %s with EPP header '%s: %s'", gwAddr, eppSelectionHeaderName, eppHeaderValue) + t.Logf("Expecting traffic to be routed to pod: %v", tc.expectAllRequestsRoutedWithinPodNames) + + assertTrafficOnlyReachesToExpectedPods(t, s, gwAddr, gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Host: hostname, + Path: path, + Method: http.MethodPost, + Headers: headers, + }, + Response: gwhttp.Response{ + StatusCode: http.StatusOK, + }, + Backend: appPodBackendPrefix, + Namespace: appBackendNamespace, + }, requestBody, 
tc.expectAllRequestsRoutedWithinPodNames)
+			})
+		}
+	},
+}
+
+func assertTrafficOnlyReachesToExpectedPods(t *testing.T, suite *suite.ConformanceTestSuite, gwAddr string, expected gwhttp.ExpectedResponse, requestBody string, expectedPodNames []string) {
+	t.Helper()
+	const (
+		concurrentRequests = 10
+		totalRequests      = 100
+	)
+	var (
+		roundTripper = suite.RoundTripper
+		g            errgroup.Group
+		req          = gwhttp.MakeRequest(t, &expected, gwAddr, "HTTP", "http")
+	)
+	g.SetLimit(concurrentRequests)
+	for i := 0; i < totalRequests; i++ {
+		g.Go(func() error {
+			cReq, cRes, err := traffic.MakeCallRoundTripper(t, roundTripper, &traffic.RequestWithBody{Request: req, Body: strings.NewReader(requestBody)})
+			if err != nil {
+				return fmt.Errorf("failed to roundtrip request: %w", err)
+			}
+			if err := gwhttp.CompareRequest(t, &req, cReq, cRes, expected); err != nil {
+				return fmt.Errorf("response expectation failed for request: %w", err)
+			}
+
+			if slices.Contains(expectedPodNames, cReq.Pod) {
+				return nil
+			}
+			return fmt.Errorf("request was handled by an unexpected pod %q", cReq.Pod)
+		})
+	}
+	if err := g.Wait(); err != nil {
+		t.Fatalf("not all requests reached the expected pods: %v", err)
+	}
+	t.Logf("Traffic reached only the expected pods: %v", expectedPodNames)
+}
diff --git a/conformance/tests/basic/gateway_following_epp_routing.yaml b/conformance/tests/basic/gateway_following_epp_routing.yaml
new file mode 100644
index 000000000..d290b7541
--- /dev/null
+++ b/conformance/tests/basic/gateway_following_epp_routing.yaml
@@ -0,0 +1,38 @@
+# --- InferenceModel Definition ---
+# InferenceModel served by the fake model-server backends in this test.
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: conformance-fake-model-server
+  namespace: gateway-conformance-app-backend
+spec:
+  modelName: conformance-fake-model
+  criticality: Critical # Marked Critical to bypass the saturation check, since the fake model server doesn't expose such metrics.
+  poolRef:
+    name: primary-inference-pool
+---
+# --- HTTPRoute for Primary Gateway (conformance-gateway) ---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-for-primary-gw
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: conformance-primary-gateway
+      namespace: gateway-conformance-infra
+      sectionName: http
+  hostnames:
+    - "primary.example.com"
+  rules:
+    - backendRefs:
+        - group: inference.networking.x-k8s.io
+          kind: InferencePool
+          name: primary-inference-pool
+      matches:
+        - path:
+            type: PathPrefix
+            value: /primary-gateway-test
+
\ No newline at end of file
diff --git a/conformance/tests/basic/httproute_invalid_inferencepool_ref.go b/conformance/tests/basic/httproute_invalid_inferencepool_ref.go
new file mode 100644
index 000000000..5fc2a66cb
--- /dev/null
+++ b/conformance/tests/basic/httproute_invalid_inferencepool_ref.go
@@ -0,0 +1,74 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +// TODO(#864) refactor the structure to put all tests directly under tests instead of creating subfolders. +package basic + +import ( + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + "sigs.k8s.io/gateway-api/conformance/utils/kubernetes" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" + + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" +) + +func init() { + tests.ConformanceTests = append(tests.ConformanceTests, HTTPRouteInvalidInferencePoolRef) +} + +var HTTPRouteInvalidInferencePoolRef = suite.ConformanceTest{ + ShortName: "HTTPRouteInvalidInferencePoolRef", + Description: "Tests HTTPRoute status when it references an InferencePool that does not exist.", + Manifests: []string{"tests/basic/httproute_invalid_inferencepool_ref.yaml"}, + Features: []features.FeatureName{ + features.FeatureName("SupportInferencePool"), + features.SupportGateway, + }, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + appBackendNamespace = "gateway-conformance-app-backend" + infraNamespace = "gateway-conformance-infra" + routeName = "httproute-to-non-existent-pool" + gatewayName = "conformance-primary-gateway" + ) + routeNN := types.NamespacedName{Name: routeName, Namespace: appBackendNamespace} + gatewayNN := types.NamespacedName{Name: gatewayName, Namespace: infraNamespace} + + t.Run("HTTPRoute should have Accepted=True and ResolvedRefs=False for non-existent InferencePool", func(t *testing.T) { + acceptedCondition := metav1.Condition{ + Type: string(gatewayv1.RouteConditionAccepted), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.RouteReasonAccepted), + } + kubernetes.HTTPRouteMustHaveCondition(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN, acceptedCondition) + + resolvedRefsCondition := metav1.Condition{ + Type: string(gatewayv1.RouteConditionResolvedRefs), + Status: metav1.ConditionFalse, + Reason: string(gatewayv1.RouteReasonBackendNotFound), + } + kubernetes.HTTPRouteMustHaveCondition(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN, resolvedRefsCondition) + + t.Logf("Successfully verified HTTPRoute %s has conditions: Accepted=True and ResolvedRefs=False (Reason: BackendNotFound) for Gateway %s", + routeNN.String(), gatewayNN.String()) + }) + }, +} diff --git a/conformance/tests/basic/httproute_invalid_inferencepool_ref.yaml b/conformance/tests/basic/httproute_invalid_inferencepool_ref.yaml new file mode 100644 index 000000000..227aeef1f --- /dev/null +++ b/conformance/tests/basic/httproute_invalid_inferencepool_ref.yaml @@ -0,0 +1,30 @@ +# httproute_invalid_inferencepool_ref.yaml +# This manifest defines an HTTPRoute that references an InferencePool +# by name ("non-existent-inference-pool") which is intentionally NOT defined. +# The test will verify that the HTTPRoute reflects an appropriate +# failure status because the referenced InferencePool backend cannot be found. + +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + # This name must match the 'routeNN.Name' in the Go test file. + name: httproute-to-non-existent-pool + # This namespace should be one created by the base manifests, + # typically where backend applications and their routes reside. 
+ namespace: gateway-conformance-app-backend +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: conformance-primary-gateway # Name of the shared Gateway from base manifests + namespace: gateway-conformance-infra # Namespace of the shared Gateway + sectionName: http + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: non-existent-inference-pool # Intentionally Non-Existing + matches: + - path: + type: PathPrefix + value: /test-non-existent-pool diff --git a/conformance/tests/basic/httproute_multiple_gateways_different_pools.go b/conformance/tests/basic/httproute_multiple_gateways_different_pools.go new file mode 100644 index 000000000..251f55c81 --- /dev/null +++ b/conformance/tests/basic/httproute_multiple_gateways_different_pools.go @@ -0,0 +1,118 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package basic + +import ( + "net/http" + "testing" + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" + k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" + "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" +) + +func init() { + tests.ConformanceTests = append(tests.ConformanceTests, HTTPRouteMultipleGatewaysDifferentPools) +} + +var HTTPRouteMultipleGatewaysDifferentPools = suite.ConformanceTest{ + ShortName: "HTTPRouteMultipleGatewaysDifferentPools", + Description: "Validates two HTTPRoutes on different Gateways successfully referencing different InferencePools and routes traffic accordingly.", + Manifests: []string{"tests/basic/httproute_multiple_gateways_different_pools.yaml"}, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + appBackendNamespace = "gateway-conformance-app-backend" + infraNamespace = "gateway-conformance-infra" + + primaryGatewayName = "conformance-primary-gateway" + routeForPrimaryGWName = "route-for-primary-gateway" + primaryPoolName = "primary-inference-pool" + primaryBackendPodName = "primary-inference-model-server" + primaryRoutePath = "/test-primary-gateway" + primaryRouteHostname = "primary.example.com" + + secondaryGatewayName = "conformance-secondary-gateway" + routeForSecondaryGWName = "route-for-secondary-gateway" + secondaryPoolName = "secondary-inference-pool" + secondaryBackendPodName = "secondary-inference-model-server" + secondaryRoutePath = "/test-secondary-gateway" + secondaryRouteHostname = "secondary.example.com" + ) + + routeForPrimaryGWNN := types.NamespacedName{Name: routeForPrimaryGWName, Namespace: appBackendNamespace} + routeForSecondaryGWNN := types.NamespacedName{Name: routeForSecondaryGWName, Namespace: appBackendNamespace} + primaryPoolNN := types.NamespacedName{Name: primaryPoolName, Namespace: appBackendNamespace} + secondaryPoolNN := types.NamespacedName{Name: secondaryPoolName, Namespace: appBackendNamespace} + primaryGatewayNN := types.NamespacedName{Name: 
primaryGatewayName, Namespace: infraNamespace} + secondaryGatewayNN := types.NamespacedName{Name: secondaryGatewayName, Namespace: infraNamespace} + + t.Run("Primary HTTPRoute, InferencePool, and Gateway path: verify status and traffic", func(t *testing.T) { + k8sutils.HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted( + t, + s.Client, + routeForPrimaryGWNN, + primaryGatewayNN, + primaryPoolNN, + ) + + primaryGwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, primaryGatewayNN) + + traffic.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + primaryGwAddr, + traffic.Request{ + Host: primaryRouteHostname, + Path: primaryRoutePath, + ExpectedStatusCode: http.StatusOK, + Backend: primaryBackendPodName, + Namespace: appBackendNamespace, + }, + ) + }) + + t.Run("Secondary HTTPRoute, InferencePool, and Gateway path: verify status and traffic", func(t *testing.T) { + k8sutils.HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted( + t, + s.Client, + routeForSecondaryGWNN, + secondaryGatewayNN, + secondaryPoolNN, + ) + + secondaryGwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, secondaryGatewayNN) + + traffic.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + secondaryGwAddr, + traffic.Request{ + Host: secondaryRouteHostname, + Path: secondaryRoutePath, + ExpectedStatusCode: http.StatusOK, + Backend: secondaryBackendPodName, + Namespace: appBackendNamespace, + }, + ) + }) + }, +} diff --git a/conformance/tests/basic/httproute_multiple_gateways_different_pools.yaml b/conformance/tests/basic/httproute_multiple_gateways_different_pools.yaml new file mode 100644 index 000000000..b8ec2d114 --- /dev/null +++ b/conformance/tests/basic/httproute_multiple_gateways_different_pools.yaml @@ -0,0 +1,44 @@ +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: route-for-primary-gateway + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - kind: Gateway + name: conformance-primary-gateway + namespace: gateway-conformance-infra + hostnames: + - "primary.example.com" + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: primary-inference-pool + matches: + - path: + type: PathPrefix + value: /test-primary-gateway +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: route-for-secondary-gateway + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - kind: Gateway + name: conformance-secondary-gateway + namespace: gateway-conformance-infra + hostnames: + - "secondary.example.com" + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: secondary-inference-pool + matches: + - path: + type: PathPrefix + value: /test-secondary-gateway diff --git a/conformance/tests/basic/inferencepool_accepted.go b/conformance/tests/basic/inferencepool_accepted.go new file mode 100644 index 000000000..f34c5fcc4 --- /dev/null +++ b/conformance/tests/basic/inferencepool_accepted.go @@ -0,0 +1,63 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package basic
+
+import (
+	"testing"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" // For standard condition types
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/pkg/features" // For standard feature names
+
+	// Import the tests package to append to ConformanceTests
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+	k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
+)
+
+func init() {
+	// Register the InferencePoolAccepted test case with the conformance suite.
+	// This ensures it will be discovered and run by the test runner.
+	tests.ConformanceTests = append(tests.ConformanceTests, InferencePoolAccepted)
+}
+
+// InferencePoolAccepted defines the test case for verifying basic InferencePool acceptance.
+var InferencePoolAccepted = suite.ConformanceTest{
+	ShortName:   "InferencePoolAccepted",
+	Description: "A minimal InferencePool resource should be accepted by the controller and report an Accepted condition",
+	Manifests:   []string{"tests/basic/inferencepool_accepted.yaml"},
+	Features: []features.FeatureName{
+		features.FeatureName("SupportInferencePool"),
+		features.SupportGateway,
+	},
+	Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
+		// The InferencePool under test is created by the associated manifest file.
+		poolNN := types.NamespacedName{Name: "primary-inference-pool", Namespace: "gateway-conformance-app-backend"}
+
+		t.Run("InferencePool should have Accepted condition set to True", func(t *testing.T) {
+			// Define the expected status condition. We use the standard "Accepted"
+			// condition type from the Gateway API for consistency.
+			acceptedCondition := metav1.Condition{
+				Type:   string(gatewayv1.GatewayConditionAccepted), // Standard condition type
+				Status: metav1.ConditionTrue,
+				Reason: "", // "" means we don't strictly check the Reason for this basic test.
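+				// Message is likewise not asserted; the condition helper only compares
+				// Type, Status, and (when non-empty) Reason.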
+			}
+			k8sutils.InferencePoolMustHaveCondition(t, s.Client, poolNN, acceptedCondition)
+		})
+	},
+}
diff --git a/conformance/tests/basic/inferencepool_accepted.yaml b/conformance/tests/basic/inferencepool_accepted.yaml
new file mode 100644
index 000000000..838150013
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_accepted.yaml
@@ -0,0 +1,23 @@
+# --- HTTPRoute Definition ---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-for-inferencepool-accepted
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: conformance-primary-gateway # Name of the shared Gateway from manifests.yaml
+      namespace: gateway-conformance-infra # Namespace of the shared Gateway
+      sectionName: http
+  rules:
+    - backendRefs:
+        - group: inference.networking.x-k8s.io # InferencePool API group
+          kind: InferencePool
+          name: primary-inference-pool # Name of the InferencePool this route points to
+          # namespace: gateway-conformance-app-backend is omitted since the pool is in the same namespace as the HTTPRoute
+      matches:
+        - path:
+            type: PathPrefix
+            value: /accepted-pool-test
diff --git a/conformance/tests/basic/inferencepool_httproute_port_validation.go b/conformance/tests/basic/inferencepool_httproute_port_validation.go
new file mode 100644
index 000000000..45c95a5d9
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_httproute_port_validation.go
@@ -0,0 +1,125 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package basic + +import ( + "testing" + + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" + + // Local project imports + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" + k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" + trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" +) + +func init() { + tests.ConformanceTests = append(tests.ConformanceTests, InferencePoolHTTPRoutePortValidation) +} + +var InferencePoolHTTPRoutePortValidation = suite.ConformanceTest{ + ShortName: "InferencePoolHTTPRoutePortValidation", + Description: "Validates HTTPRoute backendRef port configurations (unspecified, matching, non-matching) when referencing an InferencePool, and checks resulting status conditions.", + Manifests: []string{"tests/basic/inferencepool_httproute_port_validation.yaml"}, + Features: []features.FeatureName{ + features.FeatureName("SupportInferencePool"), + features.SupportGateway, + }, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + appBackendNamespace = "gateway-conformance-app-backend" + infraNamespace = "gateway-conformance-infra" + gatewayName = "conformance-primary-gateway" + poolName = "primary-inference-pool" + backendDeploymentName = "primary-inference-model-server-deployment" + ) + + gatewayNN := types.NamespacedName{Name: gatewayName, Namespace: infraNamespace} + poolNN := types.NamespacedName{Name: poolName, Namespace: appBackendNamespace} + + gatewayAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN) + + t.Run("Scenario 1: HTTPRoute backendRef to InferencePool with Port Unspecified", func(t *testing.T) { + routeNN := types.NamespacedName{Name: "httproute-pool-port-unspecified", Namespace: appBackendNamespace} + hostname := "port-unspecified.example.com" + path := "/test-port-unspecified" + + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN) + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN) + + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gatewayAddr, + trafficutils.Request{ + Host: hostname, + Path: path, + Backend: backendDeploymentName, + Namespace: appBackendNamespace, + }, + ) + }) + + t.Run("Scenario 2: HTTPRoute backendRef to InferencePool with Port Specified and Matching", func(t *testing.T) { + routeNN := types.NamespacedName{Name: "httproute-pool-port-matching", Namespace: appBackendNamespace} + hostname := "port-matching.example.com" + path := "/test-port-matching" + + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN) + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN) + + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gatewayAddr, + trafficutils.Request{ + Host: hostname, + Path: path, + Backend: backendDeploymentName, + Namespace: appBackendNamespace, + }, + ) + }) + + // TODO: Add a warning check after the required change is made per discussion in github.com/kubernetes-sigs/gateway-api-inference-extension/discussions/918 + t.Run("Scenario 3: HTTPRoute backendRef to InferencePool with Port Specified and Non-Matching. 
Request still passes because the HTTP port is ignored when an InferencePool is the backendRef", func(t *testing.T) {
+			routeNN := types.NamespacedName{Name: "httproute-pool-port-non-matching", Namespace: appBackendNamespace}
+			hostname := "port-non-matching.example.com"
+			path := "/test-port-non-matching"
+
+			k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, routeNN, gatewayNN)
+			k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN)
+
+			trafficutils.MakeRequestAndExpectSuccess(
+				t,
+				s.RoundTripper,
+				s.TimeoutConfig,
+				gatewayAddr,
+				trafficutils.Request{
+					Host:      hostname,
+					Path:      path,
+					Backend:   backendDeploymentName,
+					Namespace: appBackendNamespace,
+				},
+			)
+		})
+	},
+}
diff --git a/conformance/tests/basic/inferencepool_httproute_port_validation.yaml b/conformance/tests/basic/inferencepool_httproute_port_validation.yaml
new file mode 100644
index 000000000..62e243a14
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_httproute_port_validation.yaml
@@ -0,0 +1,79 @@
+# --- HTTPRoute Scenario 1: Port Unspecified ---
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-pool-port-unspecified
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: conformance-primary-gateway
+      namespace: gateway-conformance-infra
+      sectionName: http
+  hostnames:
+    - "port-unspecified.example.com"
+  rules:
+    - backendRefs:
+        - group: inference.networking.x-k8s.io
+          kind: InferencePool
+          name: primary-inference-pool
+          # Port is intentionally unspecified here
+      matches:
+        - path:
+            type: PathPrefix
+            value: /test-port-unspecified
+---
+# --- HTTPRoute Scenario 2: Port Matching ---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-pool-port-matching
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: conformance-primary-gateway
+      namespace: gateway-conformance-infra
+      sectionName: http
+  hostnames:
+    - "port-matching.example.com"
+  rules:
+    - backendRefs:
+        - group: inference.networking.x-k8s.io
+          kind: InferencePool
+          name: primary-inference-pool
+          port: 3000 # Port matches the InferencePool's targetPortNumber
+      matches:
+        - path:
+            type: PathPrefix
+            value: /test-port-matching
+---
+# --- HTTPRoute Scenario 3: Port Non-Matching ---
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: httproute-pool-port-non-matching
+  namespace: gateway-conformance-app-backend
+spec:
+  parentRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: conformance-primary-gateway
+      namespace: gateway-conformance-infra
+      sectionName: http
+  hostnames:
+    - "port-non-matching.example.com"
+  rules:
+    - backendRefs:
+        - group: inference.networking.x-k8s.io
+          kind: InferencePool
+          name: primary-inference-pool
+          port: 8888 # Port does NOT match the InferencePool's targetPortNumber
+      matches:
+        - path:
+            type: PathPrefix
+            value: /test-port-non-matching
+---
diff --git a/conformance/tests/basic/inferencepool_invalid_epp_service.go b/conformance/tests/basic/inferencepool_invalid_epp_service.go
new file mode 100644
index 000000000..7e3bfaedf
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_invalid_epp_service.go
@@ -0,0 +1,76 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package basic
+
+import (
+	"testing"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	inferenceapi "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	"sigs.k8s.io/gateway-api/conformance/utils/kubernetes"
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/pkg/features"
+
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+	k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
+	trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic"
+)
+
+func init() {
+	tests.ConformanceTests = append(tests.ConformanceTests, InferencePoolInvalidEPPService)
+}
+
+var InferencePoolInvalidEPPService = suite.ConformanceTest{
+	ShortName:   "InferencePoolInvalidEPPService",
+	Description: "An HTTPRoute that references an InferencePool with a non-existent EPP service should have a ResolvedRefs condition with a status of False and a reason of BackendNotFound.",
+	Manifests:   []string{"tests/basic/inferencepool_invalid_epp_service.yaml"},
+	Features: []features.FeatureName{
+		features.SupportGateway,
+		features.SupportHTTPRoute,
+		features.FeatureName("SupportInferencePool"),
+	},
+	Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
+		const (
+			routePath      = "/invalid-epp-test"
+			infraNamespace = "gateway-conformance-infra"
+			appNamespace   = "gateway-conformance-app-backend"
+			poolName       = "pool-with-invalid-epp"
+		)
+
+		routeNN := types.NamespacedName{Name: "httproute-for-invalid-epp-pool", Namespace: appNamespace}
+		gwNN := types.NamespacedName{Name: "conformance-primary-gateway", Namespace: infraNamespace}
+		poolNN := types.NamespacedName{Name: poolName, Namespace: appNamespace}
+
+		gwAddr := kubernetes.GatewayAndHTTPRoutesMustBeAccepted(t, s.Client, s.TimeoutConfig, s.ControllerName, kubernetes.NewGatewayRef(gwNN), routeNN)
+		t.Run("InferencePool has a ResolvedRefs condition with status False", func(t *testing.T) {
+			resolvedRefsCondition := metav1.Condition{
+				Type:   string(inferenceapi.InferencePoolConditionResolvedRefs), // ResolvedRefs condition type
+				Status: metav1.ConditionFalse,
+				Reason: "", // "" means we don't strictly check the Reason for this basic test.
+			}
+			k8sutils.InferencePoolMustHaveCondition(t, s.Client, poolNN, resolvedRefsCondition)
+		})
+
+		t.Run("Request to a route with an invalid backend reference receives a 5xx response", func(t *testing.T) {
+			trafficutils.MakeRequestAndExpectEventuallyConsistentResponse(t, s.RoundTripper, s.TimeoutConfig, gwAddr, trafficutils.Request{
+				Path:               routePath,
+				ExpectedStatusCode: 5, // Expecting a response status code in the 5XX class.
+ }) + }) + }, +} diff --git a/conformance/tests/basic/inferencepool_invalid_epp_service.yaml b/conformance/tests/basic/inferencepool_invalid_epp_service.yaml new file mode 100644 index 000000000..7d9295fe2 --- /dev/null +++ b/conformance/tests/basic/inferencepool_invalid_epp_service.yaml @@ -0,0 +1,30 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: pool-with-invalid-epp + namespace: gateway-conformance-app-backend +spec: + selector: + app: primary-inference-model-server + targetPortNumber: 3000 + extensionRef: + name: non-existent-epp-svc +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httproute-for-invalid-epp-pool + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - name: conformance-primary-gateway + namespace: gateway-conformance-infra + rules: + - backendRefs: + - name: pool-with-invalid-epp + kind: InferencePool + group: inference.networking.x-k8s.io + matches: + - path: + type: PathPrefix + value: /invalid-epp-test diff --git a/conformance/tests/basic/inferencepool_multiple_rules_different_pools.go b/conformance/tests/basic/inferencepool_multiple_rules_different_pools.go new file mode 100644 index 000000000..078c7c516 --- /dev/null +++ b/conformance/tests/basic/inferencepool_multiple_rules_different_pools.go @@ -0,0 +1,97 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package basic + +import ( + "testing" + + "k8s.io/apimachinery/pkg/types" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" + + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" + k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" +) + +func init() { + tests.ConformanceTests = append(tests.ConformanceTests, HTTPRouteMultipleRulesDifferentPools) +} + +var HTTPRouteMultipleRulesDifferentPools = suite.ConformanceTest{ + ShortName: "HTTPRouteMultipleRulesDifferentPools", + Description: "An HTTPRoute with two rules routing to two different InferencePools", + Manifests: []string{"tests/basic/inferencepool_multiple_rules_different_pools.yaml"}, + Features: []features.FeatureName{ + features.SupportGateway, + features.SupportHTTPRoute, + features.FeatureName("SupportInferencePool"), + }, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + appBackendNamespace = "gateway-conformance-app-backend" + infraNamespace = "gateway-conformance-infra" + + poolPrimaryName = "primary-inference-pool" + poolSecondaryName = "secondary-inference-pool" + routeName = "httproute-multiple-rules-different-pools" + gatewayName = "conformance-primary-gateway" + + primaryPath = "/primary" + secondaryPath = "/secondary" + + primaryPodBackendPrefix = "primary-inference-model-server" + secondaryPodBackendPrefix = "secondary-inference-model-server" + ) + + primaryPoolNN := types.NamespacedName{Name: poolPrimaryName, Namespace: appBackendNamespace} + secondaryPoolNN := types.NamespacedName{Name: poolSecondaryName, Namespace: appBackendNamespace} + routeNN := types.NamespacedName{Name: routeName, Namespace: appBackendNamespace} + gatewayNN := types.NamespacedName{Name: gatewayName, Namespace: infraNamespace} + + t.Run("Wait for resources to be accepted", func(t *testing.T) { + k8sutils.HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted(t, s.Client, routeNN, gatewayNN, primaryPoolNN) + k8sutils.HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted(t, s.Client, routeNN, gatewayNN, secondaryPoolNN) + }) + + t.Run("Traffic should be routed to the correct pool based on path", func(t *testing.T) { + gwAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayNN) + + t.Run("request to primary pool", func(t *testing.T) { + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse(t, s.RoundTripper, + s.TimeoutConfig, gwAddr, gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Path: primaryPath, + }, + Backend: primaryPodBackendPrefix, // Make sure the request is reaching the primary backend. + Namespace: appBackendNamespace, + }) + }) + + t.Run("request to secondary pool", func(t *testing.T) { + gwhttp.MakeRequestAndExpectEventuallyConsistentResponse(t, s.RoundTripper, + s.TimeoutConfig, gwAddr, gwhttp.ExpectedResponse{ + Request: gwhttp.Request{ + Path: secondaryPath, + }, + Backend: secondaryPodBackendPrefix, // Make sure the request is reaching the secondary backend. 
+ Namespace: appBackendNamespace, + }) + }) + }) + }, +} diff --git a/conformance/tests/basic/inferencepool_multiple_rules_different_pools.yaml b/conformance/tests/basic/inferencepool_multiple_rules_different_pools.yaml new file mode 100644 index 000000000..7ef7ced0b --- /dev/null +++ b/conformance/tests/basic/inferencepool_multiple_rules_different_pools.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httproute-multiple-rules-different-pools + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - name: conformance-primary-gateway + namespace: gateway-conformance-infra + rules: + - matches: + - path: + type: PathPrefix + value: /primary + backendRefs: + - name: primary-inference-pool + kind: InferencePool + group: inference.networking.x-k8s.io + - matches: + - path: + type: PathPrefix + value: /secondary + backendRefs: + - name: secondary-inference-pool + kind: InferencePool + group: inference.networking.x-k8s.io diff --git a/conformance/tests/basic/inferencepool_resolvedrefs_condition.go b/conformance/tests/basic/inferencepool_resolvedrefs_condition.go new file mode 100644 index 000000000..97a08862b --- /dev/null +++ b/conformance/tests/basic/inferencepool_resolvedrefs_condition.go @@ -0,0 +1,178 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package basic + +import ( + "context" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" + + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" + "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config" + k8sutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes" + trafficutils "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/traffic" +) + +func init() { + tests.ConformanceTests = append(tests.ConformanceTests, InferencePoolParentStatus) +} + +var InferencePoolParentStatus = suite.ConformanceTest{ + ShortName: "InferencePoolResolvedRefsCondition", + Description: "Verify that an InferencePool correctly updates its parent-specific status (e.g., Accepted condition) when referenced by HTTPRoutes attached to shared Gateways, and clears parent statuses when no longer referenced.", + Manifests: []string{"tests/basic/inferencepool_resolvedrefs_condition.yaml"}, + Features: []features.FeatureName{ + features.FeatureName("SupportInferencePool"), + features.SupportGateway, + }, + Test: func(t *testing.T, s *suite.ConformanceTestSuite) { + const ( + appBackendNamespace = "gateway-conformance-app-backend" + infraNamespace = "gateway-conformance-infra" + poolName = "primary-inference-pool" + sharedPrimaryGatewayName = "conformance-primary-gateway" + sharedSecondaryGatewayName = "conformance-secondary-gateway" + httpRoutePrimaryName = "httproute-for-primary-gw" + httpRouteSecondaryName = "httproute-for-secondary-gw" + hostnamePrimaryGw = "primary.example.com" + pathPrimaryGw = "/primary-gateway-test" + hostnameSecondaryGw = "secondary.example.com" + pathSecondaryGw = "/secondary-gateway-test" + backendServicePodName = "primary-inference-model-server-deployment" + ) + + poolNN := types.NamespacedName{Name: poolName, Namespace: appBackendNamespace} + httpRoutePrimaryNN := types.NamespacedName{Name: httpRoutePrimaryName, Namespace: appBackendNamespace} + httpRouteSecondaryNN := types.NamespacedName{Name: httpRouteSecondaryName, Namespace: appBackendNamespace} + gatewayPrimaryNN := types.NamespacedName{Name: sharedPrimaryGatewayName, Namespace: infraNamespace} + gatewaySecondaryNN := types.NamespacedName{Name: sharedSecondaryGatewayName, Namespace: infraNamespace} + + inferenceTimeoutConfig := config.DefaultInferenceExtensionTimeoutConfig() + + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRoutePrimaryNN, gatewayPrimaryNN) + k8sutils.HTTPRouteMustBeAcceptedAndResolved(t, s.Client, s.TimeoutConfig, httpRouteSecondaryNN, gatewaySecondaryNN) + + gwPrimaryAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewayPrimaryNN) + gwSecondaryAddr := k8sutils.GetGatewayEndpoint(t, s.Client, s.TimeoutConfig, gatewaySecondaryNN) + + t.Run("InferencePool should show Accepted:True by parents and be routable via multiple HTTPRoutes", func(t *testing.T) { + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN) + t.Logf("InferencePool %s has parent status Accepted:True as expected with two references.", poolNN.String()) + + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gwPrimaryAddr, + trafficutils.Request{ + Host: hostnamePrimaryGw, + Path: pathPrimaryGw, + Backend: backendServicePodName, + Namespace: 
appBackendNamespace, + }, + ) + + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gwSecondaryAddr, + trafficutils.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + Backend: backendServicePodName, + Namespace: appBackendNamespace, + }, + ) + }) + + t.Run("Delete httproute-for-primary-gw and verify InferencePool status and routing via secondary gw", func(t *testing.T) { + httpRoutePrimary := &gatewayv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: httpRoutePrimaryNN.Name, Namespace: httpRoutePrimaryNN.Namespace}, + } + t.Logf("Deleting HTTPRoute %s", httpRoutePrimaryNN.String()) + require.NoError(t, s.Client.Delete(context.TODO(), httpRoutePrimary), "failed to delete httproute-for-primary-gw") + + t.Logf("Waiting for %v for Gateway conditions to update after deleting HTTPRoute %s", inferenceTimeoutConfig.HTTPRouteDeletionReconciliationTimeout, httpRoutePrimaryNN.String()) + time.Sleep(inferenceTimeoutConfig.HTTPRouteDeletionReconciliationTimeout) + + k8sutils.InferencePoolMustBeAcceptedByParent(t, s.Client, poolNN) + t.Logf("InferencePool %s still has parent status Accepted:True as expected with one reference remaining.", poolNN.String()) + + trafficutils.MakeRequestAndExpectSuccess( + t, + s.RoundTripper, + s.TimeoutConfig, + gwSecondaryAddr, + trafficutils.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + Backend: backendServicePodName, + Namespace: appBackendNamespace, + }, + ) + + trafficutils.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + gwPrimaryAddr, + trafficutils.Request{ + Host: hostnamePrimaryGw, + Path: pathPrimaryGw, + ExpectedStatusCode: http.StatusNotFound, + }, + ) + }) + + t.Run("Delete httproute-for-secondary-gw and verify InferencePool has no parent statuses and is not routable", func(t *testing.T) { + httpRouteSecondary := &gatewayv1.HTTPRoute{ + ObjectMeta: metav1.ObjectMeta{Name: httpRouteSecondaryNN.Name, Namespace: httpRouteSecondaryNN.Namespace}, + } + t.Logf("Deleting HTTPRoute %s", httpRouteSecondaryNN.String()) + require.NoError(t, s.Client.Delete(context.TODO(), httpRouteSecondary), "failed to delete httproute-for-secondary-gw") + + t.Logf("Waiting for %v for Gateway conditions to update after deleting HTTPRoute %s", inferenceTimeoutConfig.HTTPRouteDeletionReconciliationTimeout, httpRouteSecondaryNN.String()) + time.Sleep(inferenceTimeoutConfig.HTTPRouteDeletionReconciliationTimeout) + + k8sutils.InferencePoolMustHaveNoParents(t, s.Client, poolNN) + t.Logf("InferencePool %s correctly shows no parent statuses, indicating it's no longer referenced.", poolNN.String()) + + trafficutils.MakeRequestAndExpectEventuallyConsistentResponse( + t, + s.RoundTripper, + s.TimeoutConfig, + gwSecondaryAddr, + trafficutils.Request{ + Host: hostnameSecondaryGw, + Path: pathSecondaryGw, + ExpectedStatusCode: http.StatusNotFound, + }, + ) + }) + + t.Logf("InferencePoolResolvedRefsCondition test completed.") + }, +} diff --git a/conformance/tests/basic/inferencepool_resolvedrefs_condition.yaml b/conformance/tests/basic/inferencepool_resolvedrefs_condition.yaml new file mode 100644 index 000000000..416352e30 --- /dev/null +++ b/conformance/tests/basic/inferencepool_resolvedrefs_condition.yaml @@ -0,0 +1,54 @@ +# conformance/tests/basic/inferencepool_resolvedrefs_condition.yaml + +# This manifest defines the initial resources for the +# inferencepool_resolvedrefs_condition.go conformance test. 
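+#
+# Both HTTPRoutes below reference the same "primary-inference-pool" from two
+# different Gateways, so the pool should report one parent status per Gateway
+# until the test deletes the routes one by one.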
+ +# --- HTTPRoute for Primary Gateway (conformance-primary-gateway) --- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httproute-for-primary-gw + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: conformance-primary-gateway + namespace: gateway-conformance-infra + sectionName: http + hostnames: + - "primary.example.com" + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: primary-inference-pool + matches: + - path: + type: PathPrefix + value: /primary-gateway-test +--- +# --- HTTPRoute for Secondary Gateway (conformance-secondary-gateway) --- +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: httproute-for-secondary-gw + namespace: gateway-conformance-app-backend +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: conformance-secondary-gateway + namespace: gateway-conformance-infra + sectionName: http + hostnames: + - "secondary.example.com" + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: primary-inference-pool + matches: + - path: + type: PathPrefix + value: /secondary-gateway-test diff --git a/conformance/tests/main.go b/conformance/tests/main.go new file mode 100644 index 000000000..fc66c7652 --- /dev/null +++ b/conformance/tests/main.go @@ -0,0 +1,35 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package tests is the root package for all Gateway API Inference Extension +// conformance test implementations. +package tests + +import ( + // Importing the suite package to access the ConformanceTest struct definition. + // For initial version directly importing from the core gateway-api repo. + // This may be adjusted in the future if we have need to create a copy of + // the suite utilities. + "sigs.k8s.io/gateway-api/conformance/utils/suite" + // Do NOT add blank imports for specific test packages here. + // They should be added to the main conformance package instead + // to avoid import cycles. +) + +// ConformanceTests holds all the conformance tests definitions for the +// Gateway API Inference Extension suite. Tests are registered from other packages +// using init() functions like the one in the basic package. +var ConformanceTests []suite.ConformanceTest diff --git a/conformance/utils/assertions.go b/conformance/utils/assertions.go new file mode 100644 index 000000000..c77d0fc5b --- /dev/null +++ b/conformance/utils/assertions.go @@ -0,0 +1,25 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package assertions contains custom assertion helper functions used within
+// the Gateway API Inference Extension conformance test suite.
+package assertions
+
+// TODO: Implement custom assertion functions specific to Inference Extension testing.
+// Examples might include:
+// - Asserting specific fields or structures within an inference API response body.
+// - Asserting specific metrics reported by mock model servers or EPPs.
+// - Asserting specific conditions or status fields unique to InferencePool or InferenceModel.
diff --git a/conformance/utils/config/timing.go b/conformance/utils/config/timing.go
new file mode 100644
index 000000000..00aefdaa5
--- /dev/null
+++ b/conformance/utils/config/timing.go
@@ -0,0 +1,60 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package config
+
+import (
+	"time"
+
+	// Import the upstream Gateway API timeout config.
+	gatewayconfig "sigs.k8s.io/gateway-api/conformance/utils/config"
+)
+
+// InferenceExtensionTimeoutConfig embeds the upstream TimeoutConfig and adds
+// extension-specific timeout values.
+type InferenceExtensionTimeoutConfig struct {
+	// All fields from gatewayconfig.TimeoutConfig are available directly.
+	gatewayconfig.TimeoutConfig
+
+	// GeneralMustHaveConditionTimeout represents the maximum time to wait for an InferencePool, HTTPRoute, or other resource to have a specific condition.
+	GeneralMustHaveConditionTimeout time.Duration
+
+	// InferencePoolMustHaveConditionInterval represents the polling interval for checking an InferencePool's condition.
+	InferencePoolMustHaveConditionInterval time.Duration
+
+	// GatewayObjectPollInterval is the polling interval used when waiting for a Gateway object to appear.
+	GatewayObjectPollInterval time.Duration
+
+	// HTTPRouteDeletionReconciliationTimeout is the time to wait, after deleting an HTTPRoute, for the controller to reconcile and update dependent statuses.
+	HTTPRouteDeletionReconciliationTimeout time.Duration
+}
+
+// DefaultInferenceExtensionTimeoutConfig returns a new InferenceExtensionTimeoutConfig with default values.
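+//
+// A typical polling call site, mirroring the helpers in this repository (sketch):
+//
+//	cfg := config.DefaultInferenceExtensionTimeoutConfig()
+//	wait.PollUntilContextTimeout(ctx, cfg.InferencePoolMustHaveConditionInterval,
+//		cfg.GeneralMustHaveConditionTimeout, true, checkFn)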
+func DefaultInferenceExtensionTimeoutConfig() InferenceExtensionTimeoutConfig { + config := gatewayconfig.DefaultTimeoutConfig() + config.HTTPRouteMustHaveCondition = 300 * time.Second + config.RouteMustHaveParents = 200 * time.Second + config.MaxTimeToConsistency = 200 * time.Second + config.DefaultTestTimeout = 600 * time.Second + return InferenceExtensionTimeoutConfig{ + TimeoutConfig: config, // Initialize embedded struct + GeneralMustHaveConditionTimeout: 300 * time.Second, + InferencePoolMustHaveConditionInterval: 10 * time.Second, + GatewayObjectPollInterval: 5 * time.Second, + HTTPRouteDeletionReconciliationTimeout: 5 * time.Second, + } +} diff --git a/conformance/utils/kubernetes/helpers.go b/conformance/utils/kubernetes/helpers.go new file mode 100644 index 000000000..f2e1e2147 --- /dev/null +++ b/conformance/utils/kubernetes/helpers.go @@ -0,0 +1,384 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package kubernetes contains helper functions for interacting with +// Kubernetes objects within the conformance test suite. +package kubernetes + +import ( + "context" + "fmt" + "reflect" + "testing" + "time" + + "github.com/stretchr/testify/require" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "sigs.k8s.io/controller-runtime/pkg/client" + + inferenceapi "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/config" + gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" + gatewayapiconfig "sigs.k8s.io/gateway-api/conformance/utils/config" + gatewayk8sutils "sigs.k8s.io/gateway-api/conformance/utils/kubernetes" +) + +// checkCondition is a helper function similar to findConditionInList or CheckCondition +// from the Gateway API conformance utilities. +// It checks if the expectedCondition is present in the conditions list. +// If expectedCondition.Reason is an empty string, it matches any reason. 
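+// For example (illustrative), leaving Reason empty:
+//
+//	checkCondition(t, conds, metav1.Condition{Type: "Accepted", Status: metav1.ConditionTrue})
+//
+// matches an Accepted=True condition regardless of its Reason.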
+func checkCondition(t *testing.T, conditions []metav1.Condition, expectedCondition metav1.Condition) bool { + t.Helper() + for _, cond := range conditions { + if cond.Type == expectedCondition.Type { + if cond.Status == expectedCondition.Status { + if expectedCondition.Reason == "" || cond.Reason == expectedCondition.Reason { + return true + } + t.Logf("Condition %s found with Status %s, but Reason %s did not match expected %s", + expectedCondition.Type, cond.Status, cond.Reason, expectedCondition.Reason) + } else { + t.Logf("Condition %s found, but Status %s did not match expected %s", + expectedCondition.Type, cond.Status, expectedCondition.Status) + } + } + } + t.Logf("Condition %s with Status %s (and Reason %s if specified) not found in conditions list: %+v", + expectedCondition.Type, expectedCondition.Status, expectedCondition.Reason, conditions) + return false +} + +// InferencePoolMustHaveCondition waits for the specified InferencePool resource +// to exist and report the expected status condition within one of its parent statuses. +// It polls the InferencePool's status until the condition is met or the timeout occurs. +func InferencePoolMustHaveCondition(t *testing.T, c client.Reader, poolNN types.NamespacedName, expectedCondition metav1.Condition) { + t.Helper() // Marks this function as a test helper + + var timeoutConfig config.InferenceExtensionTimeoutConfig = config.DefaultInferenceExtensionTimeoutConfig() + var lastObservedPool *inferenceapi.InferencePool + var lastError error + var conditionFound bool + + waitErr := wait.PollUntilContextTimeout( + context.Background(), + timeoutConfig.InferencePoolMustHaveConditionInterval, + timeoutConfig.GeneralMustHaveConditionTimeout, + true, func(ctx context.Context) (bool, error) { + pool := &inferenceapi.InferencePool{} // This is the type instance used for Get + err := c.Get(ctx, poolNN, pool) + if err != nil { + if apierrors.IsNotFound(err) { + t.Logf("InferencePool %s not found yet. Retrying.", poolNN.String()) + lastError = err + return false, nil + } + t.Logf("Error fetching InferencePool %s (type: %s): %v. 
Retrying.", poolNN.String(), reflect.TypeOf(pool).String(), err) + lastError = err + return false, nil + } + lastObservedPool = pool + lastError = nil + conditionFound = false + + if len(pool.Status.Parents) == 0 { + t.Logf("InferencePool %s has no parent statuses reported yet.", poolNN.String()) + return false, nil + } + + for _, parentStatus := range pool.Status.Parents { + if checkCondition(t, parentStatus.Conditions, expectedCondition) { + conditionFound = true + return true, nil + } + } + return false, nil + }) + + if waitErr != nil || !conditionFound { + debugMsg := "" + if waitErr != nil { + debugMsg += fmt.Sprintf(" Polling error: %v.", waitErr) + } + if lastError != nil { + debugMsg += fmt.Sprintf(" Last error during fetching: %v.", lastError) + } + + if lastObservedPool != nil { + debugMsg += "\nLast observed InferencePool status:" + if len(lastObservedPool.Status.Parents) == 0 { + debugMsg += " (No parent statuses reported)" + } + for i, parentStatus := range lastObservedPool.Status.Parents { + debugMsg += fmt.Sprintf("\n Parent %d (Gateway: %s/%s):", i, parentStatus.GatewayRef.Namespace, parentStatus.GatewayRef.Name) + if len(parentStatus.Conditions) == 0 { + debugMsg += " (No conditions reported for this parent)" + } + for _, cond := range parentStatus.Conditions { + debugMsg += fmt.Sprintf("\n - Type: %s, Status: %s, Reason: %s, Message: %s", cond.Type, cond.Status, cond.Reason, cond.Message) + } + } + } else if lastError == nil || !apierrors.IsNotFound(lastError) { + debugMsg += "\nInferencePool was not found or not observed successfully during polling." + } + + finalMsg := fmt.Sprintf("timed out or condition not met for InferencePool %s to have condition Type=%s, Status=%s", + poolNN.String(), expectedCondition.Type, expectedCondition.Status) + if expectedCondition.Reason != "" { + finalMsg += fmt.Sprintf(", Reason='%s'", expectedCondition.Reason) + } + finalMsg += "." + debugMsg + require.FailNow(t, finalMsg) + } + + logMsg := fmt.Sprintf("InferencePool %s successfully has condition Type=%s, Status=%s", + poolNN.String(), expectedCondition.Type, expectedCondition.Status) + if expectedCondition.Reason != "" { + logMsg += fmt.Sprintf(", Reason='%s'", expectedCondition.Reason) + } + t.Log(logMsg) +} + +// InferencePoolMustHaveNoParents waits for the specified InferencePool resource +// to exist and report that it has no parent references in its status. +// This typically indicates it is no longer referenced by any Gateway API resources. +func InferencePoolMustHaveNoParents(t *testing.T, c client.Reader, poolNN types.NamespacedName) { + t.Helper() + + var lastObservedPool *inferenceapi.InferencePool + var lastError error + var timeoutConfig config.InferenceExtensionTimeoutConfig = config.DefaultInferenceExtensionTimeoutConfig() + + ctx := context.Background() + waitErr := wait.PollUntilContextTimeout( + ctx, + + timeoutConfig.InferencePoolMustHaveConditionInterval, + timeoutConfig.GeneralMustHaveConditionTimeout, + true, + func(pollCtx context.Context) (bool, error) { + pool := &inferenceapi.InferencePool{} + err := c.Get(pollCtx, poolNN, pool) + if err != nil { + if apierrors.IsNotFound(err) { + t.Logf("InferencePool %s not found. Considering this as having no parents.", poolNN.String()) + lastError = nil + return true, nil + } + t.Logf("Error fetching InferencePool %s: %v. 
Retrying.", poolNN.String(), err) + lastError = err + return false, nil + } + lastObservedPool = pool + lastError = nil + + if len(pool.Status.Parents) == 0 { + t.Logf("InferencePool %s successfully has no parent statuses.", poolNN.String()) + return true, nil + } + t.Logf("InferencePool %s still has %d parent statuses. Waiting...", poolNN.String(), len(pool.Status.Parents)) + return false, nil + }) + + if waitErr != nil { + debugMsg := fmt.Sprintf("Timed out waiting for InferencePool %s to have no parent statuses.", poolNN.String()) + if lastError != nil { + debugMsg += fmt.Sprintf(" Last error during fetching: %v.", lastError) + } + if lastObservedPool != nil && len(lastObservedPool.Status.Parents) > 0 { + debugMsg += fmt.Sprintf(" Last observed InferencePool still had %d parent(s):", len(lastObservedPool.Status.Parents)) + } else if lastError == nil && (lastObservedPool == nil || len(lastObservedPool.Status.Parents) == 0) { + debugMsg += " Polling completed without timeout, but an unexpected waitErr occurred." + } + require.FailNow(t, debugMsg, waitErr) + } + t.Logf("Successfully verified that InferencePool %s has no parent statuses.", poolNN.String()) +} + +// HTTPRouteMustBeAcceptedAndResolved waits for the specified HTTPRoute +// to be Accepted and have its references resolved by the specified Gateway. +// It uses the upstream Gateway API's HTTPRouteMustHaveCondition helper. +func HTTPRouteMustBeAcceptedAndResolved(t *testing.T, c client.Client, timeoutConfig gatewayapiconfig.TimeoutConfig, routeNN, gatewayNN types.NamespacedName) { + t.Helper() + + acceptedCondition := metav1.Condition{ + Type: string(gatewayv1.RouteConditionAccepted), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.RouteReasonAccepted), + } + + resolvedRefsCondition := metav1.Condition{ + Type: string(gatewayv1.RouteConditionResolvedRefs), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.RouteReasonResolvedRefs), + } + + t.Logf("Waiting for HTTPRoute %s to be Accepted by Gateway %s", routeNN.String(), gatewayNN.String()) + gatewayk8sutils.HTTPRouteMustHaveCondition(t, c, timeoutConfig, routeNN, gatewayNN, acceptedCondition) + + t.Logf("Waiting for HTTPRoute %s to have ResolvedRefs by Gateway %s", routeNN.String(), gatewayNN.String()) + gatewayk8sutils.HTTPRouteMustHaveCondition(t, c, timeoutConfig, routeNN, gatewayNN, resolvedRefsCondition) + + t.Logf("HTTPRoute %s is now Accepted and has ResolvedRefs by Gateway %s", routeNN.String(), gatewayNN.String()) +} + +// InferencePoolMustBeAcceptedByParent waits for the specified InferencePool +// to report an Accepted condition with status True and reason "Accepted" +// from at least one of its parent Gateways. 
+func InferencePoolMustBeAcceptedByParent(t *testing.T, c client.Reader, poolNN types.NamespacedName) { + t.Helper() + + acceptedByParentCondition := metav1.Condition{ + Type: string(gatewayv1.GatewayConditionAccepted), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.GatewayReasonAccepted), // Expecting the standard "Accepted" reason + } + + t.Logf("Waiting for InferencePool %s to be Accepted by a parent Gateway (Reason: %s)", poolNN.String(), gatewayv1.GatewayReasonAccepted) + InferencePoolMustHaveCondition(t, c, poolNN, acceptedByParentCondition) + t.Logf("InferencePool %s is Accepted by a parent Gateway (Reason: %s)", poolNN.String(), gatewayv1.GatewayReasonAccepted) +} + +// InferencePoolMustBeRouteAccepted waits for the specified InferencePool resource +// to exist and report an Accepted condition with Type=RouteConditionAccepted, +// Status=True, and Reason=RouteReasonAccepted within one of its parent statuses. +func InferencePoolMustBeRouteAccepted(t *testing.T, c client.Reader, poolNN types.NamespacedName) { + t.Helper() + + expectedPoolCondition := metav1.Condition{ + Type: string(gatewayv1.RouteConditionAccepted), + Status: metav1.ConditionTrue, + Reason: string(gatewayv1.RouteReasonAccepted), + } + + // Call the existing generic helper with the predefined condition + InferencePoolMustHaveCondition(t, c, poolNN, expectedPoolCondition) + t.Logf("InferencePool %s successfully verified with RouteAccepted condition (Type: %s, Status: %s, Reason: %s).", + poolNN.String(), expectedPoolCondition.Type, expectedPoolCondition.Status, expectedPoolCondition.Reason) +} + +// HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted waits for the specified HTTPRoute +// to be Accepted and have its references resolved by the specified Gateway, +// AND for the specified InferencePool to be "RouteAccepted" using the specific +// RouteConditionAccepted criteria. +func HTTPRouteAndInferencePoolMustBeAcceptedAndRouteAccepted( + t *testing.T, + c client.Client, + routeNN types.NamespacedName, + gatewayNN types.NamespacedName, + poolNN types.NamespacedName) { + t.Helper() + var timeoutConfig config.InferenceExtensionTimeoutConfig = config.DefaultInferenceExtensionTimeoutConfig() + + HTTPRouteMustBeAcceptedAndResolved(t, c, timeoutConfig.TimeoutConfig, routeNN, gatewayNN) + InferencePoolMustBeRouteAccepted(t, c, poolNN) + t.Logf("Successfully verified: HTTPRoute %s (Gateway %s) is Accepted & Resolved, and InferencePool %s is RouteAccepted.", + routeNN.String(), gatewayNN.String(), poolNN.String()) +} + +// GetGatewayEndpoint waits for the specified Gateway to have at least one address +// and returns the address in "host:port" format. +// It leverages the upstream Gateway API's WaitForGatewayAddress. +func GetGatewayEndpoint(t *testing.T, k8sClient client.Client, timeoutConfig gatewayapiconfig.TimeoutConfig, gatewayNN types.NamespacedName) string { + t.Helper() + + t.Logf("Waiting for Gateway %s/%s to get an address...", gatewayNN.Namespace, gatewayNN.Name) + gwAddr, err := gatewayk8sutils.WaitForGatewayAddress(t, k8sClient, timeoutConfig, gatewayk8sutils.NewGatewayRef(gatewayNN)) + require.NoError(t, err, "failed to get Gateway address for %s", gatewayNN.String()) + require.NotEmpty(t, gwAddr, "Gateway %s has no address", gatewayNN.String()) + + t.Logf("Gateway %s/%s has address: %s", gatewayNN.Namespace, gatewayNN.Name, gwAddr) + return gwAddr +} + +// GetPodsWithLabel retrieves a list of Pods. +// It finds pods matching the given labels in a specific namespace. 
+func GetPodsWithLabel(t *testing.T, c client.Reader, namespace string, labels map[string]string, timeConfig gatewayapiconfig.TimeoutConfig) ([]corev1.Pod, error) {
+	t.Helper()
+
+	pods := &corev1.PodList{}
+	timeout := timeConfig.RequestTimeout
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	listOptions := []client.ListOption{
+		client.InNamespace(namespace),
+		client.MatchingLabels(labels),
+	}
+
+	t.Logf("Searching for Pods with labels %v in namespace %s", labels, namespace)
+	waitErr := wait.PollUntilContextTimeout(ctx, 1*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
+		// Use the polling context so the List call honors cancellation and the timeout.
+		if err := c.List(ctx, pods, listOptions...); err != nil {
+			return false, fmt.Errorf("failed to list pods with labels '%v' in namespace '%s': %w", labels, namespace, err)
+		}
+		if len(pods.Items) > 0 {
+			for _, pod := range pods.Items {
+				if pod.Status.PodIP == "" || pod.Status.Phase != corev1.PodRunning {
+					t.Logf("Pod %s found, but not yet running or has no IP. Current phase: %s, IP: '%s'. Retrying.", pod.Name, pod.Status.Phase, pod.Status.PodIP)
+					return false, nil
+				}
+			}
+			return true, nil
+		}
+		t.Logf("No pods found with selector %v yet. Retrying.", labels)
+		return false, nil
+	})
+	return pods.Items, waitErr
+}
+
+// DeleteDeployment deletes the specified Deployment and waits until it is no longer
+// present in the cluster.
+func DeleteDeployment(t *testing.T, c client.Client, timeoutConfig gatewayapiconfig.TimeoutConfig, deploymentRef types.NamespacedName) error {
+	t.Helper()
+
+	deploymentToDelete := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      deploymentRef.Name,
+			Namespace: deploymentRef.Namespace,
+		},
+	}
+
+	t.Logf("Deleting Deployment %s/%s...", deploymentRef.Namespace, deploymentRef.Name)
+	if err := c.Delete(context.Background(), deploymentToDelete); err != nil {
+		// If the resource is already gone, we don't consider it an error.
+		if !apierrors.IsNotFound(err) {
+			return fmt.Errorf("failed to delete Deployment %s/%s: %w", deploymentRef.Namespace, deploymentRef.Name, err)
+		}
+	}
+
+	// Wait for the Deployment to be fully removed.
+	waitErr := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, timeoutConfig.DeleteTimeout, true, func(ctx context.Context) (bool, error) {
+		var dep appsv1.Deployment
+		err := c.Get(ctx, deploymentRef, &dep)
+		if apierrors.IsNotFound(err) {
+			return true, nil
+		}
+		if err != nil {
+			return false, fmt.Errorf("error waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, err)
+		}
+		return false, nil
+	})
+
+	if waitErr != nil {
+		return fmt.Errorf("timed out waiting for Deployment %s/%s to be deleted: %w", deploymentRef.Namespace, deploymentRef.Name, waitErr)
+	}
+	t.Logf("Successfully deleted Deployment %s/%s", deploymentRef.Namespace, deploymentRef.Name)
+	return nil
+}
diff --git a/conformance/utils/traffic/traffic.go b/conformance/utils/traffic/traffic.go
new file mode 100644
index 000000000..f53cc3236
--- /dev/null
+++ b/conformance/utils/traffic/traffic.go
@@ -0,0 +1,325 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package traffic + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httputil" + "regexp" + "strings" + "testing" + "time" + + gwconfig "sigs.k8s.io/gateway-api/conformance/utils/config" + gwhttp "sigs.k8s.io/gateway-api/conformance/utils/http" + "sigs.k8s.io/gateway-api/conformance/utils/roundtripper" + "sigs.k8s.io/gateway-api/conformance/utils/tlog" +) + +// Request defines the parameters for a single HTTP test request and its expected outcome. +type Request struct { + // Host is the hostname to use in the HTTP request. + Host string + // Path is the path to request. + Path string + // Method is the HTTP method to use. Defaults to "GET" if empty. + Method string + // Headers are the HTTP headers to include in the request. + Headers map[string]string + // Body is the request body. + Body string + + // ExpectedStatusCode is the HTTP status code expected in the response. + ExpectedStatusCode int + // Backend is the name of the backend service expected to handle the request. + // This is not checked for non-200 responses. + Backend string + // Namespace is the namespace of the backend service. + Namespace string +} + +// MakeRequestAndExpectSuccess is a convenience wrapper for requests that are +// expected to succeed with a 200 OK status. +func MakeRequestAndExpectSuccess( + t *testing.T, + r roundtripper.RoundTripper, + timeoutConfig gwconfig.TimeoutConfig, + gatewayAddress string, + req Request, +) { + t.Helper() + req.ExpectedStatusCode = http.StatusOK + MakeRequestAndExpectEventuallyConsistentResponse(t, r, timeoutConfig, gatewayAddress, req) +} + +// MakeRequestAndExpectEventuallyConsistentResponse makes a request using the parameters +// from the Request struct and waits for the response to consistently match the expectations. 
+func MakeRequestAndExpectEventuallyConsistentResponse(
+	t *testing.T,
+	r roundtripper.RoundTripper,
+	timeoutConfig gwconfig.TimeoutConfig,
+	gatewayAddress string,
+	req Request,
+) {
+	t.Helper()
+
+	expectedResponse := makeExpectedResponse(t, req)
+	waitForConvergeToExpected(t, r, timeoutConfig, gatewayAddress, req.Body, expectedResponse)
+}
+
+// MakeRequestAndExpectResponseFromPod sends a request to the specified path
+// and expects the eventual response to be served by a pod whose name starts
+// with podPrefix in the given namespace.
+func MakeRequestAndExpectResponseFromPod(t *testing.T, r roundtripper.RoundTripper, timeoutConfig gwconfig.TimeoutConfig, gwAddr, path, podPrefix, nameSpace string) {
+	t.Helper()
+	expectedResponse := gwhttp.ExpectedResponse{
+		Request: gwhttp.Request{
+			Path: path,
+		},
+		Backend:   podPrefix,
+		Namespace: nameSpace,
+	}
+
+	gwhttp.MakeRequestAndExpectEventuallyConsistentResponse(t, r, timeoutConfig, gwAddr, expectedResponse)
+}
+
+func makeExpectedResponse(t *testing.T, req Request) gwhttp.ExpectedResponse {
+	t.Helper()
+
+	method := http.MethodGet
+	if req.Method != "" {
+		method = req.Method
+	}
+
+	expectedResponse := gwhttp.ExpectedResponse{
+		Request: gwhttp.Request{
+			Host:    req.Host,
+			Path:    req.Path,
+			Method:  method,
+			Headers: req.Headers,
+		},
+		Response: gwhttp.Response{
+			StatusCode: req.ExpectedStatusCode,
+		},
+		Backend:   req.Backend,
+		Namespace: req.Namespace,
+	}
+
+	// For successful responses (200 OK), we also verify that the backend
+	// received the request with the correct details (Host, Path, etc.).
+	// For other statuses (e.g., 404), this check is skipped.
+	if req.ExpectedStatusCode == http.StatusOK {
+		expectedResponse.ExpectedRequest = &gwhttp.ExpectedRequest{
+			Request: gwhttp.Request{
+				Host:    req.Host,
+				Path:    req.Path,
+				Headers: req.Headers,
+				Method:  method,
+			},
+		}
+	}
+	return expectedResponse
+}
+
+// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031
+// replace the following method when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body.
+func waitForConvergeToExpected(
+	t *testing.T,
+	r roundtripper.RoundTripper,
+	timeoutConfig gwconfig.TimeoutConfig,
+	gatewayAddress string,
+	requestBody string,
+	expectedResponse gwhttp.ExpectedResponse,
+) {
+	gwhttp.AwaitConvergence(t, timeoutConfig.RequiredConsecutiveSuccesses, timeoutConfig.MaxTimeToConsistency, func(elapsed time.Duration) bool {
+		req := gwhttp.MakeRequest(t, &expectedResponse, gatewayAddress, "HTTP", "http")
+		request := &RequestWithBody{Request: req}
+		if requestBody != "" {
+			request = &RequestWithBody{Request: req, Body: strings.NewReader(requestBody)}
+		}
+		cReq, cRes, err := MakeCallRoundTripper(t, r, request)
+		if err != nil {
+			tlog.Logf(t, "Request failed, not ready yet: %v (after %v)", err.Error(), elapsed)
+			return false
+		}
+
+		if err := CompareRequestWithWildcardStatus(t, &request.Request, cReq, cRes, expectedResponse); err != nil {
+			tlog.Logf(t, "Response expectation failed for request: %+v not ready yet: %v (after %v)", request.Request, err, elapsed)
+			return false
+		}
+
+		return true
+	})
+	tlog.Logf(t, "Request passed")
+}
+
+// CompareRequestWithWildcardStatus compares requests with wildcard status code support.
+// It treats a single-digit expected code (e.g., 4) as a class wildcard (4xx),
+// while standard 3-digit codes are matched exactly.
+func CompareRequestWithWildcardStatus(t *testing.T, req *roundtripper.Request, cReq *roundtripper.CapturedRequest, cRes *roundtripper.CapturedResponse, expected gwhttp.ExpectedResponse) error {
+	if expected.Response.StatusCode < 1 || expected.Response.StatusCode >= 100 {
+		return gwhttp.CompareRequest(t, req, cReq, cRes, expected)
+	}
+
+	expectedClass := expected.Response.StatusCode
+	actualClass := cRes.StatusCode / 100
+	if expectedClass != actualClass {
+		return fmt.Errorf("expected status code class %dxx, but got %d", expectedClass, cRes.StatusCode)
+	}
+
+	// StatusCode class matches; update the status code on a copy to allow the standard comparator to pass.
+	modifiedExpected := expected
+	modifiedExpected.Response.StatusCode = cRes.StatusCode
+	return gwhttp.CompareRequest(t, req, cReq, cRes, modifiedExpected)
+}
+
+// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031
+// remove this when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body.
+// RequestWithBody extends roundtripper.Request to include a request body.
+type RequestWithBody struct {
+	roundtripper.Request
+	Body io.Reader
+}
+
+// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/1031
+// remove this when sigs.k8s.io/gateway-api/conformance/utils/roundtripper is able to send request with body.
+// MakeCallRoundTripper executes an HTTP request using the provided RoundTripper and captures the request and response.
+func MakeCallRoundTripper(t *testing.T, r roundtripper.RoundTripper, request *RequestWithBody) (*roundtripper.CapturedRequest, *roundtripper.CapturedResponse, error) {
+	client := &http.Client{}
+
+	defaultRoundTripper, ok := r.(*roundtripper.DefaultRoundTripper)
+	if !ok {
+		t.Fatalf("Unsupported RoundTripper type: %T", r)
+	}
+	rt := defaultRoundTripper
+	if request.UnfollowRedirect {
+		client.CheckRedirect = func(_ *http.Request, _ []*http.Request) error {
+			return http.ErrUseLastResponse
+		}
+	}
+
+	client.Transport = &http.Transport{
+		DialContext: rt.CustomDialContext,
+		// We disable keep-alives so that we don't leak established TCP connections.
+		// Leaking TCP connections is bad because we could eventually hit the
+		// threshold of maximum number of open TCP connections to a specific
+		// destination. Keep-alives are not presently utilized so disabling this has
+		// no adverse effect.
+		//
+		// Ref. 
https://github.com/kubernetes-sigs/gateway-api/issues/2357
+		DisableKeepAlives: true,
+	}
+
+	method := "GET"
+	if request.Method != "" {
+		method = request.Method
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), rt.TimeoutConfig.RequestTimeout)
+	defer cancel()
+	req, err := http.NewRequestWithContext(ctx, method, request.URL.String(), request.Body)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	if request.Host != "" {
+		req.Host = request.Host
+	}
+
+	if request.Headers != nil {
+		for name, value := range request.Headers {
+			req.Header.Set(name, value[0])
+		}
+	}
+
+	if rt.Debug {
+		var dump []byte
+		dump, err = httputil.DumpRequestOut(req, true)
+		if err != nil {
+			return nil, nil, err
+		}
+
+		tlog.Logf(request.T, "Sending Request:\n%s\n\n", formatDump(dump, "< "))
+	}
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer resp.Body.Close()
+
+	if rt.Debug {
+		var dump []byte
+		dump, err = httputil.DumpResponse(resp, true)
+		if err != nil {
+			return nil, nil, err
+		}
+
+		tlog.Logf(request.T, "Received Response:\n%s\n\n", formatDump(dump, "< "))
+	}
+
+	cReq := &roundtripper.CapturedRequest{}
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// we cannot assume the response is JSON
+	if resp.Header.Get("Content-type") == "application/json" {
+		err = json.Unmarshal(body, cReq)
+		if err != nil {
+			return nil, nil, fmt.Errorf("unexpected error reading response: %w", err)
+		}
+	} else {
+		cReq.Method = method // assume it made the right request if the service being called isn't echoing
+	}
+
+	cRes := &roundtripper.CapturedResponse{
+		StatusCode:    resp.StatusCode,
+		ContentLength: resp.ContentLength,
+		Protocol:      resp.Proto,
+		Headers:       resp.Header,
+	}
+
+	if resp.TLS != nil {
+		cRes.PeerCertificates = resp.TLS.PeerCertificates
+	}
+
+	if roundtripper.IsRedirect(resp.StatusCode) {
+		redirectURL, err := resp.Location()
+		if err != nil {
+			return nil, nil, err
+		}
+		cRes.RedirectRequest = &roundtripper.RedirectRequest{
+			Scheme: redirectURL.Scheme,
+			Host:   redirectURL.Hostname(),
+			Port:   redirectURL.Port(),
+			Path:   redirectURL.Path,
+		}
+	}
+
+	return cReq, cRes, nil
+}
+
+var startLineRegex = regexp.MustCompile(`(?m)^`)
+
+func formatDump(data []byte, prefix string) string {
+	data = startLineRegex.ReplaceAllLiteral(data, []byte(prefix))
+	return string(data)
+}
diff --git a/docs/proposals/002-api-proposal/README.md b/docs/proposals/002-api-proposal/README.md
index f6d0c9e70..e82d4693c 100644
--- a/docs/proposals/002-api-proposal/README.md
+++ b/docs/proposals/002-api-proposal/README.md
@@ -122,13 +122,98 @@ type InferencePool struct {
 	metav1.ObjectMeta
 	metav1.TypeMeta
 
-	Spec InferencePoolSpec
+	Spec   InferencePoolSpec
+	Status InferencePoolStatus
 }
 
 type InferencePoolSpec struct {
-	// ModelServerSelector uses label selection to watch model server pods
+	// Selector defines a map of labels to watch model server pods
 	// that should be included in the InferencePool.
-	ModelServerSelector map[string]string `json:"modelServerSelector,omitempty"`
+	// In some cases, implementations may translate this field to a Service selector, so this matches the simple
+	// map used for Service selectors instead of the full Kubernetes LabelSelector type.
+	// If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
+	// Cross namespace selector is not supported.
+	Selector map[LabelKey]LabelValue `json:"selector"`
+
+	// TargetPortNumber defines the port number to access the selected model servers.
+	// The number must be in the range 1 to 65535.
+	TargetPortNumber int32 `json:"targetPortNumber"`
+
+	// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint
+	// picker service that picks endpoints for the requests routed to this pool.
+	EndpointPickerConfig `json:",inline"`
+}
+
+// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension.
+// This type is intended to be a union of mutually exclusive configuration options that we may add in the future.
+type EndpointPickerConfig struct {
+	// Extension configures an endpoint picker as an extension service.
+	ExtensionRef *Extension `json:"extensionRef,omitempty"`
+}
+
+// Extension specifies how to configure an extension that runs the endpoint picker.
+type Extension struct {
+	// Reference is a reference to a service extension.
+	ExtensionReference `json:",inline"`
+
+	// ExtensionConnection configures the connection between the gateway and the extension.
+	ExtensionConnection `json:",inline"`
+}
+
+// ExtensionReference is a reference to the extension.
+//
+// If a reference is invalid, the implementation MUST update the `ResolvedRefs`
+// Condition on the InferencePool's status to `status: False`. A 5XX status code MUST be returned
+// for the request that would have otherwise been routed to the invalid backend.
+type ExtensionReference struct {
+	// Group is the group of the referent.
+	// The default value is "", representing the Core API group.
+	Group *Group `json:"group,omitempty"`
+
+	// Kind is the Kubernetes resource kind of the referent. For example
+	// "Service".
+	//
+	// Defaults to "Service" when not specified.
+	//
+	// ExternalName services can refer to CNAME DNS records that may live
+	// outside of the cluster and as such are difficult to reason about in
+	// terms of conformance. They also may not be safe to forward to (see
+	// CVE-2021-25740 for more information). Implementations MUST NOT
+	// support ExternalName Services.
+	Kind *Kind `json:"kind,omitempty"`
+
+	// Name is the name of the referent.
+	Name ObjectName `json:"name"`
+
+	// The port number on the service running the extension. When unspecified,
+	// implementations SHOULD infer a default value of 9002 when the Kind is
+	// Service.
+	PortNumber *PortNumber `json:"portNumber,omitempty"`
+}
+
+// ExtensionConnection encapsulates options that configure the connection to the extension.
+type ExtensionConnection struct {
+	// Configures how the gateway handles the case when the extension is not responsive.
+	// Defaults to failClose.
+	FailureMode *ExtensionFailureMode `json:"failureMode"`
+}
+
+// ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not responsive.
+type ExtensionFailureMode string
+
+// PoolStatus defines the observed state of InferencePool from a Gateway.
+type PoolStatus struct {
+	// GatewayRef indicates the Gateway that observed the state of the InferencePool.
+	GatewayRef corev1.ObjectReference `json:"parentRef"`
+
+	// Conditions track the state of the InferencePool.
+ // + // Known condition types are: + // + // * "Accepted" + // * "ResolvedRefs" + Conditions []metav1.Condition `json:"conditions,omitempty"` } ``` @@ -147,6 +232,7 @@ type InferenceModel struct { metav1.TypeMeta Spec InferenceModelSpec + Status InferenceModelStatus } type InferenceModelSpec struct { @@ -154,7 +240,7 @@ type InferenceModelSpec struct { // ModelNames are expected to be unique for a specific InferencePool // (names can be reused for a different pool in the same cluster). // The modelName with the oldest creation timestamp is retained, and the incoming - // InferenceModel is sets the Ready status to false with a corresponding reason. + // InferenceModel's Ready status is set to false with a corresponding reason. // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. // Names can be reserved without an underlying model configured in the pool. // This can be done by specifying a target model and setting the weight to zero, @@ -172,8 +258,21 @@ type InferenceModelSpec struct { // If not specified, the target model name is defaulted to the ModelName parameter. // ModelName is often in reference to a LoRA adapter. TargetModels []TargetModel - // Reference to the InferencePool that the model registers to. It must exist in the same namespace. - PoolReference *LocalObjectReference + // PoolRef is a reference to the inference pool, the pool must exist in the same namespace. + PoolRef PoolObjectReference +} + +// PoolObjectReference identifies an API object within the namespace of the +// referrer. +type PoolObjectReference struct { + // Group is the group of the referent. + Group Group + + // Kind is kind of the referent. For example "InferencePool". + Kind Kind + + // Name is the name of the referent. + Name ObjectName } // Defines how important it is to serve the model compared to other models. @@ -181,13 +280,17 @@ type InferenceModelSpec struct { // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. type Criticality string const ( - // Most important. Requests to this band will be shed last. - Critical Criticality = "Critical" - // More important than Sheddable, less important than Critical. - // Requests in this band will be shed before critical traffic. - Default Criticality = "Default" - // Least important. Requests to this band will be shed before all other bands. - Sheddable Criticality = "Sheddable" + // Critical defines the highest level of criticality. Requests to this band will be shed last. + Critical Criticality = "Critical" + + // Standard defines the base criticality level and is more important than Sheddable but less + // important than Critical. Requests in this band will be shed before critical traffic. + // Most models are expected to fall within this band. + Standard Criticality = "Standard" + + // Sheddable defines the lowest level of criticality. Requests to this band will be shed before + // all other bands. + Sheddable Criticality = "Sheddable" ) // TargetModel represents a deployed model or a LoRA adapter. The @@ -200,24 +303,16 @@ const ( type TargetModel struct { // The name of the adapter as expected by the ModelServer. Name string - // Weight is used to determine the percentage of traffic that should be + // Weight is used to determine the percentage of traffic that should be // sent to this target model when multiple versions of the model are specified. 
-	Weight *int
+	Weight *int32
 }
 
-// LocalObjectReference identifies an API object within the namespace of the
-// referrer.
-type LocalObjectReference struct {
-	// Group is the group of the referent.
-	Group Group
-
-	// Kind is kind of the referent. For example "InferencePool".
-	Kind Kind
-
-	// Name is the name of the referent.
-	Name ObjectName
+// InferenceModelStatus defines the observed state of InferenceModel
+type InferenceModelStatus struct {
+	// Conditions track the state of the InferenceModel.
+	Conditions []metav1.Condition
 }
-
 ```
 
 ### Yaml Examples
 
 #### InferencePool(s)
 Here we create a pool that selects the appropriate pods
 ```yaml
-apiVersion: inference.x-k8s.io/v1alpha1
+apiVersion: inference.x-k8s.io/v1alpha2
 kind: InferencePool
 metadata:
   name: base-model-pool
-  modelServerSelector:
-  - app: llm-server
+spec:
+  selector:
+    app: llm-server
+  targetPortNumber: 8080
+  extensionRef:
+    name: infra-backend-v1-app
 ```
 
 #### InferenceModel
 
Here we consume the pool with two InferenceModels. Where `sql-code-assist` is both the name of the model and the name of the LoRA adapter on the model server. And `npc-bot` has a layer of indirection for those names, as well as a specified criticality. Both `sql-code-assist` and `npc-bot` have available LoRA adapters on the InferencePool and routing to each InferencePool happens earlier (at the K8s Gateway).
 ```yaml
-apiVersion: inference.x-k8s.io/v1alpha1
+apiVersion: inference.x-k8s.io/v1alpha2
 kind: InferenceModel
 metadata:
   name: sql-code-assist
 spec:
   modelName: sql-code-assist
-  poolRef: base-model-pool
+  poolRef:
+    name: base-model-pool
 ---
-apiVersion: inference.x-k8s.io/v1alpha1
+apiVersion: inference.x-k8s.io/v1alpha2
 kind: InferenceModel
 metadata:
   name: npc-bot
@@ -253,11 +353,12 @@ spec:
   modelName: npc-bot
   criticality: Critical
   targetModels:
-  targetModelName: npc-bot-v1
+  - name: npc-bot-v1
+    weight: 50
+  - name: npc-bot-v2
     weight: 50
-  targetModelName: npc-bot-v2
-    weight: 50
-  poolRef: base-model-pool
+  poolRef:
+    name: base-model-pool
 ```
diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md
index 02efbe5cb..e6c7cf3fc 100644
--- a/docs/proposals/003-model-server-protocol/README.md
+++ b/docs/proposals/003-model-server-protocol/README.md
@@ -21,10 +21,10 @@ effort.
 The corresponding metrics in vLLM are also shown in the table below, as vLLM is already integrated
 into the reference endpoint picker implementation.
 
-| Metric | Type | Description | vLLM metric |
-| ----- | ---- | ---- | ---- |
-| TotalQueuedRequests | Gauge | The current total number of requests in the queue.| `vllm:num_requests_waiting`|
-| KVCacheUtilization| Gauge | The current KV cache utilization in percentage.| `vllm:gpu_cache_usage_perc`|
+| Metric | Type | Description | vLLM metric | Triton TensorRT-LLM|
+| ----- | ---- | ---- | ---- | ---- |
+| TotalQueuedRequests | Gauge | The current total number of requests in the queue.| `vllm:num_requests_waiting`| `nv_trt_llm_request_metrics{request_type=waiting}`|
+| KVCacheUtilization| Gauge | The current KV cache utilization in percentage.| `vllm:gpu_cache_usage_perc`| `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}`|
 
 ### LoRA Adapter Serving
 
@@ -48,3 +48,10 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro
 * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"`
+* `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"`
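+
+For illustration, these metrics might appear on the model server's Prometheus endpoint roughly as follows; the metric and label names below follow vLLM's exposition and are shown only as an example, not as a requirement of this protocol:
+
+```
+vllm:num_requests_waiting 3.0
+vllm:gpu_cache_usage_perc 0.42
+vllm:lora_requests_info{running_lora_adapters="adapter1,adapter2",waiting_lora_adapters="",max_lora="4"} 1.0
+```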
+
+### Prefix Cache Reuse
+
+Starting from [v0.4.0](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.4.0),
+the EPP supports [prefix cache optimized request scheduling](https://gateway-api-inference-extension.sigs.k8s.io/guides/epp-configuration/prefix-aware/).
+To benefit from the optimal prefix aware request scheduling, model servers SHOULD support prefix
+cache reuse, such as the [vllm automatic prefix caching](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html) feature.
\ No newline at end of file
diff --git a/docs/proposals/004-endpoint-picker-protocol/README.md b/docs/proposals/004-endpoint-picker-protocol/README.md
index 5280e05cb..964dfc4b4 100644
--- a/docs/proposals/004-endpoint-picker-protocol/README.md
+++ b/docs/proposals/004-endpoint-picker-protocol/README.md
@@ -7,7 +7,7 @@ found [here](../../../pkg/epp/).
 
 This doc defines the protocol between the EPP and the proxy (e.g, Envoy). The EPP MUST implement the Envoy
-[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor) protocol.
+[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/ext_proc.proto) protocol.
 
 ## Endpoint Subset
 For each HTTP request, the proxy CAN communicate the subset of endpoints the EPP MUST pick from by setting an unstructured entry in the [filter metadata](https://github.com/envoyproxy/go-control-plane/blob/63a55395d7a39a8d43dcc7acc3d05e4cae7eb7a2/envoy/config/core/v3/base.pb.go#L819) field of the ext-proc request. The metadata entry for the subset list MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb.subset_hint`.
 
 If the key `x-gateway-destination-endpoint-subset` is set, the EPP MUST only select endpoints from the specified subset. If the EPP selects an endpoint that is not in the subset, or if the subset is empty, it is considered an error.
 
 If the key `x-gateway-destination-endpoint-subset` is not set, then the EPP MUST select from the set defined by the `InferencePool` selector.
 
 ## Destination Endpoint
-For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via:
+For each HTTP request, the EPP MUST communicate to the proxy one or more selected model server endpoints via:
 
-1. Setting the `x-gateway-destination-endpoint` HTTP header to the selected endpoint in <ip:port> format.
+1. Setting the `x-gateway-destination-endpoint` HTTP header to one or more selected endpoints.
 
-2. Set an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. The metadata entry for the picked endpoint MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`.
+2. Set an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. The metadata entry for the picked endpoints MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`.
The primary endpoint MUST be set using the key `x-gateway-destination-endpoint` as follows:
 
 ```go
 dynamicMetadata: {
   "envoy.lb": {
     "x-gateway-destination-endpoint": <ip:port>
   }
 }
 ```
+or:
+```go
+dynamicMetadata: {
+  "envoy.lb": {
+    "x-gateway-destination-endpoint": <ip:port>,<ip:port>,...
+  }
+}
+```
+
+The value of the header or metadata entry MUST contain at least one endpoint in `<ip:port>` format or multiple endpoints in `<ip:port>,<ip:port>,...` format. Multiple endpoints are separated by commas. The first valid endpoint in the list will be used. If retry is configured, the proxy will go sequentially down the list until one valid endpoint is found.
 
 Constraints:
 - If the EPP did not communicate the server endpoint via these two methods, it MUST return an error as follows:
   - [ImmediateResponse](https://github.com/envoyproxy/envoy/blob/f2023ef77bdb4abaf9feef963c9a0c291f55568f/api/envoy/service/ext_proc/v3/external_processor.proto#L195) with 503 (Serivce Unavailable) HTTP status code if there are no ready endpoints.
   - [ImmediateResponse](https://github.com/envoyproxy/envoy/blob/f2023ef77bdb4abaf9feef963c9a0c291f55568f/api/envoy/service/ext_proc/v3/external_processor.proto#L195) with 429 (Too Many Requests) HTTP status code if the request should be dropped (e.g., a Sheddable request, and the servers under heavy load).
-- The EPP MUST not set two different values in the header and the inner response metadata value.
+- The EPP MUST not set two different values in the header and the inner response metadata value. 
 - Setting different value leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence.
 
-### Destination endpoint fallback
-A single fallback endpoint CAN be set using the key `x-gateway-destination-endpoint-fallback` in the same metadata namespace as one used for `x-gateway-destination-endpoint` as follows:
-
-```go
-dynamicMetadata: {
-  "envoy.lb" {
-    "x-gateway-destination-endpoint-fallback": <ip:port>
-  }
-}
-```
-
-### Why envoy.lb namespace as a default?
+### Why envoy.lb namespace as a default? 
 The `envoy.lb` namespace is a predefined namespace. One common way to use the selected endpoint returned from the server, is [envoy subsets](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/subsets) where host metadata for subset load balancing must be placed under `envoy.lb`. Note that this is not related to the subsetting feature discussed above, this is an enovy implementation detail.
 
 ## Matching An InferenceModel
diff --git a/docs/proposals/006-scheduler/README.md b/docs/proposals/006-scheduler/README.md
new file mode 100644
index 000000000..77fc4c258
--- /dev/null
+++ b/docs/proposals/006-scheduler/README.md
@@ -0,0 +1,188 @@
+# Gateway API Scheduler
+
+Authors: @kfswain, @smarterclayton
+
+## Proposal Status
+ ***Draft***
+
+## Table of Contents
+
+
+
+- [Summary](#summary)
+- [Goals](#goals)
+- [Non-Goals](#non-goals)
+- [Proposal](#proposal)
+  - [Personas](#personas)
+  - [Requirements](#requirements)
+  - [Design](#design)
+  - [Alternatives](#alternatives)
+- [FAQ](#faq)
+- [Open Questions](#open-questions)
+
+
+
+## Summary
+
+The inference gateway leverages insight into the anticipated cost of a request and a dynamic capacity model of the backend to achieve higher utilization and more predictable response latency than random balancing can achieve. It should accomplish this over multiple optimization dimensions including, but not limited to:
+ - prompt length
+ - anticipated output length
+ - current traffic distribution
+ - available backend kv-cache
+ - workload latency objective
+ - anticipated savings from prefix cache aware routing
+ - heterogeneous accelerator performance
+ - backend topology (such as prefill disaggregation or different model server tuning).
+
+ This unified model can better serve diverse workloads on shared models with fewer accelerators as it is reactive to current traffic rather than defined up front. The scheduler selects endpoints on these optimization dimensions, effectively acting as the enforcement of these decisions.
+
+This proposal defines this *scheduler* subsystem and clearly defines scope, with the possibility of extending scope via future proposals.
+
+## Goals
+
+- The scheduler should be reasonably fast - decide request mapping to endpoints within O(10ms) on average
+- The scheduler should be effective - requiring little configuration out of the box to get great performance
+- The scheduler should be maintainable - new in-tree features should compose cleanly
+- The scheduler should be extensible - downstream consumers should expect some stability of the code interface
+- The scheduler should be enrichable - extending the [model server protocol](../003-model-server-protocol/) with new metrics or adding a new source of data should be minimally invasive
+- The scheduler should be pluggable - the reference endpoint picker implementation should support build time plugins, through a clearly defined interface, or fully delegating scheduling decisions per pool to an alternative **replacement scheduler**
+
+## Non-Goals
+
+- Dynamic reconfiguration of the reference scheduler algorithms at runtime
+- Being a general scheduler framework for any type of load balancing besides inference
+- Determining the characteristics of the underlying model servers and hardware
+
+## Proposal
+
+### Definitions
+
+#### Scheduler
+
+The 'scheduler' as referred to in this proposal, and repo, is the subsystem that operates _after_ any queuing mechanism, and is the algorithm that actuates on the different optimization dimensions & model server data, selecting the endpoint that best serves the workload and best consumes the underlying compute capacity.
+
+Any reference to scheduler performance is scoped to this subsystem & not the EPP as a whole.
+
+#### Saturation
+
+As model servers accrue requests to compute in the batch, the latency of each batch cycle increases, and so the latency of a single request will also increase. This increase in latency also increases throughput (serving multiple requests in parallel).
+
+Saturation defines the point at which the latency/throughput tradeoff is no longer efficient. For the scope of inference gateway, and this proposal, we will define two saturation definitions:
+- Hard Saturation - the model server is completely at capacity, and requests will now be queued and/or evicted.
+- Soft Saturation - a saturation limit dictated by the latency sensitivity of the workloads using it.
+  - i.e., if a model server is saturated to the point that requests sent to it will not achieve the latency SLO, those requests (and the model server) can be considered to be in an 'unusable' state.
+
+Subsequent designs will expand on this work.
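+
+As a minimal illustration, a soft-saturation check could be built from the metrics the [model server protocol](../003-model-server-protocol/) already requires; the threshold values below are invented for this sketch and would, in practice, be derived from workload latency objectives:
+
+```go
+package sketch
+
+// serverMetrics mirrors the model server protocol metrics; the thresholds
+// consumed by softSaturated are assumptions for this example.
+type serverMetrics struct {
+	TotalQueuedRequests int     // e.g., vllm:num_requests_waiting
+	KVCacheUtilization  float64 // e.g., vllm:gpu_cache_usage_perc, in [0.0, 1.0]
+}
+
+// softSaturated reports whether routing more latency-sensitive traffic to this
+// server is likely to violate the workload's latency SLO.
+func softSaturated(m serverMetrics, maxQueued int, maxKVCacheUtil float64) bool {
+	return m.TotalQueuedRequests > maxQueued || m.KVCacheUtilization > maxKVCacheUtil
+}
+```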
+#### Request Cost
+
+The 'cost' of an inference request is simply the amount of resource(s) the request will consume. In the context of this proposal, the resource(s) considered are the GPU memory & GPU compute time, usually in terms of *saturation* of the model server.
+- Ex: This 200 token prompt that has no prefix cache hit is projected to have 456 output tokens, and so will take up X amount of GPU memory and should take ~Y time to complete, thereby contributing to the saturation of model server Z for that Y time.
+
+### Personas
+
+These are the personas we target with the scheduler subsystem:
+
+#### OSS Algorithm Researcher
+
+The OSS Researcher forks and extends the reference scheduler to add new algorithmic improvements and shows how it impacts workloads. They:
+
+- Provide a replacement scheduler OR extend the reference scheduler
+- Test their changes repeatedly against a set of scenarios
+- Validate that their changes do not regress other scenarios
+- Propose changes to the reference scheduler or the replacement scheduler protocol
+
+#### Production Algorithm Contributor
+
+The production algorithm contributor is an ML engineer or platform owner who observes that a specific scheduling outcome is non-optimal for their workloads and must rapidly fix the issue and get it live. They:
+
+- Fix a scheduler bug OR extend the reference scheduler with changes specific to their environment
+- Quickly deploy a custom EPP with their changes to their environment, and sustain that fix until upstream merges
+- Add new test cases to validate their issue is resolved and does not introduce a regression
+- If necessary, open a feature request and proposal to cover the novel requirement
+
+#### Inference Platform Admin
+
+The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads. They:
+
+- Configure the model server under an InferencePool to accomplish the objectives of the workloads
+- Configure the scheduler associated with an InferencePool to be more efficient or more predictable
+- Observe rollouts for degradation of existing workload performance and stop rollout
+
+#### Inference Workload Owner
+
+An Inference Workload Owner persona owns and manages 1 or many Generative AI Workloads. They:
+
+- Configure API objects to leverage new algorithm features in test and production environments
+- Reproducibly measure production traffic against new algorithms
+- Identify regressions in performance when new algorithm changes are rolled out via alerting
+
+### Requirements
+
+We desire the following outcomes from the reference scheduler:
+
+1. Allow model servers to more predictably approach saturation
+2. Make user-visible request latency more predictable
+3. Provide isolation between multiple workloads on the same model servers before saturation
+4. Prioritize and fairly share resources between multiple workloads on the same model servers after saturation
+
+We desire the following outcomes from the act of using a modified, or replacement scheduler:
+
+1. Fast iteration with the ML ecosystem, namely other languages
+2. Use data from already integrated informers without having multiple implementations or copies running
+3. Acceptable speed of scheduling for 10-1000 QPS systems
+
+### Design
+
+We expect the following challenges to be addressed by the reference scheduler design:
+
+1. Understand the cost of an incoming request and its impact on the target model server before placing it
+2. Track the cost of previously issued requests to avoid overloading servers
+3. Integrate future cost features such as prefix cache routing into a holistic cost model
+4. Support heterogeneous model server capabilities in terms of capacity, latency, memory, and features
+
+In general, the cost of the request is the resources it will consume during its execution. That includes the fraction of compute and memory (as kv-cache) on the accelerator and may be modified by the workload's latency sensitivity (which requires more compute to be set aside to serve the request).
+
+#### Reference Scheduler
+
+The reference scheduler will be a Golang scheduler interface that is expected to run cooperatively with other instances of the scheduler with the same configuration, or with at most one version/config skew.
+
+The reference scheduler receives a list of **candidate endpoints** from the EPP and is responsible for selecting a match.
+
+The reference scheduler is **informed** about the current state of model servers via **informers**, of which the current informer is a fast-polling loop retrieving model server metrics via the [model server protocol](../003-model-server-protocol/).
+
+The reference scheduler is configured with a series of **predicates** that **filter** candidate endpoints, removing impossible matches. If no matches or only one match is feasible, that endpoint is selected. If multiple matches are made, the scheduler will consult a list of configured **scorers** to **score** the matches into a **prioritized** list of endpoints, and then **sample** from that list (a minimal sketch of this flow appears below).
+
+Once an endpoint is selected, the endpoint is **assumed** to be running that request until the EPP observes the termination of that request (most common) OR an informer invalidates the execution of those requests. The scheduler must integrate the impact of assumed load with informer state, especially when traffic spikes.
+
+Given that we anticipate a significant amount of future work to integrate heterogeneous hardware (different generations / topologies) and heterogeneous server roles (prefill-heavy, prefill/decode split, latency objectives), we expect that there will be an **assignment** informer that partitions the candidate endpoints over multiple dimensions for the scheduler. This will decouple the scheduling algorithm from the process of determining the capacity and suitability of different model servers to different dimensions of request cost.
+
+#### Alternate Scheduler
+
+The alternate scheduler will be a low-latency mechanism for out-of-process execution of the core endpoint selection operation. The alternate scheduler will accept one or more requests to schedule, a list of endpoints, and optionally the associated informer state for those endpoints. The alternate scheduler will return a list of selected endpoints, the length of which is configurable. Schedulers can run in parallel with one another, with one scheduler selected as the source of truth, allowing for safe development of new scheduling algorithms that can operate on production traffic without impact.
+
+#### Scheduler Validation
+
+The proper functioning of the scheduler to prevent regression of performance is critical. A multi-level test strategy will be required:
+
+- Unit tests that verify scheduling decisions are accurate for all predicates and scorers
+- Integration tests that verify concurrent execution as well as cooperative scheduling
+- End to end tests that verify production traces against default scheduling achieve specific behavior
+
+ A benchmarking harness will be provided to capture and reproduce a production trace, primarily to aid algorithmic contributors. A small but diverse set of production traces will be used initially to anchor expectations, and scaling both the number of supported traces and efficient regression testing at scale will be critical.
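+
+To make the predicate/scorer flow described under Reference Scheduler concrete, here is a minimal sketch; the type and function names are hypothetical and do not reflect the actual scheduler interface:
+
+```go
+package sketch
+
+import (
+	"errors"
+	"math/rand"
+)
+
+type Request struct{ Model, Prompt string }
+type Endpoint struct{ Address string }
+
+type Predicate func(Request, Endpoint) bool   // filters out impossible matches
+type Scorer func(Request, Endpoint) float64   // scores feasible matches (assumed non-negative)
+
+func schedule(req Request, candidates []Endpoint, predicates []Predicate, scorers []Scorer) (Endpoint, error) {
+	// Filter: drop endpoints that fail any predicate.
+	var feasible []Endpoint
+	for _, e := range candidates {
+		ok := true
+		for _, p := range predicates {
+			if !p(req, e) {
+				ok = false
+				break
+			}
+		}
+		if ok {
+			feasible = append(feasible, e)
+		}
+	}
+	switch len(feasible) {
+	case 0:
+		return Endpoint{}, errors.New("no feasible endpoints")
+	case 1:
+		return feasible[0], nil
+	}
+	// Score each remaining endpoint, then sample weighted by total score.
+	scores := make([]float64, len(feasible))
+	total := 0.0
+	for i, e := range feasible {
+		for _, s := range scorers {
+			scores[i] += s(req, e)
+		}
+		total += scores[i]
+	}
+	r := rand.Float64() * total
+	for i, e := range feasible {
+		r -= scores[i]
+		if r <= 0 {
+			return e, nil
+		}
+	}
+	return feasible[len(feasible)-1], nil
+}
+```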
+
+ We anticipate that accelerator availability will limit the scale of e2e testing and contribution. We will develop a **model server stub** that can emulate the behavior of the core expected algorithm for model servers and does not require accelerators. We will support both time-accurate and configurable ratio emulation to allow fast execution.
+
+### Alternatives
+
+#### Replaceable but not extensible scheduler
+
+A non-extensible scheduler would be a black-box that could be replaced, and would be ideal if we do not intend the reference implementation to be featureful or if there is no wide set of scheduler features valuable to many users.
+
+Given that we desire to have a strong out of the box reference implementation that improves performance for many users with no configuration, we do not select this alternative.
+
+#### Highly-parameterizable scheduler
+
+A parameterizable scheduler would have a rich configuration syntax exposed to InferencePool admins (and potentially InferenceModel users). It would be ideal if most inference workloads had no similarities and every workload needed to be configured at the pool level or higher.
+
+Given that we desire to have a strong reference implementation that improves performance for many users with no out of the box configuration, and that we desire to have many implementations able to directly consume the InferenceModel and InferencePool APIs, we at this time recommend not exposing full configurability of the extension via the Inference* APIs (collectively referred to as Model Routing APIs). Instead, we recommend that algorithms be configurable via parameterization of the EPP until we have clear design evidence for a need to add new CRDs. At that time, in keeping with the project principles around API extension, we will reassess.
\ No newline at end of file
diff --git a/docs/proposals/0602-prefix-cache-aware-routing-proposal/README.md b/docs/proposals/0602-prefix-cache-aware-routing-proposal/README.md
new file mode 100644
index 000000000..468e3be8e
--- /dev/null
+++ b/docs/proposals/0602-prefix-cache-aware-routing-proposal/README.md
@@ -0,0 +1,124 @@
+# Prefix Cache Aware Request Scheduling
+
+## Overview
+
+Prefix caching is a well-known technique in LLM inference to save duplicate tensor computation for prompts with the same prefix tokens, and is available in many model servers or model as a service providers. Leveraging prefix caching can significantly boost system performance, especially the time to first token (TTFT). Given that EPP has a global view of requests and model servers in the `InferencePool`, it can schedule requests intelligently to maximize the global prefix cache hit rate.
+
+### Goals
+
+Implement a prefix aware scheduling algorithm on EPP to maximize the cache hit rate on the model servers.
+
+### Non-goals
+
+* Change how model server manages prefix caches, or add any prefix cache APIs.
+* Coordinate cache beyond accelerator HBM cache, such as remote caches.
+
+## Terminology
+
+In the gateway-api-inference-extension project, we use the term "request scheduling" to mean the process of estimating the cost of a request and placing it on the best backend server. This is different from "model routing", which oftentimes means picking the right model server endpoint based on cost, availability, etc. However, we acknowledge that various other projects use the term "routing" or "router" to mean what we call "request scheduling". In this doc, we use "scheduling" when referring to the inference extension, and "routing" or "router" when referring to other projects, respecting the terminology of those projects.
+
+## Existing Solutions
+
+[vLLM](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html) has the automatic prefix cache (APC) feature by caching in the accelerator HBM, and uses an LRU cache eviction strategy.
+
+[vLLM production stack](https://github.com/vllm-project/production-stack/issues/59) is exploring a prefix aware router to exploit the APC feature of vLLM. The WIP [PR](https://github.com/vllm-project/production-stack/issues/59#issuecomment-2677268482) implements two strategies: a HashTrie based matching and a SimHash based consistent hashing. The HashTrie solution is showing a better cache hit rate.
+
+[SGLang](https://github.com/sgl-project/sglang/blob/4d2a88bdffe91168dfc73ef7e3bc9100ba96686b/sgl-router/src/router.rs#L61) has a cache aware routing strategy which builds a radix tree based on request history.
+
+[AIBrix](https://aibrix.readthedocs.io/latest/features/distributed-kv-cache.html) uses a distributed prefix cache pool and has a customized vLLM to support loading cache from the pool. At request routing, it has a [Prefix Router](https://github.com/vllm-project/aibrix/blob/6feec99d77c84e371da9c535054c2b8aa8912704/pkg/plugins/gateway/algorithms/prefix_cache.go#L64) that maximizes prefix cache hits on model server HBM. It currently implements a hash based (similar to vLLM) and a radix tree based (similar to SGLang) matching strategy.
+
+[KubeAI](https://www.kubeai.org/blog/2025/02/26/llm-load-balancing-at-scale-chwbl/) uses a Consistent Hashing with Bounded Loads (CHWBL) algorithm which hashes request prefixes up to a configurable length (and therefore will lose some accuracy), and uses an "overflow" strategy when a server is heavily loaded.
+
+## Design Options
+
+### Session affinity
+
+Session affinity is based on client attributes such as IP address. It works well for use cases such as multi-turn conversations, where requests from the same client tend to share the same prefixes. This, of course, highly depends on the nature of the use case.
+
+Pros:
+
+* Easy to implement/understand
+
+Cons:
+
+* Limited use case
+* Does not exploit prefix cache between different clients
+* Using client IP isn't always reliable; clients will likely need to provide "session info" for good affinity
+
+### Prefix affinity consistent hashing
+
+This goes a step beyond session affinity by using a prefix aware hash function to schedule requests with similar prefixes to the same or similar servers. A naive hash function can be just taking the hash of the first N characters/tokens of the request, so that all requests with the same first N characters/tokens will be scheduled to the same server. The [vLLM production stack](https://github.com/vllm-project/production-stack/issues/59) is exploring this strategy using SimHash, and preliminary experiments showed mixed results. KubeAI uses a simple strategy to only hash the request prefix up to a configurable `prefixCharLength`. Its effectiveness is likely highly dependent on the input length distribution.
+
+Pros:
+
+* (Compared to session affinity) Is aware of prefix and not limited to per-client affinity
+* Small memory overhead (just need to store the ring of the servers)
+
+Cons:
+
+* Highly depends on the effectiveness of the prefix aware hash function.
+* Consistent hashing can be challenging to reason about.
+
+### Report prefix cache indexes on the EPP
+
+If the EPP knows what prefixes are currently cached on each model server replica, it can make the optimal decision. A potential solution is to have the model server (or a sidecar) report the kv cache indexes to the EPP.
+
+Pros:
+
+* Best cache hit rate in theory
+
+Cons:
+
+* Requires API changes on the model servers to report the cache indexes.
+* Reporting the cache indexes in real time requires non-trivial network bandwidth.
+
+### Approximate prefix index on the EPP
+
+This builds on the intuition that if `requestA=prefix+XX` was scheduled to server 1, then scheduling `requestB=prefix+YY` to the same server will likely hit its prefix cache. Therefore the EPP can build an approximate index table of the prefix caches on all the backend servers, by mimicking a similar cache eviction strategy to that of the model server (e.g., LRU).
+
+Pros:
+
+* (Compared to the session affinity strategy) Broader application to most use cases and doesn't require any client integration.
+* (Compared to the consistent hashing strategy) Easy to implement and explain, and is more effective.
+
+Cons:
+
+* Relies on knowledge of the cache eviction strategy of the model server, and may need careful tuning for different environments (e.g., model servers with different total kv cache space may have different cache eviction characteristics).
+* Complexity in managing cache state (eviction, memory limit).
+* An in-memory cache is preferred for high performance. However, that means the cache needs to be rebuilt after restarts. Moreover, cache hit performance decreases with multiple active EPP replicas.
+
+## Proposal
+
+Based on the above discussion, I propose implementing the "Approximate prefix index on the EPP" solution, which offers fast time to market and automatic prefix cache awareness (without needing client integration), at the cost of degraded performance when the EPP is sharded.
+
+A request is broken down into N chunks of the same number of characters (we don’t necessarily need to tokenize). For each chunk we will calculate a hash based on the **content of the chunk + hash of the prefix**: `hash(chunk i) = hash(chunk i content + hash(chunk i-1))`. This gives us a nice property: if we find a match for a chunk hash, then we know all its prefix chunk hashes match as well. This is very similar to how vLLM does it (see the sketch below).
+
+When we schedule a request `r1` with `N` chunks to a server `s1`, we update the approximate cache index table like so:
+
+```
+hash(chunk 1): append s1
+hash(chunk 2): append s1
+…
+hash(chunk N): append s1
+```
+
+This means all these N chunks are cached on server `s1`.
+
+When the EPP receives a new request `r2`, we calculate its chunk hashes, and look up the table to find a server with the longest prefix match.
+
+[Image source](https://docs.google.com/drawings/d/1KL5DKh42Z_XzvcnejUcRymu99_HwW9y8U29IrPzRCss/edit?usp=sharing)
+
+## How does prefix cache affinity work with LoRA affinity and load-aware scheduling
+
+1. Prefix cache needs to be LoRA aware, as different adapters don’t share the same kv cache. Therefore, when finding prefix matches, we only match for the same model/adapter.
+2. Prefix affinity needs to be aware of the server load and avoid overloading servers. We can calculate a combined weighted score of servers depending on prefix cache hit ratio, queue length and kv-cache utilization to achieve a good balance between prefix cache affinity and load balancing.
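+
+A minimal sketch of the approximate index described above follows; the names (`chunkSize`, `prefixIndex`) and the fixed chunk size are illustrative assumptions, not the EPP implementation:
+
+```go
+package sketch
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+)
+
+const chunkSize = 64 // characters per chunk; an assumed tuning knob
+
+// chunkHashes returns hash(chunk i content + hash(chunk i-1)) for each full chunk,
+// so a match on chunk i implies all earlier chunks match as well.
+func chunkHashes(model, prompt string) []string {
+	var hashes []string
+	prev := model // seeding with the model/adapter keeps matches LoRA-aware (point 1 above)
+	for start := 0; start+chunkSize <= len(prompt); start += chunkSize {
+		sum := sha256.Sum256([]byte(prompt[start:start+chunkSize] + prev))
+		prev = hex.EncodeToString(sum[:])
+		hashes = append(hashes, prev)
+	}
+	return hashes
+}
+
+// prefixIndex maps a chunk hash to the set of servers believed to have it cached.
+type prefixIndex map[string]map[string]struct{}
+
+// recordScheduled updates the index after a request with these chunk hashes is
+// scheduled to server. A real index would also evict entries to mimic the model
+// server's LRU behavior and bound memory use.
+func (idx prefixIndex) recordScheduled(hashes []string, server string) {
+	for _, h := range hashes {
+		if idx[h] == nil {
+			idx[h] = map[string]struct{}{}
+		}
+		idx[h][server] = struct{}{}
+	}
+}
+
+// longestPrefixMatch returns a server with the longest matching prefix and the
+// number of chunks matched (0 if there is no match). Hashes are ordered, so
+// scanning from the end finds the longest match first.
+func (idx prefixIndex) longestPrefixMatch(hashes []string) (string, int) {
+	for i := len(hashes) - 1; i >= 0; i-- {
+		for server := range idx[hashes[i]] {
+			return server, i + 1
+		}
+	}
+	return "", 0
+}
+```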
+
+## How does prefix cache affinity work with LoRA affinity and load-aware scheduling
+
+1. Prefix cache needs to be LoRA aware, as different adapters don’t share the same kv cache. Therefore when finding prefix matches, we only match within the same model/adapter.
+2. Prefix affinity needs to be aware of the server load and avoid overloading servers. We can calculate a combined weighted score per server from its prefix cache hit ratio, queue length and kv-cache utilization to achieve a good balance between prefix cache affinity and load balancing, as sketched below.
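+
+A minimal sketch of such a combined score (the weights and the [0,1] normalization of the load signals are illustrative assumptions, not part of this proposal):
+
+```go
+// scoreServer combines prefix affinity with load signals; higher is better.
+// prefixHitRatio: fraction of the request's chunks found for this server.
+// queueLen and kvCacheUtil: load signals normalized into [0,1].
+func scoreServer(prefixHitRatio, queueLen, kvCacheUtil float64) float64 {
+	const wPrefix, wQueue, wKV = 0.5, 0.25, 0.25 // illustrative weights
+	return wPrefix*prefixHitRatio + wQueue*(1-queueLen) + wKV*(1-kvCacheUtil)
+}
+```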
+
+## Future work
+
+The main drawback of the proposed solution is the degraded performance when the EPP is sharded, as the in-memory cache index table loses the global view of all requests. To mitigate this issue, we can consider:
+
+* Establish a "prefix cache index reporting" protocol with model servers, and use a combination of the approximate cache index with the reported indexes. This can potentially work better than a solution purely based on reported indexes, as discussed in [`Solution 3`](https://github.com/kubernetes-sigs/gateway-api-inference-extension/discussions/678).
+* When scheduling a request with low or no prefix cache hits in the EPP's in-memory index table, use the consistent hashing strategy to improve the predictability of two EPPs picking the same server, instead of picking randomly.
\ No newline at end of file
diff --git a/docs/proposals/0683-epp-architecture-proposal/README.md b/docs/proposals/0683-epp-architecture-proposal/README.md
new file mode 100644
index 000000000..7bd688c73
--- /dev/null
+++ b/docs/proposals/0683-epp-architecture-proposal/README.md
@@ -0,0 +1,74 @@
+# EPP Architecture Proposal
+
+Author(s): @kfswain
+## Proposal Status
+ ***Draft***
+
+## Summary
+
+This proposal seeks to standardize the implementation of an EPP (End-point Picker) for the Inference Gateway extension (also known as Gateway API Inference Extension). Additionally, this proposes to restructure the current implementation of the EPP to be more modular and approachable.
+
+## Goals
+
+- Set a standard on how the EPP & APIs interact
+- Settle on common nomenclature for clearer communication
+- Allow for modularization of the EPP, to be extended to a user's specific needs
+
+## Non-Goals
+
+- Reshaping the current API
+- A change in scope of the current project
+
+## Proposal
+
+This proposal is not proposing any net new features; instead, we are refactoring our current implementation to better handle more devs, more features, etc. At the time of writing, GIE is currently at v0.3, and that stronger experimental context (along with external feedback) made clear the need for this restructure. The image below gives a high-level view of how our components work together.
+
+Scheduling Algorithm
+
+## Overview
+At a quick glance, the EPP is being broken into specific layers. The `Data Layer` is of note, as it is a vertical that will be accessed by all the others. The data layer manages the k8s data, metric & usage data, as well as processing of the above data to determine resource scarcity regimes.
+
+The other layers are handled sequentially, starting with the **Ext-Proc** call. The request is buffered and then sent to the **Routing Layer**, which first processes any user-defined per-InferenceModel routing rules & request enrichment (at the time of writing, that is just translating the InferenceModel name to a weight-split actual model). Then _all_ requests pass through the to-be-implemented [**Flow Controller**](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674) to ensure that any request entering the pool adheres to the guidelines set by the Priority, Fairness, & Queueing configuration. And finally, the **Scheduling Layer** is the load balancing algorithm that intelligently routes requests based on the current state of the InferencePool.
+
+## Components
+
+To further expand upon these component layers, we will first break them into `extensible` and `non-extensible` layers. `Non-extensible` layers are intended to be static, and handled on behalf of the user, typically implementing low-opinion infrastructure.
+
+The `Extensible` layers are:
+- Data Layer
+- Routing Layer
+- Flow Controller
+- Scheduling Layer
+
+The `Non-Extensible` layer(s) are:
+- The Ext-Proc Server
+
+### `Extensible`
+
+#### Data Layer
+
+The data layer will consume and store: the InferencePool/InferenceModel config and the pre-defined [Model Server Protocol](../003-model-server-protocol/README.md). Additionally, the data fed from the model servers will be processed and digested to provide resource scarcity regime hints, and autoscaling recommendations.
+
+Many extensions to scheduling will require changes to the ingested metrics; as such, the data layer will be built to be extended, but extenders accept that the Model Server Protocol will no longer provide guarantees on portability of a model server out of the box.
+
+#### Routing Layer
+
+The routing layer is likely to be the most opinion-heavy section, as the scope of what constitutes a 'Route Rule' is somewhat broad. The current examples we expect would be:
+
+- System Prompt injection
+- RAG callout
+- Per-InferenceModel request validation (such as safety/on-topic, etc)
+
+Because this could become a bit of a dumping ground, the API will keep a _very_ tight scope on which of these route rules are included in the spec. A standard method of extension will be provided if the need to define a custom rule arises.
+
+#### Flow Controller (WIP - implementation tracked in [#674](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674))
+
+The flow controller will consume resource regime data, and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
+
+
+
+### `Non-extensible`
+
+#### Ext-Proc Server
+
+The Ext-Proc Server protocol is very well defined & specific; deviation could cause the EPP to become unusable or unstable. Extension is ill-advised.
diff --git a/docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg b/docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg
new file mode 100644
index 000000000..4c5857281
--- /dev/null
+++ b/docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/proposals/0845-scheduler-architecture-proposal/README.md b/docs/proposals/0845-scheduler-architecture-proposal/README.md
new file mode 100644
index 000000000..4141ce6a2
--- /dev/null
+++ b/docs/proposals/0845-scheduler-architecture-proposal/README.md
@@ -0,0 +1,93 @@
+# Scheduling Subsystem Architecture
+
+Author(s): @kfswain, @ahg-g, @nirrozenbaum
+## Proposal Status
+ ***Draft***
+
+## Summary
+The Scheduling Subsystem is a framework used to implement scheduling algorithms. High level definition [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/006-scheduler) & EPP Architecture [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).
+
+## Design Principles
+- The scheduler framework should act as an independent library; there should be no dependency on EPP packages defined outside of the scheduler
+- The *framework* should be agnostic to endpoint types (such as model servers), and K8s concepts.
+  - Opinions should be held by the plugins, not the framework
+- The entry & exit points should be defined by the framework, acting as the API surface of the system
+- Multiple scheduling 'profiles' should be able to be run for a single request.
+  - They can be conditionally dependent on previous runs, or run in parallel
+- State management
+  - State per request: This is managed by what we are calling CycleState and its lifecycle is tied to the request.
+    CycleState is created internally by the Scheduler per request and its pointer is passed as an argument.
+  - State managed by the plugin struct itself: The lifecycle of this state is tied to the plugin, and since plugins will be instantiated once,
+    it is state that plugins can use across requests (like a prefix-cache index).
+  - State managed by the data layer: each endpoint will be associated with state (currently metrics) that a data layer plugin can add to it.
+    A data layer plugin could, for example, be one that scrapes `v1/models` from the endpoint.
+
+## Definitions
+- **Scheduling Framework** - The system created to allow for a pluggable scheduling algorithm.
+- **Scheduler Profile** - A named, specific set of Filter(s), Scorer(s), & Picker used to select endpoints.
+- **Scheduler Profile Run** - A one-time run of the Scheduler Profile's filters, scorers and picker for a given request.
+- **Scheduler** - An extensible implementation of a scheduling algorithm. Including logic to select Scheduler Profiles iteratively,
+  the Scheduler Profiles themselves, & logic to interpret the result.
+- **Scheduling Cycle** - A single run of a Scheduler through the Scheduling Framework. A scheduling cycle includes one or
+  more Scheduler Profile runs (at least one).
+- **Plugin** - Implementation of framework-defined interface(s) to add or extend logic across the framework.
+
+## Proposal
+
+The Scheduling System draws inspiration from the kube-scheduler's pluggable system, though there are distinct differences in goals/usage.
+
+The Scheduling System can loosely be defined into 3 sections:
+- A *framework* to implement the system
+- The *interfaces* that a consumer can use to extend the system
+- A *configuration API* to define the Scheduler, Profile(s), & the plugins used within those profiles
+
+A sketch of the system, with extension points, is here:
+Scheduling Algorithm
+
+Describing the interface extension points & flow is the simplest way to convey the intent of what the framework should enable:
+
+### ProfileHandler
+
+ProfileHandler is a scheduler plugin with two extension points: ProfilePick and ProcessProfilesResults.
+Below is a detailed explanation of these extension points.
+Only a single ProfileHandler plugin may be defined per scheduler.
+
+### ProfilePick
+
+ProfilePick is the entry point into the scheduling cycle (called by the framework).
+It selects profiles conditionally based on:
+
+- Request data
+- Results of previously executed SchedulerProfiles
+- Cycle State
+
+ProfilePick will be called continuously so long as profiles are returned; multiple profiles may be returned in a single call.
+The ProfilePick extension point will be configured as part of a ProfileHandler plugin.
+Since there is only a single ProfileHandler plugin, there is only a single ProfilePick function.
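+
+As an illustration, here is a hypothetical ProfileHandler for a prefill/decode split (the profile names mirror the example configuration in this proposal, and the interfaces are the ones defined in `interfaces/interface.go` below; this is a sketch, not a prescribed implementation):
+
+```go
+// pdHandler is a hypothetical ProfileHandler for a prefill/decode split.
+// It returns one profile per Pick call; returning nothing ends the cycle.
+type pdHandler struct{}
+
+func (h *pdHandler) Type() string { return "prefill-decode-handler" }
+
+func (h *pdHandler) Pick(request *Request, profiles map[string]*SchedulerProfile,
+	executionResults map[string][]*ScoredEndpoint) map[string]*SchedulerProfile {
+	for _, name := range []string{"prefill", "decode"} {
+		if _, ran := executionResults[name]; !ran {
+			return map[string]*SchedulerProfile{name: profiles[name]}
+		}
+	}
+	return nil // both profiles have run; end the scheduling cycle
+}
+
+func (h *pdHandler) ProcessResults(request *Request,
+	profileResults map[string][]*ScoredEndpoint) *SchedulingResult {
+	results := make(map[string][]*Endpoint, len(profileResults))
+	for name, scored := range profileResults {
+		for _, se := range scored {
+			results[name] = append(results[name], &se.Endpoint)
+		}
+	}
+	// The decode profile's selection is what the request is ultimately sent to.
+	return &SchedulingResult{ProfileResults: results, PrimaryProfileName: "decode"}
+}
+```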
+
+### Scheduler Profile Run
+
+The SchedulerProfile run consists of 3 defined phases: `Filter`, `Score`, & `Pick`.
+
+*Profile Constraints*
+- A profile can have any number of `Filter` plugins registered (including zero)
+- A profile can have any number of `Score` plugins registered (including zero)
+- A profile MUST have exactly one `Pick` plugin registered
+
+
+#### Filter
+Filter runs before any scoring, and removes endpoints that are not fit for selection. The framework will return an error to the client if the endpoints are filtered to zero.
+
+#### Score
+Score applies a score to each remaining endpoint provided. Scorers SHOULD keep their score values in a normalized range: [0-1]. Any weighting should be added at the SchedulerProfile configuration level.
+
+#### Pick
+Picker selects the endpoint(s) from the provided list of scored endpoints. Picker MUST return at least one endpoint.
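+
+For intuition, a sketch of how a single profile run could compose these phases (assumptions: it lives in the same package as the interfaces below, and, purely for brevity, scorers return results in input order; neither is a contract of the framework):
+
+```go
+// runProfile sketches one SchedulerProfile run: Filter -> Score -> Pick.
+func runProfile(ctx context.Context, req *Request, state *scheduling.CycleState,
+	profile *SchedulerProfile, endpoints []*Endpoint) []*ScoredEndpoint {
+	// Filter phase: each filter narrows the candidate set.
+	for _, f := range profile.filters {
+		endpoints = f.Filter(ctx, req, state, endpoints)
+	}
+	if len(endpoints) == 0 {
+		return nil // surfaced by the framework as an error to the client
+	}
+	// Score phase: accumulate weighted, normalized ([0-1]) scores.
+	scored := make([]*ScoredEndpoint, len(endpoints))
+	for i, ep := range endpoints {
+		scored[i] = &ScoredEndpoint{Endpoint: *ep}
+	}
+	for _, ws := range profile.scorers {
+		for i, se := range ws.Score(ctx, req, state, endpoints) {
+			scored[i].Score += se.Score * float64(ws.weight) // weighting at profile level
+		}
+	}
+	// Pick phase: the single required picker selects the final endpoint(s).
+	return profile.picker.Pick(ctx, state, scored)
+}
+```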
+
+### ProcessProfilesResults
+ProcessProfilesResults receives the results of the scheduler profile run(s) and shapes the data to be consumed by the calling system.
+Since there is only a single ProfileHandler plugin, there is only a single ProcessProfilesResults function.
+
+## ConfigurationAPI
+TODO
\ No newline at end of file
diff --git a/docs/proposals/0845-scheduler-architecture-proposal/examples/example.yaml b/docs/proposals/0845-scheduler-architecture-proposal/examples/example.yaml
new file mode 100644
index 000000000..06725a981
--- /dev/null
+++ b/docs/proposals/0845-scheduler-architecture-proposal/examples/example.yaml
@@ -0,0 +1,34 @@
+#names are egregiously long, but attempting to describe custom logic within a name
+profileSelection: disagg-token-length
+schedulingResult: log-shadowbox-label-pd-result
+profiles:
+  prefill:
+    preschedule:
+      - decode-prefix-cache-check
+    filter:
+      - is-prefill
+      - has-required-accelerator
+    score:
+      - prefix-cache: 3
+      - latency-scorer: 2
+    selection:
+      - best-score
+    postschedule:
+      - log-full-scores
+  decode:
+    filter:
+      - is-decode
+    score:
+      - prefix-cache: 3
+      - kv-cache-util: 5
+    selection:
+      - random-top-3
+  shadowbox-decode:
+    filter:
+      - is-decode
+      - is-tpu
+    score:
+      - prefix-cache-v2: 4
+      - kv-cache-util: 1
+    selection:
+      - random-top-3
diff --git a/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png b/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png
new file mode 100644
index 000000000..f819e5032
Binary files /dev/null and b/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_cycle.png differ
diff --git a/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_subsystem.svg b/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_subsystem.svg
new file mode 100644
index 000000000..3186c1695
--- /dev/null
+++ b/docs/proposals/0845-scheduler-architecture-proposal/images/scheduler_subsystem.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/docs/proposals/0845-scheduler-architecture-proposal/interfaces/interface.go b/docs/proposals/0845-scheduler-architecture-proposal/interfaces/interface.go
new file mode 100644
index 000000000..35b787b35
--- /dev/null
+++ b/docs/proposals/0845-scheduler-architecture-proposal/interfaces/interface.go
@@ -0,0 +1,142 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"context"
+
+	scheduling "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+type Endpoint struct {
+	State EndpointState
+}
+
+type EndpointState struct {
+	// storage is per Scheduling Cycle, and so has no thread-safety concerns.
+	storage map[string]any //nolint:unused
+}
+
+// Request is a structured representation of the fields we parse out of the request body.
+type Request struct {
+	// RequestId is the Envoy-generated Id for the request being processed.
+	RequestId string
+	// TargetModel is the final target model after traffic split.
+	TargetModel string
+	// Prompt is the prompt that was sent in the request body.
+	Prompt string
+	// Headers is a map of the request headers.
+	Headers map[string]string
+}
+
+// ScoredEndpoint encapsulates an Endpoint with its Score.
+// The lifecycle of an endpoint is typically different from the lifecycle of a request.
+// This is intended to be used only internally by Scheduler logic and/or scheduler plugins within the lifecycle of the request.
+// When returning the selected Endpoint(s) out of the Scheduler, an Endpoint is returned without the score.
+type ScoredEndpoint struct {
+	Endpoint
+	Score float64
+}
+
+type Scheduler struct {
+	SchedulerConfig
+}
+
+// SchedulerConfig is the struct that maps to the configuration file that should be further discussed.
+// The configuration file should include the ProfileHandler plugin as well as the profiles with their plugins.
+type SchedulerConfig struct {
+	// exactly one ProfileHandler instance is required.
+	profileHandler ProfileHandler //nolint:unused
+	// map from profile name to its set of plugins.
+	profiles map[string]*SchedulerProfile //nolint:unused
+}
+
+// SchedulerProfile is used to describe a profile that will
+// run for a given scheduling cycle.
+type SchedulerProfile struct {
+	// Filters lists all Filter plugins associated with this Profile.
+	// Filters are optional.
+	filters []Filter //nolint:unused
+	// Scorers lists all Score plugins associated with this Profile.
+	// Scorers are optional.
+	scorers []*WeightedScorer //nolint:unused
+	// Picker returns the function that picks the endpoint(s). Picker is required.
+	picker Picker //nolint:unused
+}
+
+type SchedulingResult struct {
+	ProfileResults     map[string][]*Endpoint // a map from profile name to its scheduling result
+	PrimaryProfileName string                 // key of the primary profile, whose selected endpoints will be used by default as the destination
+}
+
+// Plugin is the parent type for all the scheduling framework plugins.
+type Plugin interface {
+	Type() string
+}
+
+// ProfileHandler defines the interface for handling multiple SchedulerProfile instances.
+// More specifically, this interface defines two extension points: 'Pick', which runs
+// iteratively, and 'ProcessResults', which runs after all profile runs complete
+// and processes the results of all profiles.
+type ProfileHandler interface {
+	Plugin
+	// Pick picks the SchedulerProfile objects to run from a list of candidate profiles,
+	// while taking into consideration the request properties
+	// and the previously executed SchedulerProfile runs along with their results.
+	// returns:
+	// - profiles - A subset of the registered scheduling profiles to be run in the next iteration
+	Pick(request *Request, profiles map[string]*SchedulerProfile, executionResults map[string][]*ScoredEndpoint) map[string]*SchedulerProfile
+
+	// ProcessResults handles the outcome of each profile run.
+	// It may aggregate results, log test profile outputs, or apply custom logic. It specifies in the SchedulingResult the
+	// key of the primary profile that should be used to get the request's selected destination.
+	// Example: suppose you have 2 profiles, a ShadowBoxing Profile & a Production Profile.
+	// ProcessResults would know to simply log the result of the ShadowBoxing
+	// profile, and do nothing else with it.
+	ProcessResults(request *Request, profileResults map[string][]*ScoredEndpoint) *SchedulingResult
+}
+
+// Filter runs before any scoring, and removes endpoints that are not fit for selection.
+// The framework will return an error to the client if the endpoints are filtered to zero.
+type Filter interface {
+	Plugin
+	Filter(ctx context.Context, request *Request, state *scheduling.CycleState, endpoints []*Endpoint) []*Endpoint
+}
+
+// Scorer applies a score to each remaining endpoint provided.
+// Scorers SHOULD keep their score values in a normalized range: [0-1].
+// Any weighting should be added at the SchedulerProfile configuration level.
+type Scorer interface {
+	Plugin
+	Score(ctx context.Context, request *Request, state *scheduling.CycleState, endpoints []*Endpoint) []*ScoredEndpoint
+}
+
+// WeightedScorer is a struct that encapsulates a scorer with its weight.
+// We need this struct in order to be able to keep scorers in a profile as a slice instead of a map.
+// This is very useful for having a generic AddPlugin function that registers a plugin to all its extension points.
+// Using a map is much less convenient for this purpose.
+type WeightedScorer struct {
+	Scorer
+	weight int //nolint:unused
+}
+
+// Picker selects the endpoint(s) from the provided list of scored endpoints.
+// Picker MUST return at least one endpoint.
+type Picker interface {
+	Plugin
+	Pick(ctx context.Context, state *scheduling.CycleState, endpoints []*ScoredEndpoint) []*ScoredEndpoint
+}
diff --git a/docs/proposals/1023-data-layer-architecture/README.md b/docs/proposals/1023-data-layer-architecture/README.md
new file mode 100644
index 000000000..e39264319
--- /dev/null
+++ b/docs/proposals/1023-data-layer-architecture/README.md
@@ -0,0 +1,176 @@
+# Data Layer Architecture Proposal
+
+Author(s): @elevran @nirrozenbaum
+
+## Proposal Status
+
+***Draft***
+
+## Summary
+
+The EPP Architecture proposal identifies the need for an extensible
+ [Data Layer](../0683-epp-architecture-proposal/README.md#data-layer).
+ Recently, the scheduling subsystem underwent a major [architecture change](../0845-scheduler-architecture-proposal/README.md)
+ to allow easier extension and pluggability. This proposal aims to apply
+ similar extensibility to the Data Layer subsystem, allowing custom inference
+ gateways to extend the Gateway API Inference Extension (GIE) for their use
+ cases without modifying the core GIE code base.
+
+See [this document](https://docs.google.com/document/d/1eCCuyB_VW08ik_jqPC1__z6FzeWO_VOlPDUpN85g9Ww/edit?usp=sharing) for additional context and reference.
+
+## Goals
+
+The Data Layer pluggability effort aims to address the following goals and
+ requirements:
+
+- Make endpoint attributes used by GIE components accessible via well defined
+ Data Layer interfaces.
+- Enable collection of an additional (or different) subset of attributes from an
+ existing data source (e.g., the `/metrics` endpoint scraper).
+- Add a new data source that collects attributes not already collected.
+- Follow best practices and experience from the Scheduling subsystem
+ pluggability effort. For example, extending the system to support the above
+ should be through implementing well defined Plugin interfaces and registering
+ them in the GIE Data Layer subsystem; any configuration would be done in the
+ same way (e.g., code and/or configuration file), etc.
+- Be efficient (RAM, CPU, concurrency) in collecting and storing attributes.
+- Limit the change blast radius in GIE when making the above changes. Core GIE code
+ should not need to be modified in order to support collecting and storing new
+ attributes. Affected code should be scoped only to modules that make use of
+ the new attributes.
+- The extensions should not increase coupling between GIE subsystems and
+ Kubernetes (i.e., the environment specific code should be encapsulated and
+ not “leaked” into the subsystem and its users).
+- (Future) Allow non-uniform data collection (i.e., not all endpoints share the
+ same data).
+
+## Non-Goals
+
+- Modify existing GIE abstractions, such as `InferencePool`, to conform to the
+ Data Layer pluggability design. They are to remain first class concepts, as
+ today.
+- Enable reconciliation or modification of external state. The data sources are
+ strictly read-only. For example, a data source accessing Kubernetes state as part of
+ the data collection would be registered for `Watch()` notifications and shall not
+ receive access to a k8s client.
+- Inference scheduler Plugins that rely on custom data collection accept that
+ the [Model Server Protocol](../003-model-server-protocol/README.md) no longer
+ provides guarantees on portability of a model server out of the box.
+- The intent is *not* to introduce a new scraping mechanism, but to continue supporting
+ the current model of a Go-routine per endpoint.
+
+## Proposal
+
+### Overview
+
+There are two existing Data Sources in the Data Layer: a Pod reconciler that
+ collects Pod IP address(es) and labels, copying them to endpoint attributes,
+ and a metrics scraper that collects a defined set of metric values from the
+ `/metrics` endpoint of each Pod. Note that the `InferencePool` reconciler is
+ *not* considered part of the Data Layer.
+
+### Components
+
+The proposal is to make the Data Layer more extensible by introducing
+ these two interfaces:
+
+- An **Attribute Collection** plugin interface responsible for extracting relevant
+ attributes from a data source and storing them into the Data Layer for consumption
+ by other components. The plugin can be registered with existing or new
+ *Data Sources* (see below) and sources would call their registered plugins
+ periodically or on change to process attributes.
+- A **Data source** plugin interface that can be added to an inference gateway
+ system, and on which *Attribute Collection* plugins can be registered to enrich
+ the data model.
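+
+As an illustration, here is a hypothetical Attribute Collection plugin written against the interfaces sketched later in this proposal, extracting a single queue-depth gauge from a scraped `/metrics` payload (the payload shape and the metric name are assumptions; imports are elided, as in the sketch below):
+
+```go
+type queueDepthCollector struct{}
+
+// Extract pulls one value out of the payload handed over by the data source
+// and stores it under this plugin's name; no core GIE code is touched.
+func (c *queueDepthCollector) Extract(ep Endpoint, data interface{}) error {
+	metrics, ok := data.(map[string]float64) // the payload type is source-defined
+	if !ok {
+		return fmt.Errorf("unexpected payload type %T", data)
+	}
+	return ep.StoreAttributes("queue-depth", metrics["vllm:num_requests_waiting"])
+}
+```
+
+The plugin would be wired up via `source.Subscribe(&queueDepthCollector{})`, and a consumer (e.g., a scorer) would later call `ep.GetAttributes("queue-depth")` to read the value.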
+
+### Implementation Phases
+
+In order to make iterative progress and validate the design along the way, we
+ propose to implement and evolve the Data Layer extensibility over several
+ phases:
+
+1. Extend the backend per-endpoint storage with a map from a name (i.e., the
+ attribute collection interface) to the data it collected. Existing attributes,
+ such as the IP address or Pod labels, are not modified.
+1. Introduce a Data Source registry where new data sources can be registered, and
+ bootstrap it by wrapping the existing `/metrics` scraping with a Data Source API. At this
+ point, the metrics scraping code includes only the `Data Source` interface; the
+ `Data Collection` interface is not used/exposed.
+1. Refactor the metrics scraping code into separate Data Source and Data Collection
+ plugin interfaces.
+1. Following that, and based on any lessons learnt, we’ll refactor the existing
+ Kubernetes Pod reconciliation loop to the new plugin interfaces.
+
+### Suggested Data Layer Plugin Interfaces
+
+```go
+// DataCollection interface consumes data updates from sources and stores
+// them in the data layer for consumption.
+// The plugin should not assume a deterministic invocation behavior beyond
+// "the data layer believes the state should be updated"
+type DataCollection interface {
+	// Extract is called by data sources with (possibly) updated
+	// data per endpoint. Extracted attributes are added to the
+	// Endpoint.
+	Extract(ep Endpoint, data interface{}) error // or Collect?
+}
+
+// Endpoint interface allows setting and retrieving of attributes
+// by a data collector.
+// Note that the actual endpoint structure would be something like (pseudocode)
+// type EndpointState struct {
+//	 address
+//	 ...
+//	 data map[string]interface{}
+// }
+// The plugin interface would only mutate the `data` map
+type Endpoint interface {
+	// StoreAttributes sets the data for the Endpoint on behalf
+	// of the named collection Plugin
+	StoreAttributes(collector string, data interface{}) error
+
+	// GetAttributes retrieves the attributes of the named collection
+	// plugin for the Endpoint
+	GetAttributes(collector string) (interface{}, error)
+}
+
+// DataLayerSourcesRegistry includes the list of available
+// Data Sources (interface defined below) in the system.
+// It is accompanied by functions (not shown) to register
+// and retrieve sources
+type DataLayerSourcesRegistry map[string]DataSource
+
+// DataSource interface represents a data source that tracks
+// pods/resources and notifies data collection plugins to
+// extract relevant attributes.
+type DataSource interface {
+	// Type of data available from this source
+	Type() string
+
+	// Start begins the data collection and notification loop
+	Start(ctx context.Context) error
+
+	// Stop terminates data collection
+	Stop() error
+
+	// Subscribe a collector to receive updates for tracked endpoints
+	Subscribe(collector DataCollection) error
+
+	// UpdateEndpoints replaces the set of pods/resources tracked by
+	// this source.
+	// Alternative: add/remove individual endpoints?
+	UpdateEndpoints(epIDs []string) error
+}
+```
+
+## Open Questions
+
+1. Type safety in extensible data collection: `map[string]interface{}` seems
+ like the simplest option to start, but we may want to evolve it to support
+ type safety using generics or code generation.
+1. Should we design a separate interface specifically for k8s object watching
+ under GIE control, or do we want these to be managed as yet another data source?
+ This affects the design (e.g., who owns the k8s caches, clients, etc.).
+ With a GIE controlled data source, collectors just register the types (and
+ other constraints? Labels, namespaces, …) with GIE core, and all k8s
+ functionality is under GIE control.
diff --git a/docs/proposals/README.md b/docs/proposals/README.md
new file mode 100644
index 000000000..2b0408d35
--- /dev/null
+++ b/docs/proposals/README.md
@@ -0,0 +1,5 @@
+# Proposals Best Practices
+
+
+## Naming
+The directory of the proposal should lead with a 4-digit PR number (will move to 5,6,... should our PR count get that high), followed by a kebab-cased title. The PR number is not known until the PR is cut, so development can use a placeholder, ex. XXXX-my-proposal. The PR number is used because it is unique & chronological, allowing the default ordering of proposals to follow the timeline of development.
\ No newline at end of file
diff --git a/go.mod b/go.mod
index fba85f91a..a0cc2e134 100644
--- a/go.mod
+++ b/go.mod
@@ -3,78 +3,77 @@ module sigs.k8s.io/gateway-api-inference-extension
go 1.24.0
require (
+	github.com/cespare/xxhash/v2 v2.3.0
	github.com/elastic/crd-ref-docs v0.1.0
	github.com/envoyproxy/go-control-plane/envoy v1.32.4
-	github.com/go-logr/logr v1.4.2
+	github.com/go-logr/logr v1.4.3
	github.com/google/go-cmp v0.7.0
-	github.com/onsi/ginkgo/v2 v2.23.3
-	github.com/onsi/gomega v1.36.3
-	github.com/prometheus/client_golang v1.21.1
-	github.com/prometheus/client_model v0.6.1
-	github.com/prometheus/common v0.63.0
+	github.com/google/uuid v1.6.0
+	github.com/hashicorp/golang-lru/v2 v2.0.7
+	github.com/onsi/ginkgo/v2 v2.23.4
+	github.com/onsi/gomega v1.37.0
+	github.com/prometheus/client_golang v1.22.0
+	github.com/prometheus/client_model v0.6.2
+	github.com/prometheus/common v0.65.0
	github.com/stretchr/testify v1.10.0
	go.uber.org/multierr v1.11.0
	go.uber.org/zap v1.27.0
-	google.golang.org/grpc v1.71.0
+	golang.org/x/sync v0.15.0
+	google.golang.org/grpc v1.73.0
	google.golang.org/protobuf v1.36.6
-	k8s.io/api v0.32.3
-	k8s.io/apiextensions-apiserver v0.32.3
-	k8s.io/apimachinery v0.32.3
-	k8s.io/client-go v0.32.3
-	k8s.io/code-generator v0.32.3
-	k8s.io/component-base v0.32.3
+	k8s.io/api v0.33.2
+	k8s.io/apiextensions-apiserver v0.33.2
+	k8s.io/apimachinery v0.33.2
+	k8s.io/client-go v0.33.2
+	k8s.io/code-generator v0.33.2
+	k8s.io/component-base v0.33.2
	k8s.io/utils v0.0.0-20241210054802-24370beab758
-	sigs.k8s.io/controller-runtime v0.20.4
-	sigs.k8s.io/structured-merge-diff/v4 v4.6.0
-	sigs.k8s.io/yaml v1.4.0
+	sigs.k8s.io/controller-runtime v0.21.0
+	sigs.k8s.io/gateway-api v1.3.0
+	sigs.k8s.io/structured-merge-diff/v4 v4.7.0
+	sigs.k8s.io/yaml v1.5.0
)
require (
-	cel.dev/expr v0.19.1 // indirect
+	cel.dev/expr v0.23.0 // indirect
	github.com/Masterminds/goutils v1.1.1 // indirect
	github.com/Masterminds/semver v1.5.0 // indirect
	github.com/Masterminds/sprig v2.22.0+incompatible // indirect
	github.com/antlr4-go/antlr/v4 v4.13.0 // indirect
-	github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect
	github.com/beorn7/perks v1.0.1 // indirect
	github.com/blang/semver/v4 v4.0.0 // indirect
	github.com/cenkalti/backoff/v4 v4.3.0 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect
+	github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f // indirect
	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
-	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
+	github.com/emicklei/go-restful/v3
v3.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect - github.com/fatih/color v1.16.0 // indirect + github.com/fatih/color v1.18.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect - github.com/gobuffalo/flect v1.0.2 // indirect + github.com/gobuffalo/flect v1.0.3 // indirect github.com/goccy/go-yaml v1.11.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect - github.com/google/cel-go v0.22.0 // indirect - github.com/google/gnostic-models v0.6.8 // indirect - github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/gorilla/websocket v1.5.0 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/google/cel-go v0.23.2 // indirect + github.com/google/gnostic-models v0.6.9 // indirect + github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect + github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect - github.com/imdario/mergo v0.3.11 // indirect + github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.11 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.2.1 // indirect github.com/mailru/easyjson v0.7.7 // indirect @@ -91,43 +90,45 @@ require ( github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/procfs v0.15.1 // indirect - github.com/spf13/cobra v1.8.1 // indirect - github.com/spf13/pflag v1.0.5 // indirect + github.com/spf13/cobra v1.9.1 // indirect + github.com/spf13/pflag v1.0.6 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect - go.opentelemetry.io/otel v1.34.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect - go.opentelemetry.io/otel/metric v1.34.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.34.0 // indirect - go.opentelemetry.io/proto/otlp v1.3.1 // indirect - golang.org/x/crypto v0.36.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 // indirect + go.opentelemetry.io/otel v1.35.0 // indirect + 
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 // indirect + go.opentelemetry.io/otel/metric v1.35.0 // indirect + go.opentelemetry.io/otel/sdk v1.35.0 // indirect + go.opentelemetry.io/otel/trace v1.35.0 // indirect + go.opentelemetry.io/proto/otlp v1.4.0 // indirect + go.uber.org/automaxprocs v1.6.0 // indirect + go.yaml.in/yaml/v2 v2.4.2 // indirect + golang.org/x/crypto v0.38.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.23.0 // indirect - golang.org/x/net v0.37.0 // indirect - golang.org/x/oauth2 v0.25.0 // indirect - golang.org/x/sync v0.12.0 // indirect - golang.org/x/sys v0.31.0 // indirect - golang.org/x/term v0.30.0 // indirect - golang.org/x/text v0.23.0 // indirect - golang.org/x/time v0.7.0 // indirect - golang.org/x/tools v0.30.0 // indirect + golang.org/x/mod v0.24.0 // indirect + golang.org/x/net v0.40.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/term v0.32.0 // indirect + golang.org/x/text v0.25.0 // indirect + golang.org/x/time v0.9.0 // indirect + golang.org/x/tools v0.31.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.32.3 // indirect - k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect + k8s.io/apiserver v0.33.2 // indirect + k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect k8s.io/klog/v2 v2.130.1 // indirect - k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect - sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect - sigs.k8s.io/controller-tools v0.14.0 // indirect + k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect + sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 // indirect + sigs.k8s.io/controller-tools v0.17.3 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect ) diff --git a/go.sum b/go.sum index 2bcff108f..9a7492158 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= -cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= +cel.dev/expr v0.23.0 h1:wUb94w6OYQS4uXraxo9U+wUAs9jT47Xvl4iPgAwM2ss= +cel.dev/expr v0.23.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= @@ -10,8 +10,6 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8 github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 
h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= -github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= -github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= @@ -20,18 +18,17 @@ github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK3 github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySeApCX4GeOjPl9qhRF3QuIZq+Q= -github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= -github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f h1:C5bqEmzEPLsHm9Mv73lSE9e9bKV23aB1vxOsmZrkl3k= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/elastic/crd-ref-docs v0.1.0 h1:Cr5kz89QB3Iuuj7dhAfLMApCrChEGAaIBTxGk/xuRKw= github.com/elastic/crd-ref-docs v0.1.0/go.mod h1:X83mMBdJt05heJUYiS3T0yJ/JkCuliuhSUNav5Gjo/U= -github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= -github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= +github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= @@ -40,8 +37,8 @@ github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8 github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= -github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= -github.com/fatih/color v1.16.0/go.mod 
h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= +github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= @@ -49,18 +46,16 @@ github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyT github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs= @@ -72,8 +67,8 @@ github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7a github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= -github.com/gobuffalo/flect v1.0.2 h1:eqjPGSo2WmjgY2XlpGwo2NXgL3RucAKo4k4qQMNA5sA= -github.com/gobuffalo/flect v1.0.2/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= +github.com/gobuffalo/flect v1.0.3 h1:xeWBM2nui+qnVvNM4S3foBhCAL2XgPU+a7FdpelbTq4= +github.com/gobuffalo/flect v1.0.3/go.mod h1:A5msMlrHtLqh9umBSnvabjsMrCcCpAyzglnDvkbYKHs= github.com/goccy/go-yaml v1.11.3 h1:B3W9IdWbvrUu2OYQGwvU1nZtvMQJPBKgBUuweJjLj6I= github.com/goccy/go-yaml v1.11.3/go.mod h1:wKnAMd44+9JAAnGQpWVEgBzGt3YuTaQ4uXoHvE4m7WU= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= @@ -82,28 +77,30 @@ 
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/cel-go v0.22.0 h1:b3FJZxpiv1vTMo2/5RDUqAHPxkT8mmMfJIrq1llbf7g= -github.com/google/cel-go v0.22.0/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs0yC4s8= -github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= -github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= +github.com/google/cel-go v0.23.2 h1:UdEe3CvQh3Nv+E/j9r1Y//WO0K0cSyD7/y0bzyLIMI4= +github.com/google/cel-go v0.23.2/go.mod h1:52Pb6QsDbC5kvgxvZhiL9QX1oZEkcUF/ZqaPx1J5Wwo= +github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= +github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= -github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= +github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0 h1:TmHmbvxPmaegwhDubVz0lICL0J5Ka2vwTzhoePEXsGE= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.24.0/go.mod h1:qztMSjm835F2bXf+5HKAPIS5qsmQDqZna/PgVt4rWtI= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= -github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= -github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= +github.com/imdario/mergo v0.3.16 
h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -112,13 +109,10 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= @@ -151,10 +145,10 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.23.3 h1:edHxnszytJ4lD9D5Jjc4tiDkPBZ3siDeJJkUZJJVkp0= -github.com/onsi/ginkgo/v2 v2.23.3/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= -github.com/onsi/gomega v1.36.3 h1:hID7cr8t3Wp26+cYnfcjR6HpJ00fdogN6dqZ1t6IylU= -github.com/onsi/gomega v1.36.3/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= +github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= +github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= @@ -162,26 +156,30 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 
-github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= -github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= +github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= -github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= -github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= -github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.9.1 h1:CXSaggrXdbHK9CF+8ywj8Amf7PBRmPCOJugH954Nnlo= +github.com/spf13/cobra v1.9.1/go.mod h1:nDyEzZ8ogv936Cinf6g1RU9MRY64Ir93oCnqb9wxYW0= +github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= +github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stoewer/go-strcase v1.3.0 h1:g0eASXYtp+yvN9fK8sH94oCIk0fau9uV1/ZdJ0AVEzs= github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8wodgtPmh1xo= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= @@ -195,76 +193,82 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.1.0 
h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= -go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= -go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= -go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= -go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= -go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= -go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= -go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0 h1:yd02MEjBdJkG3uabWP9apV+OuWRIXGDuJEUJbOHmCFU= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.58.0/go.mod h1:umTcuxiv1n/s/S6/c2AT/g2CQ7u5C59sHDNmfSwgz7Q= +go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= +go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0 h1:Vh5HayB/0HHfOQA7Ctx69E/Y/DcQSMPpKANYVMQ7fBA= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.33.0/go.mod h1:cpgtDBaqD/6ok/UG0jT15/uKjAY8mRA53diogHBg3UI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0 h1:5pojmb1U1AogINhN3SurB+zm/nIcusopeBNp42f45QM= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.33.0/go.mod h1:57gTHJSE5S1tqg+EKsLPlTWhpHMsWlVmer+LA926XiA= +go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= +go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= +go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= +go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= +go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= +go.opentelemetry.io/otel/trace v1.35.0/go.mod 
h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= +go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= +go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= +go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34= -golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc= +golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= +golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= -golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= +golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.37.0 h1:1zLorHbz+LYj7MQlSf1+2tPIIgibq2eL5xkrGk6f+2c= -golang.org/x/net v0.37.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= -golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= -golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.40.0 h1:79Xs7wF06Gbdcg4kdCCIQArK11Z1hr5POQ6+fIYHNuY= +golang.org/x/net v0.40.0/go.mod h1:y0hY0exeL2Pku80/zKK7tpntoX23cqL3Oa6njdgRtds= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod 
h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= -golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= -golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= -golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= -golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= -golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= +golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= +golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= -golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= +golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= +golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -273,12 +277,12 @@ golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSm golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= -google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1:OxYkA3wjPsZyBylwymxSHa7ViiW1Sml4ToBrncvFehI= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463 h1:hE3bRWtU6uceqlh4fhrSnUyjKHMKB9KrTLLG+bc0ddM= +google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463/go.mod h1:U90ffi8eUL9MwPcrJylN5+Mk2v3vuPDptd5yyNUiRR8= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 h1:e0AIkUUhxyBKh6ssZNrAMeqhA7RKUj42346d1y02i2g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= +google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= +google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= @@ -290,45 +294,48 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls= -k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k= -k8s.io/apiextensions-apiserver v0.32.3 h1:4D8vy+9GWerlErCwVIbcQjsWunF9SUGNu7O7hiQTyPY= -k8s.io/apiextensions-apiserver v0.32.3/go.mod h1:8YwcvVRMVzw0r1Stc7XfGAzB/SIVLunqApySV5V7Dss= -k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= -k8s.io/apimachinery 
v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/apiserver v0.32.3 h1:kOw2KBuHOA+wetX1MkmrxgBr648ksz653j26ESuWNY8= -k8s.io/apiserver v0.32.3/go.mod h1:q1x9B8E/WzShF49wh3ADOh6muSfpmFL0I2t+TG0Zdgc= -k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU= -k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY= -k8s.io/code-generator v0.32.3 h1:31p2TVzC9+hVdSkAFruAk3JY+iSfzrJ83Qij1yZutyw= -k8s.io/code-generator v0.32.3/go.mod h1:+mbiYID5NLsBuqxjQTygKM/DAdKpAjvBzrJd64NU1G8= -k8s.io/component-base v0.32.3 h1:98WJvvMs3QZ2LYHBzvltFSeJjEx7t5+8s71P7M74u8k= -k8s.io/component-base v0.32.3/go.mod h1:LWi9cR+yPAv7cu2X9rZanTiFKB2kHA+JjmhkKjCZRpI= -k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 h1:si3PfKm8dDYxgfbeA6orqrtLkvvIeH8UqffFJDl0bz4= -k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= +k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= +k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= +k8s.io/apiextensions-apiserver v0.33.2 h1:6gnkIbngnaUflR3XwE1mCefN3YS8yTD631JXQhsU6M8= +k8s.io/apiextensions-apiserver v0.33.2/go.mod h1:IvVanieYsEHJImTKXGP6XCOjTwv2LUMos0YWc9O+QP8= +k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= +k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= +k8s.io/apiserver v0.33.2 h1:KGTRbxn2wJagJowo29kKBp4TchpO1DRO3g+dB/KOJN4= +k8s.io/apiserver v0.33.2/go.mod h1:9qday04wEAMLPWWo9AwqCZSiIn3OYSZacDyu/AcoM/M= +k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= +k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= +k8s.io/code-generator v0.33.2 h1:PCJ0Y6viTCxxJHMOyGqYwWEteM4q6y1Hqo2rNpl6jF4= +k8s.io/code-generator v0.33.2/go.mod h1:hBjCA9kPMpjLWwxcr75ReaQfFXY8u+9bEJJ7kRw3J8c= +k8s.io/component-base v0.33.2 h1:sCCsn9s/dG3ZrQTX/Us0/Sx2R0G5kwa0wbZFYoVp/+0= +k8s.io/component-base v0.33.2/go.mod h1:/41uw9wKzuelhN+u+/C59ixxf4tYQKW7p32ddkYNe2k= +k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 h1:2OX19X59HxDprNCVrWi6jb7LW1PoqTlYqEq5H2oetog= +k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y= -k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUyGcf03XZEP0ZIKgKj35LS4= +k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= -sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= -sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= 
-sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= +sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= +sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= +sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= +sigs.k8s.io/controller-tools v0.17.3 h1:lwFPLicpBKLgIepah+c8ikRBubFW5kOQyT88r3EwfNw= +sigs.k8s.io/controller-tools v0.17.3/go.mod h1:1ii+oXcYZkxcBXzwv3YZBlzjt1fvkrCGjVF73blosJI= +sigs.k8s.io/gateway-api v1.3.0 h1:q6okN+/UKDATola4JY7zXzx40WO4VISk7i9DIfOvr9M= +sigs.k8s.io/gateway-api v1.3.0/go.mod h1:d8NV8nJbaRbEKem+5IuxkL8gJGOZ+FJ+NvOIltV8gDk= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= -sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= +sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= +sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= +sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= diff --git a/hack/boilerplate/boilerplate.generatego.txt b/hack/boilerplate/boilerplate.generatego.txt new file mode 100644 index 000000000..0926592d3 --- /dev/null +++ b/hack/boilerplate/boilerplate.generatego.txt @@ -0,0 +1,15 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ diff --git a/hack/boilerplate.go.txt b/hack/boilerplate/boilerplate.go.txt similarity index 92% rename from hack/boilerplate.go.txt rename to hack/boilerplate/boilerplate.go.txt index 8057371b8..4b76f1fdd 100644 --- a/hack/boilerplate.go.txt +++ b/hack/boilerplate/boilerplate.go.txt @@ -1,5 +1,5 @@ /* -Copyright 2025 The Kubernetes Authors. +Copyright YEAR The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -12,4 +12,4 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ \ No newline at end of file +*/ diff --git a/hack/boilerplate/boilerplate.py b/hack/boilerplate/boilerplate.py new file mode 100755 index 000000000..013fb427e --- /dev/null +++ b/hack/boilerplate/boilerplate.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 + +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is copied from https://github.com/kubernetes/kubernetes/blob/04c2b1fbdc1289c9a72eda87cf7072346e60d241/hack/boilerplate/boilerplate.py + +from __future__ import print_function + +import argparse +import datetime +import difflib +import glob +import os +import re +import sys + +parser = argparse.ArgumentParser() +parser.add_argument( + "filenames", + help="list of files to check, all files if unspecified", + nargs='*') + +rootdir = os.path.dirname(__file__) + "/../../" +rootdir = os.path.abspath(rootdir) +parser.add_argument( + "--rootdir", default=rootdir, help="root directory to examine") + +default_boilerplate_dir = os.path.join(rootdir, "hack/boilerplate") +parser.add_argument( + "--boilerplate-dir", default=default_boilerplate_dir) + +parser.add_argument( + "-v", "--verbose", + help="give verbose output regarding why a file does not pass", + action="store_true") + +args = parser.parse_args() + +verbose_out = sys.stderr if args.verbose else open("/dev/null", "w") + + +def get_refs(): + refs = {} + + for path in glob.glob(os.path.join(args.boilerplate_dir, "boilerplate.*.txt")): + extension = os.path.basename(path).split(".")[1] + + ref_file = open(path, 'r') + ref = ref_file.read().splitlines() + ref_file.close() + refs[extension] = ref + + return refs + + +def is_generated_file(filename, data, regexs): + for d in skipped_ungenerated_files: + if d in filename: + return False + + p = regexs["generated"] + return p.search(data) + + +def file_passes(filename, refs, regexs): + try: + f = open(filename, 'r') + except Exception as exc: + print("Unable to open %s: %s" % (filename, exc), file=verbose_out) + return False + + data = f.read() + f.close() + + # determine if the file is automatically generated + generated = is_generated_file(filename, data, regexs) + + basename = os.path.basename(filename) + extension = file_extension(filename) + if generated: + if extension == "go": + extension = "generatego" + elif extension == "bzl": + extension = "generatebzl" + + if extension != "": + ref = refs[extension] + else: + ref = refs[basename] + + # remove extra content from the top of files + if extension == "go" or extension == "generatego": + p = regexs["go_build_constraints"] + (data, found) = p.subn("", data, 1) + elif extension in ["sh", "py"]: + p = regexs["shebang"] + (data, found) = p.subn("", data, 1) + + data = data.splitlines() + + # if our test file is smaller than the reference it 
surely fails! + if len(ref) > len(data): + print('File %s smaller than reference (%d < %d)' % + (filename, len(data), len(ref)), + file=verbose_out) + return False + + # trim our file to the same number of lines as the reference file + data = data[:len(ref)] + + p = regexs["year"] + for d in data: + if p.search(d): + if generated: + print('File %s has the YEAR field, but it should not be in generated file' % + filename, file=verbose_out) + else: + print('File %s has the YEAR field, but missing the year of date' % + filename, file=verbose_out) + return False + + if not generated: + # Replace all occurrences of the regex "2014|2015|2016|2017|2018" with "YEAR" + p = regexs["date"] + for i, d in enumerate(data): + (data[i], found) = p.subn('YEAR', d) + if found != 0: + break + + # if we don't match the reference at this point, fail + if ref != data: + print("Header in %s does not match reference, diff:" % + filename, file=verbose_out) + if args.verbose: + print(file=verbose_out) + for line in difflib.unified_diff(ref, data, 'reference', filename, lineterm=''): + print(line, file=verbose_out) + print(file=verbose_out) + return False + + return True + + +def file_extension(filename): + return os.path.splitext(filename)[1].split(".")[-1].lower() + + +skipped_dirs = [ + 'bin/kube_codegen.sh', + 'cluster/env.sh', + '.git', + '_gopath', + 'hack/boilerplate/test', + 'internal/runnable/leader_election.go', + '_output', + 'pkg/epp/backend/metrics/pod_metrics_test.go', + 'pkg/epp/saturationdetector/config.go', + 'pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go', + 'pkg/epp/util/env/env_test.go', + 'staging/src/k8s.io/kubectl/pkg/generated/bindata.go', + 'test/e2e/generated/bindata.go', + 'third_party', + 'vendor', + '.venv', + ] + +# list all the files contain 'DO NOT EDIT', but are not generated +skipped_ungenerated_files = ['hack/boilerplate/boilerplate.py'] + + +def normalize_files(files): + newfiles = [] + for pathname in files: + if any(x in pathname for x in skipped_dirs): + continue + newfiles.append(pathname) + for i, pathname in enumerate(newfiles): + if not os.path.isabs(pathname): + newfiles[i] = os.path.join(args.rootdir, pathname) + return newfiles + + +def get_files(extensions): + files = [] + if len(args.filenames) > 0: + files = args.filenames + else: + for root, dirs, walkfiles in os.walk(args.rootdir): + # don't visit certain dirs. This is just a performance improvement + # as we would prune these later in normalize_files(). 
But doing it + # cuts down the amount of filesystem walking we do and cuts down + # the size of the file list + for d in skipped_dirs: + if d in dirs: + dirs.remove(d) + + for name in walkfiles: + pathname = os.path.join(root, name) + files.append(pathname) + + files = normalize_files(files) + outfiles = [] + for pathname in files: + basename = os.path.basename(pathname) + extension = file_extension(pathname) + if extension in extensions or basename in extensions: + outfiles.append(pathname) + return outfiles + + +def get_dates(): + years = datetime.datetime.now().year + return '(%s)' % '|'.join((str(year) for year in range(2014, years+1))) + + +def get_regexs(): + regexs = {} + # Search for "YEAR" which exists in the boilerplate, but shouldn't in the real thing + regexs["year"] = re.compile('YEAR') + # get_dates return 2014, 2015, 2016, 2017, or 2018 until the current year as a regex like: "(2014|2015|2016|2017|2018)"; + # company holder names can be anything + regexs["date"] = re.compile(get_dates()) + # strip the following build constraints/tags: + # //go:build + # // +build \n\n + regexs["go_build_constraints"] = re.compile( + r"^(//(go:build| \+build).*\n)+\n", re.MULTILINE) + # strip #!.* from scripts + regexs["shebang"] = re.compile(r"^(#!.*\n)\n*", re.MULTILINE) + # Search for generated files + regexs["generated"] = re.compile('DO NOT EDIT') + return regexs + + +def main(): + regexs = get_regexs() + refs = get_refs() + filenames = get_files(refs.keys()) + + for filename in filenames: + if not file_passes(filename, refs, regexs): + print(filename, file=sys.stdout) + + print("Verified %d file headers match boilerplate" % (len(filenames),), file=sys.stderr) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/hack/boilerplate/boilerplate.py.txt b/hack/boilerplate/boilerplate.py.txt new file mode 100644 index 000000000..34cb349c4 --- /dev/null +++ b/hack/boilerplate/boilerplate.py.txt @@ -0,0 +1,13 @@ +# Copyright YEAR The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/hack/boilerplate/boilerplate.sh.txt b/hack/boilerplate/boilerplate.sh.txt new file mode 100644 index 000000000..34cb349c4 --- /dev/null +++ b/hack/boilerplate/boilerplate.sh.txt @@ -0,0 +1,13 @@ +# Copyright YEAR The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
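The checker above reduces to a simple contract: load the reference header that matches the file's extension, normalize the concrete copyright year to the YEAR placeholder, and compare the file's opening lines against the reference. A minimal Go sketch of that core comparison, assuming the reference lives at hack/boilerplate/boilerplate.go.txt (the real script additionally strips shebangs and Go build constraints before comparing):

package main

import (
	"fmt"
	"os"
	"regexp"
	"strings"
)

// yearRE maps any concrete copyright year onto the YEAR placeholder used by
// the reference templates, loosely mirroring the "date" regex in boilerplate.py.
var yearRE = regexp.MustCompile(`20[0-9][0-9]`)

// hasBoilerplate reports whether the file's opening lines equal the reference
// header once years are normalized.
func hasBoilerplate(src, ref string) bool {
	srcLines := strings.Split(yearRE.ReplaceAllString(src, "YEAR"), "\n")
	refLines := strings.Split(strings.TrimRight(ref, "\n"), "\n")
	if len(srcLines) < len(refLines) {
		return false // file is shorter than the reference header
	}
	for i, want := range refLines {
		if srcLines[i] != want {
			return false
		}
	}
	return true
}

func main() {
	ref, err := os.ReadFile("hack/boilerplate/boilerplate.go.txt")
	if err != nil {
		panic(err)
	}
	src, err := os.ReadFile(os.Args[1])
	if err != nil {
		panic(err)
	}
	if !hasBoilerplate(string(src), string(ref)) {
		fmt.Println(os.Args[1]) // print offenders, as boilerplate.py does
		os.Exit(1)
	}
}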
diff --git a/hack/kube-env.sh b/hack/kube-env.sh new file mode 100644 index 000000000..4415df155 --- /dev/null +++ b/hack/kube-env.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Copyright 2014 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Some useful colors. +if [[ -z "${color_start-}" ]]; then + declare -r color_start="\033[" + declare -r color_red="${color_start}0;31m" + declare -r color_yellow="${color_start}0;33m" + declare -r color_green="${color_start}0;32m" + declare -r color_norm="${color_start}0m" +fi + +# Returns the server version as MMmmpp, with MM as the major +# component, mm the minor component, and pp as the patch +# revision. e.g. 0.7.1 is echoed as 701, and 1.0.11 would be +# 10011. (This makes for easy integer comparison in bash.) +function kube_server_version() { + local server_version + local major + local minor + local patch + + # This sed expression is the POSIX BRE to match strings like: + # Server Version: &version.Info{Major:"0", Minor:"7+", GitVersion:"v0.7.0-dirty", GitCommit:"ad44234f7152e9c66bc2853575445c7071335e57", GitTreeState:"dirty"} + # and capture the GitVersion portion (which has the patch level) + server_version=$(${KUBECTL} --match-server-version=false version | grep "Server Version:") + read major minor patch < <( + echo ${server_version} | \ + sed "s/.*GitVersion:\"v\([0-9]\{1,\}\)\.\([0-9]\{1,\}\)\.\([0-9]\{1,\}\).*/\1 \2 \3/") + printf "%02d%02d%02d" ${major} ${minor} ${patch} | sed 's/^0*//' +}
diff --git a/hack/referencer.go b/hack/referencer.go index 87fcc557d..31fb3df07 100644 --- a/hack/referencer.go +++ b/hack/referencer.go @@ -1,10 +1,11 @@ /* Copyright 2024 The Kubernetes Authors. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index c2c0f74d9..068be7f24 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -1,3 +1,17 @@ #!/bin/bash  +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -euo pipefail @@ -15,11 +29,18 @@ else RELEASE_TAG="v${MAJOR}.${MINOR}.0-rc.${RC}" fi -# vLLM image version (default to 0.7.2 if not defined) -VLLM="${VLLM:-0.7.2}" +# The vLLM image versions +# The GPU image is from https://hub.docker.com/layers/vllm/vllm-openai +VLLM_GPU="${VLLM_GPU:-0.9.1}" +# The CPU image is from https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo +VLLM_CPU="${VLLM_CPU:-0.9.1}" +# The sim image is from https://github.com/llm-d/llm-d-inference-sim/pkgs/container/llm-d-inference-sim +VLLM_SIM="${VLLM_SIM:-0.1.1}" echo "Using release tag: ${RELEASE_TAG}" -echo "Using vLLM image version: ${VLLM}" +echo "Using vLLM GPU image version: ${VLLM_GPU}" +echo "Using vLLM CPU image version: ${VLLM_CPU}" +echo "Using vLLM Simulator image version: ${VLLM_SIM}" # ----------------------------------------------------------------------------- # Update pkg/README.md @@ -50,7 +71,7 @@ sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EPP_HELM" sed -i.bak -E "s|(tag: )[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$BBR_HELM" # Update the container image pull policy. -sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/ { n; s/Always/IfNotPresent/ }' "$EPP" +sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/{n;s/Always/IfNotPresent/;}' "$EPP" # Update the container registry. sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EPP" @@ -58,22 +79,40 @@ sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$BBR_HELM" # ----------------------------------------------------------------------------- -# Update config/manifests/vllm/gpu-deployment.yaml +# Update vLLM deployment manifests # ----------------------------------------------------------------------------- -VLLM_DEPLOY="config/manifests/vllm/gpu-deployment.yaml" -echo "Updating ${VLLM_DEPLOY} ..." +VLLM_GPU_DEPLOY="config/manifests/vllm/gpu-deployment.yaml" +echo "Updating ${VLLM_GPU_DEPLOY} ..." + +# Update the vLLM GPU image version +sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1v${VLLM_GPU}|g" "$VLLM_GPU_DEPLOY" + +# Also change the imagePullPolicy from Always to IfNotPresent on lines containing the vLLM image. +sed -i.bak '/vllm\/vllm-openai/{n;s/Always/IfNotPresent/;}' "$VLLM_GPU_DEPLOY" + +VLLM_CPU_DEPLOY="config/manifests/vllm/cpu-deployment.yaml" +echo "Updating ${VLLM_CPU_DEPLOY} ..." + +# Update the vLLM CPU image version +sed -i.bak -E "s|(q9t5s3a7/vllm-cpu-release-repo:)[^\"[:space:]]+|\1v${VLLM_CPU}|g" "$VLLM_CPU_DEPLOY" + +# Also change the imagePullPolicy from Always to IfNotPresent on lines containing the vLLM CPU image. +sed -i.bak '/q9t5s3a7\/vllm-cpu-release-repo/{n;s/Always/IfNotPresent/;}' "$VLLM_CPU_DEPLOY" + +VLLM_SIM_DEPLOY="config/manifests/vllm/sim-deployment.yaml" +echo "Updating ${VLLM_SIM_DEPLOY} ..." -# Update the vLLM image version -sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1v${VLLM}|g" "$VLLM_DEPLOY" +# Update the vLLM Simulator image version +sed -i.bak -E "s|(llm-d/llm-d-inference-sim:)[^\"[:space:]]+|\1v${VLLM_SIM}|g" "$VLLM_SIM_DEPLOY" # Also change the imagePullPolicy from Always to IfNotPresent on lines containing the vLLM image.
-sed -i.bak '/vllm\/vllm-openai/ { n; s/Always/IfNotPresent/ }' "$VLLM_DEPLOY" +sed -i.bak '/llm-d\/llm-d-inference-sim/{n;s/Always/IfNotPresent/;}' "$VLLM_SIM_DEPLOY" # ----------------------------------------------------------------------------- # Stage the changes # ----------------------------------------------------------------------------- -echo "Staging $README $EPP $EPP_HELM $BBR_HELM $VLLM_DEPLOY files..." -git add $README $EPP $EPP_HELM $BBR_HELM $VLLM_DEPLOY +echo "Staging $README $EPP $EPP_HELM $BBR_HELM $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY files..." +git add $README $EPP $EPP_HELM $BBR_HELM $VLLM_GPU_DEPLOY $VLLM_CPU_DEPLOY $VLLM_SIM_DEPLOY # ----------------------------------------------------------------------------- # Cleanup backup files and finish diff --git a/hack/test-e2e.sh b/hack/test-e2e.sh index 0d6bdfc0c..497e5de1e 100755 --- a/hack/test-e2e.sh +++ b/hack/test-e2e.sh @@ -1,137 +1,54 @@ -#!/bin/bash -# -# This script verifies end-to-end connectivity for an example inference extension test environment based on -# resources from the quickstart guide or e2e test framework. It can optionally launch a "curl" client pod to -# run these tests within the cluster. +#!/usr/bin/env bash + +# Copyright 2025 The Kubernetes Authors. # -# USAGE: ./hack/e2e-test.sh +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# OPTIONAL ENVIRONMENT VARIABLES: -# - TIME: The duration (in seconds) for which the test will run. Defaults to 1 second. -# - CURL_POD: If set to "true", the script will use a Kubernetes pod named "curl" for making requests. -# - IP: Override the detected IP address. If not provided, the script attempts to use a Gateway based on -# the quickstart guide or an Envoy service IP based on the e2e test framework. -# - PORT: Override the detected port. If not provided, the script attempts to use a Gateway based on the -# quickstart guide or an Envoy service IP based on the e2e test framework. +# http://www.apache.org/licenses/LICENSE-2.0 # -# WHAT THE SCRIPT DOES: -# 1. Determines if there is a Gateway named "inference-gateway" in the "default" namespace. If found, it extracts the IP -# address and port from the Gateway's "llm-gw" listener. Otherwise, it falls back to the Envoy service in the "default" namespace. -# 2. Optionally checks for (or creates) a "curl" pod, ensuring it is ready to execute requests. -# 3. Loops for $TIME seconds, sending requests every 5 seconds to the /v1/completions endpoint to confirm successful connectivity. - -set -euo pipefail - -# Determine the directory of this script and build an absolute path to client.yaml. -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -CLIENT_YAML="$SCRIPT_DIR/../test/testdata/client.yaml" - -# TIME is the amount of time, in seconds, to run the test. -TIME=${TIME:-1} -# Optionally use a client curl pod for executing the curl command. -CURL_POD=${CURL_POD:-false} - -check_resource_exists() { - local type=$1 - local name=$2 - local namespace=$3 - - if kubectl get "$type" "$name" -n "$namespace" &>/dev/null; then - return 0 - else - return 1 - fi +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
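A note on the sed idiom the release script leans on throughout: the block '/image/{n;s/Always/IfNotPresent/;}' means "on a line matching the image reference, load the next line and substitute there", which works because imagePullPolicy sits on the line directly after the image line in these manifests. A rough Go equivalent of that match-then-edit-next-line transform, with an illustrative manifest path:

package main

import (
	"fmt"
	"os"
	"strings"
)

func main() {
	data, err := os.ReadFile("config/manifests/vllm/gpu-deployment.yaml")
	if err != nil {
		panic(err)
	}
	lines := strings.Split(string(data), "\n")
	for i := 0; i < len(lines)-1; i++ {
		if strings.Contains(lines[i], "vllm/vllm-openai") {
			// Like sed's {n;s/Always/IfNotPresent/;}: edit only the line
			// that follows the matching image line (first occurrence only).
			lines[i+1] = strings.Replace(lines[i+1], "Always", "IfNotPresent", 1)
			i++ // sed's n consumed the next line, so skip it here too
		}
	}
	fmt.Print(strings.Join(lines, "\n"))
}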
+ +set -euox pipefail + +install_kind() { + if ! command -v kind &>/dev/null; then + echo "kind not found, installing..." + [ $(uname -m) = x86_64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.29.0/kind-linux-amd64 + # For ARM64 + [ $(uname -m) = aarch64 ] && curl -Lo ./kind https://kind.sigs.k8s.io/dl/v0.29.0/kind-linux-arm64 + chmod +x ./kind + mv ./kind /usr/local/bin/kind + else + echo "kind is already installed." + fi } -check_pod_ready() { - local pod_name=$1 - local namespace=$2 - # Check the Ready condition using jsonpath. Default to False if not found. - local ready_status - ready_status=$(kubectl get pod "$pod_name" -n "$namespace" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "False") - if [[ "$ready_status" == "True" ]]; then - return 0 - else - return 1 - fi -} - -# Try to get the Gateway's IP and the port from the listener named "llm-gw" if it exists. -if check_resource_exists "gateway" "inference-gateway" "default"; then - GATEWAY_IP=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.status.addresses[0].value}') - # Use JSONPath to select the port from the listener with name "llm-gw" - GATEWAY_PORT=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.spec.listeners[?(@.name=="llm-gw")].port}') -else - GATEWAY_IP="" - GATEWAY_PORT="" -fi - -if [[ -n "$GATEWAY_IP" && -n "$GATEWAY_PORT" ]]; then - echo "Using Gateway inference-gateway IP and port from listener 'llm-gw'." - IP=${IP:-$GATEWAY_IP} - PORT=${PORT:-$GATEWAY_PORT} -else - echo "Gateway inference-gateway not found or missing IP/port. Falling back to Envoy service." - # Ensure the Envoy service exists. - if ! check_resource_exists "svc" "envoy" "default"; then - echo "Error: Envoy service not found in namespace 'default'." - exit 1 - fi - IP=${IP:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.clusterIP}')} - PORT=${PORT:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.ports[0].port}')} -fi - -# Optionally verify that the curl pod exists and is ready. -if [[ "$CURL_POD" == "true" ]]; then - if ! check_resource_exists "pod" "curl" "default"; then - echo "Pod 'curl' not found in namespace 'default'. Applying client.yaml from $CLIENT_YAML..." - kubectl apply -f "$CLIENT_YAML" - fi - echo "Waiting for pod 'curl' to be ready..." - # Retry every 5 seconds for up to 30 seconds (6 attempts) - for i in {1..6}; do - if check_pod_ready "curl" "default"; then - echo "Pod 'curl' is now ready." - break - fi - echo "Retry attempt $i: Pod 'curl' not ready; waiting 5 seconds..." - sleep 5 - done - - if ! check_pod_ready "curl" "default"; then - echo "Error: Pod 'curl' is still not ready in namespace 'default' after 30 seconds." - exit 1 - fi -fi - -# Validate that we have a non-empty IP and PORT. -if [[ -z "$IP" ]]; then - echo "Error: Unable to determine a valid IP from either Gateway or Envoy service." - exit 1 -fi - -if [[ -z "$PORT" ]]; then - echo "Error: Unable to determine a valid port from either Gateway or Envoy service." - exit 1 +if [ "$USE_KIND" = "true" ]; then + install_kind # make sure kind cli is installed + if ! kubectl config current-context >/dev/null 2>&1; then # if no active kind cluster found + echo "No active kubecontext found. creating a kind cluster for running the tests..." 
+ kind create cluster --name inference-e2e + KIND_CLUSTER=inference-e2e IMAGE_TAG=${E2E_IMAGE} make image-kind + else + current_context=$(kubectl config current-context) + current_kind_cluster="${current_context#kind-}" + echo "Found an active kind cluster ${current_kind_cluster} for running the tests..." + KIND_CLUSTER=${current_kind_cluster} IMAGE_TAG=${E2E_IMAGE} make image-kind + fi +else + # Don't use kind. It's the caller's responsibility to load the image into the cluster; we just run the tests. + # This section is useful when one wants to run an official release or the latest main against a cluster other than kind. + if ! kubectl config current-context >/dev/null 2>&1; then # if no active cluster found + echo "No active kubecontext found. Exiting..." + exit 1 + fi fi -echo "Using IP: $IP" -echo "Using PORT: $PORT" - -# Run the test for the specified duration. -end=$((SECONDS + TIME)) -if [[ "$CURL_POD" == "true" ]]; then - while [ $SECONDS -lt $end ]; do - kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \ - -H 'Content-Type: application/json' \ - -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}' - sleep 5 - done -else - while [ $SECONDS -lt $end ]; do - curl -i "$IP:$PORT/v1/completions" \ - -H 'Content-Type: application/json' \ - -d '{"model": "food-review","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}' - sleep 5 - done -fi +echo "Found an active cluster. Running Go e2e tests in ./test/e2e/epp..." +go test ./test/e2e/epp/ -v -ginkgo.v
diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index ab5818fa4..0dc5f7200 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -27,11 +27,11 @@ THIS_PKG="sigs.k8s.io/gateway-api-inference-extension" kube::codegen::gen_helpers \ - --boilerplate "${SCRIPT_ROOT}/hack/boilerplate.go.txt" \ + --boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \ "${SCRIPT_ROOT}" kube::codegen::gen_register \ - --boilerplate "${SCRIPT_ROOT}/hack/boilerplate.go.txt" \ + --boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \ "${SCRIPT_ROOT}" kube::codegen::gen_client \ @@ -39,5 +39,5 @@ kube::codegen::gen_client \ --with-applyconfig \ --output-dir "${SCRIPT_ROOT}/client-go" \ --output-pkg "${THIS_PKG}/client-go" \ ---boilerplate "${SCRIPT_ROOT}/hack/boilerplate.go.txt" \ +--boilerplate "${SCRIPT_ROOT}/hack/boilerplate/boilerplate.generatego.txt" \ "${SCRIPT_ROOT}"
diff --git a/hack/verify-all.sh b/hack/verify-all.sh new file mode 100755 index 000000000..e1343ac74 --- /dev/null +++ b/hack/verify-all.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/..
+source "${SCRIPT_ROOT}/hack/kube-env.sh" + +SILENT=true +FAILED_TEST=() + +function is-excluded { + for e in $EXCLUDE; do + if [[ $1 -ef ${BASH_SOURCE} ]]; then + return + fi + if [[ $1 -ef "$SCRIPT_ROOT/hack/$e" ]]; then + return + fi + done + return 1 +} + +while getopts ":v" opt; do + case $opt in + v) + SILENT=false + ;; + \?) + echo "Invalid flag: -$OPTARG" >&2 + exit 1 + ;; + esac +done + +if $SILENT ; then + echo "Running in the silent mode, run with -v if you want to see script logs." +fi + +EXCLUDE="verify-all.sh" + +SCRIPTS=$(find "${SCRIPT_ROOT}"/hack -name "verify-*.sh") + +ret=0 +for t in $SCRIPTS; +do + if is-excluded "${t}" ; then + echo "Skipping $t" + continue + fi + if $SILENT ; then + echo -e "Verifying $t" + if bash "$t" &> /dev/null; then + echo -e "${color_green}SUCCESS${color_norm}" + else + echo -e "${color_red}FAILED: $t ${color_norm}" + FAILED_TEST+=("$t") + ret=1 + fi + else + if bash "$t"; then + echo -e "${color_green}SUCCESS: $t ${color_norm}" + else + echo -e "${color_red}Test FAILED: $t ${color_norm}" + FAILED_TEST+=("$t") + ret=1 + fi + fi +done + +if [ ${#FAILED_TEST[@]} -ne 0 ]; then + echo -e "\n${color_red}Summary of failed tests:${color_norm}" + for test in "${FAILED_TEST[@]}"; do + echo -e "${color_red}- $test${color_norm}" + done +else + echo -e "\n${color_green}All tests passed successfully.${color_norm}" +fi + +exit $ret + +# ex: ts=2 sw=2 et filetype=sh diff --git a/hack/verify-boilerplate.sh b/hack/verify-boilerplate.sh new file mode 100755 index 000000000..0a4ae8989 --- /dev/null +++ b/hack/verify-boilerplate.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -o errexit +set -o nounset +set -o pipefail + +SCRIPT_ROOT=$(dirname "${BASH_SOURCE}")/.. + +boilerDir="${SCRIPT_ROOT}/hack/boilerplate" +boiler="${boilerDir}/boilerplate.py" + +files_need_boilerplate=($(${boiler} "$@")) + +# Run boilerplate check +if [[ ${#files_need_boilerplate[@]} -gt 0 ]]; then + for file in "${files_need_boilerplate[@]}"; do + echo "Boilerplate header is wrong for: ${file}" + done + + exit 1 +fi diff --git a/internal/runnable/grpc.go b/internal/runnable/grpc.go index a619f788d..82b7b85e2 100644 --- a/internal/runnable/grpc.go +++ b/internal/runnable/grpc.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package runnable import ( diff --git a/mkdocs.yml b/mkdocs.yml index 2dc4d2a18..d22d93713 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,7 @@ theme: icon: repo: fontawesome/brands/git-alt logo: images/logo/logo-text-large-horizontal-white.png - favicon: images/k8s-favicon.png + favicon: images/favicon-64.png features: - search.highlight - navigation.tabs @@ -52,18 +52,31 @@ nav: - Introduction: index.md - Concepts: API Overview: concepts/api-overview.md + Design Principles: concepts/design-principles.md Conformance: concepts/conformance.md Roles and Personas: concepts/roles-and-personas.md - - Implementations: implementations.md + - Implementations: + - Gateways: implementations/gateways.md + - Model Servers: implementations/model-servers.md - FAQ: faq.md - Guides: - User Guides: - Getting started: guides/index.md - - Adapter Rollout: guides/adapter-rollout.md - - Metrics: guides/metrics.md - - Implementer's Guide: guides/implementers.md + - Use Cases: + - Serve Multiple GenAI models: guides/serve-multiple-genai-models.md + - Serve Multiple LoRA adapters: guides/serve-multiple-lora-adapters.md + - Rollout: + - Adapter Rollout: guides/adapter-rollout.md + - InferencePool Rollout: guides/inferencepool-rollout.md + - Metrics and Observability: guides/metrics-and-observability.md + - Configuration Guide: + - Prefix Cache Aware Plugin: guides/epp-configuration/prefix-aware.md + - Implementer Guides: + - Getting started: guides/implementers.md + - Conformance Tests: guides/conformance-tests.md - Performance: - Benchmark: performance/benchmark/index.md + - Regression Testing: performance/regression-testing/index.md - Reference: - API Reference: reference/spec.md - API Types: diff --git a/pkg/body-based-routing/README.md b/pkg/bbr/README.md similarity index 100% rename from pkg/body-based-routing/README.md rename to pkg/bbr/README.md diff --git a/pkg/body-based-routing/handlers/request.go b/pkg/bbr/handlers/request.go similarity index 98% rename from pkg/body-based-routing/handlers/request.go rename to pkg/bbr/handlers/request.go index c0be46ac1..32fffc021 100644 --- a/pkg/body-based-routing/handlers/request.go +++ b/pkg/bbr/handlers/request.go @@ -25,7 +25,7 @@ import ( eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) diff --git a/pkg/body-based-routing/handlers/request_test.go b/pkg/bbr/handlers/request_test.go similarity index 95% rename from pkg/body-based-routing/handlers/request_test.go rename to pkg/bbr/handlers/request_test.go index 0f088702f..3bc0d6fe4 100644 --- a/pkg/body-based-routing/handlers/request_test.go +++ b/pkg/bbr/handlers/request_test.go @@ -26,9 +26,9 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/google/go-cmp/cmp" "google.golang.org/protobuf/testing/protocmp" - "k8s.io/component-base/metrics/legacyregistry" metricsutils "k8s.io/component-base/metrics/testutil" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/metrics" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -204,7 +204,7 @@ 
func TestHandleRequestBody(t *testing.T) { bbr_success_total{} 1 ` - if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(wantMetrics), "inference_model_request_total"); err != nil { + if err := metricsutils.GatherAndCompare(crmetrics.Registry, strings.NewReader(wantMetrics), "inference_model_request_total"); err != nil { t.Error(err) } } diff --git a/pkg/body-based-routing/handlers/response.go b/pkg/bbr/handlers/response.go similarity index 100% rename from pkg/body-based-routing/handlers/response.go rename to pkg/bbr/handlers/response.go diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/bbr/handlers/server.go similarity index 90% rename from pkg/body-based-routing/handlers/server.go rename to pkg/bbr/handlers/server.go index 24664f988..a5803806b 100644 --- a/pkg/body-based-routing/handlers/server.go +++ b/pkg/bbr/handlers/server.go @@ -28,6 +28,7 @@ import ( "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request" ) func NewServer(streaming bool) *Server { @@ -74,6 +75,11 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { // If streaming and the body is not empty, then headers are handled when processing request body. loggerVerbose.Info("Received headers, passing off header processing until body arrives...") } else { + if requestId := requtil.ExtractHeaderValue(v, requtil.RequestIdHeaderKey); len(requestId) > 0 { + logger = logger.WithValues(requtil.RequestIdHeaderKey, requestId) + loggerVerbose = logger.V(logutil.VERBOSE) + ctx = log.IntoContext(ctx, logger) + } responses, err = s.HandleRequestHeaders(req.GetRequestHeaders()) } case *extProcPb.ProcessingRequest_RequestBody: @@ -112,18 +118,18 @@ type streamedBody struct { func (s *Server) processRequestBody(ctx context.Context, body *extProcPb.HttpBody, streamedBody *streamedBody, logger logr.Logger) ([]*extProcPb.ProcessingResponse, error) { loggerVerbose := logger.V(logutil.VERBOSE) - var requestBody map[string]interface{} + var requestBody map[string]any if s.streaming { + streamedBody.body = append(streamedBody.body, body.Body...) // In the stream case, we can receive multiple request bodies. - if !body.EndOfStream { - streamedBody.body = append(streamedBody.body, body.Body...) 
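Beside the streaming change, the server.go hunk above also threads the request ID into the logger once and stores it back in the context, so every later log line for the stream carries the ID for free. A standalone sketch of that logr/controller-runtime pattern; the x-request-id key and the handler shape here are assumptions for the example, not the EPP's actual types:

package main

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/log/zap"
)

// handle enriches the context logger with the request ID, if present.
func handle(ctx context.Context, headers map[string]string) {
	logger := log.FromContext(ctx)
	if id := headers["x-request-id"]; id != "" {
		logger = logger.WithValues("x-request-id", id)
		ctx = log.IntoContext(ctx, logger) // downstream callers inherit the field
	}
	process(ctx)
}

func process(ctx context.Context) {
	log.FromContext(ctx).Info("processing request body") // logs with x-request-id attached
}

func main() {
	log.SetLogger(zap.New())
	handle(context.Background(), map[string]string{"x-request-id": "abc-123"})
}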
- return nil, nil - } else { + if body.EndOfStream { loggerVerbose.Info("Flushing stream buffer") err := json.Unmarshal(streamedBody.body, &requestBody) if err != nil { logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") } + } else { + return nil, nil } } else { if err := json.Unmarshal(body.GetBody(), &requestBody); err != nil { diff --git a/pkg/body-based-routing/handlers/server_test.go b/pkg/bbr/handlers/server_test.go similarity index 100% rename from pkg/body-based-routing/handlers/server_test.go rename to pkg/bbr/handlers/server_test.go diff --git a/pkg/body-based-routing/metrics/metrics.go b/pkg/bbr/metrics/metrics.go similarity index 59% rename from pkg/body-based-routing/metrics/metrics.go rename to pkg/bbr/metrics/metrics.go index fc3538fba..4aec0e16d 100644 --- a/pkg/body-based-routing/metrics/metrics.go +++ b/pkg/bbr/metrics/metrics.go @@ -19,49 +19,48 @@ package metrics import ( "sync" + "github.com/prometheus/client_golang/prometheus" compbasemetrics "k8s.io/component-base/metrics" - "k8s.io/component-base/metrics/legacyregistry" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics" ) const component = "bbr" var ( - successCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ - Subsystem: component, - Name: "success_total", - Help: "Count of successes pulling model name from body and injecting it in the request headers.", - StabilityLevel: compbasemetrics.ALPHA, + successCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: component, + Name: "success_total", + Help: metricsutil.HelpMsgWithStability("Count of successes pulling model name from body and injecting it in the request headers.", compbasemetrics.ALPHA), }, []string{}, ) - modelNotInBodyCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ - Subsystem: component, - Name: "model_not_in_body_total", - Help: "Count of times the model was not present in the request body.", - StabilityLevel: compbasemetrics.ALPHA, + modelNotInBodyCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: component, + Name: "model_not_in_body_total", + Help: metricsutil.HelpMsgWithStability("Count of times the model was not present in the request body.", compbasemetrics.ALPHA), }, []string{}, ) - modelNotParsedCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ - Subsystem: component, - Name: "model_not_parsed_total", - Help: "Count of times the model was in the request body but we could not parse it.", - StabilityLevel: compbasemetrics.ALPHA, + modelNotParsedCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: component, + Name: "model_not_parsed_total", + Help: metricsutil.HelpMsgWithStability("Count of times the model was in the request body but we could not parse it.", compbasemetrics.ALPHA), }, []string{}, ) // TODO: Uncomment and use this metrics once the core server implementation has handling to skip body parsing if header exists. /* - modelAlreadyPresentInHeaderCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ + modelAlreadyPresentInHeaderCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ Subsystem: component, Name: "model_already_present_in_header_total", Help: "Count of times the model was already present in request headers.", - StabilityLevel: compbasemetrics.ALPHA, }, []string{}, ) @@ -73,10 +72,10 @@ var registerMetrics sync.Once // Register all metrics. 
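The migration above trades k8s.io/component-base metric types for plain client_golang collectors on controller-runtime's shared registry, which the manager already exposes on its metrics endpoint; the stability level moves into the help string because client_golang has no StabilityLevel field. A minimal sketch of that shape, with an illustrative metric name and help format:

package metrics

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
	// exampleCounter follows the pattern of the counters above: a plain
	// prometheus CounterVec with the stability level folded into Help.
	exampleCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "bbr",
			Name:      "example_total",
			Help:      "[ALPHA] Count of example events.",
		},
		[]string{},
	)

	registerOnce sync.Once
)

// Register attaches the collectors to the controller-runtime registry exactly once.
func Register() {
	registerOnce.Do(func() {
		crmetrics.Registry.MustRegister(exampleCounter)
	})
}

Handlers then increment with exampleCounter.WithLabelValues().Inc(), and the metric is served without any extra HTTP wiring.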
func Register() { registerMetrics.Do(func() { - legacyregistry.MustRegister(successCounter) - legacyregistry.MustRegister(modelNotInBodyCounter) - legacyregistry.MustRegister(modelNotParsedCounter) - // legacyregistry.MustRegister(modelAlreadyPresentInHeaderCounter) + metrics.Registry.MustRegister(successCounter) + metrics.Registry.MustRegister(modelNotInBodyCounter) + metrics.Registry.MustRegister(modelNotParsedCounter) + // metrics.Registry.MustRegister(modelAlreadyPresentInHeaderCounter) }) } diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/bbr/server/runserver.go similarity index 96% rename from pkg/body-based-routing/server/runserver.go rename to pkg/bbr/server/runserver.go index 1646aa5a6..2001b7ff0 100644 --- a/pkg/body-based-routing/server/runserver.go +++ b/pkg/bbr/server/runserver.go @@ -27,7 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/handlers" ) // ExtProcServerRunner provides methods to manage an external process server. diff --git a/pkg/epp/README.md b/pkg/epp/README.md index 1bf479937..99d1bf06b 100644 --- a/pkg/epp/README.md +++ b/pkg/epp/README.md @@ -1,5 +1,5 @@ # The EndPoint Picker (EPP) -This package provides the reference implementation for the Endpoint Picker (EPP). As demonistrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and interacts with the model servers through the defined [model server protocol](../..//docs/proposals/003-model-server-protocol). +This package provides the reference implementation for the Endpoint Picker (EPP). As demonstrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and interacts with the model servers through the defined [model server protocol](../..//docs/proposals/003-model-server-protocol). ![Architecture Diagram](../../docs/endpoint-picker.svg) diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index 7fd4970db..5599d4ec0 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -24,24 +24,24 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // FakePodMetrics is an implementation of PodMetrics that doesn't run the async refresh loop. 
type FakePodMetrics struct { - Pod *Pod - Metrics *Metrics + Pod *backend.Pod + Metrics *MetricsState } func (fpm *FakePodMetrics) String() string { return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics()) } -func (fpm *FakePodMetrics) GetPod() *Pod { +func (fpm *FakePodMetrics) GetPod() *backend.Pod { return fpm.Pod } -func (fpm *FakePodMetrics) GetMetrics() *Metrics { +func (fpm *FakePodMetrics) GetMetrics() *MetricsState { return fpm.Metrics } func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { @@ -53,10 +53,10 @@ type FakePodMetricsClient struct { errMu sync.RWMutex Err map[types.NamespacedName]error resMu sync.RWMutex - Res map[types.NamespacedName]*Metrics + Res map[types.NamespacedName]*MetricsState } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, port int32) (*MetricsState, error) { f.errMu.RLock() err, ok := f.Err[pod.NamespacedName] f.errMu.RUnlock() @@ -73,7 +73,7 @@ func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *Pod, exist return res.Clone(), nil } -func (f *FakePodMetricsClient) SetRes(new map[types.NamespacedName]*Metrics) { +func (f *FakePodMetricsClient) SetRes(new map[types.NamespacedName]*MetricsState) { f.resMu.Lock() defer f.resMu.Unlock() f.Res = new @@ -84,11 +84,3 @@ func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) { defer f.errMu.Unlock() f.Err = new } - -type FakeDataStore struct { - Res map[string]*v1alpha2.InferenceModel -} - -func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) { - return fds.Res[modelName] -} diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go index d71dc3fa3..7dc1a8b8b 100644 --- a/pkg/epp/backend/metrics/logger.go +++ b/pkg/epp/backend/metrics/logger.go @@ -32,6 +32,7 @@ const ( // Note currently the EPP treats stale metrics same as fresh. // TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336 metricsValidityPeriod = 5 * time.Second + debugPrintInterval = 5 * time.Second ) type Datastore interface { @@ -46,17 +47,16 @@ type Datastore interface { // enabled; 2) flushes Prometheus metrics about the backend servers. func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) { logger := log.FromContext(ctx) - - // Periodically flush prometheus metrics for inference pool + ticker := time.NewTicker(refreshPrometheusMetricsInterval) go func() { + defer ticker.Stop() for { select { case <-ctx.Done(): logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") return - default: - time.Sleep(refreshPrometheusMetricsInterval) - flushPrometheusMetricsOnce(logger, datastore) + case <-ticker.C: // Periodically refresh prometheus metrics for inference pool + refreshPrometheusMetrics(logger, datastore) } } }() @@ -64,13 +64,14 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh // Periodically print out the pods and metrics for DEBUGGING. 
if logger := logger.V(logutil.DEBUG); logger.Enabled() { go func() { + ticker := time.NewTicker(debugPrintInterval) + defer ticker.Stop() for { select { case <-ctx.Done(): logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") return - default: - time.Sleep(5 * time.Second) + case <-ticker.C: podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool { return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod }) @@ -85,11 +86,11 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh } } -func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) { +func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) { pool, err := datastore.PoolGet() if err != nil { // No inference pool or not initialize. - logger.V(logutil.DEFAULT).Info("pool is not initialized, skipping flushing metrics") + logger.V(logutil.DEFAULT).Info("Pool is not initialized, skipping refreshing metrics") return } @@ -97,7 +98,7 @@ func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) { var queueTotal int podMetrics := datastore.PodGetAll() - logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) + logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics)) if len(podMetrics) == 0 { return } diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index d48b1dc5b..8899e00ce 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -26,6 +26,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" ) const ( @@ -39,15 +40,8 @@ type PodMetricsClientImpl struct { MetricMapping *MetricMapping } -// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an -// updated one. -func (p *PodMetricsClientImpl) FetchMetrics( - ctx context.Context, - pod *Pod, - existing *Metrics, - port int32, -) (*Metrics, error) { - +// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one. +func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, port int32) (*MetricsState, error) { // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" @@ -79,8 +73,8 @@ func (p *PodMetricsClientImpl) FetchMetrics( // promToPodMetrics updates internal pod metrics with scraped Prometheus metrics. 
func (p *PodMetricsClientImpl) promToPodMetrics(
 	metricFamilies map[string]*dto.MetricFamily,
-	existing *Metrics,
-) (*Metrics, error) {
+	existing *MetricsState,
+) (*MetricsState, error) {
 	var errs error
 	updated := existing.Clone()
@@ -109,6 +103,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(

 	if loraMetrics != nil {
 		updated.ActiveModels = make(map[string]int)
+		updated.WaitingModels = make(map[string]int)
 		for _, label := range loraMetrics.GetLabel() {
 			if label.GetName() == LoraInfoRunningAdaptersMetricName {
 				if label.GetValue() != "" {
@@ -122,7 +117,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics(
 				if label.GetValue() != "" {
 					adapterList := strings.Split(label.GetValue(), ",")
 					for _, adapter := range adapterList {
-						updated.ActiveModels[adapter] = 0
+						updated.WaitingModels[adapter] = 0
 					}
 				}
 			}
diff --git a/pkg/epp/backend/metrics/metrics_state.go b/pkg/epp/backend/metrics/metrics_state.go
new file mode 100644
index 000000000..0215ac05f
--- /dev/null
+++ b/pkg/epp/backend/metrics/metrics_state.go
@@ -0,0 +1,80 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"fmt"
+	"time"
+)
+
+// NewMetricsState initializes a new MetricsState and returns its pointer.
+func NewMetricsState() *MetricsState {
+	return &MetricsState{
+		ActiveModels:  make(map[string]int),
+		WaitingModels: make(map[string]int),
+	}
+}
+
+// MetricsState holds the latest state of the metrics that were scraped from a pod.
+type MetricsState struct {
+	// ActiveModels is a set of models (including LoRA adapters) that are currently cached to GPU.
+	ActiveModels  map[string]int
+	WaitingModels map[string]int
+	// MaxActiveModels is the maximum number of models that can be loaded to GPU.
+	MaxActiveModels         int
+	RunningQueueSize        int
+	WaitingQueueSize        int
+	KVCacheUsagePercent     float64
+	KvCacheMaxTokenCapacity int
+
+	// UpdateTime records the last time the metrics were updated.
+	UpdateTime time.Time
+}
+
+// String returns a string with all MetricsState information.
+func (s *MetricsState) String() string {
+	if s == nil {
+		return ""
+	}
+	return fmt.Sprintf("%+v", *s)
+}
+
+// Clone creates a copy of MetricsState and returns its pointer.
+// Clone returns nil if the object being cloned is nil.
+func (s *MetricsState) Clone() *MetricsState { + if s == nil { + return nil + } + activeModels := make(map[string]int, len(s.ActiveModels)) + for key, value := range s.ActiveModels { + activeModels[key] = value + } + waitingModels := make(map[string]int, len(s.WaitingModels)) + for key, value := range s.WaitingModels { + waitingModels[key] = value + } + return &MetricsState{ + ActiveModels: activeModels, + WaitingModels: waitingModels, + MaxActiveModels: s.MaxActiveModels, + RunningQueueSize: s.RunningQueueSize, + WaitingQueueSize: s.WaitingQueueSize, + KVCacheUsagePercent: s.KVCacheUsagePercent, + KvCacheMaxTokenCapacity: s.KvCacheMaxTokenCapacity, + UpdateTime: s.UpdateTime, + } +} diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index d0396bf74..bfc3e01fa 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -30,6 +30,7 @@ import ( "google.golang.org/protobuf/proto" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -376,8 +377,8 @@ func TestPromToPodMetrics(t *testing.T) { name string metricFamilies map[string]*dto.MetricFamily mapping *MetricMapping - existingMetrics *Metrics - expectedMetrics *Metrics + existingMetrics *MetricsState + expectedMetrics *MetricsState expectedErr error // Count of expected errors }{ { @@ -400,11 +401,12 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &Metrics{}, - expectedMetrics: &Metrics{ + existingMetrics: &MetricsState{}, + expectedMetrics: &MetricsState{ WaitingQueueSize: 7, KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0}, + WaitingModels: map[string]int{"lora3": 0}, MaxActiveModels: 3, }, }, @@ -416,8 +418,8 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &Metrics{ActiveModels: map[string]int{}}, - expectedMetrics: &Metrics{ActiveModels: map[string]int{}}, + existingMetrics: &MetricsState{ActiveModels: map[string]int{}, WaitingModels: map[string]int{}}, + expectedMetrics: &MetricsState{ActiveModels: map[string]int{}, WaitingModels: map[string]int{}}, expectedErr: multierr.Combine(errors.New("metric family \"vllm_waiting\" not found"), errors.New("metric family \"vllm_usage\" not found"), errors.New("metric family \"vllm:lora_requests_info\" not found")), }, { @@ -435,11 +437,12 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &Metrics{}, - expectedMetrics: &Metrics{ + existingMetrics: &MetricsState{}, + expectedMetrics: &MetricsState{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0}, + WaitingModels: map[string]int{"lora3": 0}, MaxActiveModels: 3, }, expectedErr: errors.New("metric family \"vllm_waiting\" not found"), @@ -454,9 +457,10 @@ func TestPromToPodMetrics(t *testing.T) { mapping: &MetricMapping{ LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - 
existingMetrics: &Metrics{}, - expectedMetrics: &Metrics{ + existingMetrics: &MetricsState{}, + expectedMetrics: &MetricsState{ ActiveModels: map[string]int{"lora1": 0}, + WaitingModels: map[string]int{}, MaxActiveModels: 0, // Should still default to 0. }, @@ -483,14 +487,14 @@ func TestPromToPodMetrics(t *testing.T) { // there's no server running on the specified port. func TestFetchMetrics(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) - pod := &Pod{ + pod := &backend.Pod{ Address: "127.0.0.1", NamespacedName: types.NamespacedName{ Namespace: "test", Name: "pod", }, } - existing := &Metrics{} + existing := &MetricsState{} p := &PodMetricsClientImpl{} // No MetricMapping needed for this basic test _, err := p.FetchMetrics(ctx, pod, existing, 9999) // Use a port that's unlikely to be in use. diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index cfb6b1384..3471ddf3d 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -35,70 +36,73 @@ const ( ) type podMetrics struct { - pod atomic.Pointer[Pod] - metrics atomic.Pointer[Metrics] + pod atomic.Pointer[backend.Pod] + metrics atomic.Pointer[MetricsState] pmc PodMetricsClient ds Datastore interval time.Duration - parentCtx context.Context - once sync.Once // ensure the StartRefreshLoop is only called once. + startOnce sync.Once // ensures the refresh loop goroutine is started only once + stopOnce sync.Once // ensures the done channel is closed only once done chan struct{} logger logr.Logger } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) + FetchMetrics(ctx context.Context, pod *backend.Pod, existing *MetricsState, port int32) (*MetricsState, error) } func (pm *podMetrics) String() string { return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics()) } -func (pm *podMetrics) GetPod() *Pod { +func (pm *podMetrics) GetPod() *backend.Pod { return pm.pod.Load() } -func (pm *podMetrics) GetMetrics() *Metrics { +func (pm *podMetrics) GetMetrics() *MetricsState { return pm.metrics.Load() } -func (pm *podMetrics) UpdatePod(in *corev1.Pod) { - pm.pod.Store(toInternalPod(in)) +func (pm *podMetrics) UpdatePod(pod *corev1.Pod) { + pm.pod.Store(toInternalPod(pod)) } -func toInternalPod(in *corev1.Pod) *Pod { - return &Pod{ +func toInternalPod(pod *corev1.Pod) *backend.Pod { + labels := make(map[string]string, len(pod.GetLabels())) + for key, value := range pod.GetLabels() { + labels[key] = value + } + return &backend.Pod{ NamespacedName: types.NamespacedName{ - Name: in.Name, - Namespace: in.Namespace, + Name: pod.Name, + Namespace: pod.Namespace, }, - Address: in.Status.PodIP, + Address: pod.Status.PodIP, + Labels: labels, } } // start starts a goroutine exactly once to periodically update metrics. 
The goroutine will be -// stopped either when stop() is called, or the parentCtx is cancelled. -func (pm *podMetrics) startRefreshLoop() { - pm.once.Do(func() { +// stopped either when stop() is called, or the given ctx is cancelled. +func (pm *podMetrics) startRefreshLoop(ctx context.Context) { + pm.startOnce.Do(func() { go func() { pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod()) + ticker := time.NewTicker(pm.interval) + defer ticker.Stop() for { select { case <-pm.done: return - case <-pm.parentCtx.Done(): + case <-ctx.Done(): return - default: - } - - err := pm.refreshMetrics() - if err != nil { - pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod()) + case <-ticker.C: // refresh metrics periodically + if err := pm.refreshMetrics(); err != nil { + pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod()) + } } - - time.Sleep(pm.interval) } }() }) @@ -135,5 +139,7 @@ func (pm *podMetrics) refreshMetrics() error { func (pm *podMetrics) StopRefreshLoop() { pm.logger.V(logutil.DEFAULT).Info("Stopping refresher", "pod", pm.GetPod()) - close(pm.done) + pm.stopOnce.Do(func() { + close(pm.done) + }) } diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go index cf6698ca9..796b636b4 100644 --- a/pkg/epp/backend/metrics/pod_metrics_test.go +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -36,7 +36,7 @@ var ( Namespace: "default", }, } - initial = &Metrics{ + initial = &MetricsState{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -44,8 +44,9 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } - updated = &Metrics{ + updated = &MetricsState{ WaitingQueueSize: 9999, KVCacheUsagePercent: 0.99, MaxActiveModels: 99, @@ -53,6 +54,7 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } ) @@ -67,16 +69,17 @@ func TestMetricsRefresh(t *testing.T) { namespacedName := types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} // Use SetRes to simulate an update of metrics from the pod. // Verify that the metrics are updated. - pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: initial}) + pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: initial}) condition := func(collect *assert.CollectT) { - assert.True(collect, cmp.Equal(pm.GetMetrics(), initial, cmpopts.IgnoreFields(Metrics{}, "UpdateTime"))) + assert.True(collect, cmp.Equal(pm.GetMetrics(), initial, cmpopts.IgnoreFields(MetricsState{}, "UpdateTime"))) } assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) // Stop the loop, and simulate metric update again, this time the PodMetrics won't get the // new update. pm.StopRefreshLoop() - pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: updated}) + time.Sleep(pmf.refreshMetricsInterval * 2 /* small buffer for robustness */) + pmc.SetRes(map[types.NamespacedName]*MetricsState{namespacedName: updated}) // Still expect the same condition (no metrics update). 
assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) } diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index 17db23b4c..80b708555 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -19,13 +19,12 @@ package metrics import ( "context" - "fmt" "sync" "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" ) func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval time.Duration) *PodMetricsFactory { @@ -41,82 +40,27 @@ type PodMetricsFactory struct { } func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics { + pod := toInternalPod(in) pm := &podMetrics{ pmc: f.pmc, ds: ds, interval: f.refreshMetricsInterval, - parentCtx: parentCtx, - once: sync.Once{}, + startOnce: sync.Once{}, + stopOnce: sync.Once{}, done: make(chan struct{}), - logger: log.FromContext(parentCtx), + logger: log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName), } - pm.pod.Store(toInternalPod(in)) - pm.metrics.Store(newMetrics()) + pm.pod.Store(pod) + pm.metrics.Store(NewMetricsState()) - pm.startRefreshLoop() + pm.startRefreshLoop(parentCtx) return pm } type PodMetrics interface { - GetPod() *Pod - GetMetrics() *Metrics + GetPod() *backend.Pod + GetMetrics() *MetricsState UpdatePod(*corev1.Pod) StopRefreshLoop() String() string } - -type Pod struct { - NamespacedName types.NamespacedName - Address string -} - -func (p *Pod) String() string { - if p == nil { - return "" - } - return fmt.Sprintf("%+v", *p) -} - -type Metrics struct { - // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. - ActiveModels map[string]int - // MaxActiveModels is the maximum number of models that can be loaded to GPU. - MaxActiveModels int - RunningQueueSize int - WaitingQueueSize int - KVCacheUsagePercent float64 - KvCacheMaxTokenCapacity int - - // UpdateTime record the last time when the metrics were updated. - UpdateTime time.Time -} - -func newMetrics() *Metrics { - return &Metrics{ - ActiveModels: make(map[string]int), - } -} - -func (m *Metrics) String() string { - if m == nil { - return "" - } - return fmt.Sprintf("%+v", *m) -} - -func (m *Metrics) Clone() *Metrics { - cm := make(map[string]int, len(m.ActiveModels)) - for k, v := range m.ActiveModels { - cm[k] = v - } - clone := &Metrics{ - ActiveModels: cm, - MaxActiveModels: m.MaxActiveModels, - RunningQueueSize: m.RunningQueueSize, - WaitingQueueSize: m.WaitingQueueSize, - KVCacheUsagePercent: m.KVCacheUsagePercent, - KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity, - UpdateTime: m.UpdateTime, - } - return clone -} diff --git a/pkg/epp/backend/pod.go b/pkg/epp/backend/pod.go new file mode 100644 index 000000000..3340a3d70 --- /dev/null +++ b/pkg/epp/backend/pod.go @@ -0,0 +1,54 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package backend + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/types" +) + +type Pod struct { + NamespacedName types.NamespacedName + Address string + Labels map[string]string +} + +func (p *Pod) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("%+v", *p) +} + +func (p *Pod) Clone() *Pod { + if p == nil { + return nil + } + clonedLabels := make(map[string]string, len(p.Labels)) + for key, value := range p.Labels { + clonedLabels[key] = value + } + return &Pod{ + NamespacedName: types.NamespacedName{ + Name: p.NamespacedName.Name, + Namespace: p.NamespacedName.Namespace, + }, + Address: p.Address, + Labels: clonedLabels, + } +} diff --git a/pkg/epp/common/config/defaults.go b/pkg/epp/common/config/defaults.go new file mode 100644 index 000000000..89fd6f493 --- /dev/null +++ b/pkg/epp/common/config/defaults.go @@ -0,0 +1,28 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package config holds common configuration default values used across +// different EPP components. +package config + +const ( + // DefaultKVCacheThreshold is the default KV cache utilization (0.0 to 1.0) + // threshold. + DefaultKVCacheThreshold = 0.8 + // DefaultQueueThresholdCritical is the default backend waiting queue size + // threshold. + DefaultQueueThresholdCritical = 5 +) diff --git a/pkg/epp/common/config/loader/configloader.go b/pkg/epp/common/config/loader/configloader.go new file mode 100644 index 000000000..864a3ea72 --- /dev/null +++ b/pkg/epp/common/config/loader/configloader.go @@ -0,0 +1,189 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package loader + +import ( + "errors" + "fmt" + "os" + + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + + "sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1" + configapi "sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" +) + +var scheme = runtime.NewScheme() + +func init() { + configapi.SchemeBuilder.Register(configapi.RegisterDefaults) + utilruntime.Must(configapi.Install(scheme)) +} + +// Load config either from supplied text or from a file +func LoadConfig(configText []byte, fileName string) (*configapi.EndpointPickerConfig, error) { + var err error + if len(configText) == 0 { + configText, err = os.ReadFile(fileName) + if err != nil { + return nil, fmt.Errorf("failed to load config file. Error: %s", err) + } + } + + theConfig := &configapi.EndpointPickerConfig{} + + codecs := serializer.NewCodecFactory(scheme, serializer.EnableStrict) + err = runtime.DecodeInto(codecs.UniversalDecoder(), configText, theConfig) + if err != nil { + return nil, fmt.Errorf("the configuration is invalid. Error: %s", err) + } + + // Validate loaded configuration + err = validateConfiguration(theConfig) + if err != nil { + return nil, fmt.Errorf("the configuration is invalid. error: %s", err) + } + return theConfig, nil +} + +func LoadPluginReferences(thePlugins []configapi.PluginSpec, handle plugins.Handle) error { + for _, pluginConfig := range thePlugins { + thePlugin, err := instantiatePlugin(pluginConfig, handle) + if err != nil { + return err + } + handle.Plugins().AddPlugin(pluginConfig.Name, thePlugin) + } + return nil +} + +func LoadSchedulerConfig(configProfiles []v1alpha1.SchedulingProfile, handle plugins.Handle) (*scheduling.SchedulerConfig, error) { + + var profiles = map[string]*framework.SchedulerProfile{} + + for _, configProfile := range configProfiles { + profile := framework.SchedulerProfile{} + + for _, plugin := range configProfile.Plugins { + var err error + thePlugin := handle.Plugins().Plugin(plugin.PluginRef) + if theScorer, ok := thePlugin.(framework.Scorer); ok { + if plugin.Weight == nil { + return nil, fmt.Errorf("scorer '%s' is missing a weight", plugin.PluginRef) + } + thePlugin = framework.NewWeightedScorer(theScorer, *plugin.Weight) + } + err = profile.AddPlugins(thePlugin) + if err != nil { + return nil, err + } + } + profiles[configProfile.Name] = &profile + } + + var profileHandler framework.ProfileHandler + var profileHandlerName string + + for pluginName, thePlugin := range handle.Plugins().GetAllPluginsWithNames() { + if theProfileHandler, ok := thePlugin.(framework.ProfileHandler); ok { + if profileHandler != nil { + return nil, fmt.Errorf("only one profile handler is allowed. Both %s and %s are profile handlers", profileHandlerName, pluginName) + } + profileHandler = theProfileHandler + profileHandlerName = pluginName + } + } + if profileHandler == nil { + return nil, errors.New("no profile handler was specified") + } + + return scheduling.NewSchedulerConfig(profileHandler, profiles), nil +} + +func instantiatePlugin(pluginSpec configapi.PluginSpec, handle plugins.Handle) (plugins.Plugin, error) { + factory, ok := plugins.Registry[pluginSpec.Type] + if !ok { + return nil, fmt.Errorf("failed to instantiate the plugin. 
plugin type %s not found", pluginSpec.Type) + } + thePlugin, err := factory(pluginSpec.Name, pluginSpec.Parameters, handle) + if err != nil { + return nil, fmt.Errorf("failed to instantiate the plugin type %s. Error: %s", pluginSpec.Type, err) + } + return thePlugin, err +} + +func validateConfiguration(theConfig *configapi.EndpointPickerConfig) error { + names := make(map[string]struct{}) + + for _, pluginConfig := range theConfig.Plugins { + if pluginConfig.Type == "" { + return fmt.Errorf("plugin definition for %s is missing a type", pluginConfig.Name) + } + + if _, ok := names[pluginConfig.Name]; ok { + return fmt.Errorf("plugin name %s used more than once", pluginConfig.Name) + } + names[pluginConfig.Name] = struct{}{} + + _, ok := plugins.Registry[pluginConfig.Type] + if !ok { + return fmt.Errorf("plugin type %s is not found", pluginConfig.Type) + } + } + + if len(theConfig.SchedulingProfiles) == 0 { + return errors.New("there must be at least one scheduling profile in the configuration") + } + + names = map[string]struct{}{} + for _, profile := range theConfig.SchedulingProfiles { + if profile.Name == "" { + return errors.New("SchedulingProfiles need a name") + } + + if _, ok := names[profile.Name]; ok { + return fmt.Errorf("the name %s has been specified for more than one SchedulingProfile", profile.Name) + } + names[profile.Name] = struct{}{} + + if len(profile.Plugins) == 0 { + return errors.New("SchedulingProfiles need at least one plugin") + } + for _, plugin := range profile.Plugins { + if len(plugin.PluginRef) == 0 { + return errors.New("SchedulingProfile's plugins need a plugin reference") + } + + notFound := true + for _, pluginConfig := range theConfig.Plugins { + if plugin.PluginRef == pluginConfig.Name { + notFound = false + break + } + } + if notFound { + return errors.New(plugin.PluginRef + " is a reference to an undefined Plugin") + } + } + } + return nil +} diff --git a/pkg/epp/common/config/loader/configloader_test.go b/pkg/epp/common/config/loader/configloader_test.go new file mode 100644 index 000000000..a6a982029 --- /dev/null +++ b/pkg/epp/common/config/loader/configloader_test.go @@ -0,0 +1,792 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package loader + +import ( + "context" + "encoding/json" + "testing" + + "github.com/google/go-cmp/cmp" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + configapi "sigs.k8s.io/gateway-api-inference-extension/api/config/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/multi/prefix" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + "sigs.k8s.io/gateway-api-inference-extension/test/utils" +) + +const ( + testProfileHandlerType = "test-profile-handler" + test1Type = "test-one" + test2Type = "test-two" + testPickerType = "test-picker" +) + +func TestLoadConfiguration(t *testing.T) { + test2Weight := 50 + + registerTestPlugins() + + goodConfig := &configapi.EndpointPickerConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "EndpointPickerConfig", + APIVersion: "inference.networking.x-k8s.io/v1alpha1", + }, + Plugins: []configapi.PluginSpec{ + { + Name: "test1", + Type: test1Type, + Parameters: json.RawMessage("{\"threshold\":10}"), + }, + { + Name: "profileHandler", + Type: "test-profile-handler", + }, + { + Name: test2Type, + Type: test2Type, + Parameters: json.RawMessage("{\"hashBlockSize\":32}"), + }, + { + Name: "testPicker", + Type: testPickerType, + }, + }, + SchedulingProfiles: []configapi.SchedulingProfile{ + { + Name: "default", + Plugins: []configapi.SchedulingPlugin{ + { + PluginRef: "test1", + }, + { + PluginRef: "test-two", + Weight: &test2Weight, + }, + { + PluginRef: "testPicker", + }, + }, + }, + }, + } + + tests := []struct { + name string + configText string + configFile string + want *configapi.EndpointPickerConfig + wantErr bool + }{ + { + name: "success", + configText: successConfigText, + configFile: "", + want: goodConfig, + wantErr: false, + }, + { + name: "errorBadYaml", + configText: errorBadYamlText, + configFile: "", + wantErr: true, + }, + { + name: "errorNoProfileHandler", + configText: errorNoProfileHandlerText, + configFile: "", + wantErr: true, + }, + { + name: "errorBadPluginReferenceText", + configText: errorBadPluginReferenceText, + configFile: "", + wantErr: true, + }, + { + name: "errorBadPluginReferencePluginText", + configText: errorBadPluginReferencePluginText, + configFile: "", + wantErr: true, + }, + { + name: "errorNoProfiles", + configText: errorNoProfilesText, + configFile: "", + wantErr: true, + }, + { + name: "errorNoProfileName", + configText: errorNoProfileNameText, + configFile: "", + wantErr: true, + }, + { + name: "errorNoProfilePlugins", + configText: errorNoProfilePluginsText, + configFile: "", + wantErr: true, + }, + { + name: "errorBadProfilePlugin", + configText: errorBadProfilePluginText, + configFile: "", + wantErr: true, + }, + { + name: "errorBadProfilePluginRef", + configText: errorBadProfilePluginRefText, + configFile: "", + wantErr: true, + }, + { + name: "errorDuplicatePlugin", + configText: errorDuplicatePluginText, + configFile: "", + wantErr: true, + }, + { + name: "errorDuplicateProfile", + configText: errorDuplicateProfileText, + configFile: "", + wantErr: true, + }, + { + name: "successFromFile", + configText: "", + configFile: 
"../../../../../test/testdata/configloader_1_test.yaml", + want: goodConfig, + wantErr: false, + }, + { + name: "noSuchFile", + configText: "", + configFile: "../../../../../test/testdata/configloader_error_test.yaml", + wantErr: true, + }, + } + + for _, test := range tests { + got, err := LoadConfig([]byte(test.configText), test.configFile) + if err != nil { + if !test.wantErr { + t.Fatalf("In test %s LoadConfig returned unexpected error: %v, want %v", test.name, err, test.wantErr) + } + t.Logf("error was %s", err) + } else { + if test.wantErr { + t.Fatalf("In test %s LoadConfig did not return an expected error", test.name) + } + if diff := cmp.Diff(test.want, got); diff != "" { + t.Errorf("In test %s LoadConfig returned unexpected response, diff(-want, +got): %v", test.name, diff) + } + } + } +} + +func TestLoadPluginReferences(t *testing.T) { + ctx := context.Background() + theConfig, err := LoadConfig([]byte(successConfigText), "") + if err != nil { + t.Fatalf("LoadConfig returned unexpected error: %v", err) + } + handle := utils.NewTestHandle(ctx) + err = LoadPluginReferences(theConfig.Plugins, handle) + if err != nil { + t.Fatalf("LoadPluginReferences returned unexpected error: %v", err) + } + if len(handle.Plugins().GetAllPlugins()) == 0 { + t.Fatalf("LoadPluginReferences returned an empty set of references") + } + if t1 := handle.Plugins().Plugin("test1"); t1 == nil { + t.Fatalf("LoadPluginReferences returned references did not contain test1") + } else if _, ok := t1.(*test1); !ok { + t.Fatalf("LoadPluginReferences returned references value for test1 has the wrong type %#v", t1) + } + + theConfig, err = LoadConfig([]byte(errorBadPluginReferenceParametersText), "") + if err != nil { + t.Fatalf("LoadConfig returned unexpected error: %v", err) + } + err = LoadPluginReferences(theConfig.Plugins, utils.NewTestHandle(ctx)) + if err == nil { + t.Fatalf("LoadPluginReferences did not return the expected error") + } +} + +func TestInstantiatePlugin(t *testing.T) { + plugSpec := configapi.PluginSpec{Type: "plover"} + _, err := instantiatePlugin(plugSpec, utils.NewTestHandle(context.Background())) + if err == nil { + t.Fatalf("InstantiatePlugin did not return the expected error") + } +} + +func TestLoadSchedulerConfig(t *testing.T) { + tests := []struct { + name string + configText string + wantErr bool + }{ + { + name: "schedulerSuccess", + configText: successSchedulerConfigText, + wantErr: false, + }, + { + name: "errorBadPluginJson", + configText: errorBadPluginJsonText, + wantErr: true, + }, + { + name: "errorBadReferenceNoWeight", + configText: errorBadReferenceNoWeightText, + wantErr: true, + }, + { + name: "errorTwoPickers", + configText: errorTwoPickersText, + wantErr: true, + }, + { + name: "errorConfig", + configText: errorConfigText, + wantErr: true, + }, + { + name: "errorTwoProfileHandlers", + configText: errorTwoProfileHandlersText, + wantErr: true, + }, + { + name: "errorNoProfileHandlers", + configText: errorNoProfileHandlersText, + wantErr: true, + }, + } + + registerNeededPlgugins() + + ctx := context.Background() + + for _, test := range tests { + theConfig, err := LoadConfig([]byte(test.configText), "") + if err != nil { + if test.wantErr { + continue + } + t.Fatalf("LoadConfig returned unexpected error: %v", err) + } + handle := utils.NewTestHandle(ctx) + err = LoadPluginReferences(theConfig.Plugins, handle) + if err != nil { + if test.wantErr { + continue + } + t.Fatalf("LoadPluginReferences returned unexpected error: %v", err) + } + + _, err = 
LoadSchedulerConfig(theConfig.SchedulingProfiles, handle) + if err != nil { + if !test.wantErr { + t.Errorf("LoadSchedulerConfig returned an unexpected error. error %v", err) + } + } else if test.wantErr { + t.Errorf("LoadSchedulerConfig did not return an expected error (%s)", test.name) + } + } +} + +func registerNeededPlgugins() { + plugins.Register(filter.LowQueueFilterType, filter.LowQueueFilterFactory) + plugins.Register(prefix.PrefixCachePluginType, prefix.PrefixCachePluginFactory) + plugins.Register(picker.MaxScorePickerType, picker.MaxScorePickerFactory) + plugins.Register(picker.RandomPickerType, picker.RandomPickerFactory) + plugins.Register(profile.SingleProfileHandlerType, profile.SingleProfileHandlerFactory) +} + +// The following multi-line string constants, cause false positive lint errors (dupword) + +// valid configuration +// +//nolint:dupword +const successConfigText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +- name: profileHandler + type: test-profile-handler +- type: test-two + parameters: + hashBlockSize: 32 +- name: testPicker + type: test-picker +schedulingProfiles: +- name: default + plugins: + - pluginRef: test1 + - pluginRef: test-two + weight: 50 + - pluginRef: testPicker +` + +// YAML does not follow expected structure of config +// +//nolint:dupword +const errorBadYamlText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- testing 1 2 3 +` + +// missing required Plugin type +// +//nolint:dupword +const errorBadPluginReferenceText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- parameters: + a: 1234 +` + +// plugin type does not exist +// +//nolint:dupword +const errorBadPluginReferencePluginText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: testx + type: test-x +- name: profileHandler + type: test-profile-handler +` + +// missing required profile handler +// +//nolint:dupword +const errorNoProfileHandlerText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +schedulingProfiles: +- name: default +` + +// missing scheduling profiles +// +//nolint:dupword +const errorNoProfilesText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +- name: profileHandler + type: test-profile-handler +` + +// missing required scheduling profile name +// +//nolint:dupword +const errorNoProfileNameText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +- name: profileHandler + type: test-profile-handler +schedulingProfiles: +- plugins: + - pluginRef: test1 +` + +// missing plugins in scheduling profile +// +//nolint:dupword +const errorNoProfilePluginsText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +- name: profileHandler + type: test-profile-handler +schedulingProfiles: +- name: default +` + +// missing required plugin reference name, only weight is provided +// +//nolint:dupword +const errorBadProfilePluginText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig 
+plugins:
+- name: profileHandler
+  type: test-profile-handler
+schedulingProfiles:
+- name: default
+  plugins:
+  - weight: 10
+`
+
+// reference a non-existent plugin
+//
+//nolint:dupword
+const errorBadProfilePluginRefText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: profileHandler
+  type: test-profile-handler
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: plover
+`
+
+// invalid parameters (string provided where int is expected)
+//
+//nolint:dupword
+const errorBadPluginReferenceParametersText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: test1
+  type: test-one
+  parameters:
+    threshold: asdf
+- name: profileHandler
+  type: test-profile-handler
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: test1
+`
+
+// duplicate names in plugin list
+//
+//nolint:dupword
+const errorDuplicatePluginText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: test1
+  type: test-one
+  parameters:
+    threshold: 10
+- name: test1
+  type: test-one
+  parameters:
+    threshold: 20
+- name: profileHandler
+  type: test-profile-handler
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: test1
+`
+
+// duplicate scheduling profile name
+//
+//nolint:dupword
+const errorDuplicateProfileText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: test1
+  type: test-one
+  parameters:
+    threshold: 10
+- name: test2
+  pluginName: test-one
+  type:
+    threshold: 20
+- name: profileHandler
+  type: test-profile-handler
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: test1
+- name: default
+  plugins:
+  - pluginRef: test2
+`
+
+// compile-time type validation
+var _ framework.Filter = &test1{}
+
+type test1 struct {
+	Threshold int `json:"threshold"`
+}
+
+func (f *test1) Type() string {
+	return test1Type
+}
+
+func (f *test1) Name() string {
+	return "test-1"
+}
+
+// Filter filters out pods that don't meet the filter criteria.
+func (f *test1) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
+	return pods
+}
+
+// compile-time type validation
+var _ framework.Scorer = &test2{}
+var _ framework.PostCycle = &test2{}
+
+type test2 struct{}
+
+func (f *test2) Type() string {
+	return test2Type
+}
+
+func (f *test2) Name() string {
+	return "test-2"
+}
+
+func (m *test2) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, _ []types.Pod) map[types.Pod]float64 {
+	return map[types.Pod]float64{}
+}
+
+func (m *test2) PostCycle(_ context.Context, _ *types.CycleState, _ *types.ProfileRunResult) {}
+
+// compile-time type validation
+var _ framework.Picker = &testPicker{}
+
+type testPicker struct{}
+
+func (p *testPicker) Type() string {
+	return testPickerType
+}
+
+func (p *testPicker) Name() string {
+	return "test-picker"
+}
+
+func (p *testPicker) Pick(_ context.Context, _ *types.CycleState, _ []*types.ScoredPod) *types.ProfileRunResult {
+	return nil
+}
+
+// compile-time type validation
+var _ framework.ProfileHandler = &testProfileHandler{}
+
+type testProfileHandler struct{}
+
+func (p *testProfileHandler) Type() string {
+	return testProfileHandlerType
+}
+
+func (p *testProfileHandler) Name() string {
+	return "test-profile-handler"
+}
+
+func (p *testProfileHandler) Pick(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, _ map[string]*framework.SchedulerProfile, _ map[string]*types.ProfileRunResult) map[string]*framework.SchedulerProfile {
+	return nil
+}
+
+func (p *testProfileHandler) ProcessResults(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, _ map[string]*types.ProfileRunResult) (*types.SchedulingResult, error) {
+	return nil, nil
+}
+
+func registerTestPlugins() {
+	plugins.Register(test1Type,
+		func(name string, parameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
+			result := test1{}
+			err := json.Unmarshal(parameters, &result)
+			return &result, err
+		},
+	)
+
+	plugins.Register(test2Type,
+		func(name string, parameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
+			return &test2{}, nil
+		},
+	)
+
+	plugins.Register(testPickerType,
+		func(name string, parameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
+			return &testPicker{}, nil
+		},
+	)
+
+	plugins.Register(testProfileHandlerType,
+		func(name string, parameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
+			return &testProfileHandler{}, nil
+		},
+	)
+}
+
+// valid configuration
+//
+//nolint:dupword
+const successSchedulerConfigText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: lowQueue
+  type: low-queue
+  parameters:
+    threshold: 10
+- name: prefixCache
+  type: prefix-cache
+  parameters:
+    hashBlockSize: 32
+- name: maxScore
+  type: max-score
+- name: profileHandler
+  type: single-profile
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: lowQueue
+  - pluginRef: prefixCache
+    weight: 50
+  - pluginRef: maxScore
+`
+
+// invalid parameter configuration for plugin (string passed, int expected)
+//
+//nolint:dupword
+const errorBadPluginJsonText = `
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: EndpointPickerConfig
+plugins:
+- name: profileHandler
+  type: single-profile
+- name: prefixCache
+  type: prefix-cache
+  parameters:
+    hashBlockSize: asdf
+schedulingProfiles:
+- name: default
+  plugins:
+  - pluginRef: prefixCache
+    weight: 50
+`
+
+// missing weight for scorer
+//
+//nolint:dupword
+const errorBadReferenceNoWeightText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: profileHandler + type: single-profile +- name: prefixCache + type: prefix-cache + parameters: + hashBlockSize: 32 +schedulingProfiles: +- name: default + plugins: + - pluginRef: prefixCache +` + +// multiple pickers in scheduling profile +// +//nolint:dupword +const errorTwoPickersText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: profileHandler + type: single-profile +- name: maxScore + type: max-score +- name: random + type: random +schedulingProfiles: +- name: default + plugins: + - pluginRef: maxScore + - pluginRef: random +` + +// missing required scheduling profile +// +//nolint:dupword +const errorConfigText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: lowQueue + pluginName: low-queue + parameters: + threshold: 10 +` + +// multiple profile handlers when only one is allowed +// +//nolint:dupword +const errorTwoProfileHandlersText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: profileHandler + type: single-profile +- name: secondProfileHandler + type: single-profile +- name: maxScore + type: max-score +schedulingProfiles: +- name: default + plugins: + - pluginRef: maxScore +` + +// missing required profile handler +// +//nolint:dupword +const errorNoProfileHandlersText = ` +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: maxScore + type: max-score +schedulingProfiles: +- name: default + plugins: + - pluginRef: maxScore +` diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index a7f365b79..f1774ffbb 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -34,7 +34,7 @@ import ( ) type InferenceModelReconciler struct { - client.Client + client.Reader Record record.EventRecorder Datastore datastore.Datastore PoolNamespacedName types.NamespacedName @@ -88,7 +88,7 @@ func (c *InferenceModelReconciler) handleModelDeleted(ctx context.Context, req t logger.Info("InferenceModel removed from datastore", "poolRef", existing.Spec.PoolRef, "modelName", existing.Spec.ModelName) // TODO(#409): replace this backfill logic with one that is based on InferenceModel Ready conditions once those are set by an external controller. - updated, err := c.Datastore.ModelResync(ctx, c.Client, existing.Spec.ModelName) + updated, err := c.Datastore.ModelResync(ctx, c.Reader, existing.Spec.ModelName) if err != nil { return err } diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index cd1ff1fbd..838737a72 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -25,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -178,7 +179,8 @@ func TestInferenceModelReconciler(t *testing.T) { t.Run(test.name, func(t *testing.T) { // Create a fake client with no InferenceModel objects. 
scheme := runtime.NewScheme() - _ = v1alpha2.AddToScheme(scheme) + _ = clientgoscheme.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) initObjs := []client.Object{} if test.model != nil { initObjs = append(initObjs, test.model) @@ -186,6 +188,7 @@ func TestInferenceModelReconciler(t *testing.T) { for _, m := range test.modelsInAPIServer { initObjs = append(initObjs, m) } + fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(initObjs...). @@ -196,9 +199,9 @@ func TestInferenceModelReconciler(t *testing.T) { for _, m := range test.modelsInStore { ds.ModelSetIfOlder(m) } - ds.PoolSet(pool) + _ = ds.PoolSet(context.Background(), fakeClient, pool) reconciler := &InferenceModelReconciler{ - Client: fakeClient, + Reader: fakeClient, Record: record.NewFakeRecorder(10), Datastore: ds, PoolNamespacedName: types.NamespacedName{Name: pool.Name, Namespace: pool.Namespace}, diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index c92d4eccb..54781703b 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -18,10 +18,8 @@ package controller import ( "context" - "reflect" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -35,10 +33,9 @@ import ( // This implementation is just used for reading & maintaining data sync. The Gateway implementation // will have the proper controller that will create/manage objects on behalf of the server pool. type InferencePoolReconciler struct { - client.Client - Record record.EventRecorder - PoolNamespacedName types.NamespacedName - Datastore datastore.Datastore + client.Reader + Record record.EventRecorder + Datastore datastore.Datastore } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -62,28 +59,15 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques c.Datastore.Clear() return ctrl.Result{}, nil } - - c.updateDatastore(ctx, infPool) + // update pool in datastore + if err := c.Datastore.PoolSet(ctx, c.Reader, infPool); err != nil { + logger.Error(err, "Failed to update datastore") + return ctrl.Result{}, err + } return ctrl.Result{}, nil } -func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha2.InferencePool) { - logger := log.FromContext(ctx) - oldPool, err := c.Datastore.PoolGet() - c.Datastore.PoolSet(newPool) - if err != nil || !reflect.DeepEqual(newPool.Spec.Selector, oldPool.Spec.Selector) { - logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", newPool.Spec.Selector) - // A full resync is required to address two cases: - // 1) At startup, the pod events may get processed before the pool is synced with the datastore, - // and hence they will not be added to the store since pool selector is not known yet - // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need - // to resync the whole pool: remove pods in the store that don't match the new selector and add - // the ones that may have existed already to the store. - c.Datastore.PodResyncAll(ctx, c.Client, newPool) - } -} - func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferencePool{}). 
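
The `updateDatastore` helper removed above is folded into the datastore itself; the `PoolSet` doc comment later in this diff spells out the contract. Below is a minimal sketch of that behavior under a toy store type; names such as `sketchStore` and `podResyncAll` are illustrative stand-ins, not the repository's actual implementation.

```go
// Sketch only: illustrates the PoolSet contract, not the real datastore.
package datastore

import (
	"context"
	"reflect"
	"sync"

	"sigs.k8s.io/controller-runtime/pkg/client"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

type sketchStore struct {
	mu   sync.Mutex
	pool *v1alpha2.InferencePool
}

// PoolSet stores the pool and resyncs pods when the label selector changed.
// The resync covers the two cases the removed reconciler comment described:
// pod events racing ahead of the pool at startup, and in-place selector
// updates, which generate no pod events at all.
func (ds *sketchStore) PoolSet(ctx context.Context, reader client.Reader, pool *v1alpha2.InferencePool) error {
	if pool == nil {
		ds.clear() // a nil pool means the pool was deleted; drop all cached state
		return nil
	}
	ds.mu.Lock()
	defer ds.mu.Unlock()
	oldPool := ds.pool
	ds.pool = pool
	if oldPool == nil || !reflect.DeepEqual(pool.Spec.Selector, oldPool.Spec.Selector) {
		// Hypothetical helper: re-list pods against the new selector.
		return ds.podResyncAll(ctx, reader)
	}
	return nil
}

func (ds *sketchStore) clear() { /* omitted in this sketch */ }

func (ds *sketchStore) podResyncAll(ctx context.Context, reader client.Reader) error { return nil }
```

Moving the selector comparison behind the datastore interface leaves the reconciler as a thin read-and-store pass, while both resync cases stay handled in one place.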
diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 27c4238ee..c61abb327 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -77,7 +77,7 @@ func TestInferencePoolReconciler(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() _ = clientgoscheme.AddToScheme(scheme) - _ = v1alpha2.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) // Create a fake client with the pool and the pods. initialObjects := []client.Object{pool1, pool2} @@ -96,7 +96,7 @@ func TestInferencePoolReconciler(t *testing.T) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) datastore := datastore.NewDatastore(ctx, pmf) - inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore} + inferencePoolReconciler := &InferencePoolReconciler{Reader: fakeClient, Datastore: datastore} // Step 1: Inception, only ready pods matching pool1 are added to the store. if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 046561e4d..bf8cb44e2 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -26,22 +26,23 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" ) type PodReconciler struct { - client.Client + client.Reader Datastore datastore.Datastore Record record.EventRecorder } func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - pool, err := c.Datastore.PoolGet() - if err != nil { + if !c.Datastore.PoolHasSynced() { logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet") // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. 
return ctrl.Result{}, nil @@ -59,38 +60,46 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R return ctrl.Result{}, err } - c.updateDatastore(logger, pod, pool) + c.updateDatastore(logger, pod) return ctrl.Result{}, nil } func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { + filter := predicate.Funcs{ + CreateFunc: func(ce event.CreateEvent) bool { + pod := ce.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + UpdateFunc: func(ue event.UpdateEvent) bool { + oldPod := ue.ObjectOld.(*corev1.Pod) + newPod := ue.ObjectNew.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(oldPod.GetLabels()) || c.Datastore.PoolLabelsMatch(newPod.GetLabels()) + }, + DeleteFunc: func(de event.DeleteEvent) bool { + pod := de.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + GenericFunc: func(ge event.GenericEvent) bool { + pod := ge.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + } return ctrl.NewControllerManagedBy(mgr). For(&corev1.Pod{}). + WithEventFilter(filter). Complete(c) } -func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, pool *v1alpha2.InferencePool) { +func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} - if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { + if !podutil.IsPodReady(pod) || !c.Datastore.PoolLabelsMatch(pod.Labels) { logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) c.Datastore.PodDelete(namespacedName) } else { - if c.Datastore.PodUpdateOrAddIfNotExist(pod, pool) { + if c.Datastore.PodUpdateOrAddIfNotExist(pod) { logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) } else { logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) } } } - -func podIsReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - if condition.Status == corev1.ConditionTrue { - return true - } - break - } - } - return false -} diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index e4cb0b62d..bbb2a8318 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -182,12 +182,12 @@ func TestPodReconciler(t *testing.T) { // Configure the initial state of the datastore. 
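The removed `podIsReady` helper is consolidated into the shared `podutil` package imported above. A sketch of what `IsPodReady` presumably looks like; the `DeletionTimestamp` guard is an inference from the caller dropping its own check in this diff, not confirmed source:

```go
package pod

import corev1 "k8s.io/api/core/v1"

// IsPodReady reports whether a pod should be tracked by the datastore:
// not terminating (assumed) and carrying a true PodReady condition.
func IsPodReady(pod *corev1.Pod) bool {
	if !pod.DeletionTimestamp.IsZero() {
		return false
	}
	for _, condition := range pod.Status.Conditions {
		if condition.Type == corev1.PodReady {
			return condition.Status == corev1.ConditionTrue
		}
	}
	return false
}
```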
store := datastore.NewDatastore(t.Context(), pmf) - store.PoolSet(test.pool) + _ = store.PoolSet(t.Context(), fakeClient, test.pool) for _, pod := range test.existingPods { - store.PodUpdateOrAddIfNotExist(pod, pool) + store.PodUpdateOrAddIfNotExist(pod) } - podReconciler := &PodReconciler{Client: fakeClient, Datastore: store} + podReconciler := &PodReconciler{Reader: fakeClient, Datastore: store} if test.req == nil { namespacedName := types.NamespacedName{Name: test.incomingPod.Name, Namespace: test.incomingPod.Namespace} test.req = &ctrl.Request{NamespacedName: namespacedName} diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 8ada3e64d..524355413 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "reflect" "sync" corev1 "k8s.io/api/core/v1" @@ -30,6 +31,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" ) const ( @@ -43,7 +45,10 @@ var ( // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { // InferencePool operations - PoolSet(pool *v1alpha2.InferencePool) + // PoolSet sets the given pool in datastore. If the given pool has different label selector than the previous pool + // that was stored, the function triggers a resync of the pods to keep the datastore updated. If the given pool + // is nil, this call triggers the datastore.Clear() function. + PoolSet(ctx context.Context, reader client.Reader, pool *v1alpha2.InferencePool) error PoolGet() (*v1alpha2.InferencePool, error) PoolHasSynced() bool PoolLabelsMatch(podLabels map[string]string) bool @@ -52,23 +57,22 @@ type Datastore interface { ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool ModelGet(modelName string) *v1alpha2.InferenceModel ModelDelete(namespacedName types.NamespacedName) *v1alpha2.InferenceModel - ModelResync(ctx context.Context, ctrlClient client.Client, modelName string) (bool, error) + ModelResync(ctx context.Context, reader client.Reader, modelName string) (bool, error) ModelGetAll() []*v1alpha2.InferenceModel // PodMetrics operations // PodGetAll returns all pods and metrics, including fresh and stale. PodGetAll() []backendmetrics.PodMetrics // PodList lists pods matching the given predicate. - PodList(func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics - PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool + PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics + PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool PodDelete(namespacedName types.NamespacedName) - PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) // Clears the store state, happens when the pool gets deleted. 
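The expanded `PoolSet` contract documented in the interface above reads as follows from the caller's side; this wrapper is purely illustrative and not part of the change:

```go
package controller

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
)

// syncPool illustrates the PoolSet contract: a nil pool (e.g., after the
// InferencePool was deleted) clears the store; otherwise the pool is stored
// and, when its selector changed, pods are relisted through the reader.
func syncPool(ctx context.Context, ds datastore.Datastore, reader client.Reader, pool *v1alpha2.InferencePool) error {
	return ds.PoolSet(ctx, reader, pool)
}
```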
Clear() } -func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) *datastore { +func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) Datastore { store := &datastore{ parentCtx: parentCtx, poolAndModelsMu: sync.RWMutex{}, @@ -97,14 +101,40 @@ func (ds *datastore) Clear() { defer ds.poolAndModelsMu.Unlock() ds.pool = nil ds.models = make(map[string]*v1alpha2.InferenceModel) + // stop all pod goroutines before clearing the pods map. + ds.pods.Range(func(_, v any) bool { + v.(backendmetrics.PodMetrics).StopRefreshLoop() + return true + }) ds.pods.Clear() } // /// InferencePool APIs /// -func (ds *datastore) PoolSet(pool *v1alpha2.InferencePool) { +func (ds *datastore) PoolSet(ctx context.Context, reader client.Reader, pool *v1alpha2.InferencePool) error { + if pool == nil { + ds.Clear() + return nil + } + logger := log.FromContext(ctx) ds.poolAndModelsMu.Lock() defer ds.poolAndModelsMu.Unlock() + + oldPool := ds.pool ds.pool = pool + if oldPool == nil || !reflect.DeepEqual(pool.Spec.Selector, oldPool.Spec.Selector) { + logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", pool.Spec.Selector) + // A full resync is required to address two cases: + // 1) At startup, the pod events may get processed before the pool is synced with the datastore, + // and hence they will not be added to the store since pool selector is not known yet + // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need + // to resync the whole pool: remove pods in the store that don't match the new selector and add + // the ones that may have existed already to the store. + if err := ds.podResyncAll(ctx, reader); err != nil { + return fmt.Errorf("failed to update pods according to the pool selector - %w", err) + } + } + + return nil } func (ds *datastore) PoolGet() (*v1alpha2.InferencePool, error) { @@ -125,6 +155,9 @@ func (ds *datastore) PoolHasSynced() bool { func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool { ds.poolAndModelsMu.RLock() defer ds.poolAndModelsMu.RUnlock() + if ds.pool == nil { + return false + } poolSelector := selectorFromInferencePoolSelector(ds.pool.Spec.Selector) podSet := labels.Set(podLabels) return poolSelector.Matches(podSet) @@ -149,12 +182,12 @@ func (ds *datastore) ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool { return true } -func (ds *datastore) ModelResync(ctx context.Context, c client.Client, modelName string) (bool, error) { +func (ds *datastore) ModelResync(ctx context.Context, reader client.Reader, modelName string) (bool, error) { ds.poolAndModelsMu.Lock() defer ds.poolAndModelsMu.Unlock() var models v1alpha2.InferenceModelList - if err := c.List(ctx, &models, client.MatchingFields{ModelNameIndexKey: modelName}, client.InNamespace(ds.pool.Namespace)); err != nil { + if err := reader.List(ctx, &models, client.MatchingFields{ModelNameIndexKey: modelName}, client.InNamespace(ds.pool.Namespace)); err != nil { return false, fmt.Errorf("listing models that match the modelName %s: %w", modelName, err) } if len(models.Items) == 0 { @@ -217,18 +250,19 @@ func (ds *datastore) PodGetAll() []backendmetrics.PodMetrics { func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics { res := []backendmetrics.PodMetrics{} - fn := func(k, v any) bool { + + ds.pods.Range(func(k, v any) bool { pm := v.(backendmetrics.PodMetrics) if predicate(pm) { res = append(res, pm) } return true - } - ds.pods.Range(fn) +
}) + return res } -func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool { +func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { namespacedName := types.NamespacedName{ Name: pod.Name, Namespace: pod.Namespace, @@ -246,48 +280,49 @@ func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.In return ok } -func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) { +func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { + v, ok := ds.pods.LoadAndDelete(namespacedName) + if ok { + pmr := v.(backendmetrics.PodMetrics) + pmr.StopRefreshLoop() + } +} + +func (ds *datastore) podResyncAll(ctx context.Context, reader client.Reader) error { logger := log.FromContext(ctx) podList := &corev1.PodList{} - if err := ctrlClient.List(ctx, podList, &client.ListOptions{ - LabelSelector: selectorFromInferencePoolSelector(pool.Spec.Selector), - Namespace: pool.Namespace, + if err := reader.List(ctx, podList, &client.ListOptions{ + LabelSelector: selectorFromInferencePoolSelector(ds.pool.Spec.Selector), + Namespace: ds.pool.Namespace, }); err != nil { - log.FromContext(ctx).V(logutil.DEFAULT).Error(err, "Failed to list clients") - return + return fmt.Errorf("failed to list pods - %w", err) } activePods := make(map[string]bool) for _, pod := range podList.Items { - if podIsReady(&pod) { - namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} - activePods[pod.Name] = true - if ds.PodUpdateOrAddIfNotExist(&pod, pool) { - logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) - } else { - logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) - } + if !podutil.IsPodReady(&pod) { + continue + } + namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} + activePods[pod.Name] = true + if ds.PodUpdateOrAddIfNotExist(&pod) { + logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) + } else { + logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) } } // Remove pods that don't belong to the pool or not ready any more. 
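The predicate form of `PodList` above lets callers filter without copying the pod map. A hypothetical example (the freshness window and helper are illustrative; `MetricsState.UpdateTime` is assumed to be a `time.Time` based on its use in the tests below):

```go
package datastore

import (
	"time"

	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
)

// freshPods returns only pods whose metrics were refreshed recently.
func freshPods(ds Datastore) []backendmetrics.PodMetrics {
	return ds.PodList(func(pm backendmetrics.PodMetrics) bool {
		return time.Since(pm.GetMetrics().UpdateTime) < 5*time.Second
	})
}
```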
- deleteFn := func(k, v any) bool { + ds.pods.Range(func(k, v any) bool { pm := v.(backendmetrics.PodMetrics) if exist := activePods[pm.GetPod().NamespacedName.Name]; !exist { logger.V(logutil.VERBOSE).Info("Removing pod", "pod", pm.GetPod()) ds.PodDelete(pm.GetPod().NamespacedName) } return true - } - ds.pods.Range(deleteFn) -} + }) -func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { - v, ok := ds.pods.LoadAndDelete(namespacedName) - if ok { - pmr := v.(backendmetrics.PodMetrics) - pmr.StopRefreshLoop() - } + return nil } func selectorFromInferencePoolSelector(selector map[v1alpha2.LabelKey]v1alpha2.LabelValue) labels.Selector { @@ -301,23 +336,3 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelV } return outMap } - -func IsCritical(model *v1alpha2.InferenceModel) bool { - if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical { - return true - } - return false -} - -// TODO: move out to share with pod_reconciler.go -func podIsReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - if condition.Status == corev1.ConditionTrue { - return true - } - break - } - } - return false -} diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 22bb03654..cf1f610cb 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -27,7 +27,10 @@ import ( "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" @@ -71,9 +74,15 @@ func TestPool(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) datastore := NewDatastore(context.Background(), pmf) - datastore.PoolSet(tt.inferencePool) + _ = datastore.PoolSet(context.Background(), fakeClient, tt.inferencePool) gotPool, gotErr := datastore.PoolGet() if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { t.Errorf("Unexpected error diff (+got/-want): %s", diff) @@ -228,7 +237,7 @@ var ( Name: "pod1", }, } - pod1Metrics = &backendmetrics.Metrics{ + pod1Metrics = &backendmetrics.MetricsState{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -236,13 +245,14 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } pod2 = &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod2", }, } - pod2Metrics = &backendmetrics.Metrics{ + pod2Metrics = &backendmetrics.MetricsState{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -250,6 +260,7 @@ var ( "foo1": 1, "bar1": 1, }, + WaitingModels: map[string]int{}, } pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace} @@ -265,29 +276,29 @@ func TestMetrics(t *testing.T) { name string pmc backendmetrics.PodMetricsClient storePods []*corev1.Pod - want []*backendmetrics.Metrics + want []*backendmetrics.MetricsState }{ { name: "Probing metrics success", pmc: &backendmetrics.FakePodMetricsClient{ - Res: map[types.NamespacedName]*backendmetrics.Metrics{ + Res: map[types.NamespacedName]*backendmetrics.MetricsState{ pod1NamespacedName: pod1Metrics, pod2NamespacedName: pod2Metrics, }, }, storePods: []*corev1.Pod{pod1, pod2}, - want: []*backendmetrics.Metrics{pod1Metrics, pod2Metrics}, + want: []*backendmetrics.MetricsState{pod1Metrics, pod2Metrics}, }, { name: "Only pods in are probed", pmc: &backendmetrics.FakePodMetricsClient{ - Res: map[types.NamespacedName]*backendmetrics.Metrics{ + Res: map[types.NamespacedName]*backendmetrics.MetricsState{ pod1NamespacedName: pod1Metrics, pod2NamespacedName: pod2Metrics, }, }, storePods: []*corev1.Pod{pod1}, - want: []*backendmetrics.Metrics{pod1Metrics}, + want: []*backendmetrics.MetricsState{pod1Metrics}, }, { name: "Probing metrics error", @@ -295,16 +306,17 @@ func TestMetrics(t *testing.T) { Err: map[types.NamespacedName]error{ pod2NamespacedName: errors.New("injected error"), }, - Res: map[types.NamespacedName]*backendmetrics.Metrics{ + Res: map[types.NamespacedName]*backendmetrics.MetricsState{ pod1NamespacedName: pod1Metrics, }, }, storePods: []*corev1.Pod{pod1, pod2}, - want: []*backendmetrics.Metrics{ + want: []*backendmetrics.MetricsState{ pod1Metrics, // Failed to fetch pod2 metrics so it remains the default values. { ActiveModels: map[string]int{}, + WaitingModels: map[string]int{}, WaitingQueueSize: 0, KVCacheUsagePercent: 0, MaxActiveModels: 0, @@ -317,19 +329,25 @@ func TestMetrics(t *testing.T) { t.Run(test.name, func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). 
+ Build() pmf := backendmetrics.NewPodMetricsFactory(test.pmc, time.Millisecond) ds := NewDatastore(ctx, pmf) - ds.PoolSet(inferencePool) + _ = ds.PoolSet(ctx, fakeClient, inferencePool) for _, pod := range test.storePods { - ds.PodUpdateOrAddIfNotExist(pod, inferencePool) + ds.PodUpdateOrAddIfNotExist(pod) } assert.EventuallyWithT(t, func(t *assert.CollectT) { got := ds.PodGetAll() - metrics := []*backendmetrics.Metrics{} + metrics := []*backendmetrics.MetricsState{} for _, one := range got { metrics = append(metrics, one.GetMetrics()) } - diff := cmp.Diff(test.want, metrics, cmpopts.IgnoreFields(backendmetrics.Metrics{}, "UpdateTime"), cmpopts.SortSlices(func(a, b *backendmetrics.Metrics) bool { + diff := cmp.Diff(test.want, metrics, cmpopts.IgnoreFields(backendmetrics.MetricsState{}, "UpdateTime"), cmpopts.SortSlices(func(a, b *backendmetrics.MetricsState) bool { return a.String() < b.String() })) assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") @@ -337,3 +355,94 @@ func TestMetrics(t *testing.T) { }) } } + +func TestPods(t *testing.T) { + updatedPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + Spec: corev1.PodSpec{ + NodeName: "node-1", + }, + } + tests := []struct { + name string + op func(ctx context.Context, ds Datastore) + existingPods []*corev1.Pod + wantPods []*corev1.Pod + }{ + { + name: "Add new pod, no existing pods, should add", + existingPods: []*corev1.Pod{}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(pod1) + }, + }, + { + name: "Add new pod, with existing pods, should add", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1, pod2}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(pod2) + }, + }, + { + name: "Update existing pod, new field, should update", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{updatedPod}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(updatedPod) + }, + }, + { + name: "Update existing pod, no new fields, should not update", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + incoming := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + }, + } + ds.PodUpdateOrAddIfNotExist(incoming) + }, + }, + { + name: "Delete the pod", + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodDelete(pod2NamespacedName) + }, + }, + { + name: "Delete the pod that doesn't exist", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodDelete(pod2NamespacedName) + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx := context.Background() + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := NewDatastore(t.Context(), pmf) + for _, pod := range test.existingPods { + ds.PodUpdateOrAddIfNotExist(pod) + } + + test.op(ctx, ds) + var gotPods []*corev1.Pod + for _, pm := range ds.PodGetAll() { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}} + gotPods = append(gotPods, pod) + } + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) { + t.Logf("got (%v) != want (%v);", gotPods, 
test.wantPods) + } + }) + } +}
diff --git a/pkg/epp/flowcontrol/README.md b/pkg/epp/flowcontrol/README.md new file mode 100644 index 000000000..5db86f446 --- /dev/null +++ b/pkg/epp/flowcontrol/README.md @@ -0,0 +1,116 @@ +# Flow Control Module + +## Introduction + +In a multi-tenant, heterogeneous inference serving environment, managing diverse SLOs and fairness requirements is +critical. Today, the serving stack often relies on a simple "best-effort" or FIFO (First-In, First-Out) basis for +handling requests. This is insufficient and leads to significant problems: + +* **Head-of-Line Blocking**: A long-running, low-priority request can block short, high-priority requests, violating + SLOs. +* **Lack of Predictability**: Without proper queuing and prioritization, it's impossible to provide predictable latency + guarantees to different tenants. +* **Inability to Handle Saturation**: Under heavy load, the system has no graceful way to manage overload, leading to + cascading failures instead of controlled degradation. + +The Flow Controller is a sophisticated library designed to solve these problems. It acts as a crucial gatekeeper that +decides *if* and *when* a request should proceed to be scheduled. Its primary mission is to enable predictable, fair, +and efficient utilization of shared backend resources by enforcing prioritization, applying fairness policies, managing +request queuing under saturation, and orchestrating displacement (the eviction of lower-priority queued items to make +space for higher-priority ones). + +It is designed for extensibility, allowing custom logic for policies and queuing mechanisms to be plugged into a robust, +high-performance orchestration engine. + +### Role in the Gateway API Inference Extension + +Within the Gateway API Inference Extension's Endpoint Picker (EPP), the Flow Controller acts as a crucial gatekeeper +between the Routing and Scheduling layers. It decides *if* and *when* a request, already assigned to a logical flow +(e.g., a specific workload or tenant), should proceed to be scheduled onto a backend resource. It is the primary +mechanism for managing diverse SLOs, ensuring fairness among competing workloads, and maintaining system stability under +high load. + +### High Level Architecture + +The following diagram illustrates the high-level dependency model and request flow for the system. It shows how +concurrent client requests are managed by the central `FlowController`, which in turn relies on a set of decoupled +components to make its decisions. Each component package in this module will contain its own more detailed architectural +diagrams. + +```mermaid +graph LR + %% Style Definitions + classDef default fill:#fff,stroke:#333,stroke-width:1.5px,color:#000; + classDef client fill:#dcfce7,stroke:#333; + classDef system_entry fill:#fef9c3,stroke:#333; + classDef downstream_ok fill:#dbeafe,stroke:#333; + classDef downstream_err fill:#fee2e2,stroke:#333; + + %% Client Goroutines (Fan-In) + subgraph Client Goroutines + direction TB + R1(Goroutine 1); + R2(Goroutine N); + end + + %% Flow Control System + subgraph Flow Control System + C{Flow Controller Engine}; + + subgraph Internal Interactions + direction LR + D(Ports) -- "abstracts state" --> E(Flow Registry); + D -- "abstracts load" --> SD(Saturation Detector); + E -- "configures" --> F(Framework); + F -- "defines" --> P(Plugins: Queues & Policies); + end + + C -- "Orchestrates via<br>abstractions" --> D; + end + + %% Downstream Actions (Fan-Out) + subgraph Downstream Actions + direction TB + A1(Outcome: Dispatched<br>Proceed to Scheduler); + A2(Outcome: Rejected<br>Return Error); + end + + %% Connections + R1 -- "calls & blocks" --> C; + R2 -- "calls & blocks" --> C; + C -- "unblocks 'goroutine 1'" --> A1; + C -- "unblocks 'goroutine N'" --> A2; + + %% Apply Classes + class R1,R2 client; + class C system_entry; + class A1 downstream_ok; + class A2 downstream_err; + class D,E,F,P,SD default; +``` + +## Architectural Pillars + +The Flow Controller framework is built on several key components that work in concert. This architecture is designed to +be highly modular and scalable, with clear separation of concerns. For a deep dive into the specific design choices and +their justifications, please refer to the detailed documentation within the relevant sub-packages. + +1. **The `FlowController` Engine (`./controller`)**: The central, sharded orchestrator responsible for the main request + processing loop. It manages a pool of workers that distribute incoming requests, apply policies, and dispatch + requests to the backends. Its design focuses on high throughput and backpressure. + +2. **Pluggable `Policy` Framework (`./framework`)**: This defines the core interfaces for all pluggable logic. It + features a two-tier policy system for `InterFlow` (decisions *between* different flows) and `IntraFlow` + (decisions *within* a single flow) logic, covering both request dispatch and displacement. + +3. **Extensible `SafeQueue` System (`./framework`)**: This defines the `framework.SafeQueue` interface for + concurrent-safe request storage. It uses a `QueueCapability` system that allows for diverse and extensible queue + implementations (e.g., FIFO, Priority Heap) while maintaining a stable interface. + +4. **The `FlowRegistry` (`./registry`, `./ports`)**: This is the stateful control plane of the system. It manages the + configuration and lifecycle of all flows, policies, and queues. It presents a sharded view of its state to the + `FlowController` workers to enable parallel operation with minimal lock contention. + +5. **Core Types and Service Ports (`./types`, `./ports`)**: These packages define the foundational data structures + (e.g., `FlowControlRequest`), errors, and service interfaces that decouple the engine from its dependencies, + following a "Ports and Adapters" architectural style.
diff --git a/pkg/epp/flowcontrol/types/README.md b/pkg/epp/flowcontrol/types/README.md new file mode 100644 index 000000000..a75e976fc --- /dev/null +++ b/pkg/epp/flowcontrol/types/README.md @@ -0,0 +1,33 @@ +# Flow Control Core Types + +This package defines the fundamental data structures, interfaces, and errors that form the vocabulary of the Flow +Control system. It establishes the core concepts of the request lifecycle and its final, reportable outcomes. + +## Request Lifecycle Interfaces + +A request's journey through the Flow Controller is represented by a series of interfaces that define its state as it +moves through the system: + +1. **`FlowControlRequest`**: The initial, "raw" contract for an incoming request. It carries the essential data + provided by the client, such as its `FlowID` and `ByteSize`. +2. **`QueueItemAccessor`**: The internal, enriched, and read-only view of a request once it has been accepted by the + controller. This interface is the primary means by which policy plugins inspect items. +3. **`QueueItemHandle`**: An opaque, queue-specific handle to a queued item. The controller uses this handle to perform + targeted operations, such as removing a specific item, without needing to know the queue's internal implementation + details.
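To make the lifecycle concrete, here is a minimal sketch of a type satisfying the `FlowControlRequest` contract just listed (the interface itself is defined in `types/request.go` later in this diff; the struct and its fields are invented for illustration):

```go
package types_test

import (
	"context"
	"time"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/flowcontrol/types"
)

// mockRequest is an illustrative implementation of types.FlowControlRequest;
// field names are assumptions, not part of the package.
type mockRequest struct {
	ctx    context.Context
	flowID string
	body   []byte
	id     string
}

var _ types.FlowControlRequest = (*mockRequest)(nil) // compile-time conformance check

func (r *mockRequest) Context() context.Context           { return r.ctx }
func (r *mockRequest) FlowID() string                     { return r.flowID }
func (r *mockRequest) ByteSize() uint64                   { return uint64(len(r.body)) }
func (r *mockRequest) InitialEffectiveTTL() time.Duration { return 0 } // no preference; use system default
func (r *mockRequest) ID() string                         { return r.id }
```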
+ +## Final State Reporting: Outcomes and Errors + +The final state of every request is reported using a combination of a `QueueOutcome` enum and a corresponding `error`. +This provides a clear, machine-inspectable way to understand the result. + +* **`QueueOutcome`**: A concise enum summarizing the final result (e.g., `QueueOutcomeDispatched`, + `QueueOutcomeRejectedCapacity`, `QueueOutcomeEvictedDisplaced`). This is ideal for metrics. + +* **Errors**: For any non-dispatch outcome, a specific sentinel error is returned. These are nested to provide detailed + context: + * `ErrRejected`: The parent error for any request rejected *before* being enqueued. + * `ErrEvicted`: The parent error for any request removed *after* being enqueued for reasons other than dispatch. + +Callers of `FlowController.EnqueueAndWait()` can first use `errors.Is()` to check for the general class of failure +(`ErrRejected` or `ErrEvicted`), and then unwrap the error to find the specific cause (e.g., `ErrQueueAtCapacity`). diff --git a/pkg/epp/flowcontrol/types/errors.go b/pkg/epp/flowcontrol/types/errors.go new file mode 100644 index 000000000..9e7882f94 --- /dev/null +++ b/pkg/epp/flowcontrol/types/errors.go @@ -0,0 +1,78 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + "errors" +) + +// --- High Level Queue Outcome Errors --- +var ( + // ErrRejected is a sentinel error indicating a request was rejected by the Flow Controller *before* being formally + // enqueued. Errors returned by `FlowController.EnqueueAndWait()` that signify pre-queue rejection will wrap this + // error. + // Callers should use `errors.Is(err, ErrRejected)` to check for this general class of failure. + ErrRejected = errors.New("request rejected pre-queue") + + // ErrEvicted is a sentinel error indicating a request was removed from a queue *after* being successfully enqueued, + // but for reasons other than successful dispatch (e.g., TTL expiry, displacement). + // Errors returned by `FlowController.EnqueueAndWait()` that signify post-queue eviction will wrap this error. + // Callers should use `errors.Is(err, ErrEvicted)` to check for this general class of failure. + ErrEvicted = errors.New("request evicted from queue") +) + +// --- Pre-Enqueue Rejection Errors --- +// Errors that can occur before a request is formally added to a `framework.SafeQueue`. +// When returned by `FlowController.EnqueueAndWait()`, these specific errors will typically be wrapped by `ErrRejected`. +var ( + // ErrNilRequest indicates that a nil `types.FlowControlRequest` was provided. + ErrNilRequest = errors.New("FlowControlRequest cannot be nil") + + // ErrFlowIDEmpty indicates that a flow ID was empty when one was required. + ErrFlowIDEmpty = errors.New("flow ID cannot be empty") + + // ErrQueueAtCapacity indicates that a request could not be enqueued because queue capacity limits were met and + // displacement (if applicable) failed to make space. 
+ ErrQueueAtCapacity = errors.New("queue at capacity and displacement failed to make space") +) + +// --- Post-Enqueue Eviction Errors --- +// Errors that occur when a request, already in a `framework.SafeQueue`, is removed for reasons other than dispatch. +// When returned by `FlowController.EnqueueAndWait()`, these specific errors will typically be wrapped by `ErrEvicted`. +var ( + // ErrTTLExpired indicates a request was evicted from a queue because its effective Time-To-Live expired. + ErrTTLExpired = errors.New("request TTL expired") + + // ErrContextCancelled indicates a request was evicted because its associated context (from + // `FlowControlRequest.Context()`) was cancelled. This error typically wraps the underlying `context.Canceled` or + // `context.DeadlineExceeded` error. + ErrContextCancelled = errors.New("request context cancelled") + + // ErrDisplaced indicates a request was evicted from a queue because it was chosen as a victim by a displacement + // policy to make space for another request. + ErrDisplaced = errors.New("request displaced") +) + +// --- General FlowController Errors --- +// General runtime errors for the Flow Controller. +var ( + // ErrFlowControllerShutdown indicates that an operation could not complete or an item was evicted because the Flow + // Controller is shutting down or has stopped. + // When returned by `FlowController.EnqueueAndWait()`, this will be wrapped by `ErrRejected` (if rejection happens + // before internal queuing) or `ErrEvicted` (if eviction happens after internal queuing). + ErrFlowControllerShutdown = errors.New("FlowController is shutting down") +) diff --git a/pkg/epp/flowcontrol/types/flow.go b/pkg/epp/flowcontrol/types/flow.go new file mode 100644 index 000000000..031a83799 --- /dev/null +++ b/pkg/epp/flowcontrol/types/flow.go @@ -0,0 +1,33 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package types defines the core data structures and service contracts for the Flow Controller system. It establishes +// the "vocabulary" of the system, including the request lifecycle interfaces, final outcomes, and standard error types. +package types + +// FlowSpecification defines the configuration of a logical flow, encapsulating its identity and registered priority. +// +// A FlowSpecification acts as the registration key for a flow within the Flow Registry. +type FlowSpecification interface { + // ID returns the unique name or identifier for this flow (e.g., model name, tenant ID), corresponding to the value + // from `FlowControlRequest.FlowID()`. + ID() string + + // Priority returns the numerical priority level currently associated with this flow within the Flow Registry. + // + // Convention: Lower numerical values indicate higher priority. 
+ Priority() uint +} diff --git a/pkg/epp/flowcontrol/types/outcomes.go b/pkg/epp/flowcontrol/types/outcomes.go new file mode 100644 index 000000000..7c52db934 --- /dev/null +++ b/pkg/epp/flowcontrol/types/outcomes.go @@ -0,0 +1,93 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import "strconv" + +// QueueOutcome represents the high-level final state of a request's lifecycle within the Flow Controller. +// +// It is returned by `FlowController.EnqueueAndWait()` along with a corresponding error. This enum is designed to be a +// low-cardinality label ideal for metrics, while the error provides fine-grained details for non-dispatched outcomes. +type QueueOutcome int + +const ( + // QueueOutcomeDispatched indicates the request was successfully processed by the Flow Controller and unblocked for + // the caller to proceed. + // The associated error from `FlowController.EnqueueAndWait()` will be nil. + QueueOutcomeDispatched QueueOutcome = iota + + // --- Pre-Enqueue Rejection Outcomes (request never entered a `framework.SafeQueue`) --- + // For these outcomes, the error from `FlowController.EnqueueAndWait()` will wrap `ErrRejected`. + + // QueueOutcomeRejectedCapacity indicates rejection because queue capacity limits were met and displacement (if + // applicable) failed to make space. + // The associated error will wrap `ErrQueueAtCapacity` (and `ErrRejected`). + QueueOutcomeRejectedCapacity + + // QueueOutcomeRejectedOther indicates rejection for reasons other than capacity before the request was formally + // enqueued. + // The specific underlying cause can be determined from the associated error (e.g., a nil request, an unregistered + // flow ID, or a controller shutdown), which will be wrapped by `ErrRejected`. + QueueOutcomeRejectedOther + + // --- Post-Enqueue Eviction Outcomes (request was in a `framework.SafeQueue` but not dispatched) --- + // For these outcomes, the error from `FlowController.EnqueueAndWait()` will wrap `ErrEvicted`. + + // QueueOutcomeEvictedTTL indicates eviction from a queue because the request's effective Time-To-Live expired. + // The associated error will wrap `ErrTTLExpired` (and `ErrEvicted`). + QueueOutcomeEvictedTTL + + // QueueOutcomeEvictedContextCancelled indicates eviction from a queue because the request's own context (from + // `FlowControlRequest.Context()`) was cancelled. + // The associated error will wrap `ErrContextCancelled` (which may further wrap the underlying `context.Canceled` or + // `context.DeadlineExceeded` error) (and `ErrEvicted`). + QueueOutcomeEvictedContextCancelled + + // QueueOutcomeEvictedDisplaced indicates eviction from a queue to make space for another request due to a + // displacement policy. + // The associated error will wrap `ErrDisplaced` (and `ErrEvicted`). + QueueOutcomeEvictedDisplaced + + // QueueOutcomeEvictedOther indicates eviction from a queue for reasons not covered by more specific eviction + // outcomes. 
+ // The specific underlying cause can be determined from the associated error (e.g., a controller shutdown while the + // item was queued), which will be wrapped by `ErrEvicted`. + QueueOutcomeEvictedOther +) + +// String returns a human-readable string representation of the QueueOutcome. +func (o QueueOutcome) String() string { + switch o { + case QueueOutcomeDispatched: + return "Dispatched" + case QueueOutcomeRejectedCapacity: + return "RejectedCapacity" + case QueueOutcomeRejectedOther: + return "RejectedOther" + case QueueOutcomeEvictedTTL: + return "EvictedTTL" + case QueueOutcomeEvictedContextCancelled: + return "EvictedContextCancelled" + case QueueOutcomeEvictedDisplaced: + return "EvictedDisplaced" + case QueueOutcomeEvictedOther: + return "EvictedOther" + default: + // Return the integer value for unknown outcomes to aid in debugging. + return "UnknownOutcome(" + strconv.Itoa(int(o)) + ")" + } +} diff --git a/pkg/epp/flowcontrol/types/request.go b/pkg/epp/flowcontrol/types/request.go new file mode 100644 index 000000000..bf6960507 --- /dev/null +++ b/pkg/epp/flowcontrol/types/request.go @@ -0,0 +1,121 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + "context" + "time" +) + +// FlowControlRequest is the contract for an incoming request submitted to the Flow Controller. It represents the "raw" +// user-provided data and context for a single unit of work. +// +// An object implementing this interface is the primary input to `FlowController.EnqueueAndWait()`. The controller then +// wraps this object with its own internal structures (which implement `QueueItemAccessor`) to manage the request's +// lifecycle without modifying the original. +type FlowControlRequest interface { + // Context returns the request's context. The Flow Controller uses this for monitoring cancellation (e.g., if the + // client disconnects or a request-scoped timeout occurs), which can lead to the request being evicted from a queue. + Context() context.Context + + // FlowID returns the unique identifier for the flow this request belongs to (e.g., model name, tenant ID). The + // Flow Controller uses this ID, in conjunction with the flow's registered priority, to look up the active + // `ports.ManagedQueue` from the Flow Registry's `ports.RegistryShard`. + FlowID() string + + // ByteSize returns the request's size in bytes (e.g., prompt size). This is used by the Flow Controller and for + // managing byte-based capacity limits and for Flow Registry statistics. + ByteSize() uint64 + + // InitialEffectiveTTL returns the suggested Time-To-Live for this request. + // This value is treated as a hint; the Flow Controller may override it based on its own configuration or policies. + // A zero value indicates the request has no specific TTL preference, and a system-wide default should be applied. + InitialEffectiveTTL() time.Duration + + // ID returns an optional, user-facing unique identifier for this specific request. 
It is intended for logging, + // tracing, and observability. The core flow control logic does not use this ID for dispatching decisions; it uses + // the internal, opaque `QueueItemHandle`. + ID() string +} + +// QueueItemHandle is an opaque handle to an item that has been successfully added to a `framework.SafeQueue`. It acts +// as a key, allowing the Flow Controller to perform targeted operations (like removal) on a specific item without +// needing to know the queue's internal structure. +// +// A handle is created by and bound to the specific `framework.SafeQueue` instance that stores the item. +type QueueItemHandle interface { + // Handle returns the underlying, queue-specific raw handle (e.g., *list.Element). + // This method is intended for internal use by the `framework.SafeQueue` implementation that created it. + // Callers outside the queue implementation should treat the returned value as opaque. + Handle() any + + // Invalidate marks this handle as no longer valid for future operations. + // This method MUST be called by the `framework.SafeQueue` implementation itself after the item associated with this + // handle has been removed. + // + // Conformance: Implementations of this method MUST be idempotent. + Invalidate() + + // IsInvalidated returns true if this handle has been marked as invalid (e.g., by a call to `Invalidate`). + // A `framework.SafeQueue` MUST reject any operation that attempts to use an invalidated handle, typically by + // returning `framework.ErrInvalidQueueItemHandle`. + IsInvalidated() bool +} + +// QueueItemAccessor provides the internal, enriched, read-only view of a request being managed within the Flow +// Controller's queues. It is the primary interface through which `framework.SafeQueue` implementations and policy +// plugins interact with request data and its associated flow control metadata. +// +// The Flow Controller creates an object that implements this interface by wrapping an incoming `FlowControlRequest`. +type QueueItemAccessor interface { + // EnqueueTime is the timestamp when the item was logically accepted by the Flow Controller for queuing (i.e., when + // `FlowController.EnqueueAndWait()` was called). + EnqueueTime() time.Time + + // ByteSize returns the byte size of the original request, cached from `FlowControlRequest.ByteSize()`. + ByteSize() uint64 + + // FlowID returns the unique identifier of the flow this item belongs to, cached from `FlowControlRequest.FlowID()`. + FlowID() string + + // EffectiveTTL is the actual Time-To-Live assigned to this item by the Flow Controller, taking into account the + // request's preference (`FlowControlRequest.InitialEffectiveTTL()`) and any Flow Controller or per-flow + // defaults/policies. + EffectiveTTL() time.Duration + + // RequestID is the user-facing ID from the original request (`FlowControlRequest.ID()`). + RequestID() string + + // OriginalRequest returns the underlying `FlowControlRequest` that this accessor provides a view of. + // This method serves as an escape hatch, allowing policies or components that are aware of specific + // `FlowControlRequest` implementations to perform type assertions and access richer, application-specific data. + OriginalRequest() FlowControlRequest + + // Handle returns the `QueueItemHandle` associated with this item once it has been successfully added to a + // `framework.SafeQueue`. It returns nil if the item is not yet in a queue. + Handle() QueueItemHandle + + // SetHandle associates a `QueueItemHandle` with this item.
+ // + // Conformance: This method MUST be called by a `framework.SafeQueue` implementation within its `Add` method, + // immediately after a new `QueueItemHandle` is created for the item. This ensures that the item always carries a + // valid handle while it is in a queue. This method is not intended for use outside of `framework.SafeQueue` + // implementations. + // + //go:doc + SetHandle(handle QueueItemHandle) +} diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index d7678fadf..ab93e023a 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -18,126 +18,117 @@ package handlers import ( "context" - "encoding/json" - "fmt" "strconv" + "time" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -// HandleRequestBody handles body of the request to the backend server, such as parsing the "model" -// parameter. -// Envoy sends the request body to ext proc before sending the request to the backend server. -func (s *Server) HandleRequestBody( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) (*extProcPb.ProcessingResponse, error) { - logger := log.FromContext(ctx) - loggerVerbose := logger.V(logutil.VERBOSE) - loggerVerbose.Info("Handling request body") - - // Unmarshal request body (must be JSON). - v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) - var rb map[string]interface{} - if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") - return nil, errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("error unmarshaling request body: %v", err)} - } - loggerVerbose.Info("Request body unmarshalled", "body", rb) - - // Resolve target models. - model, ok := rb["model"].(string) - if !ok { - return nil, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} - } - loggerVerbose.Info("Model requested", "model", model) - modelName := model - - // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. - // This might be a security risk in the future where adapters not registered in the InferenceModel - // are able to be requested by using their distinct name. - modelObj := s.datastore.ModelGet(model) - if modelObj == nil { - return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} - } - if len(modelObj.Spec.TargetModels) > 0 { - modelName = RandomWeightedDraw(logger, modelObj, 0) - if modelName == "" { - return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} +func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error { + reqCtx.RequestReceivedTimestamp = time.Now() + + // an EoS in the request headers means this request has no body or trailers. 
+ if req.RequestHeaders.EndOfStream { + // We will route this request to a random pod as this is assumed to just be a GET + // More context: https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/526 + // The above PR will address endpoint admission, but currently any request without a body will be + // routed to a random upstream pod. + pod := s.director.GetRandomPod() + if pod == nil { + return errutil.Error{Code: errutil.Internal, Msg: "no pods available in datastore"} } - } - llmReq := &scheduling.LLMRequest{ - Model: model, - ResolvedTargetModel: modelName, - Critical: datastore.IsCritical(modelObj), - } - loggerVerbose.Info("LLM request assembled", "request", llmReq) - - requestBody := v.RequestBody.Body - var err error - // Update target models in the body. - if llmReq.Model != llmReq.ResolvedTargetModel { - rb["model"] = llmReq.ResolvedTargetModel - requestBody, err = json.Marshal(rb) + pool, err := s.datastore.PoolGet() if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} + return err } - loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) + reqCtx.TargetEndpoint = pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + reqCtx.RequestSize = 0 + reqCtx.reqHeaderResp = s.generateRequestHeaderResponse(reqCtx) + return nil } - target, err := s.scheduler.Schedule(ctx, llmReq) - if err != nil { - return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} + for _, header := range req.RequestHeaders.Headers.Headers { + if header.RawValue != nil { + reqCtx.Request.Headers[header.Key] = string(header.RawValue) + } else { + reqCtx.Request.Headers[header.Key] = header.Value + } } - targetPod := target.GetPod() - - logger.V(logutil.DEFAULT).Info("Request handled", - "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) + return nil +} - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. - // Attach the port number - pool, err := s.datastore.PoolGet() - if err != nil { - return nil, err +func (s *StreamingServer) generateRequestBodyResponses(requestBodyBytes []byte) []*extProcPb.ProcessingResponse { + commonResponses := buildCommonResponses(requestBodyBytes, bodyByteLimit, true) + responses := []*extProcPb.ProcessingResponse{} + for _, commonResp := range commonResponses { + resp := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: commonResp, + }, + }, + } + responses = append(responses, resp) } - endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + return responses +} - reqCtx.Model = llmReq.Model - reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel - reqCtx.RequestSize = len(v.RequestBody.Body) - reqCtx.TargetPod = targetPod.NamespacedName.String() - reqCtx.TargetEndpoint = endpoint +func (s *StreamingServer) generateRequestHeaderResponse(reqCtx *RequestContext) *extProcPb.ProcessingResponse { + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. 
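`GetRandomPod`, used in the end-of-stream branch above, is not defined in this diff. A plausible sketch given the datastore API shown earlier; the `Director` receiver, its package, the import path, and the `backend.Pod` return type are all assumptions here:

```go
package requestcontrol

import (
	"math/rand"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
)

// Director is sketched only to carry the datastore dependency for the example.
type Director struct {
	datastore datastore.Datastore
}

// GetRandomPod picks a uniformly random tracked pod, or nil when none exist;
// body-less requests need no model-aware scheduling, so any endpoint will do.
func (d *Director) GetRandomPod() *backend.Pod {
	pods := d.datastore.PodGetAll()
	if len(pods) == 0 {
		return nil
	}
	return pods[rand.Intn(len(pods))].GetPod()
}
```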
+ return &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: s.generateHeaders(reqCtx), + }, + }, + }, + }, + DynamicMetadata: s.generateMetadata(reqCtx.TargetEndpoint), + } +} +func (s *StreamingServer) generateHeaders(reqCtx *RequestContext) []*configPb.HeaderValueOption { + // can likely refactor these two bespoke headers to be updated in PostDispatch, to centralize logic. headers := []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: s.destinationEndpointHintKey, - RawValue: []byte(endpoint), + RawValue: []byte(reqCtx.TargetEndpoint), }, }, + } + if reqCtx.RequestSize > 0 { // We need to update the content length header if the body is mutated, see Envoy doc: // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto - { + headers = append(headers, &configPb.HeaderValueOption{ Header: &configPb.HeaderValue{ Key: "Content-Length", - RawValue: []byte(strconv.Itoa(len(requestBody))), + RawValue: []byte(strconv.Itoa(reqCtx.RequestSize)), }, - }, + }) } - // Print headers for debugging - for _, header := range headers { - logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) + + // include all headers + for key, value := range reqCtx.Request.Headers { + headers = append(headers, &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: key, + RawValue: []byte(value), + }, + }) } + return headers +} +func (s *StreamingServer) generateMetadata(endpoint string) *structpb.Struct { targetEndpointValue := &structpb.Struct{ Fields: map[string]*structpb.Value{ s.destinationEndpointHintKey: { @@ -160,51 +151,5 @@ func (s *Server) HandleRequestBody( }, } } - - resp := &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: headers, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: requestBody, - }, - }, - }, - }, - }, - DynamicMetadata: dynamicMetadata, - } - return resp, nil -} - -func HandleRequestHeaders( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) *extProcPb.ProcessingResponse { - r := req.Request - h := r.(*extProcPb.ProcessingRequest_RequestHeaders) - log.FromContext(ctx).V(logutil.VERBOSE).Info("Handling request headers", "headers", h) - - resp := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - // Set `clear_route_cache = true` to force Envoy to recompute the target cluster - // based on the new "target-pod" header. - // See https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto#service-ext-proc-v3-commonresponse. 
- ClearRouteCache: true, - }, - }, - }, - } - - return resp + return dynamicMetadata } diff --git a/pkg/epp/handlers/response.go b/pkg/epp/handlers/response.go index 991b7d162..a776bd1d9 100644 --- a/pkg/epp/handlers/response.go +++ b/pkg/epp/handlers/response.go @@ -19,14 +19,12 @@ package handlers import ( "context" "encoding/json" - "fmt" "strings" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -35,185 +33,112 @@ const ( streamingEndMsg = "data: [DONE]" ) -// HandleResponseHeaders processes response headers from the backend model server. -func (s *Server) HandleResponseHeaders( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) (*extProcPb.ProcessingResponse, error) { - loggerVerbose := log.FromContext(ctx).V(logutil.VERBOSE) - loggerVerbose.Info("Processing ResponseHeaders") - h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders) - loggerVerbose.Info("Headers before", "headers", h) - - // Example header - // { - // "ResponseHeaders": { - // "headers": [ - // { - // "key": ":status", - // "raw_value": "200" - // }, - // { - // "key": "date", - // "raw_value": "Thu, 30 Jan 2025 18:50:48 GMT" - // }, - // { - // "key": "server", - // "raw_value": "uvicorn" - // }, - // { - // "key": "content-type", - // "raw_value": "text/event-stream; charset=utf-8" - // }, - // { - // "key": "transfer-encoding", - // "raw_value": "chunked" - // } - // ] - // } - // } - for _, header := range h.ResponseHeaders.Headers.GetHeaders() { - var statusFound, typeFound bool - if header.Key == "status" { - code := header.RawValue[0] - if string(code) != "200" { - reqCtx.ResponseStatusCode = errutil.ModelServerError - statusFound = true - } - } - if header.Key == "content-type" { - contentType := header.RawValue - if strings.Contains(string(contentType), "text/event-stream") { - reqCtx.modelServerStreaming = true - } - typeFound = true +// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling. +func (s *StreamingServer) HandleResponseBody(ctx context.Context, reqCtx *RequestContext, response map[string]any) (*RequestContext, error) { + logger := log.FromContext(ctx) + responseBytes, err := json.Marshal(response) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") + return reqCtx, err + } + if response["usage"] != nil { + usg := response["usage"].(map[string]any) + usage := Usage{ + PromptTokens: int(usg["prompt_tokens"].(float64)), + CompletionTokens: int(usg["completion_tokens"].(float64)), + TotalTokens: int(usg["total_tokens"].(float64)), } + reqCtx.Usage = usage + logger.V(logutil.VERBOSE).Info("Response generated", "usage", reqCtx.Usage) + } + reqCtx.ResponseSize = len(responseBytes) + // ResponseComplete is to indicate the response is complete. In non-streaming + // case, it will be set to be true once the response is processed; in + // streaming case, it will be set to be true once the last chunk is processed. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) + // will add the processing for streaming case. 
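The inline type assertions in `HandleResponseBody` above (`usg["prompt_tokens"].(float64)` and friends) assume a well-formed OpenAI-style `usage` object and would panic on anything else. A defensive variant, purely illustrative and not what this diff implements:

```go
package handlers

// parseUsage tolerates a missing or malformed usage object instead of
// panicking; JSON numbers decode as float64, hence the assertions.
func parseUsage(response map[string]any) (Usage, bool) {
	usg, ok := response["usage"].(map[string]any)
	if !ok {
		return Usage{}, false
	}
	asInt := func(key string) int {
		f, _ := usg[key].(float64)
		return int(f)
	}
	return Usage{
		PromptTokens:     asInt("prompt_tokens"),
		CompletionTokens: asInt("completion_tokens"),
		TotalTokens:      asInt("total_tokens"),
	}, true
}
```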
+ reqCtx.ResponseComplete = true + + reqCtx.respBodyResp = generateResponseBodyResponses(responseBytes, true) + return reqCtx, nil +} - if statusFound && typeFound { - break +// The function is to handle streaming response if the modelServer is streaming. +func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context, reqCtx *RequestContext, responseText string) { + if strings.Contains(responseText, streamingEndMsg) { + resp := parseRespForUsage(ctx, responseText) + reqCtx.Usage = resp.Usage + metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.PromptTokens) + metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.CompletionTokens) + } +} + +func (s *StreamingServer) HandleResponseHeaders(ctx context.Context, reqCtx *RequestContext, resp *extProcPb.ProcessingRequest_ResponseHeaders) (*RequestContext, error) { + for _, header := range resp.ResponseHeaders.Headers.Headers { + if header.RawValue != nil { + reqCtx.Response.Headers[header.Key] = string(header.RawValue) + } else { + reqCtx.Response.Headers[header.Key] = header.Value } } - resp := &extProcPb.ProcessingResponse{ + reqCtx, err := s.director.HandleResponse(ctx, reqCtx) + + return reqCtx, err +} + +func (s *StreamingServer) generateResponseHeaderResponse(reqCtx *RequestContext) *extProcPb.ProcessingResponse { + return &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseHeaders{ ResponseHeaders: &extProcPb.HeadersResponse{ Response: &extProcPb.CommonResponse{ HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - // This is for debugging purpose only. - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, - }, + SetHeaders: s.generateResponseHeaders(reqCtx), }, }, }, }, } - return resp, nil } -// HandleResponseBody parses response body to update information such as number of completion tokens. -// NOTE: The current implementation only supports Buffered mode, which is not enabled by default. To -// use it, you need to configure EnvoyExtensionPolicy to have response body in Buffered mode. -// https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto#envoy-v3-api-msg-extensions-filters-http-ext-proc-v3-processingmode -// Example response -/* -{ - "id": "cmpl-573498d260f2423f9e42817bbba3743a", - "object": "text_completion", - "created": 1732563765, - "model": "meta-llama/Llama-3.1-8B-Instruct", - "choices": [ - { - "index": 0, - "text": " Chronicle\nThe San Francisco Chronicle has a new book review section, and it's a good one. The reviews are short, but they're well-written and well-informed. The Chronicle's book review section is a good place to start if you're looking for a good book review.\nThe Chronicle's book review section is a good place to start if you're looking for a good book review. 
The Chronicle's book review section", - "logprobs": null, - "finish_reason": "length", - "stop_reason": null, - "prompt_logprobs": null - } - ], - "usage": { - "prompt_tokens": 11, - "total_tokens": 111, - "completion_tokens": 100 - } -}*/ -func (s *Server) HandleResponseBody( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) (*extProcPb.ProcessingResponse, error) { - logger := log.FromContext(ctx) - loggerVerbose := logger.V(logutil.VERBOSE) - body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody) - - if reqCtx.modelServerStreaming { - logger.V(logutil.DEBUG).Info("Processing HandleResponseBody") - if err := s.HandleStreaming(ctx, reqCtx, body, loggerVerbose); err != nil { - return nil, err - } - } else { - loggerVerbose.Info("Processing HandleResponseBody") - if err := s.HandleNonStreaming(ctx, reqCtx, body, loggerVerbose); err != nil { - return nil, err +func generateResponseBodyResponses(responseBodyBytes []byte, setEoS bool) []*extProcPb.ProcessingResponse { + commonResponses := buildCommonResponses(responseBodyBytes, bodyByteLimit, setEoS) + responses := []*extProcPb.ProcessingResponse{} + for _, commonResp := range commonResponses { + resp := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: commonResp, + }, + }, } + responses = append(responses, resp) } + return responses +} - resp := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{}, +func (s *StreamingServer) generateResponseHeaders(reqCtx *RequestContext) []*configPb.HeaderValueOption { + // can likely refactor these two bespoke headers to be updated in PostDispatch, to centralize logic. + headers := []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + // This is for debugging purpose only. 
+ Key: "x-went-into-resp-headers", + RawValue: []byte("true"), }, }, } - return resp, nil -} - -func (s *Server) HandleNonStreaming( - ctx context.Context, - reqCtx *RequestContext, - body *extProcPb.ProcessingRequest_ResponseBody, - loggerVerbose logr.Logger, -) error { - loggerVerbose.Info("Processing HandleResponseBody") - res := Response{} - if err := json.Unmarshal(body.ResponseBody.Body, &res); err != nil { - return errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("unmarshaling response body: %v", err)} - } - reqCtx.Usage = res.Usage - reqCtx.ResponseSize = len(body.ResponseBody.Body) - reqCtx.ResponseComplete = true - loggerVerbose.Info("Response generated", "response", res) - return nil -} - -func (s *Server) HandleStreaming( - ctx context.Context, - reqCtx *RequestContext, - body *extProcPb.ProcessingRequest_ResponseBody, - loggerVerbose logr.Logger, -) error { - responseText := string(body.ResponseBody.Body) - if strings.Contains(responseText, streamingEndMsg) { - parsedResp := ParseRespForUsage(ctx, responseText) - reqCtx.Usage = parsedResp.Usage - } - - if body.ResponseBody.EndOfStream { - loggerVerbose.Info("Streaming is completed") - reqCtx.ResponseComplete = true - } else { - reqCtx.ResponseSize += len(body.ResponseBody.Body) + // include all headers + for key, value := range reqCtx.Response.Headers { + headers = append(headers, &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: key, + RawValue: []byte(value), + }, + }) } - - return nil + return headers } // Example message if "stream_options": {"include_usage": "true"} is included in the request: @@ -227,11 +152,9 @@ func (s *Server) HandleStreaming( // // If include_usage is not included in the request, `data: [DONE]` is returned separately, which // indicates end of streaming. 
-func ParseRespForUsage( - ctx context.Context, - responseText string, -) Response { - response := Response{} +func parseRespForUsage(ctx context.Context, responseText string) ResponseBody { + response := ResponseBody{} + logger := log.FromContext(ctx) lines := strings.Split(responseText, "\n") for _, line := range lines { @@ -245,8 +168,7 @@ func ParseRespForUsage( byteSlice := []byte(content) if err := json.Unmarshal(byteSlice, &response); err != nil { - logger := log.FromContext(ctx) - logger.V(logutil.DEFAULT).Error(err, "unmarshaling response body") + logger.Error(err, "unmarshaling response body") continue } } @@ -254,7 +176,7 @@ func ParseRespForUsage( return response } -type Response struct { +type ResponseBody struct { Usage Usage `json:"usage"` } diff --git a/pkg/epp/handlers/response_test.go b/pkg/epp/handlers/response_test.go index 074b45c94..b79f4ee46 100644 --- a/pkg/epp/handlers/response_test.go +++ b/pkg/epp/handlers/response_test.go @@ -18,9 +18,9 @@ package handlers import ( "context" + "encoding/json" "testing" - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/google/go-cmp/cmp" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -63,40 +63,61 @@ func TestHandleResponseBody(t *testing.T) { tests := []struct { name string - req *extProcPb.ProcessingRequest_ResponseBody + body []byte reqCtx *RequestContext want Usage wantErr bool }{ { name: "success", - req: &extProcPb.ProcessingRequest_ResponseBody{ - ResponseBody: &extProcPb.HttpBody{ - Body: []byte(body), - }, - }, + body: []byte(body), want: Usage{ PromptTokens: 11, TotalTokens: 111, CompletionTokens: 100, }, }, - { - name: "malformed response", - req: &extProcPb.ProcessingRequest_ResponseBody{ - ResponseBody: &extProcPb.HttpBody{ - Body: []byte("malformed json"), - }, - }, - wantErr: true, - }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + server := &StreamingServer{} + reqCtx := test.reqCtx + if reqCtx == nil { + reqCtx = &RequestContext{} + } + var responseMap map[string]any + marshalErr := json.Unmarshal(test.body, &responseMap) + if marshalErr != nil { + t.Error(marshalErr, "Error unmarshaling request body") + } + _, err := server.HandleResponseBody(ctx, reqCtx, responseMap) + if err != nil { + if !test.wantErr { + t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr) + } + return + } + + if diff := cmp.Diff(test.want, reqCtx.Usage); diff != "" { + t.Errorf("HandleResponseBody returned unexpected response, diff(-want, +got): %v", diff) + } + }) + } +} + +func TestHandleStreamedResponseBody(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + tests := []struct { + name string + body string + reqCtx *RequestContext + want Usage + wantErr bool + }{ { name: "streaming request without usage", - req: &extProcPb.ProcessingRequest_ResponseBody{ - ResponseBody: &extProcPb.HttpBody{ - Body: []byte(streamingBodyWithoutUsage), - }, - }, + body: streamingBodyWithoutUsage, reqCtx: &RequestContext{ modelServerStreaming: true, }, @@ -105,11 +126,7 @@ func TestHandleResponseBody(t *testing.T) { }, { name: "streaming request with usage", - req: &extProcPb.ProcessingRequest_ResponseBody{ - ResponseBody: &extProcPb.HttpBody{ - Body: []byte(streamingBodyWithUsage), - }, - }, + body: streamingBodyWithUsage, reqCtx: &RequestContext{ modelServerStreaming: true, }, @@ -124,18 +141,12 @@ func TestHandleResponseBody(t *testing.T) { for _, test := range tests { t.Run(test.name, 
func(t *testing.T) {
-			server := &Server{}
+			server := &StreamingServer{}
 			reqCtx := test.reqCtx
 			if reqCtx == nil {
 				reqCtx = &RequestContext{}
 			}
-			_, err := server.HandleResponseBody(ctx, reqCtx, &extProcPb.ProcessingRequest{Request: test.req})
-			if err != nil {
-				if !test.wantErr {
-					t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr)
-				}
-				return
-			}
+			server.HandleResponseBodyModelStreaming(ctx, reqCtx, test.body)
 
 			if diff := cmp.Diff(test.want, reqCtx.Usage); diff != "" {
 				t.Errorf("HandleResponseBody returned unexpected response, diff(-want, +got): %v", diff)
diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index a92f091c5..3ac13c892 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -18,69 +18,156 @@ package handlers
 
 import (
 	"context"
+	"encoding/json"
 	"io"
+	"strings"
 	"time"
 
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
+	"github.com/go-logr/logr"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+	requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
 )
 
-func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server {
-	return &Server{
-		scheduler:                                scheduler,
+const (
+	// Certain Envoy implementations set a max limit of 64Kb per streamed chunk; this is intentionally set lower for a safe margin.
+	bodyByteLimit = 62000
+)
+
+func NewStreamingServer(destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore Datastore, director Director) *StreamingServer {
+	return &StreamingServer{
 		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
 		destinationEndpointHintKey:               destinationEndpointHintKey,
+		director:                                 director,
 		datastore:                                datastore,
 	}
 }
 
+type Director interface {
+	HandleRequest(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error)
+	HandleResponse(ctx context.Context, reqCtx *RequestContext) (*RequestContext, error)
+	GetRandomPod() *backend.Pod
+}
+
+type Datastore interface {
+	PoolGet() (*v1alpha2.InferencePool, error)
+}
+
-// Server implements the Envoy external processing server.
+// StreamingServer implements the Envoy external processing server.
 // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
-type Server struct {
-	scheduler Scheduler
+type StreamingServer struct {
 	// The key of the header to specify the target pod address. This value needs to match Envoy
 	// configuration.
 	destinationEndpointHintKey string
 	// The key acting as the outer namespace struct in the metadata extproc response to communicate
 	// back the picked endpoints.
destinationEndpointHintMetadataNamespace string - datastore datastore.Datastore + datastore Datastore + director Director } -type Scheduler interface { - Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backendmetrics.PodMetrics, err error) +// RequestContext stores context information during the life time of an HTTP request. +// TODO: The requestContext is gathering a ton of fields. A future refactor needs to tease these fields apart. +// Specifically, there are fields related to the ext-proc protocol, and then fields related to the lifecycle of the request. +// We should split these apart as this monolithic object exposes too much data to too many layers. +type RequestContext struct { + TargetPod *backend.Pod + TargetEndpoint string + Model string + ResolvedTargetModel string + RequestReceivedTimestamp time.Time + ResponseCompleteTimestamp time.Time + RequestSize int + Usage Usage + ResponseSize int + ResponseComplete bool + ResponseStatusCode string + RequestRunning bool + Request *Request + + SchedulingRequest *schedulingtypes.LLMRequest + + RequestState StreamRequestState + modelServerStreaming bool + + Response *Response + + reqHeaderResp *extProcPb.ProcessingResponse + reqBodyResp []*extProcPb.ProcessingResponse + reqTrailerResp *extProcPb.ProcessingResponse + + respHeaderResp *extProcPb.ProcessingResponse + respBodyResp []*extProcPb.ProcessingResponse + respTrailerResp *extProcPb.ProcessingResponse } -func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { +type Request struct { + Headers map[string]string + Body map[string]any + Metadata map[string]any +} +type Response struct { + Headers map[string]string +} +type StreamRequestState int + +const ( + RequestReceived StreamRequestState = 0 + HeaderRequestResponseComplete StreamRequestState = 1 + BodyRequestResponsesComplete StreamRequestState = 2 + TrailerRequestResponsesComplete StreamRequestState = 3 + ResponseRecieved StreamRequestState = 4 + HeaderResponseResponseComplete StreamRequestState = 5 + BodyResponseResponsesComplete StreamRequestState = 6 + TrailerResponseResponsesComplete StreamRequestState = 7 +) + +func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { ctx := srv.Context() logger := log.FromContext(ctx) - loggerVerbose := logger.V(logutil.VERBOSE) - loggerVerbose.Info("Processing") + loggerTrace := logger.V(logutil.TRACE) + loggerTrace.Info("Processing") // Create request context to share states during life time of an HTTP request. // See https://github.com/envoyproxy/envoy/issues/17540. - reqCtx := &RequestContext{} + reqCtx := &RequestContext{ + RequestState: RequestReceived, + Request: &Request{ + Headers: make(map[string]string), + Body: make(map[string]any), + Metadata: make(map[string]any), + }, + Response: &Response{ + Headers: make(map[string]string), + }, + } + + var body []byte + var responseBody map[string]any - // Create variable for error handling as each request should only report once for - // error metric. This doesn't cover the error "Cannot receive stream request" because - // such error might happen even the response is processed. + // Create error handling var as each request should only report once for + // error metrics. This doesn't cover the error "Cannot receive stream request" because + // such errors might happen even though response is processed. 
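One consequence of `bodyByteLimit` above worth spelling out: when the server streams a mutated body back, `buildCommonResponses` (defined later in this diff) splits it into ceiling(len/limit) messages, with an empty body still producing a single end-of-stream message. A small sketch of that arithmetic (`chunkCount` is a hypothetical helper, not part of the diff):

```go
package main

import "fmt"

const bodyByteLimit = 62000 // mirrors the constant defined above

// chunkCount: how many streamed messages a body of n bytes splits into.
func chunkCount(n int) int {
	if n == 0 {
		return 1 // an empty body still yields one end-of-stream message
	}
	return (n + bodyByteLimit - 1) / bodyByteLimit // ceiling division
}

func main() {
	// Matches the expectations in TestBuildCommonResponses later in this diff.
	for _, n := range []int{0, bodyByteLimit, bodyByteLimit + 1, 2*bodyByteLimit + 1000} {
		fmt.Printf("%d bytes -> %d message(s)\n", n, chunkCount(n))
	}
}
```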
 	var err error
-	defer func(error) {
+	defer func(error, *RequestContext) {
 		if reqCtx.ResponseStatusCode != "" {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode)
 		} else if err != nil {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err))
 		}
-	}(err)
+		if reqCtx.RequestRunning {
+			metrics.DecRunningRequests(reqCtx.Model)
+		}
+	}(err, reqCtx)
 
 	for {
 		select {
@@ -96,72 +183,213 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 		if recvErr != nil {
 			// This error occurs very frequently, though it doesn't seem to have any impact.
 			// TODO Figure out if we can remove this noise.
-			loggerVerbose.Error(err, "Cannot receive stream request")
-			return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err)
+			logger.V(logutil.DEFAULT).Error(recvErr, "Cannot receive stream request")
+			return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr)
 		}
 
-		var resp *extProcPb.ProcessingResponse
+		reqCtx.Request.Metadata = requtil.ExtractMetadataValues(req)
+
 		switch v := req.Request.(type) {
 		case *extProcPb.ProcessingRequest_RequestHeaders:
-			reqCtx.RequestReceivedTimestamp = time.Now()
-			resp = HandleRequestHeaders(ctx, reqCtx, req)
-			loggerVerbose.Info("Request context after HandleRequestHeaders", "context", reqCtx)
+			if requestId := requtil.ExtractHeaderValue(v, requtil.RequestIdHeaderKey); len(requestId) > 0 {
+				logger = logger.WithValues(requtil.RequestIdHeaderKey, requestId)
+				loggerTrace = logger.V(logutil.TRACE)
+				ctx = log.IntoContext(ctx, logger)
+			}
+			err = s.HandleRequestHeaders(ctx, reqCtx, v)
 		case *extProcPb.ProcessingRequest_RequestBody:
-			resp, err = s.HandleRequestBody(ctx, reqCtx, req)
-			if err == nil {
+			loggerTrace.Info("Incoming body chunk", "EoS", v.RequestBody.EndOfStream)
+			// In the stream case, we can receive multiple request bodies.
+			body = append(body, v.RequestBody.Body...)
+
+			// Message is buffered, we can read and decode.
+			if v.RequestBody.EndOfStream {
+				loggerTrace.Info("decoding")
+				err = json.Unmarshal(body, &reqCtx.Request.Body)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body")
+					err = errutil.Error{Code: errutil.BadRequest, Msg: "Error unmarshaling request body: " + string(body)}
+					break
+				}
+
+				// Body stream complete. Allocate empty slice for response to use.
+				body = []byte{}
+
+				reqCtx, err = s.director.HandleRequest(ctx, reqCtx)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error handling request")
+					break
+				}
+
+				// Populate the ExtProc protocol responses for the request body.
+				// Assign to the outer err (rather than shadowing it with :=) so a
+				// marshaling failure is reported back to the client below.
+				var requestBodyBytes []byte
+				requestBodyBytes, err = json.Marshal(reqCtx.Request.Body)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error marshalling request body")
+					break
+				}
+				reqCtx.RequestSize = len(requestBodyBytes)
+				reqCtx.reqHeaderResp = s.generateRequestHeaderResponse(reqCtx)
+				reqCtx.reqBodyResp = s.generateRequestBodyResponses(requestBodyBytes)
+
 				metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel)
 				metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize)
 			}
-			loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx)
+		case *extProcPb.ProcessingRequest_RequestTrailers:
+			// This is currently unused.
 		case *extProcPb.ProcessingRequest_ResponseHeaders:
-			resp, err = s.HandleResponseHeaders(ctx, reqCtx, req)
-			loggerVerbose.Info("Request context after HandleResponseHeaders", "context", reqCtx)
-		case *extProcPb.ProcessingRequest_ResponseBody:
-			// Don't send a 500 on a response error.
Just let the message passthrough and log our error for debugging purposes. - // We assume the body is valid JSON, err messages are not guaranteed to be json, and so capturing and sending a 500 obfuscates the response message. - // using the standard 'err' var will send an immediate error response back to the caller. + for _, header := range v.ResponseHeaders.Headers.GetHeaders() { + value := string(header.RawValue) + + loggerTrace.Info("header", "key", header.Key, "value", value) + if header.Key == "status" && value != "200" { + reqCtx.ResponseStatusCode = errutil.ModelServerError + } else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") { + reqCtx.modelServerStreaming = true + loggerTrace.Info("model server is streaming response") + } + } + reqCtx.RequestState = ResponseRecieved + var responseErr error - resp, responseErr = s.HandleResponseBody(ctx, reqCtx, req) + reqCtx, responseErr = s.HandleResponseHeaders(ctx, reqCtx, v) if responseErr != nil { - logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req) - } else if reqCtx.ResponseComplete { - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response headers", "request", req) } + reqCtx.respHeaderResp = s.generateResponseHeaderResponse(reqCtx) + + case *extProcPb.ProcessingRequest_ResponseBody: if reqCtx.modelServerStreaming { - logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx) + // Currently we punt on response parsing if the modelServer is streaming, and we just passthrough. + + responseText := string(v.ResponseBody.Body) + s.HandleResponseBodyModelStreaming(ctx, reqCtx, responseText) + if v.ResponseBody.EndOfStream { + loggerTrace.Info("stream completed") + + reqCtx.ResponseCompleteTimestamp = time.Now() + metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) + metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) + } + + reqCtx.respBodyResp = generateResponseBodyResponses(v.ResponseBody.Body, v.ResponseBody.EndOfStream) } else { - loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx) + body = append(body, v.ResponseBody.Body...) + + // Message is buffered, we can read and decode. + if v.ResponseBody.EndOfStream { + loggerTrace.Info("stream completed") + // Don't send a 500 on a response error. Just let the message passthrough and log our error for debugging purposes. + // We assume the body is valid JSON, err messages are not guaranteed to be json, and so capturing and sending a 500 obfuscates the response message. + // Using the standard 'err' var will send an immediate error response back to the caller. 
+					responseErr := json.Unmarshal(body, &responseBody)
+					if responseErr != nil {
+						logger.V(logutil.DEFAULT).Error(responseErr, "Error unmarshaling response body", "body", string(body))
+						reqCtx.respBodyResp = generateResponseBodyResponses(body, true)
+						break
+					}
+
+					reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody)
+					if responseErr != nil {
+						logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req)
+					} else if reqCtx.ResponseComplete {
+						reqCtx.ResponseCompleteTimestamp = time.Now()
+						metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+						metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+						metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
+						metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+					}
+				}
+			}
-		default:
-			logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v)
-			return status.Error(codes.Unknown, "unknown request type")
+		case *extProcPb.ProcessingRequest_ResponseTrailers:
+			// This is currently unused.
 		}
 
+		// Handle the err and fire an immediate response.
 		if err != nil {
 			logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req)
-			resp, err = BuildErrResponse(err)
+			resp, err := buildErrResponse(err)
 			if err != nil {
 				return err
 			}
+			if err := srv.Send(resp); err != nil {
+				logger.V(logutil.DEFAULT).Error(err, "Send failed")
+				return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+			}
+			return nil
+		}
+		loggerTrace.Info("checking", "request state", reqCtx.RequestState)
+		if err := reqCtx.updateStateAndSendIfNeeded(srv, logger); err != nil {
+			return err
 		}
+	}
+}
 
-		if !reqCtx.modelServerStreaming {
-			loggerVerbose.Info("Response generated", "response", resp)
-		} else {
-			logger.V(logutil.DEBUG).Info("Response generated", "response", resp)
+// updateStateAndSendIfNeeded checks state and can send multiple responses in a single pass, but only if ordered properly.
+// Order of requests matters in FULL_DUPLEX_STREAMING. For both request and response, the order of responses sent back MUST be: Header->Body->Trailer, with the trailer being optional.
+func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, logger logr.Logger) error {
+	loggerTrace := logger.V(logutil.TRACE)
+	// No switch statement as we could send multiple responses in one pass.
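Before the state machine below, the ordering contract in one picture: whatever is ready must leave in Header, then Body, then optional Trailer order. A minimal sketch with a stand-in `send` (the real code sends `*extProcPb.ProcessingResponse` values via `srv.Send`):

```go
package main

import "fmt"

// sendInOrder illustrates the FULL_DUPLEX_STREAMING ordering constraint;
// it is a sketch, not the actual state machine that follows.
func sendInOrder(send func(string) error, header string, bodies []string, trailer string) error {
	if header != "" {
		if err := send(header); err != nil {
			return err
		}
	}
	for _, b := range bodies {
		if err := send(b); err != nil {
			return err
		}
	}
	if trailer != "" { // trailers are optional
		return send(trailer)
	}
	return nil
}

func main() {
	_ = sendInOrder(func(msg string) error {
		fmt.Println("sent:", msg)
		return nil
	}, "request-headers", []string{"body-chunk-1", "body-chunk-2"}, "")
}
```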
+ if r.RequestState == RequestReceived && r.reqHeaderResp != nil { + loggerTrace.Info("Sending request header response", "obj", r.reqHeaderResp) + if err := srv.Send(r.reqHeaderResp); err != nil { + logger.V(logutil.DEFAULT).Error(err, "error sending response") + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + r.RequestState = HeaderRequestResponseComplete + } + if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil && len(r.reqBodyResp) > 0 { + loggerTrace.Info("Sending request body response(s)") + + for _, response := range r.reqBodyResp { + if err := srv.Send(response); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } + r.RequestState = BodyRequestResponsesComplete + metrics.IncRunningRequests(r.Model) + r.RequestRunning = true + // Dump the response so a new stream message can begin + r.reqBodyResp = nil + } + if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil { + // Trailers in requests are not guaranteed + if err := srv.Send(r.reqTrailerResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } + if r.RequestState == ResponseRecieved && r.respHeaderResp != nil { + loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp) + if err := srv.Send(r.respHeaderResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } - if err := srv.Send(resp); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Send failed") + r.RequestState = HeaderResponseResponseComplete + } + if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil && len(r.respBodyResp) > 0 { + loggerTrace.Info("Sending response body response(s)") + for _, response := range r.respBodyResp { + if err := srv.Send(response); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + + body := response.Response.(*extProcPb.ProcessingResponse_ResponseBody) + if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() { + r.RequestState = BodyResponseResponsesComplete + } + } + // Dump the response so a new stream message can begin + r.respBodyResp = nil + } + if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil { + // Trailers in requests are not guaranteed + if err := srv.Send(r.respTrailerResp); err != nil { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } + return nil } -func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) { +func buildErrResponse(err error) (*extProcPb.ProcessingResponse, error) { var resp *extProcPb.ProcessingResponse switch errutil.CanonicalCode(err) { @@ -188,6 +416,17 @@ func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) { }, }, } + // This code can be returned by the director when there are no candidate pods for the request scheduling. + case errutil.ServiceUnavailable: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_ServiceUnavailable, + }, + }, + }, + } // This code can be returned when users provide invalid json request. 
 	case errutil.BadRequest:
 		resp = &extProcPb.ProcessingResponse{
@@ -212,45 +451,55 @@
 	default:
 		return nil, status.Errorf(status.Code(err), "failed to handle request: %v", err)
 	}
+
+	if err.Error() != "" {
+		resp.Response.(*extProcPb.ProcessingResponse_ImmediateResponse).ImmediateResponse.Body = []byte(err.Error())
+	}
+
 	return resp, nil
 }
 
-// RequestContext stores context information during the life time of an HTTP request.
-type RequestContext struct {
-	TargetPod                 string
-	TargetEndpoint            string
-	Model                     string
-	ResolvedTargetModel       string
-	RequestReceivedTimestamp  time.Time
-	ResponseCompleteTimestamp time.Time
-	RequestSize               int
-	Usage                     Usage
-	ResponseSize              int
-	ResponseComplete          bool
-	ResponseStatusCode        string
-	RequestRunning            bool
-
-	RequestState         StreamRequestState
-	modelServerStreaming bool
+func buildCommonResponses(bodyBytes []byte, byteLimit int, setEos bool) []*extProcPb.CommonResponse {
+	responses := []*extProcPb.CommonResponse{}
+	startingIndex := 0
+	bodyLen := len(bodyBytes)
 
-	reqHeaderResp  *extProcPb.ProcessingResponse
-	reqBodyResp    *extProcPb.ProcessingResponse
-	reqTrailerResp *extProcPb.ProcessingResponse
+	if bodyLen == 0 {
+		return []*extProcPb.CommonResponse{
+			{
+				BodyMutation: &extProcPb.BodyMutation{
+					Mutation: &extProcPb.BodyMutation_StreamedResponse{
+						StreamedResponse: &extProcPb.StreamedBodyResponse{
+							Body:        bodyBytes,
+							EndOfStream: setEos,
+						},
+					},
+				},
+			},
+		}
+	}
 
-	respHeaderResp  *extProcPb.ProcessingResponse
-	respBodyResp    *extProcPb.ProcessingResponse
-	respTrailerResp *extProcPb.ProcessingResponse
-}
+	for startingIndex < bodyLen {
+		eos := false
+		// chunkSize avoids shadowing the builtin len, which the earlier draft did.
+		chunkSize := min(bodyLen-startingIndex, byteLimit)
+		chunk := bodyBytes[startingIndex : chunkSize+startingIndex]
+		if setEos && chunkSize+startingIndex >= bodyLen {
+			eos = true
+		}
 
-type StreamRequestState int
+		commonResp := &extProcPb.CommonResponse{
+			BodyMutation: &extProcPb.BodyMutation{
+				Mutation: &extProcPb.BodyMutation_StreamedResponse{
+					StreamedResponse: &extProcPb.StreamedBodyResponse{
+						Body:        chunk,
+						EndOfStream: eos,
+					},
+				},
+			},
+		}
+		responses = append(responses, commonResp)
+		startingIndex += chunkSize
+	}
 
-const (
-	RequestReceived                  StreamRequestState = 0
-	HeaderRequestResponseComplete    StreamRequestState = 1
-	BodyRequestResponsesComplete     StreamRequestState = 2
-	TrailerRequestResponsesComplete  StreamRequestState = 3
-	ResponseRecieved                 StreamRequestState = 4
-	HeaderResponseResponseComplete   StreamRequestState = 5
-	BodyResponseResponsesComplete    StreamRequestState = 6
-	TrailerResponseResponsesComplete StreamRequestState = 7
-)
+	return responses
+}
diff --git a/pkg/epp/handlers/server_test.go b/pkg/epp/handlers/server_test.go
new file mode 100644
index 000000000..72f9d1b96
--- /dev/null
+++ b/pkg/epp/handlers/server_test.go
@@ -0,0 +1,85 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package handlers + +import ( + "crypto/rand" + "testing" +) + +func TestBuildCommonResponses(t *testing.T) { + tests := []struct { + name string + count int + expectedMessageCount int + }{ + { + name: "zero case", + count: 0, + expectedMessageCount: 1, + }, + { + name: "below limit", + count: bodyByteLimit - 1000, + expectedMessageCount: 1, + }, + { + name: "at limit", + count: bodyByteLimit, + expectedMessageCount: 1, + }, + { + name: "off by one error?", + count: bodyByteLimit + 1, + expectedMessageCount: 2, + }, + { + name: "above limit", + count: bodyByteLimit + 1000, + expectedMessageCount: 2, + }, + { + name: "above limit", + count: (bodyByteLimit * 2) + 1000, + expectedMessageCount: 3, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + arr := generateBytes(test.count) + responses := buildCommonResponses(arr, bodyByteLimit, true) + for i, response := range responses { + eos := response.BodyMutation.GetStreamedResponse().GetEndOfStream() + if eos == true && i+1 != len(responses) { + t.Fatalf("EoS should not be set") + } + if eos == false && i+1 == len(responses) { + t.Fatalf("EoS should be set") + } + } + if len(responses) != test.expectedMessageCount { + t.Fatalf("Expected: %v, Got %v", test.expectedMessageCount, len(responses)) + } + }) + } +} + +func generateBytes(count int) []byte { + arr := make([]byte, count) + _, _ = rand.Read(arr) + return arr +} diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go deleted file mode 100644 index 874dd734f..000000000 --- a/pkg/epp/handlers/streamingserver.go +++ /dev/null @@ -1,592 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package handlers - -import ( - "context" - "encoding/json" - "fmt" - "io" - "math/rand" - "strconv" - "strings" - "time" - - configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/go-logr/logr" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/structpb" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer { - return &StreamingServer{ - scheduler: scheduler, - destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace, - destinationEndpointHintKey: destinationEndpointHintKey, - datastore: datastore, - } -} - -type StreamingServer struct { - scheduler Scheduler - // The key of the header to specify the target pod address. This value needs to match Envoy - // configuration. - destinationEndpointHintKey string - // The key acting as the outer namespace struct in the metadata extproc response to communicate - // back the picked endpoints. - destinationEndpointHintMetadataNamespace string - datastore datastore.Datastore -} - -func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { - ctx := srv.Context() - logger := log.FromContext(ctx) - loggerTrace := logger.V(logutil.TRACE) - loggerTrace.Info("Processing") - - // Create request context to share states during life time of an HTTP request. - // See https://github.com/envoyproxy/envoy/issues/17540. - reqCtx := &RequestContext{ - RequestState: RequestReceived, - } - - var body []byte - var requestBody, responseBody map[string]interface{} - - // Create error handling var as each request should only report once for - // error metrics. This doesn't cover the error "Cannot receive stream request" because - // such errors might happen even though response is processed. - var err error - defer func(error, *RequestContext) { - if reqCtx.ResponseStatusCode != "" { - metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode) - } else if err != nil { - metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err)) - } - if reqCtx.RequestRunning { - metrics.DecRunningRequests(reqCtx.Model) - } - }(err, reqCtx) - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - req, recvErr := srv.Recv() - if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled { - return nil - } - if recvErr != nil { - // This error occurs very frequently, though it doesn't seem to have any impact. - // TODO Figure out if we can remove this noise. 
- logger.V(logutil.DEFAULT).Error(err, "Cannot receive stream request") - return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) - } - - switch v := req.Request.(type) { - case *extProcPb.ProcessingRequest_RequestHeaders: - err = s.HandleRequestHeaders(ctx, reqCtx, v) - case *extProcPb.ProcessingRequest_RequestBody: - loggerTrace.Info("Incoming body chunk", "EoS", v.RequestBody.EndOfStream) - // In the stream case, we can receive multiple request bodies. - body = append(body, v.RequestBody.Body...) - - // Message is buffered, we can read and decode. - if v.RequestBody.EndOfStream { - loggerTrace.Info("decoding") - err = json.Unmarshal(body, &requestBody) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") - } - - // Body stream complete. Allocate empty slice for response to use. - body = []byte{} - - reqCtx, err = s.HandleRequestBody(ctx, reqCtx, req, requestBody) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error handling body") - } else { - metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) - metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) - } - } - case *extProcPb.ProcessingRequest_RequestTrailers: - // This is currently unused. - case *extProcPb.ProcessingRequest_ResponseHeaders: - for _, header := range v.ResponseHeaders.Headers.GetHeaders() { - value := string(header.RawValue) - - loggerTrace.Info("header", "key", header.Key, "value", value) - if header.Key == "status" && value != "200" { - reqCtx.ResponseStatusCode = errutil.ModelServerError - } else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") { - reqCtx.modelServerStreaming = true - loggerTrace.Info("model server is streaming response") - } - } - reqCtx.RequestState = ResponseRecieved - reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - // This is for debugging purpose only. - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, - }, - }, - }, - }, - }, - } - - case *extProcPb.ProcessingRequest_ResponseBody: - if reqCtx.modelServerStreaming { - // Currently we punt on response parsing if the modelServer is streaming, and we just passthrough. - - responseText := string(v.ResponseBody.Body) - s.HandleResponseBodyModelStreaming(ctx, reqCtx, responseText) - if v.ResponseBody.EndOfStream { - loggerTrace.Info("stream completed") - - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - } - - reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: v.ResponseBody.Body, - EndOfStream: v.ResponseBody.EndOfStream, - }, - }, - }, - }, - }, - }, - } - } else { - body = append(body, v.ResponseBody.Body...) - - // Message is buffered, we can read and decode. 
- if v.ResponseBody.EndOfStream { - loggerTrace.Info("stream completed") - // Don't send a 500 on a response error. Just let the message passthrough and log our error for debugging purposes. - // We assume the body is valid JSON, err messages are not guaranteed to be json, and so capturing and sending a 500 obfuscates the response message. - // using the standard 'err' var will send an immediate error response back to the caller. - var responseErr error - responseErr = json.Unmarshal(body, &responseBody) - if responseErr != nil { - logger.V(logutil.DEFAULT).Error(responseErr, "Error unmarshaling request body") - } - - reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody) - if responseErr != nil { - logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req) - } else if reqCtx.ResponseComplete { - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) - } - } - } - case *extProcPb.ProcessingRequest_ResponseTrailers: - // This is currently unused. - } - - // Handle the err and fire an immediate response. - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) - resp, err := BuildErrResponse(err) - if err != nil { - return err - } - if err := srv.Send(resp); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Send failed") - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - return nil - } - loggerTrace.Info("checking", "request state", reqCtx.RequestState) - if err := reqCtx.updateStateAndSendIfNeeded(srv, logger); err != nil { - return err - } - } -} - -// updateStateAndSendIfNeeded checks state and can send mutiple responses in a single pass, but only if ordered properly. -// Order of requests matter in FULL_DUPLEX_STREAMING. For both request and response, the order of response sent back MUST be: Header->Body->Trailer, with trailer being optional. -func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, logger logr.Logger) error { - loggerTrace := logger.V(logutil.TRACE) - // No switch statement as we could send multiple responses in one pass. 
- if r.RequestState == RequestReceived && r.reqHeaderResp != nil { - loggerTrace.Info("Sending request header response", "obj", r.reqHeaderResp) - if err := srv.Send(r.reqHeaderResp); err != nil { - logger.V(logutil.DEFAULT).Error(err, "error sending response") - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = HeaderRequestResponseComplete - } - if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil { - loggerTrace.Info("Sending request body response") - if err := srv.Send(r.reqBodyResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = BodyRequestResponsesComplete - metrics.IncRunningRequests(r.Model) - r.RequestRunning = true - // Dump the response so a new stream message can begin - r.reqBodyResp = nil - } - if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil { - // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - } - if r.RequestState == ResponseRecieved && r.respHeaderResp != nil { - loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp) - if err := srv.Send(r.respHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = HeaderResponseResponseComplete - } - if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil { - loggerTrace.Info("Sending response body response") - if err := srv.Send(r.respBodyResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - - body := r.respBodyResp.Response.(*extProcPb.ProcessingResponse_ResponseBody) - if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() { - r.RequestState = BodyResponseResponsesComplete - } - // Dump the response so a new stream message can begin - r.respBodyResp = nil - } - if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil { - // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - } - return nil -} - -// HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling. -func (s *StreamingServer) HandleRequestBody( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, - requestBodyMap map[string]interface{}, -) (*RequestContext, error) { - var requestBodyBytes []byte - logger := log.FromContext(ctx) - - // Resolve target models. - model, ok := requestBodyMap["model"].(string) - if !ok { - return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} - } - - modelName := model - - // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. - // This might be a security risk in the future where adapters not registered in the InferenceModel - // are able to be requested by using their distinct name. 
- modelObj := s.datastore.ModelGet(model) - if modelObj == nil { - return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} - } - if len(modelObj.Spec.TargetModels) > 0 { - modelName = RandomWeightedDraw(logger, modelObj, 0) - if modelName == "" { - return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} - } - } - llmReq := &scheduling.LLMRequest{ - Model: model, - ResolvedTargetModel: modelName, - Critical: datastore.IsCritical(modelObj), - } - logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical) - - var err error - // Update target models in the body. - if llmReq.Model != llmReq.ResolvedTargetModel { - requestBodyMap["model"] = llmReq.ResolvedTargetModel - } - - requestBodyBytes, err = json.Marshal(requestBodyMap) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} - } - - target, err := s.scheduler.Schedule(ctx, llmReq) - if err != nil { - return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} - } - targetPod := target.GetPod() - - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. - // Attach the port number - pool, err := s.datastore.PoolGet() - if err != nil { - return reqCtx, err - } - endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) - - logger.V(logutil.DEFAULT).Info("Request handled", - "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod, "endpoint metrics", - fmt.Sprintf("%+v", target)) - - reqCtx.Model = llmReq.Model - reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel - reqCtx.RequestSize = len(requestBodyBytes) - reqCtx.TargetPod = targetPod.NamespacedName.String() - reqCtx.TargetEndpoint = endpoint - - s.populateRequestHeaderResponse(reqCtx, endpoint, len(requestBodyBytes)) - - reqCtx.reqBodyResp = &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: requestBodyBytes, - EndOfStream: true, - }, - }, - }, - }, - }, - }, - } - return reqCtx, nil -} - -// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling. 
-func (s *StreamingServer) HandleResponseBody( - ctx context.Context, - reqCtx *RequestContext, - response map[string]interface{}, -) (*RequestContext, error) { - logger := log.FromContext(ctx) - responseBytes, err := json.Marshal(response) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") - return reqCtx, err - } - if response["usage"] != nil { - usg := response["usage"].(map[string]interface{}) - usage := Usage{ - PromptTokens: int(usg["prompt_tokens"].(float64)), - CompletionTokens: int(usg["completion_tokens"].(float64)), - TotalTokens: int(usg["total_tokens"].(float64)), - } - reqCtx.Usage = usage - logger.V(logutil.VERBOSE).Info("Response generated", "usage", reqCtx.Usage) - } - reqCtx.ResponseSize = len(responseBytes) - // ResponseComplete is to indicate the response is complete. In non-streaming - // case, it will be set to be true once the response is processed; in - // streaming case, it will be set to be true once the last chunk is processed. - // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) - // will add the processing for streaming case. - reqCtx.ResponseComplete = true - - reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: responseBytes, - EndOfStream: true, - }, - }, - }, - }, - }, - }, - } - return reqCtx, nil -} - -// The function is to handle streaming response if the modelServer is streaming. -func (s *StreamingServer) HandleResponseBodyModelStreaming( - ctx context.Context, - reqCtx *RequestContext, - responseText string, -) { - if strings.Contains(responseText, streamingEndMsg) { - resp := ParseRespForUsage(ctx, responseText) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.CompletionTokens) - } -} - -func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error { - reqCtx.RequestReceivedTimestamp = time.Now() - - // an EoS in the request headers means this request has no body or trailers. - if req.RequestHeaders.EndOfStream { - // We will route this request to a random pod as this is assumed to just be a GET - // More context: https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/526 - // The above PR will address endpoint admission, but currently any request without a body will be - // routed to a random upstream pod. 
- pod := GetRandomPod(s.datastore) - pool, err := s.datastore.PoolGet() - if err != nil { - return err - } - endpoint := pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) - s.populateRequestHeaderResponse(reqCtx, endpoint, 0) - } - return nil -} - -func (s *StreamingServer) populateRequestHeaderResponse(reqCtx *RequestContext, endpoint string, requestBodyLength int) { - headers := []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: s.destinationEndpointHintKey, - RawValue: []byte(endpoint), - }, - }, - } - if requestBodyLength > 0 { - // We need to update the content length header if the body is mutated, see Envoy doc: - // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto - headers = append(headers, &configPb.HeaderValueOption{ - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(requestBodyLength)), - }, - }) - } - - targetEndpointValue := &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintKey: { - Kind: &structpb.Value_StringValue{ - StringValue: endpoint, - }, - }, - }, - } - dynamicMetadata := targetEndpointValue - if s.destinationEndpointHintMetadataNamespace != "" { - // If a namespace is defined, wrap the selected endpoint with that. - dynamicMetadata = &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintMetadataNamespace: { - Kind: &structpb.Value_StructValue{ - StructValue: targetEndpointValue, - }, - }, - }, - } - } - - reqCtx.reqHeaderResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: headers, - }, - }, - }, - }, - DynamicMetadata: dynamicMetadata, - } -} - -func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string { - // TODO: after we are down to 1 server implementation, make these methods a part of the struct - // and handle random seeding on the struct. - source := rand.NewSource(rand.Int63()) - if seed > 0 { - source = rand.NewSource(seed) - } - r := rand.New(source) - - // all the weight values are nil, then we should return random model name - if model.Spec.TargetModels[0].Weight == nil { - index := r.Int31n(int32(len(model.Spec.TargetModels))) - return model.Spec.TargetModels[index].Name - } - - var weights int32 - for _, model := range model.Spec.TargetModels { - weights += *model.Weight - } - logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights) - randomVal := r.Int31n(weights) - // TODO: optimize this without using loop - for _, model := range model.Spec.TargetModels { - if randomVal < *model.Weight { - return model.Name - } - randomVal -= *model.Weight - } - return "" -} - -func GetRandomPod(ds datastore.Datastore) *backendmetrics.Pod { - pods := ds.PodGetAll() - number := rand.Intn(len(pods)) - pod := pods[number] - return pod.GetPod() -} diff --git a/pkg/epp/handlers/streamingserver_test.go b/pkg/epp/handlers/streamingserver_test.go deleted file mode 100644 index 72f7031a4..000000000 --- a/pkg/epp/handlers/streamingserver_test.go +++ /dev/null @@ -1,131 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package handlers - -import ( - "testing" - - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -func TestRandomWeightedDraw(t *testing.T) { - logger := logutil.NewTestLogger() - tests := []struct { - name string - model *v1alpha2.InferenceModel - want string - }{ - { - name: "'random' distribution", - model: &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - TargetModels: []v1alpha2.TargetModel{ - { - Name: "canary", - Weight: pointer(50), - }, - { - Name: "v1", - Weight: pointer(50), - }, - }, - }, - }, - want: "canary", - }, - { - name: "'random' distribution", - model: &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - TargetModels: []v1alpha2.TargetModel{ - { - Name: "canary", - Weight: pointer(25), - }, - { - Name: "v1.1", - Weight: pointer(55), - }, - { - Name: "v1", - Weight: pointer(50), - }, - }, - }, - }, - want: "v1", - }, - { - name: "'random' distribution", - model: &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - TargetModels: []v1alpha2.TargetModel{ - { - Name: "canary", - Weight: pointer(20), - }, - { - Name: "v1.1", - Weight: pointer(20), - }, - { - Name: "v1", - Weight: pointer(10), - }, - }, - }, - }, - want: "v1.1", - }, - { - name: "weighted distribution with weight unset", - model: &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - TargetModels: []v1alpha2.TargetModel{ - { - Name: "canary", - }, - { - Name: "v1.1", - }, - { - Name: "v1", - }, - }, - }, - }, - want: "canary", - }, - } - var seedVal int64 = 420 - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - for range 10000 { - model := RandomWeightedDraw(logger, test.model, seedVal) - if model != test.want { - t.Errorf("Model returned: %v != %v", model, test.want) - break - } - } - }) - } -} - -func pointer(v int32) *int32 { - return &v -} diff --git a/pkg/epp/metrics/collectors/inference_pool.go b/pkg/epp/metrics/collectors/inference_pool.go new file mode 100644 index 000000000..2be3c1957 --- /dev/null +++ b/pkg/epp/metrics/collectors/inference_pool.go @@ -0,0 +1,79 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package collectors
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	compbasemetrics "k8s.io/component-base/metrics"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"
+)
+
+var (
+	descInferencePoolPerPodQueueSize = prometheus.NewDesc(
+		"inference_pool_per_pod_queue_size",
+		metricsutil.HelpMsgWithStability("The total number of requests pending in the model server queue for each underlying pod.", compbasemetrics.ALPHA),
+		[]string{
+			"name",
+			"model_server_pod",
+		}, nil,
+	)
+)
+
+type inferencePoolMetricsCollector struct {
+	ds datastore.Datastore
+}
+
+// Compile-time assertion that inferencePoolMetricsCollector implements the prometheus.Collector interface.
+var _ prometheus.Collector = &inferencePoolMetricsCollector{}
+
+// NewInferencePoolMetricsCollector returns a prometheus.Collector that
+// exposes metrics about the inference pool.
+func NewInferencePoolMetricsCollector(ds datastore.Datastore) prometheus.Collector {
+	return &inferencePoolMetricsCollector{
+		ds: ds,
+	}
+}
+
+// Describe implements the prometheus.Collector interface.
+func (c *inferencePoolMetricsCollector) Describe(ch chan<- *prometheus.Desc) {
+	ch <- descInferencePoolPerPodQueueSize
+}
+
+// Collect implements the prometheus.Collector interface.
+func (c *inferencePoolMetricsCollector) Collect(ch chan<- prometheus.Metric) {
+	pool, err := c.ds.PoolGet()
+	if err != nil {
+		return
+	}
+
+	podMetrics := c.ds.PodGetAll()
+	if len(podMetrics) == 0 {
+		return
+	}
+
+	for _, pod := range podMetrics {
+		ch <- prometheus.MustNewConstMetric(
+			descInferencePoolPerPodQueueSize,
+			prometheus.GaugeValue,
+			float64(pod.GetMetrics().WaitingQueueSize),
+			pool.Name,
+			pod.GetPod().NamespacedName.Name,
+		)
+	}
+}
diff --git a/pkg/epp/metrics/collectors/inference_pool_test.go b/pkg/epp/metrics/collectors/inference_pool_test.go
new file mode 100644
index 000000000..d97377ee7
--- /dev/null
+++ b/pkg/epp/metrics/collectors/inference_pool_test.go
@@ -0,0 +1,101 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package collectors + +import ( + "context" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/component-base/metrics/testutil" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" +) + +var ( + pod1 = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + } + pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} + pod1Metrics = &backendmetrics.MetricsState{ + WaitingQueueSize: 100, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + } +) + +func TestNoMetricsCollected(t *testing.T) { + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + datastore := datastore.NewDatastore(context.Background(), pmf) + + collector := &inferencePoolMetricsCollector{ + ds: datastore, + } + + if err := testutil.CollectAndCompare(collector, strings.NewReader(""), ""); err != nil { + t.Fatal(err) + } +} + +func TestMetricsCollected(t *testing.T) { + pmc := &backendmetrics.FakePodMetricsClient{ + Res: map[types.NamespacedName]*backendmetrics.MetricsState{ + pod1NamespacedName: pod1Metrics, + }, + } + pmf := backendmetrics.NewPodMetricsFactory(pmc, time.Millisecond) + ds := datastore.NewDatastore(context.Background(), pmf) + + scheme := runtime.NewScheme() + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() + + inferencePool := &v1alpha2.InferencePool{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + }, + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: 8000, + }, + } + _ = ds.PoolSet(context.Background(), fakeClient, inferencePool) + _ = ds.PodUpdateOrAddIfNotExist(pod1) + + time.Sleep(1 * time.Second) + + collector := &inferencePoolMetricsCollector{ + ds: ds, + } + err := testutil.CollectAndCompare(collector, strings.NewReader(` + # HELP inference_pool_per_pod_queue_size [ALPHA] The total number of requests pending in the model server queue for each underlying pod. + # TYPE inference_pool_per_pod_queue_size gauge + inference_pool_per_pod_queue_size{model_server_pod="pod1",name="test-pool"} 100 +`), "inference_pool_per_pod_queue_size") + if err != nil { + t.Fatal(err) + } +} diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index 434b8381d..50f637478 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -21,168 +21,294 @@ import ( "sync" "time" + "github.com/prometheus/client_golang/prometheus" compbasemetrics "k8s.io/component-base/metrics" - "k8s.io/component-base/metrics/legacyregistry" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + metricsutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics" ) const ( InferenceModelComponent = "inference_model" InferencePoolComponent = "inference_pool" + InferenceExtension = "inference_extension" +) + +var ( + // The git hash of the latest commit in the build. + CommitSHA string + + // The build ref from the _PULL_BASE_REF from cloud build trigger. 
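+	//
+	// Like CommitSHA, this is expected to be injected at link time (for
+	// example, illustratively: go build -ldflags
+	// "-X sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics.BuildRef=<ref>")
+	// and is surfaced through the inference_extension_info metric.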
+ BuildRef string ) var ( // Inference Model Metrics - requestCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ - Subsystem: InferenceModelComponent, - Name: "request_total", - Help: "Counter of inference model requests broken out for each model and target model.", - StabilityLevel: compbasemetrics.ALPHA, + requestCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: InferenceModelComponent, + Name: "request_total", + Help: metricsutil.HelpMsgWithStability("Counter of inference model requests broken out for each model and target model.", compbasemetrics.ALPHA), }, []string{"model_name", "target_model_name"}, ) - requestErrCounter = compbasemetrics.NewCounterVec( - &compbasemetrics.CounterOpts{ - Subsystem: InferenceModelComponent, - Name: "request_error_total", - Help: "Counter of inference model requests errors broken out for each model and target model.", - StabilityLevel: compbasemetrics.ALPHA, + requestErrCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: InferenceModelComponent, + Name: "request_error_total", + Help: metricsutil.HelpMsgWithStability("Counter of inference model requests errors broken out for each model and target model.", compbasemetrics.ALPHA), }, []string{"model_name", "target_model_name", "error_code"}, ) - requestLatencies = compbasemetrics.NewHistogramVec( - &compbasemetrics.HistogramOpts{ + requestLatencies = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "request_duration_seconds", - Help: "Inference model response latency distribution in seconds for each model and target model.", + Help: metricsutil.HelpMsgWithStability("Inference model response latency distribution in seconds for each model and target model.", compbasemetrics.ALPHA), Buckets: []float64{ 0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600, }, - StabilityLevel: compbasemetrics.ALPHA, }, []string{"model_name", "target_model_name"}, ) - requestSizes = compbasemetrics.NewHistogramVec( - &compbasemetrics.HistogramOpts{ + requestSizes = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "request_sizes", - Help: "Inference model requests size distribution in bytes for each model and target model.", + Help: metricsutil.HelpMsgWithStability("Inference model requests size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), // Use buckets ranging from 1000 bytes (1KB) to 10^9 bytes (1GB). Buckets: []float64{ 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, // More fine-grained up to 64KB 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, // Exponential up to 8MB 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, // Exponential up to 1GB }, - StabilityLevel: compbasemetrics.ALPHA, }, []string{"model_name", "target_model_name"}, ) - responseSizes = compbasemetrics.NewHistogramVec( - &compbasemetrics.HistogramOpts{ + responseSizes = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "response_sizes", - Help: "Inference model responses size distribution in bytes for each model and target model.", + Help: metricsutil.HelpMsgWithStability("Inference model responses size distribution in bytes for each model and target model.", compbasemetrics.ALPHA), // Most models have a response token < 8192 tokens. 
Each token, in average, has 4 characters. // 8192 * 4 = 32768. - Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536}, - StabilityLevel: compbasemetrics.ALPHA, + Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536}, }, []string{"model_name", "target_model_name"}, ) - inputTokens = compbasemetrics.NewHistogramVec( - &compbasemetrics.HistogramOpts{ + inputTokens = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "input_tokens", - Help: "Inference model input token count distribution for requests in each model.", + Help: metricsutil.HelpMsgWithStability("Inference model input token count distribution for requests in each model.", compbasemetrics.ALPHA), // Most models have a input context window less than 1 million tokens. - Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576}, - StabilityLevel: compbasemetrics.ALPHA, + Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32778, 65536, 131072, 262144, 524288, 1048576}, }, []string{"model_name", "target_model_name"}, ) - outputTokens = compbasemetrics.NewHistogramVec( - &compbasemetrics.HistogramOpts{ + outputTokens = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "output_tokens", - Help: "Inference model output token count distribution for requests in each model.", + Help: metricsutil.HelpMsgWithStability("Inference model output token count distribution for requests in each model.", compbasemetrics.ALPHA), // Most models generates output less than 8192 tokens. - Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}, - StabilityLevel: compbasemetrics.ALPHA, + Buckets: []float64{1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}, }, []string{"model_name", "target_model_name"}, ) - runningRequests = compbasemetrics.NewGaugeVec( - &compbasemetrics.GaugeOpts{ - Subsystem: InferenceModelComponent, - Name: "running_requests", - Help: "Inference model number of running requests in each model.", - StabilityLevel: compbasemetrics.ALPHA, + runningRequests = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferenceModelComponent, + Name: "running_requests", + Help: metricsutil.HelpMsgWithStability("Inference model number of running requests in each model.", compbasemetrics.ALPHA), }, []string{"model_name"}, ) + // NTPOT - Normalized Time Per Output Token + NormalizedTimePerOutputToken = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceModelComponent, + Name: "normalized_time_per_output_token_seconds", + Help: metricsutil.HelpMsgWithStability("Inference model latency divided by number of output tokens in seconds for each model and target model.", compbasemetrics.ALPHA), + // From few milliseconds per token to multiple seconds per token + Buckets: []float64{ + 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, + }, + }, + []string{"model_name", "target_model_name"}, + ) + // Inference Pool Metrics - inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec( - &compbasemetrics.GaugeOpts{ - Subsystem: InferencePoolComponent, - Name: "average_kv_cache_utilization", - Help: "The average kv cache utilization for an inference server pool.", - StabilityLevel: compbasemetrics.ALPHA, + inferencePoolAvgKVCache = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: 
InferencePoolComponent, + Name: "average_kv_cache_utilization", + Help: metricsutil.HelpMsgWithStability("The average kv cache utilization for an inference server pool.", compbasemetrics.ALPHA), }, []string{"name"}, ) - inferencePoolAvgQueueSize = compbasemetrics.NewGaugeVec( - &compbasemetrics.GaugeOpts{ - Subsystem: InferencePoolComponent, - Name: "average_queue_size", - Help: "The average number of requests pending in the model server queue.", - StabilityLevel: compbasemetrics.ALPHA, + inferencePoolAvgQueueSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "average_queue_size", + Help: metricsutil.HelpMsgWithStability("The average number of requests pending in the model server queue.", compbasemetrics.ALPHA), }, []string{"name"}, ) - inferencePoolReadyPods = compbasemetrics.NewGaugeVec( - &compbasemetrics.GaugeOpts{ - Subsystem: InferencePoolComponent, - Name: "ready_pods", - Help: "The number of ready pods in the inference server pool.", - StabilityLevel: compbasemetrics.ALPHA, + inferencePoolReadyPods = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "ready_pods", + Help: metricsutil.HelpMsgWithStability("The number of ready pods in the inference server pool.", compbasemetrics.ALPHA), }, []string{"name"}, ) + + // Scheduler Metrics + SchedulerE2ELatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "scheduler_e2e_duration_seconds", + Help: metricsutil.HelpMsgWithStability("End-to-end scheduling latency distribution in seconds.", compbasemetrics.ALPHA), + Buckets: []float64{ + 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, + }, + // StabilityLevel: prometheus.ALPHA, + }, + []string{}, + ) + SchedulerPluginProcessingLatencies = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "scheduler_plugin_duration_seconds", + Help: metricsutil.HelpMsgWithStability("Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name.", compbasemetrics.ALPHA), + Buckets: []float64{ + 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, + }, + }, + []string{"plugin_type", "plugin_name"}, + ) + + RequestControlPluginProcessingLatencies = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "request_control_plugin_duration_seconds", + Help: metricsutil.HelpMsgWithStability("RequestControl plugin processing latency distribution in seconds for each plugin type and plugin name.", compbasemetrics.ALPHA), + Buckets: []float64{ + 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, + }, + }, + []string{"plugin_type", "plugin_name"}, + ) + + // Prefix indexer Metrics + PrefixCacheSize = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferenceExtension, + Name: "prefix_indexer_size", + Help: metricsutil.HelpMsgWithStability("Size of the prefix indexer.", compbasemetrics.ALPHA), + }, + []string{}, + ) + + PrefixCacheHitRatio = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "prefix_indexer_hit_ratio", + Help: metricsutil.HelpMsgWithStability("Ratio of prefix length matched to total prefix length in the cache lookup.", compbasemetrics.ALPHA), + // Buckets from 0.0 to 1.0 in increments + Buckets: []float64{0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0}, + }, + []string{}, + ) + + PrefixCacheHitLength = prometheus.NewHistogramVec( + 
prometheus.HistogramOpts{ + Subsystem: InferenceExtension, + Name: "prefix_indexer_hit_bytes", + Help: metricsutil.HelpMsgWithStability("Length of the prefix match in number of bytes in the cache lookup.", compbasemetrics.ALPHA), + Buckets: []float64{0, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}, + }, + []string{}, + ) + + // Info Metrics + InferenceExtensionInfo = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: InferenceExtension, + Name: "info", + Help: metricsutil.HelpMsgWithStability("General information of the current build of Inference Extension.", compbasemetrics.ALPHA), + }, + []string{"commit", "build_ref"}, + ) ) var registerMetrics sync.Once // Register all metrics. -func Register() { +func Register(customCollectors ...prometheus.Collector) { registerMetrics.Do(func() { - legacyregistry.MustRegister(requestCounter) - legacyregistry.MustRegister(requestErrCounter) - legacyregistry.MustRegister(requestLatencies) - legacyregistry.MustRegister(requestSizes) - legacyregistry.MustRegister(responseSizes) - legacyregistry.MustRegister(inputTokens) - legacyregistry.MustRegister(outputTokens) - legacyregistry.MustRegister(runningRequests) - - legacyregistry.MustRegister(inferencePoolAvgKVCache) - legacyregistry.MustRegister(inferencePoolAvgQueueSize) - legacyregistry.MustRegister(inferencePoolReadyPods) + metrics.Registry.MustRegister(requestCounter) + metrics.Registry.MustRegister(requestErrCounter) + metrics.Registry.MustRegister(requestLatencies) + metrics.Registry.MustRegister(requestSizes) + metrics.Registry.MustRegister(responseSizes) + metrics.Registry.MustRegister(inputTokens) + metrics.Registry.MustRegister(outputTokens) + metrics.Registry.MustRegister(runningRequests) + metrics.Registry.MustRegister(NormalizedTimePerOutputToken) + metrics.Registry.MustRegister(inferencePoolAvgKVCache) + metrics.Registry.MustRegister(inferencePoolAvgQueueSize) + metrics.Registry.MustRegister(inferencePoolReadyPods) + metrics.Registry.MustRegister(SchedulerPluginProcessingLatencies) + metrics.Registry.MustRegister(SchedulerE2ELatency) + metrics.Registry.MustRegister(RequestControlPluginProcessingLatencies) + metrics.Registry.MustRegister(InferenceExtensionInfo) + metrics.Registry.MustRegister(PrefixCacheSize) + metrics.Registry.MustRegister(PrefixCacheHitRatio) + metrics.Registry.MustRegister(PrefixCacheHitLength) + for _, collector := range customCollectors { + metrics.Registry.MustRegister(collector) + } }) } +// Just for integration test +func Reset() { + requestCounter.Reset() + requestErrCounter.Reset() + requestLatencies.Reset() + requestSizes.Reset() + responseSizes.Reset() + inputTokens.Reset() + outputTokens.Reset() + runningRequests.Reset() + NormalizedTimePerOutputToken.Reset() + inferencePoolAvgKVCache.Reset() + inferencePoolAvgQueueSize.Reset() + inferencePoolReadyPods.Reset() + SchedulerPluginProcessingLatencies.Reset() + SchedulerE2ELatency.Reset() + RequestControlPluginProcessingLatencies.Reset() + InferenceExtensionInfo.Reset() + PrefixCacheSize.Reset() + PrefixCacheHitRatio.Reset() + PrefixCacheHitLength.Reset() +} + // RecordRequstCounter records the number of requests. func RecordRequestCounter(modelName, targetModelName string) { requestCounter.WithLabelValues(modelName, targetModelName).Inc() @@ -231,6 +357,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) { } } +// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token. 
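+// The observed value is (complete - received) / outputTokenCount in seconds
+// per output token, and the return value reports whether a sample was
+// recorded. For example (illustrative values): a request received at t0 that
+// completes at t0+1.6s with 80 output tokens observes 0.02 s/token:
+//
+//	RecordNormalizedTimePerOutputToken(ctx, "m10", "t10", t0, t0.Add(1600*time.Millisecond), 80)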
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool { + if !complete.After(received) { + log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation", + "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received) + return false + } + + if outputTokenCount <= 0 { + log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation", + "modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount) + return false + } + + elapsedSeconds := complete.Sub(received).Seconds() + secondsPerToken := elapsedSeconds / float64(outputTokenCount) + + NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken) + return true +} + // IncRunningRequests increases the current running requests. func IncRunningRequests(modelName string) { if modelName != "" { @@ -256,3 +403,40 @@ func RecordInferencePoolAvgQueueSize(name string, queueSize float64) { func RecordinferencePoolReadyPods(name string, runningPods float64) { inferencePoolReadyPods.WithLabelValues(name).Set(runningPods) } + +// RecordSchedulerPluginProcessingLatency records the processing latency for a scheduler plugin. +func RecordSchedulerPluginProcessingLatency(pluginType, pluginName string, duration time.Duration) { + SchedulerPluginProcessingLatencies.WithLabelValues(pluginType, pluginName).Observe(duration.Seconds()) +} + +// RecordSchedulerE2ELatency records the end-to-end scheduling latency. +func RecordSchedulerE2ELatency(duration time.Duration) { + SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds()) +} + +// RecordRequestControlPluginProcessingLatency records the processing latency for a request-control plugin. +func RecordRequestControlPluginProcessingLatency(pluginType, pluginName string, duration time.Duration) { + RequestControlPluginProcessingLatencies.WithLabelValues(pluginType, pluginName).Observe(duration.Seconds()) +} + +// RecordPrefixCacheSize records the size of the prefix indexer in megabytes. +func RecordPrefixCacheSize(size int64) { + PrefixCacheSize.WithLabelValues().Set(float64(size)) +} + +// RecordPrefixCacheMatch records both the hit ratio and hit length for a prefix indexer match. +// matchedLength is the number of characters that matched, and totalLength is the total prefix length. 
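+//
+// For example (illustrative values): a lookup that matched 64 bytes of a
+// 128-byte prefix observes a hit length of 64 and a hit ratio of 0.5:
+//
+//	RecordPrefixCacheMatch(64, 128)
+//
+// When totalLength is not positive, only the hit length is recorded, since
+// the ratio would be undefined.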
+func RecordPrefixCacheMatch(matchedLength, totalLength int) { + // Record the hit length metric + PrefixCacheHitLength.WithLabelValues().Observe(float64(matchedLength)) + + // Record the hit ratio metric if totalLength is positive + if totalLength > 0 { + ratio := float64(matchedLength) / float64(totalLength) + PrefixCacheHitRatio.WithLabelValues().Observe(ratio) + } +} + +func RecordInferenceExtensionInfo() { + InferenceExtensionInfo.WithLabelValues(CommitSHA, BuildRef).Set(1) +} diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go index dc4c70444..5dd97055d 100644 --- a/pkg/epp/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -22,23 +22,25 @@ import ( "testing" "time" - "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" + "sigs.k8s.io/controller-runtime/pkg/metrics" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( - RequestTotalMetric = InferenceModelComponent + "_request_total" - RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" - RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" - RequestSizesMetric = InferenceModelComponent + "_request_sizes" - ResponseSizesMetric = InferenceModelComponent + "_response_sizes" - InputTokensMetric = InferenceModelComponent + "_input_tokens" - OutputTokensMetric = InferenceModelComponent + "_output_tokens" - RunningRequestsMetric = InferenceModelComponent + "_running_requests" - KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" - QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" + RequestTotalMetric = InferenceModelComponent + "_request_total" + RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" + RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceModelComponent + "_request_sizes" + ResponseSizesMetric = InferenceModelComponent + "_response_sizes" + InputTokensMetric = InferenceModelComponent + "_input_tokens" + OutputTokensMetric = InferenceModelComponent + "_output_tokens" + NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds" + RunningRequestsMetric = InferenceModelComponent + "_running_requests" + KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" + QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" + PerPodQueueSizeMetrics = InferencePoolComponent + "_per_pod_queue_size" ) func TestRecordRequestCounterandSizes(t *testing.T) { @@ -91,7 +93,7 @@ func TestRecordRequestCounterandSizes(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestTotal, RequestTotalMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantRequestTotal, RequestTotalMetric); err != nil { t.Error(err) } wantRequestSizes, err := os.Open("testdata/request_sizes_metric") @@ -103,7 +105,7 @@ func TestRecordRequestCounterandSizes(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantRequestSizes, RequestSizesMetric); err != nil { t.Error(err) } }) @@ -163,7 +165,7 @@ func TestRecordRequestErrorCounter(t *testing.T) { if err != nil { t.Fatal(err) } - 
if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestErrorCounter, RequestErrorTotalMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantRequestErrorCounter, RequestErrorTotalMetric); err != nil { t.Error(err) } }) @@ -245,7 +247,108 @@ func TestRecordRequestLatencies(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestLatencies, RequestLatenciesMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantRequestLatencies, RequestLatenciesMetric); err != nil { + t.Error(err) + } + }) + } +} + +func TestRecordNormalizedTimePerOutputToken(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + timeBaseline := time.Now() + type tokenRequests struct { + modelName string + targetModelName string + receivedTime time.Time + completeTime time.Time + outputTokens int + } + scenarios := []struct { + name string + reqs []tokenRequests + invalid bool + }{ + { + name: "multiple requests", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1000), + outputTokens: 100, // 10ms per token + }, + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1600), + outputTokens: 80, // 20ms per token + }, + { + modelName: "m10", + targetModelName: "t11", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 6000), + outputTokens: 300, // 20ms per token + }, + { + modelName: "m20", + targetModelName: "t20", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 2400), + outputTokens: 400, // 6ms per token + }, + }, + }, + { + name: "invalid elapsed time", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline.Add(time.Millisecond * 10), + completeTime: timeBaseline, + outputTokens: 100, + }, + }, + invalid: true, + }, + { + name: "invalid token count", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1000), + outputTokens: 0, // Invalid: zero tokens + }, + }, + invalid: true, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens) + if success == scenario.invalid { + t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid) + } + } + + wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric") + defer func() { + if err := wantLatencyPerToken.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil { t.Error(err) } }) @@ -313,7 +416,7 @@ func TestRecordResponseMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantResponseSize, ResponseSizesMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantResponseSize, ResponseSizesMetric); err != nil { t.Error(err) } @@ -326,7 +429,7 @@ func 
TestRecordResponseMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantInputToken, InputTokensMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantInputToken, InputTokensMetric); err != nil { t.Error(err) } @@ -339,7 +442,7 @@ func TestRecordResponseMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantOutputToken, OutputTokensMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantOutputToken, OutputTokensMetric); err != nil { t.Error(err) } }) @@ -399,7 +502,7 @@ func TestRunningRequestsMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRunningRequests, RunningRequestsMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantRunningRequests, RunningRequestsMetric); err != nil { t.Error(err) } }) @@ -435,7 +538,7 @@ func TestInferencePoolMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantKVCache, KVCacheAvgUsageMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantKVCache, KVCacheAvgUsageMetric); err != nil { t.Error(err) } @@ -448,9 +551,214 @@ func TestInferencePoolMetrics(t *testing.T) { if err != nil { t.Fatal(err) } - if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantQueueSize, QueueAvgSizeMetric); err != nil { + if err := testutil.GatherAndCompare(metrics.Registry, wantQueueSize, QueueAvgSizeMetric); err != nil { t.Error(err) } }) } } + +func TestSchedulerPluginProcessingLatencies(t *testing.T) { + type pluginLatency struct { + pluginType string + pluginName string + duration time.Duration + } + scenarios := []struct { + name string + latencies []pluginLatency + }{ + { + name: "multiple plugins", + latencies: []pluginLatency{ + { + pluginType: "PostSchedule", + pluginName: "PluginB", + duration: 200 * time.Millisecond, + }, + { + pluginType: "Filter", + pluginName: "PluginC", + duration: 50 * time.Millisecond, + }, + { + pluginType: "Scorer", + pluginName: "PluginD", + duration: 10 * time.Millisecond, + }, + { + pluginType: "Picker", + pluginName: "PluginE", + duration: 10 * time.Microsecond, + }, + }, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, latency := range scenario.latencies { + RecordSchedulerPluginProcessingLatency(latency.pluginType, latency.pluginName, latency.duration) + } + + wantPluginLatencies, err := os.Open("testdata/scheduler_plugin_processing_latencies_metric") + defer func() { + if err := wantPluginLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantPluginLatencies, "inference_extension_scheduler_plugin_duration_seconds"); err != nil { + t.Error(err) + } + }) + } +} + +func TestSchedulerE2ELatency(t *testing.T) { + scenarios := []struct { + name string + durations []time.Duration + }{ + { + name: "multiple scheduling latencies", + durations: []time.Duration{ + 200 * time.Microsecond, // 0.00014s - should go in the 0.0002 bucket + 800 * time.Microsecond, // 0.0008s - should go in the 0.001 bucket + 1500 * time.Microsecond, // 0.0015s - should go in the 0.002 bucket + 3 * time.Millisecond, // 0.003s - should go in the 0.005 bucket + 8 * time.Millisecond, // 0.008s - 
should go in the 0.01 bucket + 15 * time.Millisecond, // 0.015s - should go in the 0.02 bucket + 30 * time.Millisecond, // 0.03s - should go in the 0.05 bucket + 75 * time.Millisecond, // 0.075s - should go in the 0.1 bucket + 150 * time.Millisecond, // 0.15s - should go in the +Inf bucket + }, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, duration := range scenario.durations { + RecordSchedulerE2ELatency(duration) + } + + wantE2ELatency, err := os.Open("testdata/scheduler_e2e_duration_seconds_metric") + defer func() { + if err := wantE2ELatency.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantE2ELatency, "inference_extension_scheduler_e2e_duration_seconds"); err != nil { + t.Error(err) + } + }) + } +} + +func TestPrefixCacheMetrics(t *testing.T) { + const ( + PrefixCacheSizeMetric = InferenceExtension + "_prefix_indexer_size" + PrefixCacheHitRatioMetric = InferenceExtension + "_prefix_indexer_hit_ratio" + PrefixCacheHitLengthMetric = InferenceExtension + "_prefix_indexer_hit_bytes" + ) + + type cacheMatchRecord struct { + matchedLength int + totalLength int + } + + scenario := struct { + name string + cacheSizes []int64 + cacheMatches []cacheMatchRecord + }{ + name: "multiple cache metrics", + cacheSizes: []int64{1024, 2048, 4096}, + cacheMatches: []cacheMatchRecord{ + { + matchedLength: 5, + totalLength: 10, + }, + { + matchedLength: 0, + totalLength: 10, + }, + { + matchedLength: 10, + totalLength: 10, + }, + { + matchedLength: 7, + totalLength: 10, + }, + { + matchedLength: 64, + totalLength: 128, + }, + { + matchedLength: 0, + totalLength: 128, + }, + }, + } + + Register() + t.Run(scenario.name, func(t *testing.T) { + // Record cache size metrics + for _, size := range scenario.cacheSizes { + RecordPrefixCacheSize(size) + } + + // Record cache match metrics (both hit ratio and hit length) + for _, match := range scenario.cacheMatches { + RecordPrefixCacheMatch(match.matchedLength, match.totalLength) + } + + // Verify cache size metrics + wantCacheSizeMetrics, err := os.Open("testdata/prefix_indexer_size_metric") + defer func() { + if err := wantCacheSizeMetrics.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantCacheSizeMetrics, PrefixCacheSizeMetric); err != nil { + t.Error(err) + } + + // Verify hit ratio metrics + wantHitRatioMetrics, err := os.Open("testdata/prefix_indexer_hit_ratio_metric") + defer func() { + if err := wantHitRatioMetrics.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantHitRatioMetrics, PrefixCacheHitRatioMetric); err != nil { + t.Error(err) + } + + // Verify hit length metrics + wantHitLengthMetrics, err := os.Open("testdata/prefix_indexer_hit_bytes_metric") + defer func() { + if err := wantHitLengthMetrics.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(metrics.Registry, wantHitLengthMetrics, PrefixCacheHitLengthMetric); err != nil { + t.Error(err) + } + }) +} diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric new file mode 100644 index 000000000..bb6e93737 --- /dev/null +++ 
b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric @@ -0,0 +1,50 @@ +# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model. +# TYPE inference_model_normalized_time_per_output_token_seconds histogram +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", 
target_model_name="t11", le="2.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1 diff --git a/pkg/epp/metrics/testdata/prefix_indexer_hit_bytes_metric b/pkg/epp/metrics/testdata/prefix_indexer_hit_bytes_metric new file mode 100644 index 000000000..86b48724e --- /dev/null +++ b/pkg/epp/metrics/testdata/prefix_indexer_hit_bytes_metric @@ -0,0 +1,19 @@ +# HELP inference_extension_prefix_indexer_hit_bytes [ALPHA] Length of the prefix match in number of bytes in the cache lookup. 
+# TYPE inference_extension_prefix_indexer_hit_bytes histogram +inference_extension_prefix_indexer_hit_bytes_bucket{le="0"} 2 +inference_extension_prefix_indexer_hit_bytes_bucket{le="16"} 5 +inference_extension_prefix_indexer_hit_bytes_bucket{le="32"} 5 +inference_extension_prefix_indexer_hit_bytes_bucket{le="64"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="128"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="256"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="512"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="1024"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="2048"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="4096"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="8192"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="16384"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="32768"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="65536"} 6 +inference_extension_prefix_indexer_hit_bytes_bucket{le="+Inf"} 6 +inference_extension_prefix_indexer_hit_bytes_sum 86 +inference_extension_prefix_indexer_hit_bytes_count 6 diff --git a/pkg/epp/metrics/testdata/prefix_indexer_hit_ratio_metric b/pkg/epp/metrics/testdata/prefix_indexer_hit_ratio_metric new file mode 100644 index 000000000..e94827cb6 --- /dev/null +++ b/pkg/epp/metrics/testdata/prefix_indexer_hit_ratio_metric @@ -0,0 +1,16 @@ +# HELP inference_extension_prefix_indexer_hit_ratio [ALPHA] Ratio of prefix length matched to total prefix length in the cache lookup. +# TYPE inference_extension_prefix_indexer_hit_ratio histogram +inference_extension_prefix_indexer_hit_ratio_bucket{le="0"} 2 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.1"} 2 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.2"} 2 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.3"} 2 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.4"} 2 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.5"} 4 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.6"} 4 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.7"} 5 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.8"} 5 +inference_extension_prefix_indexer_hit_ratio_bucket{le="0.9"} 5 +inference_extension_prefix_indexer_hit_ratio_bucket{le="1"} 6 +inference_extension_prefix_indexer_hit_ratio_bucket{le="+Inf"} 6 +inference_extension_prefix_indexer_hit_ratio_sum 2.7 +inference_extension_prefix_indexer_hit_ratio_count 6 diff --git a/pkg/epp/metrics/testdata/prefix_indexer_size_metric b/pkg/epp/metrics/testdata/prefix_indexer_size_metric new file mode 100644 index 000000000..9799b1729 --- /dev/null +++ b/pkg/epp/metrics/testdata/prefix_indexer_size_metric @@ -0,0 +1,3 @@ +# HELP inference_extension_prefix_indexer_size [ALPHA] Size of the prefix indexer. +# TYPE inference_extension_prefix_indexer_size gauge +inference_extension_prefix_indexer_size{} 4096 diff --git a/pkg/epp/metrics/testdata/scheduler_e2e_duration_seconds_metric b/pkg/epp/metrics/testdata/scheduler_e2e_duration_seconds_metric new file mode 100644 index 000000000..0bbb35b16 --- /dev/null +++ b/pkg/epp/metrics/testdata/scheduler_e2e_duration_seconds_metric @@ -0,0 +1,15 @@ +# HELP inference_extension_scheduler_e2e_duration_seconds [ALPHA] End-to-end scheduling latency distribution in seconds. 
+# TYPE inference_extension_scheduler_e2e_duration_seconds histogram +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0001"} 0 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0002"} 1 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.0005"} 1 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.001"} 2 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.002"} 3 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.005"} 4 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.01"} 5 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.02"} 6 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.05"} 7 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="0.1"} 8 +inference_extension_scheduler_e2e_duration_seconds_bucket{le="+Inf"} 9 +inference_extension_scheduler_e2e_duration_seconds_sum{} 0.2835 +inference_extension_scheduler_e2e_duration_seconds_count{} 9 diff --git a/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric b/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric new file mode 100644 index 000000000..38ac8a09d --- /dev/null +++ b/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric @@ -0,0 +1,54 @@ +# HELP inference_extension_scheduler_plugin_duration_seconds [ALPHA] Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name. +# TYPE inference_extension_scheduler_plugin_duration_seconds histogram +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0005"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.005"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.01"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.02"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.05"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.1"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="+Inf"} 1 +inference_extension_scheduler_plugin_duration_seconds_sum{plugin_name="PluginB",plugin_type="PostSchedule"} 0.2 +inference_extension_scheduler_plugin_duration_seconds_count{plugin_name="PluginB",plugin_type="PostSchedule"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0005"} 0 
+inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.005"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.01"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.02"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.05"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.1"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="+Inf"} 1 +inference_extension_scheduler_plugin_duration_seconds_sum{plugin_name="PluginC",plugin_type="Filter"} 0.05 +inference_extension_scheduler_plugin_duration_seconds_count{plugin_name="PluginC",plugin_type="Filter"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0005"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.001"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.002"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.005"} 0 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.01"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.02"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.05"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.1"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="+Inf"} 1 +inference_extension_scheduler_plugin_duration_seconds_sum{plugin_name="PluginD",plugin_type="Scorer"} 0.01 +inference_extension_scheduler_plugin_duration_seconds_count{plugin_name="PluginD",plugin_type="Scorer"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0001"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0002"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0005"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.001"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.002"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.005"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.01"} 1 
+inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.02"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.05"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.1"} 1 +inference_extension_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="+Inf"} 1 +inference_extension_scheduler_plugin_duration_seconds_sum{plugin_name="PluginE",plugin_type="Picker"} 1e-05 +inference_extension_scheduler_plugin_duration_seconds_count{plugin_name="PluginE",plugin_type="Picker"} 1 diff --git a/pkg/epp/plugins/plugins.go b/pkg/epp/plugins/plugins.go new file mode 100644 index 000000000..2875e06da --- /dev/null +++ b/pkg/epp/plugins/plugins.go @@ -0,0 +1,70 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package plugins + +import ( + "context" + "fmt" +) + +// Plugin defines the interface for a plugin. +// This interface should be embedded in all plugins across the code. +type Plugin interface { + // Type returns the type of the plugin. + Type() string + // Name returns the name of this plugin instance. + Name() string +} + +// Handle provides plugins a set of standard data and tools to work with +type Handle interface { + // Context returns a context the plugins can use, if they need one + Context() context.Context + + // Plugins returns the sub-handle for working with instantiated plugins + Plugins() HandlePlugins +} + +// HandlePlugins defines a set of APIs to work with instantiated plugins +type HandlePlugins interface { + // Plugin returns the named plugin instance + Plugin(name string) Plugin + + // AddPlugin adds a plugin to the set of known plugin instances + AddPlugin(name string, plugin Plugin) + + // GetAllPlugins returns all of the known plugins + GetAllPlugins() []Plugin + + // GetAllPluginsWithNames returns all of the known plugins with their names + GetAllPluginsWithNames() map[string]Plugin +} + +// PluginByType retrieves the specified plugin by name and verifies its type +func PluginByType[P Plugin](handlePlugins HandlePlugins, name string) (P, error) { + var zero P + + rawPlugin := handlePlugins.Plugin(name) + if rawPlugin == nil { + return zero, fmt.Errorf("there is no plugin with the name '%s' defined", name) + } + thePlugin, ok := rawPlugin.(P) + if !ok { + return zero, fmt.Errorf("the plugin with the name '%s' is not an instance of %T", name, zero) + } + return thePlugin, nil +} diff --git a/pkg/epp/plugins/registry.go b/pkg/epp/plugins/registry.go new file mode 100644 index 000000000..e9ff0e8fd --- /dev/null +++ b/pkg/epp/plugins/registry.go @@ -0,0 +1,33 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package plugins
+
+import (
+	"encoding/json"
+)
+
+// FactoryFunc is the definition of the factory functions that are used to instantiate plugins
+// specified in a configuration.
+type FactoryFunc func(name string, parameters json.RawMessage, handle Handle) (Plugin, error)
+
+// Register registers a plugin factory function under the given plugin type.
+func Register(pluginType string, factory FactoryFunc) {
+	Registry[pluginType] = factory
+}
+
+// Registry is a mapping from plugin type to the FactoryFunc that instantiates plugins of that type.
+var Registry map[string]FactoryFunc = map[string]FactoryFunc{}
diff --git a/pkg/epp/requestcontrol/director.go b/pkg/epp/requestcontrol/director.go
new file mode 100644
index 000000000..78effeda0
--- /dev/null
+++ b/pkg/epp/requestcontrol/director.go
@@ -0,0 +1,329 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package requestcontrol defines the Director component responsible for orchestrating request processing after initial
+// parsing.
+package requestcontrol
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"net"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/go-logr/logr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+	requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request"
+)
+
+const (
+	subsetHintNamespace = "envoy.lb.subset_hint"
+	subsetHintKey       = "x-gateway-destination-endpoint-subset"
+)
+
+// Scheduler defines the interface required by the Director for scheduling.
+type Scheduler interface {
+	Schedule(ctx context.Context, request *schedulingtypes.LLMRequest, candidatePods []schedulingtypes.Pod) (result *schedulingtypes.SchedulingResult, err error)
+}
+
+// SaturationDetector provides a signal indicating whether the backends are considered saturated.
+type SaturationDetector interface {
+	IsSaturated(ctx context.Context) bool
+}
+
+// NewDirectorWithConfig creates a new Director instance with all dependencies.
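+//
+// A minimal wiring sketch (illustrative only; the datastore, scheduler, and
+// logger values are assumed to come from the caller's startup code):
+//
+//	detector := saturationdetector.NewDetector(saturationdetector.LoadConfigFromEnv(), ds, logger)
+//	director := NewDirectorWithConfig(ds, scheduler, detector, NewConfig())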
+func NewDirectorWithConfig(datastore datastore.Datastore, scheduler Scheduler, saturationDetector SaturationDetector, config *Config) *Director { + return &Director{ + datastore: datastore, + scheduler: scheduler, + saturationDetector: saturationDetector, + preRequestPlugins: config.preRequestPlugins, + postResponsePlugins: config.postResponsePlugins, + } +} + +// Director orchestrates the request handling flow, including scheduling. +type Director struct { + datastore datastore.Datastore + scheduler Scheduler + saturationDetector SaturationDetector + preRequestPlugins []PreRequest + postResponsePlugins []PostResponse +} + +// HandleRequest orchestrates the request lifecycle: +// 1. Parses request details. +// 2. Calls admitRequest for admission control. +// 3. Calls Scheduler.Schedule if request is approved. +// 4. Calls prepareRequest to populate RequestContext with results and call PreRequest plugins. +// +// It always returns the requestContext even in the error case, as the request context is used in error handling. +func (d *Director) HandleRequest(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { + logger := log.FromContext(ctx) + + // --- 1. Parse Request, Resolve Target Models, and Determine Parameters --- + var ok bool + requestBodyMap := reqCtx.Request.Body + reqCtx.Model, ok = requestBodyMap["model"].(string) + if !ok { + return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request body"} + } + prompt, err := requtil.ExtractPromptFromRequestBody(requestBodyMap) + if err != nil { + return reqCtx, err + } + + modelObj := d.datastore.ModelGet(reqCtx.Model) + if modelObj == nil { + logger.Info("No associated inferenceModel found, using default", "model", reqCtx.Model) + sheddable := v1alpha2.Sheddable + modelObj = &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + ModelName: reqCtx.Model, + Criticality: &sheddable, + }, + } + } + + reqCtx.ResolvedTargetModel = reqCtx.Model + if len(modelObj.Spec.TargetModels) > 0 { + reqCtx.ResolvedTargetModel = RandomWeightedDraw(logger, modelObj, 0) + if reqCtx.ResolvedTargetModel == "" { + return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} + } + reqCtx.Request.Body["model"] = reqCtx.ResolvedTargetModel // Update target model in the body. + } + + requestCriticality := v1alpha2.Standard + if modelObj.Spec.Criticality != nil { + requestCriticality = *modelObj.Spec.Criticality + } + + // Prepare LLMRequest (needed for both saturation detection and Scheduler) + reqCtx.SchedulingRequest = &schedulingtypes.LLMRequest{ + RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey], + TargetModel: reqCtx.ResolvedTargetModel, + Prompt: prompt, + Headers: reqCtx.Request.Headers, + } + + logger = logger.WithValues("model", reqCtx.Model, "resolvedTargetModel", reqCtx.ResolvedTargetModel, "criticality", requestCriticality) + + ctx = log.IntoContext(ctx, logger) + logger.V(logutil.DEBUG).Info("LLM request assembled") + + // --- 2. Admission Control check -- + if err := d.admitRequest(ctx, requestCriticality); err != nil { + return reqCtx, err + } + + // --- 3. 
Call Scheduler (with the relevant candidate pods) ---
+	candidatePods := d.getCandidatePodsForScheduling(ctx, reqCtx.Request.Metadata)
+	if len(candidatePods) == 0 {
+		return reqCtx, errutil.Error{Code: errutil.ServiceUnavailable, Msg: "failed to find candidate pods for serving the request"}
+	}
+	results, err := d.scheduler.Schedule(ctx, reqCtx.SchedulingRequest, candidatePods)
+	if err != nil {
+		return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()}
+	}
+
+	// --- 4. Prepare Request (populates RequestContext and calls PreRequest plugins) ---
+	// Insert the target endpoint to instruct Envoy to route requests to the specified target pod, and attach the port number.
+	// Invoke the registered PreRequest plugins.
+	reqCtx, err = d.prepareRequest(ctx, reqCtx, results)
+	if err != nil {
+		return reqCtx, err
+	}
+
+	return reqCtx, nil
+}
+
+// admitRequest handles admission control to decide whether or not to accept the request
+// based on the request criticality and system saturation state.
+func (d *Director) admitRequest(ctx context.Context, requestCriticality v1alpha2.Criticality) error {
+	logger := log.FromContext(ctx)
+
+	if requestCriticality == v1alpha2.Critical {
+		logger.V(logutil.DEBUG).Info("Critical request bypassing saturation check.")
+		return nil
+	}
+
+	logger.V(logutil.DEBUG).Info("Performing saturation check for non-critical request.")
+	if d.saturationDetector.IsSaturated(ctx) { // Assuming non-nil Saturation Detector
+		return errutil.Error{
+			Code: errutil.InferencePoolResourceExhausted,
+			Msg:  "system saturated, non-critical request dropped",
+		}
+	}
+
+	return nil
+}
+
+// getCandidatePodsForScheduling gets the list of relevant endpoints for the scheduling cycle from the datastore.
+// According to the EPP protocol, if "x-gateway-destination-endpoint-subset" is set on the request metadata and specifies
+// a subset of endpoints, only those endpoints will be considered as candidates for the scheduler.
+// Pod metrics are snapshotted from the datastore to:
+// 1. Reduce concurrent access to the datastore.
+// 2. Ensure consistent data during the scheduling operation of a request between all scheduling cycles.
+func (d *Director) getCandidatePodsForScheduling(ctx context.Context, requestMetadata map[string]any) []schedulingtypes.Pod {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
+
+	subsetMap, found := requestMetadata[subsetHintNamespace].(map[string]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	}
+
+	// Check if the endpoint key is present in the subset map and ensure there is at least one value.
+	endpointSubsetList, found := subsetMap[subsetHintKey].([]any)
+	if !found {
+		return schedulingtypes.ToSchedulerPodMetrics(d.datastore.PodGetAll())
+	} else if len(endpointSubsetList) == 0 {
+		loggerTrace.Info("found empty subset filter in request metadata, filtering all pods")
+		return []schedulingtypes.Pod{}
+	}
+
+	// Create a map of endpoint addresses for easy lookup.
+	endpoints := make(map[string]bool)
+	for _, endpoint := range endpointSubsetList {
+		// Extract the address from the endpoint.
+		// The endpoint is formatted as "<address>:<port>" (e.g. "10.0.1.0:8080").
+		epStr := strings.Split(endpoint.(string), ":")[0]
+		endpoints[epStr] = true
+	}
+
+	podTotalCount := 0
+	podFilteredList := d.datastore.PodList(func(pm backendmetrics.PodMetrics) bool {
+		podTotalCount++
+		if _, found := endpoints[pm.GetPod().Address]; found {
+			return true
+		}
+		return false
+	})
+
+	loggerTrace.Info("filtered candidate pods by subset filtering", "podTotalCount", podTotalCount, "filteredCount", len(podFilteredList))
+
+	return schedulingtypes.ToSchedulerPodMetrics(podFilteredList)
+}
+
+// prepareRequest populates the RequestContext and calls the registered PreRequest plugins,
+// allowing customized logic to be plugged in based on the scheduling results.
+func (d *Director) prepareRequest(ctx context.Context, reqCtx *handlers.RequestContext, result *schedulingtypes.SchedulingResult) (*handlers.RequestContext, error) {
+	logger := log.FromContext(ctx)
+	if result == nil || len(result.ProfileResults) == 0 {
+		return reqCtx, errutil.Error{Code: errutil.Internal, Msg: "scheduling result must contain at least one profile result"}
+	}
+	// The primary profile is used to set the destination.
+	targetPod := result.ProfileResults[result.PrimaryProfileName].TargetPod.GetPod()
+
+	pool, err := d.datastore.PoolGet()
+	if err != nil {
+		return reqCtx, err
+	}
+	targetPort := int(pool.Spec.TargetPortNumber)
+
+	endpoint := net.JoinHostPort(targetPod.Address, strconv.Itoa(targetPort))
+	logger.V(logutil.DEFAULT).Info("Request handled", "model", reqCtx.Model, "targetModel", reqCtx.ResolvedTargetModel, "endpoint", endpoint)
+
+	reqCtx.TargetPod = targetPod
+	reqCtx.TargetEndpoint = endpoint
+
+	d.runPreRequestPlugins(ctx, reqCtx.SchedulingRequest, result, targetPort)
+
+	return reqCtx, nil
+}
+
+// HandleResponse builds a Response from the request context and invokes the PostResponse plugins.
+func (d *Director) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) {
+	response := &Response{
+		RequestId: reqCtx.Request.Headers[requtil.RequestIdHeaderKey],
+		Headers:   reqCtx.Response.Headers,
+	}
+
+	d.runPostResponsePlugins(ctx, reqCtx.SchedulingRequest, response, reqCtx.TargetPod)
+
+	return reqCtx, nil
+}
+
+// GetRandomPod returns a random pod from the datastore, or nil if no pods are available.
+func (d *Director) GetRandomPod() *backend.Pod {
+	pods := d.datastore.PodGetAll()
+	if len(pods) == 0 {
+		return nil
+	}
+	number := rand.Intn(len(pods))
+	pod := pods[number]
+	return pod.GetPod()
+}
+
+// RandomWeightedDraw picks a target model name from the InferenceModel's TargetModels,
+// weighted by each model's Weight. Weights are assumed to be set either on all target
+// models or on none of them.
+func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string {
+	// TODO: after we are down to 1 server implementation, make these methods a part of the struct
+	// and handle random seeding on the struct.
+	source := rand.NewSource(rand.Int63())
+	if seed > 0 {
+		source = rand.NewSource(seed)
+	}
+	r := rand.New(source)
+
+	// If all the weight values are nil, return a uniformly random model name.
+	// (The first entry is checked as a proxy for all entries.)
+	if model.Spec.TargetModels[0].Weight == nil {
+		index := r.Int31n(int32(len(model.Spec.TargetModels)))
+		return model.Spec.TargetModels[index].Name
+	}
+
+	var weights int32
+	for _, model := range model.Spec.TargetModels {
+		weights += *model.Weight
+	}
+	logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights)
+	randomVal := r.Int31n(weights)
+	// TODO: optimize this without using a loop
+	for _, model := range model.Spec.TargetModels {
+		if randomVal < *model.Weight {
+			return model.Name
+		}
+		randomVal -= *model.Weight
+	}
+	return ""
+}
+
+func (d *Director) runPreRequestPlugins(ctx context.Context, request *schedulingtypes.LLMRequest, schedulingResult *schedulingtypes.SchedulingResult,
+	targetPort int) {
+	for _, plugin := range d.preRequestPlugins {
+		log.FromContext(ctx).V(logutil.DEBUG).Info("Running pre-request plugin", "plugin", plugin.Type())
+		before := time.Now()
+		plugin.PreRequest(ctx, request, schedulingResult, targetPort)
+		metrics.RecordRequestControlPluginProcessingLatency(PreRequestPluginType, plugin.Type(), time.Since(before))
+	}
+}
+
+func (d *Director) runPostResponsePlugins(ctx context.Context, request *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) {
+	for _, plugin := range d.postResponsePlugins {
+		log.FromContext(ctx).V(logutil.DEBUG).Info("Running post-response plugin", "plugin", plugin.Type())
+		before := time.Now()
+		plugin.PostResponse(ctx, request, response, targetPod)
+		metrics.RecordRequestControlPluginProcessingLatency(PostResponsePluginType, plugin.Type(), time.Since(before))
+	}
+}
diff --git a/pkg/epp/requestcontrol/director_test.go b/pkg/epp/requestcontrol/director_test.go
new file mode 100644
index 000000000..0f214b830
--- /dev/null
+++ b/pkg/epp/requestcontrol/director_test.go
@@ -0,0 +1,706 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package requestcontrol + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + k8stypes "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + requtil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/request" + testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" +) + +// --- Mocks --- + +type mockSaturationDetector struct { + isSaturated bool +} + +func (m *mockSaturationDetector) IsSaturated(_ context.Context) bool { + return m.isSaturated +} + +type mockScheduler struct { + scheduleResults *schedulingtypes.SchedulingResult + scheduleErr error +} + +func (m *mockScheduler) Schedule(_ context.Context, _ *schedulingtypes.LLMRequest, _ []schedulingtypes.Pod) (*schedulingtypes.SchedulingResult, error) { + return m.scheduleResults, m.scheduleErr +} + +func TestDirector_HandleRequest(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + + // --- Setup common objects --- + model := "food-review" + modelSheddable := "food-review-sheddable" + modelWithResolvedTarget := "food-review-resolve" + + // InferenceModel definitions + imFoodReview := testutil.MakeInferenceModel("imFoodReview"). + CreationTimestamp(metav1.Unix(1000, 0)). + ModelName(model). + Criticality(v1alpha2.Critical). + ObjRef() + imFoodReviewSheddable := testutil.MakeInferenceModel("imFoodReviewSheddable"). + CreationTimestamp(metav1.Unix(1000, 0)). + ModelName(modelSheddable). + Criticality(v1alpha2.Sheddable). + ObjRef() + imFoodReviewResolve := testutil.MakeInferenceModel("imFoodReviewResolve"). + CreationTimestamp(metav1.Unix(1000, 0)). + ModelName(modelWithResolvedTarget). + Criticality(v1alpha2.Standard). + TargetModel("resolved-target-model-A"). 
+ ObjRef() + + // Datastore setup + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := datastore.NewDatastore(t.Context(), pmf) + ds.ModelSetIfOlder(imFoodReview) + ds.ModelSetIfOlder(imFoodReviewResolve) + ds.ModelSetIfOlder(imFoodReviewSheddable) + + pool := &v1alpha2.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool", Namespace: "default"}, + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "app": "inference", + }, + }, + } + + // Pod setup + testPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + Labels: map[string]string{"app": "inference"}, + }, + Status: corev1.PodStatus{ + PodIP: "192.168.1.100", + Phase: corev1.PodRunning, + Conditions: []corev1.PodCondition{{Type: corev1.PodReady, Status: corev1.ConditionTrue}}, + }, + } + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + if err := ds.PoolSet(ctx, fakeClient, pool); err != nil { + t.Fatalf("Error while setting inference pool: %v", err) + } + ds.PodUpdateOrAddIfNotExist(testPod) + + defaultSuccessfulScheduleResults := &schedulingtypes.SchedulingResult{ + ProfileResults: map[string]*schedulingtypes.ProfileRunResult{ + "testProfile": { + TargetPod: &schedulingtypes.ScoredPod{ + Pod: &schedulingtypes.PodMetrics{ + Pod: &backend.Pod{ + Address: "192.168.1.100", + NamespacedName: k8stypes.NamespacedName{Name: "pod1", Namespace: "default"}, + }, + }, + }, + }, + }, + PrimaryProfileName: "testProfile", + } + + tests := []struct { + name string + reqBodyMap map[string]any + mockSaturationDetector *mockSaturationDetector + schedulerMockSetup func(m *mockScheduler) + wantErrCode string // Expected errutil code string + wantReqCtx *handlers.RequestContext // Fields to check in the returned RequestContext + wantMutatedBodyModel string // Expected model in reqCtx.Request.Body after PostDispatch + }{ + { + name: "successful completions request (critical, saturation ignored)", + reqBodyMap: map[string]any{ + "model": model, + "prompt": "critical prompt", + }, + mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: model, + ResolvedTargetModel: model, + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: model, + }, + { + name: "successful chat completions request (critical, saturation ignored)", + reqBodyMap: map[string]any{ + "model": model, + "messages": []any{ + map[string]any{ + "role": "user", + "content": "critical prompt", + }, + }, + }, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: model, + ResolvedTargetModel: model, + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: model, + }, + { + name: "successful chat completions request with multiple messages (critical, saturation ignored)", + reqBodyMap: map[string]any{ + "model": model, + "messages": []any{ + map[string]any{ + "role": "developer", + 
"content": "You are a helpful assistant.", + }, + map[string]any{ + "role": "user", + "content": "Hello!", + }, + }, + }, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: model, + ResolvedTargetModel: model, + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: model, + }, + { + name: "successful completions request (sheddable, not saturated)", + reqBodyMap: map[string]any{ + "model": modelSheddable, + "prompt": "sheddable prompt", + }, + mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: modelSheddable, + ResolvedTargetModel: modelSheddable, + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: modelSheddable, + }, + { + name: "successful request with target model resolution", + reqBodyMap: map[string]any{ + "model": modelWithResolvedTarget, + "prompt": "prompt for target resolution", + }, + mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: modelWithResolvedTarget, + ResolvedTargetModel: "resolved-target-model-A", + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: "resolved-target-model-A", + }, + { + name: "nonexistent target defined, use default inference model", + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = defaultSuccessfulScheduleResults + }, + wantReqCtx: &handlers.RequestContext{ + Model: "food-review-1", + ResolvedTargetModel: "food-review-1", + TargetPod: &backend.Pod{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "pod1"}, + Address: "192.168.1.100", + }, + TargetEndpoint: "192.168.1.100:8000", + }, + wantMutatedBodyModel: "food-review-1", + reqBodyMap: map[string]any{ + "model": "food-review-1", + "prompt": "test prompt", + }, + mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + }, + { + + name: "request dropped (sheddable, saturated)", + reqBodyMap: map[string]any{ + "model": modelSheddable, + "prompt": "sheddable prompt", + }, + mockSaturationDetector: &mockSaturationDetector{isSaturated: true}, + wantErrCode: errutil.InferencePoolResourceExhausted, + }, + { + name: "model not found, expect err", + reqBodyMap: map[string]any{"prompt": "p"}, + mockSaturationDetector: &mockSaturationDetector{isSaturated: false}, + wantErrCode: errutil.BadRequest, + }, + + { + name: "prompt or messages not found, expect err", + reqBodyMap: map[string]any{"model": model}, + wantErrCode: errutil.BadRequest, + }, + { + name: "empty messages, expect err", + reqBodyMap: map[string]any{ + "model": model, + "messages": []any{}, + }, + wantErrCode: errutil.BadRequest, + }, + { + name: "scheduler returns error", + reqBodyMap: map[string]any{ + "model": model, + "prompt": "prompt that causes scheduler error", + }, + schedulerMockSetup: func(m 
*mockScheduler) { + m.scheduleErr = errors.New("simulated scheduler failure") + }, + wantErrCode: errutil.InferencePoolResourceExhausted, + }, + { + name: "scheduler returns nil result and nil error", + reqBodyMap: map[string]any{ + "model": model, + "prompt": "prompt for nil,nil scheduler return", + }, + schedulerMockSetup: func(m *mockScheduler) { + m.scheduleResults = nil + m.scheduleErr = nil + }, + wantErrCode: errutil.Internal, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + mockSched := &mockScheduler{} + if test.schedulerMockSetup != nil { + test.schedulerMockSetup(mockSched) + } + director := NewDirectorWithConfig(ds, mockSched, test.mockSaturationDetector, NewConfig()) + + reqCtx := &handlers.RequestContext{ + Request: &handlers.Request{ + // Create a copy of the map for each test run to avoid mutation issues. + Body: make(map[string]any), + Headers: map[string]string{ + requtil.RequestIdHeaderKey: "test-req-id-" + test.name, // Ensure a default request ID + }, + }, + } + // Deep copy the body map. + for k, v := range test.reqBodyMap { + reqCtx.Request.Body[k] = v + } + + returnedReqCtx, err := director.HandleRequest(ctx, reqCtx) + + if test.wantErrCode != "" { + assert.Error(t, err, "HandleRequest() should have returned an error") + var e errutil.Error + if assert.ErrorAs(t, err, &e, "Error should be of type errutil.Error") { + assert.Equal(t, test.wantErrCode, e.Code, "Error code mismatch") + } + return + } + + assert.NoError(t, err, "HandleRequest() returned unexpected error") + + if test.wantReqCtx != nil { + assert.Equal(t, test.wantReqCtx.Model, returnedReqCtx.Model, "reqCtx.Model mismatch") + assert.Equal(t, test.wantReqCtx.ResolvedTargetModel, returnedReqCtx.ResolvedTargetModel, + "reqCtx.ResolvedTargetModel mismatch") + assert.Equal(t, test.wantReqCtx.TargetPod, returnedReqCtx.TargetPod, "reqCtx.TargetPod mismatch") + assert.Equal(t, test.wantReqCtx.TargetEndpoint, returnedReqCtx.TargetEndpoint, "reqCtx.TargetEndpoint mismatch") + } + + if test.wantMutatedBodyModel != "" { + assert.NotNil(t, returnedReqCtx.Request.Body, "Expected mutated body, but reqCtx.Request.Body is nil") + assert.Equal(t, test.wantMutatedBodyModel, returnedReqCtx.Request.Body["model"], + "Mutated reqCtx.Request.Body model mismatch") + } + }) + } +} + +// TestGetCandidatePodsForScheduling is testing getCandidatePodsForScheduling and more specifically the functionality of SubsetFilter. 
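+// The filter metadata used by these cases mirrors what the gateway attaches
+// under the "envoy.lb.subset_hint" namespace; the shape is shown below and the
+// addresses are illustrative:
+//
+//	{"envoy.lb.subset_hint": {"x-gateway-destination-endpoint-subset": ["10.0.0.1"]}}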
+func TestGetCandidatePodsForScheduling(t *testing.T) { + var makeFilterMetadata = func(data []any) map[string]any { + return map[string]any{ + "envoy.lb.subset_hint": map[string]any{ + "x-gateway-destination-endpoint-subset": data, + }, + } + } + + testInput := []*corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + Status: corev1.PodStatus{ + PodIP: "10.0.0.1", + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + }, + Status: corev1.PodStatus{ + PodIP: "10.0.0.2", + }, + }, + } + + outputPod1 := &backend.Pod{ + NamespacedName: types.NamespacedName{Name: "pod1"}, + Address: "10.0.0.1", + Labels: map[string]string{}, + } + + outputPod2 := &backend.Pod{ + NamespacedName: types.NamespacedName{Name: "pod2"}, + Address: "10.0.0.2", + Labels: map[string]string{}, + } + + tests := []struct { + name string + metadata map[string]any + output []schedulingtypes.Pod + }{ + { + name: "SubsetFilter, filter not present — return all pods", + metadata: map[string]any{}, + output: []schedulingtypes.Pod{ + &schedulingtypes.PodMetrics{ + Pod: outputPod1, + MetricsState: backendmetrics.NewMetricsState(), + }, + &schedulingtypes.PodMetrics{ + Pod: outputPod2, + MetricsState: backendmetrics.NewMetricsState(), + }, + }, + }, + { + name: "SubsetFilter, namespace present filter not present — return all pods", + metadata: map[string]any{"envoy.lb.subset_hint": map[string]any{}}, + output: []schedulingtypes.Pod{ + &schedulingtypes.PodMetrics{ + Pod: outputPod1, + MetricsState: backendmetrics.NewMetricsState(), + }, + &schedulingtypes.PodMetrics{ + Pod: outputPod2, + MetricsState: backendmetrics.NewMetricsState(), + }, + }, + }, + { + name: "SubsetFilter, filter present with empty list — return error", + metadata: makeFilterMetadata([]any{}), + output: []schedulingtypes.Pod{}, + }, + { + name: "SubsetFilter, subset with one matching pod", + metadata: makeFilterMetadata([]any{"10.0.0.1"}), + output: []schedulingtypes.Pod{ + &schedulingtypes.PodMetrics{ + Pod: outputPod1, + MetricsState: backendmetrics.NewMetricsState(), + }, + }, + }, + { + name: "SubsetFilter, subset with multiple matching pods", + metadata: makeFilterMetadata([]any{"10.0.0.1", "10.0.0.2", "10.0.0.3"}), + output: []schedulingtypes.Pod{ + &schedulingtypes.PodMetrics{ + Pod: outputPod1, + MetricsState: backendmetrics.NewMetricsState(), + }, + &schedulingtypes.PodMetrics{ + Pod: outputPod2, + MetricsState: backendmetrics.NewMetricsState(), + }, + }, + }, + { + name: "SubsetFilter, subset with no matching pods", + metadata: makeFilterMetadata([]any{"10.0.0.3"}), + output: []schedulingtypes.Pod{}, + }, + } + + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := datastore.NewDatastore(t.Context(), pmf) + for _, testPod := range testInput { + ds.PodUpdateOrAddIfNotExist(testPod) + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + director := NewDirectorWithConfig(ds, &mockScheduler{}, &mockSaturationDetector{}, NewConfig()) + + got := director.getCandidatePodsForScheduling(context.Background(), test.metadata) + + diff := cmp.Diff(test.output, got, cmpopts.SortSlices(func(a, b schedulingtypes.Pod) bool { + return a.GetPod().NamespacedName.String() < b.GetPod().NamespacedName.String() + })) + if diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} + +func TestRandomWeightedDraw(t *testing.T) { + logger := logutil.NewTestLogger() + // Note: These tests verify deterministic outcomes for a fixed seed (420). 
+ // They do not test the statistical properties of the random draw. + tests := []struct { + name string + model *v1alpha2.InferenceModel + want string + }{ + { + name: "deterministic draw: 50/50 weights, seed 420", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + {Name: "canary", Weight: pointer(50)}, + {Name: "v1", Weight: pointer(50)}, + }, + }, + }, + want: "canary", + }, + { + name: "deterministic draw: 25/55/50 weights, seed 420", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + {Name: "canary", Weight: pointer(25)}, + {Name: "v1.1", Weight: pointer(55)}, + {Name: "v1", Weight: pointer(50)}, + }, + }, + }, + want: "v1", + }, + { + name: "deterministic draw: 20/20/10 weights, seed 420", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + {Name: "canary", Weight: pointer(20)}, + {Name: "v1.1", Weight: pointer(20)}, + {Name: "v1", Weight: pointer(10)}, + }, + }, + }, + want: "v1.1", + }, + { + name: "deterministic draw: nil weights (uniform), seed 420", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + {Name: "canary"}, + {Name: "v1.1"}, + {Name: "v1"}, + }, + }, + }, + want: "canary", + }, + } + var seedVal int64 = 420 + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + model := RandomWeightedDraw(logger, test.model, seedVal) + assert.Equal(t, test.want, model, "RandomWeightedDraw() with seed %d should produce expected model", seedVal) + }) + } +} + +func TestGetRandomPod(t *testing.T) { + tests := []struct { + name string + storePods []*corev1.Pod + expectNil bool + }{ + { + name: "No pods available", + storePods: []*corev1.Pod{}, + expectNil: true, + }, + { + name: "Single pod available", + storePods: []*corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + }, + expectNil: false, + }, + { + name: "Multiple pods available", + storePods: []*corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "pod2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "pod3"}}, + }, + expectNil: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Millisecond) + ds := datastore.NewDatastore(t.Context(), pmf) + for _, pod := range test.storePods { + ds.PodUpdateOrAddIfNotExist(pod) + } + d := &Director{datastore: ds} + gotPod := d.GetRandomPod() + + if test.expectNil && gotPod != nil { + t.Errorf("expected nil pod, got: %v", gotPod) + } + if !test.expectNil && gotPod == nil { + t.Errorf("expected non-nil pod, got nil") + } + }) + } +} + +func pointer(v int32) *int32 { + return &v +} + +func TestDirector_HandleResponse(t *testing.T) { + pr1 := &testPostResponse{ + TypeRes: "pr1", + } + + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + ds := datastore.NewDatastore(t.Context(), nil) + mockSched := &mockScheduler{} + director := NewDirectorWithConfig(ds, mockSched, nil, NewConfig().WithPostResponsePlugins(pr1)) + + reqCtx := &handlers.RequestContext{ + Request: &handlers.Request{ + Headers: map[string]string{ + requtil.RequestIdHeaderKey: "test-req-id-for-response", + }, + }, + Response: &handlers.Response{ // Simulate some response headers + Headers: map[string]string{"X-Test-Response-Header": "TestValue"}, + }, + + TargetPod: 
&backend.Pod{NamespacedName: types.NamespacedName{Namespace: "namespace1", Name: "test-pod-name"}}, + } + + _, err := director.HandleResponse(ctx, reqCtx) + if err != nil { + t.Fatalf("HandleResponse() returned unexpected error: %v", err) + } + + if diff := cmp.Diff("test-req-id-for-response", pr1.lastRespOnResponse.RequestId); diff != "" { + t.Errorf("Scheduler.OnResponse RequestId mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(reqCtx.Response.Headers, pr1.lastRespOnResponse.Headers); diff != "" { + t.Errorf("Scheduler.OnResponse Headers mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff("namespace1/test-pod-name", pr1.lastTargetPodOnResponse); diff != "" { + t.Errorf("Scheduler.OnResponse TargetPodName mismatch (-want +got):\n%s", diff) + } +} + +type testPostResponse struct { + TypeRes string + lastRespOnResponse *Response + lastTargetPodOnResponse string +} + +func (p *testPostResponse) Type() string { return p.TypeRes } +func (p *testPostResponse) Name() string { return "test-post-response" } + +func (p *testPostResponse) PostResponse(_ context.Context, _ *schedulingtypes.LLMRequest, response *Response, targetPod *backend.Pod) { + p.lastRespOnResponse = response + p.lastTargetPodOnResponse = targetPod.NamespacedName.String() +} diff --git a/pkg/epp/requestcontrol/plugins.go b/pkg/epp/requestcontrol/plugins.go new file mode 100644 index 000000000..ba51c2afb --- /dev/null +++ b/pkg/epp/requestcontrol/plugins.go @@ -0,0 +1,44 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package requestcontrol + +import ( + "context" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + PreRequestPluginType = "PreRequest" + PostResponsePluginType = "PostResponse" +) + +// PreRequest is called by the director after a getting result from scheduling layer and +// before a request is sent to the selected model server. +type PreRequest interface { + plugins.Plugin + PreRequest(ctx context.Context, request *types.LLMRequest, schedulingResult *types.SchedulingResult, targetPort int) +} + +// PostResponse is called by the director after a successful response was sent. +// The given pod argument is the pod that served the request. +type PostResponse interface { + plugins.Plugin + PostResponse(ctx context.Context, request *types.LLMRequest, response *Response, targetPod *backend.Pod) +} diff --git a/pkg/epp/requestcontrol/request_control_config.go b/pkg/epp/requestcontrol/request_control_config.go new file mode 100644 index 000000000..d658cde31 --- /dev/null +++ b/pkg/epp/requestcontrol/request_control_config.go @@ -0,0 +1,69 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package requestcontrol + +import ( + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" +) + +// NewConfig creates a new Config object and returns its pointer. +func NewConfig() *Config { + return &Config{ + preRequestPlugins: []PreRequest{}, + postResponsePlugins: []PostResponse{}, + } +} + +// Config provides a configuration for the requestcontrol plugins. +type Config struct { + preRequestPlugins []PreRequest + postResponsePlugins []PostResponse +} + +// WithPreRequestPlugins sets the given plugins as the PreRequest plugins. +// If the Config has PreRequest plugins already, this call replaces the existing plugins with the given ones. +func (c *Config) WithPreRequestPlugins(plugins ...PreRequest) *Config { + c.preRequestPlugins = plugins + return c +} + +// WithPostResponsePlugins sets the given plugins as the PostResponse plugins. +// If the Config has PostResponse plugins already, this call replaces the existing plugins with the given ones. +func (c *Config) WithPostResponsePlugins(plugins ...PostResponse) *Config { + c.postResponsePlugins = plugins + return c +} + +func (c *Config) AddPlugins(pluginObjects ...plugins.Plugin) { + for _, plugin := range pluginObjects { + if preRequestPlugin, ok := plugin.(PreRequest); ok { + c.preRequestPlugins = append(c.preRequestPlugins, preRequestPlugin) + } + if postResponsePlugin, ok := plugin.(PostResponse); ok { + c.postResponsePlugins = append(c.postResponsePlugins, postResponsePlugin) + } + } +} + +func LoadRequestControlConfig(instantiatedPlugins map[string]plugins.Plugin) *Config { + config := NewConfig() + for _, plugin := range instantiatedPlugins { + config.AddPlugins(plugin) + } + + return config +} diff --git a/pkg/epp/requestcontrol/types.go b/pkg/epp/requestcontrol/types.go new file mode 100644 index 000000000..8604e1dda --- /dev/null +++ b/pkg/epp/requestcontrol/types.go @@ -0,0 +1,31 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package requestcontrol + +// Response contains information from the response received to be passed to PostResponse plugins +type Response struct { + // RequestId is the Envoy generated Id for the request being processed + RequestId string + // Headers is a map of the response headers. 
Nil during body processing.
+	Headers map[string]string
+	// Body is the body of the response, or empty during header processing.
+	Body string
+	// IsStreaming indicates whether or not the response is being streamed by the model.
+	IsStreaming bool
+	// EndOfStream, when true, indicates that this invocation contains the last chunk of the response.
+	EndOfStream bool
+}
diff --git a/pkg/epp/saturationdetector/config.go b/pkg/epp/saturationdetector/config.go
new file mode 100644
index 000000000..78a5833e4
--- /dev/null
+++ b/pkg/epp/saturationdetector/config.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package saturationdetector
+
+import (
+	"fmt"
+	"time"
+
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	commonconfig "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config"
+	envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
+)
+
+// Default configuration values.
+const (
+	DefaultQueueDepthThreshold  = commonconfig.DefaultQueueThresholdCritical
+	DefaultKVCacheUtilThreshold = commonconfig.DefaultKVCacheThreshold
+	// DefaultMetricsStalenessThreshold defines how old metrics can be before they
+	// are considered stale.
+	// Given that the pod metrics refresh interval is 50ms, a threshold slightly above
+	// that should be fine.
+	DefaultMetricsStalenessThreshold = 200 * time.Millisecond
+)
+
+// Environment variable names for SaturationDetector configuration.
+const (
+	EnvSdQueueDepthThreshold       = "SD_QUEUE_DEPTH_THRESHOLD"
+	EnvSdKVCacheUtilThreshold      = "SD_KV_CACHE_UTIL_THRESHOLD"
+	EnvSdMetricsStalenessThreshold = "SD_METRICS_STALENESS_THRESHOLD"
+)
+
+// LoadConfigFromEnv loads the SaturationDetector Config from environment variables,
+// falling back to the defaults above for missing or invalid values.
+func LoadConfigFromEnv() *Config {
+	// Use a default logger for initial configuration loading.
+	logger := log.Log.WithName("saturation-detector-config")
+
+	cfg := &Config{}
+
+	cfg.QueueDepthThreshold = envutil.GetEnvInt(EnvSdQueueDepthThreshold, DefaultQueueDepthThreshold, logger)
+	if cfg.QueueDepthThreshold <= 0 {
+		cfg.QueueDepthThreshold = DefaultQueueDepthThreshold
+	}
+
+	cfg.KVCacheUtilThreshold = envutil.GetEnvFloat(EnvSdKVCacheUtilThreshold, DefaultKVCacheUtilThreshold, logger)
+	if cfg.KVCacheUtilThreshold <= 0 || cfg.KVCacheUtilThreshold >= 1 {
+		cfg.KVCacheUtilThreshold = DefaultKVCacheUtilThreshold
+	}
+
+	cfg.MetricsStalenessThreshold = envutil.GetEnvDuration(EnvSdMetricsStalenessThreshold, DefaultMetricsStalenessThreshold, logger)
+	if cfg.MetricsStalenessThreshold <= 0 {
+		cfg.MetricsStalenessThreshold = DefaultMetricsStalenessThreshold
+	}
+
+	// Invalid values were replaced with defaults above; NewDetector consumes the config as-is.
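+	// Example invocation (values are illustrative, not tuning advice):
+	//
+	//	SD_QUEUE_DEPTH_THRESHOLD=5 SD_KV_CACHE_UTIL_THRESHOLD=0.8 SD_METRICS_STALENESS_THRESHOLD=200ms ./epp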
+ logger.Info("SaturationDetector configuration loaded from env", "config", fmt.Sprintf("%+v", cfg)) + return cfg +} diff --git a/pkg/epp/saturationdetector/saturationdetector.go b/pkg/epp/saturationdetector/saturationdetector.go new file mode 100644 index 000000000..6ac0881e7 --- /dev/null +++ b/pkg/epp/saturationdetector/saturationdetector.go @@ -0,0 +1,160 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package saturationdetector implements a mechanism to determine if the +// backend model servers are considered saturated based on observed metrics. +// +// The current implementation provides a global saturation signal (IsSaturated) +// primarily based on backend queue depths and KV cache utilization, reflecting +// the saturation signals previously used by the Scheduler before the +// introduction of the FlowController. It fetches live metrics from the +// provided Datastore. +// +// TODO: Explore more advanced saturation signals in the future, such as: +// - Latency-objective-based saturation. +// - Predictive saturation based on trends. +// - Hysteresis bands or other smoothing techniques to prevent rapid +// oscillations of the saturation signal. +package saturationdetector + +import ( + "context" + "time" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + // loggerName is the name to use for loggers created by this package. + loggerName = "SaturationDetector" +) + +// Config holds the configuration for the SaturationDetector. +type Config struct { + // QueueDepthThreshold defines the backend waiting queue size above which a + // pod is considered to have insufficient capacity for new requests. + QueueDepthThreshold int + // KVCacheUtilThreshold defines the KV cache utilization (0.0 to 1.0) above + // which a pod is considered to have insufficient capacity. + KVCacheUtilThreshold float64 + // MetricsStalenessThreshold defines how old a pod's metrics can be. + // If a pod's metrics are older than this, it might be excluded from + // "good capacity" considerations or treated as having no capacity for + // safety. + MetricsStalenessThreshold time.Duration +} + +// Datastore provides an interface to access backend pod metrics. +type Datastore interface { + PodGetAll() []backendmetrics.PodMetrics +} + +// Detector determines system saturation based on metrics from the Datastore. +// +// The Detector currently holds a direct dependency on a Datastore interface. +// This design choice was made to encapsulate the logic of fetching and +// interpreting metrics for saturation, thereby simplifying the dependencies +// for primary consumers like the FlowController--to be added soon--(which +// would otherwise need to manage Datastore interactions itself). 
+// This architectural decision may be revisited in the future if a more +// decoupled approach (e.g., passing metrics directly to IsSaturated) proves +// more beneficial. +type Detector struct { + datastore Datastore + config *Config +} + +// NewDetector creates a new SaturationDetector. +// The datastore is expected to provide access to live/recently-updated pod +// metrics. +// The config provides the thresholds for determining saturation. +func NewDetector(config *Config, datastore Datastore, logger logr.Logger) *Detector { + logger.WithName(loggerName).V(logutil.DEFAULT).Info("Creating new SaturationDetector", + "queueDepthThreshold", config.QueueDepthThreshold, + "kvCacheUtilThreshold", config.KVCacheUtilThreshold, + "metricsStalenessThreshold", config.MetricsStalenessThreshold.String()) + + return &Detector{ + datastore: datastore, + config: config, + } +} + +// IsSaturated checks if the system is currently considered saturated. +// The system is saturated if NO pod currently has "good capacity". +// "Good capacity" means: +// 1. Metrics are fresh (not stale). +// 2. WaitingQueueSize <= QueueDepthThreshold. +// 3. KVCacheUsagePercent <= KVCacheUtilThreshold. +// +// If no pods are found in the datastore, the system is considered saturated +// (no capacity). +func (d *Detector) IsSaturated(ctx context.Context) bool { + logger := log.FromContext(ctx).WithName(loggerName) + allPodsMetrics := d.datastore.PodGetAll() + if len(allPodsMetrics) == 0 { + logger.V(logutil.VERBOSE).Info("No pods found in datastore; system is considered SATURATED (no capacity).") + // If there are no pods, there is no capacity to serve requests. + // Treat this as a saturated state to enable FlowController queuing. + return true + } + + for _, podMetric := range allPodsMetrics { + metrics := podMetric.GetMetrics() + podNn := "unknown-pod" + if podMetric.GetPod() != nil { + podNn = podMetric.GetPod().NamespacedName.String() + } + + if metrics == nil { + logger.V(logutil.TRACE).Info("Pod has nil metrics, skipping for saturation check", + "pod", podNn) + continue + } + + // Check for metric staleness + if time.Since(metrics.UpdateTime) > d.config.MetricsStalenessThreshold { + logger.V(logutil.TRACE).Info("Pod metrics are stale, considered as not having good capacity", + "pod", podNn, "updateTime", metrics.UpdateTime, "stalenessThreshold", d.config.MetricsStalenessThreshold) + continue + } + + // Check queue depth + if metrics.WaitingQueueSize > d.config.QueueDepthThreshold { + logger.V(logutil.TRACE).Info("Pod WaitingQueueSize is above threshold, considered as not having good capacity", + "pod", podNn, "waitingQueueSize", metrics.WaitingQueueSize, "threshold", d.config.QueueDepthThreshold) + continue // WaitingQueueSize is above threshold, considered saturated. + } + + // Check KV cache utilization + if metrics.KVCacheUsagePercent > d.config.KVCacheUtilThreshold { + logger.V(logutil.TRACE).Info("Pod KVCacheUsagePercent is above threshold, considered as not having good capacity", + "pod", podNn, "kvCacheUsagePercent", metrics.KVCacheUsagePercent, "threshold", d.config.KVCacheUtilThreshold) + continue // KVCacheUsagePercent is above threshold, considered saturated. 
+ } + + logger.V(logutil.TRACE).Info("Found pod with good capacity", "pod", podNn, "waitingQueue", metrics.WaitingQueueSize, + "queueThreshold", d.config.QueueDepthThreshold, "kvCacheUtil", metrics.KVCacheUsagePercent, "kvCacheThreshold", d.config.KVCacheUtilThreshold) + + return false // Found at least one pod with good capacity, so system is NOT saturated. + } + + logger.V(logutil.VERBOSE).Info("No pods found with good capacity; system is considered SATURATED.") + return true +} diff --git a/pkg/epp/saturationdetector/saturationdetector_test.go b/pkg/epp/saturationdetector/saturationdetector_test.go new file mode 100644 index 000000000..42e81b5fd --- /dev/null +++ b/pkg/epp/saturationdetector/saturationdetector_test.go @@ -0,0 +1,326 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package saturationdetector + +import ( + "context" + "fmt" + "os" + "strconv" + "testing" + "time" + + "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" +) + +// --- Mock Implementations --- + +type mockDatastore struct { + pods []*backendmetrics.FakePodMetrics +} + +// PodGetAll returns all pod metrics from the fake datastore. 
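+// The explicit copy loop below is needed because a []*backendmetrics.FakePodMetrics
+// value cannot be used where []backendmetrics.PodMetrics is expected: Go does not
+// convert slices of a concrete type to slices of an interface type.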
+func (fds *mockDatastore) PodGetAll() []backendmetrics.PodMetrics { + pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) + for _, pod := range fds.pods { + pm = append(pm, pod) + } + return pm +} + +func newMockPodMetrics(name string, metrics *backendmetrics.MetricsState) *backendmetrics.FakePodMetrics { + return &backendmetrics.FakePodMetrics{ + Pod: &backend.Pod{ + NamespacedName: types.NamespacedName{Name: name, Namespace: "ns1"}, + }, + Metrics: metrics, + } +} + +// --- Tests --- + +func TestNewDetector(t *testing.T) { + tests := []struct { + name string + config *Config + datastore Datastore + expectedQueueDepthThreshold int + expectedKVCacheUtilThreshold float64 + expectedStalenessThreshold time.Duration + }{ + { + name: "Valid config", + config: &Config{ + QueueDepthThreshold: 10, + KVCacheUtilThreshold: 0.8, + MetricsStalenessThreshold: 100 * time.Millisecond, + }, + datastore: &mockDatastore{}, + expectedQueueDepthThreshold: 10, + expectedKVCacheUtilThreshold: 0.8, + expectedStalenessThreshold: 100 * time.Millisecond, + }, + { + name: "invalid thresholds, fallback to default", + config: &Config{ + QueueDepthThreshold: -1, + KVCacheUtilThreshold: -5, + MetricsStalenessThreshold: 0, + }, + datastore: &mockDatastore{}, + expectedQueueDepthThreshold: DefaultQueueDepthThreshold, + expectedKVCacheUtilThreshold: DefaultKVCacheUtilThreshold, + expectedStalenessThreshold: DefaultMetricsStalenessThreshold, + }, + { + name: "kv cache threshold above range, fallback to default", + config: &Config{ + QueueDepthThreshold: 10, + KVCacheUtilThreshold: 1.5, + MetricsStalenessThreshold: 100 * time.Millisecond, + }, + datastore: &mockDatastore{}, + expectedQueueDepthThreshold: 10, + expectedKVCacheUtilThreshold: DefaultKVCacheUtilThreshold, + expectedStalenessThreshold: 100 * time.Millisecond, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // validate configuration values are loaded from env vars properly, including the use of default values when provided value is invalid. 
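+			// Note: os.Setenv leaves these variables set after the subtest finishes;
+			// t.Setenv (Go 1.17+) would scope them to the test automatically.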
+ os.Setenv(EnvSdQueueDepthThreshold, strconv.Itoa(test.config.QueueDepthThreshold)) + os.Setenv(EnvSdKVCacheUtilThreshold, fmt.Sprintf("%v", test.config.KVCacheUtilThreshold)) + os.Setenv(EnvSdMetricsStalenessThreshold, test.config.MetricsStalenessThreshold.String()) + + detector := NewDetector(LoadConfigFromEnv(), test.datastore, logr.Discard()) + if detector == nil { + t.Fatalf("NewDetector() returned nil detector for valid config") + } + if detector.config.QueueDepthThreshold != test.expectedQueueDepthThreshold { + t.Errorf("NewDetector() QueueDepthThreshold = %d, want %d", detector.config.QueueDepthThreshold, test.expectedQueueDepthThreshold) + } + if detector.config.KVCacheUtilThreshold != test.expectedKVCacheUtilThreshold { + t.Errorf("NewDetector() KVCacheUtilThreshold = %f, want %f", detector.config.KVCacheUtilThreshold, test.expectedKVCacheUtilThreshold) + } + if detector.config.MetricsStalenessThreshold != test.expectedStalenessThreshold { + t.Errorf("NewDetector() MetricsStalenessThreshold = %v, want %v", detector.config.MetricsStalenessThreshold, test.expectedStalenessThreshold) + } + }) + } +} + +func TestDetector_IsSaturated(t *testing.T) { + baseTime := time.Now() + defaultConfig := &Config{ + QueueDepthThreshold: 5, + KVCacheUtilThreshold: 0.90, + MetricsStalenessThreshold: 100 * time.Millisecond, + } + + tests := []struct { + name string + config *Config + pods []*backendmetrics.FakePodMetrics + expectedSaturat bool + }{ + { + name: "No pods in datastore", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{}, + expectedSaturat: true, // No capacity = saturated + }, + { + name: "Single pod with good capacity", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 2, + KVCacheUsagePercent: 0.5, + }), + }, + expectedSaturat: false, + }, + { + name: "Single pod with stale metrics", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime.Add(-200 * time.Millisecond), // Stale + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + }, + expectedSaturat: true, + }, + { + name: "Single pod with high queue depth", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 10, // Exceeds threshold 5 + KVCacheUsagePercent: 0.1, + }), + }, + expectedSaturat: true, + }, + { + name: "Single pod with high KV cache utilization", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.95, // Exceeds threshold 0.90 + }), + }, + expectedSaturat: true, + }, + { + name: "Single pod with nil metrics", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", nil), + }, + expectedSaturat: true, + }, + { + name: "Multiple pods, all good capacity", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + newMockPodMetrics("pod2", &backendmetrics.MetricsState{ + UpdateTime: baseTime.Add(-10 * time.Millisecond), + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + }), + }, + expectedSaturat: false, + }, + { + name: "Multiple pods, one good, one 
bad (stale)", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, // Good + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + newMockPodMetrics("pod2", &backendmetrics.MetricsState{ + UpdateTime: baseTime.Add(-300 * time.Millisecond), // Stale + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + }), + }, + expectedSaturat: false, // One good pod is enough + }, + { + name: "Multiple pods, one good, one bad (high queue)", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + newMockPodMetrics("pod2", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 15, // Bad queue + KVCacheUsagePercent: 0.2, + }), + }, + expectedSaturat: false, + }, + { + name: "Multiple pods, all bad capacity", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime.Add(-200 * time.Millisecond), // Stale + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + newMockPodMetrics("pod2", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 20, // High queue + KVCacheUsagePercent: 0.2, + }), + newMockPodMetrics("pod3", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.99, // High KV + }), + }, + expectedSaturat: true, + }, + { + name: "Queue depth exactly at threshold", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: defaultConfig.QueueDepthThreshold, // Exactly at threshold (good) + KVCacheUsagePercent: 0.1, + }), + }, + expectedSaturat: false, + }, + { + name: "KV cache exactly at threshold", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime, + WaitingQueueSize: 1, + KVCacheUsagePercent: defaultConfig.KVCacheUtilThreshold, // Exactly at threshold (good) + }), + }, + expectedSaturat: false, + }, + { + name: "Metrics age just over staleness threshold", + config: defaultConfig, + pods: []*backendmetrics.FakePodMetrics{ + newMockPodMetrics("pod1", &backendmetrics.MetricsState{ + UpdateTime: baseTime.Add(-defaultConfig.MetricsStalenessThreshold - time.Nanosecond), // Just over (stale) + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.1, + }), + }, + expectedSaturat: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + detector := NewDetector(test.config, &mockDatastore{pods: test.pods}, logr.Discard()) + + if got := detector.IsSaturated(context.Background()); got != test.expectedSaturat { + t.Errorf("IsSaturated() = %v, want %v", got, test.expectedSaturat) + } + }) + } +} diff --git a/pkg/epp/scheduling/config/config.go b/pkg/epp/scheduling/config/config.go new file mode 100644 index 000000000..e7fd0a3f4 --- /dev/null +++ b/pkg/epp/scheduling/config/config.go @@ -0,0 +1,57 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "sigs.k8s.io/controller-runtime/pkg/log" + commonconfig "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/common/config" + envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// Config holds all the configuration values for the scheduler +type Config struct { + KVCacheThreshold float64 + QueueThresholdCritical int + QueueingThresholdLoRA int + LoraAffinityThreshold float64 +} + +const ( + // Default values for LoRA specific thresholds + DefaultQueueingThresholdLoRA = 128 + DefaultLoraAffinityThreshold = 0.999 +) + +// LoadConfig loads configuration from environment variables +func LoadConfig() Config { + // Use a default logger for initial configuration loading + baseLogger := log.Log.WithName("scheduling-config") + + config := Config{ + KVCacheThreshold: envutil.GetEnvFloat("KV_CACHE_THRESHOLD", commonconfig.DefaultKVCacheThreshold, baseLogger), + QueueThresholdCritical: envutil.GetEnvInt("QUEUE_THRESHOLD_CRITICAL", commonconfig.DefaultQueueThresholdCritical, baseLogger), + QueueingThresholdLoRA: envutil.GetEnvInt("QUEUING_THRESHOLD_LORA", DefaultQueueingThresholdLoRA, baseLogger), + LoraAffinityThreshold: envutil.GetEnvFloat("LORA_AFFINITY_THRESHOLD", DefaultLoraAffinityThreshold, baseLogger), + } + + baseLogger.V(logutil.DEFAULT).Info("Scheduler configuration loaded", "config", config) + + return config +} + +var Conf = LoadConfig() diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go deleted file mode 100644 index f48480899..000000000 --- a/pkg/epp/scheduling/filter.go +++ /dev/null @@ -1,248 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduling - -import ( - "errors" - "math" - "math/rand" - "time" - - "github.com/go-logr/logr" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -type Filter interface { - Name() string - Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) -} - -// filter applies current filterFunc, and then recursively applies next filters depending success or -// failure of the current filterFunc. -// It can be used to construct a flow chart algorithm. -type filter struct { - name string - filter filterFunc - // nextOnSuccess filter will be applied after successfully applying the current filter. - // The filtered results will be passed to the next filter. 
- nextOnSuccess *filter - // nextOnFailure filter will be applied if current filter fails. - // The original input will be passed to the next filter. - nextOnFailure *filter - // nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the - // success or failure of the current filter. - // NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. - // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of - // nextOnSuccessOrFailure, in the success and failure scenarios, respectively. - nextOnSuccessOrFailure *filter -} - -func (f *filter) Name() string { - if f == nil { - return "nil" - } - return f.name -} - -func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - loggerTrace := logger.V(logutil.TRACE) - loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods)) - - filtered, err := f.filter(logger, req, pods) - - next := f.nextOnSuccessOrFailure - if err == nil && len(filtered) > 0 { - if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil { - // No succeeding filters to run, return. - return filtered, err - } - if f.nextOnSuccess != nil { - next = f.nextOnSuccess - } - loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) - // On success, pass the filtered result to the next filter. - return next.Filter(logger, req, filtered) - } else { - if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil { - // No succeeding filters to run, return. - return filtered, err - } - if f.nextOnFailure != nil { - next = f.nextOnFailure - } - loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name()) - // On failure, pass the initial set of pods to the next filter. - return next.Filter(logger, req, pods) - } -} - -// filterFunc filters a set of input pods to a subset. -type filterFunc func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) - -// toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. -func toFilterFunc(pp podPredicate) filterFunc { - return func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - filtered := []backendmetrics.PodMetrics{} - for _, pod := range pods { - pass := pp(req, pod) - if pass { - filtered = append(filtered, pod) - } - } - if len(filtered) == 0 { - return nil, errors.New("no pods left") - } - return filtered, nil - } -} - -// leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range -// (max-min) by the number of pods, and finds the pods that fall into the first range. -// The intuition is that if there are multiple pods that share similar queue size in the low range, -// we should consider them all instead of the absolute minimum one. This worked better than picking -// the least one as it gives more choices for the next filter, which on aggregate gave better -// results. -// TODO: Compare this strategy with other strategies such as top K. 
-func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - min := math.MaxInt - max := 0 - filtered := []backendmetrics.PodMetrics{} - - for _, pod := range pods { - if pod.GetMetrics().WaitingQueueSize <= min { - min = pod.GetMetrics().WaitingQueueSize - } - if pod.GetMetrics().WaitingQueueSize >= max { - max = pod.GetMetrics().WaitingQueueSize - } - } - - for _, pod := range pods { - if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) { - filtered = append(filtered, pod) - } - } - return filtered, nil -} - -func lowQueueingPodPredicate(_ *LLMRequest, pod backendmetrics.PodMetrics) bool { - return pod.GetMetrics().WaitingQueueSize < config.QueueingThresholdLoRA -} - -// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range -// (max-min) by the number of pods, and finds the pods that fall into the first range. -// The intuition is that if there are multiple pods that share similar KV cache in the low range, we -// should consider them all instead of the absolute minimum one. This worked better than picking the -// least one as it gives more choices for the next filter, which on aggregate gave better results. -// TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - min := math.MaxFloat64 - var max float64 = 0 - filtered := []backendmetrics.PodMetrics{} - - for _, pod := range pods { - if pod.GetMetrics().KVCacheUsagePercent <= min { - min = pod.GetMetrics().KVCacheUsagePercent - } - if pod.GetMetrics().KVCacheUsagePercent >= max { - max = pod.GetMetrics().KVCacheUsagePercent - } - } - - for _, pod := range pods { - if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { - filtered = append(filtered, pod) - } - } - return filtered, nil -} - -// podPredicate is a filter function to check whether a pod is desired. -type podPredicate func(req *LLMRequest, pod backendmetrics.PodMetrics) bool - -// We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by -// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to -// a single pod. This gave good performance in our initial benchmarking results in the scenario -// where # of lora slots > # of lora adapters. -func lowLoRACostPredicate(req *LLMRequest, pod backendmetrics.PodMetrics) bool { - _, ok := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel] - return ok || len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels -} - -// loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods -// with existing LoRA model affinity while allowing for load balancing through randomization. -// -// The function works by: -// 1. Separating pods into two groups: those with target model affinity and those with available capacity -// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing -// 3. 
Falling back to whatever group has pods if one group is empty -// -// Parameters: -// - logger: Logger interface for diagnostic output -// - req: LLM request containing the resolved target model -// - pods: Slice of pod metrics to filter -// -// Returns: -// - Filtered slice of pod metrics based on affinity and availability -// - Error if any issues occur during filtering -func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - - // Pre-allocate slices with estimated capacity - filtered_affinity := make([]backendmetrics.PodMetrics, 0, len(pods)) - filtered_available := make([]backendmetrics.PodMetrics, 0, len(pods)) - - // Categorize pods based on affinity and availability - for _, pod := range pods { - - if _, exists := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel]; exists { - filtered_affinity = append(filtered_affinity, pod) - } else if len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels { - filtered_available = append(filtered_available, pod) - } - } - - // Use crypto/rand for better randomization in production environments - randSource := rand.NewSource(time.Now().UnixNano()) - randGen := rand.New(randSource) - - // If both groups have pods, use probability to select which group to return - if len(filtered_affinity) > 0 && len(filtered_available) > 0 { - if randGen.Float64() < config.LoraAffinityThreshold { - return filtered_affinity, nil - } - return filtered_available, nil - } - - // Return whichever group has pods - if len(filtered_affinity) > 0 { - return filtered_affinity, nil - } - - return filtered_available, nil -} - -func criticalRequestPredicate(req *LLMRequest, _ backendmetrics.PodMetrics) bool { - return req.Critical -} - -func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate { - return func(req *LLMRequest, pod backendmetrics.PodMetrics) bool { - return pod.GetMetrics().WaitingQueueSize <= queueThreshold && pod.GetMetrics().KVCacheUsagePercent <= kvCacheThreshold - } -} diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go deleted file mode 100644 index 127e6c215..000000000 --- a/pkg/epp/scheduling/filter_test.go +++ /dev/null @@ -1,554 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package scheduling - -import ( - "errors" - "testing" - - "github.com/go-logr/logr" - "github.com/google/go-cmp/cmp" - "k8s.io/apimachinery/pkg/types" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -func TestFilter(t *testing.T) { - logger := logutil.NewTestLogger() - - tests := []struct { - name string - req *LLMRequest - input []*backendmetrics.FakePodMetrics - output []*backendmetrics.FakePodMetrics - err bool - filter *filter - }{ - { - name: "simple filter without successor, failure", - filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - return nil, errors.New("filter error") - }}, - err: true, - }, - { - name: "default filter, critical request", - filter: defaultFilter, - req: &LLMRequest{ - Model: "critical", - ResolvedTargetModel: "critical", - Critical: true, - }, - // pod2 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - }, - }, - { - name: "default filter, sheddable request, accepted", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // pod1 will be picked because it has capacity for the sheddable request. 
- input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - }, - }, - { - name: "default filter, sheddable request, dropped", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // All pods have higher KV cache thant the threshold, so the sheddable request will be - // dropped. - input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{}, - err: true, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, err := test.filter.Filter(logger, test.req, toInterface(test.input)) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } - - if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } - }) - } -} - -func TestFilterFunc(t *testing.T) { - logger := logutil.NewTestLogger() - - tests := []struct { - name string - f filterFunc - req *LLMRequest - input []*backendmetrics.FakePodMetrics - output []*backendmetrics.FakePodMetrics - err bool - }{ - { - name: "least queuing empty input", - f: leastQueuingFilterFunc, - input: []*backendmetrics.FakePodMetrics{}, - output: []*backendmetrics.FakePodMetrics{}, - }, - { - name: "least queuing", - f: leastQueuingFilterFunc, - input: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - 
WaitingQueueSize: 0, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - }, - }, - }, - }, - { - name: "least kv cache empty input", - f: leastKVCacheFilterFunc, - input: []*backendmetrics.FakePodMetrics{}, - output: []*backendmetrics.FakePodMetrics{}, - }, - { - name: "least kv cache", - f: leastKVCacheFilterFunc, - input: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - KVCacheUsagePercent: 0, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - KVCacheUsagePercent: 0.3, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - KVCacheUsagePercent: 1.0, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - KVCacheUsagePercent: 0, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - KVCacheUsagePercent: 0.3, - }, - }, - }, - }, - { - name: "noQueueAndLessThanKVCacheThresholdPredicate", - f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), - input: []*backendmetrics.FakePodMetrics{ - { - // This pod should be returned. - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - }, - }, - { - // Queue is non zero, despite low kv cache, should not return. - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 1, - KVCacheUsagePercent: 0.3, - }, - }, - { - // High kv cache despite zero queue, should not return - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 1.0, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - }, - }, - }, - }, - { - name: "low LoRA cost", - f: toFilterFunc(lowLoRACostPredicate), - req: &LLMRequest{ - Model: "model", - ResolvedTargetModel: "model", - }, - input: []*backendmetrics.FakePodMetrics{ - // ActiveModels include input model, should be returned. - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - // Input model is not active, however the server has room to load another adapter. - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - // Input is not active, and the server has reached max active models. 
- { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, err := test.f(logger, test.req, toInterface(test.input)) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } - - if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } - }) - } -} - -// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function -// properly distributes requests according to the loraAffinityThreshold -func TestLoRASoftAffinityDistribution(t *testing.T) { - logger := logutil.NewTestLogger() - - const ( - testModelName = "test-model" - testAffinityModel = "test-affinity-model" - numIterations = 10000 - tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution - ) - - // Save original config value to restore later - originalThreshold := config.LoraAffinityThreshold - - // Set a specific test value for this test - testThreshold := 0.75 // 75% - config.LoraAffinityThreshold = testThreshold - - // Ensure we restore the original threshold when test completes - defer func() { - config.LoraAffinityThreshold = originalThreshold - }() - - // Create a test request and pods - req := &LLMRequest{ - Model: testAffinityModel, - ResolvedTargetModel: testAffinityModel, - } - - // Test setup: One affinity pod and one available pod - pods := []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - testAffinityModel: 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{}, - }, - }, - } - - // Run the filter function multiple times and count the results - affinityCount := 0 - availableCount := 0 - - // Use the test threshold value - expectedAffinityPercent := config.LoraAffinityThreshold * 100 - expectedAvailabilityPercent := 100 - expectedAffinityPercent - - for i := 0; i < numIterations; i++ { - result, err := loRASoftAffinityFilter(logger, req, toInterface(pods)) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } - - // Check which type of pod was returned - if len(result) != 1 { - t.Fatalf("Expected exactly one pod in result, got %d", len(result)) - } - - // Identify if the returned pod is the affinity pod or available pod - if _, exists := result[0].GetMetrics().ActiveModels[testAffinityModel]; exists { - affinityCount++ - } else { - availableCount++ - } - } - - // Calculate the actual percentages - actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100 - actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100 - - // Check if the distribution matches expected threshold within tolerance - affinityLowerBound := expectedAffinityPercent - tolerancePercent - affinityUpperBound := expectedAffinityPercent + tolerancePercent - - availableLowerBound := 
expectedAvailabilityPercent - tolerancePercent
-	availableUpperBound := expectedAvailabilityPercent + tolerancePercent
-
-	t.Logf("Distribution results over %d iterations:", numIterations)
-	t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, config.LoraAffinityThreshold)
-	t.Logf("Expected availability percent: %.2f%% (threshold: %.2f)", expectedAvailabilityPercent, config.LoraAffinityThreshold)
-	t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations)
-	t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations)
-
-	if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound {
-		t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
-			actualAffinityPercent, affinityLowerBound, affinityUpperBound)
-	}
-	if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound {
-		t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
-			actualAvailablePercent, availableLowerBound, availableUpperBound)
-	}
-}
-
-func toInterface(input []*backendmetrics.FakePodMetrics) []backendmetrics.PodMetrics {
-	output := []backendmetrics.PodMetrics{}
-	for _, i := range input {
-		output = append(output, i)
-	}
-	return output
-}
-
-func toStruct(input []backendmetrics.PodMetrics) []*backendmetrics.FakePodMetrics {
-	if input == nil {
-		return nil
-	}
-	output := []*backendmetrics.FakePodMetrics{}
-	for _, i := range input {
-		output = append(output, i.(*backendmetrics.FakePodMetrics))
-	}
-	return output
-}
diff --git a/pkg/epp/scheduling/framework/plugins.go b/pkg/epp/scheduling/framework/plugins.go
new file mode 100644
index 000000000..7e22d8618
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins.go
@@ -0,0 +1,76 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"context"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	ProfilePickerType          = "ProfilePicker"
+	FilterPluginType           = "Filter"
+	ScorerPluginType           = "Scorer"
+	PickerPluginType           = "Picker"
+	PostCyclePluginType        = "PostCycle"
+	ProcessProfilesResultsType = "ProcessProfilesResults"
+)
+
+// ProfileHandler defines the extension points for handling multiple SchedulerProfile instances.
+// More specifically, this interface defines the 'Pick' and 'ProcessResults' extension points.
+type ProfileHandler interface {
+	plugins.Plugin
+	// Pick selects the SchedulerProfiles to run from a list of candidate profiles, while taking into consideration the request properties
+	// and the previously executed SchedulerProfile cycles along with their results.
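+	// Pick is expected to be invoked iteratively; returning an empty map signals
+	// that no further profiles should be run for the request.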
+	Pick(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, profiles map[string]*SchedulerProfile,
+		profileResults map[string]*types.ProfileRunResult) map[string]*SchedulerProfile
+
+	// ProcessResults handles the outcome of the profile runs after all profiles have run.
+	// It may aggregate results, log test profile outputs, or apply custom logic. It specifies in the SchedulingResult the
+	// key of the primary profile that should be used to determine the request's selected destination.
+	// When a profile run fails, its result in the profileResults map is nil.
+	ProcessResults(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest,
+		profileResults map[string]*types.ProfileRunResult) (*types.SchedulingResult, error)
+}
+
+// Filter defines the interface for filtering a list of pods based on context.
+type Filter interface {
+	plugins.Plugin
+	Filter(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) []types.Pod
+}
+
+// Scorer defines the interface for scoring a list of pods based on context.
+// Scorers must score pods with a value within the range of [0,1] where 1 is the highest score.
+type Scorer interface {
+	plugins.Plugin
+	Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64
+}
+
+// Picker picks the final pod(s) to send the request to.
+type Picker interface {
+	plugins.Plugin
+	Pick(ctx context.Context, cycleState *types.CycleState, scoredPods []*types.ScoredPod) *types.ProfileRunResult
+}
+
+// PostCycle is called by the scheduler after it selects a targetPod for the request in the SchedulerProfile cycle.
+// Deprecated: do not use PostCycle; it is in the process of being removed.
+type PostCycle interface {
+	plugins.Plugin
+	PostCycle(ctx context.Context, cycleState *types.CycleState, res *types.ProfileRunResult)
+}
diff --git a/pkg/epp/scheduling/framework/plugins/README.md b/pkg/epp/scheduling/framework/plugins/README.md
new file mode 100644
index 000000000..56ca315e6
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/README.md
@@ -0,0 +1,15 @@
+# Scheduling Plugins
+
+This package contains the scheduling plugin implementations.
+
+Plugins are organized by the following rule. Follow this rule when adding a new
+plugin.
+
+```
+plugins/
+|__ filter/ (Plugins that implement the Filter interface only.)
+|__ scorer/ (Plugins that implement the Scorer interface only.)
+|__ picker/ (Plugins that implement the Picker interface only.)
+|__ multi/ (Plugins that implement multiple plugin interfaces.)
+|____ prefix/ (Prefix cache aware scheduling plugin.)
+```
diff --git a/pkg/epp/scheduling/framework/plugins/filter/decision_tree_filter.go b/pkg/epp/scheduling/framework/plugins/filter/decision_tree_filter.go
new file mode 100644
index 000000000..e73d5f921
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/filter/decision_tree_filter.go
@@ -0,0 +1,179 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+*/ + +package filter + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + DecisionTreeFilterType = "decision-tree" +) + +// compile-time type assertion +var _ framework.Filter = &DecisionTreeFilter{} + +// DecisionTreeFilter applies current fitler, and then recursively applies next filters +// depending success or failure of the current filter. +// It can be used to construct a flow chart algorithm. +type DecisionTreeFilter struct { + Current framework.Filter + // NextOnSuccess filter will be applied after successfully applying the current filter. + // The filtered results will be passed to the next filter. + NextOnSuccess framework.Filter + // NextOnFailure filter will be applied if current filter results in no pods. + // The original input will be passed to the next filter. + NextOnFailure framework.Filter + // NextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the + // success or failure of the current filter. + // NOTE: When using NextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. + // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of + // NextOnSuccessOrFailure, in the success and failure scenarios, respectively. + NextOnSuccessOrFailure framework.Filter +} + +type decisionTreeFilterParameters struct { + Current *decisionTreeFilterEntry `json:"current"` + NextOnSuccess *decisionTreeFilterEntry `json:"nextOnSuccess"` + NextOnFailure *decisionTreeFilterEntry `json:"nextOnFailure"` + NextOnSuccessOrFailure *decisionTreeFilterEntry `json:"nextOnSuccessOrFailure"` +} + +type decisionTreeFilterEntry struct { + PluginRef *string `json:"pluginRef"` + DecisionTree *decisionTreeFilterParameters `json:"decisionTree"` +} + +func DecisionTreeFilterFactory(name string, rawParameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) { + parameters := decisionTreeFilterParameters{} + if err := json.Unmarshal(rawParameters, ¶meters); err != nil { + return nil, fmt.Errorf("failed to parse the parameters of the '%s' filter - %w", name, err) + } + return loadDecisionTree(¶meters, handle) +} + +func loadDecisionTree(parameters *decisionTreeFilterParameters, handle plugins.Handle) (*DecisionTreeFilter, error) { + result := &DecisionTreeFilter{} + var err error + + if parameters.Current == nil { + return nil, errors.New("a current filter must be specified") + } + result.Current, err = loadDecisionTreeEntry(parameters.Current, handle) + if err != nil { + return nil, err + } + + if parameters.NextOnSuccess != nil { + result.NextOnSuccess, err = loadDecisionTreeEntry(parameters.NextOnSuccess, handle) + if err != nil { + return nil, err + } + } + + if parameters.NextOnFailure != nil { + result.NextOnFailure, err = loadDecisionTreeEntry(parameters.NextOnFailure, handle) + if err != nil { + return nil, err + } + } + + if parameters.NextOnSuccessOrFailure != nil { + result.NextOnSuccessOrFailure, err = loadDecisionTreeEntry(parameters.NextOnSuccessOrFailure, handle) + if err != nil { + return nil, err + } + } + + return result, nil +} + +func loadDecisionTreeEntry(entry *decisionTreeFilterEntry, handle plugins.Handle) (framework.Filter, error) { 
+	if entry.PluginRef != nil && entry.DecisionTree != nil {
+		return nil, errors.New("pluginRef and decisionTree may not both be specified")
+	}
+
+	if entry.PluginRef != nil {
+		instance := handle.Plugins().Plugin(*entry.PluginRef)
+		if instance == nil {
+			return nil, errors.New(*entry.PluginRef + " is a reference to an undefined Plugin")
+		}
+		if theFilter, ok := instance.(framework.Filter); ok {
+			return theFilter, nil
+		}
+		return nil, errors.New(*entry.PluginRef + " is not a filter")
+	} else if entry.DecisionTree != nil {
+		return loadDecisionTree(entry.DecisionTree, handle)
+	}
+	return nil, errors.New("either pluginRef or decisionTree must be specified")
+}
+
+// Type returns the type of the filter.
+func (f *DecisionTreeFilter) Type() string {
+	if f == nil {
+		return "nil"
+	}
+	return f.Current.Type()
+}
+
+// Name returns the name of the filter.
+func (f *DecisionTreeFilter) Name() string {
+	if f == nil {
+		return ""
+	}
+	return f.Current.Name()
+}
+
+// Filter filters out pods that don't meet the filter criteria.
+func (f *DecisionTreeFilter) Filter(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) []types.Pod {
+	loggerTrace := log.FromContext(ctx).V(logutil.TRACE)
+	filteredPod := f.Current.Filter(ctx, cycleState, request, pods)
+
+	next := f.NextOnSuccessOrFailure
+	if len(filteredPod) > 0 {
+		if f.NextOnSuccess == nil && f.NextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filteredPod
+		}
+		if f.NextOnSuccess != nil {
+			next = f.NextOnSuccess
+		}
+		loggerTrace.Info("Filter succeeded", "filter", f.Type(), "next", next.Type(), "filteredPodCount", len(filteredPod))
+		// On success, pass the filtered result to the next filter.
+		return next.Filter(ctx, cycleState, request, filteredPod)
+	} else {
+		if f.NextOnFailure == nil && f.NextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filteredPod
+		}
+		if f.NextOnFailure != nil {
+			next = f.NextOnFailure
+		}
+		loggerTrace.Info("Filter failed", "filter", f.Type(), "next", next.Type())
+		// On failure, pass the initial set of pods to the next filter.
+		return next.Filter(ctx, cycleState, request, pods)
+	}
+}
diff --git a/pkg/epp/scheduling/framework/plugins/filter/filter_test.go b/pkg/epp/scheduling/framework/plugins/filter/filter_test.go
new file mode 100644
index 000000000..978e91c3e
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/filter/filter_test.go
@@ -0,0 +1,535 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package filter + +import ( + "context" + "encoding/json" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/google/uuid" + k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/scorer" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + "sigs.k8s.io/gateway-api-inference-extension/test/utils" +) + +// compile-time type assertion +var _ framework.Filter = &filterAll{} + +type filterAll struct{} + +func (f *filterAll) Type() string { + return "filter-all" +} + +func (f *filterAll) Name() string { + return "test-all" +} + +func (f *filterAll) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod { + return []types.Pod{} +} + +func TestFilter(t *testing.T) { + tests := []struct { + name string + req *types.LLMRequest + filter framework.Filter + input []types.Pod + output []types.Pod + }{ + { + name: "simple filter filters all pods", + filter: &filterAll{}, + output: []types.Pod{}, + }, + { + name: "least queuing empty input", + filter: NewLeastQueueFilter(), + input: []types.Pod{}, + output: []types.Pod{}, + }, + { + name: "least queuing", + filter: NewLeastQueueFilter(), + input: []types.Pod{ + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 0, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 3, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 10, + }, + }, + }, + output: []types.Pod{ + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 0, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 3, + }, + }, + }, + }, + { + name: "least kv cache empty input", + filter: NewLeastKVCacheFilter(), + input: []types.Pod{}, + output: []types.Pod{}, + }, + { + name: "least kv cache", + filter: NewLeastKVCacheFilter(), + input: []types.Pod{ + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + KVCacheUsagePercent: 0, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + KVCacheUsagePercent: 0.3, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + KVCacheUsagePercent: 1.0, + }, + }, + }, + output: []types.Pod{ + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + KVCacheUsagePercent: 0, + }, + }, + &types.PodMetrics{ + MetricsState: &backendmetrics.MetricsState{ + KVCacheUsagePercent: 0.3, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := test.filter.Filter(context.Background(), types.NewCycleState(), test.req, test.input) + + if diff := cmp.Diff(test.output, got); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} + +// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function +// properly distributes requests according to the loraAffinityThreshold +func TestLoRASoftAffinityDistribution(t *testing.T) { + const ( + testModelName = "test-model" + testAffinityModel = "test-affinity-model" + numIterations = 10000 + 
tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution + ) + + // Save original config value to restore later + originalThreshold := config.Conf.LoraAffinityThreshold + + // Set a specific test value for this test + testThreshold := 0.75 // 75% + config.Conf.LoraAffinityThreshold = testThreshold + + // Ensure we restore the original threshold when test completes + defer func() { + config.Conf.LoraAffinityThreshold = originalThreshold + }() + + // Create a test request and pods + req := &types.LLMRequest{ + TargetModel: testAffinityModel, + RequestId: uuid.NewString(), + } + + // Test setup: One affinity pod and one available pod + pods := []types.Pod{ + &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "affinity-pod"}}, + MetricsState: &backendmetrics.MetricsState{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + testAffinityModel: 1, + }, + }, + }, + &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "available-pod"}}, + MetricsState: &backendmetrics.MetricsState{ + MaxActiveModels: 2, + ActiveModels: map[string]int{}, + }, + }, + } + // Run the filter function multiple times and count the results + affinityCount := 0 + availableCount := 0 + + // Use the test threshold value + expectedAffinityPercent := config.Conf.LoraAffinityThreshold * 100 + expectedAvailabilityPercent := 100 - expectedAffinityPercent + + // initialize LoraAffinityFilter + LoraAffinityFilter := NewLoraAffinityFilter(config.Conf.LoraAffinityThreshold) + + for range numIterations { + result := LoraAffinityFilter.Filter(context.Background(), types.NewCycleState(), req, pods) + + // Check which type of pod was returned + if len(result) != 1 { + t.Fatalf("Expected exactly one pod in result, got %d", len(result)) + } + + // Identify if the returned pod is the affinity pod or available pod + if _, exists := result[0].GetMetrics().ActiveModels[testAffinityModel]; exists { + affinityCount++ + } else { + availableCount++ + } + } + + // Calculate the actual percentages + actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100 + actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100 + + // Check if the distribution matches expected threshold within tolerance + affinityLowerBound := expectedAffinityPercent - tolerancePercent + affinityUpperBound := expectedAffinityPercent + tolerancePercent + + availableLowerBound := expectedAvailabilityPercent - tolerancePercent + availableUpperBound := expectedAvailabilityPercent + tolerancePercent + + t.Logf("Distribution results over %d iterations:", numIterations) + t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, config.Conf.LoraAffinityThreshold) + t.Logf("Expected availability percent: %.2f%% (threshold: %.2f)", expectedAvailabilityPercent, config.Conf.LoraAffinityThreshold) + t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) + t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations) + + if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound { + t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAffinityPercent, affinityLowerBound, affinityUpperBound) + } + if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound { + t.Errorf("Availability selection percent %.2f%% outside expected 
range %.2f%% to %.2f%%", + actualAvailablePercent, availableLowerBound, availableUpperBound) + } +} + +// TestDecisionTreeFilterFactory tests that the DecisionTreeFilterFactory function +// properly instantiates DecisionTreeFilter instances +func TestDecisionTreeFilterFactory(t *testing.T) { + + leastKvCacheFilter := NewLeastKVCacheFilter() + leastQueueFilter := NewLeastQueueFilter() + loraAffinityFilter := NewLoraAffinityFilter(config.Conf.LoraAffinityThreshold) + lowQueueFilter := NewLowQueueFilter(config.Conf.QueueingThresholdLoRA) + + kvCacheScorer := scorer.NewKVCacheScorer() + + testHandle := utils.NewTestHandle(context.Background()) + + testHandle.Plugins().AddPlugin("leastKvCache", leastKvCacheFilter) + testHandle.Plugins().AddPlugin("leastQueue", leastQueueFilter) + testHandle.Plugins().AddPlugin("loraAffinity", loraAffinityFilter) + testHandle.Plugins().AddPlugin("lowQueue", lowQueueFilter) + + testHandle.Plugins().AddPlugin("kvCacheScorer", kvCacheScorer) + + tests := []struct { + name string + parameters string + want *DecisionTreeFilter + wantErr bool + }{ + { + name: "success", + parameters: decisionTreeParametersSuccess, + want: &DecisionTreeFilter{ + Current: lowQueueFilter, + NextOnSuccess: &DecisionTreeFilter{ + Current: loraAffinityFilter, + NextOnSuccessOrFailure: &DecisionTreeFilter{ + Current: leastQueueFilter, + NextOnSuccessOrFailure: &DecisionTreeFilter{ + Current: leastKvCacheFilter, + }, + }, + }, + NextOnFailure: &DecisionTreeFilter{ + Current: leastQueueFilter, + NextOnSuccessOrFailure: &DecisionTreeFilter{ + Current: loraAffinityFilter, + NextOnSuccessOrFailure: &DecisionTreeFilter{ + Current: leastKvCacheFilter, + }, + }, + }, + }, + wantErr: false, + }, + { + name: "bothError", + parameters: decisionTreeParametersErrorBoth, + want: nil, + wantErr: true, + }, + { + name: "noneError", + parameters: decisionTreeParametersErrorNone, + want: nil, + wantErr: true, + }, + { + name: "badPlugin", + parameters: decisionTreeParametersErrorBadPlugin, + want: nil, + wantErr: true, + }, + { + name: "notFilter", + parameters: decisionTreeParametersErrorNotFilter, + want: nil, + wantErr: true, + }, + { + name: "noCurrent", + parameters: decisionTreeParametersErrorNoCurrent, + want: nil, + wantErr: true, + }, + { + name: "badNextOnSuccess", + parameters: decisionTreeParametersErrorBadNextOnSuccess, + want: nil, + wantErr: true, + }, + { + name: "badNextOnFailure", + parameters: decisionTreeParametersErrorBadNextOnFailure, + want: nil, + wantErr: true, + }, + { + name: "badNextOnSuccessOrFailure", + parameters: decisionTreeParametersErrorBadNextOnSuccessOrFailure, + want: nil, + wantErr: true, + }, + } + + cmpOptions := cmpopts.IgnoreUnexported(LeastKVCacheFilter{}, LeastQueueFilter{}, + LoraAffinityFilter{}, LowQueueFilter{}, scorer.KVCacheScorer{}) + + for _, test := range tests { + rawParameters := struct { + Parameters json.RawMessage `json:"parameters"` + }{} + err := json.Unmarshal([]byte(test.parameters), &rawParameters) + if err != nil { + if test.wantErr { + continue + } else { + t.Fatal("failed to parse JSON of test " + test.name) + } + } + got, err := DecisionTreeFilterFactory("testing", rawParameters.Parameters, testHandle) + if err != nil { + if test.wantErr { + continue + } + t.Fatalf("failed to instantiate DecisionTreeFilter. 
error: %s\n", err) + } + if test.wantErr { + t.Fatalf("test %s did not return the expected error", test.name) + } + if diff := cmp.Diff(test.want, got, cmpOptions); diff != "" { + t.Fatalf("In test %s DecisionTreeFactory returned unexpected response, diff(-want, +got): %v", test.name, diff) + } + } +} + +const decisionTreeParametersSuccess = ` +{ + "parameters": { + "current": { + "pluginRef": "lowQueue" + }, + "nextOnSuccess": { + "decisionTree": { + "current": { + "pluginRef": "loraAffinity" + }, + "nextOnSuccessOrFailure": { + "decisionTree": { + "current": { + "pluginRef": "leastQueue" + }, + "nextOnSuccessOrFailure": { + "decisionTree": { + "current": { + "pluginRef": "leastKvCache" + } + } + } + } + } + } + }, + "nextOnFailure": { + "decisionTree": { + "current": { + "pluginRef": "leastQueue" + }, + "nextOnSuccessOrFailure": { + "decisionTree": { + "current": { + "pluginRef": "loraAffinity" + }, + "nextOnSuccessOrFailure": { + "decisionTree": { + "current": { + "pluginRef": "leastKvCache" + } + } + } + } + } + } + } + } +} +` + +const decisionTreeParametersErrorBoth = ` +{ + "parameters": { + "current": { + "pluginRef": "lowQueue", + "decisionTree": { + "current": { + "pluginRef": "leastKvCache" + } + } + } + } +} +` + +const decisionTreeParametersErrorNone = ` +{ + "parameters": { + "current": { + } + } +} +` + +const decisionTreeParametersErrorBadPlugin = ` +{ + "parameters": { + "current": { + "pluginRef": "plover" + } + } +} +` + +const decisionTreeParametersErrorNotFilter = ` +{ + "parameters": { + "current": { + "pluginRef": "kvCacheScorer" + } + } +} +` + +const decisionTreeParametersErrorNoCurrent = ` +{ + "parameters": { + "NextOnSuccess": { + "pluginRef": "lowQueue" + } + } +} +` + +const decisionTreeParametersErrorBadNextOnSuccess = ` +{ + "parameters": { + "current": { + "pluginRef": "lowQueue" + }, + "NextOnSuccess": { + "pluginRef": "kvCacheScorer" + } + } +} +` + +const decisionTreeParametersErrorBadNextOnFailure = ` +{ + "parameters": { + "current": { + "pluginRef": "lowQueue" + }, + "NextOnFailure": { + "pluginRef": "kvCacheScorer" + } + } +} +` + +const decisionTreeParametersErrorBadNextOnSuccessOrFailure = ` +{ + "parameters": { + "current": { + "pluginRef": "lowQueue" + }, + "NextOnSuccessOrFailure": { + "pluginRef": "kvCacheScorer" + } + } +} +` diff --git a/pkg/epp/scheduling/framework/plugins/filter/least_kvcache_filter.go b/pkg/epp/scheduling/framework/plugins/filter/least_kvcache_filter.go new file mode 100644 index 000000000..ffb190cb0 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/filter/least_kvcache_filter.go @@ -0,0 +1,95 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package filter
+
+import (
+	"context"
+	"encoding/json"
+	"math"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	LeastKVCacheFilterType = "least-KV-cache"
+)
+
+// compile-time type validation
+var _ framework.Filter = &LeastKVCacheFilter{}
+
+// LeastKVCacheFilterFactory defines the factory function for LeastKVCacheFilter.
+func LeastKVCacheFilterFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	return NewLeastKVCacheFilter().WithName(name), nil
+}
+
+// NewLeastKVCacheFilter initializes a new LeastKVCacheFilter and returns its pointer.
+func NewLeastKVCacheFilter() *LeastKVCacheFilter {
+	return &LeastKVCacheFilter{
+		name: LeastKVCacheFilterType,
+	}
+}
+
+// LeastKVCacheFilter finds the max and min KV cache of all pods, divides the whole range
+// (max-min) by the number of pods, and finds the pods that fall into the first range.
+// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
+// should consider them all instead of the absolute minimum one. This worked better than picking the
+// least one as it gives more choices for the next filter, which on aggregate gave better results.
+type LeastKVCacheFilter struct {
+	name string
+}
+
+// Type returns the type of the filter.
+func (f *LeastKVCacheFilter) Type() string {
+	return LeastKVCacheFilterType
+}
+
+// Name returns the name of the filter.
+func (f *LeastKVCacheFilter) Name() string {
+	return f.name
+}
+
+// WithName sets the name of the filter.
+func (f *LeastKVCacheFilter) WithName(name string) *LeastKVCacheFilter {
+	f.name = name
+	return f
+}
+
+// Filter filters out pods that don't meet the filter criteria.
+func (f *LeastKVCacheFilter) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
+	filteredPods := []types.Pod{}
+
+	min := math.MaxFloat64
+	var max float64 = 0
+
+	for _, pod := range pods {
+		if pod.GetMetrics().KVCacheUsagePercent <= min {
+			min = pod.GetMetrics().KVCacheUsagePercent
+		}
+		if pod.GetMetrics().KVCacheUsagePercent >= max {
+			max = pod.GetMetrics().KVCacheUsagePercent
+		}
+	}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) {
+			filteredPods = append(filteredPods, pod)
+		}
+	}
+	return filteredPods
+}
diff --git a/pkg/epp/scheduling/framework/plugins/filter/least_queue_filter.go b/pkg/epp/scheduling/framework/plugins/filter/least_queue_filter.go
new file mode 100644
index 000000000..54b619c7e
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/filter/least_queue_filter.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package filter + +import ( + "context" + "encoding/json" + "math" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + LeastQueueFilterType = "least-queue" +) + +// compile-time type validation +var _ framework.Filter = &LeastQueueFilter{} + +// LeastQueueFilterFactory defines the factory function for LeastQueueFilter. +func LeastQueueFilterFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewLeastQueueFilter().WithName(name), nil +} + +// NewLeastQueueFilter initializes a new LeastQueueFilter and returns its pointer. +func NewLeastQueueFilter() *LeastQueueFilter { + return &LeastQueueFilter{ + name: LeastQueueFilterType, + } +} + +// LeastQueueFilter finds the max and min queue size of all pods, divides the whole range +// (max-min) by the number of pods, and finds the pods that fall into the first range. +// The intuition is that if there are multiple pods that share similar queue size in the low range, +// we should consider them all instead of the absolute minimum one. This worked better than picking +// the least one as it gives more choices for the next filter, which on aggregate gave better results. +type LeastQueueFilter struct { + name string +} + +// Type returns the type of the filter. +func (f *LeastQueueFilter) Type() string { + return LeastQueueFilterType +} + +// Name returns the name of the filter. +func (f *LeastQueueFilter) Name() string { + return f.name +} + +// WithName sets the name of the filter. +func (f *LeastQueueFilter) WithName(name string) *LeastQueueFilter { + f.name = name + return f +} + +// Filter filters out pods that doesn't meet the filter criteria. +func (f *LeastQueueFilter) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod { + filteredPods := []types.Pod{} + + min := math.MaxInt + max := 0 + + for _, pod := range pods { + if pod.GetMetrics().WaitingQueueSize <= min { + min = pod.GetMetrics().WaitingQueueSize + } + if pod.GetMetrics().WaitingQueueSize >= max { + max = pod.GetMetrics().WaitingQueueSize + } + } + + for _, pod := range pods { + if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) { + filteredPods = append(filteredPods, pod) + } + } + + return filteredPods +} diff --git a/pkg/epp/scheduling/framework/plugins/filter/lora_affinity_filter.go b/pkg/epp/scheduling/framework/plugins/filter/lora_affinity_filter.go new file mode 100644 index 000000000..edb18a9d4 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/filter/lora_affinity_filter.go @@ -0,0 +1,124 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package filter
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"math/rand"
+	"time"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	LoraAffinityFilterType = "lora-affinity"
+)
+
+type loraAffinityFilterParameters struct {
+	Threshold float64 `json:"threshold"`
+}
+
+// compile-time type validation
+var _ framework.Filter = &LoraAffinityFilter{}
+
+// LoraAffinityFilterFactory defines the factory function for LoraAffinityFilter.
+func LoraAffinityFilterFactory(name string, rawParameters json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	parameters := loraAffinityFilterParameters{Threshold: config.DefaultLoraAffinityThreshold}
+	if err := json.Unmarshal(rawParameters, &parameters); err != nil {
+		return nil, fmt.Errorf("failed to parse the parameters of the '%s' filter - %w", LoraAffinityFilterType, err)
+	}
+	return NewLoraAffinityFilter(parameters.Threshold).WithName(name), nil
+}
+
+// NewLoraAffinityFilter initializes a new LoraAffinityFilter and returns its pointer.
+func NewLoraAffinityFilter(threshold float64) *LoraAffinityFilter {
+	return &LoraAffinityFilter{
+		name:                  LoraAffinityFilterType,
+		loraAffinityThreshold: threshold,
+	}
+}
+
+// LoraAffinityFilter implements a pod selection strategy that prioritizes pods
+// with existing LoRA model affinity while allowing for load balancing through randomization.
+//
+// The function works by:
+// 1. Separating pods into two groups: those with target model affinity and those with available capacity
+// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
+// 3. Falling back to whatever group has pods if one group is empty
+type LoraAffinityFilter struct {
+	name                  string
+	loraAffinityThreshold float64
+}
+
+// Type returns the type of the filter.
+func (f *LoraAffinityFilter) Type() string {
+	return LoraAffinityFilterType
+}
+
+// Name returns the name of the filter.
+func (f *LoraAffinityFilter) Name() string {
+	return f.name
+}
+
+// WithName sets the name of the filter.
+func (f *LoraAffinityFilter) WithName(name string) *LoraAffinityFilter {
+	f.name = name
+	return f
+}
+
+// Filter filters out pods that don't meet the filter criteria.
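+//
+// Illustrative example (the threshold value is hypothetical, not a default
+// defined in this change): with a threshold of 0.95 and both groups
+// non-empty, roughly 95% of requests are routed to pods that already have
+// the target LoRA adapter active or loading, while the remaining ~5% go to
+// pods with free adapter slots so additional replicas can warm up the
+// adapter over time.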
+func (f *LoraAffinityFilter) Filter(_ context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) []types.Pod {
+	// Pre-allocate slices with estimated capacity
+	filteredAffinity := make([]types.Pod, 0, len(pods))
+	filteredAvailable := make([]types.Pod, 0, len(pods))
+
+	// Categorize pods based on affinity and availability
+	for _, pod := range pods {
+		_, active := pod.GetMetrics().ActiveModels[request.TargetModel]
+		_, waiting := pod.GetMetrics().WaitingModels[request.TargetModel]
+
+		if active || waiting {
+			filteredAffinity = append(filteredAffinity, pod)
+		} else if len(pod.GetMetrics().ActiveModels)+len(pod.GetMetrics().WaitingModels) < pod.GetMetrics().MaxActiveModels {
+			filteredAvailable = append(filteredAvailable, pod)
+		}
+	}
+
+	// Seed a local math/rand generator for the probabilistic group selection below.
+	randSource := rand.NewSource(time.Now().UnixNano())
+	randGen := rand.New(randSource)
+
+	// If both groups have pods, use probability to select which group to return
+	if len(filteredAffinity) > 0 && len(filteredAvailable) > 0 {
+		if randGen.Float64() < f.loraAffinityThreshold {
+			return filteredAffinity
+		}
+		return filteredAvailable
+	}
+
+	// Return whichever group has pods
+	if len(filteredAffinity) > 0 {
+		return filteredAffinity
+	}
+
+	return filteredAvailable
+}
diff --git a/pkg/epp/scheduling/framework/plugins/filter/low_queue_filter.go b/pkg/epp/scheduling/framework/plugins/filter/low_queue_filter.go
new file mode 100644
index 000000000..b72ccf857
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/filter/low_queue_filter.go
@@ -0,0 +1,93 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package filter
+
+import (
+	"context"
+	"fmt"
+
+	"encoding/json"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	LowQueueFilterType = "low-queue"
+)
+
+type lowQueueFilterParameters struct {
+	Threshold int `json:"threshold"`
+}
+
+// compile-time type validation
+var _ framework.Filter = &LowQueueFilter{}
+
+// LowQueueFilterFactory defines the factory function for LowQueueFilter.
+func LowQueueFilterFactory(name string, rawParameters json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	parameters := lowQueueFilterParameters{Threshold: config.DefaultQueueingThresholdLoRA}
+	if err := json.Unmarshal(rawParameters, &parameters); err != nil {
+		return nil, fmt.Errorf("failed to parse the parameters of the '%s' filter - %w", LowQueueFilterType, err)
+	}
+
+	return NewLowQueueFilter(parameters.Threshold).WithName(name), nil
+}
+
+// NewLowQueueFilter initializes a new LowQueueFilter and returns its pointer.
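+//
+// A minimal sketch of how the factory above can be wired up; the instance
+// name "lowQueue" and the threshold value are illustrative only:
+//
+//	raw := json.RawMessage(`{"threshold": 128}`)
+//	plugin, err := LowQueueFilterFactory("lowQueue", raw, nil)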
+func NewLowQueueFilter(threshold int) *LowQueueFilter {
+	return &LowQueueFilter{
+		name:                  LowQueueFilterType,
+		queueingThresholdLoRA: threshold,
+	}
+}
+
+// LowQueueFilter returns the pods whose waiting queue size is at most the configured threshold.
+type LowQueueFilter struct {
+	name                  string
+	queueingThresholdLoRA int
+}
+
+// Type returns the type of the filter.
+func (f *LowQueueFilter) Type() string {
+	return LowQueueFilterType
+}
+
+// Name returns the name of the filter.
+func (f *LowQueueFilter) Name() string {
+	return f.name
+}
+
+// WithName sets the name of the filter.
+func (f *LowQueueFilter) WithName(name string) *LowQueueFilter {
+	f.name = name
+	return f
+}
+
+// Filter filters out pods that don't meet the filter criteria.
+func (f *LowQueueFilter) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
+	filteredPods := []types.Pod{}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().WaitingQueueSize <= f.queueingThresholdLoRA {
+			filteredPods = append(filteredPods, pod)
+		}
+	}
+
+	return filteredPods
+}
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
new file mode 100644
index 000000000..716c9f265
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer.go
@@ -0,0 +1,149 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package prefix
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	lru "github.com/hashicorp/golang-lru/v2"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// An indexer maintains an LRU cache of prompt prefix hashes and the server(s) that might have that
+// prefix cached.
+type indexer struct {
+	mu         sync.RWMutex
+	hashToPods map[BlockHash]podSet                         // the lookup data structure to find pods that have the BlockHash cached
+	podToLRU   map[ServerID]*lru.Cache[BlockHash, struct{}] // key is pod namespacedName, value is an LRU cache
+	maxLRUSize int
+}
+
+// newIndexer initializes an indexer with size limits and starts cache size reporting.
+func newIndexer(maxLRUSize int) *indexer {
+	ix := &indexer{
+		hashToPods: make(map[BlockHash]podSet),
+		podToLRU:   make(map[ServerID]*lru.Cache[BlockHash, struct{}]),
+		maxLRUSize: maxLRUSize,
+	}
+
+	go ix.ReportLRUSize(time.Second)
+	return ix
+}
+
+// Add adds a list of prefix hashes to the cache, tied to the server.
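+// Evictions from a pod's LRU remove the matching entries from hashToPods via
+// the callback installed by makeEvictionFn, keeping the two views of the
+// cache consistent.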
+func (i *indexer) Add(hashes []BlockHash, pod ServerID) {
+	i.mu.Lock()
+	// Check whether an LRU cache already exists for this pod.
+	lruForPod, exists := i.podToLRU[pod]
+	if !exists {
+		newLRU, _ := lru.NewWithEvict[BlockHash, struct{}](i.maxLRUSize, i.makeEvictionFn(pod))
+		i.podToLRU[pod] = newLRU
+		lruForPod = newLRU
+	}
+
+	i.mu.Unlock()
+
+	// Add to LRU (may evict)
+	for _, hash := range hashes {
+		lruForPod.Add(hash, struct{}{})
+	}
+
+	// Update hashToPods once under lock
+	i.mu.Lock()
+	for _, hash := range hashes {
+		pods := i.hashToPods[hash]
+		if pods == nil {
+			pods = make(podSet)
+		}
+		pods[pod] = struct{}{}
+		i.hashToPods[hash] = pods
+	}
+
+	i.mu.Unlock()
+}
+
+// Get returns a set of servers that have the given prefix hash cached.
+func (i *indexer) Get(hash BlockHash) podSet {
+	i.mu.RLock()
+	defer i.mu.RUnlock()
+
+	res := podSet{}
+	pods, ok := i.hashToPods[hash]
+	if !ok {
+		return res
+	}
+
+	return pods
+}
+
+// makeEvictionFn returns a per-pod LRU eviction callback that removes the pod from hashToPods on eviction.
+func (i *indexer) makeEvictionFn(pod ServerID) func(BlockHash, struct{}) {
+	return func(hash BlockHash, _ struct{}) {
+		i.mu.Lock()
+		defer i.mu.Unlock()
+		// Remove the pod from the hash→pods map
+		if pods, ok := i.hashToPods[hash]; ok {
+			delete(pods, pod)
+			if len(pods) == 0 {
+				delete(i.hashToPods, hash)
+			}
+		}
+	}
+}
+
+// ReportLRUSize periodically reports the LRU cache size metric; newIndexer starts it in its own goroutine.
+func (i *indexer) ReportLRUSize(interval time.Duration) {
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+	for range ticker.C {
+		i.mu.RLock()
+		totalEntries := 0
+		maxPodEntries := 0
+		maxPodName := ServerID{}
+
+		for pod, lruCache := range i.podToLRU {
+			size := lruCache.Len()
+			totalEntries += size
+			if size > maxPodEntries {
+				maxPodEntries = size
+				maxPodName = pod
+			}
+		}
+
+		numPods := len(i.podToLRU)
+		avg := 0.0
+		if numPods > 0 {
+			avg = float64(totalEntries) / float64(numPods)
+		}
+
+		metrics.RecordPrefixCacheSize(int64(totalEntries))
+		log.FromContext(context.TODO()).V(logutil.TRACE).Info("Prefix cache state",
+			"total entries", totalEntries,
+			"# pods", numPods,
+			"avg entries per pod", avg,
+			"pod with max cache", maxPodName,
+			"max pod size", maxPodEntries,
+			"global max LRU cache capacity per pod", i.maxLRUSize,
+		)
+
+		i.mu.RUnlock()
+	}
+}
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
new file mode 100644
index 000000000..240985033
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/indexer_test.go
@@ -0,0 +1,44 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +package prefix + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestIndexer_AddAndGet(t *testing.T) { + i := newIndexer(2) + + hash1 := BlockHash(1) + server := ServerID{Namespace: "default", Name: "server1"} + // Add an entry to the cache + i.Add([]BlockHash{hash1}, server) + + // Retrieve the entry + assert.Equal(t, 1, i.podToLRU[server].Len(), "Cache size should be 1 after adding an entry") + servers := i.Get(hash1) + assert.Contains(t, servers, server, "Cache should contain the added server") + + // Add another entry to the cache, the cache size should be incremented to 2. + i.Add([]BlockHash{BlockHash(2)}, server) + assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should be 2 after adding an entry") + + // Add another entry to the cache, which should evict the first one due to max size. + i.Add([]BlockHash{BlockHash(3)}, server) + assert.Equal(t, 2, i.podToLRU[server].Len(), "Cache size should still be 2 after adding an entry") +} diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go new file mode 100644 index 000000000..d7fe5d190 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -0,0 +1,270 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package prefix + +import ( + "context" + "encoding/binary" + "encoding/json" + "fmt" + + "github.com/cespare/xxhash/v2" + k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + DefaultScorerWeight = 1 + // vLLM default token block size is 16, and a good guess of average characters per token is 4. + DefaultHashBlockSize = 64 + // The maximum number of blocks to match. Two long requests with the same prefix up to this + // limit will be indistinguishable. + // This parameter provides a trade-off between cache size, prefix matching speed and matching + // accuracy. Use a small value if most requests are short to reduce cache size and speed up the + // matching process. Use a large value if most requests are long to increase the matching accuracy. + DefaultMaxPrefixBlocks = 256 + // The indexer is an approximation to the actual prefix LRU cache state on the model servers per server (pod). + // A small capacity ensures a high accuracy of cache hit on the model server, but it will + // increase the chance of false negatives. A high capacity does the opposite. + // To properly size this, consider the sum of the total number of cache entries on all model + // servers. Consider the llama3 8B model on a H100 80GB GPUs. The size of the model weight is + // about 16GB. 
The remaining HBM used for caching prefixes is 64GB. Each
+	// token is about 128KB in size, so we can cache 500K tokens. Using the default block size of 16
+	// in vLLM, we will have 500K / 16 = 31.25K blocks.
+	DefaultLRUCapacityPerServer = 31250
+
+	PrefixCachePluginType = "prefix-cache"
+)
+
+type Config struct {
+	// The input prompt is broken into blocks of HashBlockSize to calculate block hashes. Requests
+	// with length shorter than the block size will be ignored.
+	HashBlockSize int `json:"hashBlockSize"`
+	// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
+	// be ignored.
+	MaxPrefixBlocksToMatch int `json:"maxPrefixBlocksToMatch"`
+	// Maximum capacity of the LRU indexer, in number of entries per server (pod).
+	LRUCapacityPerServer int `json:"lruCapacityPerServer"`
+}
+
+type Plugin struct {
+	Config
+	name    string
+	indexer Indexer
+}
+
+// podSet holds the set of pods (servers) that may have a specific prefix hash cached.
+type podSet map[ServerID]struct{}
+
+type Indexer interface {
+	Get(hash BlockHash) podSet
+	Add(hashes []BlockHash, server ServerID)
+}
+
+// BlockHash is the hash of a block of the request body.
+type BlockHash uint64
+
+type ServerID k8stypes.NamespacedName
+
+func (s ServerID) String() string {
+	return k8stypes.NamespacedName(s).String()
+}
+
+// compile-time type validation
+var _ types.StateData = &SchedulingContextState{}
+
+// SchedulingContextState is the state of this plugin to be used during a scheduling cycle.
+type SchedulingContextState struct {
+	// PrefixHashes is a list of prefix hashes of the request prompt broken into blocks.
+	PrefixHashes []BlockHash
+	// A map of server to its longest prefix cache match length.
+	PrefixCacheServers map[ServerID]int
+}
+
+func (s *SchedulingContextState) Clone() types.StateData {
+	prefixHashes := make([]BlockHash, len(s.PrefixHashes))
+	copy(prefixHashes, s.PrefixHashes)
+	prefixCacheServers := make(map[ServerID]int, len(s.PrefixCacheServers))
+	for key, value := range s.PrefixCacheServers {
+		prefixCacheServers[key] = value
+	}
+
+	return &SchedulingContextState{
+		PrefixHashes:       prefixHashes,
+		PrefixCacheServers: prefixCacheServers,
+	}
+}
+
+// compile-time type assertion
+var _ framework.Scorer = &Plugin{}
+var _ framework.PostCycle = &Plugin{}
+
+// PrefixCachePluginFactory defines the factory function for Prefix plugin.
+func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	parameters := Config{
+		HashBlockSize:          DefaultHashBlockSize,
+		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
+		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
+	}
+	if err := json.Unmarshal(rawParameters, &parameters); err != nil {
+		return nil, fmt.Errorf("failed to parse the parameters of the %s plugin - %w", PrefixCachePluginType, err)
+	}
+
+	return New(parameters).WithName(name), nil
+}
+
+// New initializes a new prefix Plugin and returns its pointer.
+func New(config Config) *Plugin {
+	capacity := config.LRUCapacityPerServer
+	if capacity <= 0 {
+		capacity = DefaultLRUCapacityPerServer
+		log.FromContext(context.TODO()).V(logutil.DEFAULT).Info(
+			"LRUCapacityPerServer is not positive, using default value",
+			"defaultCapacity", DefaultLRUCapacityPerServer,
+		)
+	}
+
+	return &Plugin{
+		name:    PrefixCachePluginType,
+		Config:  config,
+		indexer: newIndexer(capacity),
+	}
+}
+
+// Type returns the type of the plugin.
+func (m *Plugin) Type() string { + return PrefixCachePluginType +} + +// Name returns the name of the plugin. +func (m *Plugin) Name() string { + return m.name +} + +// WithName sets the name of the plugin. +func (m *Plugin) WithName(name string) *Plugin { + m.name = name + return m +} + +// Score returns the scoring result for the given list of pods based on context. +func (m *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { + loggerTrace := log.FromContext(ctx).V(logutil.TRACE) + // pre score step, hashing prompt and find longest prefix match. + hashes := hashPrompt(ctx, request, m.HashBlockSize, m.MaxPrefixBlocksToMatch) + state := &SchedulingContextState{ + PrefixHashes: hashes, + PrefixCacheServers: m.matchLongestPrefix(ctx, hashes), + } + + cycleState.Write(types.StateKey(m.Type()), state) + loggerTrace.Info(fmt.Sprintf("cached servers: %+v", state.PrefixCacheServers), "hashes", state.PrefixHashes) + // calculate the scores of pods + scores := make(map[types.Pod]float64, len(pods)) + + total := len(state.PrefixHashes) + podScoreFunc := func(pod types.Pod) float64 { + if total == 0 { + return 0 + } + matchLen := state.PrefixCacheServers[ServerID(pod.GetPod().NamespacedName)] + return float64(matchLen) / float64(total) + } + + for _, pod := range pods { + scores[pod] = podScoreFunc(pod) + } + return scores +} + +// PostCycle records in the plugin cache the result of the scheduling selection. +func (m *Plugin) PostCycle(ctx context.Context, cycleState *types.CycleState, res *types.ProfileRunResult) { + targetPod := res.TargetPod.GetPod() + state, err := types.ReadCycleStateKey[*SchedulingContextState](cycleState, PrefixCachePluginType) + if err != nil { + log.FromContext(ctx).Error(err, "failed to read prefix plugin cycle state") + return + } + + m.indexer.Add(state.PrefixHashes, ServerID(targetPod.NamespacedName)) + + total := len(state.PrefixHashes) + matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)] + metrics.RecordPrefixCacheMatch(matchLen*m.HashBlockSize, total*m.HashBlockSize) +} + +// matchLongestPrefix returns a map of servers and length of prefix that each server caches. +func (m *Plugin) matchLongestPrefix(ctx context.Context, hashes []BlockHash) map[ServerID]int { + loggerTrace := log.FromContext(ctx).V(logutil.TRACE) + res := make(map[ServerID]int) + // Use a greedy strategy to search from the longest prefix. + // NOTE: It's possible to further optimize this with a binary search. + for i := 0; i < len(hashes); i++ { + hash := hashes[i] + cachedServers := m.indexer.Get(hash) + if len(cachedServers) == 0 { + break + } else { + loggerTrace.Info("Found cached servers", "cachedServers", cachedServers, "total # blocks", len(hashes), "longest prefix", i) + for server := range cachedServers { + // Update servers with their longest prefix match. + res[server]++ + + } + } + } + return res +} + +// hashPrompt divides the prompt into blocks and calculate the prefix cache for each block. +// hash(0) is the hash of the model name, since different models generally don't share prefix cache. +// For block i, hash(i) = hash(block i content, hash(i-1)). 
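+//
+// A worked example (hypothetical values): with a 4-byte block size and the
+// prompt "aaaabbbb" for model "m", the chain is
+//
+//	hash(0) = xxhash("m")
+//	hash(1) = xxhash("aaaa" + bytes(hash(0)))
+//	hash(2) = xxhash("bbbb" + bytes(hash(1)))
+//
+// so two requests can share hash(i) only if they agree on the model and on
+// every block up to and including block i.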
+func hashPrompt(ctx context.Context, request *types.LLMRequest, cacheBlockSize int, maxPrefixBlocks int) []BlockHash {
+	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
+	prompt := []byte(request.Prompt)
+	if len(prompt) < cacheBlockSize {
+		loggerDebug.Info("Request body too small for prefix cache", "size", len(prompt), "block size", cacheBlockSize)
+		return nil
+	}
+	if len(prompt) > cacheBlockSize*maxPrefixBlocks {
+		loggerDebug.Info("Truncating input", "size", len(prompt), "max prefix blocks", maxPrefixBlocks, "block size", cacheBlockSize)
+		prompt = prompt[:maxPrefixBlocks*cacheBlockSize]
+	}
+	// Split the body into blocks of size cacheBlockSize. The +1 is to account for the model.
+	// If the last block is smaller than cacheBlockSize, it will be ignored.
+	res := make([]BlockHash, 0, 1+len(prompt)/cacheBlockSize)
+	// Add the model to the first block hash so that different models have different hashes even with the same body.
+	res = append(res, BlockHash(xxhash.Sum64String(request.TargetModel)))
+	for i := 0; i+cacheBlockSize <= len(prompt); i += cacheBlockSize {
+		block := prompt[i : i+cacheBlockSize : i+cacheBlockSize] // cap the slice so the append below copies instead of clobbering the next block in prompt
+		prevBlockHash := res[len(res)-1]
+		block = append(block, toBytes(prevBlockHash)...)
+		res = append(res, BlockHash(xxhash.Sum64(block)))
+	}
+	return res
+}
+
+func toBytes(i BlockHash) []byte {
+	bytes := make([]byte, 8)
+	binary.LittleEndian.PutUint64(bytes, uint64(i))
+	return bytes
+}
diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go
new file mode 100644
index 000000000..ea7fab72c
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go
@@ -0,0 +1,200 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package prefix
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"math/rand"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	k8stypes "k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+func TestPrefixPlugin(t *testing.T) {
+
+	config := Config{
+		HashBlockSize:          4,
+		MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
+		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
+	}
+	plugin := New(config)
+
+	pod1 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}
+	pod2 := &types.PodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}
+	pods := []types.Pod{pod1, pod2}
+
+	// First request.
+ req1 := &types.LLMRequest{ + TargetModel: "test-model1", + Prompt: "aaaaaa", + } + cycleState1 := types.NewCycleState() + scores := plugin.Score(context.Background(), cycleState1, req1, pods) + state, err := types.ReadCycleStateKey[*SchedulingContextState](cycleState1, PrefixCachePluginType) + assert.NoError(t, err) + t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers) + // Input size is 6, hash block size is 4, the last 2 characters are ignored. + // Total hashes = 2 (the first one is for the model) + assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect") + assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers") + assert.Equal(t, float64(0), scores[pod1], "score for pod1") + assert.Equal(t, float64(0), scores[pod2], "score for pod2") + + // Simulate pod1 was picked. + plugin.PostCycle(context.Background(), cycleState1, &types.ProfileRunResult{TargetPod: pod1}) + + // Second request doesn't share any prefix with first one. It should be added to the cache but + // the pod score should be 0. + req2 := &types.LLMRequest{ + TargetModel: "test-model2", + Prompt: "bbbbbb", + } + cycleState2 := types.NewCycleState() + scores = plugin.Score(context.Background(), cycleState2, req2, pods) + state, err = types.ReadCycleStateKey[*SchedulingContextState](cycleState2, PrefixCachePluginType) + assert.NoError(t, err) + t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers) + // Input size is 6, hash block size is 4, the last 2 characters are ignored. + // Total hashes = 2 (the first one is for the model) + assert.Equal(t, 2, len(state.PrefixHashes), "number of hashes is incorrect") + assert.Equal(t, 0, len(state.PrefixCacheServers), "there shouldn't be any cached servers") + assert.Equal(t, float64(0), scores[pod1], "score for pod1") + assert.Equal(t, float64(0), scores[pod2], "score for pod2") + + // Simulate pod2 was picked. + plugin.PostCycle(context.Background(), cycleState2, &types.ProfileRunResult{TargetPod: pod2}) + + // Third request shares partial prefix with first one. + req3 := &types.LLMRequest{ + TargetModel: "test-model1", + Prompt: "aaaabbbb", + } + cycleState3 := types.NewCycleState() + scores = plugin.Score(context.Background(), cycleState3, req3, pods) + state, err = types.ReadCycleStateKey[*SchedulingContextState](cycleState3, PrefixCachePluginType) + assert.NoError(t, err) + t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers) + // Input size is 8, hash block size is 4, so 2 hashes will be calculated. + // Total hashes = 3 (the first one is for the model) + assert.Equal(t, 3, len(state.PrefixHashes), "number of hashes is incorrect") + assert.Equal(t, 1, len(state.PrefixCacheServers), "pod1 should have cached the aaaa prefix") + assert.Equal(t, float64(2)/float64(3), scores[pod1], "score should be 2/3 - the model and the first prefix block match") + assert.Equal(t, float64(0), scores[pod2], "score for pod2") + + plugin.PostCycle(context.Background(), cycleState3, &types.ProfileRunResult{TargetPod: pod1}) + + // 4th request is same as req3 except the model is different, still no match. 
+	req4 := &types.LLMRequest{
+		TargetModel: "test-model-new",
+		Prompt:      "aaaabbbb",
+	}
+	cycleState4 := types.NewCycleState()
+	scores = plugin.Score(context.Background(), cycleState4, req4, pods)
+	state, err = types.ReadCycleStateKey[*SchedulingContextState](cycleState4, PrefixCachePluginType)
+	assert.NoError(t, err)
+	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
+	// Input size is 8, hash block size is 4, so 2 hashes will be calculated.
+	// Total hashes = 3 (the first one is for the model)
+	assert.Equal(t, 3, len(state.PrefixHashes), "number of hashes is incorrect")
+	assert.Equal(t, 0, len(state.PrefixCacheServers), "the model is different, so there should be no cached servers")
+	assert.Equal(t, float64(0), scores[pod1], "score for pod1")
+	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
+
+	plugin.PostCycle(context.Background(), cycleState4, &types.ProfileRunResult{TargetPod: pod1})
+
+	// 5th request shares partial prefix with 3rd one.
+	req5 := &types.LLMRequest{
+		TargetModel: "test-model1",
+		Prompt:      "aaaabbbbcccc",
+	}
+	cycleState5 := types.NewCycleState()
+	scores = plugin.Score(context.Background(), cycleState5, req5, pods)
+	state, err = types.ReadCycleStateKey[*SchedulingContextState](cycleState5, PrefixCachePluginType)
+	assert.NoError(t, err)
+	t.Logf("Hashes %+v, cached servers: %+v", state.PrefixHashes, state.PrefixCacheServers)
+	// Input size is 12, hash block size is 4, so 3 hashes will be calculated.
+	// Total hashes = 4 (the first one is for the model)
+	assert.Equal(t, 4, len(state.PrefixHashes), "number of hashes is incorrect")
+	assert.Equal(t, 1, len(state.PrefixCacheServers), "pod1 should have cached the aaaa prefix")
+	assert.Equal(t, 0.75, scores[pod1], "score should be 0.75 - the model and the first 2 prefix blocks match")
+	assert.Equal(t, float64(0), scores[pod2], "score for pod2")
+
+	plugin.PostCycle(context.Background(), cycleState5, &types.ProfileRunResult{TargetPod: pod1})
+}
+
+// BenchmarkPrefixPluginStress is a stress test for the prefix scoring plugin, using prompts of increasing length.
+func BenchmarkPrefixPluginStress(b *testing.B) {
+	blockSize := 4
+	maxPrefixBlocks := 50000
+	config := Config{
+		HashBlockSize:          blockSize,
+		MaxPrefixBlocksToMatch: maxPrefixBlocks,
+		LRUCapacityPerServer:   DefaultLRUCapacityPerServer,
+	}
+
+	plugin := New(config)
+	var promptLen []int
+	for i := 1; i <= 1024; i++ {
+		promptLen = append(promptLen, i)
+	}
+	promptLen = append(promptLen, 2048, 4096, 8192, 10000, 20000, 50000)
+
+	for _, i := range promptLen {
+		// Generate increasing-length random prompts
+		prompt := randomPrompt(4 + i)
+		pod := &types.PodMetrics{
+			Pod: &backend.Pod{
+				NamespacedName: k8stypes.NamespacedName{
+					Name: fmt.Sprintf("random-pod-%d", i),
+				},
+			},
+		}
+
+		pods := []types.Pod{pod}
+		req := &types.LLMRequest{
+			TargetModel: "model-stress",
+			Prompt:      prompt,
+		}
+
+		// First cycle: simulate scheduling and insert prefix info into the cache
+		cycleState := types.NewCycleState()
+		plugin.Score(context.Background(), cycleState, req, pods)
+		plugin.PostCycle(context.Background(), cycleState, &types.ProfileRunResult{TargetPod: pod})
+
+		// Second cycle: validate internal state
+		state, err := types.ReadCycleStateKey[*SchedulingContextState](cycleState, PrefixCachePluginType)
+		assert.NoError(b, err)
+		expectedHashes := int(math.Min(float64(maxPrefixBlocks+1), float64(len(req.Prompt)/blockSize+1))) // the extra one is for the model.
+ assert.Equal(b, expectedHashes, len(state.PrefixHashes), "number of hashes is incorrect") + } +} + +// randomPrompt generates a pseudo-random string of length n using lowercase letters. +func randomPrompt(n int) string { + runes := []rune("abcdefghijklmnopqrstuvwxyz") + var sb strings.Builder + for i := 0; i < n; i++ { + sb.WriteRune(runes[rand.Intn(len(runes))]) + } + return sb.String() +} diff --git a/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go new file mode 100644 index 000000000..db4e8487c --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/picker/max_score_picker.go @@ -0,0 +1,93 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package picker + +import ( + "context" + "encoding/json" + "fmt" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + MaxScorePickerType = "max-score" +) + +// compile-time type validation +var _ framework.Picker = &MaxScorePicker{} + +// MaxScorePickerFactory defines the factory function for MaxScorePicker. +func MaxScorePickerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewMaxScorePicker().WithName(name), nil +} + +// NewMaxScorePicker initializes a new MaxScorePicker and returns its pointer. +func NewMaxScorePicker() *MaxScorePicker { + return &MaxScorePicker{ + name: MaxScorePickerType, + random: NewRandomPicker(), + } +} + +// MaxScorePicker picks the pod with the maximum score from the list of candidates. +type MaxScorePicker struct { + name string + random *RandomPicker +} + +// Type returns the type of the picker. +func (p *MaxScorePicker) Type() string { + return MaxScorePickerType +} + +// Name returns the name of the picker. +func (p *MaxScorePicker) Name() string { + return p.name +} + +// WithName sets the picker's name +func (p *MaxScorePicker) WithName(name string) *MaxScorePicker { + p.name = name + return p +} + +// Pick selects the pod with the maximum score from the list of candidates. 
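+// For example, given scores {podA: 0.2, podB: 0.9, podC: 0.9}, podB and podC
+// tie for the highest score and the winner is drawn uniformly at random from
+// the two via the embedded RandomPicker.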
+func (p *MaxScorePicker) Pick(ctx context.Context, cycleState *types.CycleState, scoredPods []*types.ScoredPod) *types.ProfileRunResult { + log.FromContext(ctx).V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a pod with the max score from %d candidates: %+v", len(scoredPods), scoredPods)) + + highestScorePods := []*types.ScoredPod{} + maxScore := -1.0 // pods min score is 0, putting value lower than 0 in order to find at least one pod as highest + for _, pod := range scoredPods { + if pod.Score > maxScore { + maxScore = pod.Score + highestScorePods = []*types.ScoredPod{pod} + } else if pod.Score == maxScore { + highestScorePods = append(highestScorePods, pod) + } + } + + if len(highestScorePods) > 1 { + return p.random.Pick(ctx, cycleState, highestScorePods) // pick randomly from the highest score pods + } + + return &types.ProfileRunResult{TargetPod: highestScorePods[0]} +} diff --git a/pkg/epp/scheduling/framework/plugins/picker/random_picker.go b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go new file mode 100644 index 000000000..fb2d44b4f --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/picker/random_picker.go @@ -0,0 +1,77 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package picker + +import ( + "context" + "encoding/json" + "fmt" + "math/rand" + + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + RandomPickerType = "random" +) + +// compile-time type validation +var _ framework.Picker = &RandomPicker{} + +// RandomPickerFactory defines the factory function for RandomPicker. +func RandomPickerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewRandomPicker().WithName(name), nil +} + +// NewRandomPicker initializes a new RandomPicker and returns its pointer. +func NewRandomPicker() *RandomPicker { + return &RandomPicker{ + name: RandomPickerType, + } +} + +// RandomPicker picks a random pod from the list of candidates. +type RandomPicker struct { + name string +} + +// Type returns the type of the picker. +func (p *RandomPicker) Type() string { + return RandomPickerType +} + +// Name returns the name of the picker. +func (p *RandomPicker) Name() string { + return p.name +} + +// WithName sets the picker's name +func (p *RandomPicker) WithName(name string) *RandomPicker { + p.name = name + return p +} + +// Pick selects a random pod from the list of candidates. 
+func (p *RandomPicker) Pick(ctx context.Context, _ *types.CycleState, scoredPods []*types.ScoredPod) *types.ProfileRunResult { + log.FromContext(ctx).V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(scoredPods), scoredPods)) + i := rand.Intn(len(scoredPods)) + return &types.ProfileRunResult{TargetPod: scoredPods[i]} +} diff --git a/pkg/epp/scheduling/framework/plugins/profile/single_profile_handler.go b/pkg/epp/scheduling/framework/plugins/profile/single_profile_handler.go new file mode 100644 index 000000000..48d75a6f6 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/profile/single_profile_handler.go @@ -0,0 +1,105 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package profile + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + SingleProfileHandlerType = "single-profile" +) + +// compile-time type assertion +var _ framework.ProfileHandler = &SingleProfileHandler{} + +// SingleProfileHandlerFactory defines the factory function for SingleProfileHandler. +func SingleProfileHandlerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewSingleProfileHandler().WithName(name), nil +} + +// NewSingleProfileHandler initializes a new SingleProfileHandler and returns its pointer. +func NewSingleProfileHandler() *SingleProfileHandler { + return &SingleProfileHandler{ + name: SingleProfileHandlerType, + } +} + +// SingleProfileHandler handles a single profile which is always the primary profile. +type SingleProfileHandler struct { + name string +} + +// Type returns the type of the Profile Handler. +func (h *SingleProfileHandler) Type() string { + return SingleProfileHandlerType +} + +// Name returns the name of the profile handler. +func (h *SingleProfileHandler) Name() string { + return h.name +} + +// WithName sets the name of the profile handler. +func (h *SingleProfileHandler) WithName(name string) *SingleProfileHandler { + h.name = name + return h +} + +// Pick selects the SchedulingProfiles to run from the list of candidate profiles, while taking into consideration the request properties and the +// previously executed cycles along with their results. +func (h *SingleProfileHandler) Pick(_ context.Context, _ *types.CycleState, request *types.LLMRequest, profiles map[string]*framework.SchedulerProfile, + profileResults map[string]*types.ProfileRunResult) map[string]*framework.SchedulerProfile { + if len(profiles) == len(profileResults) { // all profiles have been executed already in previous call + return map[string]*framework.SchedulerProfile{} + } + // return all profiles + return profiles +} + +// ProcessResults handles the outcome of the profile runs after all profiles ran. 
+// It may aggregate results, log test profile outputs, or apply custom logic. It specifies in the SchedulingResult the +// key of the primary profile that should be used to get the request selected destination. +// When a profile run fails, its result in the profileResults map is nil. +func (h *SingleProfileHandler) ProcessResults(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, + profileResults map[string]*types.ProfileRunResult) (*types.SchedulingResult, error) { + if len(profileResults) != 1 { + return nil, errors.New("single profile handler is intended to be used with a single profile, failed to process multiple profiles") + } + + var singleProfileName string + for profileName := range profileResults { + singleProfileName = profileName + break + } + + if profileResults[singleProfileName] == nil { // there was an error while running the profile + return nil, fmt.Errorf("failed to run scheduler profile '%s'", singleProfileName) + } + + return &types.SchedulingResult{ + ProfileResults: profileResults, + PrimaryProfileName: singleProfileName, + }, nil +} diff --git a/pkg/epp/scheduling/framework/plugins/scorer/kvcache.go b/pkg/epp/scheduling/framework/plugins/scorer/kvcache.go new file mode 100644 index 000000000..f0d4af9ae --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/scorer/kvcache.go @@ -0,0 +1,76 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "encoding/json" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +const ( + DefaultKVCacheScorerWeight = 1 + KvCacheScorerType = "kv-cache" +) + +// compile-time type assertion +var _ framework.Scorer = &KVCacheScorer{} + +// KvCacheScorerFactory defines the factory function for KVCacheScorer. +func KvCacheScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) { + return NewKVCacheScorer().WithName(name), nil +} + +// NewKVCacheScorer initializes a new KVCacheScorer and returns its pointer. +func NewKVCacheScorer() *KVCacheScorer { + return &KVCacheScorer{ + name: KvCacheScorerType, + } +} + +// KVCacheScorer scores list of candidate pods based on KV cache utilization. +type KVCacheScorer struct { + name string +} + +// Type returns the type of the scorer. +func (s *KVCacheScorer) Type() string { + return KvCacheScorerType +} + +// Name returns the name of the scorer. +func (s *KVCacheScorer) Name() string { + return s.name +} + +// WithName sets the name of the scorer. +func (s *KVCacheScorer) WithName(name string) *KVCacheScorer { + s.name = name + return s +} + +// Score returns the scoring result for the given list of pods based on context. 
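+// The score is 1 - KVCacheUsagePercent, so a pod at 80% KV cache utilization
+// scores 0.2 while a fully idle pod scores 1.0.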
+func (s *KVCacheScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { + scores := make(map[types.Pod]float64, len(pods)) + for _, pod := range pods { + scores[pod] = 1 - pod.GetMetrics().KVCacheUsagePercent + } + return scores +} diff --git a/pkg/epp/scheduling/framework/plugins/scorer/kvcache_test.go b/pkg/epp/scheduling/framework/plugins/scorer/kvcache_test.go new file mode 100644 index 000000000..c0eeb5210 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/scorer/kvcache_test.go @@ -0,0 +1,94 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +func TestKvCacheScorer(t *testing.T) { + tests := []struct { + name string + pods []types.Pod + expectedScoresPod map[int]float64 // Map of pod index to expected score + }{ + { + name: "Different KV cache utilization", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.8}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.5}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 0.2, // Highest KV cache usage (0.8) gets lowest score (1-0.8=0.2) + 1: 0.5, // Medium KV cache usage (0.5) gets medium score (1-0.5=0.5) + 2: 1.0, // No KV cache usage (0.0) gets highest score (1-0=1.0) + }, + }, + { + name: "Same KV cache utilization", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.6}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.6}}, + }, + expectedScoresPod: map[int]float64{ + 0: 0.4, // Both get same score (1-0.6=0.4) + 1: 0.4, + }, + }, + { + name: "Zero KV cache utilization", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.0}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 1.0, // No KV cache usage gets highest score + 1: 1.0, + }, + }, + { + name: "Full KV cache utilization", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 1.0}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{KVCacheUsagePercent: 0.5}}, + }, + expectedScoresPod: map[int]float64{ + 0: 0.0, // Full KV cache (1.0) gets lowest score (1-1=0) + 1: 0.5, // Half KV cache (0.5) gets medium score 
(1-0.5=0.5)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			scorer := &KVCacheScorer{}
+			scores := scorer.Score(context.Background(), types.NewCycleState(), &types.LLMRequest{}, test.pods)
+
+			for i, pod := range test.pods {
+				expectedScore := test.expectedScoresPod[i]
+				assert.InDelta(t, expectedScore, scores[pod], 0.0001, "Pod %d should have score %f", i, expectedScore)
+			}
+		})
+	}
+}
diff --git a/pkg/epp/scheduling/framework/plugins/scorer/queue.go b/pkg/epp/scheduling/framework/plugins/scorer/queue.go
new file mode 100644
index 000000000..dad6d0411
--- /dev/null
+++ b/pkg/epp/scheduling/framework/plugins/scorer/queue.go
@@ -0,0 +1,102 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scorer
+
+import (
+	"context"
+	"encoding/json"
+	"math"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+const (
+	DefaultQueueScorerWeight = 1
+	QueueScorerType          = "queue"
+)
+
+// compile-time type assertion
+var _ framework.Scorer = &QueueScorer{}
+
+// QueueScorerFactory defines the factory function for QueueScorer.
+func QueueScorerFactory(name string, _ json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
+	return NewQueueScorer().WithName(name), nil
+}
+
+// NewQueueScorer initializes a new QueueScorer and returns its pointer.
+func NewQueueScorer() *QueueScorer {
+	return &QueueScorer{
+		name: QueueScorerType,
+	}
+}
+
+// QueueScorer scores a list of candidate pods based on each pod's waiting queue size.
+// The smaller a pod's waiting queue, the higher its score, since the pod has more capacity to serve new requests.
+type QueueScorer struct {
+	name string
+}
+
+// Type returns the type of the scorer.
+func (s *QueueScorer) Type() string {
+	return QueueScorerType
+}
+
+// Name returns the name of the scorer.
+func (s *QueueScorer) Name() string {
+	return s.name
+}
+
+// WithName sets the name of the scorer.
+func (s *QueueScorer) WithName(name string) *QueueScorer {
+	s.name = name
+	return s
+}
+
+// Score returns the scoring result for the given list of pods based on context.
+func (s *QueueScorer) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
+	minQueueSize := math.MaxInt
+	maxQueueSize := math.MinInt
+
+	// Iterate through the pods to find the min and max queue sizes.
+	for _, pod := range pods {
+		queueSize := pod.GetMetrics().WaitingQueueSize
+		if queueSize < minQueueSize {
+			minQueueSize = queueSize
+		}
+		if queueSize > maxQueueSize {
+			maxQueueSize = queueSize
+		}
+	}
+
+	// podScoreFunc calculates the score based on the queue size of each pod. Longer queue gets a lower score.
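+	// For example, queue sizes {0, 5, 10} normalize to scores {1.0, 0.5, 0.0}.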
+ podScoreFunc := func(pod types.Pod) float64 { + if maxQueueSize == minQueueSize { + // If all pods have the same queue size, return a neutral score + return 1.0 + } + return float64(maxQueueSize-pod.GetMetrics().WaitingQueueSize) / float64(maxQueueSize-minQueueSize) + } + + // Create a map to hold the scores for each pod + scores := make(map[types.Pod]float64, len(pods)) + for _, pod := range pods { + scores[pod] = podScoreFunc(pod) + } + return scores +} diff --git a/pkg/epp/scheduling/framework/plugins/scorer/queue_test.go b/pkg/epp/scheduling/framework/plugins/scorer/queue_test.go new file mode 100644 index 000000000..a9a8115b3 --- /dev/null +++ b/pkg/epp/scheduling/framework/plugins/scorer/queue_test.go @@ -0,0 +1,84 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scorer + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +func TestQueueScorer(t *testing.T) { + tests := []struct { + name string + pods []types.Pod + expectedScoresPod map[int]float64 // Map of pod index to expected score + }{ + { + name: "Different queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 10}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 5}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 0.0, // Longest queue (10) gets lowest score + 1: 0.5, // Medium queue (5) gets medium score + 2: 1.0, // Shortest queue (0) gets highest score + }, + }, + { + name: "Same queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 5}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 5}}, + }, + expectedScoresPod: map[int]float64{ + 0: 1.0, // When all pods have the same queue size, they get the same neutral score + 1: 1.0, + }, + }, + { + name: "Zero queue sizes", + pods: []types.Pod{ + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 0}}, + &types.PodMetrics{Pod: &backend.Pod{}, MetricsState: &backendmetrics.MetricsState{WaitingQueueSize: 0}}, + }, + expectedScoresPod: map[int]float64{ + 0: 1.0, + 1: 1.0, + }, + }, + } + + scorer := &QueueScorer{} + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scores := scorer.Score(context.Background(), types.NewCycleState(), &types.LLMRequest{}, test.pods) + + for i, pod := range test.pods { + expectedScore := test.expectedScoresPod[i] + assert.InDelta(t, expectedScore, scores[pod], 0.0001, "Pod %d should have score %f", i, expectedScore) + } + }) 
+	}
+}
diff --git a/pkg/epp/scheduling/framework/scheduler_profile.go b/pkg/epp/scheduling/framework/scheduler_profile.go
new file mode 100644
index 000000000..f41a915f0
--- /dev/null
+++ b/pkg/epp/scheduling/framework/scheduler_profile.go
@@ -0,0 +1,193 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+// NewSchedulerProfile creates a new SchedulerProfile object and returns its pointer.
+func NewSchedulerProfile() *SchedulerProfile {
+	return &SchedulerProfile{
+		filters:          []Filter{},
+		scorers:          []*WeightedScorer{},
+		postCyclePlugins: []PostCycle{},
+		// picker remains nil; a profile holds exactly one picker, set via WithPicker or AddPlugins
+	}
+}
+
+// SchedulerProfile provides a profile configuration for the scheduler which influences routing decisions.
+type SchedulerProfile struct {
+	filters          []Filter
+	scorers          []*WeightedScorer
+	picker           Picker
+	postCyclePlugins []PostCycle
+}
+
+// WithFilters sets the given filter plugins as the Filter plugins.
+// If the SchedulerProfile already has Filter plugins, this call replaces them with the given ones.
+func (p *SchedulerProfile) WithFilters(filters ...Filter) *SchedulerProfile {
+	p.filters = filters
+	return p
+}
+
+// WithScorers sets the given scorer plugins as the Scorer plugins.
+// If the SchedulerProfile already has Scorer plugins, this call replaces them with the given ones.
+func (p *SchedulerProfile) WithScorers(scorers ...*WeightedScorer) *SchedulerProfile {
+	p.scorers = scorers
+	return p
+}
+
+// WithPicker sets the given picker plugin as the Picker plugin.
+// If the SchedulerProfile already has a Picker plugin, this call replaces it with the given one.
+func (p *SchedulerProfile) WithPicker(picker Picker) *SchedulerProfile {
+	p.picker = picker
+	return p
+}
+
+// WithPostCyclePlugins sets the given plugins as the PostCycle plugins.
+// If the SchedulerProfile has PostCycle plugins, this call replaces the existing plugins with the given ones.
+func (p *SchedulerProfile) WithPostCyclePlugins(plugins ...PostCycle) *SchedulerProfile {
+	p.postCyclePlugins = plugins
+	return p
+}
+
+// AddPlugins adds the given plugins to all scheduler plugins according to the interfaces each plugin implements.
+// A plugin may implement more than one scheduler plugin interface.
+// Special case: to add a scorer, one must use the NewWeightedScorer function, which attaches a weight.
+// If a scorer implements more than one interface, supplying a WeightedScorer is sufficient. The function will
+// unwrap the internal scorer object and register it with all interfaces it implements.
+func (p *SchedulerProfile) AddPlugins(pluginObjects ...plugins.Plugin) error {
+	for _, plugin := range pluginObjects {
+		if weightedScorer, ok := plugin.(*WeightedScorer); ok {
+			p.scorers = append(p.scorers, weightedScorer)
+			plugin = weightedScorer.Scorer // if we got a WeightedScorer, unwrap the plugin
+		} else if scorer, ok := plugin.(Scorer); ok { // if we got a Scorer instead of a WeightedScorer, that's an error.
+			return fmt.Errorf("failed to register scorer '%s' without a weight; follow the function documentation to register a scorer", scorer.Type())
+		}
+		if filter, ok := plugin.(Filter); ok {
+			p.filters = append(p.filters, filter)
+		}
+		if picker, ok := plugin.(Picker); ok {
+			if p.picker != nil {
+				return fmt.Errorf("failed to set '%s' as picker, already have a registered picker plugin '%s'", picker.Type(), p.picker.Type())
+			}
+			p.picker = picker
+		}
+		if postCyclePlugin, ok := plugin.(PostCycle); ok {
+			p.postCyclePlugins = append(p.postCyclePlugins, postCyclePlugin)
+		}
+	}
+	return nil
+}
+
+// Run runs a SchedulerProfile cycle. In other words, it invokes all the SchedulerProfile plugins in this
+// order: Filters, Scorers, Picker, PostCyclePlugins. After completing all of them, it returns the result.
+func (p *SchedulerProfile) Run(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, candidatePods []types.Pod) (*types.ProfileRunResult, error) {
+	pods := p.runFilterPlugins(ctx, request, cycleState, candidatePods)
+	if len(pods) == 0 {
+		return nil, errutil.Error{Code: errutil.Internal, Msg: "no pods available for the given request"}
+	}
+	// if we got here, there is at least one pod to score
+	weightedScorePerPod := p.runScorerPlugins(ctx, request, cycleState, pods)
+
+	result := p.runPickerPlugin(ctx, cycleState, weightedScorePerPod)
+
+	p.runPostCyclePlugins(ctx, cycleState, result)
+
+	return result, nil
+}
+
+func (p *SchedulerProfile) runFilterPlugins(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) []types.Pod {
+	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
+	filteredPods := pods
+	loggerDebug.Info("Before running filter plugins", "pods", filteredPods)
+
+	for _, filter := range p.filters {
+		loggerDebug.Info("Running filter plugin", "plugin", filter.Type())
+		before := time.Now()
+		filteredPods = filter.Filter(ctx, cycleState, request, filteredPods)
+		metrics.RecordSchedulerPluginProcessingLatency(FilterPluginType, filter.Type(), time.Since(before))
+		loggerDebug.Info("Filter plugin result", "plugin", filter.Type(), "pods", filteredPods)
+		if len(filteredPods) == 0 {
+			break
+		}
+	}
+	loggerDebug.Info("After running filter plugins")
+
+	return filteredPods
+}
+
+func (p *SchedulerProfile) runScorerPlugins(ctx context.Context, request *types.LLMRequest, cycleState *types.CycleState, pods []types.Pod) map[types.Pod]float64 {
+	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
+	loggerDebug.Info("Before running scorer plugins", "pods", pods)
+
+	weightedScorePerPod := make(map[types.Pod]float64, len(pods))
+	for _, pod := range pods {
+		weightedScorePerPod[pod] = float64(0) // initialize weighted score per pod with 0 value
+	}
+	// Iterate through each scorer in the chain and accumulate the weighted scores.
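+	// Illustrative example (hypothetical numbers, not taken from this codebase): with
+	// scorer A at weight 2 and scorer B at weight 1, a pod that A scores 0.5 and B
+	// scores 1.0 accumulates 0.5*2 + 1.0*1 = 2.0. Weights act as absolute multipliers;
+	// no normalization by the sum of weights happens here.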
+	for _, scorer := range p.scorers {
+		loggerDebug.Info("Running scorer", "scorer", scorer.Type())
+		before := time.Now()
+		scores := scorer.Score(ctx, cycleState, request, pods)
+		metrics.RecordSchedulerPluginProcessingLatency(ScorerPluginType, scorer.Type(), time.Since(before))
+		for pod, score := range scores { // accumulate each score multiplied by the scorer's weight
+			weightedScorePerPod[pod] += score * float64(scorer.Weight())
+		}
+		loggerDebug.Info("After running scorer", "scorer", scorer.Type())
+	}
+	loggerDebug.Info("After running scorer plugins")
+
+	return weightedScorePerPod
+}
+
+func (p *SchedulerProfile) runPickerPlugin(ctx context.Context, cycleState *types.CycleState, weightedScorePerPod map[types.Pod]float64) *types.ProfileRunResult {
+	loggerDebug := log.FromContext(ctx).V(logutil.DEBUG)
+	scoredPods := make([]*types.ScoredPod, len(weightedScorePerPod))
+	i := 0
+	for pod, score := range weightedScorePerPod {
+		scoredPods[i] = &types.ScoredPod{Pod: pod, Score: score}
+		i++
+	}
+
+	loggerDebug.Info("Before running picker plugin", "pods weighted score", fmt.Sprint(weightedScorePerPod))
+	before := time.Now()
+	result := p.picker.Pick(ctx, cycleState, scoredPods)
+	metrics.RecordSchedulerPluginProcessingLatency(PickerPluginType, p.picker.Type(), time.Since(before))
+	loggerDebug.Info("After running picker plugin", "result", result)
+
+	return result
+}
+
+func (p *SchedulerProfile) runPostCyclePlugins(ctx context.Context, cycleState *types.CycleState, result *types.ProfileRunResult) {
+	for _, plugin := range p.postCyclePlugins {
+		log.FromContext(ctx).V(logutil.DEBUG).Info("Running post-cycle plugin", "plugin", plugin.Type())
+		before := time.Now()
+		plugin.PostCycle(ctx, cycleState, result)
+		metrics.RecordSchedulerPluginProcessingLatency(PostCyclePluginType, plugin.Type(), time.Since(before))
+	}
+}
diff --git a/pkg/epp/scheduling/framework/scheduler_profile_test.go b/pkg/epp/scheduling/framework/scheduler_profile_test.go
new file mode 100644
index 000000000..d94bb26ca
--- /dev/null
+++ b/pkg/epp/scheduling/framework/scheduler_profile_test.go
@@ -0,0 +1,268 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/uuid"
+	k8stypes "k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+func TestSchedulePlugins(t *testing.T) {
+	tp1 := &testPlugin{
+		TypeRes:   "test1",
+		ScoreRes:  0.3,
+		FilterRes: []k8stypes.NamespacedName{{Name: "pod1"}, {Name: "pod2"}, {Name: "pod3"}},
+	}
+	tp2 := &testPlugin{
+		TypeRes:   "test2",
+		ScoreRes:  0.8,
+		FilterRes: []k8stypes.NamespacedName{{Name: "pod1"}, {Name: "pod2"}},
+	}
+	tp_filterAll := &testPlugin{
+		TypeRes:   "filter all",
+		FilterRes: []k8stypes.NamespacedName{},
+	}
+	pickerPlugin := &testPlugin{
+		TypeRes: "picker",
+		PickRes: k8stypes.NamespacedName{Name: "pod1"},
+	}
+
+	tests := []struct {
+		name           string
+		profile        *SchedulerProfile
+		input          []backendmetrics.PodMetrics
+		wantTargetPod  k8stypes.NamespacedName
+		targetPodScore float64
+		// Number of expected pods to score (after filter)
+		numPodsToScore int
+		err            bool
+	}{
+		{
+			name: "all plugins executed successfully, all scorers with same weight",
+			profile: NewSchedulerProfile().
+				WithFilters(tp1, tp2).
+				WithScorers(NewWeightedScorer(tp1, 1), NewWeightedScorer(tp2, 1)).
+				WithPicker(pickerPlugin).
+				WithPostCyclePlugins(tp1, tp2),
+			input: []backendmetrics.PodMetrics{
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
+			},
+			wantTargetPod:  k8stypes.NamespacedName{Name: "pod1"},
+			targetPodScore: 1.1,
+			numPodsToScore: 2,
+			err:            false,
+		},
+		{
+			name: "all plugins executed successfully, different scorers weights",
+			profile: NewSchedulerProfile().
+				WithFilters(tp1, tp2).
+				WithScorers(NewWeightedScorer(tp1, 60), NewWeightedScorer(tp2, 40)).
+				WithPicker(pickerPlugin).
+				WithPostCyclePlugins(tp1, tp2),
+			input: []backendmetrics.PodMetrics{
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
+			},
+			wantTargetPod:  k8stypes.NamespacedName{Name: "pod1"},
+			targetPodScore: 50,
+			numPodsToScore: 2,
+			err:            false,
+		},
+		{
+			name: "filter all",
+			profile: NewSchedulerProfile().
+				WithFilters(tp1, tp_filterAll).
+				WithScorers(NewWeightedScorer(tp1, 1), NewWeightedScorer(tp2, 1)).
+				WithPicker(pickerPlugin).
+				WithPostCyclePlugins(tp1, tp2),
+			input: []backendmetrics.PodMetrics{
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
+				&backendmetrics.FakePodMetrics{Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
+			},
+			numPodsToScore: 0,
+			err:            true, // no available pods to serve after filter all
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Reset all plugins before each new test case.
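+			// The same plugin instances are shared across the table-driven cases,
+			// so their call counters must be cleared before each run.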
+			for _, plugin := range test.profile.filters {
+				plugin.(*testPlugin).reset()
+			}
+			for _, plugin := range test.profile.scorers {
+				plugin.Scorer.(*testPlugin).reset()
+			}
+			test.profile.picker.(*testPlugin).reset()
+			for _, plugin := range test.profile.postCyclePlugins {
+				plugin.(*testPlugin).reset()
+			}
+
+			// Initialize the scheduling context
+			request := &types.LLMRequest{
+				TargetModel: "test-model",
+				RequestId:   uuid.NewString(),
+			}
+			// Run profile cycle
+			got, err := test.profile.Run(context.Background(), request, types.NewCycleState(), types.ToSchedulerPodMetrics(test.input))
+
+			// Validate error state
+			if test.err != (err != nil) {
+				t.Fatalf("Unexpected error, got %v, want %v", err, test.err)
+			}
+
+			if err != nil {
+				return
+			}
+
+			// Validate output
+			wantPod := &types.PodMetrics{
+				Pod: &backend.Pod{NamespacedName: test.wantTargetPod, Labels: make(map[string]string)},
+			}
+			wantRes := &types.ProfileRunResult{
+				TargetPod: wantPod,
+			}
+
+			if diff := cmp.Diff(wantRes, got); diff != "" {
+				t.Errorf("Unexpected output (-want +got): %v", diff)
+			}
+			// Validate plugin execution counts dynamically
+			for _, plugin := range test.profile.filters {
+				tp, _ := plugin.(*testPlugin)
+				if tp.FilterCallCount != 1 {
+					t.Errorf("Plugin %s Filter() called %d times, expected 1", plugin.Type(), tp.FilterCallCount)
+				}
+			}
+			for _, plugin := range test.profile.scorers {
+				tp, _ := plugin.Scorer.(*testPlugin)
+				if tp.ScoreCallCount != 1 {
+					t.Errorf("Plugin %s Score() called %d times, expected 1", plugin.Type(), tp.ScoreCallCount)
+				}
+				if test.numPodsToScore != tp.NumOfScoredPods {
+					t.Errorf("Plugin %s Score() called with %d pods, expected %d", plugin.Type(), tp.NumOfScoredPods, test.numPodsToScore)
+				}
+			}
+			tp, _ := test.profile.picker.(*testPlugin)
+			if tp.NumOfPickerCandidates != test.numPodsToScore {
+				t.Errorf("Picker plugin %s Pick() called with %d candidates, expected %d", tp.Type(), tp.NumOfPickerCandidates, test.numPodsToScore)
+			}
+			if tp.PickCallCount != 1 {
+				t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.Type(), tp.PickCallCount)
+			}
+			if tp.WinnerPodScore != test.targetPodScore {
+				t.Errorf("winner pod score %v, expected %v", tp.WinnerPodScore, test.targetPodScore)
+			}
+			for _, plugin := range test.profile.postCyclePlugins {
+				tp, _ := plugin.(*testPlugin)
+				if tp.PostScheduleCallCount != 1 {
+					t.Errorf("Plugin %s PostCycle() called %d times, expected 1", plugin.Type(), tp.PostScheduleCallCount)
+				}
+			}
+		})
+	}
+}
+
+// compile-time type assertion
+var _ Filter = &testPlugin{}
+var _ Scorer = &testPlugin{}
+var _ Picker = &testPlugin{}
+var _ PostCycle = &testPlugin{}
+
+// testPlugin is an implementation useful in unit tests.
+type testPlugin struct {
+	TypeRes               string
+	ScoreCallCount        int
+	NumOfScoredPods       int
+	ScoreRes              float64
+	FilterCallCount       int
+	FilterRes             []k8stypes.NamespacedName
+	PostScheduleCallCount int
+	PickCallCount         int
+	NumOfPickerCandidates int
+	PickRes               k8stypes.NamespacedName
+	WinnerPodScore        float64
+}
+
+func (tp *testPlugin) Type() string { return tp.TypeRes }
+func (tp *testPlugin) Name() string { return "test-plugin" }
+
+func (tp *testPlugin) Filter(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) []types.Pod {
+	tp.FilterCallCount++
+	return findPods(pods, tp.FilterRes...)
+}
+
+func (tp *testPlugin) Score(_ context.Context, _ *types.CycleState, _ *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
+	tp.ScoreCallCount++
+	scoredPods := make(map[types.Pod]float64, len(pods))
+	for _, pod := range pods {
+		scoredPods[pod] += tp.ScoreRes
+	}
+	tp.NumOfScoredPods = len(scoredPods)
+	return scoredPods
+}
+
+func (tp *testPlugin) Pick(_ context.Context, _ *types.CycleState, scoredPods []*types.ScoredPod) *types.ProfileRunResult {
+	tp.PickCallCount++
+	tp.NumOfPickerCandidates = len(scoredPods)
+
+	var winnerPod types.Pod
+	for _, scoredPod := range scoredPods {
+		if scoredPod.GetPod().NamespacedName.String() == tp.PickRes.String() {
+			winnerPod = scoredPod.Pod
+			tp.WinnerPodScore = scoredPod.Score
+		}
+	}
+
+	return &types.ProfileRunResult{TargetPod: winnerPod}
+}
+
+func (tp *testPlugin) PostCycle(_ context.Context, _ *types.CycleState, _ *types.ProfileRunResult) {
+	tp.PostScheduleCallCount++
+}
+
+func (tp *testPlugin) reset() {
+	tp.FilterCallCount = 0
+	tp.ScoreCallCount = 0
+	tp.NumOfScoredPods = 0
+	tp.PostScheduleCallCount = 0
+	tp.PickCallCount = 0
+	tp.NumOfPickerCandidates = 0
+	tp.WinnerPodScore = 0
+}
+
+func findPods(pods []types.Pod, names ...k8stypes.NamespacedName) []types.Pod {
+	res := []types.Pod{}
+	for _, pod := range pods {
+		for _, name := range names {
+			if pod.GetPod().NamespacedName.String() == name.String() {
+				res = append(res, pod)
+			}
+		}
+	}
+	return res
+}
diff --git a/pkg/epp/scheduling/framework/weighted_scorer.go b/pkg/epp/scheduling/framework/weighted_scorer.go
new file mode 100644
index 000000000..3b8d80a42
--- /dev/null
+++ b/pkg/epp/scheduling/framework/weighted_scorer.go
@@ -0,0 +1,36 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package framework
+
+// NewWeightedScorer initializes a new WeightedScorer and returns its pointer.
+func NewWeightedScorer(scorer Scorer, weight int) *WeightedScorer {
+	return &WeightedScorer{
+		Scorer: scorer,
+		weight: weight,
+	}
+}
+
+// WeightedScorer is a struct that encapsulates a scorer with its weight.
+type WeightedScorer struct {
+	Scorer
+	weight int
+}
+
+// Weight returns the weight of the scorer.
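+// For example (illustrative values only), a scorer wrapped with NewWeightedScorer(s, 3)
+// contributes each of its per-pod scores multiplied by 3 when the profile accumulates
+// weighted scores.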
+func (s *WeightedScorer) Weight() int {
+	return s.weight
+}
diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go
index e874724d0..b848b26dc 100644
--- a/pkg/epp/scheduling/scheduler.go
+++ b/pkg/epp/scheduling/scheduler.go
@@ -20,140 +20,116 @@ package scheduling
 
 import (
 	"context"
 	"fmt"
-	"math/rand"
+	"time"
 
-	"github.com/go-logr/logr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
 	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
-	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
-	envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env"
-	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/filter"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/picker"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework/plugins/profile"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-// Config holds all the configuration values for the scheduler
-type Config struct {
-	KVCacheThreshold       float64
-	QueueThresholdCritical int
-	QueueingThresholdLoRA  int
-	LoraAffinityThreshold  float64
+type Datastore interface {
+	PodGetAll() []backendmetrics.PodMetrics
 }
 
-const (
-	// Default values to use if environment variables are not set
-	defaultKVCacheThreshold       = 0.8
-	defaultQueueThresholdCritical = 5
-	defaultQueueingThresholdLoRA  = 128
-	defaultLoraAffinityThreshold  = 0.999
-)
-
-// LoadConfig loads configuration from environment variables
-func LoadConfig() Config {
-	// Use a default logger for initial configuration loading
-	baseLogger := log.Log.WithName("scheduling-config")
-
-	config := Config{
-		KVCacheThreshold:       envutil.GetEnvFloat("KV_CACHE_THRESHOLD", defaultKVCacheThreshold, baseLogger),
-		QueueThresholdCritical: envutil.GetEnvInt("QUEUE_THRESHOLD_CRITICAL", defaultQueueThresholdCritical, baseLogger),
-		QueueingThresholdLoRA:  envutil.GetEnvInt("QUEUING_THRESHOLD_LORA", defaultQueueingThresholdLoRA, baseLogger),
-		LoraAffinityThreshold:  envutil.GetEnvFloat("LORA_AFFINITY_THRESHOLD", defaultLoraAffinityThreshold, baseLogger),
-	}
-
-	baseLogger.V(logutil.DEFAULT).Info("Scheduler configuration loaded", "config", config)
-
-	return config
-}
-
-var config = LoadConfig()
-
-var (
-	defaultFilter = &filter{
-		name:          "critical request",
-		filter:        toFilterFunc(criticalRequestPredicate),
-		nextOnSuccess: lowLatencyFilter,
-		nextOnFailure: sheddableRequestFilter,
-	}
-
-	// queueLoRAAndKVCacheFilter applied least queue -> low cost lora -> least KV Cache filter
-	queueLoRAAndKVCacheFilter = &filter{
-		name:   "least queuing",
-		filter: leastQueuingFilterFunc,
-		nextOnSuccessOrFailure: &filter{
-			name:   "low cost LoRA",
-			filter: loRASoftAffinityFilter,
-			nextOnSuccessOrFailure: &filter{
-				name:   "least KV cache percent",
-				filter: leastKVCacheFilterFunc,
+// NewScheduler returns a new scheduler with default scheduler plugins configuration.
+func NewScheduler() *Scheduler {
+	// When the scheduler is initialized with the NewScheduler function, the config below is used as the default.
+	// It is possible to call NewSchedulerWithConfig to pass a different scheduler config.
+	// For build-time plugin changes, it is recommended to call NewSchedulerWithConfig from main.go.
+	loraAffinityFilter := filter.NewLoraAffinityFilter(config.Conf.LoraAffinityThreshold)
+	leastQueueFilter := filter.NewLeastQueueFilter()
+	leastKvCacheFilter := filter.NewLeastKVCacheFilter()
+
+	lowLatencyFilter := &filter.DecisionTreeFilter{
+		Current: filter.NewLowQueueFilter(config.Conf.QueueingThresholdLoRA),
+		NextOnSuccess: &filter.DecisionTreeFilter{
+			Current: loraAffinityFilter,
+			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
+				Current: leastQueueFilter,
+				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
+					Current: leastKvCacheFilter,
+				},
+			},
+		},
-	}
-
-	// queueAndKVCacheFilter applies least queue followed by least KV Cache filter
-	queueAndKVCacheFilter = &filter{
-		name:   "least queuing",
-		filter: leastQueuingFilterFunc,
-		nextOnSuccessOrFailure: &filter{
-			name:   "least KV cache percent",
-			filter: leastKVCacheFilterFunc,
+		NextOnFailure: &filter.DecisionTreeFilter{
+			Current: leastQueueFilter,
+			NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
+				Current: loraAffinityFilter,
+				NextOnSuccessOrFailure: &filter.DecisionTreeFilter{
+					Current: leastKvCacheFilter,
+				},
+			},
+		},
 	}
 
-	lowLatencyFilter = &filter{
-		name:   "low queueing filter",
-		filter: toFilterFunc((lowQueueingPodPredicate)),
-		nextOnSuccess: &filter{
-			name:   "affinity LoRA",
-			filter: loRASoftAffinityFilter,
-			nextOnSuccessOrFailure: queueAndKVCacheFilter,
-		},
-		nextOnFailure: queueLoRAAndKVCacheFilter,
-	}
+	defaultProfile := framework.NewSchedulerProfile().
+		WithFilters(lowLatencyFilter).
+		WithPicker(&picker.RandomPicker{})
 
-	sheddableRequestFilter = &filter{
-		// When there is at least one model server that's not queuing requests, and still has KV
-		// cache below a certain threshold, we consider this model server has capacity to handle
-		// a sheddable request without impacting critical requests.
-		name:          "has capacity for sheddable requests",
-		filter:        toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(config.QueueThresholdCritical, config.KVCacheThreshold)),
-		nextOnSuccess: queueLoRAAndKVCacheFilter,
-		// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
-		// request to make room for critical requests.
-		nextOnFailure: &filter{
-			name: "drop request",
-			filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
-				logger.V(logutil.DEFAULT).Info("Request dropped", "request", req)
-				return []backendmetrics.PodMetrics{}, errutil.Error{
-					Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources",
-				}
-			},
-		},
-	}
-)
+	profileHandler := profile.NewSingleProfileHandler()
 
-func NewScheduler(datastore datastore.Datastore) *Scheduler {
+	return NewSchedulerWithConfig(NewSchedulerConfig(profileHandler, map[string]*framework.SchedulerProfile{"default": defaultProfile}))
+}
+
+// NewSchedulerWithConfig returns a new scheduler with the given scheduler plugins configuration.
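+// A minimal usage sketch (plugin constructors are the ones used in the default
+// configuration above; the profile name "custom" is illustrative):
+//
+//	p := framework.NewSchedulerProfile().
+//		WithFilters(filter.NewLeastQueueFilter()).
+//		WithPicker(&picker.RandomPicker{})
+//	handler := profile.NewSingleProfileHandler()
+//	scheduler := NewSchedulerWithConfig(NewSchedulerConfig(handler, map[string]*framework.SchedulerProfile{"custom": p}))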
+func NewSchedulerWithConfig(config *SchedulerConfig) *Scheduler { return &Scheduler{ - datastore: datastore, - filter: defaultFilter, + profileHandler: config.profileHandler, + profiles: config.profiles, } } type Scheduler struct { - datastore datastore.Datastore - filter Filter + profileHandler framework.ProfileHandler + profiles map[string]*framework.SchedulerProfile } // Schedule finds the target pod based on metrics and the requested lora adapter. -func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backendmetrics.PodMetrics, err error) { - logger := log.FromContext(ctx).WithValues("request", req) - - podMetrics := s.datastore.PodGetAll() - logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", podMetrics)) +func (s *Scheduler) Schedule(ctx context.Context, request *types.LLMRequest, candidatePods []types.Pod) (*types.SchedulingResult, error) { + logger := log.FromContext(ctx).WithValues("request", request) + loggerDebug := logger.V(logutil.DEBUG) + + scheduleStart := time.Now() + defer func() { + metrics.RecordSchedulerE2ELatency(time.Since(scheduleStart)) + }() + + profileRunResults := map[string]*types.ProfileRunResult{} + cycleState := types.NewCycleState() + + for { // get the next set of profiles to run iteratively based on the request and the previous execution results + before := time.Now() + profiles := s.profileHandler.Pick(ctx, cycleState, request, s.profiles, profileRunResults) + metrics.RecordSchedulerPluginProcessingLatency(framework.ProfilePickerType, s.profileHandler.Type(), time.Since(before)) + if len(profiles) == 0 { // profile picker didn't pick any profile to run + break + } + + for name, profile := range profiles { + // run the selected profiles and collect results (current code runs all profiles) + profileRunResult, err := profile.Run(ctx, request, cycleState, candidatePods) + if err != nil { + loggerDebug.Info("failed to run scheduler profile", "profile", name, "error", err.Error()) + } + + profileRunResults[name] = profileRunResult // if profile failed to run, the run result is nil + } + } - pods, err := s.filter.Filter(logger, req, podMetrics) - if err != nil || len(pods) == 0 { - return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) + if len(profileRunResults) == 0 { + return nil, fmt.Errorf("failed to run any SchedulingProfile for the request - %s", request) } - logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(pods), pods)) - i := rand.Intn(len(pods)) - return pods[i], nil + + before := time.Now() + result, err := s.profileHandler.ProcessResults(ctx, cycleState, request, profileRunResults) + metrics.RecordSchedulerPluginProcessingLatency(framework.ProcessProfilesResultsType, s.profileHandler.Type(), time.Since(before)) + + return result, err } diff --git a/pkg/epp/scheduling/scheduler_config.go b/pkg/epp/scheduling/scheduler_config.go new file mode 100644 index 000000000..41524f6da --- /dev/null +++ b/pkg/epp/scheduling/scheduler_config.go @@ -0,0 +1,35 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+import (
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/framework"
+)
+
+// NewSchedulerConfig creates a new SchedulerConfig object and returns its pointer.
+func NewSchedulerConfig(profileHandler framework.ProfileHandler, profiles map[string]*framework.SchedulerProfile) *SchedulerConfig {
+	return &SchedulerConfig{
+		profileHandler: profileHandler,
+		profiles:       profiles,
+	}
+}
+
+// SchedulerConfig provides a configuration for the scheduler which influences routing decisions.
+type SchedulerConfig struct {
+	profileHandler framework.ProfileHandler
+	profiles       map[string]*framework.SchedulerProfile
+}
diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go
new file mode 100644
index 000000000..720a6b4f5
--- /dev/null
+++ b/pkg/epp/scheduling/scheduler_test.go
@@ -0,0 +1,133 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+import (
+	"context"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/uuid"
+	k8stypes "k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// Tests the default scheduler configuration and expected behavior.
+func TestSchedule(t *testing.T) {
+	tests := []struct {
+		name    string
+		req     *types.LLMRequest
+		input   []backendmetrics.PodMetrics
+		wantRes *types.SchedulingResult
+		err     bool
+	}{
+		{
+			name: "no candidate pods",
+			req: &types.LLMRequest{
+				TargetModel: "any-model",
+				RequestId:   uuid.NewString(),
+			},
+			input:   []backendmetrics.PodMetrics{},
+			wantRes: nil,
+			err:     true,
+		},
+		{
+			name: "finds optimal pod",
+			req: &types.LLMRequest{
+				TargetModel: "critical",
+				RequestId:   uuid.NewString(),
+			},
+			// pod2 will be picked because it has relatively low queue size, with the requested
+			// model being active, and has low KV cache.
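+			// Under the default profile built by NewScheduler, the low-queue filter first
+			// passes pods whose queue is below the LoRA queueing threshold, the LoRA
+			// affinity filter then prefers pods with the target model active, and the
+			// least-queue and least-KV-cache filters break the remaining ties.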
+ input: []backendmetrics.PodMetrics{ + &backendmetrics.FakePodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.MetricsState{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + &backendmetrics.FakePodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.MetricsState{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + &backendmetrics.FakePodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.MetricsState{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, + wantRes: &types.SchedulingResult{ + ProfileResults: map[string]*types.ProfileRunResult{ + "default": { + TargetPod: &types.ScoredPod{ + Pod: &types.PodMetrics{ + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}, Labels: make(map[string]string)}, + MetricsState: &backendmetrics.MetricsState{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + WaitingModels: map[string]int{}, + }, + }, + }, + }, + }, + PrimaryProfileName: "default", + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scheduler := NewScheduler() + got, err := scheduler.Schedule(context.Background(), test.req, types.ToSchedulerPodMetrics(test.input)) + if test.err != (err != nil) { + t.Errorf("Unexpected error, got %v, want %v", err, test.err) + } + + if diff := cmp.Diff(test.wantRes, got); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} diff --git a/pkg/epp/scheduling/types/cycle_state.go b/pkg/epp/scheduling/types/cycle_state.go new file mode 100644 index 000000000..789ece245 --- /dev/null +++ b/pkg/epp/scheduling/types/cycle_state.go @@ -0,0 +1,111 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + "errors" + "fmt" + "sync" +) + +var ( + // ErrNotFound is the not found error message. + ErrNotFound = errors.New("not found") +) + +// StateData is a generic type for arbitrary data stored in CycleState. +type StateData interface { + // Clone is an interface to make a copy of StateData. + Clone() StateData +} + +// StateKey is the type of keys stored in CycleState. +type StateKey string + +// NewCycleState initializes a new CycleState and returns its pointer. +func NewCycleState() *CycleState { + return &CycleState{} +} + +// CycleState provides a mechanism for plugins to store and retrieve arbitrary data. +// StateData stored by one plugin can be read, altered, or deleted by another plugin. 
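+// For example, a filter plugin may Write("my-plugin/state", data) during its run and a
+// PostCycle plugin may later Read the same key (the key name here is illustrative).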
+// CycleState does not provide any data protection, as all plugins are assumed to be
+// trusted.
+// Note: CycleState uses a sync.Map to back the storage, because it is thread-safe and
+// optimized for the "write once and read many times" scenario.
+type CycleState struct {
+	// key: StateKey, value: StateData
+	storage sync.Map
+}
+
+// Clone creates a copy of CycleState and returns its pointer. Clone returns
+// nil if the state being cloned is nil.
+func (c *CycleState) Clone() *CycleState {
+	if c == nil {
+		return nil
+	}
+	copy := NewCycleState()
+	// Deep-copy each stored value so that later writes to the clone do not affect the original.
+	c.storage.Range(func(k, v any) bool {
+		copy.storage.Store(k, v.(StateData).Clone())
+		return true
+	})
+
+	return copy
+}
+
+// Read retrieves data with the given "key" from CycleState. If the key is not
+// present, ErrNotFound is returned.
+//
+// See CycleState for notes on concurrency.
+func (c *CycleState) Read(key StateKey) (StateData, error) {
+	if v, ok := c.storage.Load(key); ok {
+		return v.(StateData), nil
+	}
+	return nil, ErrNotFound
+}
+
+// Write stores the given "val" in CycleState with the given "key".
+//
+// See CycleState for notes on concurrency.
+func (c *CycleState) Write(key StateKey, val StateData) {
+	c.storage.Store(key, val)
+}
+
+// Delete deletes data with the given key from CycleState.
+//
+// See CycleState for notes on concurrency.
+func (c *CycleState) Delete(key StateKey) {
+	c.storage.Delete(key)
+}
+
+// ReadCycleStateKey retrieves data with the given key from CycleState and asserts it to type T.
+// Returns an error if the key is not found or the type assertion fails.
+func ReadCycleStateKey[T StateData](c *CycleState, key StateKey) (T, error) {
+	var zero T
+
+	raw, err := c.Read(key)
+	if err != nil {
+		return zero, err
+	}
+
+	val, ok := raw.(T)
+	if !ok {
+		return zero, fmt.Errorf("unexpected type for key %q: got %T", key, raw)
+	}
+
+	return val, nil
+}
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
new file mode 100644
index 000000000..451384751
--- /dev/null
+++ b/pkg/epp/scheduling/types/types.go
@@ -0,0 +1,90 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package types
+
+import (
+	"fmt"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+)
+
+// LLMRequest is a structured representation of the fields we parse out of the request body.
+type LLMRequest struct {
+	// RequestId is the Envoy-generated ID for the request being processed.
+	RequestId string
+	// TargetModel is the final target model after traffic split.
+	TargetModel string
+	// Prompt is the prompt that was sent in the request body.
+	Prompt string
+	// Headers is a map of the request headers.
+ Headers map[string]string +} + +func (r *LLMRequest) String() string { + return fmt.Sprintf("RequestID: %s, TargetModel: %s, PromptLength: %d, Headers: %v", r.RequestId, r.TargetModel, len(r.Prompt), r.Headers) +} + +type Pod interface { + GetPod() *backend.Pod + GetMetrics() *backendmetrics.MetricsState + String() string +} + +type ScoredPod struct { + Pod + Score float64 +} + +func (pm *PodMetrics) String() string { + if pm == nil { + return "" + } + return fmt.Sprintf("%+v", *pm) +} + +func (pm *PodMetrics) GetPod() *backend.Pod { + return pm.Pod +} + +func (pm *PodMetrics) GetMetrics() *backendmetrics.MetricsState { + return pm.MetricsState +} + +type PodMetrics struct { + *backend.Pod + *backendmetrics.MetricsState +} + +func ToSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []Pod { + pm := make([]Pod, 0, len(pods)) + for _, pod := range pods { + pm = append(pm, &PodMetrics{Pod: pod.GetPod().Clone(), MetricsState: pod.GetMetrics().Clone()}) + } + return pm +} + +// ProfileRunResult captures the profile run result. +type ProfileRunResult struct { + TargetPod Pod +} + +// SchedulingResult captures the result of the scheduling cycle. +type SchedulingResult struct { + ProfileResults map[string]*ProfileRunResult + PrimaryProfileName string +} diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go index 41fe86a93..89e509696 100644 --- a/pkg/epp/server/controller_manager.go +++ b/pkg/epp/server/controller_manager.go @@ -22,6 +22,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -29,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) @@ -36,42 +38,43 @@ var scheme = runtime.NewScheme() func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.Install(scheme)) } -// DefaultManagerOptions returns the default options used to create the manager. -func DefaultManagerOptions(namespace, name string) ctrl.Options { +// defaultManagerOptions returns the default options used to create the manager. +func defaultManagerOptions(namespacedName types.NamespacedName, metricsServerOptions metricsserver.Options) ctrl.Options { return ctrl.Options{ Scheme: scheme, Cache: cache.Options{ ByObject: map[client.Object]cache.ByObject{ &corev1.Pod{}: { Namespaces: map[string]cache.Config{ - namespace: {}, + namespacedName.Namespace: {}, }, }, &v1alpha2.InferencePool{}: { Namespaces: map[string]cache.Config{ - namespace: { + namespacedName.Namespace: { FieldSelector: fields.SelectorFromSet(fields.Set{ - "metadata.name": name, + "metadata.name": namespacedName.Name, }), }, }, }, &v1alpha2.InferenceModel{}: { Namespaces: map[string]cache.Config{ - namespace: {}, + namespacedName.Namespace: {}, }, }, }, }, + Metrics: metricsServerOptions, } } // NewDefaultManager creates a new controller manager with default configuration. 
-func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { - manager, err := ctrl.NewManager(restConfig, DefaultManagerOptions(namespace, name)) +func NewDefaultManager(namespacedName types.NamespacedName, restConfig *rest.Config, metricsServerOptions metricsserver.Options) (ctrl.Manager, error) { + manager, err := ctrl.NewManager(restConfig, defaultManagerOptions(namespacedName, metricsServerOptions)) if err != nil { return nil, fmt.Errorf("failed to create controller manager: %v", err) } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index a6c9f1d37..7b79ae90b 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -20,12 +20,15 @@ import ( "context" "crypto/tls" "fmt" + "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" "google.golang.org/grpc" "google.golang.org/grpc/credentials" + "google.golang.org/grpc/health" + healthgrpc "google.golang.org/grpc/health/grpc_health_v1" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -35,7 +38,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" ) // ExtProcServerRunner provides methods to manage an external process server. @@ -43,15 +46,16 @@ type ExtProcServerRunner struct { GrpcPort int DestinationEndpointHintMetadataNamespace string DestinationEndpointHintKey string - PoolName string - PoolNamespace string + PoolNamespacedName types.NamespacedName Datastore datastore.Datastore SecureServing bool + HealthChecking bool CertPath string - UseStreaming bool RefreshPrometheusMetricsInterval time.Duration + Director *requestcontrol.Director + SaturationDetector requestcontrol.SaturationDetector - // This should only be used in tests. We won't need this once we don't inject metrics in the tests. + // This should only be used in tests. We won't need this once we do not inject metrics in the tests. // TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup TestPodMetricsClient *backendmetrics.FakePodMetricsClient } @@ -66,17 +70,21 @@ const ( DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval DefaultSecureServing = true // default for --secureServing + DefaultHealthChecking = false // default for --healthChecking ) +// NewDefaultExtProcServerRunner creates a runner with default values. +// Note: Dependencies like Datastore, Scheduler, SD need to be set separately. func NewDefaultExtProcServerRunner() *ExtProcServerRunner { return &ExtProcServerRunner{ GrpcPort: DefaultGrpcPort, DestinationEndpointHintKey: DefaultDestinationEndpointHintKey, DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace, - PoolName: DefaultPoolName, - PoolNamespace: DefaultPoolNamespace, + PoolNamespacedName: types.NamespacedName{Name: DefaultPoolName, Namespace: DefaultPoolNamespace}, SecureServing: DefaultSecureServing, - // Datastore can be assigned later. 
+ HealthChecking: DefaultHealthChecking, + RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, + // Dependencies can be assigned later. } } @@ -85,34 +93,27 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man // Create the controllers and register them with the manager if err := (&controller.InferencePoolReconciler{ Datastore: r.Datastore, - Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferencePool"), + Reader: mgr.GetClient(), + Record: mgr.GetEventRecorderFor("InferencePool"), }).SetupWithManager(mgr); err != nil { return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err) } if err := (&controller.InferenceModelReconciler{ - Datastore: r.Datastore, - Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferenceModel"), + Datastore: r.Datastore, + Reader: mgr.GetClient(), + PoolNamespacedName: r.PoolNamespacedName, + Record: mgr.GetEventRecorderFor("InferenceModel"), }).SetupWithManager(ctx, mgr); err != nil { return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } if err := (&controller.PodReconciler{ Datastore: r.Datastore, - Client: mgr.GetClient(), + Reader: mgr.GetClient(), Record: mgr.GetEventRecorderFor("pod"), }).SetupWithManager(mgr); err != nil { - return fmt.Errorf("failed setting up EndpointSliceReconciler: %v", err) + return fmt.Errorf("failed setting up PodReconciler: %v", err) } return nil } @@ -145,19 +146,28 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } else { srv = grpc.NewServer() } - var extProcServer extProcPb.ExternalProcessorServer - if r.UseStreaming { - logger.Info("Using streaming extproc server") - extProcServer = handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) - } else { - logger.Info("Using standard extproc server") - extProcServer = handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) - } + + extProcServer := handlers.NewStreamingServer( + r.DestinationEndpointHintMetadataNamespace, + r.DestinationEndpointHintKey, + r.Datastore, + r.Director, + ) extProcPb.RegisterExternalProcessorServer( srv, extProcServer, ) + if r.HealthChecking { + healthcheck := health.NewServer() + healthgrpc.RegisterHealthServer(srv, + healthcheck, + ) + svcName := extProcPb.ExternalProcessor_ServiceDesc.ServiceName + logger.Info("Setting ExternalProcessor service status to SERVING", "serviceName", svcName) + healthcheck.SetServingStatus(svcName, healthgrpc.HealthCheckResponse_SERVING) + } + // Forward to the gRPC runnable. return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) })) diff --git a/pkg/epp/server/server_test.go b/pkg/epp/server/server_test.go new file mode 100644 index 000000000..3696f5a71 --- /dev/null +++ b/pkg/epp/server/server_test.go @@ -0,0 +1,192 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "context" + "fmt" + "testing" + + pb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + "sigs.k8s.io/gateway-api-inference-extension/test/utils" +) + +const ( + bufSize = 1024 * 1024 + podName = "pod1" + podAddress = "1.2.3.4" + poolPort = int32(5678) + destinationEndpointHintKey = "test-target" + namespace = "ns1" +) + +func TestServer(t *testing.T) { + theHeaderValue := "body" + requestHeader := "x-test" + + expectedRequestHeaders := map[string]string{destinationEndpointHintKey: fmt.Sprintf("%s:%d", podAddress, poolPort), + "Content-Length": "42", ":method": "POST", requestHeader: theHeaderValue} + expectedResponseHeaders := map[string]string{"x-went-into-resp-headers": "true", ":method": "POST", requestHeader: theHeaderValue} + expectedSchedulerHeaders := map[string]string{":method": "POST", requestHeader: theHeaderValue} + + t.Run("server", func(t *testing.T) { + tsModel := "food-review" + model := testutil.MakeInferenceModel("v1"). + CreationTimestamp(metav1.Unix(1000, 0)). 
+ ModelName(tsModel).ObjRef() + + director := &testDirector{} + ctx, cancel, ds, _ := utils.PrepareForTestStreamingServer([]*v1alpha2.InferenceModel{model}, + []*v1.Pod{{ObjectMeta: metav1.ObjectMeta{Name: podName}}}, "test-pool1", namespace, poolPort) + + streamingServer := handlers.NewStreamingServer(namespace, destinationEndpointHintKey, ds, director) + + testListener, errChan := utils.SetupTestStreamingServer(t, ctx, ds, streamingServer) + process, conn := utils.GetStreamingServerClient(ctx, t) + defer conn.Close() + + // Send request headers - no response expected + headers := utils.BuildEnvoyGRPCHeaders(map[string]string{requestHeader: theHeaderValue, ":method": "POST"}, true) + request := &pb.ProcessingRequest{ + Request: &pb.ProcessingRequest_RequestHeaders{ + RequestHeaders: headers, + }, + } + err := process.Send(request) + if err != nil { + t.Error("Error sending request headers", err) + } + + // Send request body + requestBody := "{\"model\":\"food-review\",\"prompt\":\"Is banana tasty?\"}" + expectedBody := "{\"model\":\"v1\",\"prompt\":\"Is banana tasty?\"}" + request = &pb.ProcessingRequest{ + Request: &pb.ProcessingRequest_RequestBody{ + RequestBody: &pb.HttpBody{ + Body: []byte(requestBody), + EndOfStream: true, + }, + }, + } + err = process.Send(request) + if err != nil { + t.Error("Error sending request body", err) + } + + // Receive request headers and check + responseReqHeaders, err := process.Recv() + if err != nil { + t.Error("Error receiving response", err) + } else { + if responseReqHeaders == nil || responseReqHeaders.GetRequestHeaders() == nil || + responseReqHeaders.GetRequestHeaders().Response == nil || + responseReqHeaders.GetRequestHeaders().Response.HeaderMutation == nil || + responseReqHeaders.GetRequestHeaders().Response.HeaderMutation.SetHeaders == nil { + t.Error("Invalid request headers response") + } else if !utils.CheckEnvoyGRPCHeaders(t, responseReqHeaders.GetRequestHeaders().Response, expectedRequestHeaders) { + t.Error("Incorrect request headers") + } + } + + // Receive request body and check + responseReqBody, err := process.Recv() + if err != nil { + t.Error("Error receiving response", err) + } else { + if responseReqBody == nil || responseReqBody.GetRequestBody() == nil || + responseReqBody.GetRequestBody().Response == nil || + responseReqBody.GetRequestBody().Response.BodyMutation == nil || + responseReqBody.GetRequestBody().Response.BodyMutation.GetStreamedResponse() == nil { + t.Error("Invalid request body response") + } else { + body := responseReqBody.GetRequestBody().Response.BodyMutation.GetStreamedResponse().Body + if string(body) != expectedBody { + t.Errorf("Incorrect body %s expected %s", string(body), expectedBody) + } + } + } + + // Check headers passed to the scheduler + if len(director.requestHeaders) != 2 { + t.Errorf("Incorrect number of request headers %d instead of 2", len(director.requestHeaders)) + } + for expectedKey, expectedValue := range expectedSchedulerHeaders { + got, ok := director.requestHeaders[expectedKey] + if !ok { + t.Errorf("Missing header %s", expectedKey) + } else if got != expectedValue { + t.Errorf("Incorrect value for header %s, want %s got %s", expectedKey, expectedValue, got) + } + } + + // Send response headers + headers = utils.BuildEnvoyGRPCHeaders(map[string]string{requestHeader: theHeaderValue, ":method": "POST"}, false) + request = &pb.ProcessingRequest{ + Request: &pb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: headers, + }, + } + err = process.Send(request) + if err != nil { + 
t.Error("Error sending response", err) + } + + // Receive response headers and check + response, err := process.Recv() + if err != nil { + t.Error("Error receiving response", err) + } else { + if response == nil || response.GetResponseHeaders() == nil || response.GetResponseHeaders().Response == nil || + response.GetResponseHeaders().Response.HeaderMutation == nil || + response.GetResponseHeaders().Response.HeaderMutation.SetHeaders == nil { + t.Error("Invalid response") + } else if !utils.CheckEnvoyGRPCHeaders(t, response.GetResponseHeaders().Response, expectedResponseHeaders) { + t.Error("Incorrect response headers") + } + } + + cancel() + <-errChan + testListener.Close() + }) +} + +type testDirector struct { + requestHeaders map[string]string +} + +func (ts *testDirector) HandleRequest(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { + ts.requestHeaders = reqCtx.Request.Headers + + reqCtx.Request.Body["model"] = "v1" + reqCtx.TargetEndpoint = fmt.Sprintf("%s:%d", podAddress, poolPort) + return reqCtx, nil +} + +func (ts *testDirector) HandleResponse(ctx context.Context, reqCtx *handlers.RequestContext) (*handlers.RequestContext, error) { + return reqCtx, nil +} + +func (ts *testDirector) GetRandomPod() *backend.Pod { + return nil +} diff --git a/pkg/epp/util/env/env.go b/pkg/epp/util/env/env.go index 11e3bde19..37cb0738d 100644 --- a/pkg/epp/util/env/env.go +++ b/pkg/epp/util/env/env.go @@ -1,51 +1,74 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package env import ( + "fmt" "os" + "reflect" "strconv" + "time" "github.com/go-logr/logr" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -// getEnvFloat gets a float64 from an environment variable with a default value -func GetEnvFloat(key string, defaultVal float64, logger logr.Logger) float64 { - val, exists := os.LookupEnv(key) +// getEnvWithParser retrieves an environment variable. If set, it uses the provided parser to parse it. +// It logs success or failure and returns the parsed value or the default value in case of a failure. 
+func getEnvWithParser[T any](key string, defaultVal T, parser func(string) (T, error), logger logr.Logger) T { + valueStr, exists := os.LookupEnv(key) if !exists { - logger.V(logutil.VERBOSE).Info("Environment variable not set, using default value", - "key", key, "defaultValue", defaultVal) + logger.Info("Environment variable not set, using default value", "key", key, "defaultValue", defaultVal) return defaultVal } - floatVal, err := strconv.ParseFloat(val, 64) + parsedValue, err := parser(valueStr) if err != nil { - logger.V(logutil.VERBOSE).Info("Failed to parse environment variable as float, using default value", - "key", key, "value", val, "error", err, "defaultValue", defaultVal) + logger.Info(fmt.Sprintf("Failed to parse environment variable as %s, using default value", reflect.TypeOf(defaultVal)), + "key", key, "rawValue", valueStr, "error", err, "defaultValue", defaultVal) return defaultVal } - logger.V(logutil.VERBOSE).Info("Successfully loaded environment variable", - "key", key, "value", floatVal) - return floatVal + logger.Info("Successfully loaded environment variable", "key", key, "value", parsedValue) + return parsedValue } -// getEnvInt gets an int from an environment variable with a default value +// GetEnvFloat gets a float64 from an environment variable with a default value. +func GetEnvFloat(key string, defaultVal float64, logger logr.Logger) float64 { + parser := func(s string) (float64, error) { return strconv.ParseFloat(s, 64) } + return getEnvWithParser(key, defaultVal, parser, logger) +} + +// GetEnvInt gets an int from an environment variable with a default value. func GetEnvInt(key string, defaultVal int, logger logr.Logger) int { - val, exists := os.LookupEnv(key) - if !exists { - logger.V(logutil.VERBOSE).Info("Environment variable not set, using default value", - "key", key, "defaultValue", defaultVal) - return defaultVal - } + return getEnvWithParser(key, defaultVal, strconv.Atoi, logger) +} - intVal, err := strconv.Atoi(val) - if err != nil { - logger.V(logutil.VERBOSE).Info("Failed to parse environment variable as int, using default value", - "key", key, "value", val, "error", err, "defaultValue", defaultVal) - return defaultVal - } +// GetEnvDuration gets a time.Duration from an environment variable with a default value. +func GetEnvDuration(key string, defaultVal time.Duration, logger logr.Logger) time.Duration { + return getEnvWithParser(key, defaultVal, time.ParseDuration, logger) +} + +// GetEnvBool gets a boolean from an environment variable with a default value. +func GetEnvBool(key string, defaultVal bool, logger logr.Logger) bool { + return getEnvWithParser(key, defaultVal, strconv.ParseBool, logger) +} - logger.V(logutil.VERBOSE).Info("Successfully loaded environment variable", - "key", key, "value", intVal) - return intVal +// GetEnvString gets a string from an environment variable with a default value. 
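+// For example (hypothetical variable name), GetEnvString("POOL_NAME", "default-pool", logger)
+// returns "default-pool" when POOL_NAME is unset, and the raw string value otherwise.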
+func GetEnvString(key string, defaultVal string, logger logr.Logger) string { + parser := func(s string) (string, error) { return s, nil } + return getEnvWithParser(key, defaultVal, parser, logger) } diff --git a/pkg/epp/util/env/env_test.go b/pkg/epp/util/env/env_test.go index 02513e283..4cf0223db 100644 --- a/pkg/epp/util/env/env_test.go +++ b/pkg/epp/util/env/env_test.go @@ -3,6 +3,7 @@ package env import ( "os" "testing" + "time" "github.com/go-logr/logr/testr" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -14,7 +15,6 @@ func TestGetEnvFloat(t *testing.T) { tests := []struct { name string key string - value string defaultVal float64 expected float64 setup func() @@ -23,7 +23,6 @@ func TestGetEnvFloat(t *testing.T) { { name: "env variable exists and is valid", key: "TEST_FLOAT", - value: "123.456", defaultVal: 0.0, expected: 123.456, setup: func() { @@ -36,7 +35,6 @@ func TestGetEnvFloat(t *testing.T) { { name: "env variable exists but is invalid", key: "TEST_FLOAT", - value: "invalid", defaultVal: 99.9, expected: 99.9, setup: func() { @@ -69,13 +67,82 @@ func TestGetEnvFloat(t *testing.T) { } } +func TestGetEnvDuration(t *testing.T) { + logger := testr.New(t) + + tests := []struct { + name string + key string + defaultVal time.Duration + expected time.Duration + setup func() + teardown func() + }{ + { + name: "env variable exists and is valid", + key: "TEST_DURATION", + defaultVal: 0, + expected: 1*time.Hour + 30*time.Minute, + setup: func() { + os.Setenv("TEST_DURATION", "1h30m") + }, + teardown: func() { + os.Unsetenv("TEST_DURATION") + }, + }, + { + name: "env variable exists but is invalid", + key: "TEST_DURATION", + defaultVal: 5 * time.Minute, + expected: 5 * time.Minute, + setup: func() { + os.Setenv("TEST_DURATION", "invalid-duration") + }, + teardown: func() { + os.Unsetenv("TEST_DURATION") + }, + }, + { + name: "env variable does not exist", + key: "TEST_DURATION_MISSING", + defaultVal: 10 * time.Second, + expected: 10 * time.Second, + setup: func() {}, + teardown: func() {}, + }, + { + name: "env variable is empty string", + key: "TEST_DURATION_EMPTY", + defaultVal: 1 * time.Millisecond, + expected: 1 * time.Millisecond, + setup: func() { + os.Setenv("TEST_DURATION_EMPTY", "") + }, + teardown: func() { + os.Unsetenv("TEST_DURATION_EMPTY") + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tc.setup() + defer tc.teardown() + + result := GetEnvDuration(tc.key, tc.defaultVal, logger.V(logutil.VERBOSE)) + if result != tc.expected { + t.Errorf("GetEnvDuration(%s, %v) = %v, expected %v", tc.key, tc.defaultVal, result, tc.expected) + } + }) + } +} + func TestGetEnvInt(t *testing.T) { logger := testr.New(t) tests := []struct { name string key string - value string defaultVal int expected int setup func() @@ -84,7 +151,6 @@ func TestGetEnvInt(t *testing.T) { { name: "env variable exists and is valid", key: "TEST_INT", - value: "123", defaultVal: 0, expected: 123, setup: func() { @@ -97,7 +163,6 @@ func TestGetEnvInt(t *testing.T) { { name: "env variable exists but is invalid", key: "TEST_INT", - value: "invalid", defaultVal: 99, expected: 99, setup: func() { @@ -118,7 +183,6 @@ func TestGetEnvInt(t *testing.T) { { name: "env variable is empty string", key: "TEST_INT_EMPTY", - value: "", defaultVal: 77, expected: 77, setup: func() { @@ -142,3 +206,131 @@ func TestGetEnvInt(t *testing.T) { }) } } + +func TestGetEnvBool(t *testing.T) { + logger := testr.New(t) + + tests := []struct { + name string + key string + 
defaultVal bool + expected bool + setup func() + teardown func() + }{ + { + name: "env variable exists and is valid", + key: "TEST_BOOL", + defaultVal: false, + expected: true, + setup: func() { + os.Setenv("TEST_BOOL", "true") + }, + teardown: func() { + os.Unsetenv("TEST_BOOL") + }, + }, + { + name: "env variable exists but is invalid", + key: "TEST_BOOL", + defaultVal: false, + expected: false, + setup: func() { + os.Setenv("TEST_BOOL", "invalid") + }, + teardown: func() { + os.Unsetenv("TEST_BOOL") + }, + }, + { + name: "env variable does not exist", + key: "TEST_BOOL_MISSING", + defaultVal: false, + expected: false, + setup: func() {}, + teardown: func() {}, + }, + { + name: "env variable is empty string", + key: "TEST_BOOL_EMPTY", + defaultVal: false, + expected: false, + setup: func() { + os.Setenv("TEST_BOOL_EMPTY", "") + }, + teardown: func() { + os.Unsetenv("TEST_BOOL_EMPTY") + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tc.setup() + defer tc.teardown() + + result := GetEnvBool(tc.key, tc.defaultVal, logger.V(logutil.VERBOSE)) + if result != tc.expected { + t.Errorf("GetEnvBool(%s, %v) = %v, expected %v", tc.key, tc.defaultVal, result, tc.expected) + } + }) + } +} + +func TestGetEnvString(t *testing.T) { + logger := testr.New(t) + + tests := []struct { + name string + key string + defaultVal string + expected string + setup func() + teardown func() + }{ + { + name: "env variable exists and is valid", + key: "TEST_STR", + defaultVal: "default", + expected: "123", + setup: func() { + os.Setenv("TEST_STR", "123") + }, + teardown: func() { + os.Unsetenv("TEST_STR") + }, + }, + { + name: "env variable does not exist", + key: "TEST_STR_MISSING", + defaultVal: "default", + expected: "default", + setup: func() {}, + teardown: func() {}, + }, + { + name: "env variable is empty string", + key: "TEST_STR_EMPTY", + defaultVal: "default", + expected: "", + setup: func() { + os.Setenv("TEST_STR_EMPTY", "") + }, + teardown: func() { + os.Unsetenv("TEST_STR_EMPTY") + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tc.setup() + defer tc.teardown() + + result := GetEnvString(tc.key, tc.defaultVal, logger.V(logutil.VERBOSE)) + if result != tc.expected { + t.Errorf("GetEnvString(%s, %s) = %s, expected %s", tc.key, tc.defaultVal, result, tc.expected) + } + }) + } +} diff --git a/pkg/epp/util/error/error.go b/pkg/epp/util/error/error.go index 2f9c992c8..264830980 100644 --- a/pkg/epp/util/error/error.go +++ b/pkg/epp/util/error/error.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package error import ( @@ -14,6 +30,7 @@ const ( Unknown = "Unknown" BadRequest = "BadRequest" Internal = "Internal" + ServiceUnavailable = "ServiceUnavailable" ModelServerError = "ModelServerError" BadConfiguration = "BadConfiguration" InferencePoolResourceExhausted = "InferencePoolResourceExhausted" diff --git a/pkg/epp/util/error/error_test.go b/pkg/epp/util/error/error_test.go new file mode 100644 index 000000000..8905e847f --- /dev/null +++ b/pkg/epp/util/error/error_test.go @@ -0,0 +1,235 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package error + +import ( + "errors" + "testing" +) + +func TestError_Error(t *testing.T) { + tests := []struct { + name string + err Error + want string + }{ + { + name: "BadRequest error", + err: Error{ + Code: BadRequest, + Msg: "invalid model name", + }, + want: "inference gateway: BadRequest - invalid model name", + }, + { + name: "Internal error", + err: Error{ + Code: Internal, + Msg: "unexpected condition", + }, + want: "inference gateway: Internal - unexpected condition", + }, + { + name: "ServiceUnavailable error", + err: Error{ + Code: ServiceUnavailable, + Msg: "service unavailable", + }, + want: "inference gateway: ServiceUnavailable - service unavailable", + }, + { + name: "ModelServerError", + err: Error{ + Code: ModelServerError, + Msg: "connection timeout", + }, + want: "inference gateway: ModelServerError - connection timeout", + }, + { + name: "BadConfiguration error", + err: Error{ + Code: BadConfiguration, + Msg: "missing required field", + }, + want: "inference gateway: BadConfiguration - missing required field", + }, + { + name: "InferencePoolResourceExhausted error", + err: Error{ + Code: InferencePoolResourceExhausted, + Msg: "no available pods", + }, + want: "inference gateway: InferencePoolResourceExhausted - no available pods", + }, + { + name: "Unknown error", + err: Error{ + Code: Unknown, + Msg: "something went wrong", + }, + want: "inference gateway: Unknown - something went wrong", + }, + { + name: "Empty message", + err: Error{ + Code: BadRequest, + Msg: "", + }, + want: "inference gateway: BadRequest - ", + }, + { + name: "Empty code", + err: Error{ + Code: "", + Msg: "error occurred", + }, + want: "inference gateway: - error occurred", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.err.Error(); got != tt.want { + t.Errorf("Error.Error() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestCanonicalCode(t *testing.T) { + tests := []struct { + name string + err error + want string + }{ + { + name: "Error type with BadRequest code", + err: Error{ + Code: BadRequest, + Msg: "invalid input", + }, + want: BadRequest, + }, + { + name: "Error type with Internal code", + err: Error{ + Code: Internal, + Msg: "server error", + }, + want: Internal, + }, + { + name: "Error type with ServiceUnavailable code", + err: Error{ + Code: ServiceUnavailable, + Msg: "Service unavailable error", + }, + want: ServiceUnavailable, + }, + { + name: "Error 
type with ModelServerError code", + err: Error{ + Code: ModelServerError, + Msg: "model unavailable", + }, + want: ModelServerError, + }, + { + name: "Error type with BadConfiguration code", + err: Error{ + Code: BadConfiguration, + Msg: "invalid config", + }, + want: BadConfiguration, + }, + { + name: "Error type with InferencePoolResourceExhausted code", + err: Error{ + Code: InferencePoolResourceExhausted, + Msg: "no resources", + }, + want: InferencePoolResourceExhausted, + }, + { + name: "Error type with Unknown code", + err: Error{ + Code: Unknown, + Msg: "unknown error", + }, + want: Unknown, + }, + { + name: "Error type with empty code", + err: Error{ + Code: "", + Msg: "no code provided", + }, + want: "", + }, + { + name: "Non-Error type", + err: errors.New("standard go error"), + want: Unknown, + }, + { + name: "Nil error", + err: nil, + want: Unknown, + }, + { + name: "Custom error type that is not Error", + err: customError{msg: "custom error"}, + want: Unknown, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := CanonicalCode(tt.err); got != tt.want { + t.Errorf("CanonicalCode() = %v, want %v", got, tt.want) + } + }) + } +} + +// customError is a helper type for testing non-Error error types +type customError struct { + msg string +} + +func (e customError) Error() string { + return e.msg +} + +func TestErrorConstants(t *testing.T) { + // Verify that error constants match their expected string values + tests := map[string]string{ + Unknown: "Unknown", + BadRequest: "BadRequest", + Internal: "Internal", + ServiceUnavailable: "ServiceUnavailable", + ModelServerError: "ModelServerError", + BadConfiguration: "BadConfiguration", + InferencePoolResourceExhausted: "InferencePoolResourceExhausted", + } + + for constant, expected := range tests { + if constant != expected { + t.Errorf("Constant value %q != expected %q", constant, expected) + } + } +} diff --git a/pkg/epp/util/logging/fatal.go b/pkg/epp/util/logging/fatal.go index d8a9a9379..ddc15c400 100644 --- a/pkg/epp/util/logging/fatal.go +++ b/pkg/epp/util/logging/fatal.go @@ -25,7 +25,7 @@ import ( // Fatal calls logger.Error followed by os.Exit(1). // // This is a utility function and should not be used in production code! -func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...interface{}) { +func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...any) { logger.Error(err, msg, keysAndValues...) os.Exit(1) } diff --git a/pkg/epp/util/metrics/metrics.go b/pkg/epp/util/metrics/metrics.go new file mode 100644 index 000000000..167669435 --- /dev/null +++ b/pkg/epp/util/metrics/metrics.go @@ -0,0 +1,28 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "fmt" + + compbasemetrics "k8s.io/component-base/metrics" +) + +// HelpMsgWithStability is a helper function to create a help message with stability level. 
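+// For example, HelpMsgWithStability("Number of requests.", compbasemetrics.ALPHA)
+// returns "[ALPHA] Number of requests.".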
+func HelpMsgWithStability(msg string, stability compbasemetrics.StabilityLevel) string { + return fmt.Sprintf("[%v] %v", stability, msg) +} diff --git a/pkg/epp/util/pod/pod.go b/pkg/epp/util/pod/pod.go new file mode 100644 index 000000000..4fcb948fc --- /dev/null +++ b/pkg/epp/util/pod/pod.go @@ -0,0 +1,36 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package pod + +import ( + corev1 "k8s.io/api/core/v1" +) + +func IsPodReady(pod *corev1.Pod) bool { + if !pod.DeletionTimestamp.IsZero() { + return false + } + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + if condition.Status == corev1.ConditionTrue { + return true + } + break + } + } + return false +} diff --git a/pkg/epp/util/request/body.go b/pkg/epp/util/request/body.go new file mode 100644 index 000000000..46de1fa54 --- /dev/null +++ b/pkg/epp/util/request/body.go @@ -0,0 +1,86 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package request + +import ( + "fmt" + + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" +) + +func ExtractPromptFromRequestBody(body map[string]any) (string, error) { + if _, ok := body["messages"]; ok { + return extractPromptFromMessagesField(body) + } + return extractPromptField(body) +} + +func extractPromptField(body map[string]any) (string, error) { + prompt, ok := body["prompt"] + if !ok { + return "", errutil.Error{Code: errutil.BadRequest, Msg: "prompt not found in request"} + } + promptStr, ok := prompt.(string) + if !ok { + return "", errutil.Error{Code: errutil.BadRequest, Msg: "prompt is not a string"} + } + return promptStr, nil +} + +func extractPromptFromMessagesField(body map[string]any) (string, error) { + messages, ok := body["messages"] + if !ok { + return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages not found in request"} + } + messageList, ok := messages.([]any) + if !ok { + return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages is not a list"} + } + if len(messageList) == 0 { + return "", errutil.Error{Code: errutil.BadRequest, Msg: "messages is empty"} + } + + prompt := "" + for _, msg := range messageList { + msgMap, ok := msg.(map[string]any) + if !ok { + continue + } + content, ok := msgMap["content"] + if !ok { + continue + } + contentStr, ok := content.(string) + if !ok { + continue + } + role, ok := msgMap["role"] + if !ok { + continue + } + roleStr, ok := role.(string) + if !ok { + continue + } + prompt += constructChatMessage(roleStr, contentStr) + } + return prompt, nil +} + +func constructChatMessage(role string, content string) string { + return fmt.Sprintf("<|im_start|>%s\n%s<|im_end|>\n", role, content) +} diff --git a/pkg/epp/util/request/body_test.go b/pkg/epp/util/request/body_test.go new file mode 100644 index 000000000..ce5a93921 --- /dev/null +++ b/pkg/epp/util/request/body_test.go @@ -0,0 +1,207 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package request
+
+import (
+	"testing"
+)
+
+func TestExtractPromptFromRequestBody(t *testing.T) {
+	tests := []struct {
+		name    string
+		body    map[string]any
+		want    string
+		wantErr bool
+		errType error
+	}{
+		{
+			name: "chat completions request body",
+			body: map[string]any{
+				"model": "test",
+				"messages": []any{
+					map[string]any{
+						"role": "system", "content": "this is a system message",
+					},
+					map[string]any{
+						"role": "user", "content": "hello",
+					},
+					map[string]any{
+						"role": "assistant", "content": "hi, what can I do for you?",
+					},
+				},
+			},
+			want: "<|im_start|>system\nthis is a system message<|im_end|>\n" +
+				"<|im_start|>user\nhello<|im_end|>\n" +
+				"<|im_start|>assistant\nhi, what can I do for you?<|im_end|>\n",
+		},
+		{
+			name: "completions request body",
+			body: map[string]any{
+				"model":  "test",
+				"prompt": "test prompt",
+			},
+			want: "test prompt",
+		},
+		{
+			name: "invalid prompt format",
+			body: map[string]any{
+				"model": "test",
+				"prompt": []any{
+					map[string]any{
+						"role": "system", "content": "this is a system message",
+					},
+					map[string]any{
+						"role": "user", "content": "hello",
+					},
+					map[string]any{
+						"role": "assistant", "content": "hi, what can I",
+					},
+				},
+			},
+			wantErr: true,
+		},
+		{
+			name: "invalid messages format",
+			body: map[string]any{
+				"model": "test",
+				"messages": map[string]any{
+					"role": "system", "content": "this is a system message",
+				},
+			},
+			wantErr: true,
+		},
+		{
+			name: "prompt does not exist",
+			body: map[string]any{
+				"model": "test",
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := ExtractPromptFromRequestBody(tt.body)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("ExtractPromptFromRequestBody() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("ExtractPromptFromRequestBody() got = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestExtractPromptField(t *testing.T) {
+	tests := []struct {
+		name    string
+		body    map[string]any
+		want    string
+		wantErr bool
+	}{
+		{
+			name: "valid prompt",
+			body: map[string]any{
+				"prompt": "test prompt",
+			},
+			want: "test prompt",
+		},
+		{
+			name:    "prompt not found",
+			body:    map[string]any{},
+			wantErr: true,
+		},
+		{
+			name: "non-string prompt",
+			body: map[string]any{
+				"prompt": 123,
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := extractPromptField(tt.body)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("extractPromptField() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+				t.Errorf("extractPromptField() got = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestExtractPromptFromMessagesField(t *testing.T) {
+	tests := []struct {
+		name    string
+		body    map[string]any
+		want    string
+		wantErr bool
+	}{
+		{
+			name: "valid messages",
+			body: map[string]any{
+				"messages": []any{
+					map[string]any{"role": "user", "content": "test1"},
+					map[string]any{"role": "assistant", "content": "test2"},
+				},
+			},
+			want: "<|im_start|>user\ntest1<|im_end|>\n<|im_start|>assistant\ntest2<|im_end|>\n",
+		},
+		{
+			name: "invalid messages format",
+			body: map[string]any{
+				"messages": "invalid",
+			},
+			wantErr: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := extractPromptFromMessagesField(tt.body)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("extractPromptFromMessagesField() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if got != tt.want {
+
t.Errorf("extractPromptFromMessagesField() got = %v, want %v", got, tt.want) + } + }) + } +} + +func TestConstructChatMessage(t *testing.T) { + tests := []struct { + role string + content string + want string + }{ + {"user", "hello", "<|im_start|>user\nhello<|im_end|>\n"}, + {"assistant", "hi", "<|im_start|>assistant\nhi<|im_end|>\n"}, + } + + for _, tt := range tests { + if got := constructChatMessage(tt.role, tt.content); got != tt.want { + t.Errorf("constructChatMessage() = %v, want %v", got, tt.want) + } + } +} diff --git a/pkg/epp/util/request/headers.go b/pkg/epp/util/request/headers.go new file mode 100644 index 000000000..b1936d31a --- /dev/null +++ b/pkg/epp/util/request/headers.go @@ -0,0 +1,40 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package request + +import ( + "strings" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" +) + +const ( + RequestIdHeaderKey = "x-request-id" +) + +func ExtractHeaderValue(req *extProcPb.ProcessingRequest_RequestHeaders, headerKey string) string { + // header key should be case insensitive + headerKeyInLower := strings.ToLower(headerKey) + if req != nil && req.RequestHeaders != nil && req.RequestHeaders.Headers != nil { + for _, headerKv := range req.RequestHeaders.Headers.Headers { + if strings.ToLower(headerKv.Key) == headerKeyInLower { + return string(headerKv.RawValue) + } + } + } + return "" +} diff --git a/pkg/epp/util/request/headers_test.go b/pkg/epp/util/request/headers_test.go new file mode 100644 index 000000000..fb3e2ebd1 --- /dev/null +++ b/pkg/epp/util/request/headers_test.go @@ -0,0 +1,75 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package request + +import ( + "testing" + + corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" +) + +func TestExtractHeaderValue(t *testing.T) { + tests := []struct { + name string + headers []*corev3.HeaderValue + key string + expected string + }{ + { + name: "Exact match", + headers: []*corev3.HeaderValue{ + {Key: "x-request-id", RawValue: []byte("123")}, + }, + key: "x-request-id", + expected: "123", + }, + { + name: "Case-insensitive match", + headers: []*corev3.HeaderValue{ + {Key: "X-Request-ID", RawValue: []byte("456")}, + }, + key: "x-request-id", + expected: "456", + }, + { + name: "Non-existent key", + headers: []*corev3.HeaderValue{ + {Key: "other-header", RawValue: []byte("abc")}, + }, + key: "x-request-id", + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &corev3.HeaderMap{ + Headers: tt.headers, + }, + }, + } + + result := ExtractHeaderValue(req, tt.key) + if result != tt.expected { + t.Errorf("ExtractHeaderValue() = %v, want %v", result, tt.expected) + } + }) + } +} diff --git a/pkg/epp/util/request/metadata.go b/pkg/epp/util/request/metadata.go new file mode 100644 index 000000000..228640758 --- /dev/null +++ b/pkg/epp/util/request/metadata.go @@ -0,0 +1,31 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package request + +import ( + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" +) + +func ExtractMetadataValues(req *extProcPb.ProcessingRequest) map[string]any { + metadata := make(map[string]any) + if req != nil && req.MetadataContext != nil && req.MetadataContext.FilterMetadata != nil { + for key, val := range req.MetadataContext.FilterMetadata { + metadata[key] = val.AsMap() + } + } + return metadata +} diff --git a/pkg/epp/util/request/metadata_test.go b/pkg/epp/util/request/metadata_test.go new file mode 100644 index 000000000..f03d83e58 --- /dev/null +++ b/pkg/epp/util/request/metadata_test.go @@ -0,0 +1,71 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package request + +import ( + "testing" + + corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/types/known/structpb" +) + +func TestExtractMetadataValues(t *testing.T) { + var makeFilterMetadata = func() map[string]*structpb.Struct { + structVal, _ := structpb.NewStruct(map[string]any{ + "hello": "world", + "random-key": []any{"hello", "world"}, + }) + + return map[string]*structpb.Struct{ + "key-1": structVal, + } + } + + tests := []struct { + name string + metadata map[string]*structpb.Struct + expected map[string]any + }{ + { + name: "Exact match", + metadata: makeFilterMetadata(), + expected: map[string]any{ + "key-1": map[string]any{ + "hello": "world", + "random-key": []any{"hello", "world"}, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + req := &extProcPb.ProcessingRequest{ + MetadataContext: &corev3.Metadata{ + FilterMetadata: tt.metadata, + }, + } + + result := ExtractMetadataValues(req) + if diff := cmp.Diff(result, tt.expected); diff != "" { + t.Errorf("ExtractMetadataValues() unexpected response (-want +got): %v", diff) + } + }) + } +} diff --git a/pkg/epp/util/testing/request.go b/pkg/epp/util/testing/request.go deleted file mode 100644 index 30772ad54..000000000 --- a/pkg/epp/util/testing/request.go +++ /dev/null @@ -1,67 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/
-
-package testing
-
-import (
-	"encoding/json"
-
-	envoyCorev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
-	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-	"github.com/go-logr/logr"
-	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
-)
-
-func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest {
-	j := map[string]interface{}{
-		"model":       model,
-		"prompt":      prompt,
-		"max_tokens":  100,
-		"temperature": 0,
-	}
-
-	llmReq, err := json.Marshal(j)
-	if err != nil {
-		logutil.Fatal(logger, err, "Failed to unmarshal LLM request")
-	}
-	req := &extProcPb.ProcessingRequest{
-		Request: &extProcPb.ProcessingRequest_RequestBody{
-			RequestBody: &extProcPb.HttpBody{Body: llmReq, EndOfStream: true},
-		},
-	}
-	return req
-}
-
-func GenerateStreamedRequestSet(logger logr.Logger, prompt, model string) []*extProcPb.ProcessingRequest {
-	requests := []*extProcPb.ProcessingRequest{}
-	headerReq := &extProcPb.ProcessingRequest{
-		Request: &extProcPb.ProcessingRequest_RequestHeaders{
-			RequestHeaders: &extProcPb.HttpHeaders{
-				Headers: &envoyCorev3.HeaderMap{
-					Headers: []*envoyCorev3.HeaderValue{
-						{
-							Key:   "hi",
-							Value: "mom",
-						},
-					},
-				},
-			},
-		},
-	}
-	requests = append(requests, headerReq)
-	requests = append(requests, GenerateRequest(logger, prompt, model))
-	return requests
-}
diff --git a/site-src/api-types/inferencepool.md b/site-src/api-types/inferencepool.md
index baa604b61..1494d314e 100644
--- a/site-src/api-types/inferencepool.md
+++ b/site-src/api-types/inferencepool.md
@@ -7,28 +7,56 @@

 ## Background

-The InferencePool resource is a logical grouping of compute resources, e.g. Pods, that run model servers. The InferencePool would deploy its own routing, and offer administrative configuration to the Platform Admin.
+The **InferencePool** API defines a group of Pods (containers) dedicated to serving AI models. Pods within an InferencePool share the same compute configuration, accelerator type, base language model, and model server. This abstraction simplifies the management of AI model serving resources, providing a centralized point of administrative configuration for Platform Admins.

-It is expected for the InferencePool to:
+An InferencePool is expected to be bundled with an [Endpoint Picker](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) extension. This extension is responsible for tracking key metrics on each model server (e.g., KV-cache utilization, queue length of pending requests, and active LoRA adapters) and routing incoming inference requests to the optimal model server replica based on these metrics. An EPP can only be associated with a single InferencePool. The associated InferencePool is specified by the [poolName](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/manifests/inferencepool-resources.yaml#L54) and [poolNamespace](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/manifests/inferencepool-resources.yaml#L56) flags. An HTTPRoute can have multiple backendRefs that reference the same InferencePool and therefore routes to the same EPP. An HTTPRoute can have multiple backendRefs that reference different InferencePools and therefore routes to different EPPs.
- - Enforce fair consumption of resources across competing workloads - - Efficiently route requests across shared compute (as displayed by the PoC) - -It is _not_ expected for the InferencePool to: +Additionally, any Pod that seeks to join an InferencePool would need to support the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol), defined by this project, to ensure the Endpoint Picker has adequate information to intelligently route requests. - - Enforce any common set of adapters or base models are available on the Pods - - Manage Deployments of Pods within the Pool - - Manage Pod lifecycle of pods within the pool +## How to Configure an InferencePool -Additionally, any Pod that seeks to join an InferencePool would need to support a protocol, defined by this project, to ensure the Pool has adequate information to intelligently route requests. +The full spec of the InferencePool is defined [here](/reference/spec/#inferencepool). -`InferencePool` has some small overlap with `Service`, displayed here: +In summary, the InferencePoolSpec consists of 3 major parts: + +- The `selector` field specifies which Pods belong to this pool. The labels in this selector must exactly match the labels applied to your model server Pods. +- The `targetPortNumber` field defines the port number that the Inference Gateway should route to on model server Pods that belong to this pool. +- The `extensionRef` field references the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) (EPP) service that monitors key metrics from model servers within the InferencePool and provides intelligent routing decisions. + +### Example Configuration + +Here is an example InferencePool configuration: + +``` +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: vllm-llama3-8b-instruct +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama3-8b-instruct + extensionRef: + name: vllm-llama3-8b-instruct-epp + port: 9002 + failureMode: FailClose +``` + +In this example: + +- An InferencePool named `vllm-llama3-8b-instruct` is created in the `default` namespace. +- It will select Pods that have the label `app: vllm-llama3-8b-instruct`. +- Traffic routed to this InferencePool will call out to the EPP service `vllm-llama3-8b-instruct-epp` on port `9002` for making routing decisions. If EPP fails to pick an endpoint, or is not responsive, the request will be dropped. +- Traffic routed to this InferencePool will be forwarded to the port `8000` on the selected Pods. + +## Overlap with Service + +**InferencePool** has some small overlap with **Service**, displayed here: Comparing InferencePool with Service -The InferencePool is _not_ intended to be a mask of the Service object, simply exposing the absolute bare minimum required to allow the Platform Admin to focus less on networking, and more on Pool management. - -## Spec +The InferencePool is not intended to be a mask of the Service object. It provides a specialized abstraction tailored for managing and routing traffic to groups of LLM model servers, allowing Platform Admins to focus on pool-level management rather than low-level networking details. -The full spec of the InferencePool is defined [here](/reference/spec/#inferencepool). 
\ No newline at end of file
+## Replacing an InferencePool
+Please refer to the [Replacing an InferencePool](/guides/replacing-inference-pool) guide for details on use cases and how to replace an InferencePool.
diff --git a/site-src/concepts/api-overview.md b/site-src/concepts/api-overview.md
index 9c5c04163..ab07a1d2d 100644
--- a/site-src/concepts/api-overview.md
+++ b/site-src/concepts/api-overview.md
@@ -1,15 +1,27 @@
 # API Overview

 ## Background
-The Gateway API Inference Extension project is an extension of the Kubernetes Gateway API for serving Generative AI models on Kubernetes. Gateway API Inference Extension facilitates standardization of APIs for Kubernetes cluster operators and developers running generative AI inference, while allowing flexibility for underlying gateway implementations (such as Envoy Proxy) to iterate on mechanisms for optimized serving of models.
+Gateway API Inference Extension optimizes self-hosting Generative AI Models on Kubernetes.
+It provides optimized load-balancing for these self-hosted workloads.
+The project’s goal is to improve and standardize routing to inference workloads across the ecosystem.

-Overview of API integration
+This is achieved by leveraging Envoy's [External Processing](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) to extend any gateway that supports both ext-proc and [Gateway API](https://github.com/kubernetes-sigs/gateway-api) into an [inference gateway](../index.md#concepts-and-definitions).
+This extension turns popular gateways like Envoy Gateway, kgateway, and GKE Gateway into [Inference Gateways](../index.md#concepts-and-definitions),
+supporting inference platform teams self-hosting Generative Models (with a current focus on large language models) on Kubernetes.
+This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat)
+to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers
+in a higher-level **AI Gateway** like [LiteLLM](https://www.litellm.ai/), [Gloo AI Gateway](https://www.solo.io/products/gloo-ai-gateway), or [Apigee](https://cloud.google.com/apigee).

 ## API Resources

+Gateway API Inference Extension introduces two inference-focused API resources with distinct responsibilities,
+each aligning with a specific user persona in the Generative AI serving workflow.
+
+Overview of API integration
+
 ### InferencePool

-InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. Within the broader Gateway API resource model, this resource is considered a "backend". 
In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. With InferencePool, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). ### InferenceModel diff --git a/site-src/concepts/design-principles.md b/site-src/concepts/design-principles.md new file mode 100644 index 000000000..76c3b77a9 --- /dev/null +++ b/site-src/concepts/design-principles.md @@ -0,0 +1,47 @@ +# Design Principles + +These principles guide our efforts to build flexible [Gateway API] extensions +that empower the development of high-performance [AI Inference] routing +technologies—balancing rapid delivery with long-term growth. + +!!! note "Inference Gateways" + + For simplicity, we'll refer to Gateway API Gateways which are + composed together with AI Inference extensions as "Inference Gateways" + throughout this document. + +[Gateway API]:https://github.com/kubernetes-sigs/gateway-api +[AI Inference]:https://www.arm.com/glossary/ai-inference + + +## Prioritize stability of the core interfaces + +The most critical part of this project is the interfaces between components. To encourage both controller and extension developers to integrate with this project, we need to prioritize the stability of these interfaces. +Although we can extend these interfaces in the future, it’s critical the core is stable as soon as possible. + +When describing "core interfaces", we are referring to both of the following: + +### 1. Gateway -> Endpoint Picker +At a high level, this defines how a Gateway provides information to an Endpoint Picker, and how the Endpoint Picker selects endpoint(s) that the Gateway should route to. + +### 2. Endpoint Picker -> Model Server Framework +This defines what an Endpoint Picker should expect from a compatible Model Server Framework with a focus on health checks and metrics. + +## Our presets are finely tuned + +We provide APIs and reference implementations for the most common inference requirements. Our defaults for those APIs and implementations—shaped by extensive experience with leading model serving platforms and APIs—are designed to provide the majority of Inference Gateway users with a great default experience without the need for extensive configuration or customization. If you take all of our default extensions and attach them to a compatible `Gateway`, it just "works out of the box". + +## Encourage innovation via extensibility + +This project is largely based on the idea that extensibility will enable innovation. With that in mind, we should make it as easy as possible for AI researchers to experiment with custom scheduling and routing logic. They should not need to know how to build a Kubernetes controller, or replicate a full networking stack. Instead, all the information needed to make a routing decision should be provided in an accessible format, with clear guidelines and examples of how to customize routing logic. + +## Objectives over instructions + +The pace of innovation in this ecosystem has been rapid. Focusing too heavily on the specifics of current techniques could result in the API becoming outdated quickly. 
Instead of making the API too descriptive about _how_ an objective should be achieved, this API should focus on the objectives that a Gateway and/or Endpoint Picker should strive to attain. Overly specific instructions or configuration can start as implementation-specific APIs and grow into standards as the concepts become more stable and widespread.
+
+## Composable components and reducing reinvention
+
+While it may be tempting to develop an entirely new AI-focused Gateway, many essential routing capabilities are already well established by Kubernetes. Our focus is on creating a layer of composable components that can be assembled together with other Kubernetes components. This approach empowers engineers to use our solution as a building block—combining established technologies like Gateway API with our extensible model to build higher-level solutions. Should you encounter a limitation, consider how existing tooling may be extended or improved first.
+
+## Additions to the API should be carefully prioritized
+
+Every addition to the API should take the principles described above into account. Given that the goal of the API is to encourage a highly extensible ecosystem, each additional feature in the API raises the barrier to entry for any new controller or extension. Our top priority should be to focus on concepts that we expect to be broadly implementable and useful. The extensible nature of this API will allow each individual implementation to experiment with new features via custom flags or APIs before they become part of the core API surface.
diff --git a/site-src/contributing/index.md b/site-src/contributing/index.md
index db0bdc170..e33bbb3b6 100644
--- a/site-src/contributing/index.md
+++ b/site-src/contributing/index.md
@@ -30,7 +30,7 @@ channels:

 Gateway API community meetings happen every Thursday at 10am Pacific Time
 ([convert to your
-timezone](https://dateful.com/time-zone-converter?t=08:00&tz=PT%20%28Pacific%20Time%29)).
+timezone](https://dateful.com/time-zone-converter?t=10:00&tz=PT%20%28Pacific%20Time%29)).
 To receive an invite to this and other WG-Serving community meetings, join the
 [WG-Serving mailing list](https://groups.google.com/a/kubernetes.io/g/wg-serving).
@@ -45,5 +45,4 @@ doc. Feel free to add topics for discussion at an upcoming meeting.

 All meetings are recorded and automatically uploaded to the [WG-Serving meetings
 YouTube
-playlist][https://www.youtube.com/playlist?list=PL69nYSiGNLP30qNanabU75ayPK7OPNAAS].
-
+playlist](https://www.youtube.com/playlist?list=PL69nYSiGNLP30qNanabU75ayPK7OPNAAS).
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
index fdf62c3a0..3eb4d42ce 100644
--- a/site-src/guides/adapter-rollout.md
+++ b/site-src/guides/adapter-rollout.md
@@ -1,13 +1,18 @@
-# Adapter Rollout
+# LoRA Adapter Rollout

-The goal of this guide is to demonstrate how to rollout a new adapter version.
+The goal of this guide is to show you how to perform incremental rollout operations,
+which gradually deploy new versions of your inference infrastructure.
+You can update LoRA adapters and InferencePools with minimal service disruption.
+This page also provides guidance on traffic splitting and rollbacks to help ensure reliable LoRA adapter rollouts.

-## **Prerequisites**
-
-Follow the steps in the [main guide](index.md)
+LoRA adapter rollouts let you deploy new versions of LoRA adapters in phases,
+without altering the underlying base model or infrastructure.
+Use LoRA adapter rollouts to test improvements, bug fixes, or new features in your LoRA adapters. +## Example -## **Safely rollout v2 adapter** +### Prerequisites +Follow the steps in the [main guide](index.md) ### Load the new adapter version to the model servers @@ -18,28 +23,28 @@ Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version. ```bash - kubectl edit configmap vllm-llama3-8b-instruct-adapters +kubectl edit configmap vllm-llama3-8b-instruct-adapters ``` Change the ConfigMap to match the following (note the new entry under models): ```yaml - apiVersion: v1 - kind: ConfigMap - metadata: - name: vllm-llama3-8b-instruct-adapters - data: - configmap.yaml: | - vLLMLoRAConfig: - name: vllm-llama3-8b-instruct-adapters - port: 8000 - defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct - ensureExist: - models: - - id: food-review-1 - source: Kawon/llama3.1-food-finetune_v14_r8 - - id: food-review-2 - source: Kawon/llama3.1-food-finetune_v14_r8 +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3-8b-instruct-adapters + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review-1 + source: Kawon/llama3.1-food-finetune_v14_r8 + - id: food-review-2 + source: Kawon/llama3.1-food-finetune_v14_r8 ``` The new adapter version is applied to the model servers live, without requiring a restart. @@ -51,35 +56,34 @@ Modify the InferenceModel to configure a canary rollout with traffic splitting. ```bash - kubectl edit inferencemodel food-review +kubectl edit inferencemodel food-review ``` Change the targetModels list in InferenceModel to match the following: ```yaml -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: - name: inferencemodel-sample + name: food-review spec: modelName: food-review - criticality: Critical + criticality: Standard poolRef: - name: vllm-llama3-8b-instruct-pool + name: vllm-llama3-8b-instruct targetModels: - name: food-review-1 weight: 90 - name: food-review-2 weight: 10 - ``` The above configuration means one in every ten requests should be sent to the new version. Try it out: 1. Get the gateway IP: ```bash -IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081 +IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80 ``` 2. Send a few requests as follows: @@ -98,34 +102,42 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter. 
 ```yaml
-model:
-  name: food-review
-  targetModels:
-  targetModelName: food-review-2
-  weight: 100
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: food-review
+spec:
+  modelName: food-review
+  criticality: Standard
+  poolRef:
+    name: vllm-llama3-8b-instruct
+  targetModels:
+  - name: food-review-2
+    weight: 100
 ```

 Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list:

 ```yaml
-  apiVersion: v1
-  kind: ConfigMap
-  metadata:
-    name: dynamic-lora-config
-  data:
-    configmap.yaml: |
-      vLLMLoRAConfig:
-        name: sql-loras-llama
-        port: 8000
-        defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
-        ensureExist:
-          models:
-          - id: food-review-2
-            source: Kawon/llama3.1-food-finetune_v14_r8
-        ensureNotExist:
-          models:
-          - id: food-review-1
-            source: Kawon/llama3.1-food-finetune_v14_r8
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: vllm-llama3-8b-instruct-adapters
+data:
+  configmap.yaml: |
+    vLLMLoRAConfig:
+      name: vllm-llama3-8b-instruct-adapters
+      port: 8000
+      defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct
+      ensureExist:
+        models:
+        - id: food-review-2
+          source: Kawon/llama3.1-food-finetune_v14_r8
+      ensureNotExist:
+        models:
+        - id: food-review-1
+          source: Kawon/llama3.1-food-finetune_v14_r8
 ```

 With this, all requests should be served by the new adapter version.
+
diff --git a/site-src/guides/conformance-tests.md b/site-src/guides/conformance-tests.md
new file mode 100644
index 000000000..aa8843871
--- /dev/null
+++ b/site-src/guides/conformance-tests.md
@@ -0,0 +1,46 @@
+
+# Test Setup and Execution
+
+This document provides steps to run the Gateway API Inference Extension conformance tests.
+
+## Prerequisites
+
+1. You need a Kubernetes cluster with [LoadBalancer](https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer) support.
+
+2. Choose an Implementation:
+Install an [existing implementation](https://gateway-api-inference-extension.sigs.k8s.io/implementations/gateways/). For setup instructions, refer to [The Quickstart Guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/). Alternatively, run tests against your own implementation after completing the [implementer's guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/implementers/#implementers-guide).
+
+Note: Since the EPP (Endpoint Picker) takes the `InferencePool` name as an environment variable, each conformance test creates a corresponding EPP deployment for each `InferencePool` it defines. For conformance testing, the EPP is configured with the `HeaderBasedTestingFilter`. This is enabled by setting the `ENABLE_REQ_HEADER_BASED_SCHEDULER_FOR_TESTING=true` environment variable in the EPP deployment manifest.
+
+## Running Conformance Tests
+
+1. **Clone the Repository**:
+   Create a local copy of the Gateway API Inference Extension repository:
+   ```bash
+   git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git
+   cd gateway-api-inference-extension
+   ```
+
+2. **Execute Tests**:
+   Run the following command to execute all available tests. Replace `<gateway-class-name>` with the GatewayClass used by the implementation under test.
+
+   ```bash
+   go test ./conformance -args -gateway-class <gateway-class-name>
+   ```
+
+### Test Execution Options
+
+* **Speeding up Reruns**: For repeated runs, you can add the flag `-cleanup-base-resources=false`. This will preserve resources such as namespaces and gateways between test runs, speeding up the process.
+   ```bash
+   go test ./conformance -args -gateway-class <gateway-class-name> -cleanup-base-resources=false
+   ```
+
+* **Running Specific Tests**: To run a specific test, you can reference the test name by using the `-run-test` flag. For example:
+   ```bash
+   go test ./conformance -args -gateway-class <gateway-class-name> -run-test HTTPRouteMultipleGatewaysDifferentPools
+   ```
+
+* **Detailed Logging**: To view detailed logs, you can enable logging mode by adding the `-v` and `-debug` flags.
+   ```bash
+   go test -v ./conformance -args -debug -gateway-class <gateway-class-name> -cleanup-base-resources=false -run-test HTTPRouteMultipleGatewaysDifferentPools
+   ```
diff --git a/site-src/guides/epp-configuration/prefix-aware.md b/site-src/guides/epp-configuration/prefix-aware.md
new file mode 100644
index 000000000..bb18125a3
--- /dev/null
+++ b/site-src/guides/epp-configuration/prefix-aware.md
@@ -0,0 +1,89 @@
+# Prefix Cache Aware Plugin Configuration
+
+The [prefix cache plugin](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/7617439188b410670ed0f1ff805a3b7f9918a75b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go#L63)
+takes advantage of the prefix caching (e.g., [vllm APC](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching.html))
+feature of model servers, and optimizes request scheduling by placing requests that share the longest
+prefixes on the same server as much as possible, while balancing the server load by considering kv-cache
+and queue depth.
+
+## Enable the prefix cache plugin
+
+Currently the prefix cache aware plugin is implemented in the V2 scheduler as an experimental feature.
+To enable it, set the following environment variables when starting the EndpointPicker (EPP).
+
+```
+EXPERIMENTAL_USE_SCHEDULER_V2: true
+ENABLE_PREFIX_CACHE_SCHEDULING: true
+```
+
+See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
+
+
+## Customize the prefix cache plugin
+
+The prefix cache plugin exposes the following advanced configuration options via environment variables:
+
+* `PREFIX_CACHE_HASH_BLOCK_SIZE`: The plugin matches prefixes in units of blocks. This is the size
+of each block in number of bytes. The vLLM default block size is 16 tokens. Assuming 4 characters per token, the default
+is set to 64 in the EPP. The default is recommended unless performance is critical for use cases with
+extremely long inputs.
+
+* `PREFIX_CACHE_MAX_PREFIX_BLOCKS`: The maximum number of blocks used to find a prefix match. The default is
+128 (or 128*64=8192 characters, or roughly 2048 tokens). This is useful to trade off prefix match accuracy
+for performance.
+
+* `PREFIX_CACHE_LRU_CAPACITY_PER_SERVER`: Maximum capacity of the prefix LRU cache, in number of block hashes, per server (pod). Below
+is a detailed analysis of how to estimate this.
+
+
+
+    The prefix cache plugin estimates the prefix cache indexes in model server HBMs. In the perfect
+    scenario, the EPP has exactly the same prefix cache entries per model server as their HBM cache entries. If
+    the EPP cache is smaller than the HBM cache, a positive EPP cache match is more accurate, but there are more
+    false cache misses. If the EPP cache is larger than the HBM cache, then there are more false cache hits.
+    Therefore **the EPP prefix cache indexer size should be as close as possible to the HBM cache size.**
+
+    NOTE: The EPP builds its prefix cache based on characters, while the model server maintains prefix cache entries
+    in tokens, so a conversion between characters and tokens is needed.
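+    As a rough cross-check of the sizing formulas below, here is a minimal Go sketch of the same
+    arithmetic (the hardware constants are illustrative assumptions taken from the worked example
+    that follows, not part of the plugin):
+
+    ```go
+    package main
+
+    import "fmt"
+
+    func main() {
+        // Illustrative assumptions: H100 with 80GB HBM, ~16GB of model weights,
+        // 128KB of KV cache per token, 4 chars per token, 64-char hash blocks.
+        const (
+            hbmBytes           = 80e9
+            modelBytes         = 16e9
+            kvBytesPerToken    = 128e3
+            avgCharsPerToken   = 4
+            hashBlockSizeChars = 64
+        )
+        maxKVTokensPerServer := (hbmBytes - modelBytes) / kvBytesPerToken
+        fmt.Println(maxKVTokensPerServer)                                        // 500000
+        fmt.Println(maxKVTokensPerServer * avgCharsPerToken / hashBlockSizeChars) // 31250
+    }
+    ```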
+
+    Below are the formulas to estimate the EPP prefix indexer size:
+
+    ```
+    max_kv_tokens_per_server = (HBM_size - model_size) / kv_size_per_token
+    lru_indexer_capacity_per_server = (max_kv_tokens_per_server * avg_chars_per_token) / prefix_indexer_hash_block_size
+    ```
+
+    Let's take an example:
+
+    * Model: llama3 8B
+    * Accelerator: Nvidia H100 80GB
+    * Num replicas: 3
+    * Estimated # characters per token: 4 ([source](https://genai.stackexchange.com/questions/34/how-long-is-a-token))
+
+    ```
+    max_kv_tokens_per_server = (80GB - 16GB) / 128KB = 500,000
+    # assume avg_chars_per_token = 4, prefix_indexer_hash_block_size = 64 (default)
+    # each entry is about 358 bytes, so the memory footprint is about 11 MB per server
+    lru_indexer_capacity_per_server = 500,000*4/64 = 31250
+    ```
+
+See the [Use Helm section](#helm) to install an inferencepool with the environment variables.
+
+
+
+## Use Helm
+
+Use the following reference command to install an inferencepool with the prefix
+cache plugin environment variable configurations:
+
+```txt
+$ helm install vllm-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+  --set inferencePool.modelServerType=vllm \
+  --set provider.name=[none|gke] \
+  --set inferenceExtension.env.EXPERIMENTAL_USE_SCHEDULER_V2=true \
+  --set inferenceExtension.env.ENABLE_PREFIX_CACHE_SCHEDULING=true \
+  --set inferenceExtension.env.PREFIX_CACHE_LRU_CAPACITY_PER_SERVER=31250 \
+  --set inferenceExtension.env.PREFIX_CACHE_MAX_PREFIX_BLOCKS=1024 \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
diff --git a/site-src/guides/implementers.md b/site-src/guides/implementers.md
index 5d1c6267d..204c41738 100644
--- a/site-src/guides/implementers.md
+++ b/site-src/guides/implementers.md
@@ -1,3 +1,113 @@
 # Implementer's Guide

-TODO
\ No newline at end of file
+This guide is intended for developers looking to implement support for the InferencePool custom resource within their Gateway API controller. It outlines how InferencePool fits into the existing resource model, discusses implementation options, explains how to interact with extensions, and provides guidance on testing.
+
+## InferencePool as a Gateway Backend
+Before we dive into the implementation, let’s recap how an InferencePool works.
+
+Overview of API integration
+
+**InferencePool** represents a set of Inference-focused Pods and an extension that will be used to route to them. The InferencePool introduces a new type of backend within the Gateway API resource model. Instead of targeting Services, a Gateway can route traffic to an InferencePool. This InferencePool then becomes responsible for intelligent routing to the underlying model server pods based on the associated InferenceModel configurations.
+
+Here is an example of how to route traffic to an InferencePool using an HTTPRoute:
+```
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: llm-route
+spec:
+  parentRefs:
+  - group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  rules:
+  - backendRefs:
+    - group: inference.networking.x-k8s.io
+      kind: InferencePool
+      name: base-model
+    matches:
+    - path:
+        type: PathPrefix
+        value: /
+```
+
+Note that the `rules.backendRefs` field describes which InferencePool should receive the forwarded traffic when the path matches the corresponding path prefix. 
This is very similar to how we configure a Gateway with an HTTPRoute that directs traffic to a Service (a way to select Pods and specify a port). The InferencePool provides an abstraction over a set of compute resources (model server pods), and allows the controller to implement specialized routing strategies for these inference workloads.
+
+## Building the Gateway controller
+The general idea of implementing a Gateway controller supporting the InferencePool involves two major steps:
+
+1. Tracking the endpoints for InferencePool backends
+2. Calling out to an extension to make intelligent routing decisions
+
+### Endpoint Tracking
+Consider a simple inference pool like this:
+```
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: vllm-llama3-8b-instruct
+spec:
+  targetPortNumber: 8000
+  selector:
+    app: vllm-llama3-8b-instruct
+  extensionRef:
+    name: vllm-llama3-8b-instruct-epp
+```
+
+There are mainly two options for how to treat the InferencePool in your controller.
+
+**Option 1: Shadow Service Creation**
+
+If your Gateway controller already handles Services as backends, you can choose to create a headless Service that mirrors the endpoints defined by the InferencePool, like this:
+
+```
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-llama3-8b-instruct-shadow-service
+spec:
+  ports:
+  - port: 54321
+    protocol: TCP
+    targetPort: 8000
+  selector:
+    app: vllm-llama3-8b-instruct
+  type: ClusterIP
+  clusterIP: None
+```
+
+The gateway controller would then treat this shadow service just like any other backend service it routes traffic to.
+
+This approach likely allows you to leverage the existing service discovery, healthcheck infrastructure, and load balancing mechanisms that your controller already supports. However, it does come with the overhead of managing additional Service objects, and hence may affect the overall latency of Gateway reconciliation.
+
+**Option 2: Tracking InferencePool Endpoints Separately**
+
+You can also choose to directly select and monitor the endpoints belonging to the InferencePool. For the simple inference pool example above, the controller would use the label `app: vllm-llama3-8b-instruct` to discover the pods matching the criteria, and get their endpoints (i.e., IP and port number). It would then need to monitor these pods for health and availability.
+
+With this approach, you can tailor the endpoint tracking and routing logic specifically to the characteristics and requirements of your InferencePool.
+
+### Callout Extension
+
+The [Endpoint Picker](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp), or EPP, is a core component of the inference extension. The primary interaction for routing requests is defined between the proxy (e.g., Envoy) and the EPP using the Envoy [external processing service protocol](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto). See the [Endpoint Picker Protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/004-endpoint-picker-protocol) for more information.
+
+#### How to Callout to EPP
+
+For each HTTP request, the proxy CAN communicate the subset of endpoints the EPP MUST pick from by setting the `x-gateway-destination-endpoint-subset` key in the filter metadata field of the ext-proc request. If this key is set, the EPP must select from this endpoint list.
If the list is empty or no endpoints are eligible, it should return a 503 error. If the key isn't set, the EPP selects from the endpoints defined by the InferencePool selector.
+
+#### Response from the extension
+
+The EPP communicates the chosen endpoint to the proxy via the `x-gateway-destination-endpoint` HTTP header and the `dynamic_metadata` field of the ext-proc response. Failure to communicate the endpoint using both methods results in a 503 error if no endpoints are ready, or a 429 error if the request should be dropped. The header and metadata values must match. In addition to the chosen endpoint, a single fallback endpoint CAN be set using the key `x-gateway-destination-endpoint-fallback` in the same metadata namespace as the one used for `x-gateway-destination-endpoint`.
+
+## Testing Tips
+
+Here are some tips for testing your controller end-to-end:
+
+- **Focus on Key Scenarios**: Cover common scenarios like creating, updating, and deleting InferencePool resources, as well as different routing rules that target InferencePool backends.
+- **Verify Routing Behaviors**: Design more complex routing scenarios and verify that requests are correctly routed to the appropriate model server pods within the InferencePool based on the InferenceModel configuration.
+- **Test Error Handling**: Verify that the controller correctly handles scenarios like unsupported model names or resource constraints (if criticality-based shedding is implemented). Test with state transitions (such as constant requests while Pods behind the EPP or the InferencePool are being replaced) to ensure that the system is resilient to failures and can automatically recover by redirecting traffic to healthy Pods.
+- **Using Reference EPP Implementation + Echoserver**: You can use the [reference EPP implementation](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) for testing your controller end-to-end. Instead of a full-fledged model server, a simple mock server (like the [echoserver](https://github.com/kubernetes-sigs/ingress-controller-conformance/tree/master/images/echoserver)) can be very useful for verifying that routing sent the request to the correct pod.
+- **Performance Test**: Run end-to-end [benchmarks](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/) to make sure that your inference gateway can achieve the desired latency target.
+
+### Conformance Tests
+
+See [Conformance Test Setup and Execution](https://gateway-api-inference-extension.sigs.k8s.io/guides/conformance-tests).
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 7fdb211cd..38b631d25 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -1,10 +1,10 @@
-# Getting started with Gateway API Inference Extension
+# Getting started with an Inference Gateway
 
 ??? example "Experimental"
 
     This project is still in an alpha state and breaking changes may occur in the future.
 
-This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running!
+This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get an Inference Gateway up and running!
## **Prerequisites** @@ -18,22 +18,26 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy Sample Model Server - Two options are supported for running the model server: + Three options are supported for running the model server: - 1. GPU-based model server. + 1. GPU-based model server. Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). - 1. CPU-based model server (not using GPUs). - The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). + 1. CPU-based model server (not using GPUs). + The sample uses the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). - Choose one of these options and follow the steps below. Please do not deploy both, as the deployments have the same name and will override each other. + 1. [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) model server (not using GPUs). + The sample is configured to simulate the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model. + + Choose one of these options and follow the steps below. Please do not deploy more than one, as the deployments have the same name and will override each other. === "GPU-Based Model Server" For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. Create a Hugging Face secret to download the model [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct). Ensure that the token grants access to this model. - + Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. + ```bash kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to the set of Llama models kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml @@ -42,24 +46,35 @@ This quickstart guide is intended for engineers familiar with k8s and model serv === "CPU-Based Model Server" This setup is using the formal `vllm-cpu` image, which according to the documentation can run vLLM on x86 CPU platform. - For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. - + For this setup, we use approximately 9.5GB of memory and 12 CPUs for each replica. + While it is possible to deploy the model server with less resources, this is not recommended. For example, in our tests, loading the model using 8GB of memory and 1 CPU was possible but took almost 3.5 minutes and inference requests took unreasonable time. In general, there is a tradeoff between the memory and CPU we allocate to our pods and the performance. The more memory and CPU we allocate the better performance we can get. - - After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially may even get better response times. For modifying the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed. 
+
+    After running multiple configurations of these values we decided in this sample to use 9.5GB of memory and 12 CPUs for each replica, which gives reasonable response times. You can increase those numbers and potentially get even better response times. To modify the allocated resources, adjust the numbers in [cpu-deployment.yaml](https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml) as needed.
 
    Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
+
    ```bash
    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml
    ```
 
+=== "vLLM Simulator Model Server"
+
+    This option uses the [vLLM simulator](https://github.com/llm-d/llm-d-inference-sim/tree/main) to simulate a backend model server.
+    This setup uses the least amount of compute resources, does not require GPUs, and is ideal for test/dev environments.
+
+    To deploy the vLLM simulator, run the following command.
+
+    ```bash
+    kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/sim-deployment.yaml
+    ```
+
 ### Install the Inference Extension CRDs
 
 === "Latest Release"
 
    ```bash
-   VERSION=v0.2.0
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/latest/download/manifests.yaml
    ```
 
 === "Dev Version"
@@ -70,26 +85,25 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 
 ### Deploy InferenceModel
 
-   Deploy the sample InferenceModel which is configured to load balance traffic between the `food-review-0` and `food-review-1`
-   [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
+   Deploy the sample InferenceModel which is configured to forward traffic to the `food-review-1` [LoRA adapter](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
 
    ```bash
   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml
   ```
 
-### Deploy the InferencePool and Extension
+### Deploy the InferencePool and Endpoint Picker Extension
 
   ```bash
   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml
   ```
 
-### Deploy Inference Gateway
+### Deploy an Inference Gateway
 
   Choose one of the following options to deploy an Inference Gateway.
 
=== "GKE"
 
-   1. Enable the Gateway API and configure proxy-only subnets when necessary. See [Deploy Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways)
+   1. Enable the Gateway API and configure proxy-only subnets when necessary. See [Deploy Gateways](https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways)
      for detailed instructions.
 
    1. Deploy Gateway and HealthCheckPolicy resources
 
@@ -120,13 +134,13 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 
    5. Given that the default connection timeout may be insufficient for most inference workloads, it is recommended to configure a timeout appropriate for your intended use case.
-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml
-   ```
+      ```bash
+      kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml
+      ```
 
=== "Istio"
 
-   Please note that this feature is currently in an experimental phase and is not intended for production use.
+   Please note that this feature is currently in an experimental phase and is not intended for production use.
    The implementation and user experience are subject to changes as we continue to iterate on this project.
 
   1. Requirements
 
     - Gateway API [CRDs](https://gateway-api.sigs.k8s.io/guides/#installing-gateway-api) installed.
 
   2. Install Istio
-
+   
      ```
-     TAG=1.26-alpha.80c74f7f43482c226f4f4b10b4dda6261b67a71f
+     TAG=1.27-alpha.0551127f00634403cddd4634567e65a8ecc499a7
      # on Linux
      wget https://storage.googleapis.com/istio-build/dev/$TAG/istioctl-$TAG-linux-amd64.tar.gz
      tar -xvf istioctl-$TAG-linux-amd64.tar.gz
@@ -165,7 +179,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    5. Label the gateway
 
      ```bash
-     kubectl label gateway llm-gateway istio.io/enable-inference-extproc=true
+     kubectl label gateway inference-gateway istio.io/enable-inference-extproc=true
      ```
 
      Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
@@ -201,7 +215,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
    2. Set the Kgateway version and install the Kgateway CRDs.
 
      ```bash
-     KGTW_VERSION=v2.0.0-rc.2
+     KGTW_VERSION=v2.0.3
      helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds
      ```
 
@@ -240,24 +254,40 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
 
    Wait until the gateway is ready.
 
-   ```bash
-   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
-   PORT=80
-
-   curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-   "model": "food-review",
-   "prompt": "Write as if you were a critic: San Francisco",
-   "max_tokens": 100,
-   "temperature": 0
-   }'
-   ```
+=== "GPU-Based Model Server"
+
+    ```bash
+    IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+    PORT=80
+
+    curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+    "model": "food-review",
+    "prompt": "Write as if you were a critic: San Francisco",
+    "max_tokens": 100,
+    "temperature": 0
+    }'
+    ```
+
+=== "CPU-Based Model Server"
+
+    ```bash
+    IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+    PORT=80
+
+    curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+    "model": "Qwen/Qwen2.5-1.5B-Instruct",
+    "prompt": "Write as if you were a critic: San Francisco",
+    "max_tokens": 100,
+    "temperature": 0
+    }'
+    ```
 
 ### Cleanup
 
-   The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide.
+   The following instructions assume you would like to clean up ALL resources that were created in this quickstart guide.
    Please be careful not to delete resources you'd like to keep.
 
-   1. Uninstall the Inference Pool
+   1. 
Uninstall the InferencePool, InferenceModel, and model server resources
 
      ```bash
      kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml --ignore-not-found
@@ -267,7 +297,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
      kubectl delete secret hf-token --ignore-not-found
      ```
 
-   1. Uninstall the Gateway
+   1. Uninstall the Gateway API resources
 
      ```bash
      kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found
@@ -281,8 +311,53 @@ This quickstart guide is intended for engineers familiar with k8s and model serv
      kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml --ignore-not-found
      ```
 
-   1. Uninstall the CRDs
+   1. Uninstall the Gateway API Inference Extension CRDs
 
      ```bash
      kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found
     ```
+
+   1. Choose one of the following options to clean up the Inference Gateway.
+
+=== "GKE"
+
+    No further cleanup is needed.
+
+=== "Istio"
+
+    The following instructions assume you would like to clean up ALL Istio resources that were created in this quickstart guide.
+
+    1. Uninstall all Istio resources
+
+       ```bash
+       istioctl uninstall -y --purge
+       ```
+
+    1. Remove the Istio namespace
+
+       ```bash
+       kubectl delete ns istio-system
+       ```
+
+
+=== "Kgateway"
+
+    The following instructions assume you would like to clean up ALL Kgateway resources that were created in this quickstart guide.
+
+    1. Uninstall Kgateway
+
+       ```bash
+       helm uninstall kgateway -n kgateway-system
+       ```
+
+    1. Uninstall the Kgateway CRDs.
+
+       ```bash
+       helm uninstall kgateway-crds -n kgateway-system
+       ```
+
+    1. Remove the Kgateway namespace.
+
+       ```bash
+       kubectl delete ns kgateway-system
+       ```
diff --git a/site-src/guides/inferencepool-rollout.md b/site-src/guides/inferencepool-rollout.md
new file mode 100644
index 000000000..89a384ab4
--- /dev/null
+++ b/site-src/guides/inferencepool-rollout.md
@@ -0,0 +1,379 @@
+# InferencePool Rollout
+The goal of this guide is to show you how to perform incremental roll out operations,
+which gradually deploy new versions of your inference infrastructure.
+You can update an InferencePool with minimal service disruption.
+This page also provides guidance on traffic splitting and rollbacks to help ensure reliable deployments for InferencePool rollouts.
+
+InferencePool rollout is a powerful technique for performing various infrastructure and model updates with minimal disruption and built-in rollback capabilities.
+This method allows you to introduce changes incrementally, monitor their impact, and revert to the previous state if necessary.
+
+## Use Cases
+Use cases for InferencePool rollout:
+
+- Node (compute, accelerator) update roll out
+- Base model roll out
+- Model server framework rollout
+
+### Node (compute, accelerator) update roll out
+Node update roll outs safely migrate inference workloads to new node hardware or accelerator configurations.
+This process happens in a controlled manner without interrupting model service.
+Use node update roll outs to minimize service disruption during hardware upgrades, driver updates, or security issue resolution.
+
+### Base model roll out
+Base model updates roll out in phases to a new base LLM, retaining compatibility with existing LoRA adapters.
+You can use base model update roll outs to upgrade to improved model architectures or to address model-specific issues.
+
+### Model server framework rollout
+Model server framework rollouts enable the seamless deployment of new versions or entirely different serving frameworks,
+like updating from an older vLLM version to a newer one, or even migrating from a custom serving solution to a managed one.
+This type of rollout is critical for introducing performance enhancements, new features, or security patches within the serving layer itself,
+without requiring changes to the underlying base models or application logic. By incrementally rolling out framework updates,
+teams can ensure stability and performance, quickly identifying and reverting any regressions before they impact the entire inference workload.
+
+## How to do InferencePool rollout
+
+1. **Deploy new infrastructure**: Create a new InferencePool configured with the new node (compute/accelerator) / model server / base model that you chose.
+1. **Configure traffic splitting**: Use an HTTPRoute to split traffic between the existing InferencePool and the new InferencePool. The `backendRefs.weight` field controls the traffic percentage allocated to each pool.
+1. **Maintain InferenceModel integrity**: Retain the existing InferenceModel configuration to ensure uniform model behavior across node configurations, base model versions, and model server versions.
+1. **Preserve rollback capability**: Retain the original nodes and InferencePool during the roll out to facilitate a rollback if necessary.
+
+## Example
+This is an example of an InferencePool rollout with a node (compute, accelerator) update roll out.
+
+### Prerequisites
+Follow the steps in the [main guide](index.md).
+
+### Deploy new infrastructure
+You start with an existing InferencePool named vllm-llama3-8b-instruct.
+To replace the original InferencePool, you create a new InferencePool named vllm-llama3-8b-instruct-new along with
+InferenceModels and an Endpoint Picker Extension, configured with the updated node specification of the `nvidia-h100-80gb` accelerator type:
+
+```yaml
+kubectl apply -f - <
+| **Metric name** | **Metric Type** | **Description** | **Labels** | **Status** |
+|:----------------|:----------------|:----------------|:-----------|:-----------|
+| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_error_total | Counter | The counter of request errors broken out for each model. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| normalized_time_per_output_token_seconds | Distribution | Distribution of NTPOT (response latency per output token). | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=<model-name> | ALPHA |
+| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
+| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA |
+| inference_pool_per_pod_queue_size | Gauge | The total queue size for each model server pod in the inference pool. | `model_server_pod`=<model-server-pod-name> <br/> `name`=<inference-pool-name> | ALPHA |
+| inference_pool_ready_pods | Gauge | The number of ready pods for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
+| inference_extension_info | Gauge | General information about the current build. | `commit`=<hash-of-the-build> <br/> `build_ref`=<ref-to-the-build> | ALPHA |
+
+### Dynamic LoRA Adapter Sidecar
+
+| **Metric name** | **Metric Type** | **Description** | **Labels** | **Status** |
+|:----------------|:----------------|:----------------|:-----------|:-----------|
+| lora_syncer_adapter_status | Gauge | Status of LoRA adapters (1=loaded, 0=not_loaded) | `adapter_name`=<adapter-id> | ALPHA |
+
+## Scrape Metrics & Pprof profiles
+
+The metrics endpoints are exposed on different ports by default:
+
+- EPP exposes the metrics endpoint at port 9090
+- Dynamic LoRA adapter sidecar exposes the metrics endpoint at port 8080
+
+To scrape metrics, the client needs a ClusterRole with the following rule:
+`nonResourceURLs: "/metrics", verbs: get`.
+
+Here is one example if the client needs to mount the secret to act as the service account:
+```
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: inference-gateway-metrics-reader
+rules:
+- nonResourceURLs:
+  - /metrics
+  - /debug/pprof/*
+  verbs:
+  - get
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: inference-gateway-sa-metrics-reader
+  namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: inference-gateway-sa-metrics-reader-role-binding
+  namespace: default
+subjects:
+- kind: ServiceAccount
+  name: inference-gateway-sa-metrics-reader
+  namespace: default
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: inference-gateway-metrics-reader
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: inference-gateway-sa-metrics-reader-secret
+  namespace: default
+  annotations:
+    kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader
+type: kubernetes.io/service-account-token
+```
+
+Then, you can curl the appropriate port as follows. For EPP (port 9090):
+
+```
+TOKEN=$(kubectl -n default get secret inference-gateway-sa-metrics-reader-secret -o jsonpath='{.secrets[0].name}' -o jsonpath='{.data.token}' | base64 --decode)
+
+kubectl -n default port-forward inference-gateway-ext-proc-pod-name 9090
+
+curl -H "Authorization: Bearer $TOKEN" localhost:9090/metrics
+```
+
+### Pprof profiles
+
+Currently only the [predefined profiles](https://pkg.go.dev/runtime/pprof#Profile) are supported; CPU profiling would require code changes. Assuming the EPP has been port-forwarded as in the above example, to get the PNG display of the `heap` profile simply run:
+
+```
+PROFILE_NAME=heap
+curl -H "Authorization: Bearer $TOKEN" localhost:9090/debug/pprof/$PROFILE_NAME -o profile.out
+go tool pprof -png profile.out
+```
+
+## Prometheus Alerts
+
+This section describes how to configure Prometheus alerts using the collected metrics.
+
+### Configure alerts
+
+You can follow this [blog post](https://grafana.com/blog/2020/02/25/step-by-step-guide-to-setting-up-prometheus-alertmanager-with-slack-pagerduty-and-gmail/) for instructions on setting up alerts in your monitoring stack with Prometheus.
+
+A template alert rule is available at [alert.yaml](../../tools/alerts/alert.yaml). You can modify and append these rules to your existing Prometheus deployment.
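+
+If you manage these rules as a file, you can optionally validate the rule file before loading it,
+for example with Prometheus's `promtool` (assuming it is installed locally):
+
+```bash
+promtool check rules alert.yaml
+```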
+
+#### High Inference Request Latency P99
+
+```yaml
+alert: HighInferenceRequestLatencyP99
+expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0 # Adjust threshold as needed (e.g., 10.0 seconds)
+for: 5m
+annotations:
+  title: 'High latency (P99) for model {% raw %}{{ $labels.model_name }}{% endraw %}'
+  description: 'The 99th percentile request duration for model {% raw %}{{ $labels.model_name }}{% endraw %} and target model {% raw %}{{ $labels.target_model_name }}{% endraw %} has been consistently above 10.0 seconds for 5 minutes.'
+labels:
+  severity: 'warning'
+```
+
+#### High Inference Error Rate
+
+```yaml
+alert: HighInferenceErrorRate
+expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05 # Adjust threshold as needed (e.g., 5% error rate)
+for: 5m
+annotations:
+  title: 'High error rate for model {% raw %}{{ $labels.model_name }}{% endraw %}'
+  description: 'The error rate for model {% raw %}{{ $labels.model_name }}{% endraw %} and target model {% raw %}{{ $labels.target_model_name }}{% endraw %} has been consistently above 5% for 5 minutes.'
+labels:
+  severity: 'critical'
+  impact: 'availability'
+```
+
+#### High Inference Pool Average Queue Size
+
+```yaml
+alert: HighInferencePoolAvgQueueSize
+expr: inference_pool_average_queue_size > 50 # Adjust threshold based on expected queue size
+for: 5m
+annotations:
+  title: 'High average queue size for inference pool {% raw %}{{ $labels.name }}{% endraw %}'
+  description: 'The average number of requests pending in the queue for inference pool {% raw %}{{ $labels.name }}{% endraw %} has been consistently above 50 for 5 minutes.'
+labels:
+  severity: 'critical'
+  impact: 'performance'
+```
+
+#### High Inference Pool Average KV Cache Utilization
+
+```yaml
+alert: HighInferencePoolAvgKVCacheUtilization
+expr: inference_pool_average_kv_cache_utilization > 0.9 # 90% utilization
+for: 5m
+annotations:
+  title: 'High KV cache utilization for inference pool {% raw %}{{ $labels.name }}{% endraw %}'
+  description: 'The average KV cache utilization for inference pool {% raw %}{{ $labels.name }}{% endraw %} has been consistently above 90% for 5 minutes, indicating potential resource exhaustion.'
+labels:
+  severity: 'critical'
+  impact: 'resource_exhaustion'
+```
diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md
deleted file mode 100644
index a781f721d..000000000
--- a/site-src/guides/metrics.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Metrics
-
-This guide describes the current state of exposed metrics and how to scrape them.
-
-## Requirements
-
-To have response metrics, ensure the body mode is set to `Buffered` or `Streamed` (this should be the default behavior for all implementations).
-
-If you want to include usage metrics for vLLM model server streaming request, send the request with `include_usage`:
-
-```
-curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-"model": "food-review",
-"prompt": "whats your fav movie?",
-"max_tokens": 10,
-"temperature": 0,
-"stream": true,
-"stream_options": {"include_usage": "true"}
-}'
-```
-
-## Exposed metrics
-
-| **Metric name** | **Metric Type** | **Description** | **Labels** | **Status** |
-|:----------------|:----------------|:----------------|:-----------|:-----------|
-| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br/> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_running_requests | Gauge | Number of running requests for each model. | `model_name`=<model-name> | ALPHA |
-| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
-| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA |
-| inference_pool_ready_pods | Gauge | The number of ready pods for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
-
-## Scrape Metrics
-
-Metrics endpoint is exposed at port 9090 by default. To scrape metrics, the client needs a ClusterRole with the following rule:
-`nonResourceURLs: "/metrics", verbs: get`.
-
-Here is one example if the client needs to mound the secret to act as the service account
-```
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRole
-metadata:
-  name: inference-gateway-metrics-reader
-rules:
-- nonResourceURLs:
-  - /metrics
-  verbs:
-  - get
----
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: inference-gateway-sa-metrics-reader
-  namespace: default
----
-apiVersion: rbac.authorization.k8s.io/v1
-kind: ClusterRoleBinding
-metadata:
-  name: inference-gateway-sa-metrics-reader-role-binding
-  namespace: default
-subjects:
-- kind: ServiceAccount
-  name: inference-gateway-sa-metrics-reader
-  namespace: default
-roleRef:
-  kind: ClusterRole
-  name: inference-gateway-metrics-reader
-  apiGroup: rbac.authorization.k8s.io
----
-apiVersion: v1
-kind: Secret
-metadata:
-  name: inference-gateway-sa-metrics-reader-secret
-  namespace: default
-  annotations:
-    kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader
-type: kubernetes.io/service-account-token
-```
-Then, you can curl the 9090 port like following
-```
-TOKEN=$(kubectl -n default get secret inference-gateway-sa-metrics-reader-secret -o jsonpath='{.secrets[0].name}' -o jsonpath='{.data.token}' | base64 --decode)
-
-kubectl -n default port-forward inference-gateway-ext-proc-pod-name 9090
-
-curl -H "Authorization: Bearer $TOKEN" localhost:9090/metrics
-```
\ No newline at end of file
diff --git a/site-src/guides/serve-multiple-genai-models.md b/site-src/guides/serve-multiple-genai-models.md
new file mode 100644
index 000000000..4eb120a4b
--- /dev/null
+++ b/site-src/guides/serve-multiple-genai-models.md
@@ -0,0 +1,74 @@
+# Serve multiple generative AI models
+A company wants to deploy multiple large language models (LLMs) to serve different workloads.
+For example, they might want to deploy a Gemma3 model for a chatbot interface and a Deepseek model for a recommendation application.
+The company needs to ensure optimal serving performance for these LLMs.
+By using an Inference Gateway, you can deploy these LLMs on your cluster with your chosen accelerator configuration in an `InferencePool`.
+You can then route requests based on the model name (such as "chatbot" and "recommender") and the `Criticality` property.
+
+## How
+The following diagram illustrates how an Inference Gateway routes requests to different models based on the model name.
+The model name is extracted by [Body-Based routing](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md)
+from the request body to a header. The header is then matched to dispatch
+requests to different `InferencePool` instances (and their EPPs).
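+For example, a request whose JSON body contains `"model": "chatbot"` would, after Body-Based routing,
+carry the header `X-Gateway-Model-Name: chatbot`, which the example `HTTPRoute` below matches on.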
+![Serving multiple generative AI models](../images/serve-mul-gen-AI-models.png)
+
+This is a conceptual example of how to use the `HTTPRoute` object to route requests with a model name like “chatbot” or “recommender” to the corresponding `InferencePool`.
+```yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: HTTPRoute
+metadata:
+  name: routes-to-llms
+spec:
+  parentRefs:
+    - name: inference-gateway
+  rules:
+    - matches:
+        - headers:
+            - type: Exact
+              # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+              name: X-Gateway-Model-Name
+              value: chatbot
+          path:
+            type: PathPrefix
+            value: /
+      backendRefs:
+        - name: gemma3
+          kind: InferencePool
+    - matches:
+        - headers:
+            - type: Exact
+              # Body-Based routing (https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/bbr/README.md) is being used to copy the model name from the request body to the header.
+              name: X-Gateway-Model-Name
+              value: recommender
+          path:
+            type: PathPrefix
+            value: /
+      backendRefs:
+        - name: deepseek-r1
+          kind: InferencePool
+```
+
+## Try it out
+
+1. Get the gateway IP:
+```bash
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
+```
+2. Send a few requests to model "chatbot" as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "chatbot",
+"prompt": "What is the color of the sky",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
+3. Send a few requests to model "recommender" as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "recommender",
+"prompt": "Give me restaurant recommendations in Paris",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
diff --git a/site-src/guides/serve-multiple-lora-adapters.md b/site-src/guides/serve-multiple-lora-adapters.md
new file mode 100644
index 000000000..d1090b3aa
--- /dev/null
+++ b/site-src/guides/serve-multiple-lora-adapters.md
@@ -0,0 +1,100 @@
+# Serve LoRA adapters on a shared pool
+A company wants to serve LLMs for document analysis and focuses on audiences in multiple languages, such as English and Spanish.
+They have a fine-tuned LoRA adapter for each language, but need to efficiently use their GPU and TPU capacity.
+You can use an Inference Gateway to deploy dynamic LoRA fine-tuned adapters for each language (for example, `english-bot` and `spanish-bot`) on a common base model and accelerator.
+This lets you reduce the number of required accelerators by densely packing multiple models in a shared pool.
+
+## How
+The following diagram illustrates how an Inference Gateway serves multiple LoRA adapters on a shared pool.
+![Serving LoRA adapters on a shared pool](../images/serve-LoRA-adapters.png)
+This example illustrates how you can densely serve multiple LoRA adapters with distinct workload performance objectives on a common InferencePool.
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  name: gemma3
+spec:
+  selector:
+    pool: gemma3
+```
+Let us say we have a couple of LoRA adapters named “english-bot” and “spanish-bot” for the Gemma3 base model.
+You can create an `InferenceModel` resource and associate these LoRA adapters with the relevant InferencePool resource.
+In this case, we associate these LoRA adapters with the gemma3 InferencePool resource created above.
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: english-bot
+spec:
+  modelName: english-bot
+  criticality: Standard
+  poolRef:
+    name: gemma3
+
+---
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferenceModel
+metadata:
+  name: spanish-bot
+spec:
+  modelName: spanish-bot
+  criticality: Critical
+  poolRef:
+    name: gemma3
+
+```
+Now, you can route your requests from the gateway using the `HTTPRoute` object.
+```yaml
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: inference-gateway
+spec:
+  listeners:
+    - protocol: HTTP
+      port: 80
+      name: http
+
+---
+kind: HTTPRoute
+apiVersion: gateway.networking.k8s.io/v1
+metadata:
+  name: routes-to-llms
+spec:
+  parentRefs:
+    - name: inference-gateway
+  rules:
+    - matches:
+        - path:
+            type: PathPrefix
+            value: /
+      backendRefs:
+        - name: gemma3
+          kind: InferencePool
+```
+
+## Try it out
+
+1. Get the gateway IP:
+```bash
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80
+```
+2. Send a few requests to model "english-bot" as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "english-bot",
+"prompt": "What is the color of the sky",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
+3. Send a few requests to model "spanish-bot" as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "spanish-bot",
+"prompt": "¿De qué color es...?",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
\ No newline at end of file
diff --git a/site-src/images/favicon-64.png b/site-src/images/favicon-64.png
new file mode 100644
index 000000000..f2bd3d64a
Binary files /dev/null and b/site-src/images/favicon-64.png differ
diff --git a/site-src/images/logo/logo-text-xl-dark.png b/site-src/images/logo/logo-text-xl-dark.png
new file mode 100644
index 000000000..4d878e5c8
Binary files /dev/null and b/site-src/images/logo/logo-text-xl-dark.png differ
diff --git a/site-src/images/serve-LoRA-adapters.png b/site-src/images/serve-LoRA-adapters.png
new file mode 100644
index 000000000..e33dc708a
Binary files /dev/null and b/site-src/images/serve-LoRA-adapters.png differ
diff --git a/site-src/images/serve-mul-gen-AI-models.png b/site-src/images/serve-mul-gen-AI-models.png
new file mode 100644
index 000000000..957a054f1
Binary files /dev/null and b/site-src/images/serve-mul-gen-AI-models.png differ
diff --git a/site-src/implementations.md b/site-src/implementations.md
deleted file mode 100644
index 89acb4367..000000000
--- a/site-src/implementations.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Implementations
-
-This project has several implementations that are planned or in progress:
-
-* [Envoy Gateway][1]
-* [Kgateway][2]
-* [Google Kubernetes Engine][3]
-
-[1]:#envoy-gateway
-[2]:#kgateway
-[3]:#google-kubernetes-engine
-
-## Envoy Gateway
-
-[Envoy Gateway][eg-home] is an [Envoy][envoy-org] subproject for managing
-Envoy-based application gateways. The supported APIs and fields of the Gateway
-API are outlined [here][eg-supported]. Use the [quickstart][eg-quickstart] to
-get Envoy Gateway running with Gateway API in a few simple steps.
-
-Progress towards supporting this project is tracked with a [GitHub
-Issue](https://github.com/envoyproxy/gateway/issues/4423).
-
-[eg-home]:https://gateway.envoyproxy.io/
-[envoy-org]:https://github.com/envoyproxy
-[eg-supported]:https://gateway.envoyproxy.io/docs/tasks/quickstart/
-[eg-quickstart]:https://gateway.envoyproxy.io/docs/tasks/quickstart
-
-## Kgateway
-
-[Kgateway](https://kgateway.dev/) is a feature-rich, Kubernetes-native
-ingress controller and next-generation API gateway. Kgateway brings the
-full power and community support of Gateway API to its existing control-plane
-implementation.
-
-Progress towards supporting this project is tracked with a [GitHub
-Issue](https://github.com/kgateway-dev/kgateway/issues/10411).
-
-## Google Kubernetes Engine
-
-[Google Kubernetes Engine (GKE)][gke] is a managed Kubernetes platform offered
-by Google Cloud. GKE's implementation of the Gateway API is through the [GKE
-Gateway controller][gke-gateway] which provisions Google Cloud Load Balancers
-for Pods in GKE clusters.
-
-The GKE Gateway controller supports weighted traffic splitting, mirroring,
-advanced routing, multi-cluster load balancing and more. See the docs to deploy
-[private or public Gateways][gke-gateway-deploy] and also [multi-cluster
-Gateways][gke-multi-cluster-gateway].
-
-Progress towards supporting this project is tracked with a [GitHub
-Issue](https://github.com/GoogleCloudPlatform/gke-gateway-api/issues/20).
-
-[gke]:https://cloud.google.com/kubernetes-engine
-[gke-gateway]:https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api
-[gke-gateway-deploy]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways
-[gke-multi-cluster-gateway]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-multi-cluster-gateways
diff --git a/site-src/implementations/gateways.md b/site-src/implementations/gateways.md
new file mode 100644
index 000000000..950c0833e
--- /dev/null
+++ b/site-src/implementations/gateways.md
@@ -0,0 +1,88 @@
+# Gateway Implementations
+
+This project has several implementations that are planned or in progress:
+
+* [Envoy AI Gateway][1]
+* [Kgateway][2]
+* [Google Kubernetes Engine][3]
+* [Istio][4]
+* [Alibaba Cloud Container Service for Kubernetes][5]
+
+[1]:#envoy-ai-gateway
+[2]:#kgateway
+[3]:#google-kubernetes-engine
+[4]:#istio
+[5]:#alibaba-cloud-container-service-for-kubernetes
+
+## Envoy AI Gateway
+
+[Envoy AI Gateway][aigw-home] is an open source project built on top of
+[Envoy][envoy-org] and [Envoy Gateway][envoy-gateway] to handle request traffic
+from application clients to GenAI services. The features and capabilities are outlined [here][aigw-capabilities]. Use the [quickstart][aigw-quickstart] to get Envoy AI Gateway running with Gateway API in a few simple steps.
+
+Progress towards supporting this project is tracked with a [GitHub
+Issue](https://github.com/envoyproxy/ai-gateway/issues/423).
+
+[aigw-home]:https://aigateway.envoyproxy.io/
+[envoy-org]:https://github.com/envoyproxy
+[envoy-gateway]: https://gateway.envoyproxy.io/
+[aigw-capabilities]:https://aigateway.envoyproxy.io/docs/capabilities/
+[aigw-quickstart]:https://aigateway.envoyproxy.io/docs/capabilities/gateway-api-inference-extension
+
+## Kgateway
+
+[Kgateway](https://kgateway.dev/) is a feature-rich, Kubernetes-native
+ingress controller and next-generation API gateway. Kgateway brings the
+full power and community support of Gateway API to its existing control-plane
+implementation.
+
+Progress towards supporting this project is tracked with a [GitHub
+Issue](https://github.com/kgateway-dev/kgateway/issues/10411).
+
+## Google Kubernetes Engine
+
+[Google Kubernetes Engine (GKE)][gke] is a managed Kubernetes platform offered
+by Google Cloud. GKE's implementation of the Gateway API is through the [GKE
+Gateway controller][gke-gateway] which provisions Google Cloud Load Balancers
+for Pods in GKE clusters.
+
+The GKE Gateway controller supports weighted traffic splitting, mirroring,
+advanced routing, multi-cluster load balancing and more. See the docs to deploy
+[private or public Gateways][gke-gateway-deploy] and also [multi-cluster
+Gateways][gke-multi-cluster-gateway].
+
+Progress towards supporting this project is tracked with a [GitHub
+Issue](https://github.com/GoogleCloudPlatform/gke-gateway-api/issues/20).
+
+[gke]:https://cloud.google.com/kubernetes-engine
+[gke-gateway]:https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api
+[gke-gateway-deploy]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways
+[gke-multi-cluster-gateway]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-multi-cluster-gateways
+
+## Istio
+
+[Istio](https://istio.io/) is an open source service mesh and gateway implementation.
+It provides a fully compliant implementation of the Kubernetes Gateway API for cluster ingress traffic control.
+For service mesh users, Istio also fully supports east-west (including [GAMMA](https://gateway-api.sigs.k8s.io/mesh/)) traffic management within the mesh.
+
+Gateway API Inference Extension support is being tracked by this [GitHub
+Issue](https://github.com/istio/istio/issues/55768).
+
+## Alibaba Cloud Container Service for Kubernetes
+
+[Alibaba Cloud Container Service for Kubernetes (ACK)][ack] is a managed Kubernetes platform
+offered by Alibaba Cloud. The implementation of the Gateway API in ACK is through the
+[ACK Gateway with Inference Extension][ack-gie] component, which introduces model-aware,
+GPU-efficient load balancing for AI workloads beyond basic HTTP routing.
+
+The ACK Gateway with Inference Extension implements the Gateway API Inference Extension
+and provides optimized routing for serving generative AI workloads,
+including weighted traffic splitting, mirroring, and advanced routing.
+See the [usage docs][ack-gie-usage].
+
+Progress towards supporting Gateway API Inference Extension is being tracked
+by [this Issue](https://github.com/AliyunContainerService/ack-gateway-api/issues/1).
+
+[ack]:https://www.alibabacloud.com/help/en/ack
+[ack-gie]:https://www.alibabacloud.com/help/en/ack/product-overview/ack-gateway-with-inference-extension
+[ack-gie-usage]:https://www.alibabacloud.com/help/en/ack/ack-managed-and-ack-dedicated/user-guide/intelligent-routing-and-traffic-management-with-ack-gateway-inference-extension
\ No newline at end of file
diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md
new file mode 100644
index 000000000..7ec824bc5
--- /dev/null
+++ b/site-src/implementations/model-servers.md
@@ -0,0 +1,38 @@
+
+
+# Supported Model Servers
+
+Any model server that conforms to the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) is supported by the inference extension.
+
+## Compatible Model Server Versions
+
+| Model Server          | Version                                                                                                                | Commit                                                                                                                              | Notes                                                                                                         |
+| --------------------- | ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- |
+| vLLM V0               | v0.6.4 and above                                                                                                       | [commit 0ad216f](https://github.com/vllm-project/vllm/commit/0ad216f5750742115c686723bf38698372d483fd)                              |                                                                                                               |
+| vLLM V1               | v0.8.0 and above                                                                                                       | [commit bc32bc7](https://github.com/vllm-project/vllm/commit/bc32bc73aad076849ac88565cff745b01b17d89c)                              |                                                                                                               |
+| Triton (TensorRT-LLM) | [25.03](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-25-03.html#rel-25-03) and above | [commit 15cb989](https://github.com/triton-inference-server/tensorrtllm_backend/commit/15cb989b00523d8e92dce5165b9b9846c047a70d)    | The LoRA affinity feature is not available, as the required LoRA metrics haven't been implemented in Triton yet. [Feature request](https://github.com/triton-inference-server/server/issues/8181) |
+
+## vLLM
+
+vLLM is configured as the default in the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp). No further configuration is required.
+
+## Triton with TensorRT-LLM Backend
+
+Triton-specific metric names need to be specified when starting the EPP.
+
+### Option 1: Use Helm
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details.
+
+### Option 2: Edit EPP deployment yaml
+
+ Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32):
+
+ ```
+- -totalQueuedRequestsMetric
+- "nv_trt_llm_request_metrics{request_type=waiting}"
+- -kvCacheUsagePercentageMetric
+- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+- -loraInfoMetric
+- "" # Set an empty metric to disable LoRA metric scraping, as they are not supported by Triton yet.
+```
\ No newline at end of file
diff --git a/site-src/index.md b/site-src/index.md
index 04d1fadb8..7a2a116bf 100644
--- a/site-src/index.md
+++ b/site-src/index.md
@@ -1,8 +1,6 @@
 # Introduction
 
-Gateway API Inference Extension is an official Kubernetes project focused on
-extending [Gateway API](https://gateway-api.sigs.k8s.io/) with inference
-specific routing extensions.
+Gateway API Inference Extension is an official Kubernetes project that optimizes self-hosting Generative Models on Kubernetes.
 
 The overall resource model focuses on 2 new inference-focused
 [personas](/concepts/roles-and-personas) and corresponding resources that
@@ -11,20 +9,49 @@ they are expected to manage:
 
 Gateway API Inference Extension Resource Model
 
+## Concepts and Definitions
+
+The following terms are specific to this project:
+
+- **Inference Gateway**: A proxy/load-balancer that has been coupled with the
+  Endpoint Picker extension.
It provides optimized routing and load balancing for
+  serving Kubernetes self-hosted generative Artificial Intelligence (AI)
+  workloads. It simplifies the deployment, management, and observability of AI
+  inference workloads.
+- **Inference Scheduler**: An extendable component that makes decisions about which endpoint is optimal (best cost /
+  best performance) for an inference request based on `Metrics and Capabilities`
+  from [Model Serving](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol/README.md).
+- **Metrics and Capabilities**: Data provided by model serving platforms about
+  performance, availability and capabilities to optimize routing. Includes
+  things like [Prefix Cache] status or [LoRA Adapters] availability.
+- **Endpoint Picker (EPP)**: An implementation of an `Inference Scheduler` with additional Routing, Flow, and Request Control layers to allow for sophisticated routing strategies. Additional info on the architecture of the EPP is available [here](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal).
+
+[Inference Gateway]:#concepts-and-definitions
+
 ## Key Features
-Gateway API Inference Extension, along with a reference implementation in Envoy Proxy, provides the following key features:
+Gateway API Inference Extension optimizes self-hosting Generative AI Models on Kubernetes by
+providing optimized load-balancing for self-hosted Generative AI Models.
+The project’s goal is to improve and standardize routing to inference workloads across the ecosystem.
+
+This is achieved by leveraging Envoy's [External Processing](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) to extend any gateway that supports both ext-proc and [Gateway API](https://github.com/kubernetes-sigs/gateway-api) into an [inference gateway](#concepts-and-definitions).
+This extension turns popular gateways like Envoy Gateway, kgateway, and GKE Gateway into [Inference Gateways](#concepts-and-definitions),
+supporting inference platform teams self-hosting Generative Models (with a current focus on large language models) on Kubernetes.
+This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat)
+to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers
+in higher-level **AI Gateways** like [LiteLLM](https://www.litellm.ai/), [Gloo AI Gateway](https://www.solo.io/products/gloo-ai-gateway), or [Apigee](https://cloud.google.com/apigee).
 
-- **Model-aware routing**: Instead of simply routing based on the path of the request, Gateway API Inference Extension allows you to route to models based on the model names. This is enabled by support for GenAI Inference API specifications (such as OpenAI API) in the gateway implementations such as in Envoy Proxy. This model-aware routing also extends to Low-Rank Adaptation (LoRA) fine-tuned models.
-- **Serving priority**: Gateway API Inference Extension allows you to specify the serving priority of your models. For example, you can specify that your models for online inference of chat tasks (which is more latency sensitive) have a higher [*Criticality*](/reference/spec/#criticality) than a model for latency tolerant tasks such as a summarization.
+- **Model-aware routing**: Instead of simply routing based on the path of the request, an **[inference gateway]** allows you to route to models based on the model names. This is enabled by support for GenAI Inference API specifications (such as OpenAI API) in the gateway implementations such as in Envoy Proxy. This model-aware routing also extends to Low-Rank Adaptation (LoRA) fine-tuned models. -- **Model rollouts**: Gateway API Inference Extension allows you to incrementally roll out new model versions by traffic splitting definitions based on the model names. +- **Serving priority**: an **[inference gateway]** allows you to specify the serving priority of your models. For example, you can specify that your models for online inference of chat tasks (which is more latency sensitive) have a higher [*Criticality*](/reference/spec/#criticality) than a model for latency tolerant tasks such as a summarization. -- **Extensibility for Inference Services**: Gateway API Inference Extension defines extensibility pattern for additional Inference services to create bespoke routing capabilities should out of the box solutions not fit your needs. +- **Model rollouts**: an **[inference gateway]** allows you to incrementally roll out new model versions by traffic splitting definitions based on the model names. +- **Extensibility for Inference Services**: an **[inference gateway]** defines extensibility pattern for additional Inference services to create bespoke routing capabilities should out of the box solutions not fit your needs. -- **Customizable Load Balancing for Inference**: Gateway API Inference Extension defines a pattern for customizable load balancing and request routing that is optimized for Inference. Gateway API Inference Extension provides a reference implementation of model endpoint picking leveraging metrics emitted from the model servers. This endpoint picking mechanism can be used in lieu of traditional load balancing mechanisms. Model Server-aware load balancing ("smart" load balancing as its sometimes referred to in this repo) has been proven to reduce the serving latency and improve utilization of accelerators in your clusters. +- **Customizable Load Balancing for Inference**: an **[inference gateway]** defines a pattern for customizable load balancing and request routing that is optimized for Inference. An **[inference gateway]** provides a reference implementation of model endpoint picking leveraging metrics emitted from the model servers. This endpoint picking mechanism can be used in lieu of traditional load balancing mechanisms. Model Server-aware load balancing ("smart" load balancing as its sometimes referred to in this repo) has been proven to reduce the serving latency and improve utilization of accelerators in your clusters. +By achieving these, the project aims to reduce latency and improve accelerator (GPU) utilization for AI workloads. ## API Resources @@ -42,13 +69,11 @@ that are relevant to this project: Gateway API has [more than 25 implementations](https://gateway-api.sigs.k8s.io/implementations/). As this pattern stabilizes, we expect a wide set of these implementations to support -this project. +this project to become an **[inference gateway]** -### Endpoint Selection Extension +### Endpoint Picker -As part of this project, we're building an initial reference extension. Over -time, we hope to see a wide variety of extensions emerge that follow this -pattern and provide a wide range of choices. +As part of this project, we've built the Endpoint Picker. 
A pluggable & extensible ext-proc deployment that implements [this architecture](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/0683-epp-architecture-proposal). ### Model Server Frameworks @@ -73,16 +98,16 @@ to any Gateway API users or implementers. 2. If the request should be routed to an InferencePool, the Gateway will forward the request information to the endpoint selection extension for that pool. -3. The extension will fetch metrics from whichever portion of the InferencePool +3. The inference gateway will fetch metrics from whichever portion of the InferencePool endpoints can best achieve the configured objectives. Note that this kind of -metrics probing may happen asynchronously, depending on the extension. +metrics probing may happen asynchronously, depending on the inference gateway. -4. The extension will instruct the Gateway which endpoint the request should be +4. The inference gateway will instruct the Gateway which endpoint the request should be routed to. 5. The Gateway will route the request to the desired endpoint. -Gateway API Inference Extension Request Flow +Inference Gateway Request Flow ## Who is working on Gateway API Inference Extension? @@ -91,7 +116,7 @@ This project is being driven by [WG-Serving](https://github.com/kubernetes/community/tree/master/wg-serving) [SIG-Network](https://github.com/kubernetes/community/tree/master/sig-network) to improve and standardize routing to inference workloads in Kubernetes. Check -out the [implementations reference](implementations.md) to see the latest +out the [implementations reference](implementations/gateways.md) to see the latest projects & products that support this project. If you are interested in contributing to or building an implementation using Gateway API then don’t hesitate to [get involved!](/contributing) diff --git a/site-src/performance/benchmark/index.md b/site-src/performance/benchmark/index.md index 39457bf66..42d5e727b 100644 --- a/site-src/performance/benchmark/index.md +++ b/site-src/performance/benchmark/index.md @@ -1,45 +1,49 @@ # Benchmark -This user guide shows how to run benchmarks against a vLLM deployment, by using both the Gateway API -inference extension, and a Kubernetes service as the load balancing strategy. The -benchmark uses the [Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG) -tool to generate load and collect results. +This user guide shows how to run benchmarks against a vLLM model server deployment by using both Gateway API +Inference Extension, and a Kubernetes service as the load balancing strategy. The benchmark uses the +[Latency Profile Generator](https://github.com/AI-Hypercomputer/inference-benchmark) (LPG) tool to generate +load and collect results. ## Prerequisites ### Deploy the inference extension and sample model server -Follow this user guide https://gateway-api-inference-extension.sigs.k8s.io/guides/ to deploy the -sample vLLM application, and the inference extension. +Follow the [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/#getting-started-with-gateway-api-inference-extension) +to deploy the vLLM model server, CRDs, etc. + +__Note:__ Only the GPU-based model server deployment option is supported for benchmark testing. ### [Optional] Scale the sample vLLM deployment -You will more likely to see the benefits of the inference extension when there are a decent number of replicas to make the optimal routing decision. 
+You are more likely to see the benefits of the inference extension when there are a decent number of replicas to make the optimal routing decision.

```bash
-kubectl scale --replicas=8 -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml
+kubectl scale deployment vllm-llama3-8b-instruct --replicas=8
```

### Expose the model server via a k8s service

-As the baseline, let's also expose the vLLM deployment as a k8s service:
+To establish a baseline, expose the vLLM deployment as a k8s service:

```bash
-kubectl expose -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --port=8081 --target-port=8000 --type=LoadBalancer
+kubectl expose deployment vllm-llama3-8b-instruct --port=80 --target-port=8000 --type=LoadBalancer
```

## Run benchmark

-The LPG benchmark tool works by sending traffic to the specified target IP and port, and collect results. Follow the steps below to run a single benchmark. You can deploy multiple LPG instances if you want to run benchmarks in parallel against different targets.
+The LPG benchmark tool works by sending traffic to the specified target IP and port, and collecting the results.
+Follow the steps below to run a single benchmark. Multiple LPG instances can be deployed to run benchmarks in
+parallel against different targets.

1. Check out the repo.
-    
+

    ```bash
    git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension
    cd gateway-api-inference-extension
    ```

-1. Get the target IP. Examples below show how to get the IP of a gateway or a LoadBalancer k8s service.
+1. Get the target IP. The examples below show how to get the IP of a gateway or a k8s service.

    ```bash
    # Get gateway IP
@@ -51,32 +55,43 @@ The LPG benchmark tool works by sending traffic to the specified target IP and p
    echo $SVC_IP
    ```

-1. Then update the `` in `./config/manifests/benchmark/benchmark.yaml` to your target IP. Feel free to adjust other parameters such as request_rates as well. For a complete list of LPG configurations, pls refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).
+1. Then update the `` in `./config/manifests/benchmark/benchmark.yaml` to the value of `$SVC_IP` or `$GW_IP`.
+   Feel free to adjust other parameters such as `request_rates` as well. For a complete list of LPG configurations, refer to the
+   [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark).

-1. Start the benchmark tool. `kubectl apply -f ./config/manifests/benchmark/benchmark.yaml`
+1. Start the benchmark tool.

-1. Wait for benchmark to finish and download the results. Use the `benchmark_id` environment variable
-to specify what this benchmark is for. For instance, `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it will print a log line `LPG_FINISHED`,
-the script below will watch for that log line and then start downloading results.
+   ```bash
+   kubectl apply -f ./config/manifests/benchmark/benchmark.yaml
+   ```
+
+1. Wait for the benchmark to finish and download the results. Use the `benchmark_id` environment variable to specify what this
+   benchmark is for. For instance, `inference-extension` or `k8s-svc`. When the LPG tool finishes benchmarking, it will print
+   a log line `LPG_FINISHED`. The script below will watch for that log line and then start downloading results.

   ```bash
-   benchmark_id='my-benchmark' ./tools/benchmark/download-benchmark-results.bash
+   benchmark_id='k8s-svc' ./tools/benchmark/download-benchmark-results.bash
   ```

-   1. After the script finishes, you should see benchmark results under `./tools/benchmark/output/default-run/my-benchmark/results/json` folder. Here is a [sample json file](./sample.json).
+
+   After the script finishes, you should see benchmark results under the `./tools/benchmark/output/default-run/k8s-svc/results/json` folder.
+   Here is a [sample json file](./sample.json). Replace `k8s-svc` with `inference-extension` when running an inference extension benchmark.

### Tips

+* When using a `benchmark_id` other than `k8s-svc` or `inference-extension`, the labels in `./tools/benchmark/benchmark.ipynb` must be
+  updated accordingly to analyze the results.
* You can set the `run_id="runX"` environment variable when running the `./download-benchmark-results.bash` script. This is useful when you run benchmarks multiple times to get more statistically meaningful results and group the results accordingly.
* Update the `request_rates` that best suit your benchmark environment.

### Advanced Benchmark Configurations

-Pls refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a detailed list of configuration knobs.
+Refer to the [LPG user guide](https://github.com/AI-Hypercomputer/inference-benchmark?tab=readme-ov-file#configuring-the-benchmark) for a
+detailed list of configuration knobs.

## Analyze the results

-This guide shows how to run the jupyter notebook using vscode.
+This section shows how to run the Jupyter notebook using VS Code after completing the k8s service and inference extension benchmarks.

1. Create a python virtual environment.

@@ -91,7 +106,7 @@ This guide shows how to run the jupyter notebook using vscode.
    pip install -r ./tools/benchmark/requirements.txt
    ```

-1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. At the end you should
+1. Open the notebook `./tools/benchmark/benchmark.ipynb`, and run each cell. In the last cell, update the benchmark ids with `inference-extension` and `k8s-svc`. At the end you should
see a bar chart like below where **"ie"** represents inference extension. This chart is generated using this benchmarking tool with 6 vLLM (v1) model servers (H100 80 GB), [llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/tree/main) and the [ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json).

![alt text](example-bar-chart.png)
\ No newline at end of file
diff --git a/site-src/performance/regression-testing/index.md b/site-src/performance/regression-testing/index.md
new file mode 100644
index 000000000..16b5552f5
--- /dev/null
+++ b/site-src/performance/regression-testing/index.md
@@ -0,0 +1,103 @@
+# Regression Testing
+
+Regression testing verifies that recent code changes have not adversely affected the performance or stability of the Inference Gateway.
+
+This guide explains how to run regression tests against the Gateway API inference extension using the [Latency Profile Generator (LPG)](https://github.com/AI-Hypercomputer/inference-benchmark/) to simulate traffic and collect performance metrics.
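+
+In outline, a regression run consists of two benchmark passes over the same test configuration, one from the baseline
+build and one with your changes applied, whose results are then compared. The following is a minimal sketch of a single
+pass, assuming one of the regression manifests described later in this guide; you will typically need to delete and
+re-apply the benchmark resources between passes so the load generator runs again:
+
+```bash
+# Run one benchmark pass and download its results under a distinct benchmark_id.
+kubectl apply -f ./config/manifests/regression-testing/single-workload-regression.yaml
+benchmark_id='regression-before' ./tools/benchmark/download-benchmark-results.bash
+```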
+
+## Prerequisites
+
+Refer to the [benchmark guide](/site-src/performance/benchmark/index.md) for common setup steps, including deployment of the inference extension, model server setup, scaling the vLLM deployment, and obtaining the Gateway IP.
+
+## Create the LPG Docker Image
+
+Follow the detailed instructions [here](https://github.com/AI-Hypercomputer/inference-benchmark/blob/1c92df607751a7ddb04e2152ed7f6aaf85bd9ca7/README.md) to build the LPG Docker image:
+
+* Create an artifact repository:
+
+```bash
+gcloud artifacts repositories create ai-benchmark --location=us-central1 --repository-format=docker
+```
+
+* Prepare datasets for [Infinity-Instruct](https://huggingface.co/datasets/BAAI/Infinity-Instruct) and [billsum](https://huggingface.co/datasets/FiscalNote/billsum):
+
+```bash
+pip install datasets transformers numpy pandas tqdm matplotlib
+python datasets/import_dataset.py --hf_token YOUR_TOKEN
+```
+
+* Build the benchmark Docker image:
+
+```bash
+docker build -t inference-benchmark .
+```
+
+* Push the Docker image to your artifact registry:
+
+```bash
+docker tag inference-benchmark us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+docker push us-central1-docker.pkg.dev/{project-name}/ai-benchmark/inference-benchmark
+```
+
+## Conduct Regression Tests
+
+Run benchmarks using the configurations below, which are optimized for NVIDIA H100 GPUs (80 GB). Adjust the configurations for other hardware as necessary.
+
+### Test Case 1: Single Workload
+
+- **Dataset:** `billsum_conversations.json` (created from the [HuggingFace billsum dataset](https://huggingface.co/datasets/FiscalNote/billsum)).
+  * This dataset features long prompts, making it prefill-heavy and ideal for testing scenarios that emphasize initial token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) (*critical*)
+- **Replicas:** 10 (vLLM)
+- **Request Rates:** 300–350 (increments of 10)
+
+Refer to the example manifest:
+`./config/manifests/regression-testing/single-workload-regression.yaml`
+
+### Test Case 2: Multi-LoRA
+
+- **Dataset:** `Infinity-Instruct_conversations.json` (created from the [HuggingFace Infinity-Instruct dataset](https://huggingface.co/datasets/BAAI/Infinity-Instruct)).
+  * This dataset has long outputs, making it decode-heavy and useful for testing scenarios focusing on sustained token generation.
+- **Model:** [Llama 3 (8B)](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
+- **LoRA Adapters:** 15 adapters (`nvidia/llama-3.1-nemoguard-8b-topic-control`, rank 8, critical)
+- **Hardware:** NVIDIA H100 GPUs (80 GB)
+- **Traffic Distribution:** 60% (first 5 adapters, each 12%), 30% (next 5, each 6%), 10% (last 5, each 2%), simulating prod/dev/test tiers
+- **Max LoRA:** 3
+- **Replicas:** 10 (vLLM)
+- **Request Rates:** 20–200 (increments of 20)
+
+Optionally, you can also run benchmarks using the `ShareGPT` dataset for additional coverage.
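+
+Once the deployments listed below are updated, a single phase of this test case can be launched by applying the
+example manifest and waiting for the LPG pod to report completion. A minimal sketch, assuming the deployment created
+by the manifest is named `benchmark-tool` (an assumption; check the manifest you apply for the actual name):
+
+```bash
+# Launch the multi-LoRA regression benchmark (manifest path from this guide).
+kubectl apply -f ./config/manifests/regression-testing/multi-lora-regression.yaml
+
+# The LPG tool prints LPG_FINISHED once all request rates have been benchmarked.
+# NOTE: "benchmark-tool" is an assumed deployment name; adjust to your manifest.
+kubectl logs -f deployment/benchmark-tool | grep -m 1 "LPG_FINISHED"
+```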
+
+Update deployments for multi-LoRA support:
+- vLLM Deployment: `./config/manifests/regression-testing/vllm/multi-lora-deployment.yaml`
+- InferenceModel: `./config/manifests/inferencemodel.yaml`
+
+Refer to the example manifest:
+`./config/manifests/regression-testing/multi-lora-regression.yaml`
+
+### Execute Benchmarks
+
+Run the benchmark in two phases, before and after applying your changes:
+
+- **Before changes:**
+
+```bash
+benchmark_id='regression-before' ./tools/benchmark/download-benchmark-results.bash
+```
+
+- **After changes:**
+
+```bash
+benchmark_id='regression-after' ./tools/benchmark/download-benchmark-results.bash
+```
+
+## Analyze Benchmark Results
+
+Use the provided Jupyter notebook (`./tools/benchmark/benchmark.ipynb`) to analyze results:
+
+- Update the benchmark IDs to `regression-before` and `regression-after`.
+- Compare latency and throughput metrics, performing regression analysis.
+- Check R² values specifically:
+  - **Prompts Attempted/Succeeded:** Expect R² ≈ 1
+  - **Output Tokens per Minute, P90 per Output Token Latency, P90 Latency:** Expect R² close to 1 (allow minor variance).
+
+Identify significant deviations, investigate causes, and confirm performance meets expected standards.
\ No newline at end of file
diff --git a/site-src/reference/spec.md b/site-src/reference/spec.md
index e16c113c1..263f54d44 100644
--- a/site-src/reference/spec.md
+++ b/site-src/reference/spec.md
@@ -1,12 +1,14 @@
# API Reference

## Packages
-- [inference.networking.x-k8s.io/v1alpha1](#inferencenetworkingx-k8siov1alpha1)
+- [inference.networking.x-k8s.io/v1alpha2](#inferencenetworkingx-k8siov1alpha2)


-## inference.networking.x-k8s.io/v1alpha1
+## inference.networking.x-k8s.io/v1alpha2
+
+Package v1alpha2 contains API Schema definitions for the
+inference.networking.x-k8s.io API group.

-Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API group

### Resource Types
- [InferenceModel](#inferencemodel)
@@ -18,26 +20,152 @@ Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API gr

_Underlying type:_ _string_

-Defines how important it is to serve the model compared to other models.
+Criticality defines how important it is to serve the model compared to other models.
+Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default.
+This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior.

_Validation:_
-- Enum: [Critical Default Sheddable]
+- Enum: [Critical Standard Sheddable]

_Appears in:_
- [InferenceModelSpec](#inferencemodelspec)

| Field | Description |
| --- | --- |
-| `Critical` | Most important. Requests to this band will be shed last.<br />
| -| `Default` | More important than Sheddable, less important than Critical.
Requests in this band will be shed before critical traffic.
+kubebuilder:default=Default
| -| `Sheddable` | Least important. Requests to this band will be shed before all other bands.
| +| `Critical` | Critical defines the highest level of criticality. Requests to this band will be shed last.
| +| `Standard` | Standard defines the base criticality level and is more important than Sheddable but less
important than Critical. Requests in this band will be shed before critical traffic.
Most models are expected to fall within this band.
| +| `Sheddable` | Sheddable defines the lowest level of criticality. Requests to this band will be shed before
all other bands.
| + + +#### EndpointPickerConfig + + + +EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension. +This type is intended to be a union of mutually exclusive configuration options that we may add in the future. + + + +_Appears in:_ +- [InferencePoolSpec](#inferencepoolspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
| + + +#### Extension + + + +Extension specifies how to configure an extension that runs the endpoint picker. + + + +_Appears in:_ +- [EndpointPickerConfig](#endpointpickerconfig) +- [InferencePoolSpec](#inferencepoolspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent. For example
"Service".
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
| +| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
+
+
+#### ExtensionConnection
+
+
+
+ExtensionConnection encapsulates options that configure the connection to the extension.
+
+
+
+_Appears in:_
+- [Extension](#extension)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.<br />
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
|
+
+
+#### ExtensionFailureMode
+
+_Underlying type:_ _string_
+
+ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not
+responsive.
+
+_Validation:_
+- Enum: [FailOpen FailClose]
+
+_Appears in:_
+- [Extension](#extension)
+- [ExtensionConnection](#extensionconnection)
+
+| Field | Description |
+| --- | --- |
+| `FailOpen` | FailOpen specifies that the proxy should not drop the request and should instead forward the request to an endpoint of its own picking.<br />
| +| `FailClose` | FailClose specifies that the proxy should drop the request.
| + + +#### ExtensionReference + + + +ExtensionReference is a reference to the extension deployment. + + + +_Appears in:_ +- [Extension](#extension) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent. For example
"Service".
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
| + + +#### Group + +_Underlying type:_ _string_ + +Group refers to a Kubernetes Group. It must either be an empty string or a +RFC 1123 subdomain. + +This validation is based off of the corresponding Kubernetes validation: +https://github.com/kubernetes/apimachinery/blob/02cfb53916346d085a6c6c7c66f882e3c6b0eca6/pkg/util/validation/validation.go#L208 + +Valid values include: + +* "" - empty string implies core Kubernetes API group +* "gateway.networking.k8s.io" +* "foo.example.com" + +Invalid values include: + +* "example.com/bar" - "/" is an invalid character + +_Validation:_ +- MaxLength: 253 +- Pattern: `^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + #### InferenceModel -InferenceModel is the Schema for the InferenceModels API +InferenceModel is the Schema for the InferenceModels API. @@ -45,29 +173,31 @@ InferenceModel is the Schema for the InferenceModels API | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha1` | | | +| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | | | `kind` _string_ | `InferenceModel` | | | | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | | `spec` _[InferenceModelSpec](#inferencemodelspec)_ | | | | | `status` _[InferenceModelStatus](#inferencemodelstatus)_ | | | | + + + + #### InferenceModelSpec -InferenceModelSpec represents a specific model use case. This resource is +InferenceModelSpec represents the desired state of a specific model use case. This resource is managed by the "Inference Workload Owner" persona. - -The Inference Workload Owner persona is: a team that trains, verifies, and +The Inference Workload Owner persona is someone that trains, verifies, and leverages a large language model from a model frontend, drives the lifecycle and rollout of new versions of those models, and defines the specific performance and latency goals for the model. These workloads are expected to operate within an InferencePool sharing compute capacity with other InferenceModels, defined by the Inference Platform Admin. - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, if the name is reused, an error will be shown on the status of a InferenceModel that attempted to reuse. The oldest InferenceModel, based on @@ -81,10 +211,10 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `modelName` _string_ | The name of the model as the users set in the "model" parameter in the requests.
The name should be unique among the workloads that reference the same backend pool.
This is the parameter that will be used to match the request with. In the future, we may
allow to match on other request parameters. The other approach to support matching on
on other request parameters is to use a different ModelName per HTTPFilter.
Names can be reserved without implementing an actual model in the pool.
This can be done by specifying a target model and setting the weight to zero,
an error will be returned specifying that no valid target model is found. | | MaxLength: 253
| -| `criticality` _[Criticality](#criticality)_ | Defines how important it is to serve the model compared to other models referencing the same pool. | Default | Enum: [Critical Default Sheddable]
| -| `targetModels` _[TargetModel](#targetmodel) array_ | Allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
modelName is often in reference to a LoRA adapter. | | MaxItems: 10
| -| `poolRef` _[PoolObjectReference](#poolobjectreference)_ | Reference to the inference pool, the pool must exist in the same namespace. | | Required: \{\}
| +| `modelName` _string_ | ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
ModelNames must be unique for a referencing InferencePool
(names can be reused for a different pool in the same cluster).
The modelName with the oldest creation timestamp is retained, and the incoming
InferenceModel's Ready status is set to false with a corresponding reason.
In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
Names can be reserved without an underlying model configured in the pool.
This can be done by specifying a target model and setting the weight to zero,
an error will be returned specifying that no valid target model is found. | | MaxLength: 256
Required: \{\}
| +| `criticality` _[Criticality](#criticality)_ | Criticality defines how important it is to serve the model compared to other models referencing the same pool.
Criticality impacts how traffic is handled in resource-constrained situations. It handles this by<br />
queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
and the proportionality of fairness will be configurable.
Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.<br />
Any implementations that may consume this field may treat an unset value as the 'Standard' range. | | Enum: [Critical Standard Sheddable]
| +| `targetModels` _[TargetModel](#targetmodel) array_ | TargetModels allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
modelName is often in reference to a LoRA adapter. | | MaxItems: 10
| +| `poolRef` _[PoolObjectReference](#poolobjectreference)_ | PoolRef is a reference to the inference pool, the pool must exist in the same namespace. | | Required: \{\}
| #### InferenceModelStatus @@ -100,14 +230,14 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferenceModel.
Known condition types are:
* "Accepted" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Ready]] | MaxItems: 8
| #### InferencePool -InferencePool is the Schema for the Inferencepools API +InferencePool is the Schema for the InferencePools API. @@ -115,13 +245,17 @@ InferencePool is the Schema for the Inferencepools API | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha1` | | | +| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | | | `kind` _string_ | `InferencePool` | | | | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | | `spec` _[InferencePoolSpec](#inferencepoolspec)_ | | | | | `status` _[InferencePoolStatus](#inferencepoolstatus)_ | | | | + + + + #### InferencePoolSpec @@ -135,8 +269,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector uses a map of label to watch model server pods
that should be included in the InferencePool. ModelServers should not
be with any other Service or InferencePool, that behavior is not supported
and will result in sub-optimal utilization.
In some cases, implementations may translate this to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type. | | Required: \{\}
| -| `targetPortNumber` _integer_ | TargetPortNumber is the port number that the model servers within the pool expect
to receive traffic from.
This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort | | Maximum: 65535
Minimum: 0
Required: \{\}
| +| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector defines a map of labels to watch model server pods
that should be included in the InferencePool.
In some cases, implementations may translate this field to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type.
If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.<br />
Cross-namespace selectors are not supported. | | Required: \{\}<br />
| +| `targetPortNumber` _integer_ | TargetPortNumber defines the port number to access the selected model servers.
The number must be in the range 1 to 65535. | | Maximum: 65535
Minimum: 1
Required: \{\}
| +| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
| #### InferencePoolStatus @@ -152,33 +287,56 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool. | | | +| `parent` _[PoolStatus](#poolstatus) array_ | Parents is a list of parent resources (usually Gateways) that are
associated with the InferencePool, and the status of the InferencePool with respect to<br />
each parent.
A maximum of 32 Gateways will be represented in this list. An empty list
means the InferencePool has not been attached to any Gateway. | | MaxItems: 32<br />
| + + +#### Kind + +_Underlying type:_ _string_ + +Kind refers to a Kubernetes Kind. + +Valid values include: + +* "Service" +* "HTTPRoute" + +Invalid values include: + +* "invalid/kind" - "/" is an invalid character + +_Validation:_ +- MaxLength: 63 +- MinLength: 1 +- Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + #### LabelKey _Underlying type:_ _string_ -Originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 Duplicated as to not take an unexpected dependency on gw's API. - LabelKey is the key of a label. This is used for validation of maps. This matches the Kubernetes "qualified name" validation that is used for labels. - +Labels are case sensitive, so: my-label and My-Label are considered distinct. Valid values include: - * example * example.com * example.com/path * example.com/path.html - Invalid values include: - * example~ - "~" is an invalid character * example.com. - can not start or end with "." @@ -202,10 +360,8 @@ of maps. This matches the Kubernetes label validation rules: * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. - Valid values include: - * MyValue * my.name * 123-my-value @@ -220,6 +376,25 @@ _Appears in:_ +#### ObjectName + +_Underlying type:_ _string_ + +ObjectName refers to the name of a Kubernetes object. +Object names can have a variety of forms, including RFC 1123 subdomains, +RFC 1123 labels, or RFC 1035 labels. + +_Validation:_ +- MaxLength: 253 +- MinLength: 1 + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + + + #### PoolObjectReference @@ -234,9 +409,42 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `group` _string_ | Group is the group of the referent. | inference.networking.x-k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| -| `kind` _string_ | Kind is kind of the referent. For example "InferencePool". | InferencePool | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| -| `name` _string_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `group` _[Group](#group)_ | Group is the group of the referent. | inference.networking.x-k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is kind of the referent. For example "InferencePool". | InferencePool | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
|
+
+
+#### PoolStatus
+
+
+
+PoolStatus defines the observed state of InferencePool from a Gateway.
+
+
+
+_Appears in:_
+- [InferencePoolStatus](#inferencepoolstatus)
+
+| Field | Description | Default | Validation |
+| --- | --- | --- | --- |
+| `parentRef` _[ObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectreference-v1-core)_ | GatewayRef indicates the gateway that observed the state of the InferencePool. | | |
+| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool.<br />
Known condition types are:
* "Accepted"
* "ResolvedRefs" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Accepted]] | MaxItems: 8
| + + +#### PortNumber + +_Underlying type:_ _integer_ + +PortNumber defines a network port. + +_Validation:_ +- Maximum: 65535 +- Minimum: 1 + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) + #### TargetModel @@ -246,10 +454,10 @@ _Appears in:_ TargetModel represents a deployed model or a LoRA adapter. The Name field is expected to match the name of the LoRA adapter (or base model) as it is registered within the model server. Inference -Gateway assumes that the model exists on the model server and is the +Gateway assumes that the model exists on the model server and it's the responsibility of the user to validate a correct match. Should a model fail -to exist at request time, the error is processed by the Instance Gateway, -and then emitted on the appropriate InferenceModel object. +to exist at request time, the error is processed by the Inference Gateway +and emitted on the appropriate InferenceModel object. @@ -258,7 +466,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | The name of the adapter as expected by the ModelServer. | | MaxLength: 253
| -| `weight` _integer_ | Weight is used to determine the proportion of traffic that should be
sent to this target model when multiple versions of the model are specified. | 1 | Maximum: 1e+06
Minimum: 0
| +| `name` _string_ | Name is the name of the adapter or base model, as expected by the ModelServer. | | MaxLength: 253
Required: \{\}
| +| `weight` _integer_ | Weight is used to determine the proportion of traffic that should be
sent to this model when multiple target models are specified.
Weight defines the proportion of requests forwarded to the specified
model. This is computed as weight/(sum of all weights in this
TargetModels list). For non-zero values, there may be some epsilon from
the exact proportion defined here depending on the precision an
implementation supports. Weight is not a percentage and the sum of
weights does not need to equal 100.
If a weight is set for any targetModel, it must be set for all targetModels.
Conversely, weights are optional, so long as ALL targetModels do not specify a weight. | | Maximum: 1e+06<br />
Minimum: 1
| diff --git a/test/e2e/epp/README.md b/test/e2e/epp/README.md index 247e8b126..c3e4aa17b 100644 --- a/test/e2e/epp/README.md +++ b/test/e2e/epp/README.md @@ -10,7 +10,13 @@ The end-to-end tests are designed to validate end-to-end Gateway API Inference E - [Go](https://golang.org/doc/install) installed on your machine. - [Make](https://www.gnu.org/software/make/manual/make.html) installed to run the end-to-end test target. -- A Hugging Face Hub token with access to the [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model. +- (Optional) When using the GPU-based vLLM deployment, a Hugging Face Hub token with access to the + [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model is required. + After obtaining the token and being granted access to the model, set the `HF_TOKEN` environment variable: + + ```sh + export HF_TOKEN= + ``` ## Running the End-to-End Tests @@ -22,11 +28,22 @@ Follow these steps to run the end-to-end tests: git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git && cd gateway-api-inference-extension ``` -1. **Export Your Hugging Face Hub Token**: The token is required to run the test model server: +1. **Optional Settings** - ```sh - export HF_TOKEN= - ``` + - **Set the test namespace**: By default, the e2e test creates resources in the `inf-ext-e2e` namespace. + If you would like to change this namespace, set the following environment variable: + + ```sh + export E2E_NS= + ``` + + - **Set the model server manifest**: By default, the e2e test uses the [vLLM Simulator](https://github.com/llm-d/llm-d-inference-sim) + (`config/manifests/vllm/sim-deployment.yaml`) to simulate a backend model server. If you would like to change the model server + deployment type, set the following environment variable to one of the following: + + ```sh + export E2E_MANIFEST_PATH=[config/manifests/vllm/gpu-deployment.yaml|config/manifests/vllm/cpu-deployment.yaml] + ``` 1. **Run the Tests**: Run the `test-e2e` target: @@ -34,5 +51,5 @@ Follow these steps to run the end-to-end tests: make test-e2e ``` - The test suite prints details for each step. Note that the `vllm-llama3-8b-instruct-pool` model server deployment - may take several minutes to report an `Available=True` status due to the time required for bootstraping. + The test suite prints details for each step. Note that the `vllm-llama3-8b-instruct` model server deployment + may take several minutes to report an `Available=True` status due to the time required for bootstrapping. diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index 643bbf753..0ccde422a 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -18,6 +18,7 @@ package epp import ( "context" + "errors" "fmt" "os" "strings" @@ -30,6 +31,7 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" @@ -49,23 +51,28 @@ const ( defaultReadyTimeout = 3 * time.Minute // defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state. defaultModelReadyTimeout = 10 * time.Minute + // defaultCurlTimeout is the default timeout for the curl command to get a response. 
+ defaultCurlTimeout = 30 * time.Second // defaultInterval is the default interval to check if a resource exists or ready conditions. defaultInterval = time.Millisecond * 250 // defaultCurlInterval is the default interval to run the test curl command. defaultCurlInterval = time.Second * 5 - // nsName is the name of the Namespace used for tests. - // TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed - nsName = "default" + // defaultNsName is the default name of the Namespace used for tests. Can override using the E2E_NS environment variable. + defaultNsName = "inf-ext-e2e" // modelServerName is the name of the model server test resources. modelServerName = "vllm-llama3-8b-instruct" // modelName is the test model name. modelName = "food-review" + // targetModelName is the target model name of the test model server. + targetModelName = modelName + "-1" // envoyName is the name of the envoy proxy test resources. envoyName = "envoy" // envoyPort is the listener port number of the test envoy proxy. envoyPort = "8081" // inferExtName is the name of the inference extension test resources. inferExtName = "vllm-llama3-8b-instruct-epp" + // metricsReaderSecretName is the name of the metrics reader secret which stores sa token to read epp metrics. + metricsReaderSecretName = "inference-gateway-sa-metrics-reader-secret" // clientManifest is the manifest for the client test resources. clientManifest = "../../testdata/client.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. @@ -75,20 +82,24 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../../config/manifests/inferencepool-resources.yaml" + inferExtManifest = "../../testdata/inferencepool-e2e.yaml" // envoyManifest is the manifest for the envoy proxy test resources. envoyManifest = "../../testdata/envoy.yaml" + // metricsRbacManifest is the manifest for the rbac resources for testing metrics. + metricsRbacManifest = "../../testdata/metrics-rbac.yaml" // modelServerManifestFilepathEnvVar is the env var that holds absolute path to the manifest for the model server test resource. modelServerManifestFilepathEnvVar = "MANIFEST_PATH" ) var ( - ctx context.Context + ctx = context.Background() cli client.Client // Required for exec'ing in curl pod - kubeCli *kubernetes.Clientset - scheme = runtime.NewScheme() - cfg = config.GetConfigOrDie() + kubeCli *kubernetes.Clientset + scheme = runtime.NewScheme() + cfg = config.GetConfigOrDie() + nsName string + e2eImage string ) func TestAPIs(t *testing.T) { @@ -99,6 +110,13 @@ func TestAPIs(t *testing.T) { } var _ = ginkgo.BeforeSuite(func() { + nsName = os.Getenv("E2E_NS") + if nsName == "" { + nsName = defaultNsName + } + e2eImage = os.Getenv("E2E_IMAGE") + gomega.Expect(e2eImage).NotTo(gomega.BeEmpty(), "E2E_IMAGE environment variable is not set") + ginkgo.By("Setting up the test suite") setupSuite() @@ -107,17 +125,29 @@ var _ = ginkgo.BeforeSuite(func() { }) func setupInfra() { - modelServerManifest := readModelServerManifestPath() + // this function ensures ModelServer manifest path exists. + // run this before createNs to fail fast in case it doesn't. 
+ modelServerManifestPath := readModelServerManifestPath() + + createNamespace(cli, nsName) + + modelServerManifestArray := getYamlsFromModelServerManifest(modelServerManifestPath) + if strings.Contains(modelServerManifestArray[0], "hf-token") { + createHfSecret(cli, modelServerSecretManifest) + } crds := map[string]string{ "inferencepools.inference.networking.x-k8s.io": inferPoolManifest, "inferencemodels.inference.networking.x-k8s.io": inferModelManifest, } + createCRDs(cli, crds) createInferExt(cli, inferExtManifest) createClient(cli, clientManifest) createEnvoy(cli, envoyManifest) + createMetricsRbac(cli, metricsRbacManifest) // Run this step last, as it requires additional time for the model server to become ready. - createModelServer(cli, modelServerSecretManifest, modelServerManifest) + ginkgo.By("Creating model server resources from manifest: " + modelServerManifestPath) + createModelServer(cli, modelServerManifestArray) } var _ = ginkgo.AfterSuite(func() { @@ -128,7 +158,6 @@ var _ = ginkgo.AfterSuite(func() { // setupSuite initializes the test suite by setting up the Kubernetes client, // loading required API schemes, and validating configuration. func setupSuite() { - ctx = context.Background() gomega.ExpectWithOffset(1, cfg).NotTo(gomega.BeNil()) err := clientgoscheme.AddToScheme(scheme) @@ -137,7 +166,7 @@ func setupSuite() { err = apiextv1.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1a2.AddToScheme(scheme) + err = infextv1a2.Install(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) cli, err = client.New(cfg, client.Options{Scheme: scheme}) @@ -150,6 +179,10 @@ func setupSuite() { } func cleanupResources() { + if cli == nil { + return // could happen if BeforeSuite had an error + } + gomega.Expect(testutils.DeleteClusterResources(ctx, cli)).To(gomega.Succeed()) gomega.Expect(testutils.DeleteNamespacedResources(ctx, cli, nsName)).To(gomega.Succeed()) } @@ -171,10 +204,22 @@ var ( existsTimeout = getTimeout("EXISTS_TIMEOUT", defaultExistsTimeout) readyTimeout = getTimeout("READY_TIMEOUT", defaultReadyTimeout) modelReadyTimeout = getTimeout("MODEL_READY_TIMEOUT", defaultModelReadyTimeout) + curlTimeout = getTimeout("CURL_TIMEOUT", defaultCurlTimeout) interval = defaultInterval curlInterval = defaultCurlInterval ) +func createNamespace(k8sClient client.Client, ns string) { + ginkgo.By("Creating e2e namespace: " + ns) + obj := &corev1.Namespace{ + ObjectMeta: v1.ObjectMeta{ + Name: ns, + }, + } + err := k8sClient.Create(ctx, obj) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Failed to create e2e test namespace") +} + // namespaceExists ensures that a specified namespace exists and is ready for use. func namespaceExists(k8sClient client.Client, ns string) { ginkgo.By("Ensuring namespace exists: " + ns) @@ -191,6 +236,13 @@ func readModelServerManifestPath() string { return modelServerManifestFilepath } +func getYamlsFromModelServerManifest(modelServerManifestPath string) []string { + ginkgo.By("Ensuring the model server manifest points to an existing file") + modelServerManifestArray := readYaml(modelServerManifestPath) + gomega.Expect(modelServerManifestArray).NotTo(gomega.BeEmpty()) + return modelServerManifestArray +} + // createCRDs creates the Inference Extension CRDs used for testing. 
func createCRDs(k8sClient client.Client, crds map[string]string) { for name, path := range crds { @@ -223,17 +275,32 @@ func createClient(k8sClient client.Client, filePath string) { testutils.PodReady(ctx, k8sClient, pod, readyTimeout, interval) } -// createModelServer creates the model server resources used for testing from the given filePaths. -func createModelServer(k8sClient client.Client, secretPath, deployPath string) { - ginkgo.By("Ensuring the model server manifest points to an existing file") - modelServerManifestArray := readYaml(deployPath) - gomega.Expect(modelServerManifestArray).NotTo(gomega.BeEmpty()) - modelServerManifestYaml := modelServerManifestArray[0] - if strings.Contains(modelServerManifestYaml, "hf-token") { - createHfSecret(k8sClient, secretPath) +// createMetricsRbac creates the metrics RBAC resources from the manifest file. +func createMetricsRbac(k8sClient client.Client, filePath string) { + inManifests := readYaml(filePath) + ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable") + outManifests := []string{} + for _, m := range inManifests { + outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName)) } + ginkgo.By("Creating RBAC resources for scraping metrics from manifest: " + filePath) + createObjsFromYaml(k8sClient, outManifests) + + // wait for sa token to exist + testutils.EventuallyExists(ctx, func() error { + token, err := getMetricsReaderToken(k8sClient) + if err != nil { + return err + } + if len(token) == 0 { + return errors.New("failed to get metrics reader token") + } + return nil + }, existsTimeout, interval) +} - ginkgo.By("Creating model server resources from manifest: " + deployPath) +// createModelServer creates the model server resources used for testing from the given filePaths. +func createModelServer(k8sClient client.Client, modelServerManifestArray []string) { createObjsFromYaml(k8sClient, modelServerManifestArray) // Wait for the deployment to exist. @@ -270,8 +337,15 @@ func createHfSecret(k8sClient client.Client, secretPath string) { // createEnvoy creates the envoy proxy resources used for testing from the given filePath. func createEnvoy(k8sClient client.Client, filePath string) { + inManifests := readYaml(filePath) + ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable") + outManifests := []string{} + for _, m := range inManifests { + outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName)) + } + ginkgo.By("Creating envoy proxy resources from manifest: " + filePath) - applyYAMLFile(k8sClient, filePath) + createObjsFromYaml(k8sClient, outManifests) // Wait for the configmap to exist before proceeding with test. cfgMap := &corev1.ConfigMap{} @@ -296,8 +370,19 @@ func createEnvoy(k8sClient client.Client, filePath string) { // createInferExt creates the inference extension resources used for testing from the given filePath. func createInferExt(k8sClient client.Client, filePath string) { + inManifests := readYaml(filePath) + ginkgo.By("Replacing placeholders with environment variables") + outManifests := []string{} + for _, manifest := range inManifests { + replacer := strings.NewReplacer( + "$E2E_NS", nsName, + "$E2E_IMAGE", e2eImage, + ) + outManifests = append(outManifests, replacer.Replace(manifest)) + } + ginkgo.By("Creating inference extension resources from manifest: " + filePath) - applyYAMLFile(k8sClient, filePath) + createObjsFromYaml(k8sClient, outManifests) // Wait for the clusterrole to exist. 
testutils.EventuallyExists(ctx, func() error {
diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go
index 09c8835ac..7bf31ac92 100644
--- a/test/e2e/epp/e2e_test.go
+++ b/test/e2e/epp/e2e_test.go
@@ -17,75 +17,215 @@ limitations under the License.
package epp

import (
+	"encoding/json"
+	"errors"
	"fmt"
+	"strconv"
	"strings"
+	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	k8serrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/ptr"
-	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	client "sigs.k8s.io/controller-runtime/pkg/client"
+	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
	testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils"
)

var _ = ginkgo.Describe("InferencePool", func() {
+	var infModel *v1alpha2.InferenceModel
	ginkgo.BeforeEach(func() {
		ginkgo.By("Waiting for the namespace to exist.")
		namespaceExists(cli, nsName)
+
+		ginkgo.By("Creating an InferenceModel resource")
+		infModel = newInferenceModel(nsName)
+		gomega.Expect(cli.Create(ctx, infModel)).To(gomega.Succeed())
+
+		ginkgo.By("Ensuring the InferenceModel resource exists in the namespace")
+		gomega.Eventually(func() error {
+			return cli.Get(ctx, types.NamespacedName{Namespace: infModel.Namespace, Name: infModel.Name}, infModel)
+		}, existsTimeout, interval).Should(gomega.Succeed())
	})

	ginkgo.AfterEach(func() {
		ginkgo.By("Deleting the InferenceModel test resource.")
		cleanupInferModelResources()
+		// Wait until the InferenceModel is fully deleted. Only a NotFound error counts
+		// as deleted; any other error is returned so the Eventually block retries
+		// instead of passing silently.
+		gomega.Eventually(func() error {
+			err := cli.Get(ctx, types.NamespacedName{Namespace: infModel.Namespace, Name: infModel.Name}, infModel)
+			if err == nil {
+				return errors.New("InferenceModel resource still exists")
+			}
+			if !k8serrors.IsNotFound(err) {
+				return err
+			}
+			return nil
+		}, existsTimeout, interval).Should(gomega.Succeed())
	})

	ginkgo.When("The Inference Extension is running", func() {
		ginkgo.It("Should route traffic to target model servers", func() {
-			ginkgo.By("Creating an InferenceModel resource")
-			infModel := newInferenceModel(nsName)
-			gomega.Expect(cli.Create(ctx, infModel)).To(gomega.Succeed())
+			for _, t := range []struct {
+				api              string
+				promptOrMessages any
+			}{
+				{
+					api:              "/completions",
+					promptOrMessages: "Write as if you were a critic: San Francisco",
+				},
+				{
+					api: "/chat/completions",
+					promptOrMessages: []map[string]any{
+						{
+							"role":    "user",
+							"content": "Write as if you were a critic: San Francisco",
+						},
+					},
+				},
+				{
+					api: "/chat/completions",
+					promptOrMessages: []map[string]any{
+						{
+							"role":    "user",
+							"content": "Write as if you were a critic: San Francisco",
+						},
+						{"role": "assistant", "content": "Okay, let's see..."},
+						{"role": "user", "content": "Now summarize your thoughts."},
+					},
+				},
+			} {
+				ginkgo.By(fmt.Sprintf("Verifying connectivity through the inference extension with %s api and prompt/messages: %v", t.api, t.promptOrMessages))
+
+				// Ensure the expected responses include the inferencemodel target model names.
+ var expected []string + for _, m := range infModel.Spec.TargetModels { + expected = append(expected, m.Name) + } + curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName, curlTimeout, t.api, t.promptOrMessages, false) + + actual := make(map[string]int) + gomega.Eventually(func() error { + resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + if err != nil { + return err + } + if !strings.Contains(resp, "200 OK") { + return fmt.Errorf("did not get 200 OK: %s", resp) + } + for _, m := range expected { + if strings.Contains(resp, m) { + actual[m] = 0 + } + } + var got []string + for m := range actual { + got = append(got, m) + } + // Compare ignoring order + if !cmp.Equal(got, expected, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { + return fmt.Errorf("actual (%v) != expected (%v); resp=%q", got, expected, resp) + } + return nil + }, readyTimeout, curlInterval).Should(gomega.Succeed()) + } + }) + + ginkgo.It("Should expose EPP metrics after generating traffic", func() { + // Define the metrics we expect to see + expectedMetrics := []string{ + "inference_model_request_total", + "inference_model_request_error_total", + "inference_model_request_duration_seconds", + // TODO: normalized_time_per_output_token_seconds is not actually recorded yet + // "normalized_time_per_output_token_seconds", + "inference_model_request_sizes", + "inference_model_response_sizes", + "inference_model_input_tokens", + "inference_model_output_tokens", + "inference_pool_average_kv_cache_utilization", + "inference_pool_average_queue_size", + "inference_pool_per_pod_queue_size", + "inference_model_running_requests", + "inference_pool_ready_pods", + "inference_extension_info", + } + + // Generate traffic by sending requests through the inference extension + ginkgo.By("Generating traffic through the inference extension") + curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName, curlTimeout, "/completions", "Write as if you were a critic: San Francisco", true) + + // Run the curl command multiple times to generate some metrics data + for i := 0; i < 5; i++ { + _, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } + + // modify the curl command to generate some error metrics + curlCmd[len(curlCmd)-1] = "invalid input" + for i := 0; i < 5; i++ { + _, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + } - ginkgo.By("Ensuring the InferenceModel resource exists in the namespace") + // Now scrape metrics from the EPP endpoint via the curl pod + ginkgo.By("Scraping metrics from the EPP endpoint") + + // Get Pod IP instead of Service + podList := &corev1.PodList{} + err := cli.List(ctx, podList, client.InNamespace(nsName), client.MatchingLabels{"app": inferExtName}) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(podList.Items).NotTo(gomega.BeEmpty()) + podIP := podList.Items[0].Status.PodIP + gomega.Expect(podIP).NotTo(gomega.BeEmpty()) + + // Get the authorization token for reading metrics + token := "" gomega.Eventually(func() error { - return cli.Get(ctx, types.NamespacedName{Namespace: infModel.Namespace, Name: infModel.Name}, infModel) + token, err = getMetricsReaderToken(cli) + if err != nil { + return err + } + if token == "" { + return errors.New("token not found") + } + return nil }, existsTimeout, 
interval).Should(gomega.Succeed())
 
-			ginkgo.By("Verifying connectivity through the inference extension")
-			curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName)
-
-			// Ensure the expected responses include the inferencemodel target model names.
-			var expected []string
-			for _, m := range infModel.Spec.TargetModels {
-				expected = append(expected, m.Name)
+			// Construct the metric scraping curl command using Pod IP
+			metricScrapeCmd := []string{
+				"curl",
+				"-i",
+				"--max-time",
+				strconv.Itoa(int(curlTimeout.Seconds())),
+				"-H",
+				"Authorization: Bearer " + token,
+				fmt.Sprintf("http://%s:%d/metrics", podIP, 9090),
 			}
-			actual := make(map[string]int)
+
+			ginkgo.By("Verifying that all expected metrics are present.")
 			gomega.Eventually(func() error {
-				resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", curlCmd)
+				// Execute the metrics scrape command inside the curl pod
+				resp, err := testutils.ExecCommandInPod(ctx, cfg, scheme, kubeCli, nsName, "curl", "curl", metricScrapeCmd)
 				if err != nil {
 					return err
 				}
+				// Verify that we got a 200 OK response
 				if !strings.Contains(resp, "200 OK") {
 					return fmt.Errorf("did not get 200 OK: %s", resp)
 				}
-				for _, m := range expected {
-					if strings.Contains(resp, m) {
-						actual[m] = 0
+				// Check if all expected metrics are present in the metrics output
+				for _, metric := range expectedMetrics {
+					if !strings.Contains(resp, metric) {
+						return fmt.Errorf("expected metric %s not found in metrics output", metric)
 					}
 				}
-				var got []string
-				for m := range actual {
-					got = append(got, m)
-				}
-				// Compare ignoring order
-				if !cmp.Equal(got, expected, cmpopts.SortSlices(func(a, b string) bool { return a < b })) {
-					return fmt.Errorf("actual (%v) != expected (%v); resp=%q", got, expected, resp)
-				}
-				return nil
 			}, readyTimeout, curlInterval).Should(gomega.Succeed())
-		})
 	})
 })
@@ -94,15 +234,11 @@ var _ = ginkgo.Describe("InferencePool", func() {
 func newInferenceModel(ns string) *v1alpha2.InferenceModel {
 	targets := []v1alpha2.TargetModel{
 		{
-			Name:   modelName + "-0",
-			Weight: ptr.To(int32(50)),
-		},
-		{
-			Name:   modelName + "-1",
-			Weight: ptr.To(int32(50)),
+			Name:   targetModelName,
+			Weight: ptr.To(int32(100)),
 		},
 	}
-	return testutils.MakeModelWrapper("inferencemodel-sample", ns).
+	return testutils.MakeModelWrapper(types.NamespacedName{Name: "inferencemodel-sample", Namespace: ns}).
 		SetCriticality(v1alpha2.Critical).
 		SetModelName(modelName).
 		SetPoolRef(modelServerName).
@@ -110,16 +246,47 @@ func newInferenceModel(ns string) *v1alpha2.InferenceModel {
 		Obj()
 }
 
+func getMetricsReaderToken(k8sClient client.Client) (string, error) {
+	secret := &corev1.Secret{}
+	err := k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: metricsReaderSecretName}, secret)
+	if err != nil {
+		return "", err
+	}
+	return string(secret.Data["token"]), nil
+}
+
 // getCurlCommand returns the command, as a slice of strings, for curl'ing
 // the test model server at the given name, namespace, port, and model name.
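+// The api argument selects the OpenAI-style path ("/completions" or "/chat/completions"),
+// promptOrMessages carries a prompt string or a slice of chat messages to match that API,
+// timeout bounds the request via curl's --max-time flag, and streaming opts into SSE output.
+//
+// For example (illustrative values only):
+//
+//	getCurlCommand("envoy", "default", "8081", "food-review", 30*time.Second, "/completions", "hi", false)
+//
+// yields the argv for:
+//
+//	curl -i --max-time 30 envoy.default.svc:8081/v1/completions \
+//	  -H 'Content-Type: application/json' \
+//	  -d '{"max_tokens":100,"model":"food-review","prompt":"hi","temperature":0}'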
-func getCurlCommand(name, ns, port, model string) []string {
+func getCurlCommand(name, ns, port, model string, timeout time.Duration, api string, promptOrMessages any, streaming bool) []string {
+	body := map[string]any{
+		"model":       model,
+		"max_tokens":  100,
+		"temperature": 0,
+	}
+	switch api {
+	case "/completions":
+		body["prompt"] = promptOrMessages
+	case "/chat/completions":
+		body["messages"] = promptOrMessages
+	}
+	if streaming {
+		body["stream"] = true
+		body["stream_options"] = map[string]any{
+			"include_usage": true,
+		}
+	}
+	b, err := json.Marshal(body)
+	gomega.Expect(err).NotTo(gomega.HaveOccurred())
 	return []string{
 		"curl",
 		"-i",
-		fmt.Sprintf("%s.%s.svc:%s/v1/completions", name, ns, port),
+		"--max-time",
+		strconv.Itoa(int(timeout.Seconds())),
+		fmt.Sprintf("%s.%s.svc:%s/v1%s", name, ns, port, api),
 		"-H",
 		"Content-Type: application/json",
 		"-d",
-		fmt.Sprintf(`{"model": "%s", "prompt": "Write as if you were a critic: San Francisco", "max_tokens": 100, "temperature": 0}`, model),
+		string(b),
 	}
 }
diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go
index 718bfedfb..8fd479fe3 100644
--- a/test/integration/bbr/hermetic_test.go
+++ b/test/integration/bbr/hermetic_test.go
@@ -19,20 +19,19 @@ package bbr
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"testing"
 	"time"
 
 	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-	"github.com/go-logr/logr"
 	"github.com/google/go-cmp/cmp"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
 	"google.golang.org/protobuf/testing/protocmp"
-	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server"
+	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+	integrationutils "sigs.k8s.io/gateway-api-inference-extension/test/integration"
 )
 
 var logger = logutil.NewTestLogger().V(logutil.VERBOSE)
@@ -46,7 +45,7 @@ func TestBodyBasedRouting(t *testing.T) {
 	}{
 		{
 			name: "success adding model parameter to header",
-			req:  generateRequest(logger, "llama"),
+			req:  integrationutils.GenerateRequest(logger, "test", "llama", nil),
 			wantHeaders: []*configPb.HeaderValueOption{
 				{
 					Header: &configPb.HeaderValue{
@@ -59,7 +58,7 @@
 		},
 		{
 			name:        "no model parameter",
-			req:         generateRequest(logger, ""),
+			req:         integrationutils.GenerateRequest(logger, "test1", "", nil),
 			wantHeaders: []*configPb.HeaderValueOption{},
 			wantErr:     false,
 		},
@@ -67,7 +66,7 @@
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			client, cleanup := setUpHermeticServer()
+			client, cleanup := setUpHermeticServer(false)
 			t.Cleanup(cleanup)
 
 			want := &extProcPb.ProcessingResponse{}
@@ -88,7 +87,7 @@
 				}
 			}
 
-			res, err := sendRequest(t, client, test.req)
+			res, err := integrationutils.SendRequest(t, client, test.req)
 			if err != nil && !test.wantErr {
 				t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr)
 			}
@@ -99,12 +98,171 @@
 	}
 }
 
-func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) {
+func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) {
+	tests := []struct {
+		name          string
+		reqs          []*extProcPb.ProcessingRequest
+		wantResponses []*extProcPb.ProcessingResponse
+		wantErr       bool
+
}{ + { + name: "success adding model parameter to header", + reqs: integrationutils.GenerateStreamedRequestSet(logger, "test", "foo", nil), + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("foo"), + }, + }, + }}, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"foo\",\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "success adding model parameter to header with multiple body chunks", + reqs: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("sql-lora-sheddable"), + }, + }, + }}, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "no model parameter", + reqs: integrationutils.GenerateStreamedRequestSet(logger, "test", "", nil), + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{}, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(true) + t.Cleanup(cleanup) + + 
responses, err := integrationutils.StreamedRequest(t, client, test.reqs, len(test.wantResponses)) + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + + if diff := cmp.Diff(test.wantResponses, responses, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + }) + } +} + +func setUpHermeticServer(streaming bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { port := 9004 serverCtx, stopServer := context.WithCancel(context.Background()) serverRunner := runserver.NewDefaultExtProcServerRunner(port, false) serverRunner.SecureServing = false + serverRunner.Streaming = streaming go func() { if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { @@ -133,41 +291,3 @@ func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cl time.Sleep(5 * time.Second) } } - -func generateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequest { - j := map[string]interface{}{ - "prompt": "test1", - "max_tokens": 100, - "temperature": 0, - } - if model != "" { - j["model"] = model - } - - llmReq, err := json.Marshal(j) - if err != nil { - logutil.Fatal(logger, err, "Failed to unmarshal LLM request") - } - req := &extProcPb.ProcessingRequest{ - Request: &extProcPb.ProcessingRequest_RequestBody{ - RequestBody: &extProcPb.HttpBody{Body: llmReq}, - }, - } - return req -} - -func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - - res, err := client.Recv() - if err != nil { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - return res, err -} diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 2acdacf8d..6d439d17d 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -24,11 +24,8 @@ import ( "errors" "fmt" "io" - "net" - "net/http" "os" "path/filepath" - "strconv" "strings" "testing" "time" @@ -37,50 +34,66 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/google/go-cmp/cmp" - "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/stretchr/testify/assert" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" - "google.golang.org/protobuf/types/known/structpb" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/component-base/metrics/legacyregistry" metricsutils "k8s.io/component-base/metrics/testutil" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" - "sigs.k8s.io/controller-runtime/pkg/client" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/envtest" - "sigs.k8s.io/controller-runtime/pkg/manager" + crmetrics 
"sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/requestcontrol" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/saturationdetector" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + epptestutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + integrationutils "sigs.k8s.io/gateway-api-inference-extension/test/integration" "sigs.k8s.io/yaml" ) const ( - port = runserver.DefaultGrpcPort - metricsPort = 8888 + // Test Infrastructure + testPoolName = "vllm-llama3-8b-instruct-pool" + testNamespace = "default" + testMetricsPort = 8889 + testPort = server.DefaultGrpcPort + + // Model Names + modelMyModel = "my-model" + modelMyModelTarget = "my-model-12345" + modelSQLLora = "sql-lora" + modelSQLLoraTarget = "sql-lora-1fdg2" + modelSheddable = "sql-lora-sheddable" + modelSheddableTarget = "sql-lora-1fdg3" + modelDirect = "direct-model" ) var ( - serverRunner *runserver.ExtProcServerRunner - k8sClient k8sclient.Client - testEnv *envtest.Environment - scheme = runtime.NewScheme() - logger = logutil.NewTestLogger().V(logutil.VERBOSE) + testGRPCAddress = fmt.Sprintf("localhost:%d", server.DefaultGrpcPort) + serverRunner *server.ExtProcServerRunner + k8sClient k8sclient.Client + testEnv *envtest.Environment + scheme = runtime.NewScheme() + logger = logutil.NewTestLogger().V(logutil.VERBOSE) ) func TestMain(m *testing.M) { @@ -90,318 +103,47 @@ func TestMain(m *testing.M) { os.Exit(code) } -func TestKubeInferenceModelRequest(t *testing.T) { - tests := []struct { - name string - req *extProcPb.ProcessingRequest - pods map[backendmetrics.Pod]*backendmetrics.Metrics - wantHeaders []*configPb.HeaderValueOption - wantMetadata *structpb.Struct - wantBody []byte - wantMetrics string - wantErr bool - immediateResponse *extProcPb.ImmediateResponse - }{ - { - name: "select lower queue and kv cache, no active lora", - req: utiltesting.GenerateRequest(logger, "test1", "my-model"), - // pod-1 will be picked because it has relatively low queue size and low KV cache. 
- pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.2, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.2:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 - `, - wantErr: false, - }, - { - name: "select active lora, low queue", - req: utiltesting.GenerateRequest(logger, "test2", "sql-lora"), - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.2:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `, - wantErr: false, - }, - { - name: "select no lora despite active model, avoid excessive queue size", - req: utiltesting.GenerateRequest(logger, "test3", "sql-lora"), - // pod-2 will be picked despite it NOT having the requested model being active - // as it's above the affinity for queue size. 
Also is critical, so we should - // still honor request despite all queues > 5 - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 200, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.3:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.3:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `, - wantErr: false, - }, - { - name: "noncritical and all models past threshold, shed request", - req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), - // no pods will be picked as all models are either above kv threshold, - // queue threshold, or both. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{}, - wantMetadata: &structpb.Struct{}, - wantBody: []byte(""), - wantErr: false, - immediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_TooManyRequests, - }, - }, - wantMetrics: "", - }, - { - name: "noncritical, but one server has capacity, do not shed", - req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), - // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.1:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.1:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), - wantMetrics: ` - # HELP 
inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 - `, - wantErr: false, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, test.pods, false) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, - }, - }, - }, - }, - }, - DynamicMetadata: test.wantMetadata, - } - res, err := sendRequest(t, client, test.req) +type label struct { + name, + value string +} - if err != nil && !test.wantErr { - t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) - } - if test.immediateResponse != nil { - want = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: test.immediateResponse, - }, - } - } - if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) - } +func labelsToString(labels []label) string { + var sb strings.Builder + i := 0 + for _, l := range labels { + if i > 0 { + sb.WriteString(",") + } + sb.WriteString(fmt.Sprintf("%s=%q", l.name, l.value)) + i++ + } + return sb.String() +} - if test.wantMetrics != "" { - if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { - t.Error(err) - } - } +func inferenceModelRequestTotal(labels []label) string { + return fmt.Sprintf(` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{%s} 1 + `, labelsToString(labels), + ) +} - legacyregistry.Reset() - }) - } +func inferencePoolReadyPods(v int, labels []label) string { + return fmt.Sprintf(` + # HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool. + # TYPE inference_pool_ready_pods gauge + inference_pool_ready_pods{%s} %d + `, labelsToString(labels), v, + ) } func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { tests := []struct { name string requests []*extProcPb.ProcessingRequest - pods map[backendmetrics.Pod]*backendmetrics.Metrics + pods map[*backend.Pod]*backendmetrics.MetricsState wantResponses []*extProcPb.ProcessingResponse wantMetrics map[string]string wantErr bool @@ -410,366 +152,170 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // Request flow tests { name: "select lower queue and kv cache, no active lora", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test1", "my-model"), - // pod-1 will be picked because it has relatively low queue size and low KV cache. 
-			pods: map[backendmetrics.Pod]*backendmetrics.Metrics{
-				fakePod(0): {
-					WaitingQueueSize:    3,
-					KVCacheUsagePercent: 0.2,
-				},
-				fakePod(1): {
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0.1,
-				},
-				fakePod(2): {
-					WaitingQueueSize:    10,
-					KVCacheUsagePercent: 0.2,
-				},
-			},
-			wantMetrics: map[string]string{`inference_model_request_total`: `
-			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
-			# TYPE inference_model_request_total counter
-			inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1
-			`,
-				`inference_pool_ready_pods`: `
-			# HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool.
-			# TYPE inference_pool_ready_pods gauge
-			inference_pool_ready_pods{name="vllm-llama3-8b-instruct-pool"} 3
-			`,
+			requests: integrationutils.GenerateStreamedRequestSet(logger, "test1", modelMyModel, nil),
+			// Pod 1 will be picked because it has relatively low queue size and low KV cache.
+			pods: newPodStates(
+				podState{index: 0, queueSize: 3, kvCacheUsage: 0.2},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.1},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.2},
+			),
+			wantMetrics: map[string]string{
+				"inference_model_request_total": inferenceModelRequestTotal([]label{
+					{"model_name", modelMyModel},
+					{"target_model_name", modelMyModelTarget},
+				}),
+				"inference_pool_ready_pods": inferencePoolReadyPods(3, []label{
+					{"name", testPoolName},
+				}),
 			},
 			wantErr: false,
-			wantResponses: []*extProcPb.ProcessingResponse{
-				{
-					Response: &extProcPb.ProcessingResponse_RequestHeaders{
-						RequestHeaders: &extProcPb.HeadersResponse{
-							Response: &extProcPb.CommonResponse{
-								ClearRouteCache: true,
-								HeaderMutation: &extProcPb.HeaderMutation{
-									SetHeaders: []*configPb.HeaderValueOption{
-										{
-											Header: &configPb.HeaderValue{
-												Key:      "x-gateway-destination-endpoint",
-												RawValue: []byte("192.168.1.2:8000"),
-											},
-										},
-										{
-											Header: &configPb.HeaderValue{
-												Key:      "Content-Length",
-												RawValue: []byte(strconv.Itoa(76)),
-											},
-										},
-									}},
-							},
-						},
+			wantResponses: integrationutils.NewRequestBufferedResponse(
+				"192.168.1.2:8000",
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test1","temperature":0}`, modelMyModelTarget),
+				&configPb.HeaderValueOption{
+					Header: &configPb.HeaderValue{
+						Key:      "hi",
+						RawValue: []byte("mom"),
 					},
-					DynamicMetadata: makeMetadata("192.168.1.2:8000"),
 				},
+			),
+		},
+		{
+			name: "invalid json; return body",
+			requests: []*extProcPb.ProcessingRequest{
 				{
-					Response: &extProcPb.ProcessingResponse_RequestBody{
-						RequestBody: &extProcPb.BodyResponse{
-							Response: &extProcPb.CommonResponse{
-								BodyMutation: &extProcPb.BodyMutation{
-									Mutation: &extProcPb.BodyMutation_StreamedResponse{
-										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"),
-											EndOfStream: true,
-										},
+					Request: &extProcPb.ProcessingRequest_RequestHeaders{
+						RequestHeaders: &extProcPb.HttpHeaders{
+							Headers: &configPb.HeaderMap{
+								Headers: []*configPb.HeaderValue{
+									{
+										Key:   "hi",
+										Value: "mom",
 									},
 								},
 							},
 						},
 					},
 				},
+				{
+					Request: &extProcPb.ProcessingRequest_RequestBody{
+						RequestBody: &extProcPb.HttpBody{Body: []byte("no healthy upstream"), EndOfStream: true},
+					},
+				},
 			},
+			// The pod states below are not exercised: the invalid JSON body is rejected
+			// before endpoint selection, as the immediate BadRequest response asserts.
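+			// newPodStates/podState build the fake per-pod metric state (queue depth,
+			// KV-cache utilization, active LoRA adapters) that the scheduler reads;
+			// judging by the endpoints asserted in this file, pod index i is assumed
+			// to surface as endpoint 192.168.1.(i+1):8000.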
+ pods: newPodStates( + podState{index: 0, queueSize: 0, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + ), + wantErr: false, + wantResponses: integrationutils.NewImmediateErrorResponse( + envoyTypePb.StatusCode_BadRequest, + "inference gateway: BadRequest - Error unmarshaling request body: no healthy upstream", + ), }, { name: "select active lora, low queue", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test2", "sql-lora"), - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, + requests: integrationutils.GenerateStreamedRequestSet(logger, "test2", modelSQLLora, nil), + // Pod 1 will be picked because it has relatively low queue size, the requested model active, and low KV cache. + pods: newPodStates( + podState{index: 0, queueSize: 0, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + ), + + wantMetrics: map[string]string{ + "inference_model_request_total": inferenceModelRequestTotal([]label{ + {"model_name", modelSQLLora}, + {"target_model_name", modelSQLLoraTarget}, + }), }, - wantMetrics: map[string]string{`inference_model_request_total`: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
- # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `}, wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-gateway-destination-endpoint", - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(76)), - }, - }, - }}, - }, - }, - }, - DynamicMetadata: makeMetadata("192.168.1.2:8000"), - }, - { - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + wantResponses: integrationutils.NewRequestBufferedResponse( + "192.168.1.2:8000", + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test2","temperature":0}`, modelSQLLoraTarget), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "hi", + RawValue: []byte("mom"), }, }, - }, + ), }, { name: "select no lora despite active model, avoid excessive queue size", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test3", "sql-lora"), - // pod-2 will be picked despite it NOT having the requested model being active - // as it's above the affinity for queue size. Also is critical, so we should - // still honor request despite all queues > 5 - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 200, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, + requests: integrationutils.GenerateStreamedRequestSet(logger, "test3", modelSQLLora, nil), + // Pod 2 will be picked despite NOT having the requested model active as it is above the affinity for queue size. + // Also it is critical, so we should still admit the request despite all queue sizes being greater than the queue + // size threshold. + pods: newPodStates( + podState{index: 0, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + podState{index: 1, queueSize: 200, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}}, + podState{index: 2, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo"}}, + ), + wantMetrics: map[string]string{ + "inference_model_request_total": inferenceModelRequestTotal([]label{ + {"model_name", modelSQLLora}, + {"target_model_name", modelSQLLoraTarget}, + }), }, - wantMetrics: map[string]string{`inference_model_request_total`: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
- # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `}, wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-gateway-destination-endpoint", - RawValue: []byte("192.168.1.3:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(76)), - }, - }, - }}, - }, - }, - }, - DynamicMetadata: makeMetadata("192.168.1.3:8000"), - }, - { - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + wantResponses: integrationutils.NewRequestBufferedResponse( + "192.168.1.3:8000", + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test3","temperature":0}`, modelSQLLoraTarget), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "hi", + RawValue: []byte("mom"), }, }, - }, + ), }, { name: "noncritical and all models past threshold, shed request", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), - // no pods will be picked as all models are either above kv threshold, - // queue threshold, or both. 
-			pods: map[backendmetrics.Pod]*backendmetrics.Metrics{
-				fakePod(0): {
-					WaitingQueueSize:    6,
-					KVCacheUsagePercent: 0.2,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"bar":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
-				fakePod(1): {
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0.85,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
-				fakePod(2): {
-					WaitingQueueSize:    10,
-					KVCacheUsagePercent: 0.9,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
-			},
+			requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", modelSheddable, nil),
+			// pod 0: excluded; above queue size threshold
+			// pod 1: excluded; above KV cache threshold
+			// pod 2: excluded; above queue size threshold
+			pods: newPodStates(
+				podState{index: 0, queueSize: 6, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
+			),
 			wantErr:     false,
 			wantMetrics: map[string]string{},
-			wantResponses: []*extProcPb.ProcessingResponse{
-				{
-					Response: &extProcPb.ProcessingResponse_ImmediateResponse{
-						ImmediateResponse: &extProcPb.ImmediateResponse{
-							Status: &envoyTypePb.HttpStatus{
-								Code: envoyTypePb.StatusCode_TooManyRequests,
-							},
-						},
-					},
-				},
-			},
+			wantResponses: integrationutils.NewImmediateErrorResponse(
+				envoyTypePb.StatusCode_TooManyRequests,
+				"inference gateway: InferencePoolResourceExhausted - system saturated, non-critical request dropped",
+			),
 		},
 		{
 			name: "noncritical, but one server has capacity, do not shed",
-			requests: utiltesting.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"),
-			// pod 0 will be picked as all other models are above threshold
-			pods: map[backendmetrics.Pod]*backendmetrics.Metrics{
-				fakePod(0): {
-					WaitingQueueSize:    4,
-					KVCacheUsagePercent: 0.2,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"bar":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
-				fakePod(1): {
-					WaitingQueueSize:    0,
-					KVCacheUsagePercent: 0.85,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
-				fakePod(2): {
-					WaitingQueueSize:    10,
-					KVCacheUsagePercent: 0.9,
-					ActiveModels: map[string]int{
-						"foo":            1,
-						"sql-lora-1fdg3": 1,
-					},
-				},
+			requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", modelSheddable, nil),
+			// Pod 1 will be picked because it has the lowest queue size.
+			pods: newPodStates(
+				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
+			),
+			wantMetrics: map[string]string{
+				"inference_model_request_total": inferenceModelRequestTotal([]label{
+					{"model_name", modelSheddable},
+					{"target_model_name", modelSheddableTarget},
+				}),
 			},
-			wantMetrics: map[string]string{`inference_model_request_total`: `
-			# HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model.
-			# TYPE inference_model_request_total counter
-			inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1
-			`},
 			wantErr: false,
-			wantResponses: []*extProcPb.ProcessingResponse{
-				{
-					Response: &extProcPb.ProcessingResponse_RequestHeaders{
-						RequestHeaders: &extProcPb.HeadersResponse{
-							Response: &extProcPb.CommonResponse{
-								ClearRouteCache: true,
-								HeaderMutation: &extProcPb.HeaderMutation{
-									SetHeaders: []*configPb.HeaderValueOption{
-										{
-											Header: &configPb.HeaderValue{
-												Key:      "x-gateway-destination-endpoint",
-												RawValue: []byte("192.168.1.1:8000"),
-											},
-										},
-										{
-											Header: &configPb.HeaderValue{
-												Key:      "Content-Length",
-												RawValue: []byte(strconv.Itoa(76)),
-											},
-										},
-									}},
-							},
-						},
-					},
-					DynamicMetadata: makeMetadata("192.168.1.1:8000"),
-				},
-				{
-					Response: &extProcPb.ProcessingResponse_RequestBody{
-						RequestBody: &extProcPb.BodyResponse{
-							Response: &extProcPb.CommonResponse{
-								BodyMutation: &extProcPb.BodyMutation{
-									Mutation: &extProcPb.BodyMutation_StreamedResponse{
-										StreamedResponse: &extProcPb.StreamedBodyResponse{
-											Body:        []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"),
-											EndOfStream: true,
-										},
-									},
-								},
-							},
-						},
+			wantResponses: integrationutils.NewRequestBufferedResponse(
+				"192.168.1.2:8000",
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test5","temperature":0}`, modelSheddableTarget),
+				&configPb.HeaderValueOption{
+					Header: &configPb.HeaderValue{
+						Key:      "hi",
+						RawValue: []byte("mom"),
 					},
 				},
-			},
+			),
 		},
 		{
 			name: "body sent over multiple requests, noncritical, but one server has capacity, do not shed",
@@ -799,85 +345,29 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) {
 				},
 			},
 		},
-
-		//
-		// pod 0 will be picked as all other models are above threshold
-		pods: map[backendmetrics.Pod]*backendmetrics.Metrics{
-			fakePod(0): {
-				WaitingQueueSize:    4,
-				KVCacheUsagePercent: 0.2,
-				ActiveModels: map[string]int{
-					"foo":            1,
-					"bar":            1,
-					"sql-lora-1fdg3": 1,
-				},
-			},
-			fakePod(1): {
-				WaitingQueueSize:    0,
-				KVCacheUsagePercent: 0.85,
-				ActiveModels: map[string]int{
-					"foo":            1,
-					"sql-lora-1fdg3": 1,
-				},
-			},
-			fakePod(2): {
-				WaitingQueueSize:    10,
-				KVCacheUsagePercent: 0.9,
-				ActiveModels: map[string]int{
-					"foo":            1,
-					"sql-lora-1fdg3": 1,
-				},
-			},
+			// Pod 1 will be picked because it has the lowest queue size.
+			pods: newPodStates(
+				podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}},
+			),
+			wantMetrics: map[string]string{
+				"inference_model_request_total": inferenceModelRequestTotal([]label{
+					{"model_name", modelSheddable},
+					{"target_model_name", modelSheddableTarget},
+				}),
 			},
-			wantMetrics: map[string]string{`inference_model_request_total`: `
-			# HELP 
- # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 - `}, wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-gateway-destination-endpoint", - RawValue: []byte("192.168.1.1:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(76)), - }, - }, - }}, - }, - }, - }, - DynamicMetadata: makeMetadata("192.168.1.1:8000"), - }, - { - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test6\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + wantResponses: integrationutils.NewRequestBufferedResponse( + "192.168.1.2:8000", + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddableTarget), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "hi", + RawValue: []byte("mom"), }, }, - }, + ), }, { name: "inferencemodel's modelName is not translated, passthrough", @@ -907,85 +397,31 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, }, }, - - // - // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, + // pod 0: selected + // pod 1: excluded; above KV cache threshold + // pod 2: excluded; above queue size threshold + pods: newPodStates( + podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, + ), + wantMetrics: map[string]string{ + "inference_model_request_total": inferenceModelRequestTotal([]label{ + {"model_name", modelDirect}, + {"target_model_name", modelDirect}, + }), }, - wantMetrics: map[string]string{`inference_model_request_total`: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
- # TYPE inference_model_request_total counter - inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1 - `}, wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-gateway-destination-endpoint", - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(74)), - }, - }, - }}, - }, - }, - }, - DynamicMetadata: makeMetadata("192.168.1.2:8000"), - }, - { - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"direct-model\",\"prompt\":\"test6\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + wantResponses: integrationutils.NewRequestBufferedResponse( + "192.168.1.2:8000", + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelDirect), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "hi", + RawValue: []byte("mom"), }, }, - }, + ), }, // Response flow tests { @@ -1016,50 +452,42 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, }, }, - - // - // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, + // pod 0: selected + // pod 1: excluded; above KV cache threshold + // pod 2: excluded; above queue size threshold + pods: newPodStates( + podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, + ), + wantErr: false, + wantResponses: integrationutils.NewResponseBufferedResponse( + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddable), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), }, }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "content-type", + RawValue: []uint8("application/json"), }, }, - }, - wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ + ), + }, + { + name: "Response is invalid json; return body", + requests: []*extProcPb.ProcessingRequest{ { - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - 
{ - Header: &configPb.HeaderValue{ - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + Value: "application/json", }, }, }, @@ -1067,22 +495,35 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, }, { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("no healthy upstream"), EndOfStream: true}, }, }, }, + // pod 0: selected + // pod 1: excluded; above KV cache threshold + // pod 2: excluded; above queue size threshold + pods: newPodStates( + podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, + ), + wantErr: false, + wantResponses: integrationutils.NewResponseBufferedResponse( + "no healthy upstream", + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "content-type", + RawValue: []uint8("application/json"), + }, + }, + ), }, { name: "responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", @@ -1112,73 +553,30 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, }, }, - - // - // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - }, - }, + // pod 0: selected + // pod 1: excluded; above KV cache threshold + // pod 2: excluded; above queue size threshold + pods: newPodStates( + podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.85, activeModels: []string{"foo", modelSheddableTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.9, activeModels: []string{"foo", modelSheddableTarget}}, + ), wantErr: false, - wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: 
&configPb.HeaderValue{ - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, - }, - }, - }, - }, + wantResponses: integrationutils.NewResponseBufferedResponse( + fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test6","temperature":0}`, modelSheddable), + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), }, }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), - EndOfStream: true, - }, - }, - }, - }, - }, + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "content-type", + RawValue: []uint8("application/json"), }, }, - }, + ), }, { name: "responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", @@ -1239,9 +637,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { { Request: &extProcPb.ProcessingRequest_ResponseBody{ ResponseBody: &extProcPb.HttpBody{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} - data: [DONE]`, - ), + Body: []byte("data: {\"id\":\"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9\",\"object\":\"text_completion\",\"created\":1741379018,\"model\":\"food-review-1\",\"choices\":[],\"usage\":{\"prompt_tokens\":7,\"total_tokens\":17,\"completion_tokens\":10}}\ndata: [DONE]"), EndOfStream: false}, }, }, @@ -1281,138 +677,33 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { inference_model_input_tokens_count{model_name="",target_model_name=""} 1 `}, wantResponses: []*extProcPb.ProcessingResponse{ - { - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, - }, - }, - }, - }, - }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, - }, - }, - }, + integrationutils.NewResponseHeaders( + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), }, }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: 
[]byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, - }, - }, - }, + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "content-type", + RawValue: []byte("text/event-stream"), }, }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, - }, - }, - }, + &configPb.HeaderValueOption{ + Header: &configPb.HeaderValue{ + Key: "status", + RawValue: []byte("200"), }, }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, - }, - }, - }, - }, - }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), - EndOfStream: false, - }, - }, - }, - }, - }, - }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}} - data: [DONE]`, - ), - EndOfStream: false, - }, - }, - }, - }, - }, - }, - }, - { - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: []byte(""), - EndOfStream: true, - }, - }, - }, - }, - }, - }, - }, + ), + integrationutils.NewResponseStreamChunk(`data: 
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`, false), + integrationutils.NewResponseStreamChunk(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`, false), + integrationutils.NewResponseStreamChunk(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`, false), + integrationutils.NewResponseStreamChunk(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`, false), + integrationutils.NewResponseStreamChunk(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"food-review-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`, false), + integrationutils.NewResponseStreamChunk("data: {\"id\":\"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9\",\"object\":\"text_completion\",\"created\":1741379018,\"model\":\"food-review-1\",\"choices\":[],\"usage\":{\"prompt_tokens\":7,\"total_tokens\":17,\"completion_tokens\":10}}\ndata: [DONE]", false), + integrationutils.NewResponseStreamChunk("", true), }, }, // Bodyless Request test @@ -1439,90 +730,154 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, }, }, + wantResponses: []*extProcPb.ProcessingResponse{}, + pods: newPodStates( + podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar", modelSheddableTarget}}, + ), + wantMetrics: map[string]string{}, + }, + { + name: "select active lora with subsetting tag, all pods available", + requests: integrationutils.GenerateStreamedRequestSet( + logger, + "test2", + modelSQLLora, + []string{"192.168.1.1:8000", "192.168.1.2:8000", "192.168.1.3:8000"}), + // Pod 1 will be picked because it has relatively low queue size, the requested model active, low KV cache, and within subset. 
+			pods: newPodStates(
+				podState{index: 0, queueSize: 0, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}},
+			),
+
+			wantMetrics: map[string]string{
+				"inference_model_request_total": inferenceModelRequestTotal([]label{
+					{"model_name", modelSQLLora},
+					{"target_model_name", modelSQLLoraTarget},
+				}),
+			},
+			wantErr: false,
+			wantResponses: integrationutils.NewRequestBufferedResponse(
+				"192.168.1.2:8000",
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test2","temperature":0}`, modelSQLLoraTarget),
+				&configPb.HeaderValueOption{
+					Header: &configPb.HeaderValue{
+						Key:      "hi",
+						RawValue: []byte("mom"),
+					},
+				},
+			),
+		},
+		{
+			name: "select active lora with subsetting tag, some pods match",
+			requests: integrationutils.GenerateStreamedRequestSet(
+				logger,
+				"test2",
+				modelSQLLora,
+				[]string{"192.168.1.3:8000"}),
+			// Pod 2 has a high queue size, but it will still be picked because it is the only pod matching the subset target.
+			pods: newPodStates(
+				podState{index: 0, queueSize: 0, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}},
+				podState{index: 1, queueSize: 0, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}},
+				podState{index: 2, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}},
+			),
+
+			wantMetrics: map[string]string{
+				"inference_model_request_total": inferenceModelRequestTotal([]label{
+					{"model_name", modelSQLLora},
+					{"target_model_name", modelSQLLoraTarget},
+				}),
+			},
+			wantErr: false,
+			wantResponses: integrationutils.NewRequestBufferedResponse(
+				"192.168.1.3:8000",
+				fmt.Sprintf(`{"max_tokens":100,"model":%q,"prompt":"test2","temperature":0}`, modelSQLLoraTarget),
+				&configPb.HeaderValueOption{
+					Header: &configPb.HeaderValue{
+						Key:      "hi",
+						RawValue: []byte("mom"),
+					},
+				},
+			),
+		},
+		{
+			name: "select active lora with subsetting tag, no pods available",
+			requests: integrationutils.GenerateStreamedRequestSet(
+				logger,
+				"test2",
+				modelSQLLora,
+				[]string{"192.168.1.4:8000", "192.168.1.5:8000", "192.168.1.6:8000"}),
+			// No pods will be picked as none are within the subset.
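+			// The EPP is expected to reply with an immediate 503 ServiceUnavailable, asserted in wantResponses below.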
+ pods: newPodStates( + podState{index: 0, queueSize: 0, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + podState{index: 1, queueSize: 0, kvCacheUsage: 0.1, activeModels: []string{"foo", modelSQLLoraTarget}}, + podState{index: 2, queueSize: 10, kvCacheUsage: 0.2, activeModels: []string{"foo", "bar"}}, + ), + + wantMetrics: map[string]string{}, + wantErr: true, wantResponses: []*extProcPb.ProcessingResponse{ { - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: "x-gateway-destination-endpoint", - RawValue: []byte("192.168.1.1:8000"), - }, - }, - }}, + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_ServiceUnavailable, }, + Body: []byte("inference gateway: ServiceUnavailable - failed to find candidate pods for serving the request"), }, }, - DynamicMetadata: makeMetadata("192.168.1.1:8000"), - }, - }, - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, }, }, - wantMetrics: map[string]string{`inference_pool_ready_pods`: ` - # HELP inference_pool_ready_pods [ALPHA] The number of ready pods in the inference server pool. - # TYPE inference_pool_ready_pods gauge - inference_pool_ready_pods{name="vllm-llama3-8b-instruct-pool"} 1 - `}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, test.pods, true) + client, cleanup := setUpHermeticServer(t, test.pods) t.Cleanup(cleanup) - responses, err := streamedRequest(t, client, test.requests, len(test.wantResponses)) + responses, err := integrationutils.StreamedRequest(t, client, test.requests, len(test.wantResponses)) if err != nil && !test.wantErr { t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) } - if diff := cmp.Diff(test.wantResponses, responses, protocmp.Transform()); diff != "" { + if diff := cmp.Diff(test.wantResponses, responses, + protocmp.Transform(), + protocmp.SortRepeated(func(a, b *configPb.HeaderValueOption) bool { + return a.GetHeader().GetKey() < b.GetHeader().GetKey() + }), + ); diff != "" { t.Errorf("Unexpected response, (-want +got): %v", diff) } if len(test.wantMetrics) != 0 { for metricName, value := range test.wantMetrics { - if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(value), metricName); err != nil { + if err := metricsutils.GatherAndCompare(crmetrics.Registry, strings.NewReader(value), metricName); err != nil { t.Error(err) } } } - - legacyregistry.Reset() + metrics.Reset() }) } } -func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { +func setUpHermeticServer(t *testing.T, podAndMetrics map[*backend.Pod]*backendmetrics.MetricsState) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { // Reconfigure the TestPodMetricsClient. 
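+	// The fake metrics client serves the per-test metrics configured for each pod.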
-	res := map[types.NamespacedName]*backendmetrics.Metrics{}
+	res := map[types.NamespacedName]*backendmetrics.MetricsState{}
 	for pod, metrics := range podAndMetrics {
 		res[pod.NamespacedName] = metrics
 	}
 	serverRunner.TestPodMetricsClient.SetRes(res)
-	serverRunner.UseStreaming = streamed
 
 	serverCtx, stopServer := context.WithCancel(context.Background())
 
 	// TODO: this should be consistent with the inference pool
 	podLabels := map[string]string{
-		"app": "vllm-llama3-8b-instruct-pool",
+		"app": testPoolName,
 	}
 
 	for pod := range podAndMetrics {
-		pod := utiltesting.MakePod(pod.NamespacedName.Name).
+		pod := epptestutil.MakePod(pod.NamespacedName.Name).
 			Namespace(pod.NamespacedName.Namespace).
 			ReadyCondition().
 			Labels(podLabels).
@@ -1547,16 +902,17 @@
 		}
 	}()
 
+	time.Sleep(serverRunner.RefreshPrometheusMetricsInterval) // wait for metrics to become available before running tests that rely on these metrics
+
 	// check if all pods are synced to datastore
 	assert.EventuallyWithT(t, func(t *assert.CollectT) {
 		assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podAndMetrics), "Datastore not synced")
 	}, 10*time.Second, time.Second)
 
-	address := fmt.Sprintf("localhost:%v", port)
 	// Create a grpc connection
-	conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials()))
+	conn, err := grpc.NewClient(testGRPCAddress, grpc.WithTransportCredentials(insecure.NewCredentials()))
 	if err != nil {
-		logutil.Fatal(logger, err, "Failed to connect", "address", address)
+		logutil.Fatal(logger, err, "Failed to connect", "address", testGRPCAddress)
 	}
 
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
@@ -1571,7 +927,7 @@
 
 	// clear created pods
 	for pod := range podAndMetrics {
-		pod := utiltesting.MakePod(pod.NamespacedName.Name).
+		pod := epptestutil.MakePod(pod.NamespacedName.Name).
 			Namespace(pod.NamespacedName.Namespace).Complete().ObjRef()
 
 		if err := k8sClient.Delete(context.Background(), pod); err != nil {
@@ -1581,11 +937,39 @@
 	}
 }
 
-func fakePod(index int) backendmetrics.Pod {
-	return backendmetrics.Pod{
-		NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"},
+func fakePod(index int) *backend.Pod {
+	return &backend.Pod{
+		NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: testNamespace},
 		Address:        fmt.Sprintf("192.168.1.%d", index+1),
+		Labels:         make(map[string]string, 0),
+	}
+}
+
+// podState is a descriptor for a pod's simulated metrics.
+type podState struct {
+	index        int
+	queueSize    int
+	kvCacheUsage float64
+	activeModels []string
+}
+
+// newPodStates generates the backend metrics map required by the test setup.
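+// For example (a sketch; the values are illustrative):
+//
+//	pods := newPodStates(
+//		podState{index: 0, queueSize: 4, kvCacheUsage: 0.2, activeModels: []string{"foo"}},
+//	)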
+func newPodStates(states ...podState) map[*backend.Pod]*backendmetrics.MetricsState { + res := make(map[*backend.Pod]*backendmetrics.MetricsState) + for _, s := range states { + pod := fakePod(s.index) + activeModelsMap := make(map[string]int) + for _, model := range s.activeModels { + activeModelsMap[model] = 1 + } + res[pod] = &backendmetrics.MetricsState{ + WaitingQueueSize: s.queueSize, + KVCacheUsagePercent: s.kvCacheUsage, + ActiveModels: activeModelsMap, + WaitingModels: make(map[string]int), + } } + return res } // Sets up a test environment and returns the runner struct @@ -1601,7 +985,7 @@ func BeforeSuite() func() { } utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha2.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.Install(scheme)) k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme}) if err != nil { @@ -1613,21 +997,37 @@ func BeforeSuite() func() { // Init runtime. ctrl.SetLogger(logger) - mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama3-8b-instruct-pool")) + metrics.Register() + // Register metrics handler. + // Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server. + // More info: + // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.19.1/pkg/metrics/server + // - https://book.kubebuilder.io/reference/metrics.html + metricsServerOptions := metricsserver.Options{ + BindAddress: fmt.Sprintf(":%d", testMetricsPort), + FilterProvider: filters.WithAuthenticationAndAuthorization, + } + mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions(testNamespace, testPoolName, metricsServerOptions)) if err != nil { logutil.Fatal(logger, err, "Failed to create controller manager") } - if err := registerMetricsHandler(mgr, metricsPort); err != nil { - logutil.Fatal(logger, err, "Failed to register metrics handler") - } - - serverRunner = runserver.NewDefaultExtProcServerRunner() + serverRunner = server.NewDefaultExtProcServerRunner() serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{} pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond) // Adjust from defaults - serverRunner.PoolName = "vllm-llama3-8b-instruct-pool" + serverRunner.PoolNamespacedName = types.NamespacedName{Name: testPoolName, Namespace: testNamespace} serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf) + scheduler := scheduling.NewScheduler() + + sdConfig := &saturationdetector.Config{ + QueueDepthThreshold: saturationdetector.DefaultQueueDepthThreshold, + KVCacheUtilThreshold: saturationdetector.DefaultKVCacheUtilThreshold, + MetricsStalenessThreshold: saturationdetector.DefaultMetricsStalenessThreshold, + } + detector := saturationdetector.NewDetector(sdConfig, serverRunner.Datastore, logger.WithName("saturation-detector")) + serverRunner.SaturationDetector = detector + serverRunner.Director = requestcontrol.NewDirectorWithConfig(serverRunner.Datastore, scheduler, detector, requestcontrol.NewConfig()) serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil { @@ -1651,32 +1051,18 @@ func BeforeSuite() func() { } for _, doc := range docs { - inferenceModel := &v1alpha2.InferenceModel{} - if err = yaml.Unmarshal(doc, inferenceModel); err != nil { + obj := &unstructured.Unstructured{} + if err = yaml.Unmarshal(doc, obj); err != nil { logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } - if 
inferenceModel.Kind == "InferenceModel" { - logger.Info("Creating inference model", "model", inferenceModel) - if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - logutil.Fatal(logger, err, "Unable to create inferenceModel", "modelName", inferenceModel.Name) - } - } - } - for _, doc := range docs { - inferencePool := &v1alpha2.InferencePool{} - if err = yaml.Unmarshal(doc, inferencePool); err != nil { - logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) - } - if inferencePool.Kind == "InferencePool" { - logger.Info("Creating inference pool", "pool", inferencePool) - if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - logutil.Fatal(logger, err, "Unable to create inferencePool", "poolName", inferencePool.Name) - } + logger.Info("Creating object", "kind", obj.GetKind(), "object", obj) + if err := k8sClient.Create(context.Background(), obj); err != nil { + logutil.Fatal(logger, err, "Unable to create object", "object", obj.GetName()) } } assert.Eventually(nil, func() bool { - modelExist := serverRunner.Datastore.ModelGet("my-model") + modelExist := serverRunner.Datastore.ModelGet(modelMyModel) synced := serverRunner.Datastore.PoolHasSynced() && modelExist != nil return synced }, 10*time.Second, 10*time.Millisecond) @@ -1688,55 +1074,6 @@ func BeforeSuite() func() { } } -func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - - res, err := client.Recv() - if err != nil { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - return res, err -} - -func streamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) { - for _, req := range requests { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - } - responses := []*extProcPb.ProcessingResponse{} - - // Make an incredible simple timeout func in the case where - // there is less than the expected amount of responses; bail and fail. - var simpleTimeout bool - go func() { - time.Sleep(10 * time.Second) - simpleTimeout = true - }() - - for range expectedResponses { - if simpleTimeout { - break - } - res, err := client.Recv() - if err != nil && err != io.EOF { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - responses = append(responses, res) - } - return responses, nil -} - // readDocuments reads documents from file. 
func readDocuments(fp string) ([][]byte, error) { b, err := os.ReadFile(fp) @@ -1760,61 +1097,13 @@ func readDocuments(fp string) ([][]byte, error) { return docs, nil } -func makeMetadata(endpoint string) *structpb.Struct { - return &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultDestinationEndpointHintMetadataNamespace: { - Kind: &structpb.Value_StructValue{ - StructValue: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultDestinationEndpointHintKey: { - Kind: &structpb.Value_StringValue{ - StringValue: endpoint, - }, - }, - }, - }, - }, - }, - }, - } -} - -// registerMetricsHandler is a simplified version of metrics endpoint handler -// without Authentication for integration tests. -func registerMetricsHandler(mgr manager.Manager, port int) error { - metrics.Register() - - // Init HTTP server. - h := promhttp.HandlerFor( - legacyregistry.DefaultGatherer, - promhttp.HandlerOpts{}, - ) - - mux := http.NewServeMux() - mux.Handle("/metrics", h) - - srv := &http.Server{ - Addr: net.JoinHostPort("", strconv.Itoa(port)), - Handler: mux, - } - - if err := mgr.Add(&manager.Server{ - Name: "metrics", - Server: srv, - }); err != nil { - return err - } - return nil -} - // inject options that allow multiple test runs to run // https://github.com/kubernetes-sigs/controller-runtime/issues/2937 -func managerTestOptions(namespace, name string) ctrl.Options { +func managerTestOptions(namespace, name string, metricsServerOptions metricsserver.Options) ctrl.Options { return ctrl.Options{ Scheme: scheme, Cache: cache.Options{ - ByObject: map[client.Object]cache.ByObject{ + ByObject: map[k8sclient.Object]cache.ByObject{ &corev1.Pod{}: { Namespaces: map[string]cache.Config{ namespace: {}, @@ -1839,6 +1128,7 @@ func managerTestOptions(namespace, name string) ctrl.Options { Controller: config.Controller{ SkipNameValidation: boolPointer(true), }, + Metrics: metricsServerOptions, } } diff --git a/test/integration/util.go b/test/integration/util.go new file mode 100644 index 000000000..925107bf8 --- /dev/null +++ b/test/integration/util.go @@ -0,0 +1,298 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package integration
+
+import (
+	"encoding/json"
+	"io"
+	"strconv"
+	"testing"
+	"time"
+
+	envoyCorev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
+	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
+	"github.com/go-logr/logr"
+	"google.golang.org/protobuf/types/known/structpb"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	headerKeyDestination   = "x-gateway-destination-endpoint"
+	headerKeyContentLength = "Content-Length"
+)
+
+func SendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
+	t.Logf("Sending request: %v", req)
+	if err := client.Send(req); err != nil {
+		t.Logf("Failed to send request %+v: %v", req, err)
+		return nil, err
+	}
+
+	res, err := client.Recv()
+	if err != nil {
+		t.Logf("Failed to receive: %v", err)
+		return nil, err
+	}
+	t.Logf("Received response %+v", res)
+	return res, err
+}
+
+func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) {
+	for _, req := range requests {
+		t.Logf("Sending request: %v", req)
+		if err := client.Send(req); err != nil {
+			t.Logf("Failed to send request %+v: %v", req, err)
+			return nil, err
+		}
+	}
+	responses := []*extProcPb.ProcessingResponse{}
+
+	// Make an incredibly simple timeout func for the case where
+	// there are fewer than the expected number of responses; bail and fail.
+	var simpleTimeout bool
+	go func() {
+		time.Sleep(10 * time.Second)
+		simpleTimeout = true
+	}()
+
+	for range expectedResponses {
+		if simpleTimeout {
+			break
+		}
+		res, err := client.Recv()
+		if err != nil && err != io.EOF {
+			t.Logf("Failed to receive: %v", err)
+			return nil, err
+		}
+		t.Logf("Received response %+v", res)
+		responses = append(responses, res)
+	}
+	return responses, nil
+}
+
+func GenerateRequest(logger logr.Logger, prompt, model string, filterMetadata []string) *extProcPb.ProcessingRequest {
+	j := map[string]any{
+		"prompt":      prompt,
+		"max_tokens":  100,
+		"temperature": 0,
+	}
+	if model != "" {
+		j["model"] = model
+	}
+
+	llmReq, err := json.Marshal(j)
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to marshal LLM request")
+	}
+	req := &extProcPb.ProcessingRequest{
+		Request: &extProcPb.ProcessingRequest_RequestBody{
+			RequestBody: &extProcPb.HttpBody{Body: llmReq, EndOfStream: true},
+		},
+		MetadataContext: &envoyCorev3.Metadata{
+			FilterMetadata: GenerateRequestMetadata(filterMetadata),
+		},
+	}
+	return req
+}
+
+func GenerateStreamedRequestSet(logger logr.Logger, prompt, model string, filterMetadata []string) []*extProcPb.ProcessingRequest {
+	requests := []*extProcPb.ProcessingRequest{}
+	headerReq := &extProcPb.ProcessingRequest{
+		Request: &extProcPb.ProcessingRequest_RequestHeaders{
+			RequestHeaders: &extProcPb.HttpHeaders{
+				Headers: &envoyCorev3.HeaderMap{
+					Headers: []*envoyCorev3.HeaderValue{
+						{
+							Key:   "hi",
+							Value: "mom",
+						},
+					},
+				},
+			},
+		},
+	}
+
+	headerReq.MetadataContext = &envoyCorev3.Metadata{
+		FilterMetadata: GenerateRequestMetadata(filterMetadata),
+	}
+
+	requests = append(requests, headerReq)
+	requests = append(requests, GenerateRequest(logger, prompt, model, filterMetadata))
+	return requests
+}
+
+func
GenerateRequestMetadata(filterMetadata []string) map[string]*structpb.Struct { + metadata := make(map[string]*structpb.Struct) + interfaceList := make([]any, len(filterMetadata)) + for i, val := range filterMetadata { + interfaceList[i] = val + } + if filterMetadata != nil { + structVal, _ := structpb.NewStruct(map[string]any{ + "x-gateway-destination-endpoint-subset": interfaceList, + }) + metadata["envoy.lb.subset_hint"] = structVal + } + return metadata +} + +// NewRequestBufferedResponse creates a complete set of responses for the request phase. +// It modifies request headers (e.g., for routing) and replaces the entire request body. +// It returns a slice of two messages, representing the complete buffered action. +func NewRequestBufferedResponse( + destinationEndpoint string, + rewrittenBody string, + otherHeaders ...*envoyCorev3.HeaderValueOption, +) []*extProcPb.ProcessingResponse { + setHeaders := []*envoyCorev3.HeaderValueOption{ + { + Header: &envoyCorev3.HeaderValue{ + Key: headerKeyDestination, + RawValue: []byte(destinationEndpoint), + }, + }, + { + Header: &envoyCorev3.HeaderValue{ + Key: headerKeyContentLength, + RawValue: []byte(strconv.Itoa(len(rewrittenBody))), + }, + }, + } + setHeaders = append(setHeaders, otherHeaders...) + + headerResponse := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: setHeaders, + }, + }, + }, + }, + DynamicMetadata: makeMetadata(destinationEndpoint), + } + + bodyResponse := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(rewrittenBody), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + } + + return []*extProcPb.ProcessingResponse{headerResponse, bodyResponse} +} + +// NewResponseBufferedResponse creates a complete set of responses for the response phase. +// It modifies response headers and replaces the entire response body. +// It is used when the processor buffers the upstream response before sending its own. +func NewResponseBufferedResponse( + rewrittenBody string, + headersToSet ...*envoyCorev3.HeaderValueOption, +) []*extProcPb.ProcessingResponse { + return []*extProcPb.ProcessingResponse{ + NewResponseHeaders(headersToSet...), + NewResponseStreamChunk(rewrittenBody, true), + } +} + +// NewResponseHeaders creates a single response message to modify the response headers. +// This is the first step in either a buffered or streaming response modification. +func NewResponseHeaders(headersToSet ...*envoyCorev3.HeaderValueOption) *extProcPb.ProcessingResponse { + return &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: headersToSet, + }, + }, + }, + }, + } +} + +// NewResponseStreamChunk creates a single response for one body chunk in a stream. +// This is used to test streaming behaviors like text/event-stream pass-through. 
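+// A streamed response is typically asserted as NewResponseHeaders followed by one
+// chunk per body frame, terminated by an empty end-of-stream chunk, e.g.:
+//
+//	NewResponseStreamChunk(`data: {...}`, false),
+//	NewResponseStreamChunk("", true),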
+func NewResponseStreamChunk(body string, endOfStream bool) *extProcPb.ProcessingResponse { + return &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(body), + EndOfStream: endOfStream, + }, + }, + }, + }, + }, + }, + } +} + +// NewImmediateErrorResponse creates an immediate response to terminate processing. +// This is used for errors like load shedding or bad requests. +func NewImmediateErrorResponse(code envoyTypePb.StatusCode, body string) []*extProcPb.ProcessingResponse { + response := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: code, + }, + Body: []byte(body), + }, + }, + } + return []*extProcPb.ProcessingResponse{response} +} + +// makeMetadata creates the dynamic metadata struct that Envoy uses for routing hints. +func makeMetadata(endpoint string) *structpb.Struct { + return &structpb.Struct{ + Fields: map[string]*structpb.Value{ + server.DefaultDestinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + server.DefaultDestinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + }, + }, + }, + }, + } +} diff --git a/test/testdata/configloader_1_test.yaml b/test/testdata/configloader_1_test.yaml new file mode 100644 index 000000000..f1f167efb --- /dev/null +++ b/test/testdata/configloader_1_test.yaml @@ -0,0 +1,22 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: EndpointPickerConfig +plugins: +- name: test1 + type: test-one + parameters: + threshold: 10 +- name: profileHandler + type: test-profile-handler +- type: test-two + parameters: + hashBlockSize: 32 +- name: testPicker + type: test-picker + +schedulingProfiles: +- name: default + plugins: + - pluginRef: test1 + - pluginRef: test-two + weight: 50 + - pluginRef: testPicker diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml index fc32b5aa2..38418f978 100644 --- a/test/testdata/envoy.yaml +++ b/test/testdata/envoy.yaml @@ -100,14 +100,15 @@ data: grpc_service: envoy_grpc: cluster_name: ext_proc - authority: vllm-llama3-8b-instruct-epp.default:9002 + authority: vllm-llama3-8b-instruct-epp.$E2E_NS:9002 timeout: 10s processing_mode: request_header_mode: SEND - response_header_mode: SKIP - request_body_mode: BUFFERED - request_trailer_mode: SKIP - response_trailer_mode: SKIP + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND message_timeout: 1000s # Mark it as disabled if needed for troubleshooting: # disabled: true @@ -169,6 +170,16 @@ data: max_pending_requests: 40000 max_requests: 40000 max_retries: 1024 + health_checks: + - timeout: 2s + interval: 10s + unhealthy_threshold: 3 + healthy_threshold: 2 + reuse_connection: true + grpc_health_check: + service_name: "envoy.service.ext_proc.v3.ExternalProcessor" + tls_options: + alpn_protocols: ["h2"] # This ensures that envoy accepts untrusted certificates. 
We tried to explicitly # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work # and what worked is setting the common_tls_context to empty. @@ -194,9 +205,8 @@ data: - endpoint: address: socket_address: - address: vllm-llama3-8b-instruct-epp.default + address: vllm-llama3-8b-instruct-epp.$E2E_NS port_value: 9002 - health_status: HEALTHY load_balancing_weight: 1 --- apiVersion: apps/v1 @@ -221,10 +231,10 @@ spec: spec: containers: - name: envoy - image: docker.io/envoyproxy/envoy:distroless-v1.32.2 + image: docker.io/envoyproxy/envoy:distroless-v1.33.2 args: - "--service-cluster" - - "default/inference-gateway" + - "$E2E_NS/inference-gateway" - "--service-node" - "$(ENVOY_POD_NAME)" - "--log-level" diff --git a/test/testdata/inferencepool-e2e.yaml b/test/testdata/inferencepool-e2e.yaml new file mode 100644 index 000000000..e92a1e0a9 --- /dev/null +++ b/test/testdata/inferencepool-e2e.yaml @@ -0,0 +1,124 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + labels: + name: vllm-llama3-8b-instruct +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama3-8b-instruct + extensionRef: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +spec: + selector: + app: vllm-llama3-8b-instruct-epp + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + appProtocol: http2 + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS + labels: + app: vllm-llama3-8b-instruct-epp +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-llama3-8b-instruct-epp + template: + metadata: + labels: + app: vllm-llama3-8b-instruct-epp + spec: + # Conservatively, this timeout should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + - name: epp + image: $E2E_IMAGE + imagePullPolicy: IfNotPresent + args: + - -poolName + - "vllm-llama3-8b-instruct" + - -poolNamespace + - "$E2E_NS" + - -v + - "4" + - --zap-encoder + - "json" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: USE_STREAMING + value: "true" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding +subjects: +- kind: ServiceAccount + name: default + namespace: $E2E_NS +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pod-read diff --git a/test/testdata/metrics-rbac.yaml b/test/testdata/metrics-rbac.yaml new file mode 100644 index 000000000..051891649 
--- /dev/null
+++ b/test/testdata/metrics-rbac.yaml
@@ -0,0 +1,37 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: inference-gateway-metrics-reader
+rules:
+- nonResourceURLs:
+  - /metrics
+  verbs:
+  - get
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: inference-gateway-sa-metrics-reader
+  namespace: $E2E_NS
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: inference-gateway-sa-metrics-reader-role-binding
+subjects:
+- kind: ServiceAccount
+  name: inference-gateway-sa-metrics-reader
+  namespace: $E2E_NS
+roleRef:
+  kind: ClusterRole
+  name: inference-gateway-metrics-reader
+  apiGroup: rbac.authorization.k8s.io
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: inference-gateway-sa-metrics-reader-secret
+  namespace: $E2E_NS
+  annotations:
+    kubernetes.io/service-account.name: inference-gateway-sa-metrics-reader
+type: kubernetes.io/service-account-token
\ No newline at end of file
diff --git a/test/utils/handle.go b/test/utils/handle.go
new file mode 100644
index 000000000..417346f97
--- /dev/null
+++ b/test/utils/handle.go
@@ -0,0 +1,71 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package utils
+
+import (
+	"context"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/plugins"
+)
+
+// testHandle is an implementation of plugins.Handle for test purposes
+type testHandle struct {
+	ctx     context.Context
+	plugins plugins.HandlePlugins
+}
+
+// Context returns a context the plugins can use, if they need one
+func (h *testHandle) Context() context.Context {
+	return h.ctx
+}
+
+func (h *testHandle) Plugins() plugins.HandlePlugins {
+	return h.plugins
+}
+
+type testHandlePlugins struct {
+	thePlugins map[string]plugins.Plugin
+}
+
+func (h *testHandlePlugins) Plugin(name string) plugins.Plugin {
+	return h.thePlugins[name]
+}
+
+func (h *testHandlePlugins) AddPlugin(name string, plugin plugins.Plugin) {
+	h.thePlugins[name] = plugin
+}
+
+func (h *testHandlePlugins) GetAllPlugins() []plugins.Plugin {
+	result := make([]plugins.Plugin, 0)
+	for _, plugin := range h.thePlugins {
+		result = append(result, plugin)
+	}
+	return result
+}
+
+func (h *testHandlePlugins) GetAllPluginsWithNames() map[string]plugins.Plugin {
+	return h.thePlugins
+}
+
+func NewTestHandle(ctx context.Context) plugins.Handle {
+	return &testHandle{
+		ctx: ctx,
+		plugins: &testHandlePlugins{
+			thePlugins: map[string]plugins.Plugin{},
+		},
+	}
+}
diff --git a/test/utils/server.go b/test/utils/server.go
new file mode 100644
index 000000000..f3d0a5a94
--- /dev/null
+++ b/test/utils/server.go
@@ -0,0 +1,190 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package utils + +import ( + "context" + "net" + "testing" + "time" + + corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + pb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/grpc/test/bufconn" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" +) + +const bufSize = 1024 * 1024 + +var testListener *bufconn.Listener + +func PrepareForTestStreamingServer(models []*v1alpha2.InferenceModel, pods []*v1.Pod, poolName string, namespace string, + poolPort int32) (context.Context, context.CancelFunc, datastore.Datastore, *metrics.FakePodMetricsClient) { + ctx, cancel := context.WithCancel(context.Background()) + + pmc := &metrics.FakePodMetricsClient{} + pmf := metrics.NewPodMetricsFactory(pmc, time.Second) + ds := datastore.NewDatastore(ctx, pmf) + + initObjs := []client.Object{} + for _, model := range models { + initObjs = append(initObjs, model) + ds.ModelSetIfOlder(model) + } + for _, pod := range pods { + initObjs = append(initObjs, pod) + ds.PodUpdateOrAddIfNotExist(pod) + } + + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initObjs...). + Build() + pool := testutil.MakeInferencePool(poolName).Namespace(namespace).ObjRef() + pool.Spec.TargetPortNumber = poolPort + _ = ds.PoolSet(context.Background(), fakeClient, pool) + + return ctx, cancel, ds, pmc +} + +func SetupTestStreamingServer(t *testing.T, ctx context.Context, ds datastore.Datastore, + streamingServer pb.ExternalProcessorServer) (*bufconn.Listener, chan error) { + testListener = bufconn.Listen(bufSize) + + errChan := make(chan error) + go func() { + err := LaunchTestGRPCServer(streamingServer, ctx, testListener) + if err != nil { + t.Error("Error launching listener", err) + } + errChan <- err + }() + + time.Sleep(2 * time.Second) + return testListener, errChan +} + +func testDialer(context.Context, string) (net.Conn, error) { + return testListener.Dial() +} + +func GetStreamingServerClient(ctx context.Context, t *testing.T) (pb.ExternalProcessor_ProcessClient, *grpc.ClientConn) { + opts := []grpc.DialOption{ + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithContextDialer(testDialer), + } + conn, err := grpc.NewClient("passthrough://bufconn", opts...) 
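+	// The "passthrough" scheme skips name resolution; the target is handed to the
+	// context dialer above, which returns an in-memory bufconn connection.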
+	if err != nil {
+		t.Error(err)
+		return nil, nil
+	}
+
+	extProcClient := pb.NewExternalProcessorClient(conn)
+	process, err := extProcClient.Process(ctx)
+	if err != nil {
+		t.Error(err)
+		return nil, nil
+	}
+
+	return process, conn
+}
+
+// LaunchTestGRPCServer starts the gRPC server on the given listener (exported to enable testing).
+func LaunchTestGRPCServer(s pb.ExternalProcessorServer, ctx context.Context, listener net.Listener) error {
+	grpcServer := grpc.NewServer()
+
+	pb.RegisterExternalProcessorServer(grpcServer, s)
+
+	// Terminate the server on context closed.
+	go func() {
+		<-ctx.Done()
+		grpcServer.GracefulStop()
+	}()
+
+	if err := grpcServer.Serve(listener); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func CheckEnvoyGRPCHeaders(t *testing.T, response *pb.CommonResponse, expectedHeaders map[string]string) bool {
+	headers := response.HeaderMutation.SetHeaders
+	for expectedKey, expectedValue := range expectedHeaders {
+		found := false
+		for _, header := range headers {
+			if header.Header.Key == expectedKey {
+				if expectedValue != string(header.Header.RawValue) {
+					t.Errorf("Incorrect value for header %s, want %s got %s", expectedKey, expectedValue,
+						string(header.Header.RawValue))
+					return false
+				}
+				found = true
+				break
+			}
+		}
+		if !found {
+			t.Errorf("Missing header %s", expectedKey)
+			return false
+		}
+	}
+
+	for _, header := range headers {
+		expectedValue, ok := expectedHeaders[header.Header.Key]
+		if !ok {
+			t.Errorf("Unexpected header %s", header.Header.Key)
+			return false
+		} else if expectedValue != string(header.Header.RawValue) {
+			t.Errorf("Incorrect value for header %s, want %s got %s", header.Header.Key, expectedValue,
+				string(header.Header.RawValue))
+			return false
+		}
+	}
+	return true
+}
+
+func BuildEnvoyGRPCHeaders(headers map[string]string, rawValue bool) *pb.HttpHeaders {
+	headerValues := make([]*corev3.HeaderValue, 0)
+	for key, value := range headers {
+		header := &corev3.HeaderValue{Key: key}
+		if rawValue {
+			header.RawValue = []byte(value)
+		} else {
+			header.Value = value
+		}
+		headerValues = append(headerValues, header)
+	}
+	return &pb.HttpHeaders{
+		Headers: &corev3.HeaderMap{
+			Headers: headerValues,
+		},
+	}
+}
diff --git a/test/utils/utils.go b/test/utils/utils.go
index 1ec0fbaae..ba74069ff 100644
--- a/test/utils/utils.go
+++ b/test/utils/utils.go
@@ -59,6 +59,24 @@ func DeleteClusterResources(ctx context.Context, cli client.Client) error {
 	if err != nil && !apierrors.IsNotFound(err) {
 		return err
 	}
+	metricsReaderBinding := &rbacv1.ClusterRoleBinding{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "inference-gateway-sa-metrics-reader-role-binding",
+		},
+	}
+	err = cli.Delete(ctx, metricsReaderBinding, client.PropagationPolicy(metav1.DeletePropagationForeground))
+	if err != nil && !apierrors.IsNotFound(err) {
+		return err
+	}
+	metricsReaderRole := &rbacv1.ClusterRole{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "inference-gateway-metrics-reader",
+		},
+	}
+	err = cli.Delete(ctx, metricsReaderRole, client.PropagationPolicy(metav1.DeletePropagationForeground))
+	if err != nil && !apierrors.IsNotFound(err) {
+		return err
+	}
 	model := &apiextv1.CustomResourceDefinition{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "inferencemodels.inference.networking.x-k8s.io",
@@ -106,6 +124,10 @@ func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string
 	if err != nil && !apierrors.IsNotFound(err) {
 		return err
 	}
+	err = cli.DeleteAllOf(ctx, &corev1.ServiceAccount{}, client.InNamespace(ns),
client.PropagationPolicy(metav1.DeletePropagationForeground))
+	if err != nil && !apierrors.IsNotFound(err) {
+		return err
+	}
 	err = cli.DeleteAllOf(ctx, &v1alpha2.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground))
 	if err != nil && !apierrors.IsNotFound(err) {
 		return err
diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go
index 867118c15..4f12591a4 100644
--- a/test/utils/wrappers.go
+++ b/test/utils/wrappers.go
@@ -18,6 +18,7 @@ package utils
 
 import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 
 	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 )
@@ -27,12 +28,12 @@ type InferenceModelWrapper struct {
 }
 
-// MakeModelWrapper creates a wrapper for an MakeModelWrapper.
+// MakeModelWrapper creates a wrapper for an InferenceModel.
-func MakeModelWrapper(name, ns string) *InferenceModelWrapper {
+func MakeModelWrapper(namespacedName types.NamespacedName) *InferenceModelWrapper {
 	return &InferenceModelWrapper{
 		v1alpha2.InferenceModel{
 			ObjectMeta: metav1.ObjectMeta{
-				Name:      name,
-				Namespace: ns,
+				Name:      namespacedName.Name,
+				Namespace: namespacedName.Namespace,
 			},
 			Spec: v1alpha2.InferenceModelSpec{
 				ModelName: "",
diff --git a/tools/alerts/alert.yaml b/tools/alerts/alert.yaml
new file mode 100644
index 000000000..c712207a4
--- /dev/null
+++ b/tools/alerts/alert.yaml
@@ -0,0 +1,38 @@
+groups:
+- name: gateway-api-inference-extension
+  rules:
+  - alert: HighInferenceRequestLatencyP99
+    annotations:
+      title: 'High latency (P99) for model {{ $labels.model_name }}'
+      description: 'The 99th percentile request duration for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 10.0 seconds for 5 minutes.'
+    expr: histogram_quantile(0.99, rate(inference_model_request_duration_seconds_bucket[5m])) > 10.0
+    for: 5m
+    labels:
+      severity: 'warning'
+  - alert: HighInferenceErrorRate
+    annotations:
+      title: 'High error rate for model {{ $labels.model_name }}'
+      description: 'The error rate for model {{ $labels.model_name }} and target model {{ $labels.target_model_name }} has been consistently above 5% for 5 minutes.'
+    expr: sum by (model_name) (rate(inference_model_request_error_total[5m])) / sum by (model_name) (rate(inference_model_request_total[5m])) > 0.05
+    for: 5m
+    labels:
+      severity: 'critical'
+      impact: 'availability'
+  - alert: HighInferencePoolAvgQueueSize
+    annotations:
+      title: 'High average queue size for inference pool {{ $labels.name }}'
+      description: 'The average number of requests pending in the queue for inference pool {{ $labels.name }} has been consistently above 50 for 5 minutes.'
+    expr: inference_pool_average_queue_size > 50
+    for: 5m
+    labels:
+      severity: 'critical'
+      impact: 'performance'
+  - alert: HighInferencePoolAvgKVCacheUtilization
+    annotations:
+      title: 'High KV cache utilization for inference pool {{ $labels.name }}'
+      description: 'The average KV cache utilization for inference pool {{ $labels.name }} has been consistently above 90% for 5 minutes, indicating potential resource exhaustion.'
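+    # Note: the utilization metric is a 0-1 ratio, so the 90% threshold above is expressed as 0.9.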
+ expr: inference_pool_average_kv_cache_utilization > 0.9 + for: 5m + labels: + severity: 'critical' + impact: 'resource_exhaustion' diff --git a/tools/benchmark/benchmark.ipynb b/tools/benchmark/benchmark.ipynb index 993279cb9..21723fbd7 100644 --- a/tools/benchmark/benchmark.ipynb +++ b/tools/benchmark/benchmark.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 391, @@ -25,12 +25,13 @@ "# Path to the benchmark dir under `gateway-api-inference-extension/benchmark`\n", "BENCHMARK_DIR =\"./\"\n", "# A regex to match the model name, which matches the output file name.\n", - "MODEL_MATCHER='.*llama.*'" + "MODEL_MATCHER='.*llama.*'\n", + "INTERACTIVE_PLOT='False'" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "executionInfo": { "elapsed": 33, @@ -55,6 +56,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import math\n", + "from sklearn.metrics import r2_score\n", "import logging\n", "level = logging.INFO\n", "logger = logging.getLogger(__name__)\n", @@ -82,11 +84,11 @@ " XY(x = 'request_rate', x_label = 'QPS', y = 'output_tokens_per_min'),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_per_output_token_latency\"),\n", " XY(x = \"request_rate\", x_label = 'QPS', y = \"p90_latency\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", + " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", "]\n", "SANITY_CHECK_METRICS = [\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'benchmark_time'),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_attempted\"),\n", - " XY(x = \"request_rate\", x_label = 'QPS', y=\"num_prompts_succeeded\"),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'throughput_rps'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_input_tokens'),\n", " XY(x = 'request_rate', x_label = 'QPS', y = 'total_output_token'),\n", @@ -110,6 +112,8 @@ " self.interactive = interactive\n", " self.annotate = annotate\n", " self.output_dir = output_dir\n", + " self.data = load_data(self.labels, self.run_id, self.output_dir)\n", + " self.groups = group_data(self.data, self.metrics)\n", "\n", " def withRunId(self, run_id):\n", " return Plotter(run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, self.output_dir)\n", @@ -124,10 +128,16 @@ " return Plotter(self.run_id, self.labels, self.metrics, self.num_plots_per_row, self.interactive, self.annotate, output_dir)\n", "\n", " def plot_bar(self):\n", - " data = load_data(self.labels, self.run_id, self.output_dir)\n", - " groups = group_data(data, self.metrics)\n", + " \n", " logger.debug(\"Plotting run id...\")\n", - " plot_bar(self.labels, groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + " plot_bar(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", + "\n", + " def plot_delta(self):\n", + " \"\"\"\n", + " Plot the delta between two labels.\n", + " \"\"\"\n", + " logger.debug(\"Plotting delta for run id...\")\n", + " plot_delta(self.labels, self.groups, self.metrics, self.num_plots_per_row, self.interactive, annotate=self.annotate)\n", "\n", "def filepaths(root_dir):\n", " \"\"\"\n", @@ -201,6 +211,27 @@ " groups = data.groupby(by=['label'],sort=True)\n", " return groups\n", "\n", + "def compute_r2_for_metrics(groups, metrics, label_before, 
label_after):\n", + " print(\"\\nCoefficient of Determination (R^2) between before and after runs:\")\n", + " for m in metrics:\n", + " try:\n", + " df_b = groups.get_group(label_before).set_index('request_rate')\n", + " df_a = groups.get_group(label_after).set_index('request_rate')\n", + " except KeyError:\n", + " print(f\" Skipping {m.y}: missing group data for '{label_before}' or '{label_after}'\")\n", + " continue\n", + " common = sorted(set(df_b.index).intersection(df_a.index))\n", + " yb = df_b.loc[common, m.y].values\n", + " ya = df_a.loc[common, m.y].values\n", + " mask = ~np.isnan(yb) & ~np.isnan(ya)\n", + " yb, ya = yb[mask], ya[mask]\n", + " if len(yb) > 1 and np.any(yb != 0):\n", + " r2 = r2_score(yb, ya)\n", + " print(f\" {m.y:<30} R^2 = {r2:.4f}\")\n", + " else:\n", + " print(f\" {m.y:<30} insufficient data for R^2 calculation\")\n", + "\n", + "\n", "def init_plot(metrics, num_plots_per_row=NUM_PLOTS_PER_ROW):\n", " num_plots_per_row = min(num_plots_per_row, len(metrics))\n", " row_count = math.ceil(len(metrics) / num_plots_per_row)\n", @@ -294,7 +325,106 @@ " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", " fig.tight_layout(rect=[0, 0.03, 1, 0.95])\n", " plt.show()\n", - "\n" + "\n", + "def plot_delta(labels, groups, metrics=CORE_METRICS, num_plots_per_row=NUM_PLOTS_PER_ROW, interactive=True, annotate=False):\n", + " \"\"\"\n", + " Plot the delta between base_label and compare_label for each metric.\n", + " A positive delta means compare_label has a higher value than base_label.\n", + " \"\"\"\n", + " base_label = labels[0].name\n", + " compare_label = labels[1].name\n", + " logger.debug(f'Printing delta chart for {base_label} vs {compare_label}')\n", + "\n", + " try:\n", + " base_df = groups.get_group((base_label,))\n", + " compare_df = groups.get_group((compare_label,))\n", + " except Exception as e:\n", + " logger.error(f\"Error getting data for labels {base_label} and {compare_label}: {e}\")\n", + " return\n", + "\n", + " y_columns = [m.y for m in metrics]\n", + "\n", + " # 1. Find common request rates\n", + " base_rates = set(base_df['request_rate'].astype(int))\n", + " compare_rates = set(compare_df['request_rate'].astype(int))\n", + " common_rates = sorted(list(base_rates.intersection(compare_rates)))[:6]\n", + "\n", + " if not common_rates:\n", + " logger.error(f\"No common request rates found between {base_label} and {compare_label}\")\n", + " return\n", + "\n", + " # 2. Prepare data for delta calculation\n", + " base_data = base_df.set_index('request_rate').to_dict()\n", + " compare_data = compare_df.set_index('request_rate').to_dict()\n", + "\n", + " # Calculate deltas (compare_label - base_label)\n", + " delta_data = {y_col: {} for y_col in y_columns}\n", + " for y_col in y_columns:\n", + " for rate in common_rates:\n", + " base_val = base_data.get(y_col, {}).get(rate, np.nan)\n", + " compare_val = compare_data.get(y_col, {}).get(rate, np.nan)\n", + "\n", + " if not np.isnan(base_val) and not np.isnan(compare_val):\n", + " delta_data[y_col][rate] = (compare_val - base_val)/base_val*100\n", + " else:\n", + " delta_data[y_col][rate] = np.nan\n", + "\n", + " # 3. 
Plotting\n", + " def plot_func(curAx, m):\n", + " x = np.arange(len(common_rates))\n", + " y_values = [delta_data[m.y].get(rr, np.nan) for rr in common_rates]\n", + "\n", + " # Determine colors based on positive/negative values\n", + " colors = ['green' if val > 0 else 'blue' for val in y_values]\n", + "\n", + " rects = curAx.bar(x, y_values, 0.6, color=colors)\n", + "\n", + " # Add a horizontal line at y=0\n", + " curAx.axhline(y=0, color='black', linestyle='-', linewidth=1)\n", + "\n", + " if annotate:\n", + " for rect, val in zip(rects, y_values):\n", + " if not np.isnan(val):\n", + " height = rect.get_height()\n", + " # For negative bars, put text above the bar\n", + " vert_align = 'bottom' if val >= 0 else 'top'\n", + " y_offset = 3 if val >= 0 else -3\n", + "\n", + " curAx.annotate(f'{val:.2f}',\n", + " xy=(rect.get_x() + rect.get_width() / 2, val),\n", + " xytext=(0, y_offset), # vertical offset\n", + " textcoords=\"offset points\",\n", + " ha='center', va=vert_align)\n", + "\n", + " # Create a title that shows what this delta represents\n", + " title = f\"Delta: {compare_label} - {base_label} ({m.y})\"\n", + " curAx.set_title(title, fontsize=12)\n", + "\n", + " # Add labels\n", + " curAx.set_xlabel(m.x_label, fontsize=axis_label_fontsize)\n", + " #curAx.set_ylabel(f\"% Delta in {m.y_label}\", fontsize=axis_label_fontsize)\n", + " curAx.set_xticks(x)\n", + " curAx.set_xticklabels(common_rates)\n", + " curAx.tick_params(axis='both', labelsize=tick_label_fontsize)\n", + "\n", + " # Create a dummy handle for the legend\n", + " legend_handle = [plt.Rectangle((0,0),1,1,color='green'),\n", + " plt.Rectangle((0,0),1,1,color='blue')]\n", + " legend_label = [f'{compare_label} > {base_label}',\n", + " f'{compare_label} < {base_label}']\n", + "\n", + " return legend_handle, legend_label\n", + "\n", + " # Create plot with metrics\n", + " fig, axes = plot_metrics(metrics, plot_func, num_plots_per_row)\n", + "\n", + " # Add an overall title for the figure\n", + " fig.suptitle(f\"% Delta Metrics: {compare_label} - {base_label}\",\n", + " fontsize=title_fontsize, y=0.98)\n", + "\n", + " plt.subplots_adjust(bottom=0.15, top=0.9) # Make room for legends\n", + " fig.tight_layout(rect=[0, 0.1, 1, 0.95]) # Adjust the rectangle in which the subplots fit\n", + " plt.show()" ] }, { @@ -320,9 +450,26 @@ "outputs": [], "source": [ "#@title Plot Result\n", - "\n", - "pl = Plotter(run_id=RUN_ID, labels=[Label('inference-extension'),Label('k8s-svc')], output_dir=OUTPUT_DIR)\n", - "pl.plot_bar()" + "# initialize the plotter with the run id and labels. 
\n", + "# Example labels are 'inference-extension' and 'k8s-svc' if comparing Inference Extension and K8s Service \n", + "# 'regression-before' and 'regression-after' if comparing two different runs of inference extension to see the regression\n", + "\n", + "benchmark_id1 = # eg 'regression-before' or 'inference-extension'\n", + "benchmark_id2 = # eg 'regression-after' or 'k8s-svc'\n", + "labels = [Label(benchmark_id1), Label(benchmark_id2,)]\n", + "\n", + "# Plot bar chart of metrics\n", + "pl = Plotter(run_id=RUN_ID, labels=labels, output_dir=OUTPUT_DIR)\n", + "pl.plot_bar()\n", + "pl.plot_delta()\n", + "\n", + "# Load & group data to compute R^2\n", + "all_data = load_data(labels, RUN_ID, OUTPUT_DIR)\n", + "groups = group_data(all_data)\n", + "compute_r2_for_metrics(groups, CORE_METRICS,\n", + " label_before=benchmark_id1,\n", + " label_after=benchmark_id2)\n", + "\n" ] } ], @@ -355,4 +502,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/tools/dashboards/README.md b/tools/dashboards/README.md index 7be2a5b8e..21282bf23 100644 --- a/tools/dashboards/README.md +++ b/tools/dashboards/README.md @@ -4,7 +4,7 @@ This documentation provides instructions for setting up grafana dashboards to se ## Requirements -Please follow [metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics) page to configure the proxy to enable all metrics. +Please follow [metrics](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/?h=metrics) page to configure the proxy to enable all metrics. ## Load Inference Extension dashboard into Grafana @@ -12,7 +12,7 @@ Please follow [grafana instructions](https://grafana.com/docs/grafana/latest/das ## Configure Google Managed Prometheus as source for metrics -If you run the inferece gateway with [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), please follow the [instructions](https://cloud.google.com/stackdriver/docs/managed-prometheus/query) to configure Google Managed Prometheus as data source for the grafana dashboard. +If you run the inference gateway with [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), please follow the [instructions](https://cloud.google.com/stackdriver/docs/managed-prometheus/query) to configure Google Managed Prometheus as data source for the grafana dashboard. 
## Troubleshooting diff --git a/tools/dashboards/inference_gateway.json b/tools/dashboards/inference_gateway.json index cf00420d3..244f4ab14 100644 --- a/tools/dashboards/inference_gateway.json +++ b/tools/dashboards/inference_gateway.json @@ -18,14 +18,11 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 1, + "id": 2, "links": [], + "liveNow": false, "panels": [ { - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 3, "w": 20, @@ -42,12 +39,11 @@ "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of underlying metrics used in the dashboard.", "mode": "markdown" }, - "pluginVersion": "11.5.2", - "title": "", + "pluginVersion": "10.2.4", "type": "text" }, { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, @@ -55,1067 +51,1161 @@ "y": 3 }, "id": 15, - "panels": [], - "title": "Inference Pool", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "deap2an4eadc0d" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 0, - "y": 4 - }, - "id": 16, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ + "panels": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(name) (inference_pool_average_kv_cache_utilization)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average KV Cache Utilization", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "deap2an4eadc0d" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + 
"custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" }, - "thresholdsStyle": { - "mode": "off" - } + "overrides": [] }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 10, - "y": 4 - }, - "id": 17, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(name) (inference_pool_average_queue_size)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Average Queue Size", - "type": "timeseries" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 12 - }, - "id": 3, - "panels": [], - "title": "Inference Model", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 4 }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 20, - "x": 0, - "y": 13 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) 
(rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(name) (inference_pool_average_kv_cache_utilization)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average KV Cache Utilization", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 10, + "y": 4 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(name) (inference_pool_average_queue_size)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Average Queue Size", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "E2E Request Latency", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": 
"linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 0, - "y": 21 - }, - "id": 1, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + }, + "overrides": [] }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "interval": "", - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Request / s", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 12 }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "id": 19, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 10, - "y": 21 - }, - "id": 18, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - 
"showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_model_request_error_total[$__rate_interval]))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "interval": "", - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(name, model_server_pod, pod) (inference_pool_per_pod_queue_size)", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Size Per Pod", + "type": "timeseries" } ], - "title": "Request Error / s", - "type": "timeseries" + "title": "Inference Pool", + "type": "row" }, { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, + "collapsed": true, "gridPos": { - "h": 8, - "w": 10, + "h": 1, + "w": 24, "x": 0, - "y": 29 - }, - "id": 6, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } + "y": 4 }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, + "id": 3, + "panels": [ { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Request Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 8, + "w": 20, + "x": 0, + "y": 5 }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false }, - "thresholdsStyle": { - "mode": "off" + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false } + ], + "title": "E2E Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 13 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request / s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 10, + "y": 13 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_model_request_error_total[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Error / s", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 21 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Request Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 10, + "y": 21 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + 
"editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Response Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, - { - "color": "red", - "value": 80 + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 10, - "y": 29 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + }, + "overrides": [] }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - 
"legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Response Size", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 29 }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 0, - "y": 37 - }, - "id": 8, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Input Token Count", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - 
"legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Input Token Count", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "gridPos": { + "h": 8, + "w": 10, + "x": 10, + "y": 29 }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 10, - "y": 37 - }, - "id": 9, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.2", - "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) 
(rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Output Token Count", + "type": "timeseries" } ], - "title": "Output Token Count", - "type": "timeseries" + "title": "Inference Model", + "type": "row" }, { "collapsed": true, @@ -1123,7 +1213,7 @@ "h": 1, "w": 24, "x": 0, - "y": 45 + "y": 5 }, "id": 10, "panels": [ @@ -1175,8 +1265,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1191,7 +1280,7 @@ "h": 7, "w": 10, "x": 0, - "y": 52 + "y": 60 }, "id": 14, "options": { @@ -1288,8 +1377,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1304,7 +1392,7 @@ "h": 7, "w": 10, "x": 10, - "y": 52 + "y": 60 }, "id": 11, "options": { @@ -1417,8 +1505,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1433,7 +1520,7 @@ "h": 7, "w": 10, "x": 0, - "y": 59 + "y": 67 }, "id": 13, "options": { @@ -1546,8 +1633,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1562,7 +1648,7 @@ "h": 7, "w": 10, "x": 10, - "y": 59 + "y": 67 }, "id": 12, "options": { @@ -1632,35 +1718,39 @@ "type": "row" } ], - "preload": false, - "schemaVersion": 40, + "refresh": false, + "schemaVersion": 39, "tags": [], "templating": { "list": [ { "current": { + "selected": false, "text": "prometheus", - "value": "deap2an4eadc0d" + "value": "d3d7e79a-f83c-46ad-8326-cdd0108978b3" }, + "hide": 0, "includeAll": false, "label": "datasource", + "multi": false, "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "refresh": 1, "regex": "", + "skipUrlSync": false, "type": "datasource" } ] }, "time": { - "from": "now-1h", - "to": "now" + "from": "2025-05-04T16:16:14.919Z", + 
"to": "2025-05-04T16:51:40.407Z" }, "timepicker": {}, "timezone": "browser", "title": "Inference Gateway", "uid": "aeap3g4ujefb4b", - "version": 20, + "version": 1, "weekStart": "" } diff --git a/tools/dashboards/inference_gateway_dashboard_1.png b/tools/dashboards/inference_gateway_dashboard_1.png index 98ca0b32f..3cb828940 100644 Binary files a/tools/dashboards/inference_gateway_dashboard_1.png and b/tools/dashboards/inference_gateway_dashboard_1.png differ diff --git a/tools/dashboards/inference_gateway_dashboard_2.png b/tools/dashboards/inference_gateway_dashboard_2.png index 00f3194ac..98ca0b32f 100644 Binary files a/tools/dashboards/inference_gateway_dashboard_2.png and b/tools/dashboards/inference_gateway_dashboard_2.png differ diff --git a/tools/dashboards/inference_gateway_dashboard_3.png b/tools/dashboards/inference_gateway_dashboard_3.png new file mode 100644 index 000000000..00f3194ac Binary files /dev/null and b/tools/dashboards/inference_gateway_dashboard_3.png differ diff --git a/tools/dynamic-lora-sidecar/Dockerfile b/tools/dynamic-lora-sidecar/Dockerfile index 4faf360cf..14703d1aa 100644 --- a/tools/dynamic-lora-sidecar/Dockerfile +++ b/tools/dynamic-lora-sidecar/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.9-slim-buster AS test +FROM python:3.10-slim-buster AS test WORKDIR /dynamic-lora-reconciler-test COPY requirements.txt . diff --git a/tools/dynamic-lora-sidecar/Makefile b/tools/dynamic-lora-sidecar/Makefile new file mode 100644 index 000000000..d498750ac --- /dev/null +++ b/tools/dynamic-lora-sidecar/Makefile @@ -0,0 +1,33 @@ +# Makefile for dynamic-lora-sidecar + +PYTHON_VERSION := 3.10 +VENV_DIR := venv +PYTHON := $(VENV_DIR)/bin/python +PIP := $(VENV_DIR)/bin/pip + +.PHONY: help venv install test clean + +help: ## Show available targets + @echo "Available targets:" + @echo " venv - Create virtual environment" + @echo " install - Install dependencies" + @echo " test - Run unit tests" + @echo " clean - Clean up virtual environment" + +venv: $(VENV_DIR)/bin/activate ## Create virtual environment + +$(VENV_DIR)/bin/activate: + python$(PYTHON_VERSION) -m venv $(VENV_DIR) + +install: venv ## Install dependencies + $(PIP) install --upgrade pip + $(PIP) install -r requirements.txt + +test: install ## Run unit tests + $(PYTHON) -m unittest discover -v -s sidecar + +clean: ## Clean up virtual environment + rm -rf $(VENV_DIR) + rm -rf .pytest_cache + find . -name "*.pyc" -delete + find . -name "__pycache__" -type d -exec rm -rf {} + diff --git a/tools/dynamic-lora-sidecar/README.md b/tools/dynamic-lora-sidecar/README.md index 65dc0d784..f50c78319 100644 --- a/tools/dynamic-lora-sidecar/README.md +++ b/tools/dynamic-lora-sidecar/README.md @@ -1,6 +1,6 @@ # Dynamic LORA Adapter Sidecar for vLLM -This is a sidecar-based tool to help rolling out new LoRA adapters to a set of running vLLM model servers. The user deploys the sidecar with a vLLM server, and using a ConfigMap, the user can express their intent as to which LoRA adapters they want to have the running vLLM servers to be configure with. The sidecar watches the ConfigMap and sends load/unload requests to the vLLM container to actuate on the user intent. +This is a sidecar-based tool to help rolling out new LoRA adapters to a set of running vLLM model servers. The user deploys the sidecar with a vLLM server, and using a ConfigMap, the user can express their intent as to which LoRA adapters they want to have the running vLLM servers to be configure with. 
The sidecar watches the ConfigMap and sends load/unload requests to the vLLM container to actuate on the user intent. ## Overview @@ -48,6 +48,17 @@ The sidecar uses the vLLM server's API to load or unload adapters based on the c ``` Do not use subPath, since configmap updates are not reflected in the file +## Development + +For local development and testing, use the provided Makefile: + +```bash +make venv # Create Python 3.10 virtual environment +make install # Install dependencies +make test # Run unit tests +make clean # Clean up +``` + ## Command Line Arguments The sidecar supports the following command-line arguments: @@ -59,7 +70,7 @@ The sidecar supports the following command-line arguments: - `--config-validation`: Enable config validation (default: True) ## Configuration Fields -- `vLLMLoRAConfig`[**required**] base key +- `vLLMLoRAConfig`[**required**] base key - `host` [*optional*] Model server's host. defaults to localhost - `port` [*optional*] Model server's port. defaults to 8000 - `name` [*optional*] Name of this config @@ -121,6 +132,9 @@ spec: - name: reconciler image: your-image:tag command: ["python", "sidecar.py", "--health-check-timeout", "600", "--health-check-interval", "5", "--reconcile-trigger", "10"] #optional if overriding default values + ports: + - containerPort: 8080 + name: metrics volumeMounts: - name: config-volume mountPath: /config diff --git a/tools/dynamic-lora-sidecar/deployment.yaml b/tools/dynamic-lora-sidecar/deployment.yaml index 0c0c1781e..dd1b6bbc7 100644 --- a/tools/dynamic-lora-sidecar/deployment.yaml +++ b/tools/dynamic-lora-sidecar/deployment.yaml @@ -69,7 +69,10 @@ spec: image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main restartPolicy: Always imagePullPolicy: Always - env: + ports: + - containerPort: 8080 + name: metrics + env: - name: DYNAMIC_LORA_ROLLOUT_CONFIG value: "/config/configmap.yaml" volumeMounts: # DO NOT USE subPath diff --git a/tools/dynamic-lora-sidecar/requirements.txt b/tools/dynamic-lora-sidecar/requirements.txt index 1f9b9272e..66a76bc40 100644 --- a/tools/dynamic-lora-sidecar/requirements.txt +++ b/tools/dynamic-lora-sidecar/requirements.txt @@ -1,6 +1,7 @@ -aiohttp -jsonschema -pyyaml -requests -watchfiles -watchdog \ No newline at end of file +aiohttp==3.12.12 +jsonschema==4.24.0 +prometheus_client==0.22.1 +PyYAML==6.0.2 +requests==2.32.4 +watchfiles==1.0.5 +watchdog==6.0.0 diff --git a/tools/dynamic-lora-sidecar/sidecar/sidecar.py b/tools/dynamic-lora-sidecar/sidecar/sidecar.py index 30724478a..a78f29d1f 100644 --- a/tools/dynamic-lora-sidecar/sidecar/sidecar.py +++ b/tools/dynamic-lora-sidecar/sidecar/sidecar.py @@ -1,3 +1,17 @@ +# Copyright 2025 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tools/dynamic-lora-sidecar/deployment.yaml b/tools/dynamic-lora-sidecar/deployment.yaml
index 0c0c1781e..dd1b6bbc7 100644
--- a/tools/dynamic-lora-sidecar/deployment.yaml
+++ b/tools/dynamic-lora-sidecar/deployment.yaml
@@ -69,7 +69,10 @@ spec:
         image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main
         restartPolicy: Always
         imagePullPolicy: Always
-        env:
+        ports:
+        - containerPort: 8080
+          name: metrics
+        env:
           - name: DYNAMIC_LORA_ROLLOUT_CONFIG
             value: "/config/configmap.yaml"
         volumeMounts: # DO NOT USE subPath
diff --git a/tools/dynamic-lora-sidecar/requirements.txt b/tools/dynamic-lora-sidecar/requirements.txt
index 1f9b9272e..66a76bc40 100644
--- a/tools/dynamic-lora-sidecar/requirements.txt
+++ b/tools/dynamic-lora-sidecar/requirements.txt
@@ -1,6 +1,7 @@
-aiohttp
-jsonschema
-pyyaml
-requests
-watchfiles
-watchdog
\ No newline at end of file
+aiohttp==3.12.12
+jsonschema==4.24.0
+prometheus_client==0.22.1
+PyYAML==6.0.2
+requests==2.32.4
+watchfiles==1.0.5
+watchdog==6.0.0
diff --git a/tools/dynamic-lora-sidecar/sidecar/sidecar.py b/tools/dynamic-lora-sidecar/sidecar/sidecar.py
index 30724478a..a78f29d1f 100644
--- a/tools/dynamic-lora-sidecar/sidecar/sidecar.py
+++ b/tools/dynamic-lora-sidecar/sidecar/sidecar.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import requests
 import yaml
 import time
@@ -10,9 +24,17 @@ import datetime
 import os
 import sys
 
+from prometheus_client import Gauge, start_http_server
 from watchdog.observers.polling import PollingObserver as Observer
 from watchdog.events import FileSystemEventHandler
 
+# Initialize Prometheus metrics
+ADAPTER_STATUS_METRICS = Gauge(
+    'lora_syncer_adapter_status',
+    'Status of LoRA adapters (1=loaded, 0=not_loaded)',
+    ['adapter_name']
+)
+
 CONFIG_MAP_FILE = os.environ.get(
     "DYNAMIC_LORA_ROLLOUT_CONFIG", "/config/configmap.yaml"
 )
@@ -44,6 +66,8 @@ def parse_arguments():
                         help=f'Path to config map file (default: {CONFIG_MAP_FILE})')
     parser.add_argument('--config-validation', action='store_true', default=True,
                         help='Enable config validation (default: True)')
+    parser.add_argument('--metrics-port', type=int, default=8080,
+                        help='Port to listen for Prometheus metrics (default: 8080)')
 
     return parser.parse_args()
 
@@ -212,7 +236,7 @@ def check_health() -> bool:
             time.sleep(self.health_check_interval.seconds)
         return False
 
-    def load_adapter(self, adapter: LoraAdapter):
+    def load_adapter(self, adapter: LoraAdapter) -> None | str:
         """Sends a request to load the specified model."""
         if adapter in self.registered_adapters:
             logging.info(
@@ -229,10 +253,12 @@ def load_adapter(self, adapter: LoraAdapter):
             response = requests.post(url, json=payload)
             response.raise_for_status()
             logging.info(f"loaded model {adapter.id}")
+            return None
         except requests.exceptions.RequestException as e:
             logging.error(f"error loading model {adapter.id}: {e}")
+            return f"error loading model {adapter.id}: {e}"
 
-    def unload_adapter(self, adapter: LoraAdapter):
+    def unload_adapter(self, adapter: LoraAdapter) -> None | str:
         """Sends a request to unload the specified model."""
         if adapter not in self.registered_adapters:
             logging.info(
@@ -270,20 +296,30 @@ def reconcile(self):
         adapters_to_load_id = ", ".join(str(a.id) for a in adapters_to_load)
         logging.info(f"adapter to load {adapters_to_load_id}")
         for adapter in adapters_to_load:
-            self.load_adapter(adapter)
+            err = self.load_adapter(adapter)
+            if err is None:
+                self.update_adapter_status_metrics(adapter.id, is_loaded=True)
 
         adapters_to_unload = self.ensure_not_exist_adapters - self.ensure_exist_adapters
         adapters_to_unload_id = ", ".join(str(a.id) for a in adapters_to_unload)
         logging.info(f"adapters to unload {adapters_to_unload_id}")
         for adapter in adapters_to_unload:
-            self.unload_adapter(adapter)
+            err = self.unload_adapter(adapter)
+            if err is None:
+                self.update_adapter_status_metrics(adapter.id, is_loaded=False)
+
+    def update_adapter_status_metrics(self, adapter_id: str, is_loaded: bool):
+        """Update adapter status metrics"""
+        status = 1 if is_loaded else 0
+        ADAPTER_STATUS_METRICS.labels(adapter_name=adapter_id).set(status)
+
 
 async def main():
     args = parse_arguments()
-    
+
     # Update CONFIG_MAP_FILE with argument value
     config_file = args.config
-    
+
     reconciler_instance = LoraReconciler(
         config_file=config_file,
         health_check_timeout=args.health_check_timeout,
@@ -291,7 +327,11 @@ async def main():
         reconcile_trigger_seconds=args.reconcile_trigger,
         config_validation=args.config_validation
     )
-    
+
+    # Start metrics server
+    logging.info(f"Starting metrics server on port {args.metrics_port}")
+    start_http_server(args.metrics_port)
+
     logging.info(f"Running initial reconcile for config map {config_file}")
     reconciler_instance.reconcile()
 
@@ -320,4 +360,4 @@ async def main():
 
 if __name__ == "__main__":
-    asyncio.run(main())
\ No newline at end of file
+    asyncio.run(main())
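Editor's note: the pattern above — load/unload returning `None` on success or an error string on failure, with the gauge updated only on success — keeps a failed request from ever flipping the metric. A standalone sketch of that pattern under an isolated registry, with a hypothetical loader and adapter names (not the sidecar's actual code):

```python
from prometheus_client import CollectorRegistry, Gauge

# Isolated registry so the sketch does not pollute the process-wide default.
registry = CollectorRegistry()
adapter_status = Gauge(
    "lora_syncer_adapter_status",
    "Status of LoRA adapters (1=loaded, 0=not_loaded)",
    ["adapter_name"],
    registry=registry,
)

def load_adapter(adapter_id: str, should_fail: bool = False) -> None | str:
    """Returns None on success, an error string on failure (the sidecar convention)."""
    if should_fail:
        return f"error loading model {adapter_id}"
    return None

for adapter_id, fail in [("sql-lora", False), ("broken-lora", True)]:
    err = load_adapter(adapter_id, should_fail=fail)
    if err is None:
        # Metric is touched only when the load reports success.
        adapter_status.labels(adapter_name=adapter_id).set(1)

# sql-lora is reported as loaded; broken-lora has no sample at all.
print(registry.get_sample_value(
    "lora_syncer_adapter_status", {"adapter_name": "sql-lora"}))  # 1.0
print(registry.get_sample_value(
    "lora_syncer_adapter_status", {"adapter_name": "broken-lora"}))  # None
```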
diff --git a/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py b/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
index 59a60e6b3..bd6c7ed42 100644
--- a/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
+++ b/tools/dynamic-lora-sidecar/sidecar/test_sidecar.py
@@ -1,9 +1,23 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import unittest
 from unittest.mock import patch, Mock, mock_open, call
 import yaml
 import os
 import datetime
-from sidecar import LoraReconciler, LoraAdapter, CONFIG_MAP_FILE, BASE_FIELD
+from sidecar import LoraReconciler, LoraAdapter, CONFIG_MAP_FILE, BASE_FIELD, ADAPTER_STATUS_METRICS
 
 # Update TEST_CONFIG_DATA to include the new configuration parameters
 TEST_CONFIG_DATA = {
@@ -213,12 +227,55 @@ def test_health_check_settings(self):
             reconcile_trigger_seconds=45,
             config_validation=False
         )
-        
+
         # Check that values are properly set
         self.assertEqual(reconciler.health_check_timeout, datetime.timedelta(seconds=240))
         self.assertEqual(reconciler.health_check_interval, datetime.timedelta(seconds=15))
         self.assertEqual(reconciler.reconcile_trigger_seconds, 45)
 
+    def test_update_adapter_status_metrics(self):
+        """Test that update_adapter_status_metrics method works correctly"""
+        # Clear any existing metrics
+        ADAPTER_STATUS_METRICS.clear()
+
+        # Create reconciler
+        reconciler = LoraReconciler(
+            config_file=CONFIG_MAP_FILE,
+            health_check_timeout=180,
+            health_check_interval=10,
+            reconcile_trigger_seconds=30,
+            config_validation=False
+        )
+
+        # Test setting loaded status
+        reconciler.update_adapter_status_metrics("test-adapter-1", is_loaded=True)
+        reconciler.update_adapter_status_metrics("test-adapter-2", is_loaded=False)
+
+        # Get all metric samples
+        metric_samples = list(ADAPTER_STATUS_METRICS.collect())[0].samples
+
+        # Check that metrics were set correctly
+        adapter_metrics = {}
+        for sample in metric_samples:
+            adapter_name = sample.labels['adapter_name']
+            adapter_metrics[adapter_name] = sample.value
+
+        self.assertEqual(adapter_metrics.get('test-adapter-1'), 1.0, "test-adapter-1 should be marked as loaded")
+        self.assertEqual(adapter_metrics.get('test-adapter-2'), 0.0, "test-adapter-2 should be marked as not loaded")
+
+    def test_metrics_endpoint(self):
+        """Test that Prometheus metrics can be collected"""
+        from prometheus_client import generate_latest
+
+        # Clear metrics and set a test value
+        ADAPTER_STATUS_METRICS.clear()
+        ADAPTER_STATUS_METRICS.labels(adapter_name='test-adapter').set(1)
+
+        # Test that generate_latest produces valid output
+        metrics_bytes = generate_latest()
+        metrics = metrics_bytes.decode('utf-8')
+        self.assertIn('lora_syncer_adapter_status{adapter_name="test-adapter"} 1.0', metrics)
+
 
 if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
+    unittest.main()
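Editor's note: the new tests iterate over `collect()` samples to find a labeled value. A terser alternative, should the tests ever be tightened, is `REGISTRY.get_sample_value` — the module-level gauge registers against the default registry. A sketch (not part of the patch):

```python
from prometheus_client import REGISTRY

from sidecar import ADAPTER_STATUS_METRICS

# Equivalent to iterating ADAPTER_STATUS_METRICS.collect() and matching
# labels by hand: returns the sample's value, or None if no such series.
ADAPTER_STATUS_METRICS.labels(adapter_name="test-adapter-1").set(1)
value = REGISTRY.get_sample_value(
    "lora_syncer_adapter_status", {"adapter_name": "test-adapter-1"}
)
assert value == 1.0
```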
diff --git a/tools/simulations/llm_ig_simulation/src/__init__.py b/tools/simulations/llm_ig_simulation/src/__init__.py
index e69de29bb..1a72134ef 100644
--- a/tools/simulations/llm_ig_simulation/src/__init__.py
+++ b/tools/simulations/llm_ig_simulation/src/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/tools/simulations/llm_ig_simulation/src/benchmark_one_server.py b/tools/simulations/llm_ig_simulation/src/benchmark_one_server.py
index bffef7249..c4923f3c4 100644
--- a/tools/simulations/llm_ig_simulation/src/benchmark_one_server.py
+++ b/tools/simulations/llm_ig_simulation/src/benchmark_one_server.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 from collections import Counter
 import csv
diff --git a/tools/simulations/llm_ig_simulation/src/constants.py b/tools/simulations/llm_ig_simulation/src/constants.py
index 7c84251a5..b0e2ccc30 100644
--- a/tools/simulations/llm_ig_simulation/src/constants.py
+++ b/tools/simulations/llm_ig_simulation/src/constants.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 PREFILL_LATENCY_CONST_2 = 0
 PREFILL_LATENCY_CONST_1 = 0.00006769375513
 PREFILL_LATENCY_CONST_0 = 0.01969
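Editor's note: the `_2`/`_1`/`_0` naming of the constants above suggests a low-order polynomial latency fit, but the patch only shows license headers, not usage; the authoritative formula lives in `continous_batching.py`. Purely as a guess at the shape, with an illustrative token count:

```python
# Illustrative guess at how the fitted constants might combine; not taken
# from the simulator's code. Consult continous_batching.py for real usage.
PREFILL_LATENCY_CONST_2 = 0
PREFILL_LATENCY_CONST_1 = 0.00006769375513
PREFILL_LATENCY_CONST_0 = 0.01969

def prefill_latency_guess(num_prompt_tokens: int) -> float:
    """Quadratic-in-tokens model implied by the _2/_1/_0 naming (assumption)."""
    return (PREFILL_LATENCY_CONST_2 * num_prompt_tokens ** 2
            + PREFILL_LATENCY_CONST_1 * num_prompt_tokens
            + PREFILL_LATENCY_CONST_0)

print(prefill_latency_guess(512))  # ~0.054 under this guessed model
```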
diff --git a/tools/simulations/llm_ig_simulation/src/continous_batching.py b/tools/simulations/llm_ig_simulation/src/continous_batching.py
index 4e166c31e..f61b9c8bd 100644
--- a/tools/simulations/llm_ig_simulation/src/continous_batching.py
+++ b/tools/simulations/llm_ig_simulation/src/continous_batching.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from constants import MAX_NUM_SEQ, MAX_NUM_BATCH_TOKENS, MAX_GPU_MEMORY_PERC_BEFORE_RECOMPUTE, TOKENIZE_LATENCY_CONST, PREFILL_LATENCY_CONST_2, PREFILL_LATENCY_CONST_1, PREFILL_LATENCY_CONST_0, PREFILL_LATENCY_CONST_MIN, DECODE_LATENCY_CONST_1, DECODE_LATENCY_CONST_0, DECODE_LATENCY_CONST_BATCH, LORA_DICT
 import simpy
diff --git a/tools/simulations/llm_ig_simulation/src/llmactor.py b/tools/simulations/llm_ig_simulation/src/llmactor.py
index 567e43285..378378111 100644
--- a/tools/simulations/llm_ig_simulation/src/llmactor.py
+++ b/tools/simulations/llm_ig_simulation/src/llmactor.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from constants import MAX_NUM_TOKENS_ALLOWED
 import simpy
 import numpy as np
diff --git a/tools/simulations/llm_ig_simulation/src/loadbalancer.py b/tools/simulations/llm_ig_simulation/src/loadbalancer.py
index b8ae187ff..f679e3515 100644
--- a/tools/simulations/llm_ig_simulation/src/loadbalancer.py
+++ b/tools/simulations/llm_ig_simulation/src/loadbalancer.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from queue import Queue
 import random
 from re import I
diff --git a/tools/simulations/llm_ig_simulation/src/main.py b/tools/simulations/llm_ig_simulation/src/main.py
index 2bfb6daf8..9c7733dc0 100644
--- a/tools/simulations/llm_ig_simulation/src/main.py
+++ b/tools/simulations/llm_ig_simulation/src/main.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 from collections import Counter
 import csv
@@ -361,4 +375,4 @@ def main():
     #print(f"Results have been saved to {output_file}")
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tools/simulations/llm_ig_simulation/src/request.py b/tools/simulations/llm_ig_simulation/src/request.py
index b1a674d89..51d80a73c 100644
--- a/tools/simulations/llm_ig_simulation/src/request.py
+++ b/tools/simulations/llm_ig_simulation/src/request.py
@@ -1,3 +1,17 @@
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 
 class Request: