diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md index 6ed3df8c..be569844 100644 --- a/.github/ISSUE_TEMPLATE/new-release.md +++ b/.github/ISSUE_TEMPLATE/new-release.md @@ -34,10 +34,10 @@ This document defines the process for releasing Gateway API Inference Extension. export RC=1 ``` -4. The vLLM image tag defaults to `v0.7.1` for a release. Optionally, change the vLLM image tag. For example: +4. The vLLM image tag defaults to `v0.7.2` for a release. Set the `VLLM` environment variable if a newer [tag][vllm-tag] has been published. For example: ```shell - export VLLM=0.7.2 + export VLLM=0.7.3 ``` ## Release Process @@ -45,16 +45,25 @@ This document defines the process for releasing Gateway API Inference Extension. 1. If needed, clone the Gateway API Inference Extension [repo][repo]. ```shell - git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git -b main + git clone -o ${REMOTE} https://github.com/kubernetes-sigs/gateway-api-inference-extension.git ``` 2. If you already have the repo cloned, ensure it’s up-to-date and your local branch is clean. -3. Create a new release branch from the `main` branch. The release branch should be named `release-v${MAJOR}.${MINOR}`, e.g. `release-v0.1`. +3. Release Branch Handling: + - For a Release Candidate: + Create a new release branch from the `main` branch. The branch should be named `release-${MAJOR}.${MINOR}`, for example, `release-0.1`: - ```shell - git checkout -b release-v${MAJOR}.${MINOR} - ``` + ```shell + git checkout -b release-${MAJOR}.${MINOR} + ``` + + - For a Major or Minor Release: + A release branch should already exist. In this case, check out the existing branch: + + ```shell + git checkout -b release-${MAJOR}.${MINOR} ${REMOTE}/release-${MAJOR}.${MINOR} + ``` 4. Update release-specific content, generate release artifacts, and stage the changes. @@ -79,7 +88,7 @@ This document defines the process for releasing Gateway API Inference Extension. 6. Push your release branch to the Gateway API Inference Extension remote. ```shell - git push ${REMOTE} release-v${MAJOR}.${MINOR} + git push ${REMOTE} release-${MAJOR}.${MINOR} ``` 7. Tag the head of your release branch with the number. @@ -114,7 +123,8 @@ This document defines the process for releasing Gateway API Inference Extension. 9. Pushing the tag triggers Prow to build and publish the container image to the [staging registry][]. 10. Submit a PR against [k8s.io][] to add the staging image tag and SHA to [`k8s-staging-gateway-api-inference-extension/images.yaml`][yaml]. This will - promote the image to the production registry. **Note:** Add a link to this issue when the PR is merged. + promote the image to the production registry, e.g. `registry.k8s.io/gateway-api-inference-extension/epp:v${MAJOR}.${MINOR}.0`. + **Note:** Add a link to this issue when the PR is merged. 11. Test the steps in the tagged quickstart guide after the PR merges, for example: `https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.1.0-rc.1/pkg/README.md`. 12. Create a [new release][]: 1. Choose the tag that you created for the release. @@ -148,3 +158,4 @@ Use the following steps to announce the release. 
[k8s.io]: https://github.com/kubernetes/k8s.io
[yaml]: https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-gateway-api-inference-extension/images.yaml
[issue]: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/new/choose
+[vllm-tag]: https://hub.docker.com/r/vllm/vllm-openai/tags
diff --git a/.golangci.yml b/.golangci.yml
index 2ad3b93d..d1b1e112 100644
--- a/.golangci.yml
+++ b/.golangci.yml
@@ -25,7 +25,6 @@ linters:
   - makezero
   - errcheck
   - goconst
-  - gocyclo
   - gofmt
   - goimports
   - gosimple
diff --git a/Dockerfile b/Dockerfile
index e854e133..312700bc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,24 +1,31 @@
 # Dockerfile has specific requirement to put this ARG at the beginning:
 # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
-ARG BUILDER_IMAGE=golang:1.23-alpine
-ARG BASE_IMAGE=gcr.io/distroless/base-debian10
+ARG BUILDER_IMAGE=golang:1.23
+ARG BASE_IMAGE=gcr.io/distroless/static:nonroot
 
 ## Multistage build
-FROM ${BUILDER_IMAGE} as builder
+FROM ${BUILDER_IMAGE} AS builder
 ENV CGO_ENABLED=0
 ENV GOOS=linux
 ENV GOARCH=amd64
 
+# Dependencies
 WORKDIR /src
-COPY . .
-WORKDIR /src/pkg/ext-proc
+COPY go.mod go.sum ./
 RUN go mod download
-RUN go build -o /ext-proc
+
+# Sources
+COPY cmd ./cmd
+COPY pkg ./pkg
+COPY internal ./internal
+COPY api ./api
+WORKDIR /src/cmd/epp
+RUN go build -o /epp
 
 ## Multistage deploy
 FROM ${BASE_IMAGE}
 
 WORKDIR /
-COPY --from=builder /ext-proc /ext-proc
+COPY --from=builder /epp /epp
 
-ENTRYPOINT ["/ext-proc"]
\ No newline at end of file
+ENTRYPOINT ["/epp"]
diff --git a/Makefile b/Makefile
index 83de8dd1..40cb0b75 100644
--- a/Makefile
+++ b/Makefile
@@ -26,24 +26,41 @@ PLATFORMS ?= linux/amd64
 DOCKER_BUILDX_CMD ?= docker buildx
 IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build
 IMAGE_BUILD_EXTRA_OPTS ?=
+SYNCER_IMAGE_BUILD_EXTRA_OPTS ?=
+BBR_IMAGE_BUILD_EXTRA_OPTS ?=
 IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension
 IMAGE_NAME := epp
 IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME)
 IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG)
 
-BASE_IMAGE ?= gcr.io/distroless/base-debian10
-BUILDER_IMAGE ?= golang:1.23-alpine
+SYNCER_IMAGE_NAME := lora-syncer
+SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME)
+SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG)
+
+BBR_IMAGE_NAME := bbr
+BBR_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(BBR_IMAGE_NAME)
+BBR_IMAGE_TAG ?= $(BBR_IMAGE_REPO):$(GIT_TAG)
+
+BASE_IMAGE ?= gcr.io/distroless/static:nonroot
+BUILDER_IMAGE ?= golang:1.23
 ifdef GO_VERSION
 BUILDER_IMAGE = golang:$(GO_VERSION)
 endif
 
 ifdef EXTRA_TAG
 IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG)
+SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG)
+BBR_IMAGE_EXTRA_TAG ?= $(BBR_IMAGE_REPO):$(EXTRA_TAG)
 endif
 ifdef IMAGE_EXTRA_TAG
 IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
+SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG)
+BBR_IMAGE_BUILD_EXTRA_OPTS += -t $(BBR_IMAGE_EXTRA_TAG)
 endif
 
+# The name of the kind cluster to use for the "image-kind" and "bbr-image-kind" targets.
+KIND_CLUSTER ?= kind
+
 ##@ General
 
 # The help target prints out all targets with their descriptions organized
@@ -102,11 +119,11 @@ vet: ## Run go vet against code.
 
 .PHONY: test
 test: manifests generate fmt vet envtest ## Run tests.
-	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out
 
 .PHONY: test-integration
 test-integration: manifests generate fmt vet envtest ## Run tests.
-	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration -coverprofile cover.out
+	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/... -race -coverprofile cover.out
 
 .PHONY: test-e2e
 test-e2e: ## Run end-to-end tests against an existing Kubernetes cluster with at least 3 available GPUs.
@@ -132,28 +149,107 @@ verify: vet fmt-verify manifests generate ci-lint
 
 # Build the container image
 .PHONY: image-local-build
-image-local-build:
+image-local-build: ## Build the EPP image using Docker Buildx for local development.
	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
	$(MAKE) image-build PUSH=$(PUSH)
+	$(MAKE) image-build LOAD=$(LOAD)
	$(DOCKER_BUILDX_CMD) rm $$BUILDER
 
 .PHONY: image-local-push
-image-local-push: PUSH=--push
+image-local-push: PUSH=--push ## Build the EPP image for local development and push it to $IMAGE_REPO.
 image-local-push: image-local-build
 
+.PHONY: image-local-load
+image-local-load: LOAD=--load ## Build the EPP image for local development and load it in the local Docker registry.
+image-local-load: image-local-build
+
 .PHONY: image-build
-image-build:
+image-build: ## Build the EPP image using Docker Buildx.
	$(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \
		--platform=$(PLATFORMS) \
		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
		$(PUSH) \
+		$(LOAD) \
		$(IMAGE_BUILD_EXTRA_OPTS) ./
 
 .PHONY: image-push
-image-push: PUSH=--push
+image-push: PUSH=--push ## Build the EPP image and push it to $IMAGE_REPO.
 image-push: image-build
 
+.PHONY: image-load
+image-load: LOAD=--load ## Build the EPP image and load it in the local Docker registry.
+image-load: image-build
+
+.PHONY: image-kind
+image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default).
+	kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER)
+
+##@ Lora Syncer
+
+.PHONY: syncer-image-local-build
+syncer-image-local-build:
+	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
+	$(MAKE) syncer-image-build PUSH=$(PUSH)
+	$(DOCKER_BUILDX_CMD) rm $$BUILDER
+
+.PHONY: syncer-image-local-push
+syncer-image-local-push: PUSH=--push
+syncer-image-local-push: syncer-image-local-build
+
+.PHONY: syncer-image-build
+syncer-image-build:
+	cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \
+		--platform=$(PLATFORMS) \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \
+		$(PUSH) \
+		$(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./
+
+.PHONY: syncer-image-push
+syncer-image-push: PUSH=--push
+syncer-image-push: syncer-image-build
+
+##@ Body-based Routing extension
+
+# Build the container image
+.PHONY: bbr-image-local-build
+bbr-image-local-build: ## Build the image using Docker Buildx for local development.
+	BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use)
+	$(MAKE) bbr-image-build PUSH=$(PUSH)
+	$(MAKE) bbr-image-build LOAD=$(LOAD)
+	$(DOCKER_BUILDX_CMD) rm $$BUILDER
+
+.PHONY: bbr-image-local-push
+bbr-image-local-push: PUSH=--push ## Build the image for local development and push it to $IMAGE_REPO.
+bbr-image-local-push: bbr-image-local-build + +.PHONY: bbr-image-local-load +bbr-image-local-load: LOAD=--load ## Build the image for local development and load it in the local Docker registry. +bbr-image-local-load: bbr-image-local-build + +.PHONY: bbr-image-build +bbr-image-build: ## Build the image using Docker Buildx. + $(IMAGE_BUILD_CMD) -f body-based-routing.Dockerfile -t $(BBR_IMAGE_TAG) \ + --platform=$(PLATFORMS) \ + --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ + $(PUSH) \ + $(LOAD) \ + . + +.PHONY: bbr-image-push +bbr-image-push: PUSH=--push ## Build the image and push it to $IMAGE_REPO. +bbr-image-push: bbr-image-build + +.PHONY: bbr-image-load +bbr-image-load: LOAD=--load ## Build the image and load it in the local Docker registry. +bbr-image-load: bbr-image-build + +.PHONY: bbr-image-kind +bbr-image-kind: bbr-image-build ## Build the image and load it to kind cluster $KIND_CLUSTER ("kind" by default). + kind load docker-image $(BBR_IMAGE_TAG) --name $(KIND_CLUSTER) + ##@ Docs .PHONY: build-docs diff --git a/README.md b/README.md index a15e9542..6ad19cdb 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,44 @@ # Gateway API Inference Extension -The Gateway API Inference Extension came out of [wg-serving](https://github.com/kubernetes/community/tree/master/wg-serving) and is sponsored by [SIG Network](https://github.com/kubernetes/community/blob/master/sig-network/README.md#gateway-api-inference-extension). This repo contains: the load balancing algorithm, [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) code, CRDs, and controllers of the extension. +This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee. -This extension is intented to provide value to multiplexed LLM services on a shared pool of compute. See the [proposal](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/012-llm-instance-gateway) for more info. 
+The inference gateway:
+
+* Improves the tail latency and throughput of LLM completion requests against Kubernetes-hosted model servers using an extensible request scheduling algorithm that is kv-cache and request cost aware, avoiding evictions or queueing as load increases
+* Provides [Kubernetes-native declarative APIs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/api-overview/) to route client model names to use-case specific LoRA adapters and control incremental rollout of new adapter versions, A/B traffic splitting, and safe blue-green base model and model server upgrades
+* Adds end-to-end observability around service objective attainment
+* Ensures operational guardrails between different client model names, allowing a platform team to safely serve many different GenAI workloads on the same pool of shared foundation model servers for higher utilization and fewer required accelerators
+
+![Architecture Diagram](./docs/inference-gateway-architecture.svg)
+
+It currently requires a version of vLLM that supports the necessary metrics to predict traffic load, which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol). Support for Google's Jetstream, NVIDIA Triton, text-generation-inference, and SGLang is coming soon.
 
 ## Status
 
-This project is currently in development.
+This project is [alpha (0.1 release)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.1.0). It should not be used in production yet.
 
 ## Getting Started
 
-Follow this [README](./pkg/README.md) to get the inference-extension up and running on your cluster!
+Follow our [Getting Started Guide](./pkg/README.md) to get the inference-extension up and running on your cluster!
 
-## End-to-End Tests
+See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for detailed API documentation on leveraging our Kubernetes-native declarative APIs.
 
-Follow this [README](./test/e2e/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster.
+## Roadmap
 
-## Website
+As Inference Gateway builds towards a GA release, we will continue to expand our capabilities, namely:
+1. Prefix-cache aware load balancing with interfaces for remote caches
+1. Recommended LoRA adapter pipeline for automated rollout
+1. Fairness and priority between workloads within the same criticality band
+1. HPA support for autoscaling on aggregate metrics derived from the load balancer
+1. Support for large multi-modal inputs and outputs
+1. Support for other GenAI model types (diffusion and other non-completion protocols)
+1. Heterogeneous accelerators - serve workloads on multiple types of accelerator using latency and request cost-aware load balancing
+1. Disaggregated serving support with independently scaling pools
+
+
+## End-to-End Tests
 
-Detailed documentation is available on our website: https://gateway-api-inference-extension.sigs.k8s.io/
+Follow this [README](./test/e2e/epp/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster.
 
 ## Contributing
diff --git a/api/doc.go b/api/doc.go
new file mode 100644
index 00000000..c91adb92
--- /dev/null
+++ b/api/doc.go
@@ -0,0 +1,17 @@
+/*
+Copyright 2024 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api diff --git a/api/v1alpha2/doc.go b/api/v1alpha2/doc.go new file mode 100644 index 00000000..90a35f58 --- /dev/null +++ b/api/v1alpha2/doc.go @@ -0,0 +1,23 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha2 contains API Schema definitions for the +// inference.networking.x-k8s.io API group. +// +// +k8s:openapi-gen=true +// +kubebuilder:object:generate=true +// +groupName=inference.networking.x-k8s.io +package v1alpha2 diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha2/groupversion_info.go similarity index 89% rename from api/v1alpha1/groupversion_info.go rename to api/v1alpha2/groupversion_info.go index 8c0a449f..f9eb9b1e 100644 --- a/api/v1alpha1/groupversion_info.go +++ b/api/v1alpha2/groupversion_info.go @@ -1,5 +1,5 @@ /* -Copyright 2024 The Kubernetes Authors. +Copyright 2025 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,10 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API group +// Package v1alpha2 contains API Schema definitions for the gateway v1alpha2 API group // +kubebuilder:object:generate=true // +groupName=inference.networking.x-k8s.io -package v1alpha1 +package v1alpha2 import ( "k8s.io/apimachinery/pkg/runtime/schema" @@ -26,7 +26,7 @@ import ( var ( // GroupVersion is group version used to register these objects - GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha1"} + GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha2"} // SchemeGroupVersion is alias to GroupVersion for client-go libraries. // It is required by pkg/client/informers/externalversions/... diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go similarity index 90% rename from api/v1alpha1/inferencemodel_types.go rename to api/v1alpha2/inferencemodel_types.go index 3661820d..c011031e 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha2/inferencemodel_types.go @@ -1,5 +1,5 @@ /* -Copyright 2024 The Kubernetes Authors. +Copyright 2025 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package v1alpha1 +package v1alpha2 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -24,6 +24,7 @@ import ( // // +kubebuilder:object:root=true // +kubebuilder:subresource:status +// +kubebuilder:storageversion // +genclient type InferenceModel struct { metav1.TypeMeta `json:",inline"` @@ -70,6 +71,7 @@ type InferenceModelSpec struct { // // +kubebuilder:validation:MaxLength=256 // +kubebuilder:validation:Required + // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="modelName is immutable" ModelName string `json:"modelName"` // Criticality defines how important it is to serve the model compared to other models referencing the same pool. @@ -105,25 +107,18 @@ type PoolObjectReference struct { // // +optional // +kubebuilder:default="inference.networking.x-k8s.io" - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` - Group string `json:"group,omitempty"` + Group Group `json:"group,omitempty"` // Kind is kind of the referent. For example "InferencePool". // // +optional // +kubebuilder:default="InferencePool" - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=63 - // +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` - Kind string `json:"kind,omitempty"` + Kind Kind `json:"kind,omitempty"` // Name is the name of the referent. // - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 // +kubebuilder:validation:Required - Name string `json:"name"` + Name ObjectName `json:"name"` } // Criticality defines how important it is to serve the model compared to other models. @@ -174,7 +169,7 @@ type TargetModel struct { // Conversely weights are optional, so long as ALL targetModels do not specify a weight. // // +optional - // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Minimum=1 // +kubebuilder:validation:Maximum=1000000 Weight *int32 `json:"weight,omitempty"` } @@ -202,7 +197,7 @@ type InferenceModelConditionType string type InferenceModelConditionReason string const ( - // This condition indicates if the model config is accepted, and if not, why. + // ModelConditionAccepted indicates if the model config is accepted, and if not, why. // // Possible reasons for this condition to be True are: // @@ -218,14 +213,14 @@ const ( // ModelConditionAccepted InferenceModelConditionType = "Accepted" - // Desired state. Model conforms to the state of the pool. + // ModelReasonAccepted is the desired state. Model conforms to the state of the pool. ModelReasonAccepted InferenceModelConditionReason = "Accepted" - // This reason is used when a given ModelName already exists within the pool. + // ModelReasonNameInUse is used when a given ModelName already exists within the pool. // Details about naming conflict resolution are on the ModelName field itself. ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse" - // This reason is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. + // ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. 
 	ModelReasonPending InferenceModelConditionReason = "Pending"
 )
diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go
similarity index 60%
rename from api/v1alpha1/inferencepool_types.go
rename to api/v1alpha2/inferencepool_types.go
index 61a3764d..b411dbe3 100644
--- a/api/v1alpha1/inferencepool_types.go
+++ b/api/v1alpha2/inferencepool_types.go
@@ -1,5 +1,5 @@
 /*
-Copyright 2024 The Kubernetes Authors.
+Copyright 2025 The Kubernetes Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package v1alpha1
+package v1alpha2
 
 import (
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
@@ -24,6 +25,7 @@ import (
 //
 // +kubebuilder:object:root=true
 // +kubebuilder:subresource:status
+// +kubebuilder:storageversion
 // +genclient
 type InferencePool struct {
 	metav1.TypeMeta `json:",inline"`
@@ -48,6 +50,8 @@ type InferencePoolSpec struct {
 	// that should be included in the InferencePool.
 	// In some cases, implementations may translate this field to a Service selector, so this matches the simple
 	// map used for Service selectors instead of the full Kubernetes LabelSelector type.
+	// If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
+	// Cross-namespace selectors are not supported.
 	//
 	// +kubebuilder:validation:Required
 	Selector map[LabelKey]LabelValue `json:"selector"`
@@ -86,11 +90,11 @@ type Extension struct {
 // ExtensionReference is a reference to the extension deployment.
 type ExtensionReference struct {
 	// Group is the group of the referent.
-	// When unspecified or empty string, core API group is inferred.
+	// The default value is "", representing the Core API group.
 	//
 	// +optional
 	// +kubebuilder:default=""
-	Group *string `json:"group,omitempty"`
+	Group *Group `json:"group,omitempty"`
 
 	// Kind is the Kubernetes resource kind of the referent. For example
 	// "Service".
@@ -105,20 +109,19 @@ type ExtensionReference struct {
 	//
 	// +optional
 	// +kubebuilder:default=Service
-	Kind *string `json:"kind,omitempty"`
+	Kind *Kind `json:"kind,omitempty"`
 
 	// Name is the name of the referent.
 	//
 	// +kubebuilder:validation:Required
-	Name string `json:"name"`
+	Name ObjectName `json:"name"`
 
-	// The port number on the pods running the extension. When unspecified, implementations SHOULD infer a
-	// default value of 9002 when the Kind is Service.
+	// The port number on the service running the extension. When unspecified,
+	// implementations SHOULD infer a default value of 9002 when the Kind is
+	// Service.
 	//
-	// +kubebuilder:validation:Minimum=1
-	// +kubebuilder:validation:Maximum=65535
 	// +optional
-	TargetPortNumber *int32 `json:"targetPortNumber,omitempty"`
+	PortNumber *PortNumber `json:"portNumber,omitempty"`
 }
 
 // ExtensionConnection encapsulates options that configures the connection to the extension.
@@ -143,94 +146,103 @@ const (
 	FailClose ExtensionFailureMode = "FailClose"
 )
 
-// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731
-// Duplicated as to not take an unexpected dependency on gw's API.
-//
-// LabelKey is the key of a label. This is used for validation
-// of maps. This matches the Kubernetes "qualified name" validation that is used for labels.
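To make the v1alpha2 type changes above concrete, here is a minimal sketch of an InferenceModel built against the renamed package. It is illustrative only: the object names ("chat", "chat-lora-v1", "base-pool") and namespace are hypothetical, and `PoolRef` relies on the kubebuilder defaults for Group and Kind shown in this diff.

```go
package main

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

// buildModel sketches an InferenceModel that maps the client-facing model
// name "chat" onto a single weighted LoRA target served by a hypothetical
// InferencePool named "base-pool". Group and Kind on PoolRef are omitted so
// the defaults ("inference.networking.x-k8s.io" / "InferencePool") apply.
func buildModel() *v1alpha2.InferenceModel {
	weight := int32(100) // must now be >= 1 per the tightened validation above
	return &v1alpha2.InferenceModel{
		ObjectMeta: metav1.ObjectMeta{Name: "chat", Namespace: "default"},
		Spec: v1alpha2.InferenceModelSpec{
			// modelName is immutable once set (XValidation: self == oldSelf).
			ModelName: "chat",
			PoolRef: v1alpha2.PoolObjectReference{
				Name: v1alpha2.ObjectName("base-pool"),
			},
			TargetModels: []v1alpha2.TargetModel{
				{Name: "chat-lora-v1", Weight: &weight},
			},
		},
	}
}

func main() {
	_ = buildModel()
}
```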
-// Labels are case sensitive, so: my-label and My-Label are considered distinct.
-//
-// Valid values include:
-//
-// * example
-// * example.com
-// * example.com/path
-// * example.com/path.html
-//
-// Invalid values include:
-//
-// * example~ - "~" is an invalid character
-// * example.com. - can not start or end with "."
-//
-// +kubebuilder:validation:MinLength=1
-// +kubebuilder:validation:MaxLength=253
-// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$`
-type LabelKey string
-
-// LabelValue is the value of a label. This is used for validation
-// of maps. This matches the Kubernetes label validation rules:
-// * must be 63 characters or less (can be empty),
-// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
-// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
-//
-// Valid values include:
-//
-// * MyValue
-// * my.name
-// * 123-my-value
-//
-// +kubebuilder:validation:MinLength=0
-// +kubebuilder:validation:MaxLength=63
-// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$`
-type LabelValue string
-
 // InferencePoolStatus defines the observed state of InferencePool
 type InferencePoolStatus struct {
+	// Parents is a list of parent resources (usually Gateways) that are
+	// associated with the InferencePool, and the status of the InferencePool
+	// with respect to each parent.
+	//
+	// A maximum of 32 Gateways will be represented in this list. An empty list
+	// means the InferencePool has not been attached to any Gateway.
+	//
+	// +kubebuilder:validation:MaxItems=32
+	Parents []PoolStatus `json:"parent,omitempty"`
+}
+
+// PoolStatus defines the observed state of InferencePool from a Gateway.
+type PoolStatus struct {
+	// GatewayRef indicates the Gateway that observed the state of the InferencePool.
+	GatewayRef corev1.ObjectReference `json:"parentRef"`
+
 	// Conditions track the state of the InferencePool.
 	//
 	// Known condition types are:
 	//
-	// * "Ready"
+	// * "Accepted"
+	// * "ResolvedRefs"
 	//
 	// +optional
 	// +listType=map
 	// +listMapKey=type
 	// +kubebuilder:validation:MaxItems=8
-	// +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}}
+	// +kubebuilder:default={{type: "Accepted", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}}
 	Conditions []metav1.Condition `json:"conditions,omitempty"`
 }
 
 // InferencePoolConditionType is a type of condition for the InferencePool
 type InferencePoolConditionType string
 
-// InferencePoolConditionReason is the reason for a given InferencePoolConditionType
-type InferencePoolConditionReason string
+// InferencePoolReason is the reason for a given InferencePoolConditionType
+type InferencePoolReason string
 
 const (
-	// This condition indicates if the pool is ready to accept traffic, and if not, why.
+	// This condition indicates whether the InferencePool has been accepted or rejected
+	// by a Gateway, and why.
 	//
 	// Possible reasons for this condition to be True are:
 	//
-	// * "Ready"
+	// * "Accepted"
 	//
 	// Possible reasons for this condition to be False are:
 	//
-	// * "EndpointPickerNotHealthy"
+	// * "NotSupportedByGateway"
 	//
 	// Possible reasons for this condition to be Unknown are:
 	//
 	// * "Pending"
 	//
-	PoolConditionReady InferencePoolConditionType = "Ready"
+	// Controllers MAY raise this condition with other reasons, but should
+	// prefer to use the reasons listed above to improve interoperability.
+	InferencePoolConditionAccepted InferencePoolConditionType = "Accepted"
+
+	// This reason is used with the "Accepted" condition when the InferencePool
+	// has been accepted by the Gateway.
+	InferencePoolReasonAccepted InferencePoolReason = "Accepted"
+
+	// This reason is used with the "Accepted" condition when the InferencePool
+	// has not been accepted by a Gateway because the Gateway does not support
+	// InferencePool as a backend.
+	InferencePoolReasonNotSupportedByGateway InferencePoolReason = "NotSupportedByGateway"
+
+	// This reason is used with the "Accepted" condition when a controller has
+	// not yet reconciled the InferencePool.
+	InferencePoolReasonPending InferencePoolReason = "Pending"
+)
 
-	// Desired state. The pool and its components are initialized and ready for traffic.
-	PoolReasonReady InferencePoolConditionReason = "Ready"
+const (
+	// This condition indicates whether the controller was able to resolve all
+	// the object references for the InferencePool.
+	//
+	// Possible reasons for this condition to be True are:
+	//
+	// * "ResolvedRefs"
+	//
+	// Possible reasons for this condition to be False are:
+	//
+	// * "InvalidExtensionRef"
+	//
+	// Controllers MAY raise this condition with other reasons, but should
+	// prefer to use the reasons listed above to improve interoperability.
+	InferencePoolConditionResolvedRefs InferencePoolConditionType = "ResolvedRefs"
 
-	// This reason is used when the EPP has not yet passed health checks, or has started failing them.
-	PoolReasonEPPNotHealthy InferencePoolConditionReason = "EndpointPickerNotHealthy"
+	// This reason is used with the "ResolvedRefs" condition when the condition
+	// is true.
+	InferencePoolReasonResolvedRefs InferencePoolReason = "ResolvedRefs"
 
-	// This reason is the initial state, and indicates that the controller has not yet reconciled this pool.
-	PoolReasonPending InferencePoolConditionReason = "Pending"
+	// This reason is used with the "ResolvedRefs" condition when the
+	// ExtensionRef is invalid in some way. This can include an unsupported kind
+	// or API group, or a reference to a resource that can not be found.
+	InferencePoolReasonInvalidExtensionRef InferencePoolReason = "InvalidExtensionRef"
 )
 
 func init() {
diff --git a/api/v1alpha2/shared_types.go b/api/v1alpha2/shared_types.go
new file mode 100644
index 00000000..ea5ef299
--- /dev/null
+++ b/api/v1alpha2/shared_types.go
@@ -0,0 +1,108 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package v1alpha2 + +// Group refers to a Kubernetes Group. It must either be an empty string or a +// RFC 1123 subdomain. +// +// This validation is based off of the corresponding Kubernetes validation: +// https://github.com/kubernetes/apimachinery/blob/02cfb53916346d085a6c6c7c66f882e3c6b0eca6/pkg/util/validation/validation.go#L208 +// +// Valid values include: +// +// * "" - empty string implies core Kubernetes API group +// * "gateway.networking.k8s.io" +// * "foo.example.com" +// +// Invalid values include: +// +// * "example.com/bar" - "/" is an invalid character +// +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` +type Group string + +// Kind refers to a Kubernetes Kind. +// +// Valid values include: +// +// * "Service" +// * "HTTPRoute" +// +// Invalid values include: +// +// * "invalid/kind" - "/" is an invalid character +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` +type Kind string + +// ObjectName refers to the name of a Kubernetes object. +// Object names can have a variety of forms, including RFC 1123 subdomains, +// RFC 1123 labels, or RFC 1035 labels. +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +type ObjectName string + +// PortNumber defines a network port. +// +// +kubebuilder:validation:Minimum=1 +// +kubebuilder:validation:Maximum=65535 +type PortNumber int32 + +// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +// Duplicated as to not take an unexpected dependency on gw's API. +// +// LabelKey is the key of a label. This is used for validation +// of maps. This matches the Kubernetes "qualified name" validation that is used for labels. +// Labels are case sensitive, so: my-label and My-Label are considered distinct. +// +// Valid values include: +// +// * example +// * example.com +// * example.com/path +// * example.com/path.html +// +// Invalid values include: +// +// * example~ - "~" is an invalid character +// * example.com. - can not start or end with "." +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$` +type LabelKey string + +// LabelValue is the value of a label. This is used for validation +// of maps. This matches the Kubernetes label validation rules: +// * must be 63 characters or less (can be empty), +// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), +// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. 
+// +// Valid values include: +// +// * MyValue +// * my.name +// * 123-my-value +// +// +kubebuilder:validation:MinLength=0 +// +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$` +type LabelValue string diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go similarity index 93% rename from api/v1alpha1/zz_generated.deepcopy.go rename to api/v1alpha2/zz_generated.deepcopy.go index fd55379e..4dad0eff 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -18,7 +18,7 @@ limitations under the License. // Code generated by controller-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -87,17 +87,17 @@ func (in *ExtensionReference) DeepCopyInto(out *ExtensionReference) { *out = *in if in.Group != nil { in, out := &in.Group, &out.Group - *out = new(string) + *out = new(Group) **out = **in } if in.Kind != nil { in, out := &in.Kind, &out.Kind - *out = new(string) + *out = new(Kind) **out = **in } - if in.TargetPortNumber != nil { - in, out := &in.TargetPortNumber, &out.TargetPortNumber - *out = new(int32) + if in.PortNumber != nil { + in, out := &in.PortNumber, &out.PortNumber + *out = new(PortNumber) **out = **in } } @@ -306,9 +306,9 @@ func (in *InferencePoolSpec) DeepCopy() *InferencePoolSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InferencePoolStatus) DeepCopyInto(out *InferencePoolStatus) { *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) + if in.Parents != nil { + in, out := &in.Parents, &out.Parents + *out = make([]PoolStatus, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -340,6 +340,29 @@ func (in *PoolObjectReference) DeepCopy() *PoolObjectReference { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PoolStatus) DeepCopyInto(out *PoolStatus) { + *out = *in + out.GatewayRef = in.GatewayRef + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolStatus. +func (in *PoolStatus) DeepCopy() *PoolStatus { + if in == nil { + return nil + } + out := new(PoolStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
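The status rework above replaces the pool-level Conditions list with a per-Gateway Parents list. A hedged sketch of how a controller might record the new Accepted condition, assuming the standard apimachinery condition helpers; the Gateway reference and message text are illustrative, and deduplicating an existing PoolStatus entry for the same Gateway is left out.

```go
package main

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

// markAccepted appends a PoolStatus entry carrying the per-Gateway "Accepted"
// condition, using the condition types and reasons introduced in this diff.
func markAccepted(pool *v1alpha2.InferencePool, gw corev1.ObjectReference) {
	parent := v1alpha2.PoolStatus{GatewayRef: gw}
	meta.SetStatusCondition(&parent.Conditions, metav1.Condition{
		Type:               string(v1alpha2.InferencePoolConditionAccepted),
		Status:             metav1.ConditionTrue,
		Reason:             string(v1alpha2.InferencePoolReasonAccepted),
		Message:            "InferencePool has been accepted by the Gateway",
		ObservedGeneration: pool.Generation,
	})
	pool.Status.Parents = append(pool.Status.Parents, parent)
}

func main() {
	pool := &v1alpha2.InferencePool{}
	markAccepted(pool, corev1.ObjectReference{Kind: "Gateway", Name: "inference-gateway"})
}
```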
func (in *TargetModel) DeepCopyInto(out *TargetModel) { *out = *in diff --git a/body-based-routing.Dockerfile b/body-based-routing.Dockerfile new file mode 100644 index 00000000..e0afcf20 --- /dev/null +++ b/body-based-routing.Dockerfile @@ -0,0 +1,30 @@ +# Dockerfile has specific requirement to put this ARG at the beginning: +# https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact +ARG BUILDER_IMAGE=golang:1.23 +ARG BASE_IMAGE=gcr.io/distroless/static:nonroot + +## Multistage build +FROM ${BUILDER_IMAGE} AS builder +ENV CGO_ENABLED=0 +ENV GOOS=linux +ENV GOARCH=amd64 + +# Dependencies +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +# Sources +COPY cmd ./cmd +COPY pkg ./pkg +COPY internal ./internal +WORKDIR /src/cmd/body-based-routing +RUN go build -o /body-based-routing + +## Multistage deploy +FROM ${BASE_IMAGE} + +WORKDIR / +COPY --from=builder /body-based-routing /body-based-routing + +ENTRYPOINT ["/body-based-routing"] diff --git a/client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go similarity index 98% rename from client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go rename to client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go index 91895ddc..007b8870 100644 --- a/client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go +++ b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 // EndpointPickerConfigApplyConfiguration represents a declarative configuration of the EndpointPickerConfig type for use // with apply. diff --git a/client-go/applyconfiguration/api/v1alpha1/extension.go b/client-go/applyconfiguration/api/v1alpha2/extension.go similarity index 76% rename from client-go/applyconfiguration/api/v1alpha1/extension.go rename to client-go/applyconfiguration/api/v1alpha2/extension.go index 27807448..5e17e030 100644 --- a/client-go/applyconfiguration/api/v1alpha1/extension.go +++ b/client-go/applyconfiguration/api/v1alpha2/extension.go @@ -15,10 +15,10 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // ExtensionApplyConfiguration represents a declarative configuration of the Extension type for use @@ -37,7 +37,7 @@ func Extension() *ExtensionApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithGroup(value apiv1alpha2.Group) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Group = &value return b } @@ -45,7 +45,7 @@ func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyCon // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. 
// If called multiple times, the Kind field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithKind(value apiv1alpha2.Kind) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Kind = &value return b } @@ -53,23 +53,23 @@ func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConf // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithName(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Name = &value return b } -// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value +// WithPortNumber sets the PortNumber field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TargetPortNumber field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithTargetPortNumber(value int32) *ExtensionApplyConfiguration { - b.ExtensionReferenceApplyConfiguration.TargetPortNumber = &value +// If called multiple times, the PortNumber field is set to the value of the last call. +func (b *ExtensionApplyConfiguration) WithPortNumber(value apiv1alpha2.PortNumber) *ExtensionApplyConfiguration { + b.ExtensionReferenceApplyConfiguration.PortNumber = &value return b } // WithFailureMode sets the FailureMode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the FailureMode field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithFailureMode(value apiv1alpha1.ExtensionFailureMode) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithFailureMode(value apiv1alpha2.ExtensionFailureMode) *ExtensionApplyConfiguration { b.ExtensionConnectionApplyConfiguration.FailureMode = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go similarity index 86% rename from client-go/applyconfiguration/api/v1alpha1/extensionconnection.go rename to client-go/applyconfiguration/api/v1alpha2/extensionconnection.go index be9eeaa1..2a59b830 100644 --- a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go +++ b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go @@ -15,16 +15,16 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // ExtensionConnectionApplyConfiguration represents a declarative configuration of the ExtensionConnection type for use // with apply. 
type ExtensionConnectionApplyConfiguration struct { - FailureMode *apiv1alpha1.ExtensionFailureMode `json:"failureMode,omitempty"` + FailureMode *apiv1alpha2.ExtensionFailureMode `json:"failureMode,omitempty"` } // ExtensionConnectionApplyConfiguration constructs a declarative configuration of the ExtensionConnection type for use with @@ -36,7 +36,7 @@ func ExtensionConnection() *ExtensionConnectionApplyConfiguration { // WithFailureMode sets the FailureMode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the FailureMode field is set to the value of the last call. -func (b *ExtensionConnectionApplyConfiguration) WithFailureMode(value apiv1alpha1.ExtensionFailureMode) *ExtensionConnectionApplyConfiguration { +func (b *ExtensionConnectionApplyConfiguration) WithFailureMode(value apiv1alpha2.ExtensionFailureMode) *ExtensionConnectionApplyConfiguration { b.FailureMode = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/extensionreference.go b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go similarity index 66% rename from client-go/applyconfiguration/api/v1alpha1/extensionreference.go rename to client-go/applyconfiguration/api/v1alpha2/extensionreference.go index c72c0306..937e5795 100644 --- a/client-go/applyconfiguration/api/v1alpha1/extensionreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go @@ -15,15 +15,19 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) // ExtensionReferenceApplyConfiguration represents a declarative configuration of the ExtensionReference type for use // with apply. type ExtensionReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` - TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` + Group *apiv1alpha2.Group `json:"group,omitempty"` + Kind *apiv1alpha2.Kind `json:"kind,omitempty"` + Name *apiv1alpha2.ObjectName `json:"name,omitempty"` + PortNumber *apiv1alpha2.PortNumber `json:"portNumber,omitempty"` } // ExtensionReferenceApplyConfiguration constructs a declarative configuration of the ExtensionReference type for use with @@ -35,7 +39,7 @@ func ExtensionReference() *ExtensionReferenceApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithGroup(value apiv1alpha2.Group) *ExtensionReferenceApplyConfiguration { b.Group = &value return b } @@ -43,7 +47,7 @@ func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *Extensio // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Kind field is set to the value of the last call. 
-func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithKind(value apiv1alpha2.Kind) *ExtensionReferenceApplyConfiguration { b.Kind = &value return b } @@ -51,15 +55,15 @@ func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *Extension // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithName(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *ExtensionReferenceApplyConfiguration { b.Name = &value return b } -// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value +// WithPortNumber sets the PortNumber field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TargetPortNumber field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithTargetPortNumber(value int32) *ExtensionReferenceApplyConfiguration { - b.TargetPortNumber = &value +// If called multiple times, the PortNumber field is set to the value of the last call. +func (b *ExtensionReferenceApplyConfiguration) WithPortNumber(value apiv1alpha2.PortNumber) *ExtensionReferenceApplyConfiguration { + b.PortNumber = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go similarity index 99% rename from client-go/applyconfiguration/api/v1alpha1/inferencemodel.go rename to client-go/applyconfiguration/api/v1alpha2/inferencemodel.go index b6201467..1fbfe106 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -39,7 +39,7 @@ func InferenceModel(name, namespace string) *InferenceModelApplyConfiguration { b.WithName(name) b.WithNamespace(namespace) b.WithKind("InferenceModel") - b.WithAPIVersion("api/v1alpha1") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha2") return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go similarity index 93% rename from client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go rename to client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go index 9bbdda06..438ccd48 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go @@ -15,17 +15,17 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. 
-package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferenceModelSpecApplyConfiguration represents a declarative configuration of the InferenceModelSpec type for use // with apply. type InferenceModelSpecApplyConfiguration struct { ModelName *string `json:"modelName,omitempty"` - Criticality *apiv1alpha1.Criticality `json:"criticality,omitempty"` + Criticality *apiv1alpha2.Criticality `json:"criticality,omitempty"` TargetModels []TargetModelApplyConfiguration `json:"targetModels,omitempty"` PoolRef *PoolObjectReferenceApplyConfiguration `json:"poolRef,omitempty"` } @@ -47,7 +47,7 @@ func (b *InferenceModelSpecApplyConfiguration) WithModelName(value string) *Infe // WithCriticality sets the Criticality field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Criticality field is set to the value of the last call. -func (b *InferenceModelSpecApplyConfiguration) WithCriticality(value apiv1alpha1.Criticality) *InferenceModelSpecApplyConfiguration { +func (b *InferenceModelSpecApplyConfiguration) WithCriticality(value apiv1alpha2.Criticality) *InferenceModelSpecApplyConfiguration { b.Criticality = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go similarity index 99% rename from client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go rename to client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go index b0b003bb..e8142efe 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( v1 "k8s.io/client-go/applyconfigurations/meta/v1" diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go similarity index 99% rename from client-go/applyconfiguration/api/v1alpha1/inferencepool.go rename to client-go/applyconfiguration/api/v1alpha2/inferencepool.go index a7f3ed6d..cd725cb6 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. 
-package v1alpha1 +package v1alpha2 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -39,7 +39,7 @@ func InferencePool(name, namespace string) *InferencePoolApplyConfiguration { b.WithName(name) b.WithNamespace(namespace) b.WithKind("InferencePool") - b.WithAPIVersion("api/v1alpha1") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha2") return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go similarity index 88% rename from client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go rename to client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go index e132f74b..e4d5a97d 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go @@ -15,16 +15,16 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use // with apply. type InferencePoolSpecApplyConfiguration struct { - Selector map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue `json:"selector,omitempty"` + Selector map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue `json:"selector,omitempty"` TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` EndpointPickerConfigApplyConfiguration `json:",inline"` } @@ -39,9 +39,9 @@ func InferencePoolSpec() *InferencePoolSpecApplyConfiguration { // and returns the receiver, so that objects can be build by chaining "With" function invocations. // If called multiple times, the entries provided by each call will be put on the Selector field, // overwriting an existing map entries in Selector field with the same key. -func (b *InferencePoolSpecApplyConfiguration) WithSelector(entries map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue) *InferencePoolSpecApplyConfiguration { +func (b *InferencePoolSpecApplyConfiguration) WithSelector(entries map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue) *InferencePoolSpecApplyConfiguration { if b.Selector == nil && len(entries) > 0 { - b.Selector = make(map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue, len(entries)) + b.Selector = make(map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue, len(entries)) } for k, v := range entries { b.Selector[k] = v diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go similarity index 71% rename from client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go rename to client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go index f61a81b3..9587dabe 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go @@ -15,16 +15,12 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 - -import ( - v1 "k8s.io/client-go/applyconfigurations/meta/v1" -) +package v1alpha2 // InferencePoolStatusApplyConfiguration represents a declarative configuration of the InferencePoolStatus type for use // with apply. 
type InferencePoolStatusApplyConfiguration struct { - Conditions []v1.ConditionApplyConfiguration `json:"conditions,omitempty"` + Parents []PoolStatusApplyConfiguration `json:"parent,omitempty"` } // InferencePoolStatusApplyConfiguration constructs a declarative configuration of the InferencePoolStatus type for use with @@ -33,15 +29,15 @@ func InferencePoolStatus() *InferencePoolStatusApplyConfiguration { return &InferencePoolStatusApplyConfiguration{} } -// WithConditions adds the given value to the Conditions field in the declarative configuration +// WithParents adds the given value to the Parents field in the declarative configuration // and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Conditions field. -func (b *InferencePoolStatusApplyConfiguration) WithConditions(values ...*v1.ConditionApplyConfiguration) *InferencePoolStatusApplyConfiguration { +// If called multiple times, values provided by each call will be appended to the Parents field. +func (b *InferencePoolStatusApplyConfiguration) WithParents(values ...*PoolStatusApplyConfiguration) *InferencePoolStatusApplyConfiguration { for i := range values { if values[i] == nil { - panic("nil value passed to WithConditions") + panic("nil value passed to WithParents") } - b.Conditions = append(b.Conditions, *values[i]) + b.Parents = append(b.Parents, *values[i]) } return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go similarity index 77% rename from client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go rename to client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go index 692a185e..20abf6b2 100644 --- a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go @@ -15,14 +15,18 @@ limitations under the License. */ // Code generated by applyconfiguration-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) // PoolObjectReferenceApplyConfiguration represents a declarative configuration of the PoolObjectReference type for use // with apply. type PoolObjectReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` + Group *apiv1alpha2.Group `json:"group,omitempty"` + Kind *apiv1alpha2.Kind `json:"kind,omitempty"` + Name *apiv1alpha2.ObjectName `json:"name,omitempty"` } // PoolObjectReferenceApplyConfiguration constructs a declarative configuration of the PoolObjectReference type for use with @@ -34,7 +38,7 @@ func PoolObjectReference() *PoolObjectReferenceApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. 
-func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value apiv1alpha2.Group) *PoolObjectReferenceApplyConfiguration { b.Group = &value return b } @@ -42,7 +46,7 @@ func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObj // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Kind field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithKind(value apiv1alpha2.Kind) *PoolObjectReferenceApplyConfiguration { b.Kind = &value return b } @@ -50,7 +54,7 @@ func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObje // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithName(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *PoolObjectReferenceApplyConfiguration { b.Name = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha2/poolstatus.go b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go new file mode 100644 index 00000000..bff29935 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go @@ -0,0 +1,57 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// PoolStatusApplyConfiguration represents a declarative configuration of the PoolStatus type for use +// with apply. +type PoolStatusApplyConfiguration struct { + GatewayRef *v1.ObjectReference `json:"parentRef,omitempty"` + Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` +} + +// PoolStatusApplyConfiguration constructs a declarative configuration of the PoolStatus type for use with +// apply. +func PoolStatus() *PoolStatusApplyConfiguration { + return &PoolStatusApplyConfiguration{} +} + +// WithGatewayRef sets the GatewayRef field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GatewayRef field is set to the value of the last call. 
+func (b *PoolStatusApplyConfiguration) WithGatewayRef(value v1.ObjectReference) *PoolStatusApplyConfiguration {
+	b.GatewayRef = &value
+	return b
+}
+
+// WithConditions adds the given value to the Conditions field in the declarative configuration
+// and returns the receiver, so that objects can be built by chaining "With" function invocations.
+// If called multiple times, values provided by each call will be appended to the Conditions field.
+func (b *PoolStatusApplyConfiguration) WithConditions(values ...*metav1.ConditionApplyConfiguration) *PoolStatusApplyConfiguration {
+	for i := range values {
+		if values[i] == nil {
+			panic("nil value passed to WithConditions")
+		}
+		b.Conditions = append(b.Conditions, *values[i])
+	}
+	return b
+}
diff --git a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go
similarity index 99%
rename from client-go/applyconfiguration/api/v1alpha1/targetmodel.go
rename to client-go/applyconfiguration/api/v1alpha2/targetmodel.go
index f6ac83f8..4ed9b4bc 100644
--- a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go
+++ b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go
@@ -15,7 +15,7 @@ limitations under the License.
 */
 // Code generated by applyconfiguration-gen. DO NOT EDIT.
 
-package v1alpha1
+package v1alpha2
 
 // TargetModelApplyConfiguration represents a declarative configuration of the TargetModel type for use
 // with apply.
diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go
index 1a71b674..e1ad5ea4 100644
--- a/client-go/applyconfiguration/utils.go
+++ b/client-go/applyconfiguration/utils.go
@@ -18,43 +18,45 @@ limitations under the License.
 package applyconfiguration
 
 import (
-	v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1"
-	apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1"
-	internal "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal"
 	runtime "k8s.io/apimachinery/pkg/runtime"
 	schema "k8s.io/apimachinery/pkg/runtime/schema"
 	testing "k8s.io/client-go/testing"
+	v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2"
+	internal "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal"
 )
 
 // ForKind returns an apply configuration type for the given GroupVersionKind, or nil if no
 // apply configuration type exists for the given GroupVersionKind.
func ForKind(kind schema.GroupVersionKind) interface{} { switch kind { - // Group=api, Version=v1alpha1 - case v1alpha1.SchemeGroupVersion.WithKind("EndpointPickerConfig"): - return &apiv1alpha1.EndpointPickerConfigApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("Extension"): - return &apiv1alpha1.ExtensionApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ExtensionConnection"): - return &apiv1alpha1.ExtensionConnectionApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ExtensionReference"): - return &apiv1alpha1.ExtensionReferenceApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModel"): - return &apiv1alpha1.InferenceModelApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModelSpec"): - return &apiv1alpha1.InferenceModelSpecApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModelStatus"): - return &apiv1alpha1.InferenceModelStatusApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePool"): - return &apiv1alpha1.InferencePoolApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolSpec"): - return &apiv1alpha1.InferencePoolSpecApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolStatus"): - return &apiv1alpha1.InferencePoolStatusApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("PoolObjectReference"): - return &apiv1alpha1.PoolObjectReferenceApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("TargetModel"): - return &apiv1alpha1.TargetModelApplyConfiguration{} + // Group=inference.networking.x-k8s.io, Version=v1alpha2 + case v1alpha2.SchemeGroupVersion.WithKind("EndpointPickerConfig"): + return &apiv1alpha2.EndpointPickerConfigApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("Extension"): + return &apiv1alpha2.ExtensionApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("ExtensionConnection"): + return &apiv1alpha2.ExtensionConnectionApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("ExtensionReference"): + return &apiv1alpha2.ExtensionReferenceApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModel"): + return &apiv1alpha2.InferenceModelApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModelSpec"): + return &apiv1alpha2.InferenceModelSpecApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModelStatus"): + return &apiv1alpha2.InferenceModelStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePool"): + return &apiv1alpha2.InferencePoolApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePoolSpec"): + return &apiv1alpha2.InferencePoolSpecApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePoolStatus"): + return &apiv1alpha2.InferencePoolStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("PoolObjectReference"): + return &apiv1alpha2.PoolObjectReferenceApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("PoolStatus"): + return &apiv1alpha2.PoolStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("TargetModel"): + return &apiv1alpha2.TargetModelApplyConfiguration{} } return nil diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index 18e3236a..c56d11c7 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -21,26 +21,26 @@ import ( fmt "fmt" 
http "net/http" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" ) type Interface interface { Discovery() discovery.DiscoveryInterface - ApiV1alpha1() apiv1alpha1.ApiV1alpha1Interface + InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface } // Clientset contains the clients for groups. type Clientset struct { *discovery.DiscoveryClient - apiV1alpha1 *apiv1alpha1.ApiV1alpha1Client + inferenceV1alpha2 *inferencev1alpha2.InferenceV1alpha2Client } -// ApiV1alpha1 retrieves the ApiV1alpha1Client -func (c *Clientset) ApiV1alpha1() apiv1alpha1.ApiV1alpha1Interface { - return c.apiV1alpha1 +// InferenceV1alpha2 retrieves the InferenceV1alpha2Client +func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { + return c.inferenceV1alpha2 } // Discovery retrieves the DiscoveryClient @@ -87,7 +87,7 @@ func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, var cs Clientset var err error - cs.apiV1alpha1, err = apiv1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) + cs.inferenceV1alpha2, err = inferencev1alpha2.NewForConfigAndClient(&configShallowCopy, httpClient) if err != nil { return nil, err } @@ -112,7 +112,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. func New(c rest.Interface) *Clientset { var cs Clientset - cs.apiV1alpha1 = apiv1alpha1.New(c) + cs.inferenceV1alpha2 = inferencev1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index dda29ec6..b0ecd50b 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -18,15 +18,15 @@ limitations under the License. package fake import ( - applyconfiguration "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" - clientset "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" - fakeapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" fakediscovery "k8s.io/client-go/discovery/fake" "k8s.io/client-go/testing" + applyconfiguration "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" + clientset "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" + fakeinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2/fake" ) // NewSimpleClientset returns a clientset that will respond with the provided objects. 
@@ -115,7 +115,7 @@ var ( _ testing.FakeClient = &Clientset{} ) -// ApiV1alpha1 retrieves the ApiV1alpha1Client -func (c *Clientset) ApiV1alpha1() apiv1alpha1.ApiV1alpha1Interface { - return &fakeapiv1alpha1.FakeApiV1alpha1{Fake: &c.Fake} +// InferenceV1alpha2 retrieves the InferenceV1alpha2Client +func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { + return &fakeinferencev1alpha2.FakeInferenceV1alpha2{Fake: &c.Fake} } diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index f252a096..365ccb75 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -18,19 +18,19 @@ limitations under the License. package fake import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - apiv1alpha1.AddToScheme, + inferencev1alpha2.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 6e243827..b656f121 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -18,19 +18,19 @@ limitations under the License. package scheme import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) var Scheme = runtime.NewScheme() var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - apiv1alpha1.AddToScheme, + inferencev1alpha2.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go deleted file mode 100644 index e33b311d..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. 
- -package fake - -import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" - gentype "k8s.io/client-go/gentype" -) - -// fakeInferenceModels implements InferenceModelInterface -type fakeInferenceModels struct { - *gentype.FakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration] - Fake *FakeApiV1alpha1 -} - -func newFakeInferenceModels(fake *FakeApiV1alpha1, namespace string) typedapiv1alpha1.InferenceModelInterface { - return &fakeInferenceModels{ - gentype.NewFakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration]( - fake.Fake, - namespace, - v1alpha1.SchemeGroupVersion.WithResource("inferencemodels"), - v1alpha1.SchemeGroupVersion.WithKind("InferenceModel"), - func() *v1alpha1.InferenceModel { return &v1alpha1.InferenceModel{} }, - func() *v1alpha1.InferenceModelList { return &v1alpha1.InferenceModelList{} }, - func(dst, src *v1alpha1.InferenceModelList) { dst.ListMeta = src.ListMeta }, - func(list *v1alpha1.InferenceModelList) []*v1alpha1.InferenceModel { - return gentype.ToPointerSlice(list.Items) - }, - func(list *v1alpha1.InferenceModelList, items []*v1alpha1.InferenceModel) { - list.Items = gentype.FromPointerSlice(items) - }, - ), - fake, - } -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go deleted file mode 100644 index 92bc5cbe..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. 
- -package fake - -import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" - gentype "k8s.io/client-go/gentype" -) - -// fakeInferencePools implements InferencePoolInterface -type fakeInferencePools struct { - *gentype.FakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration] - Fake *FakeApiV1alpha1 -} - -func newFakeInferencePools(fake *FakeApiV1alpha1, namespace string) typedapiv1alpha1.InferencePoolInterface { - return &fakeInferencePools{ - gentype.NewFakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration]( - fake.Fake, - namespace, - v1alpha1.SchemeGroupVersion.WithResource("inferencepools"), - v1alpha1.SchemeGroupVersion.WithKind("InferencePool"), - func() *v1alpha1.InferencePool { return &v1alpha1.InferencePool{} }, - func() *v1alpha1.InferencePoolList { return &v1alpha1.InferencePoolList{} }, - func(dst, src *v1alpha1.InferencePoolList) { dst.ListMeta = src.ListMeta }, - func(list *v1alpha1.InferencePoolList) []*v1alpha1.InferencePool { - return gentype.ToPointerSlice(list.Items) - }, - func(list *v1alpha1.InferencePoolList, items []*v1alpha1.InferencePool) { - list.Items = gentype.FromPointerSlice(items) - }, - ), - fake, - } -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go similarity index 61% rename from client-go/clientset/versioned/typed/api/v1alpha1/api_client.go rename to client-go/clientset/versioned/typed/api/v1alpha2/api_client.go index 84a4a0bb..b011ca92 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go @@ -15,39 +15,39 @@ limitations under the License. */ // Code generated by client-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( http "net/http" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" rest "k8s.io/client-go/rest" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) -type ApiV1alpha1Interface interface { +type InferenceV1alpha2Interface interface { RESTClient() rest.Interface InferenceModelsGetter InferencePoolsGetter } -// ApiV1alpha1Client is used to interact with features provided by the api group. -type ApiV1alpha1Client struct { +// InferenceV1alpha2Client is used to interact with features provided by the inference.networking.x-k8s.io group. 
+type InferenceV1alpha2Client struct { restClient rest.Interface } -func (c *ApiV1alpha1Client) InferenceModels(namespace string) InferenceModelInterface { +func (c *InferenceV1alpha2Client) InferenceModels(namespace string) InferenceModelInterface { return newInferenceModels(c, namespace) } -func (c *ApiV1alpha1Client) InferencePools(namespace string) InferencePoolInterface { +func (c *InferenceV1alpha2Client) InferencePools(namespace string) InferencePoolInterface { return newInferencePools(c, namespace) } -// NewForConfig creates a new ApiV1alpha1Client for the given config. +// NewForConfig creates a new InferenceV1alpha2Client for the given config. // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), // where httpClient was generated with rest.HTTPClientFor(c). -func NewForConfig(c *rest.Config) (*ApiV1alpha1Client, error) { +func NewForConfig(c *rest.Config) (*InferenceV1alpha2Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err @@ -59,9 +59,9 @@ func NewForConfig(c *rest.Config) (*ApiV1alpha1Client, error) { return NewForConfigAndClient(&config, httpClient) } -// NewForConfigAndClient creates a new ApiV1alpha1Client for the given config and http client. +// NewForConfigAndClient creates a new InferenceV1alpha2Client for the given config and http client. // Note the http client provided takes precedence over the configured transport values. -func NewForConfigAndClient(c *rest.Config, h *http.Client) (*ApiV1alpha1Client, error) { +func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha2Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err @@ -70,12 +70,12 @@ func NewForConfigAndClient(c *rest.Config, h *http.Client) (*ApiV1alpha1Client, if err != nil { return nil, err } - return &ApiV1alpha1Client{client}, nil + return &InferenceV1alpha2Client{client}, nil } -// NewForConfigOrDie creates a new ApiV1alpha1Client for the given config and +// NewForConfigOrDie creates a new InferenceV1alpha2Client for the given config and // panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *ApiV1alpha1Client { +func NewForConfigOrDie(c *rest.Config) *InferenceV1alpha2Client { client, err := NewForConfig(c) if err != nil { panic(err) @@ -83,13 +83,13 @@ func NewForConfigOrDie(c *rest.Config) *ApiV1alpha1Client { return client } -// New creates a new ApiV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *ApiV1alpha1Client { - return &ApiV1alpha1Client{c} +// New creates a new InferenceV1alpha2Client for the given RESTClient. +func New(c rest.Interface) *InferenceV1alpha2Client { + return &InferenceV1alpha2Client{c} } func setConfigDefaults(config *rest.Config) error { - gv := apiv1alpha1.SchemeGroupVersion + gv := apiv1alpha2.SchemeGroupVersion config.GroupVersion = &gv config.APIPath = "/apis" config.NegotiatedSerializer = rest.CodecFactoryForGeneratedClient(scheme.Scheme, scheme.Codecs).WithoutConversion() @@ -103,7 +103,7 @@ func setConfigDefaults(config *rest.Config) error { // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. 
-func (c *ApiV1alpha1Client) RESTClient() rest.Interface { +func (c *InferenceV1alpha2Client) RESTClient() rest.Interface { if c == nil { return nil } diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go similarity index 97% rename from client-go/clientset/versioned/typed/api/v1alpha1/doc.go rename to client-go/clientset/versioned/typed/api/v1alpha2/doc.go index 28991e22..2bcba220 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go @@ -16,4 +16,4 @@ limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated typed clients. -package v1alpha1 +package v1alpha2 diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go similarity index 100% rename from client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go rename to client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go similarity index 70% rename from client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go rename to client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go index d5dbc1a8..0296608c 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go @@ -18,26 +18,26 @@ limitations under the License. package fake import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" rest "k8s.io/client-go/rest" testing "k8s.io/client-go/testing" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" ) -type FakeApiV1alpha1 struct { +type FakeInferenceV1alpha2 struct { *testing.Fake } -func (c *FakeApiV1alpha1) InferenceModels(namespace string) v1alpha1.InferenceModelInterface { +func (c *FakeInferenceV1alpha2) InferenceModels(namespace string) v1alpha2.InferenceModelInterface { return newFakeInferenceModels(c, namespace) } -func (c *FakeApiV1alpha1) InferencePools(namespace string) v1alpha1.InferencePoolInterface { +func (c *FakeInferenceV1alpha2) InferencePools(namespace string) v1alpha2.InferencePoolInterface { return newFakeInferencePools(c, namespace) } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. -func (c *FakeApiV1alpha1) RESTClient() rest.Interface { +func (c *FakeInferenceV1alpha2) RESTClient() rest.Interface { var ret *rest.RESTClient return ret } diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go new file mode 100644 index 00000000..2492a557 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go @@ -0,0 +1,52 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + gentype "k8s.io/client-go/gentype" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + typedapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" +) + +// fakeInferenceModels implements InferenceModelInterface +type fakeInferenceModels struct { + *gentype.FakeClientWithListAndApply[*v1alpha2.InferenceModel, *v1alpha2.InferenceModelList, *apiv1alpha2.InferenceModelApplyConfiguration] + Fake *FakeInferenceV1alpha2 +} + +func newFakeInferenceModels(fake *FakeInferenceV1alpha2, namespace string) typedapiv1alpha2.InferenceModelInterface { + return &fakeInferenceModels{ + gentype.NewFakeClientWithListAndApply[*v1alpha2.InferenceModel, *v1alpha2.InferenceModelList, *apiv1alpha2.InferenceModelApplyConfiguration]( + fake.Fake, + namespace, + v1alpha2.SchemeGroupVersion.WithResource("inferencemodels"), + v1alpha2.SchemeGroupVersion.WithKind("InferenceModel"), + func() *v1alpha2.InferenceModel { return &v1alpha2.InferenceModel{} }, + func() *v1alpha2.InferenceModelList { return &v1alpha2.InferenceModelList{} }, + func(dst, src *v1alpha2.InferenceModelList) { dst.ListMeta = src.ListMeta }, + func(list *v1alpha2.InferenceModelList) []*v1alpha2.InferenceModel { + return gentype.ToPointerSlice(list.Items) + }, + func(list *v1alpha2.InferenceModelList, items []*v1alpha2.InferenceModel) { + list.Items = gentype.FromPointerSlice(items) + }, + ), + fake, + } +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go new file mode 100644 index 00000000..64b087dd --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go @@ -0,0 +1,52 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. 
+ +package fake + +import ( + gentype "k8s.io/client-go/gentype" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + typedapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" +) + +// fakeInferencePools implements InferencePoolInterface +type fakeInferencePools struct { + *gentype.FakeClientWithListAndApply[*v1alpha2.InferencePool, *v1alpha2.InferencePoolList, *apiv1alpha2.InferencePoolApplyConfiguration] + Fake *FakeInferenceV1alpha2 +} + +func newFakeInferencePools(fake *FakeInferenceV1alpha2, namespace string) typedapiv1alpha2.InferencePoolInterface { + return &fakeInferencePools{ + gentype.NewFakeClientWithListAndApply[*v1alpha2.InferencePool, *v1alpha2.InferencePoolList, *apiv1alpha2.InferencePoolApplyConfiguration]( + fake.Fake, + namespace, + v1alpha2.SchemeGroupVersion.WithResource("inferencepools"), + v1alpha2.SchemeGroupVersion.WithKind("InferencePool"), + func() *v1alpha2.InferencePool { return &v1alpha2.InferencePool{} }, + func() *v1alpha2.InferencePoolList { return &v1alpha2.InferencePoolList{} }, + func(dst, src *v1alpha2.InferencePoolList) { dst.ListMeta = src.ListMeta }, + func(list *v1alpha2.InferencePoolList) []*v1alpha2.InferencePool { + return gentype.ToPointerSlice(list.Items) + }, + func(list *v1alpha2.InferencePoolList, items []*v1alpha2.InferencePool) { + list.Items = gentype.FromPointerSlice(items) + }, + ), + fake, + } +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go similarity index 97% rename from client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go rename to client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go index 65c88eb1..399789d8 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by client-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 type InferenceModelExpansion interface{} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go similarity index 59% rename from client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go rename to client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go index 1f5315ad..ee0d92c1 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go @@ -15,18 +15,18 @@ limitations under the License. */ // Code generated by client-gen. DO NOT EDIT. 
-package v1alpha1 +package v1alpha2 import ( context "context" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" gentype "k8s.io/client-go/gentype" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + applyconfigurationapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) // InferenceModelsGetter has a method to return a InferenceModelInterface. @@ -37,37 +37,37 @@ type InferenceModelsGetter interface { // InferenceModelInterface has methods to work with InferenceModel resources. type InferenceModelInterface interface { - Create(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.CreateOptions) (*apiv1alpha1.InferenceModel, error) - Update(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha1.InferenceModel, error) + Create(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.CreateOptions) (*apiv1alpha2.InferenceModel, error) + Update(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha2.InferenceModel, error) // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). - UpdateStatus(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha1.InferenceModel, error) + UpdateStatus(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha2.InferenceModel, error) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error - Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha1.InferenceModel, error) - List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha1.InferenceModelList, error) + Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha2.InferenceModel, error) + List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha2.InferenceModelList, error) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha1.InferenceModel, err error) - Apply(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferenceModel, err error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha2.InferenceModel, err error) + Apply(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferenceModel, err error) // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). 
- ApplyStatus(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferenceModel, err error) + ApplyStatus(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferenceModel, err error) InferenceModelExpansion } // inferenceModels implements InferenceModelInterface type inferenceModels struct { - *gentype.ClientWithListAndApply[*apiv1alpha1.InferenceModel, *apiv1alpha1.InferenceModelList, *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration] + *gentype.ClientWithListAndApply[*apiv1alpha2.InferenceModel, *apiv1alpha2.InferenceModelList, *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration] } // newInferenceModels returns a InferenceModels -func newInferenceModels(c *ApiV1alpha1Client, namespace string) *inferenceModels { +func newInferenceModels(c *InferenceV1alpha2Client, namespace string) *inferenceModels { return &inferenceModels{ - gentype.NewClientWithListAndApply[*apiv1alpha1.InferenceModel, *apiv1alpha1.InferenceModelList, *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration]( + gentype.NewClientWithListAndApply[*apiv1alpha2.InferenceModel, *apiv1alpha2.InferenceModelList, *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration]( "inferencemodels", c.RESTClient(), scheme.ParameterCodec, namespace, - func() *apiv1alpha1.InferenceModel { return &apiv1alpha1.InferenceModel{} }, - func() *apiv1alpha1.InferenceModelList { return &apiv1alpha1.InferenceModelList{} }, + func() *apiv1alpha2.InferenceModel { return &apiv1alpha2.InferenceModel{} }, + func() *apiv1alpha2.InferenceModelList { return &apiv1alpha2.InferenceModelList{} }, ), } } diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go similarity index 59% rename from client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go rename to client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go index 46a2b378..8482451e 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go @@ -15,18 +15,18 @@ limitations under the License. */ // Code generated by client-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( context "context" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" gentype "k8s.io/client-go/gentype" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + applyconfigurationapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) // InferencePoolsGetter has a method to return a InferencePoolInterface. @@ -37,37 +37,37 @@ type InferencePoolsGetter interface { // InferencePoolInterface has methods to work with InferencePool resources. 
type InferencePoolInterface interface { - Create(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.CreateOptions) (*apiv1alpha1.InferencePool, error) - Update(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.UpdateOptions) (*apiv1alpha1.InferencePool, error) + Create(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.CreateOptions) (*apiv1alpha2.InferencePool, error) + Update(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.UpdateOptions) (*apiv1alpha2.InferencePool, error) // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). - UpdateStatus(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.UpdateOptions) (*apiv1alpha1.InferencePool, error) + UpdateStatus(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.UpdateOptions) (*apiv1alpha2.InferencePool, error) Delete(ctx context.Context, name string, opts v1.DeleteOptions) error DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error - Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha1.InferencePool, error) - List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha1.InferencePoolList, error) + Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha2.InferencePool, error) + List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha2.InferencePoolList, error) Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha1.InferencePool, err error) - Apply(ctx context.Context, inferencePool *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferencePool, err error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha2.InferencePool, err error) + Apply(ctx context.Context, inferencePool *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferencePool, err error) // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). 
- ApplyStatus(ctx context.Context, inferencePool *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferencePool, err error) + ApplyStatus(ctx context.Context, inferencePool *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferencePool, err error) InferencePoolExpansion } // inferencePools implements InferencePoolInterface type inferencePools struct { - *gentype.ClientWithListAndApply[*apiv1alpha1.InferencePool, *apiv1alpha1.InferencePoolList, *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration] + *gentype.ClientWithListAndApply[*apiv1alpha2.InferencePool, *apiv1alpha2.InferencePoolList, *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration] } // newInferencePools returns a InferencePools -func newInferencePools(c *ApiV1alpha1Client, namespace string) *inferencePools { +func newInferencePools(c *InferenceV1alpha2Client, namespace string) *inferencePools { return &inferencePools{ - gentype.NewClientWithListAndApply[*apiv1alpha1.InferencePool, *apiv1alpha1.InferencePoolList, *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration]( + gentype.NewClientWithListAndApply[*apiv1alpha2.InferencePool, *apiv1alpha2.InferencePoolList, *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration]( "inferencepools", c.RESTClient(), scheme.ParameterCodec, namespace, - func() *apiv1alpha1.InferencePool { return &apiv1alpha1.InferencePool{} }, - func() *apiv1alpha1.InferencePoolList { return &apiv1alpha1.InferencePoolList{} }, + func() *apiv1alpha2.InferencePool { return &apiv1alpha2.InferencePool{} }, + func() *apiv1alpha2.InferencePoolList { return &apiv1alpha2.InferencePoolList{} }, ), } } diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 6ca4f9da..10eef397 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -18,14 +18,14 @@ limitations under the License. package api import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha2" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to each of this group's versions. type Interface interface { - // V1alpha1 provides access to shared informers for resources in V1alpha1. - V1alpha1() v1alpha1.Interface + // V1alpha2 provides access to shared informers for resources in V1alpha2. + V1alpha2() v1alpha2.Interface } type group struct { @@ -39,7 +39,7 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } -// V1alpha1 returns a new v1alpha1.Interface. -func (g *group) V1alpha1() v1alpha1.Interface { - return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) +// V1alpha2 returns a new v1alpha2.Interface. 
+func (g *group) V1alpha2() v1alpha2.Interface { + return v1alpha2.New(g.factory, g.namespace, g.tweakListOptions) } diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go similarity index 76% rename from client-go/informers/externalversions/api/v1alpha1/inferencemodel.go rename to client-go/informers/externalversions/api/v1alpha2/inferencemodel.go index f887ff4a..74f640d1 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go +++ b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go @@ -15,27 +15,27 @@ limitations under the License. */ // Code generated by informer-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( context "context" time "time" - gatewayapiinferenceextensionapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha2" ) // InferenceModelInformer provides access to a shared informer and lister for // InferenceModels. 
type InferenceModelInformer interface { Informer() cache.SharedIndexInformer - Lister() apiv1alpha1.InferenceModelLister + Lister() apiv1alpha2.InferenceModelLister } type inferenceModelInformer struct { @@ -61,16 +61,16 @@ func NewFilteredInferenceModelInformer(client versioned.Interface, namespace str if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferenceModels(namespace).List(context.TODO(), options) + return client.InferenceV1alpha2().InferenceModels(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferenceModels(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha2().InferenceModels(namespace).Watch(context.TODO(), options) }, }, - &gatewayapiinferenceextensionapiv1alpha1.InferenceModel{}, + &gatewayapiinferenceextensionapiv1alpha2.InferenceModel{}, resyncPeriod, indexers, ) @@ -81,9 +81,9 @@ func (f *inferenceModelInformer) defaultInformer(client versioned.Interface, res } func (f *inferenceModelInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha1.InferenceModel{}, f.defaultInformer) + return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha2.InferenceModel{}, f.defaultInformer) } -func (f *inferenceModelInformer) Lister() apiv1alpha1.InferenceModelLister { - return apiv1alpha1.NewInferenceModelLister(f.Informer().GetIndexer()) +func (f *inferenceModelInformer) Lister() apiv1alpha2.InferenceModelLister { + return apiv1alpha2.NewInferenceModelLister(f.Informer().GetIndexer()) } diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go similarity index 76% rename from client-go/informers/externalversions/api/v1alpha1/inferencepool.go rename to client-go/informers/externalversions/api/v1alpha2/inferencepool.go index 2311a025..d04591dd 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go +++ b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go @@ -15,27 +15,27 @@ limitations under the License. */ // Code generated by informer-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( context "context" time "time" - gatewayapiinferenceextensionapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha2" ) // InferencePoolInformer provides access to a shared informer and lister for // InferencePools. 
type InferencePoolInformer interface { Informer() cache.SharedIndexInformer - Lister() apiv1alpha1.InferencePoolLister + Lister() apiv1alpha2.InferencePoolLister } type inferencePoolInformer struct { @@ -61,16 +61,16 @@ func NewFilteredInferencePoolInformer(client versioned.Interface, namespace stri if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferencePools(namespace).List(context.TODO(), options) + return client.InferenceV1alpha2().InferencePools(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferencePools(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha2().InferencePools(namespace).Watch(context.TODO(), options) }, }, - &gatewayapiinferenceextensionapiv1alpha1.InferencePool{}, + &gatewayapiinferenceextensionapiv1alpha2.InferencePool{}, resyncPeriod, indexers, ) @@ -81,9 +81,9 @@ func (f *inferencePoolInformer) defaultInformer(client versioned.Interface, resy } func (f *inferencePoolInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha1.InferencePool{}, f.defaultInformer) + return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha2.InferencePool{}, f.defaultInformer) } -func (f *inferencePoolInformer) Lister() apiv1alpha1.InferencePoolLister { - return apiv1alpha1.NewInferencePoolLister(f.Informer().GetIndexer()) +func (f *inferencePoolInformer) Lister() apiv1alpha2.InferencePoolLister { + return apiv1alpha2.NewInferencePoolLister(f.Informer().GetIndexer()) } diff --git a/client-go/informers/externalversions/api/v1alpha1/interface.go b/client-go/informers/externalversions/api/v1alpha2/interface.go similarity index 92% rename from client-go/informers/externalversions/api/v1alpha1/interface.go rename to client-go/informers/externalversions/api/v1alpha2/interface.go index 9ba07025..9e5c4d9c 100644 --- a/client-go/informers/externalversions/api/v1alpha1/interface.go +++ b/client-go/informers/externalversions/api/v1alpha2/interface.go @@ -15,10 +15,10 @@ limitations under the License. */ // Code generated by informer-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to all the informers in this group version. 
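Reviewer note: the informer-side renames are consumer-visible as well. A short sketch of wiring a shared informer through the renamed accessors, assuming a `versioned.Interface` built as in the clientset example and an arbitrary 30-second resync; the matching `Api()` → `Inference()` factory rename appears in the factory.go and generic.go hunks below. Illustrative only:

```go
package example

import (
	"time"

	"k8s.io/client-go/tools/cache"
	versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned"
	externalversions "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions"
)

// watchPools is a hypothetical helper showing the renamed call chain.
func watchPools(cs versioned.Interface, stop <-chan struct{}) {
	factory := externalversions.NewSharedInformerFactory(cs, 30*time.Second)
	// Previously factory.Api().V1alpha1(); the chain now names the group.
	poolInformer := factory.Inference().V1alpha2().InferencePools().Informer()
	poolInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    func(obj interface{}) { /* react to a new InferencePool */ },
		DeleteFunc: func(obj interface{}) { /* react to a deleted InferencePool */ },
	})
	factory.Start(stop)
	factory.WaitForCacheSync(stop)
	// After sync, reads can be served from the cache via
	// factory.Inference().V1alpha2().InferencePools().Lister().
}
```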
diff --git a/client-go/informers/externalversions/factory.go b/client-go/informers/externalversions/factory.go index 39c96068..c06ea464 100644 --- a/client-go/informers/externalversions/factory.go +++ b/client-go/informers/externalversions/factory.go @@ -22,13 +22,13 @@ import ( sync "sync" time "time" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - api "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + api "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // SharedInformerOption defines the functional option type for SharedInformerFactory. @@ -253,9 +253,9 @@ type SharedInformerFactory interface { // client. InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer - Api() api.Interface + Inference() api.Interface } -func (f *sharedInformerFactory) Api() api.Interface { +func (f *sharedInformerFactory) Inference() api.Interface { return api.New(f, f.namespace, f.tweakListOptions) } diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index a5f15f73..4186b2f6 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -20,9 +20,9 @@ package externalversions import ( fmt "fmt" - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // GenericInformer is type of SharedIndexInformer which will locate and delegate to other @@ -51,11 +51,11 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=api, Version=v1alpha1 - case v1alpha1.SchemeGroupVersion.WithResource("inferencemodels"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Api().V1alpha1().InferenceModels().Informer()}, nil - case v1alpha1.SchemeGroupVersion.WithResource("inferencepools"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Api().V1alpha1().InferencePools().Informer()}, nil + // Group=inference.networking.x-k8s.io, Version=v1alpha2 + case v1alpha2.SchemeGroupVersion.WithResource("inferencemodels"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha2().InferenceModels().Informer()}, nil + case v1alpha2.SchemeGroupVersion.WithResource("inferencepools"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha2().InferencePools().Informer()}, nil } diff --git a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go 
b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index 488aca6f..5b70862a 100644 --- a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -20,10 +20,10 @@ package internalinterfaces import ( time "time" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" cache "k8s.io/client-go/tools/cache" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" ) // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. diff --git a/client-go/listers/api/v1alpha1/expansion_generated.go b/client-go/listers/api/v1alpha2/expansion_generated.go similarity index 98% rename from client-go/listers/api/v1alpha1/expansion_generated.go rename to client-go/listers/api/v1alpha2/expansion_generated.go index ffbe67cf..204c375b 100644 --- a/client-go/listers/api/v1alpha1/expansion_generated.go +++ b/client-go/listers/api/v1alpha2/expansion_generated.go @@ -15,7 +15,7 @@ limitations under the License. */ // Code generated by lister-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 // InferenceModelListerExpansion allows custom methods to be added to // InferenceModelLister. diff --git a/client-go/listers/api/v1alpha1/inferencemodel.go b/client-go/listers/api/v1alpha2/inferencemodel.go similarity index 80% rename from client-go/listers/api/v1alpha1/inferencemodel.go rename to client-go/listers/api/v1alpha2/inferencemodel.go index b0c33b61..ce83b85f 100644 --- a/client-go/listers/api/v1alpha1/inferencemodel.go +++ b/client-go/listers/api/v1alpha2/inferencemodel.go @@ -15,13 +15,13 @@ limitations under the License. */ // Code generated by lister-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" labels "k8s.io/apimachinery/pkg/labels" listers "k8s.io/client-go/listers" cache "k8s.io/client-go/tools/cache" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferenceModelLister helps list InferenceModels. @@ -29,7 +29,7 @@ import ( type InferenceModelLister interface { // List lists all InferenceModels in the indexer. // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferenceModel, err error) + List(selector labels.Selector) (ret []*apiv1alpha2.InferenceModel, err error) // InferenceModels returns an object that can list and get InferenceModels. InferenceModels(namespace string) InferenceModelNamespaceLister InferenceModelListerExpansion @@ -37,17 +37,17 @@ type InferenceModelLister interface { // inferenceModelLister implements the InferenceModelLister interface. type inferenceModelLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferenceModel] + listers.ResourceIndexer[*apiv1alpha2.InferenceModel] } // NewInferenceModelLister returns a new InferenceModelLister. func NewInferenceModelLister(indexer cache.Indexer) InferenceModelLister { - return &inferenceModelLister{listers.New[*apiv1alpha1.InferenceModel](indexer, apiv1alpha1.Resource("inferencemodel"))} + return &inferenceModelLister{listers.New[*apiv1alpha2.InferenceModel](indexer, apiv1alpha2.Resource("inferencemodel"))} } // InferenceModels returns an object that can list and get InferenceModels. 
func (s *inferenceModelLister) InferenceModels(namespace string) InferenceModelNamespaceLister { - return inferenceModelNamespaceLister{listers.NewNamespaced[*apiv1alpha1.InferenceModel](s.ResourceIndexer, namespace)} + return inferenceModelNamespaceLister{listers.NewNamespaced[*apiv1alpha2.InferenceModel](s.ResourceIndexer, namespace)} } // InferenceModelNamespaceLister helps list and get InferenceModels. @@ -55,15 +55,15 @@ func (s *inferenceModelLister) InferenceModels(namespace string) InferenceModelN type InferenceModelNamespaceLister interface { // List lists all InferenceModels in the indexer for a given namespace. // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferenceModel, err error) + List(selector labels.Selector) (ret []*apiv1alpha2.InferenceModel, err error) // Get retrieves the InferenceModel from the indexer for a given namespace and name. // Objects returned here must be treated as read-only. - Get(name string) (*apiv1alpha1.InferenceModel, error) + Get(name string) (*apiv1alpha2.InferenceModel, error) InferenceModelNamespaceListerExpansion } // inferenceModelNamespaceLister implements the InferenceModelNamespaceLister // interface. type inferenceModelNamespaceLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferenceModel] + listers.ResourceIndexer[*apiv1alpha2.InferenceModel] } diff --git a/client-go/listers/api/v1alpha1/inferencepool.go b/client-go/listers/api/v1alpha2/inferencepool.go similarity index 80% rename from client-go/listers/api/v1alpha1/inferencepool.go rename to client-go/listers/api/v1alpha2/inferencepool.go index 0b0c1d6e..c7e49a1e 100644 --- a/client-go/listers/api/v1alpha1/inferencepool.go +++ b/client-go/listers/api/v1alpha2/inferencepool.go @@ -15,13 +15,13 @@ limitations under the License. */ // Code generated by lister-gen. DO NOT EDIT. -package v1alpha1 +package v1alpha2 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" labels "k8s.io/apimachinery/pkg/labels" listers "k8s.io/client-go/listers" cache "k8s.io/client-go/tools/cache" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferencePoolLister helps list InferencePools. @@ -29,7 +29,7 @@ import ( type InferencePoolLister interface { // List lists all InferencePools in the indexer. // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferencePool, err error) + List(selector labels.Selector) (ret []*apiv1alpha2.InferencePool, err error) // InferencePools returns an object that can list and get InferencePools. InferencePools(namespace string) InferencePoolNamespaceLister InferencePoolListerExpansion @@ -37,17 +37,17 @@ type InferencePoolLister interface { // inferencePoolLister implements the InferencePoolLister interface. type inferencePoolLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferencePool] + listers.ResourceIndexer[*apiv1alpha2.InferencePool] } // NewInferencePoolLister returns a new InferencePoolLister. func NewInferencePoolLister(indexer cache.Indexer) InferencePoolLister { - return &inferencePoolLister{listers.New[*apiv1alpha1.InferencePool](indexer, apiv1alpha1.Resource("inferencepool"))} + return &inferencePoolLister{listers.New[*apiv1alpha2.InferencePool](indexer, apiv1alpha2.Resource("inferencepool"))} } // InferencePools returns an object that can list and get InferencePools. 
func (s *inferencePoolLister) InferencePools(namespace string) InferencePoolNamespaceLister { - return inferencePoolNamespaceLister{listers.NewNamespaced[*apiv1alpha1.InferencePool](s.ResourceIndexer, namespace)} + return inferencePoolNamespaceLister{listers.NewNamespaced[*apiv1alpha2.InferencePool](s.ResourceIndexer, namespace)} } // InferencePoolNamespaceLister helps list and get InferencePools. @@ -55,15 +55,15 @@ func (s *inferencePoolLister) InferencePools(namespace string) InferencePoolName type InferencePoolNamespaceLister interface { // List lists all InferencePools in the indexer for a given namespace. // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferencePool, err error) + List(selector labels.Selector) (ret []*apiv1alpha2.InferencePool, err error) // Get retrieves the InferencePool from the indexer for a given namespace and name. // Objects returned here must be treated as read-only. - Get(name string) (*apiv1alpha1.InferencePool, error) + Get(name string) (*apiv1alpha2.InferencePool, error) InferencePoolNamespaceListerExpansion } // inferencePoolNamespaceLister implements the InferencePoolNamespaceLister // interface. type inferencePoolNamespaceLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferencePool] + listers.ResourceIndexer[*apiv1alpha2.InferencePool] } diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 2da147f4..3a8e008f 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,6 +12,22 @@ steps: - GIT_TAG=$_GIT_TAG - EXTRA_TAG=$_PULL_BASE_REF - DOCKER_BUILDX_CMD=/buildx-entrypoint + - name: gcr.io/k8s-testimages/gcb-docker-gcloud:v20220830-45cbff55bc + entrypoint: make + args: + - syncer-image-push + env: + - GIT_TAG=$_GIT_TAG + - EXTRA_TAG=$_PULL_BASE_REF + - DOCKER_BUILDX_CMD=/buildx-entrypoint + - name: gcr.io/k8s-testimages/gcb-docker-gcloud:v20220830-45cbff55bc + entrypoint: make + args: + - bbr-image-push + env: + - GIT_TAG=$_GIT_TAG + - EXTRA_TAG=$_PULL_BASE_REF + - DOCKER_BUILDX_CMD=/buildx-entrypoint substitutions: # _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and # can be used as a substitution diff --git a/cmd/body-based-routing/health.go b/cmd/body-based-routing/health.go new file mode 100644 index 00000000..7d1b5fd5 --- /dev/null +++ b/cmd/body-based-routing/health.go @@ -0,0 +1,40 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "context" + + "github.com/go-logr/logr" + "google.golang.org/grpc/codes" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/status" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +type healthServer struct { + logger logr.Logger +} + +func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { + s.logger.V(logutil.VERBOSE).Info("gRPC health check serving", "service", in.Service) + return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil +} + +func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { + return status.Error(codes.Unimplemented, "Watch is not implemented") +} diff --git a/cmd/body-based-routing/main.go b/cmd/body-based-routing/main.go new file mode 100644 index 00000000..3f586788 --- /dev/null +++ b/cmd/body-based-routing/main.go @@ -0,0 +1,137 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "os" + + "github.com/go-logr/logr" + uberzap "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "google.golang.org/grpc" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +var ( + grpcPort = flag.Int( + "grpcPort", + runserver.DefaultGrpcPort, + "The gRPC port used for communicating with Envoy proxy") + grpcHealthPort = flag.Int( + "grpcHealthPort", + 9003, + "The port used for gRPC liveness and readiness probes") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + + setupLog = ctrl.Log.WithName("setup") +) + +func main() { + if err := run(); err != nil { + os.Exit(1) + } +} + +func run() error { + opts := zap.Options{Development: true} + opts.BindFlags(flag.CommandLine) + flag.Parse() + initLogging(&opts) + + // Print all flag values + flags := make(map[string]any) + flag.VisitAll(func(f *flag.Flag) { + flags[f.Name] = f.Value + }) + setupLog.Info("Flags processed", "flags", flags) + + // Init runtime. + cfg, err := ctrl.GetConfig() + if err != nil { + setupLog.Error(err, "Failed to get rest config") + return err + } + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{}) + if err != nil { + setupLog.Error(err, "Failed to create manager", "config", cfg) + return err + } + + ctx := ctrl.SetupSignalHandler() + + // Setup runner. + serverRunner := &runserver.ExtProcServerRunner{GrpcPort: *grpcPort} + + // Register health server. + if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), *grpcHealthPort); err != nil { + return err + } + + // Register ext-proc server. 
+ if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { + setupLog.Error(err, "Failed to register ext-proc gRPC server") + return err + } + + // Start the manager. This blocks until a signal is received. + setupLog.Info("Manager starting") + if err := mgr.Start(ctx); err != nil { + setupLog.Error(err, "Error starting manager") + return err + } + setupLog.Info("Manager terminated") + return nil +} + +// registerHealthServer adds the Health gRPC server as a Runnable to the given manager. +func registerHealthServer(mgr manager.Manager, logger logr.Logger, port int) error { + srv := grpc.NewServer() + healthPb.RegisterHealthServer(srv, &healthServer{ + logger: logger, + }) + if err := mgr.Add( + runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { + setupLog.Error(err, "Failed to register health server") + return err + } + return nil +} + +func initLogging(opts *zap.Options) { + useV := true + flag.Visit(func(f *flag.Flag) { + if f.Name == "zap-log-level" { + useV = false + } + }) + if useV { + // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level + lvl := -1 * (*logVerbosity) + opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) + } + + logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) + ctrl.SetLogger(logger) +} diff --git a/cmd/epp/health.go b/cmd/epp/health.go new file mode 100644 index 00000000..335c0849 --- /dev/null +++ b/cmd/epp/health.go @@ -0,0 +1,46 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + + "github.com/go-logr/logr" + "google.golang.org/grpc/codes" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/status" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +type healthServer struct { + logger logr.Logger + datastore datastore.Datastore +} + +func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { + if !s.datastore.PoolHasSynced() { + s.logger.V(logutil.VERBOSE).Info("gRPC health check not serving", "service", in.Service) + return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil + } + s.logger.V(logutil.VERBOSE).Info("gRPC health check serving", "service", in.Service) + return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil +} + +func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { + return status.Error(codes.Unimplemented, "Watch is not implemented") +} diff --git a/cmd/epp/main.go b/cmd/epp/main.go new file mode 100644 index 00000000..e1cd5015 --- /dev/null +++ b/cmd/epp/main.go @@ -0,0 +1,284 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "fmt" + "net" + "net/http" + "os" + "strconv" + + "github.com/go-logr/logr" + "github.com/prometheus/client_golang/prometheus/promhttp" + uberzap "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "google.golang.org/grpc" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + "k8s.io/client-go/rest" + "k8s.io/component-base/metrics/legacyregistry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + defaultMetricsEndpoint = "/metrics" +) + +var ( + grpcPort = flag.Int( + "grpcPort", + runserver.DefaultGrpcPort, + "The gRPC port used for communicating with Envoy proxy") + grpcHealthPort = flag.Int( + "grpcHealthPort", + 9003, + "The port used for gRPC liveness and readiness probes") + metricsPort = flag.Int( + "metricsPort", 9090, "The metrics port") + destinationEndpointHintKey = flag.String( + "destinationEndpointHintKey", + runserver.DefaultDestinationEndpointHintKey, + "Header and response metadata key used by Envoy to route to the appropriate pod. This must match Envoy configuration.") + destinationEndpointHintMetadataNamespace = flag.String( + "DestinationEndpointHintMetadataNamespace", + runserver.DefaultDestinationEndpointHintMetadataNamespace, + "The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the "+ "target endpoint. If not set, then an outer namespace struct should not be created.") + poolName = flag.String( + "poolName", + runserver.DefaultPoolName, + "Name of the InferencePool this Endpoint Picker is associated with.") + poolNamespace = flag.String( + "poolNamespace", + runserver.DefaultPoolNamespace, + "Namespace of the InferencePool this Endpoint Picker is associated with.") + refreshMetricsInterval = flag.Duration( + "refreshMetricsInterval", + runserver.DefaultRefreshMetricsInterval, + "interval to refresh metrics") + refreshPrometheusMetricsInterval = flag.Duration( + "refreshPrometheusMetricsInterval", + runserver.DefaultRefreshPrometheusMetricsInterval, + "interval to flush prometheus metrics") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + secureServing = flag.Bool( + "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") + certPath = flag.String( + "certPath", "", "The path to the certificate for secure serving.
The certificate and private key files "+ + "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ + "then a self-signed certificate is used.") + + setupLog = ctrl.Log.WithName("setup") +) + +func main() { + if err := run(); err != nil { + os.Exit(1) + } +} + +func run() error { + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) + flag.Parse() + initLogging(&opts) + + useStreamingServer, err := strconv.ParseBool(os.Getenv("USE_STREAMING")) + if err != nil { + setupLog.Error(err, "Failed to parse env var USE_STREAMING, defaulting to false") + } + + // Validate flags + if err := validateFlags(); err != nil { + setupLog.Error(err, "Failed to validate flags") + return err + } + + // Print all flag values + flags := make(map[string]any) + flag.VisitAll(func(f *flag.Flag) { + flags[f.Name] = f.Value + }) + setupLog.Info("Flags processed", "flags", flags) + + // Init runtime. + cfg, err := ctrl.GetConfig() + if err != nil { + setupLog.Error(err, "Failed to get rest config") + return err + } + + mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg) + if err != nil { + setupLog.Error(err, "Failed to create controller manager") + return err + } + + ctx := ctrl.SetupSignalHandler() + + pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval) + // Setup runner. + datastore := datastore.NewDatastore(ctx, pmf) + serverRunner := &runserver.ExtProcServerRunner{ + GrpcPort: *grpcPort, + DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, + DestinationEndpointHintKey: *destinationEndpointHintKey, + PoolName: *poolName, + PoolNamespace: *poolNamespace, + Datastore: datastore, + SecureServing: *secureServing, + CertPath: *certPath, + UseStreaming: useStreamingServer, + RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, + } + if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { + setupLog.Error(err, "Failed to setup ext-proc controllers") + return err + } + + // Register health server. + if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil { + return err + } + + // Register ext-proc server. + if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { + setupLog.Error(err, "Failed to register ext-proc gRPC server") + return err + } + + // Register metrics handler. + if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { + return err + } + + // Start the manager. This blocks until a signal is received. + setupLog.Info("Controller manager starting") + if err := mgr.Start(ctx); err != nil { + setupLog.Error(err, "Error starting controller manager") + return err + } + setupLog.Info("Controller manager terminated") + return nil +} + +func initLogging(opts *zap.Options) { + // Unless -zap-log-level is explicitly set, use -v + useV := true + flag.Visit(func(f *flag.Flag) { + if f.Name == "zap-log-level" { + useV = false + } + }) + if useV { + // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level + lvl := -1 * (*logVerbosity) + opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) + } + + logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) + ctrl.SetLogger(logger) +} + +// registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 
+func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error { + srv := grpc.NewServer() + healthPb.RegisterHealthServer(srv, &healthServer{ + logger: logger, + datastore: ds, + }) + if err := mgr.Add( + runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { + setupLog.Error(err, "Failed to register health server") + return err + } + return nil +} + +// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. +func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { + metrics.Register() + + // Init HTTP server. + h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) + if err != nil { + return err + } + + mux := http.NewServeMux() + mux.Handle(defaultMetricsEndpoint, h) + + srv := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + + if err := mgr.Add(&manager.Server{ + Name: "metrics", + Server: srv, + }); err != nil { + setupLog.Error(err, "Failed to register metrics HTTP handler") + return err + } + return nil +} + +func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { + h := promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + promhttp.HandlerOpts{}, + ) + httpClient, err := rest.HTTPClientFor(cfg) + if err != nil { + setupLog.Error(err, "Failed to create http client for metrics auth") + return nil, err + } + + filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) + if err != nil { + setupLog.Error(err, "Failed to create metrics filter for auth") + return nil, err + } + metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", defaultMetricsEndpoint) + metricsAuthHandler, err := filter(metricsLogger, h) + if err != nil { + setupLog.Error(err, "Failed to create metrics auth handler") + return nil, err + } + return metricsAuthHandler, nil +} + +func validateFlags() error { + if *poolName == "" { + return fmt.Errorf("required %q flag not set", "poolName") + } + + return nil +} diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index bca19605..63c7fb51 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -14,7 +14,7 @@ spec: singular: inferencemodel scope: Namespaced versions: - - name: v1alpha1 + - name: v1alpha2 schema: openAPIV3Schema: description: InferenceModel is the Schema for the InferenceModels API. @@ -82,6 +82,9 @@ spec: an error will be returned specifying that no valid target model is found. maxLength: 256 type: string + x-kubernetes-validations: + - message: modelName is immutable + rule: self == oldSelf poolRef: description: PoolRef is a reference to the inference pool, the pool must exist in the same namespace. @@ -143,7 +146,7 @@ spec: Conversely weights are optional, so long as ALL targetModels do not specify a weight. 
format: int32 maximum: 1000000 - minimum: 0 + minimum: 1 type: integer required: - name diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 9e6473b9..8386db82 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -14,7 +14,7 @@ spec: singular: inferencepool scope: Namespaced versions: - - name: v1alpha1 + - name: v1alpha2 schema: openAPIV3Schema: description: InferencePool is the Schema for the InferencePools API. @@ -56,7 +56,9 @@ spec: default: "" description: |- Group is the group of the referent. - When unspecified or empty string, core API group is inferred. + The default value is "", representing the Core API group. + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string kind: default: Service @@ -71,14 +73,20 @@ spec: terms of conformance. They also may not be safe to forward to (see CVE-2021-25740 for more information). Implementations MUST NOT support ExternalName Services. + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ type: string name: description: Name is the name of the referent. + maxLength: 253 + minLength: 1 type: string - targetPortNumber: + portNumber: description: |- - The port number on the pods running the extension. When unspecified, implementations SHOULD infer a - default value of 9002 when the Kind is Service. + The port number on the service running the extension. When unspecified, + implementations SHOULD infer a default value of 9002 when the Kind is + Service. format: int32 maximum: 65535 minimum: 1 @@ -109,6 +117,8 @@ spec: that should be included in the InferencePool. In some cases, implementations may translate this field to a Service selector, so this matches the simple map used for Service selectors instead of the full Kubernetes LabelSelector type. + If specified, it will be applied to match the model server pods in the same namespace as the InferencePool. + Cross namespace selector is not supported. type: object targetPortNumber: description: |- @@ -126,78 +136,141 @@ spec: status: description: InferencePoolStatus defines the observed state of InferencePool properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready + parent: description: |- - Conditions track the state of the InferencePool. + Parents is a list of parent resources (usually Gateways) that are + associated with the route, and the status of the InferencePool with respect to + each parent. - Known condition types are: - - * "Ready" + A maximum of 32 Gateways will be represented in this list. An empty list + means the route has not been attached to any Gateway. items: - description: Condition contains details for one aspect of the current - state of this API Resource. + description: PoolStatus defines the observed state of InferencePool + from a Gateway. properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition.
- This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Accepted description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string + Conditions track the state of the InferencePool. + + Known condition types are: + + * "Accepted" + * "ResolvedRefs" + items: + description: Condition contains details for one aspect of + the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + parentRef: + description: GatewayRef indicates the Gateway that observed + the state of the InferencePool. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent.
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic required: - - lastTransitionTime - - message - - reason - - status - - type + - parentRef type: object - maxItems: 8 + maxItems: 32 type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map type: object type: object served: true diff --git a/pkg/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml similarity index 88% rename from pkg/manifests/ext_proc.yaml rename to config/manifests/ext_proc.yaml index b9b860dc..15bebb6a 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -40,15 +40,15 @@ roleRef: kind: ClusterRole name: pod-read --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: labels: - name: vllm-llama2-7b-pool + name: my-pool spec: targetPortNumber: 8000 selector: - app: vllm-llama2-7b-pool + app: my-pool extensionRef: name: inference-gateway-ext-proc --- @@ -71,16 +71,20 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + image: registry.k8s.io/gateway-api-inference-extension/epp:v0.2.0 + imagePullPolicy: IfNotPresent args: - -poolName - - "vllm-llama2-7b-pool" + - "my-pool" - -v - - "3" + - "4" - -grpcPort - "9002" - -grpcHealthPort - "9003" + env: + - name: USE_STREAMING + value: "false" ports: - containerPort: 9002 - containerPort: 9003 diff --git a/pkg/manifests/gateway/enable_patch_policy.yaml b/config/manifests/gateway/enable_patch_policy.yaml similarity index 100% rename from pkg/manifests/gateway/enable_patch_policy.yaml rename to config/manifests/gateway/enable_patch_policy.yaml diff --git a/pkg/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml similarity index 96% rename from pkg/manifests/gateway/extension_policy.yaml rename to config/manifests/gateway/extension_policy.yaml index a8105d6d..14b7b123 100644 --- a/pkg/manifests/gateway/extension_policy.yaml +++ b/config/manifests/gateway/extension_policy.yaml @@ -11,6 +11,7 @@ spec: name: inference-gateway-ext-proc port: 9002 processingMode: + allowModeOverride: true request: body: Buffered response: diff --git a/pkg/manifests/gateway/gateway.yaml b/config/manifests/gateway/gateway.yaml similarity index 100% rename from pkg/manifests/gateway/gateway.yaml rename to config/manifests/gateway/gateway.yaml diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml new file mode 100644 index 00000000..3c36ed7a --- /dev/null +++ b/config/manifests/gateway/patch_policy.yaml @@ -0,0 +1,88 @@ +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyPatchPolicy +metadata: + name: custom-response-patch-policy + namespace: default +spec: + targetRef: + group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + type: JSONPatch + jsonPatches: + # Necessary to create a cluster of the type: ORIGINAL_DST to allow for + # direct pod scheduling. Which is heavily utilized in our scheduling. + # Specifically the field `original_dst_lb_config` allows us to enable + # `use_http_header` and `http_header_name`. 
# Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: original_destination_cluster + operation: + op: add + path: "" + value: + name: original_destination_cluster + type: ORIGINAL_DST + original_dst_lb_config: + use_http_header: true + http_header_name: "x-gateway-destination-endpoint" + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + dns_lookup_family: V4_ONLY + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + + # This ensures that envoy accepts untrusted certificates. We tried to explicitly + # set TrustChainVerification to ACCEPT_UNTRUSTED, but that actually didn't work + # and what worked is setting the common_tls_context to empty. + - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0" + operation: + op: add + path: "/transport_socket" + value: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" + common_tls_context: {} + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" + name: default/inference-gateway/llm-gw + operation: + op: replace + path: "/virtual_hosts/0/routes/0/route/cluster" + value: original_destination_cluster # Uncomment the below to enable full duplex streaming # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" # name: "default/inference-gateway/llm-gw" # operation: # op: add # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode" # value: FULL_DUPLEX_STREAMED # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" # name: "default/inference-gateway/llm-gw" # operation: # op: add # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode" # value: SEND # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" # name: "default/inference-gateway/llm-gw" # operation: # op: add # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode" # value: FULL_DUPLEX_STREAMED # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" # name: "default/inference-gateway/llm-gw" # operation: # op: replace # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode" # value: SEND # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" # name: "default/inference-gateway/llm-gw" # operation: # op: replace # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode" # value: SEND diff --git a/pkg/manifests/gateway/traffic_policy.yaml b/config/manifests/gateway/traffic_policy.yaml similarity index 100% rename from pkg/manifests/gateway/traffic_policy.yaml rename to config/manifests/gateway/traffic_policy.yaml diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml new file mode 100644 index 00000000..12fb00b7 --- /dev/null +++ b/config/manifests/inferencemodel.yaml @@ -0,0 +1,23 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-sample +spec: + modelName: tweet-summary + criticality: Critical + poolRef: + name: my-pool +
targetModels: + - name: tweet-summary-1 + weight: 100 + +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-base-model +spec: + modelName: meta-llama/Llama-2-7b-hf + criticality: Critical + poolRef: + name: my-pool diff --git a/pkg/manifests/vllm/deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml similarity index 58% rename from pkg/manifests/vllm/deployment.yaml rename to config/manifests/vllm/cpu-deployment.yaml index 1f5073e9..a0925c83 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -1,47 +1,31 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm-llama2-7b-pool + name: my-pool spec: replicas: 3 selector: matchLabels: - app: vllm-llama2-7b-pool + app: my-pool template: metadata: labels: - app: vllm-llama2-7b-pool + app: my-pool spec: containers: - name: lora - image: "vllm/vllm-openai:latest" + image: "seedjeffwan/vllm-cpu-env:bb392af4-20250203" imagePullPolicy: Always command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: - "--model" - - "meta-llama/Llama-2-7b-hf" - - "--tensor-parallel-size" - - "1" + - "Qwen/Qwen2.5-1.5B-Instruct" - "--port" - "8000" - "--enable-lora" - - "--max-loras" - - "4" - - "--max-cpu-loras" - - "12" - "--lora-modules" - - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/" - - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403" - - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0' - - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1' - - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2' - - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3' - - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4' - - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0' - - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1' - - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2' - - 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3' - - 'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4' + - '{"name": "tweet-summary-0", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' + - '{"name": "tweet-summary-1", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' env: - name: PORT value: "8000" @@ -50,6 +34,8 @@ spec: secretKeyRef: name: hf-token key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" ports: - containerPort: 8000 name: http @@ -74,11 +60,6 @@ spec: periodSeconds: 5 successThreshold: 1 timeoutSeconds: 1 - resources: - limits: - nvidia.com/gpu: 1 - requests: - nvidia.com/gpu: 1 volumeMounts: - mountPath: /data name: data @@ -93,11 +74,9 @@ spec: args: - ./pull_adapters.py - --adapter - - yard1/llama-2-7b-sql-lora-test - - --adapter - - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora - --duplicate-count - - "5" + - "4" env: - name: HF_TOKEN valueFrom: diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml new file mode 100644 index 00000000..a4ccfc0b --- /dev/null +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -0,0 +1,123 @@ 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-pool +spec: + replicas: 3 + selector: + matchLabels: + app: my-pool + template: + metadata: + labels: + app: my-pool + spec: + containers: + - name: lora + image: "vllm/vllm-openai:v0.7.3" + imagePullPolicy: IfNotPresent + command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-2-7b-hf" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--enable-lora" + - "--max-loras" + - "4" + - "--max-cpu-loras" + - "12" + - "--lora-modules" + - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + ports: + - containerPort: 8000 + name: http + protocol: TCP + livenessProbe: + failureThreshold: 240 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 600 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /data + name: data + - mountPath: /dev/shm + name: shm + - name: adapters + mountPath: "/adapters" + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths + - name: config-volume + mountPath: /config + restartPolicy: Always + schedulerName: default-scheduler + terminationGracePeriodSeconds: 30 + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama2-7b-adapters +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama2-7b-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama2-7b + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + diff --git a/docs/dev.md b/docs/dev.md index efd2023a..d223ed6a 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -1,27 +1,33 @@ - ## Logging +We use `logr.Logger` interface for logging everywhere. +The logger instance is loaded from `context.Context` or passed around as an argument directly. +This is aligned with contextual logging as explained in [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md). + +In other words, we explicitly don't use `klog` global logging calls. +Using `klog` log value helpers like `klog.KObj` is just fine. + ### Change log verbosity -We use the `k8s.io/klog/v2` package to manage logging. 
We generally follow the [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md), which states "the practical default level is V(2). Developers and QE environments may wish to run at V(3) or V(4)". -To configure logging verbosity, specify the `v` flag such as `--v=2`. +To configure logging verbosity, specify the `v` flag such as `--v=2`. ### Add logs The [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md) has the following definitions: -* `klog.V(0).InfoS` = `klog.InfoS` - Generally useful for this to **always** be visible to a cluster operator -* `klog.V(1).InfoS` - A reasonable default log level if you don't want verbosity. -* `klog.V(2).InfoS` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems. -* `klog.V(3).InfoS` - Extended information about changes -* `klog.V(4).InfoS` - Debug level verbosity -* `klog.V(5).InfoS` - Trace level verbosity +- `logger.V(0).Info` = `logger.Info` - Generally useful for this to **always** be visible to a cluster operator +- `logger.V(1).Info` - A reasonable default log level if you don't want verbosity. +- `logger.V(2).Info` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems. +- `logger.V(3).Info` - Extended information about changes +- `logger.V(4).Info` - Debug level verbosity +- `logger.V(5).Info` - Trace level verbosity We choose to simplify to the following 3 common levels. + ``` const( DEFAULT=2 @@ -31,36 +37,48 @@ const( ) ``` -The guidelines are written in the context of a k8s controller. Our [ext-proc](../pkg/ext-proc/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows: +The guidelines are written in the context of a k8s controller. Our [epp](../pkg/epp/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows: + +1. The server startup process and configuration. -1. The server startup process and configuration. - * `klog.InfoS` Logging at the `V(0)` verbosity level is generally welcome here as this is only logged once at startup, and provides useful info for debugging. + - `logger.Info` Logging at the `V(0)` verbosity level is generally welcome here as this is only logged once at startup, and provides useful info for debugging. 2. Reconciler loops. The reconciler loops watch for CR changes such as the `InferenceModel` CR. And given changes in these CRs significantly affect the behavior of the extension, we recommend using v=1 verbosity level as default, and sparsely use higher verbosity levels. - - * `klog.V(DEFAULT).InfoS` - * Default log level in the reconcilers. - * Information about config (listening on X, watching Y) - * Errors that repeat frequently that relate to conditions that can be corrected (e.g., inference model not initialized yet) - * System state changing (adding/removing objects in the data store) - * `V(VERBOSE)` and above: Use your best judgement. + + - `logger.V(DEFAULT)` + - Default log level in the reconcilers. 
+ - Information about config (listening on X, watching Y) + - Errors that repeat frequently that relate to conditions that can be corrected (e.g., inference model not initialized yet) + - System state changing (adding/removing objects in the data store) + - `logger.V(VERBOSE)` and above: Use your best judgement. 3. Inference request handling. These requests are expected to be much higher volume than the control flow in the reconcilers and therefore we should be mindful of log spamming. We recommend using v=2 to log important info about a request, such as the HTTP response code, and higher verbosity levels for less important info. - * `klog.V(DEFAULT).InfoS` - * Logging the status code of an HTTP request - * Important decision making such as picking the target model, target pod - * `klog.V(VERBOSE).InfoS` - * Detailed request scheduling algorithm operations, such as running the filtering logic - * `V(DEBUG)` and above: Use your best judgement. + - `logger.V(DEFAULT)` + - Logging the status code of an HTTP request + - Important decision making such as picking the target model, target pod + - `logger.V(VERBOSE)` + - Detailed request scheduling algorithm operations, such as running the filtering logic + - `logger.V(DEBUG)` and above: Use your best judgement. 4. Metric scraping loops. These loops run at a very high frequency, and logs can be very spammy if not handled properly. - * `klog.V(TRACE).InfoS` - * Transient errors/warnings, such as failure to get response from a pod. - * Important state changes, such as updating a metric. -5. Misc + - `logger.V(TRACE)` + - Transient errors/warnings, such as failure to get response from a pod. + - Important state changes, such as updating a metric. + +5. Misc 1. Periodic (every 5s) debug loop which prints the current pods and metrics. - * `klog.WarningS` If the metrics are not fresh enough, which indicates an error occurred during the metric scraping loop. - * `klog.V(DEBUG).InfoS` - * This is very important to debug the request scheduling algorithm, and yet not spammy compared to the metric scraping loop logs. \ No newline at end of file + - `logger.V(DEFAULT).Error` If the metrics are not fresh enough, which indicates an error occurred during the metric scraping loop. + - `logger.V(DEBUG)` + - This is very important to debug the request scheduling algorithm, and yet not spammy compared to the metric scraping loop logs. + +### Passing Logger Around + +You can pass around a `context.Context` that contains a logger or a `logr.Logger` instance directly. +You need to make the call which one to use. Passing a `context.Context` is more standard; +on the other hand, you then need to call `log.FromContext` everywhere. + +As `logger.V` calls are cumulative, i.e. `logger.V(2).V(3)` results in `logger.V(5)`, +a logger should be passed around with no verbosity level set so that `logger.V(DEFAULT)` +actually uses `DEFAULT` verbosity level.
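To make the convention in the dev.md section above concrete, here is a minimal Go sketch, not part of this diff; the `handleRequest` function and the `defaultLevel` constant are invented for illustration. It shows a logger stored in a `context.Context` with no verbosity level pre-applied, so that `V(DEFAULT)` at the call site really does mean `DEFAULT`:

```go
package main

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/log"
)

// defaultLevel mirrors the DEFAULT constant from the guidelines above.
const defaultLevel = 2

func handleRequest(ctx context.Context) {
	// Load the logger from the context instead of calling a klog global.
	logger := log.FromContext(ctx)
	logger.V(defaultLevel).Info("handling request", "targetModel", "tweet-summary")
}

func main() {
	// Create a named logger with no verbosity applied, then carry it in the context.
	logger := ctrl.Log.WithName("example")
	ctx := log.IntoContext(context.Background(), logger)
	handleRequest(ctx)
}
```

Note that `ctrl.Log` only emits once a logger has been set via `ctrl.SetLogger`, as `initLogging` does in `cmd/epp/main.go` above.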
diff --git a/docs/endpoint-picker.svg b/docs/endpoint-picker.svg new file mode 100644 index 00000000..3ec8eed4 --- /dev/null +++ b/docs/endpoint-picker.svg @@ -0,0 +1,3 @@ +Endpoint PickerServiceModelServerL7 Proxy / Gateway InferencePool API Selects - the model servers (the endpoints) - the endpoint picker serviceModel ServerProtocolTrafficExtensionProtocolGateway ControllerClientTrafficConfiguresWatchesWatches InferenceModel API Defines - the model/adapter to serve - the serving objectives for the modelObservabilityMetrics ScrapingObservabilityDashboardsStandard GatewayElementsInference ExtensionElementsInference Gateway \ No newline at end of file diff --git a/docs/inference-gateway-architecture.svg b/docs/inference-gateway-architecture.svg new file mode 100644 index 00000000..6c887ebe --- /dev/null +++ b/docs/inference-gateway-architecture.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/proposals/003-endpoint-picker-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md similarity index 65% rename from docs/proposals/003-endpoint-picker-protocol/README.md rename to docs/proposals/003-model-server-protocol/README.md index 8e96a630..02efbe5c 100644 --- a/docs/proposals/003-endpoint-picker-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -1,21 +1,4 @@ -# Endpoint Picker Protocol - -The Endpoint Picker, or EPP, is a core component of the inference extension. Ultimately it's -responsible for picking an endpoint from the `InferencePool`. A reference implementation can be -found [here](../../../pkg/ext-proc/). - -## Proxy Protocol - -This is the protocol between the EPP and the proxy (e.g, Envoy). - -The EPP MUST implement the Envoy -[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol. - -For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint, via -adding the `x-gateway-destination-endpoint` HTTP header in the request and as an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response, or otherwise return an error. The EPP MUST not set two different values in the header and the response metadata. -Setting different value leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence. - -## Model Server Protocol +# Model Server Protocol This is the protocol between the EPP and the model servers. @@ -60,7 +43,8 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro * Metric value: The last updated timestamp (so the EPP can find the latest). * Metric labels: * `max_lora`: The maximum number of adapters that can be loaded to GPU memory to serve a batch. - Requests will be queued if the model server has reached MaxActiveAdapter and canno load the + Requests will be queued if the model server has reached MaxActiveAdapter and cannot load the requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. 
Example: `"waiting_lora_adapters": "adapter1, adapter2"`
diff --git a/docs/proposals/004-endpoint-picker-protocol/README.md b/docs/proposals/004-endpoint-picker-protocol/README.md
new file mode 100644
index 00000000..3657a10e
--- /dev/null
+++ b/docs/proposals/004-endpoint-picker-protocol/README.md
@@ -0,0 +1,35 @@
+# Endpoint Picker Protocol
+
+The Endpoint Picker, or EPP, is a core component of the inference extension. Ultimately it's
+responsible for picking an endpoint from the `InferencePool`. A reference implementation can be
+found [here](../../../pkg/epp/).
+
+This doc defines the protocol between the EPP and the proxy (e.g., Envoy).
+
+The EPP MUST implement the Envoy
+[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor) protocol.
+
+For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via:
+
+1. Setting the `x-gateway-destination-endpoint` HTTP header to the selected endpoint in `<ip:port>` format.
+
+2. Setting an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. The metadata entry for the picked endpoint MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`.
+
+The final metadata necessary would look like:
+```go
+dynamicMetadata: {
+  "envoy.lb": {
+    "x-gateway-destination-endpoint": "<ip:port>"
+  }
+}
+```
+
+Note:
+- If the EPP did not communicate the server endpoint via these two methods, it MUST return an error.
+- The EPP MUST NOT set two different values in the header and the inner response metadata value.
+
+## Why envoy.lb namespace as a default?
+The `envoy.lb` namespace is a predefined namespace used for subsetting. One common way to use the selected endpoint returned from the server is [envoy subsets](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/subsets), where host metadata for subset load balancing must be placed under `envoy.lb`.
+
+Setting different values leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence.
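As an illustration of the two channels, here is a sketch of how an EPP implementation might build such a response with go-control-plane's ext-proc types. The function name, the placeholder endpoint, and the choice to answer in the request-headers phase are assumptions of this sketch, not requirements of the protocol:

```go
package main

import (
    "fmt"

    corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    extprocv3 "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
    "google.golang.org/protobuf/types/known/structpb"
)

// buildEndpointResponse assembles a ProcessingResponse that reports the picked
// endpoint through both required channels: the request header mutation and the
// dynamic metadata entry wrapped under the "envoy.lb" namespace.
func buildEndpointResponse(endpoint string) (*extprocv3.ProcessingResponse, error) {
    md, err := structpb.NewStruct(map[string]interface{}{
        "envoy.lb": map[string]interface{}{
            "x-gateway-destination-endpoint": endpoint,
        },
    })
    if err != nil {
        return nil, err
    }
    return &extprocv3.ProcessingResponse{
        // Respond in the request-headers phase (an assumption of this sketch).
        Response: &extprocv3.ProcessingResponse_RequestHeaders{
            RequestHeaders: &extprocv3.HeadersResponse{
                Response: &extprocv3.CommonResponse{
                    HeaderMutation: &extprocv3.HeaderMutation{
                        SetHeaders: []*corev3.HeaderValueOption{{
                            Header: &corev3.HeaderValue{
                                Key:      "x-gateway-destination-endpoint",
                                RawValue: []byte(endpoint),
                            },
                        }},
                    },
                },
            },
        },
        DynamicMetadata: md,
    }, nil
}

func main() {
    resp, err := buildEndpointResponse("10.0.0.5:8000") // placeholder endpoint
    if err != nil {
        panic(err)
    }
    fmt.Println(resp)
}
```

Note how the same `endpoint` value feeds both the header mutation and the metadata entry, which keeps the two paths consistent as the protocol requires.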
+ diff --git a/docs/schedular-flowchart.png b/docs/scheduler-flowchart.png similarity index 100% rename from docs/schedular-flowchart.png rename to docs/scheduler-flowchart.png diff --git a/go.mod b/go.mod index 8dd59e3e..13ad16c4 100644 --- a/go.mod +++ b/go.mod @@ -1,70 +1,63 @@ -module inference.networking.x-k8s.io/gateway-api-inference-extension +module sigs.k8s.io/gateway-api-inference-extension go 1.23.0 toolchain go1.23.2 require ( - github.com/bojand/ghz v0.120.0 github.com/elastic/crd-ref-docs v0.1.0 github.com/envoyproxy/go-control-plane/envoy v1.32.4 - github.com/google/go-cmp v0.6.0 - github.com/jhump/protoreflect v1.17.0 - github.com/onsi/ginkgo/v2 v2.22.2 + github.com/go-logr/logr v1.4.2 + github.com/google/go-cmp v0.7.0 + github.com/onsi/ginkgo/v2 v2.23.0 github.com/onsi/gomega v1.36.2 - github.com/prometheus/client_golang v1.20.5 + github.com/prometheus/client_golang v1.21.1 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.62.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 - google.golang.org/grpc v1.70.0 - google.golang.org/protobuf v1.36.4 - k8s.io/api v0.32.1 - k8s.io/apiextensions-apiserver v0.32.1 - k8s.io/apimachinery v0.32.1 - k8s.io/client-go v0.32.1 - k8s.io/code-generator v0.32.1 - k8s.io/component-base v0.32.1 - k8s.io/klog/v2 v2.130.1 + go.uber.org/zap v1.27.0 + google.golang.org/grpc v1.71.0 + google.golang.org/protobuf v1.36.5 + k8s.io/api v0.32.2 + k8s.io/apiextensions-apiserver v0.32.2 + k8s.io/apimachinery v0.32.2 + k8s.io/client-go v0.32.2 + k8s.io/code-generator v0.32.2 + k8s.io/component-base v0.32.2 k8s.io/utils v0.0.0-20241210054802-24370beab758 - sigs.k8s.io/controller-runtime v0.20.1 - sigs.k8s.io/structured-merge-diff/v4 v4.5.0 + sigs.k8s.io/controller-runtime v0.20.3 + sigs.k8s.io/structured-merge-diff/v4 v4.6.0 sigs.k8s.io/yaml v1.4.0 ) require ( - cel.dev/expr v0.19.0 // indirect - cloud.google.com/go/compute/metadata v0.5.2 // indirect - github.com/BurntSushi/toml v1.1.0 // indirect + cel.dev/expr v0.19.1 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver v1.5.0 // indirect - github.com/Masterminds/semver/v3 v3.2.0 // indirect github.com/Masterminds/sprig v2.22.0+incompatible // indirect - github.com/Masterminds/sprig/v3 v3.2.3 // indirect - github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/bufbuild/protocompile v0.14.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect + github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/dustin/go-humanize v1.0.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect - github.com/evanphx/json-patch/v5 v5.9.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fatih/color v1.16.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/go-logr/logr v1.4.2 // 
indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobuffalo/flect v1.0.2 // indirect github.com/goccy/go-yaml v1.11.3 // indirect @@ -81,11 +74,11 @@ require ( github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jinzhu/configor v1.2.1 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.9 // indirect + github.com/klauspost/compress v1.17.11 // indirect github.com/kylelemons/godebug v1.1.0 // indirect + github.com/leodido/go-urn v1.2.1 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -100,42 +93,41 @@ require ( github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/procfs v0.15.1 // indirect - github.com/shopspring/decimal v1.2.0 // indirect - github.com/spf13/cast v1.4.1 // indirect github.com/spf13/cobra v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect - go.opentelemetry.io/otel v1.32.0 // indirect + go.opentelemetry.io/otel v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect - go.opentelemetry.io/otel/metric v1.32.0 // indirect - go.opentelemetry.io/otel/sdk v1.32.0 // indirect - go.opentelemetry.io/otel/trace v1.32.0 // indirect + go.opentelemetry.io/otel/metric v1.34.0 // indirect + go.opentelemetry.io/otel/sdk v1.34.0 // indirect + go.opentelemetry.io/otel/trace v1.34.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect - go.uber.org/zap v1.27.0 // indirect - golang.org/x/crypto v0.32.0 // indirect + golang.org/x/crypto v0.33.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.22.0 // indirect - golang.org/x/net v0.34.0 // indirect - golang.org/x/oauth2 v0.24.0 // indirect - golang.org/x/sync v0.10.0 // indirect - golang.org/x/sys v0.29.0 // indirect - golang.org/x/term v0.28.0 // indirect - golang.org/x/text v0.21.0 // indirect + golang.org/x/mod v0.23.0 // indirect + golang.org/x/net v0.35.0 // indirect + golang.org/x/oauth2 v0.25.0 // indirect + golang.org/x/sync v0.11.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/term v0.29.0 // indirect + golang.org/x/text v0.22.0 // indirect golang.org/x/time v0.7.0 // indirect - golang.org/x/tools v0.28.0 // indirect + golang.org/x/tools v0.30.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a // indirect - google.golang.org/genproto/googleapis/rpc 
v0.0.0-20241202173237-19429a94021a // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.32.1 // indirect + k8s.io/apiserver v0.32.2 // indirect k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect + k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect sigs.k8s.io/controller-tools v0.14.0 // indirect diff --git a/go.sum b/go.sum index 6d1cd8bd..463e55ff 100644 --- a/go.sum +++ b/go.sum @@ -1,22 +1,11 @@ -cel.dev/expr v0.19.0 h1:lXuo+nDhpyJSpWxpPVi5cPUwzKb+dsdOiw6IreM5yt0= -cel.dev/expr v0.19.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -cloud.google.com/go/compute/metadata v0.5.2 h1:UxK4uu/Tn+I3p2dYWTfiX4wva7aYlKixAHn3fyqngqo= -cloud.google.com/go/compute/metadata v0.5.2/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/toml v1.1.0 h1:ksErzDEI1khOiGPgpwuI7x2ebx/uXQNw7xJpn9Eq1+I= -github.com/BurntSushi/toml v1.1.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= +cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= -github.com/Masterminds/semver/v3 v3.2.0 h1:3MEsd0SM6jqZojhjLWWeBY+Kcjy9i6MQAeY7YgDP83g= -github.com/Masterminds/semver/v3 v3.2.0/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= -github.com/Masterminds/sprig/v3 v3.2.3 h1:eL2fZNezLomi0uOLqjQoN6BfsDD+fyLtgbJMAj9n6YA= -github.com/Masterminds/sprig/v3 v3.2.3/go.mod h1:rXcFaZ2zZbLRJv/xSysmlgIM1u11eBaRMhvYXJNkGuM= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= @@ -27,40 +16,30 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= -github.com/bojand/ghz v0.120.0/go.mod 
h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= -github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= -github.com/bufbuild/protocompile v0.14.1/go.mod h1:ppVdAIhbr2H8asPk6k4pY7t9zB1OU5DoEw9xY/FUi1c= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySeApCX4GeOjPl9qhRF3QuIZq+Q= +github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elastic/crd-ref-docs v0.1.0 h1:Cr5kz89QB3Iuuj7dhAfLMApCrChEGAaIBTxGk/xuRKw= github.com/elastic/crd-ref-docs v0.1.0/go.mod h1:X83mMBdJt05heJUYiS3T0yJ/JkCuliuhSUNav5Gjo/U= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.4 h1:zEqyPVyku6IvWCFwux4x9RxkLOMUL+1vC9xUFv5l2/M= -github.com/envoyproxy/go-control-plane v0.13.4/go.mod h1:kDfuBlDVsSj2MjrLEtRWtHlsWIFcGyB2RMO44Dc5GZA= github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= -github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= -github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= -github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= -github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= 
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -84,11 +63,11 @@ github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= github.com/go-playground/universal-translator v0.18.0 h1:82dyy6p4OuJq4/CByFNOn/jYrnRPArHwAcmLoJZxyho= github.com/go-playground/universal-translator v0.18.0/go.mod h1:UvRDBj+xPUEGrFYl+lu/H90nyDXpg0fqeB/AQUGNTVA= -github.com/go-playground/validator v9.31.0+incompatible h1:UA72EPEogEnq76ehGdEDp4Mit+3FDh548oRqwVgNsHA= github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= @@ -108,14 +87,13 @@ github.com/google/cel-go v0.22.0/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= -github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= @@ -128,18 +106,14 @@ github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/jhump/protoreflect v1.17.0 h1:qOEr613fac2lOuTgWN4tPAtLL7fUSbuJL5X5XumQh94= -github.com/jhump/protoreflect v1.17.0/go.mod 
h1:h9+vUUL38jiBzck8ck+6G/aeMX8Z4QUY/NiJPwPNi+8= -github.com/jinzhu/configor v1.2.1 h1:OKk9dsR8i6HPOCZR8BcMtcEImAFjIhbJFZNyn5GCZko= -github.com/jinzhu/configor v1.2.1/go.mod h1:nX89/MOmDba7ZX7GCyU/VIaQ2Ar2aizBl2d3JLF/rDc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= -github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -158,10 +132,8 @@ github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovk github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= -github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= @@ -179,8 +151,8 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU= -github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= +github.com/onsi/ginkgo/v2 v2.23.0 h1:FA1xjp8ieYDzlgS5ABTpdUDB7wtngggONc8a7ku2NqQ= +github.com/onsi/ginkgo/v2 v2.23.0/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -190,22 +162,17 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= +github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= -github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= -github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cast v1.4.1 h1:s0hze+J0196ZfEMTs80N7UlFt0BDuQ7Q+JDnHiMWKdA= -github.com/spf13/cast v1.4.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -215,9 +182,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= @@ -227,23 +193,24 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= -go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= -go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg= +go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= +go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= -go.opentelemetry.io/otel/metric v1.32.0 h1:xV2umtmNcThh2/a/aCP+h64Xx5wsj8qqnkYZktzNa0M= -go.opentelemetry.io/otel/metric v1.32.0/go.mod h1:jH7CIbbK6SH2V2wE16W05BHCtIDzauciCRLoc/SyMv8= -go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4= -go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU= -go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiyYCU9snn1CU= -go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ= -go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= -go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= +go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= +go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= +go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= +go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= +go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= +go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= +go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= +go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -255,66 +222,49 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod 
h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= -golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= +golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= -golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= +golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= -golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= -golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= +golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= +golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= -golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg= -golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= +golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= -golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= +golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= +golang.org/x/tools v0.30.0/go.mod 
h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -323,14 +273,14 @@ golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSm golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a h1:OAiGFfOiA0v9MRYsSidp3ubZaBnteRUyn3xB2ZQ5G/E= -google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a/go.mod h1:jehYqy3+AhJU9ve55aNOaSml7wUXjF9x6z2LcCfpAhY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a h1:hgh8P4EuoxpsuKMXX/To36nOFD7vixReXgn8lPGnt+o= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU= -google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= -google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= -google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM= -google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= +google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1:OxYkA3wjPsZyBylwymxSHa7ViiW1Sml4ToBrncvFehI= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= +google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= +google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= +google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= @@ -340,27 +290,26 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= -k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= -k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= -k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= -k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= -k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/apiserver v0.32.1 h1:oo0OozRos66WFq87Zc5tclUX2r0mymoVHRq8JmR7Aak= -k8s.io/apiserver v0.32.1/go.mod h1:UcB9tWjBY7aryeI5zAgzVJB/6k7E97bkr1RgqDz0jPw= -k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= -k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= -k8s.io/code-generator v0.32.1 h1:4lw1kFNDuFYXquTkB7Sl5EwPMUP2yyW9hh6BnFfRZFY= -k8s.io/code-generator v0.32.1/go.mod h1:zaILfm00CVyP/6/pJMJ3zxRepXkxyDfUV5SNG4CjZI4= -k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk= -k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w= +k8s.io/api v0.32.2 h1:bZrMLEkgizC24G9eViHGOPbW+aRo9duEISRIJKfdJuw= +k8s.io/api v0.32.2/go.mod h1:hKlhk4x1sJyYnHENsrdCWw31FEmCijNGPJO5WzHiJ6Y= +k8s.io/apiextensions-apiserver v0.32.2 h1:2YMk285jWMk2188V2AERy5yDwBYrjgWYggscghPCvV4= +k8s.io/apiextensions-apiserver v0.32.2/go.mod h1:GPwf8sph7YlJT3H6aKUWtd0E+oyShk/YHWQHf/OOgCA= +k8s.io/apimachinery v0.32.2 h1:yoQBR9ZGkA6Rgmhbp/yuT9/g+4lxtsGYwW6dR6BDPLQ= +k8s.io/apimachinery v0.32.2/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apiserver v0.32.2 h1:WzyxAu4mvLkQxwD9hGa4ZfExo3yZZaYzoYvvVDlM6vw= +k8s.io/apiserver v0.32.2/go.mod h1:PEwREHiHNU2oFdte7BjzA1ZyjWjuckORLIK/wLV5goM= +k8s.io/client-go v0.32.2 h1:4dYCD4Nz+9RApM2b/3BtVvBHw54QjMFUl1OLcJG5yOA= +k8s.io/client-go v0.32.2/go.mod h1:fpZ4oJXclZ3r2nDOv+Ux3XcJutfrwjKTCHz2H3sww94= +k8s.io/code-generator v0.32.2 h1:CIvyPrLWP7cMgrqval2qYT839YAwCDeSvGfXgWSNpHQ= +k8s.io/code-generator v0.32.2/go.mod h1:plh7bWk7JztAUkHM4zpbdy0KOMdrhsePcZL2HLWFH7Y= +k8s.io/component-base v0.32.2 h1:1aUL5Vdmu7qNo4ZsE+569PV5zFatM9hl+lb3dEea2zU= +k8s.io/component-base v0.32.2/go.mod h1:PXJ61Vx9Lg+P5mS8TLd7bCIr+eMJRQTyXe8KvkrvJq0= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 h1:si3PfKm8dDYxgfbeA6orqrtLkvvIeH8UqffFJDl0bz4= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= @@ -371,13 +320,15 @@ k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJ k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.20.1 h1:JbGMAG/X94NeM3xvjenVUaBjy6Ui4Ogd/J5ZtjZnHaE= -sigs.k8s.io/controller-runtime v0.20.1/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU= +sigs.k8s.io/controller-runtime v0.20.3 h1:I6Ln8JfQjHH7JbtCD2HCYHoIzajoRxPNuvhvcDbZgkI= +sigs.k8s.io/controller-runtime v0.20.3/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= 
sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= +sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ= +sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index b156b160..832bd872 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -15,8 +15,8 @@ else RELEASE_TAG="v${MAJOR}.${MINOR}.0-rc.${RC}" fi -# vLLM image version (default to 0.7.1 if not defined) -VLLM="${VLLM:-0.7.1}" +# vLLM image version (default to 0.7.2 if not defined) +VLLM="${VLLM:-0.7.2}" echo "Using release tag: ${RELEASE_TAG}" echo "Using vLLM image version: ${VLLM}" @@ -36,28 +36,31 @@ sed -i.bak -E "s|(releases/download/)v[0-9]+\.[0-9]+\.0-rc\.?[0-9]+|\1${RELEASE_ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd|kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${RELEASE_TAG}/manifests.yaml|g" "$README" # ----------------------------------------------------------------------------- -# Update pkg/manifests/ext_proc.yaml +# Update config/manifests/ext_proc.yaml # ----------------------------------------------------------------------------- -EXT_PROC="pkg/manifests/ext_proc.yaml" +EXT_PROC="config/manifests/ext_proc.yaml" echo "Updating ${EXT_PROC} ..." -# Update any image reference for the EPP container. -# For images from registry.k8s.io: -sed -i.bak -E "s|(registry\.k8s\.io/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EXT_PROC" -# In case there is still any reference from us-central1-docker.pkg.dev: +# Update the EPP container tag. sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EXT_PROC" +# Update the EPP container image pull policy. +sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/ { n; s/Always/IfNotPresent/ }' "$EXT_PROC" + +# Update the EPP container registry. +sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EXT_PROC" + # ----------------------------------------------------------------------------- -# Update pkg/manifests/vllm/deployment.yaml +# Update config/manifests/vllm/gpu-deployment.yaml # ----------------------------------------------------------------------------- -VLLM_DEPLOY="pkg/manifests/vllm/deployment.yaml" +VLLM_DEPLOY="config/manifests/vllm/gpu-deployment.yaml" echo "Updating ${VLLM_DEPLOY} ..." 
# Update the vLLM image version
-sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1${VLLM}|g" "$VLLM_DEPLOY"
+sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1v${VLLM}|g" "$VLLM_DEPLOY"

# Also change the imagePullPolicy from Always to IfNotPresent on lines containing the vLLM image.
-sed -i.bak "/vllm\/vllm-openai/ s/Always/IfNotPresent/g" "$VLLM_DEPLOY"
+sed -i.bak '/vllm\/vllm-openai/ { n; s/Always/IfNotPresent/ }' "$VLLM_DEPLOY"

# -----------------------------------------------------------------------------
# Stage the changes
diff --git a/hack/test-e2e.sh b/hack/test-e2e.sh
new file mode 100755
index 00000000..716e626a
--- /dev/null
+++ b/hack/test-e2e.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+#
+# This script verifies end-to-end connectivity for an example inference extension test environment based on
+# resources from the quickstart guide or e2e test framework. It can optionally launch a "curl" client pod to
+# run these tests within the cluster.
+#
+# USAGE: ./hack/test-e2e.sh
+#
+# OPTIONAL ENVIRONMENT VARIABLES:
+# - TIME: The duration (in seconds) for which the test will run. Defaults to 1 second.
+# - CURL_POD: If set to "true", the script will use a Kubernetes pod named "curl" for making requests.
+# - IP: Override the detected IP address. If not provided, the script attempts to use a Gateway based on
+#   the quickstart guide or an Envoy service IP based on the e2e test framework.
+# - PORT: Override the detected port. If not provided, the script attempts to use a Gateway based on the
+#   quickstart guide or an Envoy service port based on the e2e test framework.
+#
+# WHAT THE SCRIPT DOES:
+# 1. Determines if there is a Gateway named "inference-gateway" in the "default" namespace. If found, it extracts the IP
+#    address and port from the Gateway's "llm-gw" listener. Otherwise, it falls back to the Envoy service in the "default" namespace.
+# 2. Optionally checks for (or creates) a "curl" pod, ensuring it is ready to execute requests.
+# 3. Loops for $TIME seconds, sending requests every 5 seconds to the /v1/completions endpoint to confirm successful connectivity.
+
+set -euo pipefail
+
+# Determine the directory of this script and build an absolute path to client.yaml.
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+CLIENT_YAML="$SCRIPT_DIR/../test/testdata/client.yaml"
+
+# TIME is the amount of time, in seconds, to run the test.
+TIME=${TIME:-1}
+# Optionally use a client curl pod for executing the curl command.
+CURL_POD=${CURL_POD:-false}
+
+check_resource_exists() {
+  local type=$1
+  local name=$2
+  local namespace=$3
+
+  if kubectl get "$type" "$name" -n "$namespace" &>/dev/null; then
+    return 0
+  else
+    return 1
+  fi
+}
+
+check_pod_ready() {
+  local pod_name=$1
+  local namespace=$2
+  # Check the Ready condition using jsonpath. Default to False if not found.
+  local ready_status
+  ready_status=$(kubectl get pod "$pod_name" -n "$namespace" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "False")
+  if [[ "$ready_status" == "True" ]]; then
+    return 0
+  else
+    return 1
+  fi
+}
+
+# Try to get the Gateway's IP and the port from the listener named "llm-gw" if it exists.
+if check_resource_exists "gateway" "inference-gateway" "default"; then + GATEWAY_IP=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.status.addresses[0].value}') + # Use JSONPath to select the port from the listener with name "llm-gw" + GATEWAY_PORT=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.spec.listeners[?(@.name=="llm-gw")].port}') +else + GATEWAY_IP="" + GATEWAY_PORT="" +fi + +if [[ -n "$GATEWAY_IP" && -n "$GATEWAY_PORT" ]]; then + echo "Using Gateway inference-gateway IP and port from listener 'llm-gw'." + IP=${IP:-$GATEWAY_IP} + PORT=${PORT:-$GATEWAY_PORT} +else + echo "Gateway inference-gateway not found or missing IP/port. Falling back to Envoy service." + # Ensure the Envoy service exists. + if ! check_resource_exists "svc" "envoy" "default"; then + echo "Error: Envoy service not found in namespace 'default'." + exit 1 + fi + IP=${IP:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.clusterIP}')} + PORT=${PORT:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.ports[0].port}')} +fi + +# Optionally verify that the curl pod exists and is ready. +if [[ "$CURL_POD" == "true" ]]; then + if ! check_resource_exists "pod" "curl" "default"; then + echo "Pod 'curl' not found in namespace 'default'. Applying client.yaml from $CLIENT_YAML..." + kubectl apply -f "$CLIENT_YAML" + fi + echo "Waiting for pod 'curl' to be ready..." + # Retry every 5 seconds for up to 30 seconds (6 attempts) + for i in {1..6}; do + if check_pod_ready "curl" "default"; then + echo "Pod 'curl' is now ready." + break + fi + echo "Retry attempt $i: Pod 'curl' not ready; waiting 5 seconds..." + sleep 5 + done + + if ! check_pod_ready "curl" "default"; then + echo "Error: Pod 'curl' is still not ready in namespace 'default' after 30 seconds." + exit 1 + fi +fi + +# Validate that we have a non-empty IP and PORT. +if [[ -z "$IP" ]]; then + echo "Error: Unable to determine a valid IP from either Gateway or Envoy service." + exit 1 +fi + +if [[ -z "$PORT" ]]; then + echo "Error: Unable to determine a valid port from either Gateway or Envoy service." + exit 1 +fi + +echo "Using IP: $IP" +echo "Using PORT: $PORT" + +# Run the test for the specified duration. 
+end=$((SECONDS + TIME))
+if [[ "$CURL_POD" == "true" ]]; then
+  while [ $SECONDS -lt $end ]; do
+    kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \
+      -H 'Content-Type: application/json' \
+      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+    sleep 5
+  done
+else
+  while [ $SECONDS -lt $end ]; do
+    curl -i "$IP:$PORT/v1/completions" \
+      -H 'Content-Type: application/json' \
+      -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}'
+    sleep 5
+  done
+fi
diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh
index cfe75f81..c825507b 100755
--- a/hack/update-codegen.sh
+++ b/hack/update-codegen.sh
@@ -23,7 +23,7 @@ echo "$SCRIPT_ROOT script"
 CODEGEN_PKG=${2:-bin}
 echo $CODEGEN_PKG
 source "${CODEGEN_PKG}/kube_codegen.sh"
-THIS_PKG="inference.networking.x-k8s.io/gateway-api-inference-extension"
+THIS_PKG="sigs.k8s.io/gateway-api-inference-extension"
 
 kube::codegen::gen_helpers \
diff --git a/internal/runnable/grpc.go b/internal/runnable/grpc.go
new file mode 100644
index 00000000..a619f788
--- /dev/null
+++ b/internal/runnable/grpc.go
@@ -0,0 +1,52 @@
+package runnable
+
+import (
+	"context"
+	"fmt"
+	"net"
+
+	"google.golang.org/grpc"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+)
+
+// GRPCServer converts the given gRPC server into a runnable.
+// The server name is used for logging only.
+func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable {
+	return manager.RunnableFunc(func(ctx context.Context) error {
+		// Use "name" key as that is what manager.Server does as well.
+		log := ctrl.Log.WithValues("name", name)
+		log.Info("gRPC server starting")
+
+		// Start listening.
+		lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
+		if err != nil {
+			log.Error(err, "gRPC server failed to listen")
+			return err
+		}
+
+		log.Info("gRPC server listening", "port", port)
+
+		// Terminate the server on context closed.
+		// Make sure the goroutine does not leak.
+		doneCh := make(chan struct{})
+		defer close(doneCh)
+		go func() {
+			select {
+			case <-ctx.Done():
+				log.Info("gRPC server shutting down")
+				srv.GracefulStop()
+			case <-doneCh:
+			}
+		}()
+
+		// Keep serving until terminated.
+		if err := srv.Serve(lis); err != nil && err != grpc.ErrServerStopped {
+			log.Error(err, "gRPC server failed")
+			return err
+		}
+		log.Info("gRPC server terminated")
+		return nil
+	})
+}
diff --git a/internal/runnable/leader_election.go b/internal/runnable/leader_election.go
new file mode 100644
index 00000000..00dfc782
--- /dev/null
+++ b/internal/runnable/leader_election.go
@@ -0,0 +1,31 @@
+package runnable
+
+import "sigs.k8s.io/controller-runtime/pkg/manager"
+
+type leaderElection struct {
+	manager.Runnable
+	needsLeaderElection bool
+}
+
+// LeaderElection wraps the given runnable to implement manager.LeaderElectionRunnable.
+func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable {
+	return &leaderElection{
+		Runnable:            runnable,
+		needsLeaderElection: needsLeaderElection,
+	}
+}
+
+// RequireLeaderElection wraps the given runnable, marking it as requiring leader election.
+func RequireLeaderElection(runnable manager.Runnable) manager.Runnable {
+	return LeaderElection(runnable, true)
+}
+
+// NoLeaderElection wraps the given runnable, marking it as not requiring leader election.
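+// This is the variant used for components that must run on every replica, such as the
+// ext-proc gRPC servers in this repository.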
+func NoLeaderElection(runnable manager.Runnable) manager.Runnable { + return LeaderElection(runnable, false) +} + +// NeedLeaderElection implements manager.NeedLeaderElection interface. +func (r *leaderElection) NeedLeaderElection() bool { + return r.needsLeaderElection +} diff --git a/internal/tls/tls.go b/internal/tls/tls.go new file mode 100644 index 00000000..fb8092c6 --- /dev/null +++ b/internal/tls/tls.go @@ -0,0 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tls + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "time" + + "github.com/go-logr/logr" +) + +// CreateSelfSignedTLSCertificate creates a self-signed cert the server can use to serve TLS. +func CreateSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error creating serial number: %v", err) + } + now := time.Now() + notBefore := now.UTC() + template := x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + Organization: []string{"Inference Ext"}, + }, + NotBefore: notBefore, + NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + } + + priv, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error generating key: %v", err) + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error creating certificate: %v", err) + } + + certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) + + privBytes, err := x509.MarshalPKCS8PrivateKey(priv) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error marshalling private key: %v", err) + } + keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) + + return tls.X509KeyPair(certBytes, keyBytes) +} diff --git a/mkdocs.yml b/mkdocs.yml index c9bc30e0..8cd3f3fb 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -56,6 +56,8 @@ nav: - Guides: - User Guides: - Getting started: guides/index.md + - Adapter Rollout: guides/adapter-rollout.md + - Metrics: guides/metrics.md - Implementer's Guide: guides/implementers.md - Reference: - API Reference: reference/spec.md diff --git a/pkg/README.md b/pkg/README.md index 04ebfde2..b53ef777 100644 --- a/pkg/README.md +++ b/pkg/README.md @@ -1,96 +1,3 @@ ## Quickstart -This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! 
- -### Requirements - - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - - A cluster with: - - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, - you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. - -### Steps - -1. **Deploy Sample Model Server** - - Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. - Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. - ```bash - kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml - ``` - -1. **Install the Inference Extension CRDs:** - - ```sh - kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd - ``` - -1. **Deploy InferenceModel** - - Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` - [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml - ``` - -1. **Update Envoy Gateway Config to enable Patch Policy** - - Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml - kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system - ``` - Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. - -1. **Deploy Gateway** - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml - ``` - > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** - - Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: - ```bash - $ kubectl get gateway inference-gateway - NAME CLASS ADDRESS PROGRAMMED AGE - inference-gateway inference-gateway True 22s - ``` - -1. **Deploy the Inference Extension and InferencePool** - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml - ``` - -1. 
**Deploy Envoy Gateway Custom Policies**
-
-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml
-   ```
-   > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
-
-1. **OPTIONALLY**: Apply Traffic Policy
-
-   For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors.
-
-   ```bash
-   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml
-   ```
-
-1. **Try it out**
-
-   Wait until the gateway is ready.
-
-   ```bash
-   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
-   PORT=8081
-
-   curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
-   "model": "tweet-summary",
-   "prompt": "Write as if you were a critic: San Francisco",
-   "max_tokens": 100,
-   "temperature": 0
-   }'
-   ```
\ No newline at end of file
+Please refer to our Getting started guide here: https://gateway-api-inference-extension.sigs.k8s.io/guides/
\ No newline at end of file
diff --git a/pkg/body-based-routing/README.md b/pkg/body-based-routing/README.md
new file mode 100644
index 00000000..b5b6f770
--- /dev/null
+++ b/pkg/body-based-routing/README.md
@@ -0,0 +1,14 @@
+# Body-Based Routing
+This package provides an extension that can be deployed to copy the `model`
+parameter from the HTTP request body into a header (X-Gateway-Model-Name) so that
+routing decisions can be made on the model name.
+
+As per the OpenAI API spec, it is standard for the model name to be included in the
+body of the HTTP request. However, most implementations do not support routing
+based on the request body. This extension helps bridge that gap for clients.
+This extension works by parsing the request body. If it finds a `model` parameter in the
+request body, it will copy the value of that parameter into a request header.
+
+This extension is intended to be paired with an `ext_proc`-capable Gateway. There is not
+a standard way to represent this kind of extension in Gateway API yet, so we recommend
+referring to implementation-specific documentation for how to deploy this extension.
diff --git a/pkg/body-based-routing/handlers/request.go b/pkg/body-based-routing/handlers/request.go
new file mode 100644
index 00000000..3c5037a9
--- /dev/null
+++ b/pkg/body-based-routing/handlers/request.go
@@ -0,0 +1,97 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package handlers + +import ( + "context" + "encoding/json" + "fmt" + + basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "sigs.k8s.io/controller-runtime/pkg/log" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// HandleRequestBody handles request bodies. +func (s *Server) HandleRequestBody(ctx context.Context, body *eppb.HttpBody) (*eppb.ProcessingResponse, error) { + logger := log.FromContext(ctx) + + var data map[string]any + if err := json.Unmarshal(body.GetBody(), &data); err != nil { + return nil, err + } + + modelVal, ok := data["model"] + if !ok { + logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter") + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{}, + }, + }, nil + } + + modelStr, ok := modelVal.(string) + if !ok { + logger.V(logutil.DEFAULT).Info("Model parameter value is not a string") + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{}, + }, + }, fmt.Errorf("the model parameter value %v is not a string", modelVal) + } + + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{ + Response: &eppb.CommonResponse{ + // Necessary so that the new headers are used in the routing decision. + ClearRouteCache: true, + HeaderMutation: &eppb.HeaderMutation{ + SetHeaders: []*basepb.HeaderValueOption{ + { + Header: &basepb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte(modelStr), + }, + }, + }, + }, + }, + }, + }, + }, nil +} + +// HandleRequestHeaders handles request headers. +func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &eppb.HeadersResponse{}, + }, + }, nil +} + +// HandleRequestTrailers handles request trailers. +func (s *Server) HandleRequestTrailers(trailers *eppb.HttpTrailers) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestTrailers{ + RequestTrailers: &eppb.TrailersResponse{}, + }, + }, nil +} diff --git a/pkg/body-based-routing/handlers/request_test.go b/pkg/body-based-routing/handlers/request_test.go new file mode 100644 index 00000000..9bdac521 --- /dev/null +++ b/pkg/body-based-routing/handlers/request_test.go @@ -0,0 +1,128 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package handlers + +import ( + "context" + "testing" + + basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/testing/protocmp" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + bodyWithModel = ` + { + "model": "foo", + "prompt": "Tell me a joke" + } + ` + bodyWithModelNoStr = ` + { + "model": 1, + "prompt": "Tell me a joke" + } + ` + bodyWithoutModel = ` + { + "prompt": "Tell me a joke" + } + ` +) + +func TestHandleRequestBody(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + + tests := []struct { + name string + body *extProcPb.HttpBody + want *extProcPb.ProcessingResponse + wantErr bool + }{ + { + name: "malformed body", + body: &extProcPb.HttpBody{ + Body: []byte("malformed json"), + }, + wantErr: true, + }, + { + name: "model not found", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithoutModel), + }, + want: &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{}, + }, + }, + }, + { + name: "model is not string", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithModelNoStr), + }, + wantErr: true, + }, + { + name: "success", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithModel), + }, + want: &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + // Necessary so that the new headers are used in the routing decision. + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*basepb.HeaderValueOption{ + { + Header: &basepb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("foo"), + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + server := &Server{} + resp, err := server.HandleRequestBody(ctx, test.body) + if err != nil { + if !test.wantErr { + t.Fatalf("HandleRequestBody returned unexpected error: %v, want %v", err, test.wantErr) + } + return + } + + if diff := cmp.Diff(test.want, resp, protocmp.Transform()); diff != "" { + t.Errorf("HandleRequestBody returned unexpected response, diff(-want, +got): %v", diff) + } + }) + } +} diff --git a/pkg/body-based-routing/handlers/response.go b/pkg/body-based-routing/handlers/response.go new file mode 100644 index 00000000..a62aa076 --- /dev/null +++ b/pkg/body-based-routing/handlers/response.go @@ -0,0 +1,48 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package handlers + +import ( + eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" +) + +// HandleResponseHeaders handles response headers. 
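+// The body-based router only mutates requests, so this and the response handlers
+// below are simple pass-throughs that return empty responses.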
+func (s *Server) HandleResponseHeaders(headers *eppb.HttpHeaders) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &eppb.HeadersResponse{}, + }, + }, nil +} + +// HandleResponseBody handles response bodies. +func (s *Server) HandleResponseBody(body *eppb.HttpBody) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseBody{ + ResponseBody: &eppb.BodyResponse{}, + }, + }, nil +} + +// HandleResponseTrailers handles response trailers. +func (s *Server) HandleResponseTrailers(trailers *eppb.HttpTrailers) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseTrailers{ + ResponseTrailers: &eppb.TrailersResponse{}, + }, + }, nil +} diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/body-based-routing/handlers/server.go new file mode 100644 index 00000000..813c55c8 --- /dev/null +++ b/pkg/body-based-routing/handlers/server.go @@ -0,0 +1,92 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package handlers + +import ( + "context" + "errors" + "io" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "sigs.k8s.io/controller-runtime/pkg/log" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func NewServer() *Server { + return &Server{} +} + +// Server implements the Envoy external processing server. +// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto +type Server struct{} + +func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { + ctx := srv.Context() + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing") + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + req, recvErr := srv.Recv() + if recvErr == io.EOF || errors.Is(recvErr, context.Canceled) { + return nil + } + if recvErr != nil { + // This error occurs very frequently, though it doesn't seem to have any impact. + // TODO Figure out if we can remove this noise. 
+ loggerVerbose.Error(recvErr, "Cannot receive stream request") + return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr) + } + + var resp *extProcPb.ProcessingResponse + var err error + switch v := req.Request.(type) { + case *extProcPb.ProcessingRequest_RequestHeaders: + resp, err = s.HandleRequestHeaders(req.GetRequestHeaders()) + case *extProcPb.ProcessingRequest_RequestBody: + resp, err = s.HandleRequestBody(ctx, req.GetRequestBody()) + case *extProcPb.ProcessingRequest_RequestTrailers: + resp, err = s.HandleRequestTrailers(req.GetRequestTrailers()) + case *extProcPb.ProcessingRequest_ResponseHeaders: + resp, err = s.HandleResponseHeaders(req.GetResponseHeaders()) + case *extProcPb.ProcessingRequest_ResponseBody: + resp, err = s.HandleResponseBody(req.GetResponseBody()) + default: + logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v) + return status.Error(codes.Unknown, "unknown request type") + } + + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) + return status.Errorf(status.Code(err), "failed to handle request: %v", err) + } + + loggerVerbose.Info("Response generated", "response", resp) + if err := srv.Send(resp); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Send failed") + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } +} diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/body-based-routing/server/runserver.go new file mode 100644 index 00000000..55e79422 --- /dev/null +++ b/pkg/body-based-routing/server/runserver.go @@ -0,0 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "context" + "crypto/tls" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" + "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/handlers" +) + +// ExtProcServerRunner provides methods to manage an external process server. +type ExtProcServerRunner struct { + GrpcPort int + SecureServing bool +} + +// Default values for CLI flags in main +const ( + DefaultGrpcPort = 9002 // default for --grpcPort +) + +func NewDefaultExtProcServerRunner() *ExtProcServerRunner { + return &ExtProcServerRunner{ + GrpcPort: DefaultGrpcPort, + SecureServing: true, + } +} + +// AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server. +// The runnable implements LeaderElectionRunnable with leader election disabled. 
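+// When SecureServing is enabled, the server terminates TLS using a self-signed
+// certificate generated at startup (see internal/tls).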
+func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable {
+	return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error {
+		var srv *grpc.Server
+		if r.SecureServing {
+			cert, err := tlsutil.CreateSelfSignedTLSCertificate(logger)
+			if err != nil {
+				logger.Error(err, "Failed to create self-signed certificate")
+				return err
+			}
+			creds := credentials.NewTLS(&tls.Config{Certificates: []tls.Certificate{cert}})
+			srv = grpc.NewServer(grpc.Creds(creds))
+		} else {
+			srv = grpc.NewServer()
+		}
+
+		extProcPb.RegisterExternalProcessorServer(srv, handlers.NewServer())
+
+		// Forward to the gRPC runnable.
+		return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx)
+	}))
+}
diff --git a/pkg/epp/README.md b/pkg/epp/README.md
new file mode 100644
index 00000000..1bf47993
--- /dev/null
+++ b/pkg/epp/README.md
@@ -0,0 +1,28 @@
+# The Endpoint Picker (EPP)
+This package provides the reference implementation for the Endpoint Picker (EPP). As demonstrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and it interacts with the model servers through the defined [model server protocol](../../docs/proposals/003-model-server-protocol).
+
+![Architecture Diagram](../../docs/endpoint-picker.svg)
+
+
+## Core Functions
+
+An EPP instance handles a single `InferencePool` (so for each `InferencePool`, one must create a dedicated EPP deployment). It performs the following core functions:
+
+- Endpoint Selection
+  - The EPP determines the appropriate Pod endpoint for the load balancer (LB) to route requests to.
+  - It selects from the pool of ready Pods designated by the assigned InferencePool's [Selector](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/7e3cd457cdcd01339b65861c8e472cf27e6b6e80/api/v1alpha1/inferencepool_types.go#L53) field.
+  - Endpoint selection is contingent on the request's ModelName matching an `InferenceModel` that references the `InferencePool`.
+  - Requests with unmatched ModelName values trigger an error response to the proxy.
+- Traffic Splitting and ModelName Rewriting
+  - The EPP facilitates controlled rollouts of new adapter versions by implementing traffic splitting between adapters within the same `InferencePool`, as defined by the `InferenceModel`.
+  - The EPP rewrites the model name in the request to the [target model name](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/7e3cd457cdcd01339b65861c8e472cf27e6b6e80/api/v1alpha1/inferencemodel_types.go#L161) as defined on the `InferenceModel` object.
+- Observability
+  - The EPP generates metrics to enhance observability.
+  - It reports InferenceModel-level metrics, further broken down by target model.
+  - Detailed information regarding metrics can be found on the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/).
+
+
+## Scheduling Algorithm
+The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request; a simplified sketch of this filter-chain idea is shown below.
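+
+The following is a minimal, self-contained sketch of the filter-chain approach. It is
+not the actual scheduler code: the `Pod` fields, the `Filter` type, and the thresholds
+are hypothetical illustrations only.
+
+```go
+package main
+
+import "fmt"
+
+// Pod is a simplified stand-in for a backend endpoint and its scraped metrics.
+type Pod struct {
+	Name                string
+	WaitingQueueSize    int
+	KVCacheUsagePercent float64
+}
+
+// Filter narrows down the candidate pods; filters run in sequence.
+type Filter func(pods []Pod) []Pod
+
+// lowQueue keeps pods whose waiting queue is below an illustrative threshold.
+func lowQueue(pods []Pod) []Pod {
+	var out []Pod
+	for _, p := range pods {
+		if p.WaitingQueueSize < 10 {
+			out = append(out, p)
+		}
+	}
+	return out
+}
+
+// lowKVCache keeps pods that still have spare KV-cache capacity.
+func lowKVCache(pods []Pod) []Pod {
+	var out []Pod
+	for _, p := range pods {
+		if p.KVCacheUsagePercent < 0.8 {
+			out = append(out, p)
+		}
+	}
+	return out
+}
+
+// schedule applies each filter in turn, falling back to the previous candidate set
+// if a filter would eliminate every pod.
+func schedule(pods []Pod, filters ...Filter) []Pod {
+	candidates := pods
+	for _, f := range filters {
+		next := f(candidates)
+		if len(next) == 0 {
+			return candidates
+		}
+		candidates = next
+	}
+	return candidates
+}
+
+func main() {
+	pods := []Pod{
+		{Name: "pod-a", WaitingQueueSize: 3, KVCacheUsagePercent: 0.5},
+		{Name: "pod-b", WaitingQueueSize: 30, KVCacheUsagePercent: 0.2},
+	}
+	fmt.Println(schedule(pods, lowQueue, lowKVCache)) // prints [{pod-a 3 0.5}]
+}
+```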
The following flow chart summarizes the current scheduling algorithm + +Scheduling Algorithm diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go new file mode 100644 index 00000000..7fd4970d --- /dev/null +++ b/pkg/epp/backend/metrics/fake.go @@ -0,0 +1,94 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "fmt" + "sync" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// FakePodMetrics is an implementation of PodMetrics that doesn't run the async refresh loop. +type FakePodMetrics struct { + Pod *Pod + Metrics *Metrics +} + +func (fpm *FakePodMetrics) String() string { + return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics()) +} + +func (fpm *FakePodMetrics) GetPod() *Pod { + return fpm.Pod +} +func (fpm *FakePodMetrics) GetMetrics() *Metrics { + return fpm.Metrics +} +func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { + fpm.Pod = toInternalPod(pod) +} +func (fpm *FakePodMetrics) StopRefreshLoop() {} // noop + +type FakePodMetricsClient struct { + errMu sync.RWMutex + Err map[types.NamespacedName]error + resMu sync.RWMutex + Res map[types.NamespacedName]*Metrics +} + +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) { + f.errMu.RLock() + err, ok := f.Err[pod.NamespacedName] + f.errMu.RUnlock() + if ok { + return nil, err + } + f.resMu.RLock() + res, ok := f.Res[pod.NamespacedName] + f.resMu.RUnlock() + if !ok { + return nil, fmt.Errorf("no pod found: %v", pod.NamespacedName) + } + log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "existing", existing, "new", res) + return res.Clone(), nil +} + +func (f *FakePodMetricsClient) SetRes(new map[types.NamespacedName]*Metrics) { + f.resMu.Lock() + defer f.resMu.Unlock() + f.Res = new +} + +func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) { + f.errMu.Lock() + defer f.errMu.Unlock() + f.Err = new +} + +type FakeDataStore struct { + Res map[string]*v1alpha2.InferenceModel +} + +func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) { + return fds.Res[modelName] +} diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go new file mode 100644 index 00000000..74735755 --- /dev/null +++ b/pkg/epp/backend/metrics/logger.go @@ -0,0 +1,113 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/go-logr/logr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	// Note: currently the EPP treats stale metrics the same as fresh ones.
+	// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336
+	metricsValidityPeriod = 5 * time.Second
+)
+
+type Datastore interface {
+	PoolGet() (*v1alpha2.InferencePool, error)
+	// PodMetrics operations
+	// PodGetAll returns all pods and metrics, including fresh and stale.
+	PodGetAll() []PodMetrics
+	PodList(func(PodMetrics) bool) []PodMetrics
+}
+
+// StartMetricsLogger starts goroutines that 1) print metrics debug logs when the DEBUG log
+// level is enabled and 2) flush Prometheus metrics about the backend servers.
+func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) {
+	logger := log.FromContext(ctx)
+
+	// Periodically flush prometheus metrics for inference pool
+	go func() {
+		for {
+			select {
+			case <-ctx.Done():
+				logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
+				return
+			default:
+				time.Sleep(refreshPrometheusMetricsInterval)
+				flushPrometheusMetricsOnce(logger, datastore)
+			}
+		}
+	}()
+
+	// Periodically print out the pods and metrics for DEBUGGING.
+	if logger := logger.V(logutil.DEBUG); logger.Enabled() {
+		go func() {
+			for {
+				select {
+				case <-ctx.Done():
+					logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread")
+					return
+				default:
+					time.Sleep(5 * time.Second)
+					podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool {
+						return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod
+					})
+					podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool {
+						return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod
+					})
+					s := fmt.Sprintf("Current Pods and metrics gathered. Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics)
+					logger.Info(s)
+				}
+			}
+		}()
+	}
+}
+
+func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) {
+	pool, err := datastore.PoolGet()
+	if err != nil {
+		// No inference pool, or the pool is not initialized yet.
+		logger.V(logutil.VERBOSE).Info("pool is not initialized, skipping flushing metrics")
+		return
+	}
+
+	var kvCacheTotal float64
+	var queueTotal int
+
+	podMetrics := datastore.PodGetAll()
+	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	if len(podMetrics) == 0 {
+		return
+	}
+
+	for _, pod := range podMetrics {
+		kvCacheTotal += pod.GetMetrics().KVCacheUsagePercent
+		queueTotal += pod.GetMetrics().WaitingQueueSize
+	}
+
+	podTotalCount := len(podMetrics)
+	metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount))
+	metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal)/float64(podTotalCount))
+}
diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go
new file mode 100644
index 00000000..b954a98c
--- /dev/null
+++ b/pkg/epp/backend/metrics/pod_metrics.go
@@ -0,0 +1,134 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"time"
+	"unsafe"
+
+	"github.com/go-logr/logr"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/types"
+
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+const (
+	fetchMetricsTimeout = 5 * time.Second
+)
+
+type podMetrics struct {
+	pod      unsafe.Pointer // stores a *Pod
+	metrics  unsafe.Pointer // stores a *Metrics
+	pmc      PodMetricsClient
+	ds       Datastore
+	interval time.Duration
+
+	parentCtx context.Context
+	once      sync.Once // ensure the startRefreshLoop is only called once.
+	done      chan struct{}
+
+	logger logr.Logger
+}
+
+type PodMetricsClient interface {
+	FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error)
+}
+
+func (pm *podMetrics) String() string {
+	return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics())
+}
+
+func (pm *podMetrics) GetPod() *Pod {
+	return (*Pod)(atomic.LoadPointer(&pm.pod))
+}
+
+func (pm *podMetrics) GetMetrics() *Metrics {
+	return (*Metrics)(atomic.LoadPointer(&pm.metrics))
+}
+
+func (pm *podMetrics) UpdatePod(in *corev1.Pod) {
+	atomic.StorePointer(&pm.pod, unsafe.Pointer(toInternalPod(in)))
+}
+
+func toInternalPod(in *corev1.Pod) *Pod {
+	return &Pod{
+		NamespacedName: types.NamespacedName{
+			Name:      in.Name,
+			Namespace: in.Namespace,
+		},
+		Address: in.Status.PodIP,
+	}
+}
+
+// startRefreshLoop starts a goroutine exactly once to periodically update metrics. The goroutine will be
+// stopped either when StopRefreshLoop() is called, or the parentCtx is cancelled.
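+// Pod and metrics snapshots are kept behind atomic pointers, so readers such as
+// GetMetrics never block on the refresh goroutine.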
+func (pm *podMetrics) startRefreshLoop() {
+	pm.once.Do(func() {
+		go func() {
+			pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod())
+			for {
+				select {
+				case <-pm.done:
+					return
+				case <-pm.parentCtx.Done():
+					return
+				default:
+				}
+
+				err := pm.refreshMetrics()
+				if err != nil {
+					pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod())
+				}
+
+				time.Sleep(pm.interval)
+			}
+		}()
+	})
+}
+
+func (pm *podMetrics) refreshMetrics() error {
+	pool, err := pm.ds.PoolGet()
+	if err != nil {
+		// No inference pool, or the pool is not initialized yet.
+		return err
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout)
+	defer cancel()
+	updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), pool.Spec.TargetPortNumber)
+	if err != nil {
+		// As the refresher runs in the background, it's possible that the pod was deleted but
+		// the refresh goroutine hasn't read the done channel yet. In this case, we just return nil.
+		// The refresher will be stopped after this interval.
+		return nil
+	}
+	updated.UpdateTime = time.Now()
+
+	pm.logger.V(logutil.TRACE).Info("Refreshed metrics", "updated", updated)
+
+	atomic.StorePointer(&pm.metrics, unsafe.Pointer(updated))
+	return nil
+}
+
+func (pm *podMetrics) StopRefreshLoop() {
+	pm.logger.V(logutil.DEFAULT).Info("Stopping refresher", "pod", pm.GetPod())
+	close(pm.done)
+}
diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go
new file mode 100644
index 00000000..cf6698ca
--- /dev/null
+++ b/pkg/epp/backend/metrics/pod_metrics_test.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+package metrics
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	"github.com/stretchr/testify/assert"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+)
+
+var (
+	pod1 = &corev1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "pod1",
+			Namespace: "default",
+		},
+	}
+	initial = &Metrics{
+		WaitingQueueSize:    0,
+		KVCacheUsagePercent: 0.2,
+		MaxActiveModels:     2,
+		ActiveModels: map[string]int{
+			"foo": 1,
+			"bar": 1,
+		},
+	}
+	updated = &Metrics{
+		WaitingQueueSize:    9999,
+		KVCacheUsagePercent: 0.99,
+		MaxActiveModels:     99,
+		ActiveModels: map[string]int{
+			"foo": 1,
+			"bar": 1,
+		},
+	}
+)
+
+func TestMetricsRefresh(t *testing.T) {
+	ctx := context.Background()
+	pmc := &FakePodMetricsClient{}
+	pmf := NewPodMetricsFactory(pmc, time.Millisecond)
+
+	// The refresher is initialized with empty metrics.
+	pm := pmf.NewPodMetrics(ctx, pod1, &fakeDataStore{})
+
+	namespacedName := types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace}
+	// Use SetRes to simulate an update of metrics from the pod.
+	// Verify that the metrics are updated.
+ pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: initial}) + condition := func(collect *assert.CollectT) { + assert.True(collect, cmp.Equal(pm.GetMetrics(), initial, cmpopts.IgnoreFields(Metrics{}, "UpdateTime"))) + } + assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) + + // Stop the loop, and simulate metric update again, this time the PodMetrics won't get the + // new update. + pm.StopRefreshLoop() + pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: updated}) + // Still expect the same condition (no metrics update). + assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) +} + +type fakeDataStore struct{} + +func (f *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { + return &v1alpha2.InferencePool{Spec: v1alpha2.InferencePoolSpec{TargetPortNumber: 8000}}, nil +} +func (f *fakeDataStore) PodGetAll() []PodMetrics { + // Not implemented. + return nil +} +func (f *fakeDataStore) PodList(func(PodMetrics) bool) []PodMetrics { + // Not implemented. + return nil +} diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go new file mode 100644 index 00000000..fd600163 --- /dev/null +++ b/pkg/epp/backend/metrics/types.go @@ -0,0 +1,122 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package metrics is a library to interact with backend metrics. +package metrics + +import ( + "context" + "fmt" + "sync" + "time" + "unsafe" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval time.Duration) *PodMetricsFactory { + return &PodMetricsFactory{ + pmc: pmc, + refreshMetricsInterval: refreshMetricsInterval, + } +} + +type PodMetricsFactory struct { + pmc PodMetricsClient + refreshMetricsInterval time.Duration +} + +func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics { + pm := &podMetrics{ + pod: unsafe.Pointer(toInternalPod(in)), + metrics: unsafe.Pointer(newMetrics()), + pmc: f.pmc, + ds: ds, + interval: f.refreshMetricsInterval, + parentCtx: parentCtx, + once: sync.Once{}, + done: make(chan struct{}), + logger: log.FromContext(parentCtx), + } + pm.startRefreshLoop() + return pm +} + +type PodMetrics interface { + GetPod() *Pod + GetMetrics() *Metrics + UpdatePod(*corev1.Pod) + StopRefreshLoop() + String() string +} + +type Pod struct { + NamespacedName types.NamespacedName + Address string +} + +func (p *Pod) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("%+v", *p) +} + +type Metrics struct { + // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. + ActiveModels map[string]int + // MaxActiveModels is the maximum number of models that can be loaded to GPU. 
+ MaxActiveModels int + RunningQueueSize int + WaitingQueueSize int + KVCacheUsagePercent float64 + KvCacheMaxTokenCapacity int + + // UpdateTime record the last time when the metrics were updated. + UpdateTime time.Time +} + +func newMetrics() *Metrics { + return &Metrics{ + ActiveModels: make(map[string]int), + } +} + +func (m *Metrics) String() string { + if m == nil { + return "" + } + return fmt.Sprintf("%+v", *m) +} + +func (m *Metrics) Clone() *Metrics { + cm := make(map[string]int, len(m.ActiveModels)) + for k, v := range m.ActiveModels { + cm[k] = v + } + clone := &Metrics{ + ActiveModels: cm, + MaxActiveModels: m.MaxActiveModels, + RunningQueueSize: m.RunningQueueSize, + WaitingQueueSize: m.WaitingQueueSize, + KVCacheUsagePercent: m.KVCacheUsagePercent, + KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity, + UpdateTime: m.UpdateTime, + } + return clone +} diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go similarity index 55% rename from pkg/ext-proc/backend/vllm/metrics.go rename to pkg/epp/backend/vllm/metrics.go index 8800868a..8d2dd715 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // Package vllm provides vllm specific pod metrics implementation. package vllm @@ -9,17 +25,22 @@ import ( "strings" "time" + "github.com/go-logr/logr" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +// Metric names used in the vLLM metrics implementation. +// Refer to the protocol doc for more details: +// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. RunningQueueSizeMetricName = "vllm:num_requests_running" @@ -28,38 +49,41 @@ const ( RunningQueueSizeMetricName = "vllm:num_tokens_running" WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" */ - KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" - KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" + KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" ) -type PodMetricsClientImpl struct { -} +type PodMetricsClientImpl struct{} // FetchMetrics fetches metrics from a given pod. 
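+// It scrapes the model server's Prometheus endpoint (http://<pod-address>:<port>/metrics)
+// and folds the scraped families into a fresh copy of the existing metrics.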
func (p *PodMetricsClientImpl) FetchMetrics(
 	ctx context.Context,
-	pod backend.Pod,
-	existing *backend.PodMetrics,
-) (*backend.PodMetrics, error) {
+	pod *metrics.Pod,
+	existing *metrics.Metrics,
+	port int32,
+) (*metrics.Metrics, error) {
+	logger := log.FromContext(ctx).V(logutil.TRACE)
+
 	// Currently the metrics endpoint is hard-coded, which works with vLLM.
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config.
-	url := fmt.Sprintf("http://%s/metrics", pod.Address)
+	url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics"
+
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
 	if err != nil {
+		logger.Error(err, "Failed to create HTTP request", "method", http.MethodGet, "url", url)
 		return nil, fmt.Errorf("failed to create request: %v", err)
 	}
 	resp, err := http.DefaultClient.Do(req)
 	if err != nil {
-		klog.Errorf("failed to fetch metrics from %s: %v", pod, err)
-		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err)
+		logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName)
+		return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err)
 	}
 	defer func() {
 		_ = resp.Body.Close()
 	}()
 
 	if resp.StatusCode != http.StatusOK {
-		klog.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
-		return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode)
+		logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode)
+		return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode)
 	}
 
 	parser := expfmt.TextParser{}
@@ -67,35 +91,36 @@ func (p *PodMetricsClientImpl) FetchMetrics(
 	if err != nil {
 		return nil, err
 	}
-	return promToPodMetrics(metricFamilies, existing)
+	return promToPodMetrics(logger, metricFamilies, existing)
 }
 
 // promToPodMetrics updates internal pod metrics with scraped prometheus metrics.
 // A combined error is returned if errors occur while processing one or more metrics.
 // It returns a new Metrics pointer which can be used to atomically update the pod metrics map.
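+// Errors for individual metric families are accumulated with multierr, so a single
+// missing family does not prevent the remaining metrics from being updated.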
func promToPodMetrics( + logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, - existing *backend.PodMetrics, -) (*backend.PodMetrics, error) { + existing *metrics.Metrics, +) (*metrics.Metrics, error) { var errs error updated := existing.Clone() - runningQueueSize, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName) + runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName) errs = multierr.Append(errs, err) if err == nil { updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue()) } - waitingQueueSize, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName) + waitingQueueSize, err := getLatestMetric(logger, metricFamilies, WaitingQueueSizeMetricName) errs = multierr.Append(errs, err) if err == nil { updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue()) } - cachePercent, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName) + cachePercent, err := getLatestMetric(logger, metricFamilies, KVCacheUsagePercentMetricName) errs = multierr.Append(errs, err) if err == nil { updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() } - loraMetrics, _, err := getLatestLoraMetric(metricFamilies) + loraMetrics, _, err := getLatestLoraMetric(logger, metricFamilies) errs = multierr.Append(errs, err) /* TODO: uncomment once this is available in vllm. kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) @@ -116,6 +141,14 @@ func promToPodMetrics( } } } + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { if label.GetValue() != "" { updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) @@ -135,29 +168,57 @@ func promToPodMetrics( // reason its specially fetched is because each label key value pair permutation generates new series // and only most recent is useful. The value of each series is the creation timestamp so we can // retrieve the latest by sorting the value. -func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { +func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] if !ok { - klog.Warningf("metric family %q not found", LoraRequestInfoMetricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } - var latestTs float64 + var latest *dto.Metric + var latestTs float64 + + // Iterate over all metrics in the family. for _, m := range loraRequests.GetMetric() { + var running, waiting string + // Read the label values for running and waiting adapters. + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case LoraRequestInfoRunningAdaptersMetricName: + running = lp.GetValue() + case LoraRequestInfoWaitingAdaptersMetricName: + waiting = lp.GetValue() + } + } + + // Ignore metrics with both labels empty. This happens when there are no running or waiting requests on + // the server, in this case it is best to use the last set of active adapters. + if running == "" && waiting == "" { + continue + } + + // Select the metric with the latest creation timestamp. 
if m.GetGauge().GetValue() > latestTs { latestTs = m.GetGauge().GetValue() latest = m } } + + if latest == nil { + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName) + return nil, time.Time{}, nil + } + + // Convert the gauge value (creation timestamp) to time.Time. return latest, time.Unix(0, int64(latestTs*1000)), nil } // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric. // Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric. -func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { +func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { mf, ok := metricFamilies[metricName] if !ok { - klog.Warningf("metric family %q not found", metricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName) return nil, fmt.Errorf("metric family %q not found", metricName) } if len(mf.GetMetric()) == 0 { @@ -171,6 +232,6 @@ func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName str latest = m } } - klog.V(logutil.TRACE).Infof("Got metric value %+v for metric %v", latest, metricName) + logger.V(logutil.TRACE).Info("Metric value selected", "value", latest, "metric", metricName) return latest, nil } diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go similarity index 78% rename from pkg/ext-proc/backend/vllm/metrics_test.go rename to pkg/epp/backend/vllm/metrics_test.go index e3c1449d..5555bd26 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/vllm/metrics_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package vllm import ( @@ -7,16 +23,19 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestPromToPodMetrics(t *testing.T) { + logger := logutil.NewTestLogger() + testCases := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - expectedMetrics *backend.Metrics - expectedErr error - initialPodMetrics *backend.PodMetrics + name string + metricFamilies map[string]*dto.MetricFamily + initialMetrics *metrics.Metrics + expectedMetrics *metrics.Metrics + expectedErr error }{ { name: "all metrics available", @@ -104,7 +123,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &backend.Metrics{ + expectedMetrics: &metrics.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -114,8 +133,8 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 2, }, - initialPodMetrics: &backend.PodMetrics{}, - expectedErr: nil, + initialMetrics: &metrics.Metrics{}, + expectedErr: nil, }, { name: "invalid max lora", @@ -203,7 +222,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &backend.Metrics{ + expectedMetrics: &metrics.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -213,18 +232,18 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 0, }, - initialPodMetrics: &backend.PodMetrics{}, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + initialMetrics: &metrics.Metrics{}, + expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics) + updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialMetrics) if tc.expectedErr != nil { assert.Error(t, err) } else { assert.NoError(t, err) - assert.Equal(t, tc.expectedMetrics, &updated.Metrics) + assert.Equal(t, tc.expectedMetrics, updated) } }) } diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go new file mode 100644 index 00000000..a7f365b7 --- /dev/null +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -0,0 +1,130 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controller
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/record"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/event"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/predicate"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+type InferenceModelReconciler struct {
+	client.Client
+	Record             record.EventRecorder
+	Datastore          datastore.Datastore
+	PoolNamespacedName types.NamespacedName
+}
+
+func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	logger := log.FromContext(ctx).V(logutil.DEFAULT).WithValues("inferenceModel", req.NamespacedName)
+	ctx = ctrl.LoggerInto(ctx, logger)
+
+	logger.Info("Reconciling InferenceModel")
+
+	infModel := &v1alpha2.InferenceModel{}
+	notFound := false
+	if err := c.Get(ctx, req.NamespacedName, infModel); err != nil {
+		if !errors.IsNotFound(err) {
+			logger.Error(err, "Unable to get InferenceModel")
+			return ctrl.Result{}, err
+		}
+		notFound = true
+	}
+
+	if notFound || !infModel.DeletionTimestamp.IsZero() || infModel.Spec.PoolRef.Name != v1alpha2.ObjectName(c.PoolNamespacedName.Name) {
+		// The InferenceModel object was deleted or changed the referenced pool.
+		err := c.handleModelDeleted(ctx, req.NamespacedName)
+		return ctrl.Result{}, err
+	}
+
+	// Add the InferenceModel, or update it if this instance has a creation timestamp older than the existing entry for the model.
+	logger = logger.WithValues("poolRef", infModel.Spec.PoolRef).WithValues("modelName", infModel.Spec.ModelName)
+	if !c.Datastore.ModelSetIfOlder(infModel) {
+		logger.Info("Skipping InferenceModel, existing instance has older creation timestamp")
+	} else {
+		logger.Info("Added/Updated InferenceModel")
+	}
+
+	return ctrl.Result{}, nil
+}
+
+func (c *InferenceModelReconciler) handleModelDeleted(ctx context.Context, req types.NamespacedName) error {
+	logger := log.FromContext(ctx)
+
+	// We look up and delete the modelName associated with this object, then search for
+	// other instances referencing the same modelName, if any exist, and store the oldest
+	// in its place. This ensures that the InferenceModel with the oldest creation
+	// timestamp is active.
+	existing := c.Datastore.ModelDelete(req)
+	if existing == nil {
+		// No entry exists in the first place, nothing to do.
+		return nil
+	}
+	logger.Info("InferenceModel removed from datastore", "poolRef", existing.Spec.PoolRef, "modelName", existing.Spec.ModelName)
+
+	// TODO(#409): replace this backfill logic with one that is based on InferenceModel Ready conditions once those are set by an external controller.
+	updated, err := c.Datastore.ModelResync(ctx, c.Client, existing.Spec.ModelName)
+	if err != nil {
+		return err
+	}
+	if updated {
+		logger.Info("Model replaced.", "modelName", existing.Spec.ModelName)
+	}
+	return nil
+}
+
+func indexInferenceModelsByModelName(obj client.Object) []string {
+	m, ok := obj.(*v1alpha2.InferenceModel)
+	if !ok {
+		return nil
+	}
+	return []string{m.Spec.ModelName}
+}
+
+func (c *InferenceModelReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
+	// Create an index on ModelName for InferenceModel objects.
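+	// The index lets ModelResync find all InferenceModels that share a ModelName with
+	// client.MatchingFields{ModelNameIndexKey: modelName} instead of listing every
+	// object in the namespace and filtering in memory.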
+ indexer := mgr.GetFieldIndexer() + if err := indexer.IndexField(ctx, &v1alpha2.InferenceModel{}, datastore.ModelNameIndexKey, indexInferenceModelsByModelName); err != nil { + return fmt.Errorf("setting index on ModelName for InferenceModel: %w", err) + } + return ctrl.NewControllerManagedBy(mgr). + For(&v1alpha2.InferenceModel{}). + WithEventFilter(predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + UpdateFunc: func(e event.UpdateEvent) bool { + return c.eventPredicate(e.ObjectOld.(*v1alpha2.InferenceModel)) || c.eventPredicate(e.ObjectNew.(*v1alpha2.InferenceModel)) + }, + DeleteFunc: func(e event.DeleteEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + GenericFunc: func(e event.GenericEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + }). + Complete(c) +} + +func (c *InferenceModelReconciler) eventPredicate(infModel *v1alpha2.InferenceModel) bool { + return string(infModel.Spec.PoolRef.Name) == c.PoolNamespacedName.Name +} diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go new file mode 100644 index 00000000..cd1ff1fb --- /dev/null +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -0,0 +1,230 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" +) + +var ( + pool = utiltest.MakeInferencePool("test-pool1").Namespace("ns1").ObjRef() + infModel1 = utiltest.MakeInferenceModel("model1"). + Namespace(pool.Namespace). + ModelName("fake model1"). + Criticality(v1alpha2.Standard). + CreationTimestamp(metav1.Unix(1000, 0)). + PoolName(pool.Name).ObjRef() + infModel1Pool2 = utiltest.MakeInferenceModel(infModel1.Name). + Namespace(infModel1.Namespace). + ModelName(infModel1.Spec.ModelName). + Criticality(*infModel1.Spec.Criticality). + CreationTimestamp(metav1.Unix(1001, 0)). + PoolName("test-pool2").ObjRef() + infModel1NS2 = utiltest.MakeInferenceModel(infModel1.Name). + Namespace("ns2"). + ModelName(infModel1.Spec.ModelName). + Criticality(*infModel1.Spec.Criticality). + CreationTimestamp(metav1.Unix(1002, 0)). + PoolName(pool.Name).ObjRef() + infModel1Critical = utiltest.MakeInferenceModel(infModel1.Name). + Namespace(infModel1.Namespace). + ModelName(infModel1.Spec.ModelName). 
+			Criticality(v1alpha2.Critical).
+			CreationTimestamp(metav1.Unix(1003, 0)).
+			PoolName(pool.Name).ObjRef()
+	infModel1Deleted = utiltest.MakeInferenceModel(infModel1.Name).
+				Namespace(infModel1.Namespace).
+				ModelName(infModel1.Spec.ModelName).
+				CreationTimestamp(metav1.Unix(1004, 0)).
+				DeletionTimestamp().
+				PoolName(pool.Name).ObjRef()
+	// Same ModelName, different object with newer creation timestamp
+	infModel1Newer = utiltest.MakeInferenceModel("model1-newer").
+			Namespace(pool.Namespace).
+			ModelName("fake model1").
+			Criticality(v1alpha2.Standard).
+			CreationTimestamp(metav1.Unix(1005, 0)).
+			PoolName(pool.Name).ObjRef()
+	// Same ModelName, different object with older creation timestamp
+	infModel1Older = utiltest.MakeInferenceModel("model1-older").
+			Namespace(pool.Namespace).
+			ModelName("fake model1").
+			Criticality(v1alpha2.Standard).
+			CreationTimestamp(metav1.Unix(999, 0)).
+			PoolName(pool.Name).ObjRef()
+
+	infModel2 = utiltest.MakeInferenceModel("model2").
+			Namespace(pool.Namespace).
+			ModelName("fake model2").
+			CreationTimestamp(metav1.Unix(1000, 0)).
+			PoolName(pool.Name).ObjRef()
+)
+
+func TestInferenceModelReconciler(t *testing.T) {
+	tests := []struct {
+		name              string
+		modelsInStore     []*v1alpha2.InferenceModel
+		modelsInAPIServer []*v1alpha2.InferenceModel
+		model             *v1alpha2.InferenceModel
+		incomingReq       *types.NamespacedName
+		wantModels        []*v1alpha2.InferenceModel
+		wantResult        ctrl.Result
+	}{
+		{
+			name:       "Empty store, add new model",
+			model:      infModel1,
+			wantModels: []*v1alpha2.InferenceModel{infModel1},
+		},
+		{
+			name:          "Existing model changed pools",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel1Pool2,
+			wantModels:    []*v1alpha2.InferenceModel{},
+		},
+		{
+			name:          "Not found, delete existing model",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			incomingReq:   &types.NamespacedName{Name: infModel1.Name, Namespace: infModel1.Namespace},
+			wantModels:    []*v1alpha2.InferenceModel{},
+		},
+		{
+			name:          "Deletion timestamp set, delete existing model",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel1Deleted,
+			wantModels:    []*v1alpha2.InferenceModel{},
+		},
+		{
+			name:          "Model referencing a pool with the same name in a different namespace",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel1NS2,
+			wantModels:    []*v1alpha2.InferenceModel{infModel1},
+		},
+		{
+			name:              "Existing model changed pools, replaced with another",
+			modelsInStore:     []*v1alpha2.InferenceModel{infModel1},
+			model:             infModel1Pool2,
+			modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer},
+			wantModels:        []*v1alpha2.InferenceModel{infModel1Newer},
+		},
+		{
+			name:              "Not found, delete existing model, replaced with another",
+			modelsInStore:     []*v1alpha2.InferenceModel{infModel1},
+			incomingReq:       &types.NamespacedName{Name: infModel1.Name, Namespace: infModel1.Namespace},
+			modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer},
+			wantModels:        []*v1alpha2.InferenceModel{infModel1Newer},
+		},
+		{
+			name:              "Deletion timestamp set, delete existing model, replaced with another",
+			modelsInStore:     []*v1alpha2.InferenceModel{infModel1},
+			model:             infModel1Deleted,
+			modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer},
+			wantModels:        []*v1alpha2.InferenceModel{infModel1Newer},
+		},
+		{
+			name:          "Older instance of the model observed",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel1Older,
+			wantModels:    []*v1alpha2.InferenceModel{infModel1Older},
+		},
+		{
+			name:          "Model changed criticality",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel1Critical,
+			wantModels:    []*v1alpha2.InferenceModel{infModel1Critical},
+		},
+		{
+			name:          "Model not found, no matching existing model to delete",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			incomingReq:   &types.NamespacedName{Name: "non-existent-model", Namespace: pool.Namespace},
+			wantModels:    []*v1alpha2.InferenceModel{infModel1},
+		},
+		{
+			name:          "Add to existing",
+			modelsInStore: []*v1alpha2.InferenceModel{infModel1},
+			model:         infModel2,
+			wantModels:    []*v1alpha2.InferenceModel{infModel1, infModel2},
+		},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Create a fake client pre-loaded with the test objects.
+			scheme := runtime.NewScheme()
+			_ = v1alpha2.AddToScheme(scheme)
+			initObjs := []client.Object{}
+			if test.model != nil {
+				initObjs = append(initObjs, test.model)
+			}
+			for _, m := range test.modelsInAPIServer {
+				initObjs = append(initObjs, m)
+			}
+			fakeClient := fake.NewClientBuilder().
+				WithScheme(scheme).
+				WithObjects(initObjs...).
+				WithIndex(&v1alpha2.InferenceModel{}, datastore.ModelNameIndexKey, indexInferenceModelsByModelName).
+				Build()
+			pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
+			ds := datastore.NewDatastore(t.Context(), pmf)
+			for _, m := range test.modelsInStore {
+				ds.ModelSetIfOlder(m)
+			}
+			ds.PoolSet(pool)
+			reconciler := &InferenceModelReconciler{
+				Client:             fakeClient,
+				Record:             record.NewFakeRecorder(10),
+				Datastore:          ds,
+				PoolNamespacedName: types.NamespacedName{Name: pool.Name, Namespace: pool.Namespace},
+			}
+			if test.incomingReq == nil {
+				test.incomingReq = &types.NamespacedName{Name: test.model.Name, Namespace: test.model.Namespace}
+			}
+
+			// Call Reconcile.
+			result, err := reconciler.Reconcile(context.Background(), ctrl.Request{NamespacedName: *test.incomingReq})
+			if err != nil {
+				t.Fatalf("Unexpected reconcile error: %v", err)
+			}
+
+			if diff := cmp.Diff(result, test.wantResult); diff != "" {
+				t.Errorf("Unexpected result diff (+got/-want): %s", diff)
+			}
+
+			if len(test.wantModels) != len(ds.ModelGetAll()) {
+				t.Errorf("Unexpected number of models; want: %d, got: %d", len(test.wantModels), len(ds.ModelGetAll()))
+			}
+
+			if diff := diffStore(ds, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" {
+				t.Errorf("Unexpected diff (+got/-want): %s", diff)
+			}
+		})
+	}
+}
diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go
new file mode 100644
index 00000000..c92d4ecc
--- /dev/null
+++ b/pkg/epp/controller/inferencepool_reconciler.go
@@ -0,0 +1,91 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package controller + +import ( + "context" + "reflect" + + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources +// This implementation is just used for reading & maintaining data sync. The Gateway implementation +// will have the proper controller that will create/manage objects on behalf of the server pool. +type InferencePoolReconciler struct { + client.Client + Record record.EventRecorder + PoolNamespacedName types.NamespacedName + Datastore datastore.Datastore +} + +func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx).WithValues("inferencePool", req.NamespacedName).V(logutil.DEFAULT) + ctx = ctrl.LoggerInto(ctx, logger) + + logger.Info("Reconciling InferencePool") + + infPool := &v1alpha2.InferencePool{} + + if err := c.Get(ctx, req.NamespacedName, infPool); err != nil { + if errors.IsNotFound(err) { + logger.Info("InferencePool not found. Clearing the datastore") + c.Datastore.Clear() + return ctrl.Result{}, nil + } + logger.Error(err, "Unable to get InferencePool") + return ctrl.Result{}, err + } else if !infPool.DeletionTimestamp.IsZero() { + logger.Info("InferencePool is marked for deletion. Clearing the datastore") + c.Datastore.Clear() + return ctrl.Result{}, nil + } + + c.updateDatastore(ctx, infPool) + + return ctrl.Result{}, nil +} + +func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha2.InferencePool) { + logger := log.FromContext(ctx) + oldPool, err := c.Datastore.PoolGet() + c.Datastore.PoolSet(newPool) + if err != nil || !reflect.DeepEqual(newPool.Spec.Selector, oldPool.Spec.Selector) { + logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", newPool.Spec.Selector) + // A full resync is required to address two cases: + // 1) At startup, the pod events may get processed before the pool is synced with the datastore, + // and hence they will not be added to the store since pool selector is not known yet + // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need + // to resync the whole pool: remove pods in the store that don't match the new selector and add + // the ones that may have existed already to the store. + c.Datastore.PodResyncAll(ctx, c.Client, newPool) + } +} + +func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&v1alpha2.InferencePool{}). + Complete(c) +} diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go new file mode 100644 index 00000000..27c4238e --- /dev/null +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -0,0 +1,188 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/google/go-cmp/cmp/cmpopts"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing"
+)
+
+var (
+	selector_v1 = map[string]string{"app": "vllm_v1"}
+	selector_v2 = map[string]string{"app": "vllm_v2"}
+	pool1       = utiltest.MakeInferencePool("pool1").
+			Namespace("pool1-ns").
+			Selector(selector_v1).
+			TargetPortNumber(8080).ObjRef()
+	pool2 = utiltest.MakeInferencePool("pool2").Namespace("pool2-ns").ObjRef()
+	pods  = []*corev1.Pod{
+		// Two ready pods matching pool1
+		utiltest.MakePod("pod1").
+			Namespace("pool1-ns").
+			Labels(selector_v1).ReadyCondition().ObjRef(),
+		utiltest.MakePod("pod2").
+			Namespace("pool1-ns").
+			Labels(selector_v1).
+			ReadyCondition().ObjRef(),
+		// A not ready pod matching pool1
+		utiltest.MakePod("pod3").
+			Namespace("pool1-ns").
+			Labels(selector_v1).ObjRef(),
+		// A pod not matching pool1 namespace
+		utiltest.MakePod("pod4").
+			Namespace("pool2-ns").
+			Labels(selector_v1).
+			ReadyCondition().ObjRef(),
+		// A ready pod matching pool1 with a new selector
+		utiltest.MakePod("pod5").
+			Namespace("pool1-ns").
+			Labels(selector_v2).
+			ReadyCondition().ObjRef(),
+	}
+)
+
+func TestInferencePoolReconciler(t *testing.T) {
+	// The best practice is to use table-driven tests; however, in this scenario it seems
+	// more logical to run a single test with steps that depend on each other.
+
+	// Set up the scheme.
+	scheme := runtime.NewScheme()
+	_ = clientgoscheme.AddToScheme(scheme)
+	_ = v1alpha2.AddToScheme(scheme)
+
+	// Create a fake client with the pool and the pods.
+	initialObjects := []client.Object{pool1, pool2}
+	for i := range pods {
+		initialObjects = append(initialObjects, pods[i])
+	}
+	fakeClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(initialObjects...).
+		Build()
+
+	// Create a request for the existing resource.
+	namespacedName := types.NamespacedName{Name: pool1.Name, Namespace: pool1.Namespace}
+	req := ctrl.Request{NamespacedName: namespacedName}
+	ctx := context.Background()
+
+	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
+	datastore := datastore.NewDatastore(ctx, pmf)
+	inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore}
+
+	// Step 1: Inception, only ready pods matching pool1 are added to the store.
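+	// Of the five pods above, pod3 is filtered out because it is not ready and pod4
+	// because it is in pool2-ns, so only pod1 and pod2 should land in the datastore.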
+	if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil {
+		t.Errorf("Unexpected InferencePool reconcile error: %v", err)
+	}
+	if diff := diffStore(datastore, diffStoreParams{wantPool: pool1, wantPods: []string{"pod1", "pod2"}}); diff != "" {
+		t.Errorf("Unexpected diff (+got/-want): %s", diff)
+	}
+
+	// Step 2: update the pool selector; only pod5 matches the new selector.
+	newPool1 := &v1alpha2.InferencePool{}
+	if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil {
+		t.Errorf("Unexpected pool get error: %v", err)
+	}
+	newPool1.Spec.Selector = map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm_v2"}
+	if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil {
+		t.Errorf("Unexpected pool update error: %v", err)
+	}
+
+	if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil {
+		t.Errorf("Unexpected InferencePool reconcile error: %v", err)
+	}
+	if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" {
+		t.Errorf("Unexpected diff (+got/-want): %s", diff)
+	}
+
+	// Step 3: update the pool port
+	if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil {
+		t.Errorf("Unexpected pool get error: %v", err)
+	}
+	newPool1.Spec.TargetPortNumber = 9090
+	if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil {
+		t.Errorf("Unexpected pool update error: %v", err)
+	}
+	if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil {
+		t.Errorf("Unexpected InferencePool reconcile error: %v", err)
+	}
+	if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" {
+		t.Errorf("Unexpected diff (+got/-want): %s", diff)
+	}
+
+	// Step 4: delete the pool to trigger a datastore clear
+	if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil {
+		t.Errorf("Unexpected pool get error: %v", err)
+	}
+	if err := fakeClient.Delete(ctx, newPool1, &client.DeleteOptions{}); err != nil {
+		t.Errorf("Unexpected pool delete error: %v", err)
+	}
+	if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil {
+		t.Errorf("Unexpected InferencePool reconcile error: %v", err)
+	}
+	if diff := diffStore(datastore, diffStoreParams{wantPods: []string{}}); diff != "" {
+		t.Errorf("Unexpected diff (+got/-want): %s", diff)
+	}
+}
+
+type diffStoreParams struct {
+	wantPool   *v1alpha2.InferencePool
+	wantPods   []string
+	wantModels []*v1alpha2.InferenceModel
+}
+
+func diffStore(datastore datastore.Datastore, params diffStoreParams) string {
+	gotPool, _ := datastore.PoolGet()
+	if diff := cmp.Diff(params.wantPool, gotPool); diff != "" {
+		return "pool:" + diff
+	}
+
+	// Default wantPods if not set because PodGetAll returns an empty slice when empty.
+	if params.wantPods == nil {
+		params.wantPods = []string{}
+	}
+	gotPods := []string{}
+	for _, pm := range datastore.PodGetAll() {
+		gotPods = append(gotPods, pm.GetPod().NamespacedName.Name)
+	}
+	if diff := cmp.Diff(params.wantPods, gotPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })); diff != "" {
+		return "pods:" + diff
+	}
+
+	// Default wantModels if not set because ModelGetAll returns an empty slice when empty.
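+	// As with wantPods above, defaulting keeps cmp.Diff comparing two empty slices
+	// instead of a nil slice against an empty one.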
+ if params.wantModels == nil { + params.wantModels = []*v1alpha2.InferenceModel{} + } + gotModels := datastore.ModelGetAll() + if diff := utiltest.DiffModelLists(params.wantModels, gotModels); diff != "" { + return "models:" + diff + } + return "" +} diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go new file mode 100644 index 00000000..046561e4 --- /dev/null +++ b/pkg/epp/controller/pod_reconciler.go @@ -0,0 +1,96 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +type PodReconciler struct { + client.Client + Datastore datastore.Datastore + Record record.EventRecorder +} + +func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) + pool, err := c.Datastore.PoolGet() + if err != nil { + logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet") + // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. + return ctrl.Result{}, nil + } + + logger.V(logutil.VERBOSE).Info("Pod being reconciled", "name", req.NamespacedName) + + pod := &corev1.Pod{} + if err := c.Get(ctx, req.NamespacedName, pod); err != nil { + if apierrors.IsNotFound(err) { + c.Datastore.PodDelete(req.NamespacedName) + return ctrl.Result{}, nil + } + logger.V(logutil.DEFAULT).Error(err, "Unable to get pod", "name", req.NamespacedName) + return ctrl.Result{}, err + } + + c.updateDatastore(logger, pod, pool) + return ctrl.Result{}, nil +} + +func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Pod{}). 
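+		// Note: this watches every Pod in the cluster; filtering against the pool's
+		// selector happens at reconcile time via PoolLabelsMatch in updateDatastore.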
+ Complete(c) +} + +func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, pool *v1alpha2.InferencePool) { + namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} + if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { + logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) + c.Datastore.PodDelete(namespacedName) + } else { + if c.Datastore.PodUpdateOrAddIfNotExist(pod, pool) { + logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) + } else { + logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) + } + } +} + +func podIsReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + if condition.Status == corev1.ConditionTrue { + return true + } + break + } + } + return false +} diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go new file mode 100644 index 00000000..e4cb0b62 --- /dev/null +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -0,0 +1,209 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" +) + +var ( + basePod1 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{PodIP: "address-1"}} + basePod2 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod2"}, Status: corev1.PodStatus{PodIP: "address-2"}} + basePod3 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod3"}, Status: corev1.PodStatus{PodIP: "address-3"}} + basePod11 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{PodIP: "address-11"}} + pmc = &backendmetrics.FakePodMetricsClient{} + pmf = backendmetrics.NewPodMetricsFactory(pmc, time.Second) +) + +func TestPodReconciler(t *testing.T) { + tests := []struct { + name string + pool *v1alpha2.InferencePool + existingPods []*corev1.Pod + incomingPod *corev1.Pod + wantPods []*corev1.Pod + req *ctrl.Request + }{ + { + name: "Add new pod", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": 
"some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod3). + Labels(map[string]string{"some-key": "some-val"}). + ReadyCondition().ObjRef(), + wantPods: []*corev1.Pod{basePod1, basePod2, basePod3}, + }, + { + name: "Update pod1 address", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod11). + Labels(map[string]string{"some-key": "some-val"}). + ReadyCondition().ObjRef(), + wantPods: []*corev1.Pod{basePod11, basePod2}, + }, + { + name: "Delete pod with DeletionTimestamp", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod1). + Labels(map[string]string{"some-key": "some-val"}). + DeletionTimestamp(). + ReadyCondition().ObjRef(), + wantPods: []*corev1.Pod{basePod2}, + }, + { + name: "Delete notfound pod", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + req: &ctrl.Request{NamespacedName: types.NamespacedName{Name: "pod1"}}, + wantPods: []*corev1.Pod{basePod2}, + }, + { + name: "New pod, not ready, valid selector", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod3). + Labels(map[string]string{"some-key": "some-val"}).ObjRef(), + wantPods: []*corev1.Pod{basePod1, basePod2}, + }, + { + name: "Remove pod that does not match selector", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod1). + Labels(map[string]string{"some-wrong-key": "some-val"}). + ReadyCondition().ObjRef(), + wantPods: []*corev1.Pod{basePod2}, + }, + { + name: "Remove pod that is not ready", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ + "some-key": "some-val", + }, + }, + }, + incomingPod: utiltest.FromBase(basePod1). + Labels(map[string]string{"some-wrong-key": "some-val"}). + ReadyCondition().ObjRef(), + wantPods: []*corev1.Pod{basePod2}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + initialObjects := []client.Object{} + if test.incomingPod != nil { + initialObjects = append(initialObjects, test.incomingPod) + } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initialObjects...). + Build() + + // Configure the initial state of the datastore. 
+ store := datastore.NewDatastore(t.Context(), pmf) + store.PoolSet(test.pool) + for _, pod := range test.existingPods { + store.PodUpdateOrAddIfNotExist(pod, pool) + } + + podReconciler := &PodReconciler{Client: fakeClient, Datastore: store} + if test.req == nil { + namespacedName := types.NamespacedName{Name: test.incomingPod.Name, Namespace: test.incomingPod.Namespace} + test.req = &ctrl.Request{NamespacedName: namespacedName} + } + if _, err := podReconciler.Reconcile(context.Background(), *test.req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + + var gotPods []*corev1.Pod + for _, pm := range store.PodGetAll() { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}} + gotPods = append(gotPods, pod) + } + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) { + t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) + } + }) + } +} diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go new file mode 100644 index 00000000..af31da42 --- /dev/null +++ b/pkg/epp/datastore/datastore.go @@ -0,0 +1,354 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package datastore + +import ( + "context" + "errors" + "fmt" + "math/rand" + "sync" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + ModelNameIndexKey = "spec.modelName" +) + +var ( + errPoolNotSynced = errors.New("InferencePool is not initialized in data store") +) + +// The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) +type Datastore interface { + // InferencePool operations + PoolSet(pool *v1alpha2.InferencePool) + PoolGet() (*v1alpha2.InferencePool, error) + PoolHasSynced() bool + PoolLabelsMatch(podLabels map[string]string) bool + + // InferenceModel operations + ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool + ModelGet(modelName string) *v1alpha2.InferenceModel + ModelDelete(namespacedName types.NamespacedName) *v1alpha2.InferenceModel + ModelResync(ctx context.Context, ctrlClient client.Client, modelName string) (bool, error) + ModelGetAll() []*v1alpha2.InferenceModel + + // PodMetrics operations + // PodGetAll returns all pods and metrics, including fresh and stale. + PodGetAll() []backendmetrics.PodMetrics + // PodList lists pods matching the given predicate. 
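+	// An illustrative (hypothetical) call site, filtering on a metrics field:
+	//
+	//	backlogged := ds.PodList(func(pm backendmetrics.PodMetrics) bool {
+	//		return pm.GetMetrics().WaitingQueueSize > 0
+	//	})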
+	PodList(func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics
+	PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool
+	PodDelete(namespacedName types.NamespacedName)
+	PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool)
+
+	// Clear resets the store state; this happens when the pool gets deleted.
+	Clear()
+}
+
+func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) *datastore {
+	store := &datastore{
+		parentCtx:       parentCtx,
+		poolAndModelsMu: sync.RWMutex{},
+		models:          make(map[string]*v1alpha2.InferenceModel),
+		pods:            &sync.Map{},
+		pmf:             pmf,
+	}
+	return store
+}
+
+type datastore struct {
+	// parentCtx controls the lifecycle of the background metrics goroutines spawned by the datastore.
+	parentCtx context.Context
+	// poolAndModelsMu is used to synchronize access to the pool and the models map.
+	poolAndModelsMu sync.RWMutex
+	pool            *v1alpha2.InferencePool
+	// key: InferenceModel.Spec.ModelName, value: *InferenceModel
+	models map[string]*v1alpha2.InferenceModel
+	// key: types.NamespacedName, value: backendmetrics.PodMetrics
+	pods *sync.Map
+	pmf  *backendmetrics.PodMetricsFactory
+}
+
+func (ds *datastore) Clear() {
+	ds.poolAndModelsMu.Lock()
+	defer ds.poolAndModelsMu.Unlock()
+	ds.pool = nil
+	ds.models = make(map[string]*v1alpha2.InferenceModel)
+	ds.pods.Clear()
+}
+
+// /// InferencePool APIs ///
+func (ds *datastore) PoolSet(pool *v1alpha2.InferencePool) {
+	ds.poolAndModelsMu.Lock()
+	defer ds.poolAndModelsMu.Unlock()
+	ds.pool = pool
+}
+
+func (ds *datastore) PoolGet() (*v1alpha2.InferencePool, error) {
+	ds.poolAndModelsMu.RLock()
+	defer ds.poolAndModelsMu.RUnlock()
+	if !ds.PoolHasSynced() {
+		return nil, errPoolNotSynced
+	}
+	return ds.pool, nil
+}
+
+func (ds *datastore) PoolHasSynced() bool {
+	ds.poolAndModelsMu.RLock()
+	defer ds.poolAndModelsMu.RUnlock()
+	return ds.pool != nil
+}
+
+func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool {
+	ds.poolAndModelsMu.RLock()
+	defer ds.poolAndModelsMu.RUnlock()
+	poolSelector := selectorFromInferencePoolSelector(ds.pool.Spec.Selector)
+	podSet := labels.Set(podLabels)
+	return poolSelector.Matches(podSet)
+}
+
+func (ds *datastore) ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool {
+	ds.poolAndModelsMu.Lock()
+	defer ds.poolAndModelsMu.Unlock()
+
+	// Check first if the existing model is older.
+	// One exception is if the incoming model object is the same one, in which case we should not
+	// compare creation timestamps: the object was re-created, and so we should override.
+	existing, exists := ds.models[infModel.Spec.ModelName]
+	if exists {
+		diffObj := infModel.Name != existing.Name || infModel.Namespace != existing.Namespace
+		if diffObj && existing.ObjectMeta.CreationTimestamp.Before(&infModel.ObjectMeta.CreationTimestamp) {
+			return false
+		}
+	}
+	// Set the model.
+	ds.models[infModel.Spec.ModelName] = infModel
+	return true
+}
+
+func (ds *datastore) ModelResync(ctx context.Context, c client.Client, modelName string) (bool, error) {
+	ds.poolAndModelsMu.Lock()
+	defer ds.poolAndModelsMu.Unlock()
+
+	var models v1alpha2.InferenceModelList
+	if err := c.List(ctx, &models, client.MatchingFields{ModelNameIndexKey: modelName}, client.InNamespace(ds.pool.Namespace)); err != nil {
+		return false, fmt.Errorf("listing models that match the modelName %s: %w", modelName, err)
+	}
+	if len(models.Items) == 0 {
+		// No other instances of InferenceModels with this ModelName exist.
+		return false, nil
+	}
+
+	var oldest *v1alpha2.InferenceModel
+	for i := range models.Items {
+		m := &models.Items[i]
+		if m.Spec.ModelName != modelName || // The index should filter those out, but just in case!
+			m.Spec.PoolRef.Name != v1alpha2.ObjectName(ds.pool.Name) || // We don't care about other pools; we could set up an index on this too!
+			!m.DeletionTimestamp.IsZero() { // Ignore objects marked for deletion.
+			continue
+		}
+		if oldest == nil || m.ObjectMeta.CreationTimestamp.Before(&oldest.ObjectMeta.CreationTimestamp) {
+			oldest = m
+		}
+	}
+	if oldest == nil {
+		return false, nil
+	}
+	ds.models[modelName] = oldest
+	return true, nil
+}
+
+func (ds *datastore) ModelGet(modelName string) *v1alpha2.InferenceModel {
+	ds.poolAndModelsMu.RLock()
+	defer ds.poolAndModelsMu.RUnlock()
+	return ds.models[modelName]
+}
+
+func (ds *datastore) ModelDelete(namespacedName types.NamespacedName) *v1alpha2.InferenceModel {
+	ds.poolAndModelsMu.Lock()
+	defer ds.poolAndModelsMu.Unlock()
+	for _, m := range ds.models {
+		if m.Name == namespacedName.Name && m.Namespace == namespacedName.Namespace {
+			delete(ds.models, m.Spec.ModelName)
+			return m
+		}
+	}
+	return nil
+}
+
+func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel {
+	ds.poolAndModelsMu.RLock()
+	defer ds.poolAndModelsMu.RUnlock()
+	res := []*v1alpha2.InferenceModel{}
+	for _, v := range ds.models {
+		res = append(res, v)
+	}
+	return res
+}
+
+// /// Pods/endpoints APIs ///
+
+func (ds *datastore) PodGetAll() []backendmetrics.PodMetrics {
+	return ds.PodList(func(backendmetrics.PodMetrics) bool { return true })
+}
+
+func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics {
+	res := []backendmetrics.PodMetrics{}
+	fn := func(k, v any) bool {
+		pm := v.(backendmetrics.PodMetrics)
+		if predicate(pm) {
+			res = append(res, pm)
+		}
+		return true
+	}
+	ds.pods.Range(fn)
+	return res
+}
+
+func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool {
+	namespacedName := types.NamespacedName{
+		Name:      pod.Name,
+		Namespace: pod.Namespace,
+	}
+	var pm backendmetrics.PodMetrics
+	existing, ok := ds.pods.Load(namespacedName)
+	if !ok {
+		pm = ds.pmf.NewPodMetrics(ds.parentCtx, pod, ds)
+		ds.pods.Store(namespacedName, pm)
+	} else {
+		pm = existing.(backendmetrics.PodMetrics)
+	}
+	// Update pod properties if anything changed.
+	pm.UpdatePod(pod)
+	// Report whether the pod was newly added, matching how callers log "Pod added".
+	return !ok
+}
+
+func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) {
+	logger := log.FromContext(ctx)
+	podList := &corev1.PodList{}
+	if err := ctrlClient.List(ctx, podList, &client.ListOptions{
+		LabelSelector: selectorFromInferencePoolSelector(pool.Spec.Selector),
+		Namespace:     pool.Namespace,
+	}); err != nil {
+		logger.V(logutil.DEFAULT).Error(err, "Failed to list pods")
+		return
+	}
+
+	activePods := make(map[string]bool)
+	for _, pod := range podList.Items {
+		if podIsReady(&pod) {
+			namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}
+			activePods[pod.Name] = true
+			if ds.PodUpdateOrAddIfNotExist(&pod, pool) {
+				logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName)
+			} else {
+				logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName)
+			}
+		}
+	}
+
+	// Remove pods that no longer belong to the pool or are no longer ready.
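+	// Note that activePods is keyed by pod name alone; that is safe here because the
+	// List call above is already scoped to the pool's namespace.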
+	deleteFn := func(k, v any) bool {
+		pm := v.(backendmetrics.PodMetrics)
+		if exist := activePods[pm.GetPod().NamespacedName.Name]; !exist {
+			logger.V(logutil.VERBOSE).Info("Removing pod", "pod", pm.GetPod())
+			ds.PodDelete(pm.GetPod().NamespacedName)
+		}
+		return true
+	}
+	ds.pods.Range(deleteFn)
+}
+
+func (ds *datastore) PodDelete(namespacedName types.NamespacedName) {
+	v, ok := ds.pods.LoadAndDelete(namespacedName)
+	if ok {
+		pmr := v.(backendmetrics.PodMetrics)
+		pmr.StopRefreshLoop()
+	}
+}
+
+func selectorFromInferencePoolSelector(selector map[v1alpha2.LabelKey]v1alpha2.LabelValue) labels.Selector {
+	return labels.SelectorFromSet(stripLabelKeyAliasFromLabelMap(selector))
+}
+
+func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelValue) map[string]string {
+	outMap := make(map[string]string)
+	for k, v := range labels {
+		outMap[string(k)] = string(v)
+	}
+	return outMap
+}
+
+func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string {
+	source := rand.NewSource(rand.Int63())
+	if seed > 0 {
+		source = rand.NewSource(seed)
+	}
+	r := rand.New(source)
+
+	// If weights are unset (checked via the first target; weights are expected to be
+	// either all set or all unset), return a random model name.
+	if model.Spec.TargetModels[0].Weight == nil {
+		index := r.Int31n(int32(len(model.Spec.TargetModels)))
+		return model.Spec.TargetModels[index].Name
+	}
+
+	var weights int32
+	for _, model := range model.Spec.TargetModels {
+		weights += *model.Weight
+	}
+	logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights)
+	randomVal := r.Int31n(weights)
+	// TODO: optimize this without using a loop.
+	for _, model := range model.Spec.TargetModels {
+		if randomVal < *model.Weight {
+			return model.Name
+		}
+		randomVal -= *model.Weight
+	}
+	return ""
+}
+
+func IsCritical(model *v1alpha2.InferenceModel) bool {
+	if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical {
+		return true
+	}
+	return false
+}
+
+// TODO: move this out to share it with pod_reconciler.go.
+func podIsReady(pod *corev1.Pod) bool {
+	for _, condition := range pod.Status.Conditions {
+		if condition.Type == corev1.PodReady {
+			if condition.Status == corev1.ConditionTrue {
+				return true
+			}
+			break
+		}
+	}
+	return false
+}
diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go
new file mode 100644
index 00000000..f60a4cc9
--- /dev/null
+++ b/pkg/epp/datastore/datastore_test.go
@@ -0,0 +1,447 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package datastore + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" +) + +func TestPool(t *testing.T) { + pool1Selector := map[string]string{"app": "vllm_v1"} + pool1 := testutil.MakeInferencePool("pool1"). + Namespace("default"). + Selector(pool1Selector).ObjRef() + tests := []struct { + name string + inferencePool *v1alpha2.InferencePool + labels map[string]string + wantSynced bool + wantPool *v1alpha2.InferencePool + wantErr error + wantLabelsMatch bool + }{ + { + name: "Ready when InferencePool exists in data store", + inferencePool: pool1, + labels: pool1Selector, + wantSynced: true, + wantPool: pool1, + wantLabelsMatch: true, + }, + { + name: "Labels not matched", + inferencePool: pool1, + labels: map[string]string{"app": "vllm_v2"}, + wantSynced: true, + wantPool: pool1, + wantLabelsMatch: false, + }, + { + name: "Not ready when InferencePool is nil in data store", + wantErr: errPoolNotSynced, + wantSynced: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + datastore := NewDatastore(context.Background(), pmf) + datastore.PoolSet(tt.inferencePool) + gotPool, gotErr := datastore.PoolGet() + if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { + t.Errorf("Unexpected error diff (+got/-want): %s", diff) + } + if diff := cmp.Diff(tt.wantPool, gotPool); diff != "" { + t.Errorf("Unexpected pool diff (+got/-want): %s", diff) + } + gotSynced := datastore.PoolHasSynced() + if diff := cmp.Diff(tt.wantSynced, gotSynced); diff != "" { + t.Errorf("Unexpected synced diff (+got/-want): %s", diff) + } + if tt.labels != nil { + gotLabelsMatch := datastore.PoolLabelsMatch(tt.labels) + if diff := cmp.Diff(tt.wantLabelsMatch, gotLabelsMatch); diff != "" { + t.Errorf("Unexpected labels match diff (+got/-want): %s", diff) + } + } + }) + } +} + +func TestModel(t *testing.T) { + chatModel := "chat" + tsModel := "tweet-summary" + model1ts := testutil.MakeInferenceModel("model1"). + CreationTimestamp(metav1.Unix(1000, 0)). + ModelName(tsModel).ObjRef() + // Same model name as model1ts, different object name. + model2ts := testutil.MakeInferenceModel("model2"). + CreationTimestamp(metav1.Unix(1001, 0)). + ModelName(tsModel).ObjRef() + // Same model name as model1ts, newer timestamp + model1tsNewer := testutil.MakeInferenceModel("model1"). + CreationTimestamp(metav1.Unix(1002, 0)). + Criticality(v1alpha2.Critical). + ModelName(tsModel).ObjRef() + model2tsNewer := testutil.MakeInferenceModel("model2"). + CreationTimestamp(metav1.Unix(1003, 0)). + ModelName(tsModel).ObjRef() + // Same object name as model2ts, different model name. + model2chat := testutil.MakeInferenceModel(model2ts.Name). + CreationTimestamp(metav1.Unix(1005, 0)). 
+ ModelName(chatModel).ObjRef() + + tests := []struct { + name string + existingModels []*v1alpha2.InferenceModel + op func(ds Datastore) bool + wantOpResult bool + wantModels []*v1alpha2.InferenceModel + }{ + { + name: "Add model1 with tweet-summary as modelName", + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantModels: []*v1alpha2.InferenceModel{model1ts}, + wantOpResult: true, + }, + { + name: "Set model1 with the same modelName, but with diff criticality and newer creation timestamp, should update.", + existingModels: []*v1alpha2.InferenceModel{model1ts}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1tsNewer) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model1tsNewer}, + }, + { + name: "set model2 with the same modelName, but newer creation timestamp, should not update.", + existingModels: []*v1alpha2.InferenceModel{model1tsNewer}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model2tsNewer) + }, + wantOpResult: false, + wantModels: []*v1alpha2.InferenceModel{model1tsNewer}, + }, + { + name: "Set model2 with the same modelName, but older creation timestamp, should update", + existingModels: []*v1alpha2.InferenceModel{model1tsNewer}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model2ts) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2ts}, + }, + { + name: "Set model1 with the tweet-summary modelName, both models should exist", + existingModels: []*v1alpha2.InferenceModel{model2chat}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Set model1 with the tweet-summary modelName, both models should exist", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Getting by model name, chat -> model2", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + gotChat := ds.ModelGet(chatModel) + return gotChat != nil && cmp.Diff(model2chat, gotChat) == "" + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Delete the model", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + existing := ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) + got := ds.ModelGet(tsModel) + return existing != nil && got == nil + + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := NewDatastore(t.Context(), pmf) + for _, m := range test.existingModels { + ds.ModelSetIfOlder(m) + } + + gotOpResult := test.op(ds) + if gotOpResult != test.wantOpResult { + t.Errorf("Unexpected operation result, want: %v, got: %v", test.wantOpResult, gotOpResult) + } + + if diff := testutil.DiffModelLists(test.wantModels, ds.ModelGetAll()); diff != "" { + t.Errorf("Unexpected models diff: %s", diff) + } + + }) + } +} + +func TestRandomWeightedDraw(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { + name string + model *v1alpha2.InferenceModel + want string + }{ + { + name: 
"'random' distribution", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + { + Name: "canary", + Weight: pointer(50), + }, + { + Name: "v1", + Weight: pointer(50), + }, + }, + }, + }, + want: "canary", + }, + { + name: "'random' distribution", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + { + Name: "canary", + Weight: pointer(25), + }, + { + Name: "v1.1", + Weight: pointer(55), + }, + { + Name: "v1", + Weight: pointer(50), + }, + }, + }, + }, + want: "v1", + }, + { + name: "'random' distribution", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + { + Name: "canary", + Weight: pointer(20), + }, + { + Name: "v1.1", + Weight: pointer(20), + }, + { + Name: "v1", + Weight: pointer(10), + }, + }, + }, + }, + want: "v1.1", + }, + { + name: "weighted distribution with weight unset", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + { + Name: "canary", + }, + { + Name: "v1.1", + }, + { + Name: "v1", + }, + }, + }, + }, + want: "canary", + }, + } + var seedVal int64 = 420 + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + for range 10000 { + model := RandomWeightedDraw(logger, test.model, seedVal) + if model != test.want { + t.Errorf("Model returned: %v != %v", model, test.want) + break + } + } + }) + } +} + +func pointer(v int32) *int32 { + return &v +} + +var ( + pod1 = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + } + pod1Metrics = &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + } + pod2 = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod2", + }, + } + pod2Metrics = &backendmetrics.Metrics{ + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo1": 1, + "bar1": 1, + }, + } + pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} + pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace} + inferencePool = &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: 8000, + }, + } +) + +func TestMetrics(t *testing.T) { + tests := []struct { + name string + pmc backendmetrics.PodMetricsClient + storePods []*corev1.Pod + want []*backendmetrics.Metrics + }{ + { + name: "Probing metrics success", + pmc: &backendmetrics.FakePodMetricsClient{ + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + pod2NamespacedName: pod2Metrics, + }, + }, + storePods: []*corev1.Pod{pod1, pod2}, + want: []*backendmetrics.Metrics{pod1Metrics, pod2Metrics}, + }, + { + name: "Only pods in are probed", + pmc: &backendmetrics.FakePodMetricsClient{ + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + pod2NamespacedName: pod2Metrics, + }, + }, + storePods: []*corev1.Pod{pod1}, + want: []*backendmetrics.Metrics{pod1Metrics}, + }, + { + name: "Probing metrics error", + pmc: &backendmetrics.FakePodMetricsClient{ + Err: map[types.NamespacedName]error{ + pod2NamespacedName: errors.New("injected error"), + }, + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + }, + }, + storePods: []*corev1.Pod{pod1, pod2}, + want: []*backendmetrics.Metrics{ + pod1Metrics, + // 
Failed to fetch pod2 metrics so it remains the default values. + { + ActiveModels: map[string]int{}, + WaitingQueueSize: 0, + KVCacheUsagePercent: 0, + MaxActiveModels: 0, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + pmf := backendmetrics.NewPodMetricsFactory(test.pmc, time.Millisecond) + ds := NewDatastore(ctx, pmf) + ds.PoolSet(inferencePool) + for _, pod := range test.storePods { + ds.PodUpdateOrAddIfNotExist(pod, inferencePool) + } + assert.EventuallyWithT(t, func(t *assert.CollectT) { + got := ds.PodGetAll() + metrics := []*backendmetrics.Metrics{} + for _, one := range got { + metrics = append(metrics, one.GetMetrics()) + } + diff := cmp.Diff(test.want, metrics, cmpopts.IgnoreFields(backendmetrics.Metrics{}, "UpdateTime"), cmpopts.SortSlices(func(a, b *backendmetrics.Metrics) bool { + return a.String() < b.String() + })) + assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") + }, 5*time.Second, time.Millisecond) + }) + } +} diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go new file mode 100644 index 00000000..12afe4d7 --- /dev/null +++ b/pkg/epp/handlers/request.go @@ -0,0 +1,210 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package handlers + +import ( + "context" + "encoding/json" + "fmt" + "strconv" + + configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "google.golang.org/protobuf/types/known/structpb" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// HandleRequestBody handles body of the request to the backend server, such as parsing the "model" +// parameter. +// Envoy sends the request body to ext proc before sending the request to the backend server. +func (s *Server) HandleRequestBody( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) (*extProcPb.ProcessingResponse, error) { + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Handling request body") + + // Unmarshal request body (must be JSON). + v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) + var rb map[string]interface{} + if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + return nil, errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("error unmarshaling request body: %v", err)} + } + loggerVerbose.Info("Request body unmarshalled", "body", rb) + + // Resolve target models. 
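+	// For illustration, with hypothetical names: a client sends {"model": "tweet-summary", ...};
+	// if the matching InferenceModel defines weighted targetModels (say "tweet-summary-lora-v1"
+	// with weight 90 and "tweet-summary-lora-v2" with weight 10), the weighted draw below
+	// rewrites the body's "model" field to the drawn name before scheduling.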
+ model, ok := rb["model"].(string) + if !ok { + return nil, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} + } + loggerVerbose.Info("Model requested", "model", model) + modelName := model + + // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. + // This might be a security risk in the future where adapters not registered in the InferenceModel + // are able to be requested by using their distinct name. + modelObj := s.datastore.ModelGet(model) + if modelObj == nil { + return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} + } + if len(modelObj.Spec.TargetModels) > 0 { + modelName = datastore.RandomWeightedDraw(logger, modelObj, 0) + if modelName == "" { + return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} + } + } + llmReq := &scheduling.LLMRequest{ + Model: model, + ResolvedTargetModel: modelName, + Critical: datastore.IsCritical(modelObj), + } + loggerVerbose.Info("LLM request assembled", "request", llmReq) + + requestBody := v.RequestBody.Body + var err error + // Update target models in the body. + if llmReq.Model != llmReq.ResolvedTargetModel { + rb["model"] = llmReq.ResolvedTargetModel + requestBody, err = json.Marshal(rb) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") + return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} + } + loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) + } + + target, err := s.scheduler.Schedule(ctx, llmReq) + if err != nil { + return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} + } + targetPod := target.GetPod() + + logger.V(logutil.DEFAULT).Info("Request handled", + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) + + // Insert target endpoint to instruct Envoy to route requests to the specified target pod. 
+ // Attach the port number + pool, err := s.datastore.PoolGet() + if err != nil { + return nil, err + } + endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + + reqCtx.Model = llmReq.Model + reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel + reqCtx.RequestSize = len(v.RequestBody.Body) + reqCtx.TargetPod = targetPod.NamespacedName.String() + reqCtx.TargetEndpoint = endpoint + + headers := []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: s.destinationEndpointHintKey, + RawValue: []byte(endpoint), + }, + }, + // We need to update the content length header if the body is mutated, see Envoy doc: + // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(len(requestBody))), + }, + }, + } + // Print headers for debugging + for _, header := range headers { + logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) + } + + targetEndpointValue := &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + } + dynamicMetadata := targetEndpointValue + if s.destinationEndpointHintMetadataNamespace != "" { + // If a namespace is defined, wrap the selected endpoint with that. + dynamicMetadata = &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: targetEndpointValue, + }, + }, + }, + } + } + + resp := &extProcPb.ProcessingResponse{ + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: headers, + }, + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_Body{ + Body: requestBody, + }, + }, + }, + }, + }, + DynamicMetadata: dynamicMetadata, + } + return resp, nil +} + +func HandleRequestHeaders( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) *extProcPb.ProcessingResponse { + r := req.Request + h := r.(*extProcPb.ProcessingRequest_RequestHeaders) + log.FromContext(ctx).V(logutil.VERBOSE).Info("Handling request headers", "headers", h) + + resp := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + // Set `clear_route_cache = true` to force Envoy to recompute the target cluster + // based on the new "target-pod" header. + // See https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto#service-ext-proc-v3-commonresponse. + ClearRouteCache: true, + }, + }, + }, + } + + return resp +} diff --git a/pkg/ext-proc/handlers/response.go b/pkg/epp/handlers/response.go similarity index 59% rename from pkg/ext-proc/handlers/response.go rename to pkg/epp/handlers/response.go index 3b8a9946..f9396acf 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/epp/handlers/response.go @@ -1,20 +1,80 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 package handlers
 
 import (
+	"context"
 	"encoding/json"
 	"fmt"
 
 	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-	logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
-	klog "k8s.io/klog/v2"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
 // HandleResponseHeaders processes response headers from the backend model server.
-func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
-	klog.V(logutil.VERBOSE).Info("Processing ResponseHeaders")
+func (s *Server) HandleResponseHeaders(
+	ctx context.Context,
+	reqCtx *RequestContext,
+	req *extProcPb.ProcessingRequest,
+) (*extProcPb.ProcessingResponse, error) {
+	loggerVerbose := log.FromContext(ctx).V(logutil.VERBOSE)
+	loggerVerbose.Info("Processing ResponseHeaders")
 	h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders)
-	klog.V(logutil.VERBOSE).Infof("Headers before: %+v\n", h)
+	loggerVerbose.Info("Headers before", "headers", h)
+
+	// Example header
+	// {
+	//	"ResponseHeaders": {
+	//	  "headers": [
+	//	    {
+	//	      "key": ":status",
+	//	      "raw_value": "200"
+	//	    },
+	//	    {
+	//	      "key": "date",
+	//	      "raw_value": "Thu, 30 Jan 2025 18:50:48 GMT"
+	//	    },
+	//	    {
+	//	      "key": "server",
+	//	      "raw_value": "uvicorn"
+	//	    },
+	//	    {
+	//	      "key": "content-type",
+	//	      "raw_value": "text/event-stream; charset=utf-8"
+	//	    },
+	//	    {
+	//	      "key": "transfer-encoding",
+	//	      "raw_value": "chunked"
+	//	    }
+	//	  ]
+	//	}
+	// }
+	for _, header := range h.ResponseHeaders.Headers.GetHeaders() {
+		if header.Key == ":status" {
+			code := string(header.RawValue)
+			if code != "200" {
+				reqCtx.ResponseStatusCode = errutil.ModelServerError
+			}
+			break
+		}
+	}
 
 	resp := &extProcPb.ProcessingResponse{
 		Response: &extProcPb.ProcessingResponse_ResponseHeaders{
@@ -65,13 +125,19 @@ func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.Pr
 		"completion_tokens": 100
 	}
 	}*/
-func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
-	klog.V(logutil.VERBOSE).Info("Processing HandleResponseBody")
+func (s *Server) HandleResponseBody(
+	ctx context.Context,
+	reqCtx *RequestContext,
+	req *extProcPb.ProcessingRequest,
+) (*extProcPb.ProcessingResponse, error) {
+	logger := log.FromContext(ctx)
+	loggerVerbose := logger.V(logutil.VERBOSE)
+	loggerVerbose.Info("Processing HandleResponseBody")
 
 	body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody)
 
 	res := Response{}
 	if err := json.Unmarshal(body.ResponseBody.Body, &res); err != nil {
-		return nil, fmt.Errorf("unmarshaling response body: %v", err)
+		return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("unmarshaling response
body: %v", err)} } reqCtx.Response = res reqCtx.ResponseSize = len(body.ResponseBody.Body) @@ -81,7 +147,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) // will add the processing for streaming case. reqCtx.ResponseComplete = true - klog.V(logutil.VERBOSE).Infof("Response: %+v", res) + loggerVerbose.Info("Response generated", "response", res) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseBody{ diff --git a/pkg/ext-proc/handlers/response_test.go b/pkg/epp/handlers/response_test.go similarity index 71% rename from pkg/ext-proc/handlers/response_test.go rename to pkg/epp/handlers/response_test.go index df338066..01f02d09 100644 --- a/pkg/ext-proc/handlers/response_test.go +++ b/pkg/epp/handlers/response_test.go @@ -1,10 +1,28 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package handlers import ( + "context" "testing" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/google/go-cmp/cmp" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( @@ -34,6 +52,8 @@ const ( ) func TestHandleResponseBody(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + tests := []struct { name string req *extProcPb.ProcessingRequest_ResponseBody @@ -70,8 +90,7 @@ func TestHandleResponseBody(t *testing.T) { t.Run(test.name, func(t *testing.T) { server := &Server{} reqCtx := &RequestContext{} - _, err := server.HandleResponseBody(reqCtx, &extProcPb.ProcessingRequest{Request: test.req}) - + _, err := server.HandleResponseBody(ctx, reqCtx, &extProcPb.ProcessingRequest{Request: test.req}) if err != nil { if !test.wantErr { t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr) diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go new file mode 100644 index 00000000..be882fc7 --- /dev/null +++ b/pkg/epp/handlers/server.go @@ -0,0 +1,217 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package handlers
+
+import (
+	"context"
+	"io"
+	"time"
+
+	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server {
+	return &Server{
+		scheduler:                                scheduler,
+		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
+		destinationEndpointHintKey:               destinationEndpointHintKey,
+		datastore:                                datastore,
+	}
+}
+
+// Server implements the Envoy external processing server.
+// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
+type Server struct {
+	scheduler Scheduler
+	// The key of the header to specify the target pod address. This value needs to match Envoy
+	// configuration.
+	destinationEndpointHintKey string
+	// The key acting as the outer namespace struct in the metadata extproc response to communicate
+	// back the picked endpoints.
+	destinationEndpointHintMetadataNamespace string
+	datastore                                datastore.Datastore
+}
+
+type Scheduler interface {
+	Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backendmetrics.PodMetrics, err error)
+}
+
+func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
+	ctx := srv.Context()
+	logger := log.FromContext(ctx)
+	loggerVerbose := logger.V(logutil.VERBOSE)
+	loggerVerbose.Info("Processing")
+
+	// Create a request context to share state during the lifetime of an HTTP request.
+	// See https://github.com/envoyproxy/envoy/issues/17540.
+	reqCtx := &RequestContext{}
+
+	// Create a variable for error handling, as each request should report the error
+	// metric at most once. This doesn't cover the "Cannot receive stream request" error
+	// because such an error can happen even after the response has been processed.
+	var err error
+	defer func(error) {
+		if reqCtx.ResponseStatusCode != "" {
+			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode)
+		} else if err != nil {
+			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err))
+		}
+	}(err)
+
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		req, recvErr := srv.Recv()
+		if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled {
+			return nil
+		}
+		if recvErr != nil {
+			// This error occurs very frequently, though it doesn't seem to have any impact.
+			// TODO Figure out if we can remove this noise.
+			loggerVerbose.Error(recvErr, "Cannot receive stream request")
+			return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr)
+		}
+
+		var resp *extProcPb.ProcessingResponse
+		switch v := req.Request.(type) {
+		case *extProcPb.ProcessingRequest_RequestHeaders:
+			reqCtx.RequestReceivedTimestamp = time.Now()
+			resp = HandleRequestHeaders(ctx, reqCtx, req)
+			loggerVerbose.Info("Request context after HandleRequestHeaders", "context", reqCtx)
+		case *extProcPb.ProcessingRequest_RequestBody:
+			resp, err = s.HandleRequestBody(ctx, reqCtx, req)
+			if err == nil {
+				metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel)
+				metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize)
+			}
+			loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx)
+		case *extProcPb.ProcessingRequest_ResponseHeaders:
+			resp, err = s.HandleResponseHeaders(ctx, reqCtx, req)
+			loggerVerbose.Info("Request context after HandleResponseHeaders", "context", reqCtx)
+		case *extProcPb.ProcessingRequest_ResponseBody:
+			resp, err = s.HandleResponseBody(ctx, reqCtx, req)
+			if err == nil && reqCtx.ResponseComplete {
+				reqCtx.ResponseCompleteTimestamp = time.Now()
+				metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens)
+				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens)
+			}
+			loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx)
+		default:
+			logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v)
+			return status.Error(codes.Unknown, "unknown request type")
+		}
+
+		if err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req)
+			resp, err = BuildErrResponse(err)
+			if err != nil {
+				return err
+			}
+		}
+
+		loggerVerbose.Info("Response generated", "response", resp)
+		if err := srv.Send(resp); err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "Send failed")
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+	}
+}
+
+func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
+	var resp *extProcPb.ProcessingResponse
+
+	switch errutil.CanonicalCode(err) {
+	// This code can be returned by the scheduler when there is no capacity for sheddable
+	// requests.
+	case errutil.InferencePoolResourceExhausted:
+		resp = &extProcPb.ProcessingResponse{
+			Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+				ImmediateResponse: &extProcPb.ImmediateResponse{
+					Status: &envoyTypePb.HttpStatus{
+						Code: envoyTypePb.StatusCode_TooManyRequests,
+					},
+				},
+			},
+		}
+	// This code can be returned when the EPP processes the request and runs into server-side errors.
+	case errutil.Internal:
+		resp = &extProcPb.ProcessingResponse{
+			Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+				ImmediateResponse: &extProcPb.ImmediateResponse{
+					Status: &envoyTypePb.HttpStatus{
+						Code: envoyTypePb.StatusCode_InternalServerError,
+					},
+				},
+			},
+		}
+	// This code can be returned when users provide an invalid JSON request.
+	case errutil.BadRequest:
+		resp = &extProcPb.ProcessingResponse{
+			Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+				ImmediateResponse: &extProcPb.ImmediateResponse{
+					Status: &envoyTypePb.HttpStatus{
+						Code: envoyTypePb.StatusCode_BadRequest,
+					},
+				},
+			},
+		}
+	case errutil.BadConfiguration:
+		resp = &extProcPb.ProcessingResponse{
+			Response: &extProcPb.ProcessingResponse_ImmediateResponse{
+				ImmediateResponse: &extProcPb.ImmediateResponse{
+					Status: &envoyTypePb.HttpStatus{
+						Code: envoyTypePb.StatusCode_NotFound,
+					},
+				},
+			},
+		}
+	default:
+		return nil, status.Errorf(status.Code(err), "failed to handle request: %v", err)
+	}
+	return resp, nil
+}
+
+// RequestContext stores context information during the lifetime of an HTTP request.
+type RequestContext struct {
+	TargetPod                 string
+	TargetEndpoint            string
+	Model                     string
+	ResolvedTargetModel       string
+	RequestReceivedTimestamp  time.Time
+	ResponseCompleteTimestamp time.Time
+	RequestSize               int
+	Response                  Response
+	ResponseSize              int
+	ResponseComplete          bool
+	ResponseStatusCode        string
+}
diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go
new file mode 100644
index 00000000..2aaca7f3
--- /dev/null
+++ b/pkg/epp/handlers/streamingserver.go
@@ -0,0 +1,538 @@
+package handlers
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+	"time"
+
+	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
+	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	"github.com/go-logr/logr"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/structpb"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer {
+	return &StreamingServer{
+		scheduler:                                scheduler,
+		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
+		destinationEndpointHintKey:               destinationEndpointHintKey,
+		datastore:                                datastore,
+	}
+}
+
+type StreamingServer struct {
+	scheduler Scheduler
+	// The key of the header to specify the target pod address. This value needs to match Envoy
+	// configuration.
+	destinationEndpointHintKey string
+	// The key acting as the outer namespace struct in the metadata extproc response to communicate
+	// back the picked endpoints.
+	destinationEndpointHintMetadataNamespace string
+	datastore                                datastore.Datastore
+}
+
+func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
+	ctx := srv.Context()
+	logger := log.FromContext(ctx)
+	loggerVerbose := logger.V(logutil.VERBOSE)
+	loggerVerbose.Info("Processing")
+
+	// Create a request context to share state during the lifetime of an HTTP request.
+	// See https://github.com/envoyproxy/envoy/issues/17540.
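+	// Note: the streaming server assumes Envoy's ext_proc filter is configured in
+	// FULL_DUPLEX_STREAMED body mode; updateStateAndSendIfNeeded below enforces the
+	// Header->Body->Trailer response ordering that this mode requires.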
+	reqCtx := &StreamingRequestContext{
+		RequestState: RequestReceived,
+	}
+
+	reader, writer := io.Pipe()
+	decoder := json.NewDecoder(reader)
+
+	var requestBody, responseBody map[string]interface{}
+	// Create an error handling var, as each request should report the error metric at
+	// most once. This doesn't cover the "Cannot receive stream request" error because
+	// such errors might happen even though the response has been processed.
+	var err error
+	defer func(error) {
+		if reqCtx.ResponseStatusCode != "" {
+			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode)
+		} else if err != nil {
+			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err))
+		}
+	}(err)
+
+	for {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
+		req, recvErr := srv.Recv()
+		if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled {
+			return nil
+		}
+		if recvErr != nil {
+			// This error occurs very frequently, though it doesn't seem to have any impact.
+			// TODO Figure out if we can remove this noise.
+			loggerVerbose.Error(recvErr, "Cannot receive stream request")
+			return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr)
+		}
+
+		switch v := req.Request.(type) {
+		case *extProcPb.ProcessingRequest_RequestHeaders:
+			// Do nothing. Header info is handled in the HandleRequestBody func.
+		case *extProcPb.ProcessingRequest_RequestBody:
+			loggerVerbose.Info("Incoming body chunk", "body", string(v.RequestBody.Body), "EoS", v.RequestBody.EndOfStream)
+			// In the stream case, we can receive multiple request bodies.
+			// To buffer the full message, we create a goroutine with a writer.Write()
+			// call, which will block until the corresponding reader reads from it.
+			// We do not read until we receive the EndOfStream signal, and then
+			// decode the entire JSON body.
+			go func() {
+				_, err := writer.Write(v.RequestBody.Body)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error populating writer")
+				}
+			}()
+
+			// Message is buffered, we can read and decode.
+			if v.RequestBody.EndOfStream {
+				loggerVerbose.Info("Decoding the request body")
+				err = decoder.Decode(&requestBody)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body")
+				}
+				// Body stream complete. Close the reader pipe, and start anew for the response.
+				reader.Close()
+				reader, writer = io.Pipe()
+				decoder = json.NewDecoder(reader)
+
+				reqCtx, err = s.HandleRequestBody(ctx, reqCtx, req, requestBody)
+				if err != nil {
+					logger.V(logutil.DEFAULT).Error(err, "Error handling body")
+				} else {
+					metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel)
+					metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize)
+				}
+				loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx)
+			}
+		case *extProcPb.ProcessingRequest_RequestTrailers:
+			// This is currently unused.
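+		// The response-header pass below inspects the backend status and content-type to
+		// detect error responses and whether the model server is streaming (SSE) before
+		// any response body arrives.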
+		case *extProcPb.ProcessingRequest_ResponseHeaders:
+			loggerVerbose.Info("Got response headers", "headers", v.ResponseHeaders.Headers.GetHeaders())
+			for _, header := range v.ResponseHeaders.Headers.GetHeaders() {
+				value := string(header.RawValue)
+				loggerVerbose.Info("Response header", "key", header.Key, "value", value)
+				if header.Key == ":status" && value != "200" {
+					reqCtx.ResponseStatusCode = errutil.ModelServerError
+				} else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") {
+					reqCtx.modelServerStreaming = true
+					loggerVerbose.Info("Model server is streaming response")
+				}
+			}
+			reqCtx.RequestState = ResponseReceived
+			reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{
+				Response: &extProcPb.ProcessingResponse_ResponseHeaders{
+					ResponseHeaders: &extProcPb.HeadersResponse{
+						Response: &extProcPb.CommonResponse{
+							HeaderMutation: &extProcPb.HeaderMutation{
+								SetHeaders: []*configPb.HeaderValueOption{
+									{
+										Header: &configPb.HeaderValue{
+											// This is for debugging purposes only.
+											Key:      "x-went-into-resp-headers",
+											RawValue: []byte("true"),
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+
+		case *extProcPb.ProcessingRequest_ResponseBody:
+			if reqCtx.modelServerStreaming {
+				// Currently we punt on response parsing if the model server is streaming, and just pass the body through.
+				reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
+					Response: &extProcPb.ProcessingResponse_ResponseBody{
+						ResponseBody: &extProcPb.BodyResponse{
+							Response: &extProcPb.CommonResponse{
+								BodyMutation: &extProcPb.BodyMutation{
+									Mutation: &extProcPb.BodyMutation_StreamedResponse{
+										StreamedResponse: &extProcPb.StreamedBodyResponse{
+											Body:        v.ResponseBody.Body,
+											EndOfStream: v.ResponseBody.EndOfStream,
+										},
+									},
+								},
+							},
+						},
+					},
+				}
+			} else {
+				go func() {
+					_, err := writer.Write(v.ResponseBody.Body)
+					if err != nil {
+						logger.V(logutil.DEFAULT).Error(err, "Error populating writer")
+					}
+				}()
+
+				// Message is buffered, we can read and decode.
+				if v.ResponseBody.EndOfStream {
+					err = decoder.Decode(&responseBody)
+					if err != nil {
+						logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling response body")
+					}
+					// Body stream complete. Close the reader pipe.
+					reader.Close()
+
+					reqCtx, err = s.HandleResponseBody(ctx, reqCtx, responseBody)
+					if err == nil && reqCtx.ResponseComplete {
+						reqCtx.ResponseCompleteTimestamp = time.Now()
+						metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+						metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+						metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
+						metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+					}
+					loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx)
+				}
+			}
+		case *extProcPb.ProcessingRequest_ResponseTrailers:
+			// This is currently unused.
+		}
+
+		// Handle the err and fire an immediate response.
+		if err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req)
+			resp, err := BuildErrResponse(err)
+			if err != nil {
+				return err
+			}
+			if err := srv.Send(resp); err != nil {
+				logger.V(logutil.DEFAULT).Error(err, "Send failed")
+				return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+			}
+			return nil
+		}
+		loggerVerbose.Info("Checking request state", "state", reqCtx.RequestState)
+		if err := reqCtx.updateStateAndSendIfNeeded(srv, loggerVerbose); err != nil {
+			return err
+		}
+	}
+}
+
+// updateStateAndSendIfNeeded checks the state and can send multiple responses in a single pass, but only if ordered properly.
+// The order of responses matters in FULL_DUPLEX_STREAMING. For both the request and the response, the order of responses sent back MUST be: Header->Body->Trailer, with the trailer being optional.
+func (r *StreamingRequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, loggerVerbose logr.Logger) error {
+	// No switch statement as we could send multiple responses in one pass.
+	if r.RequestState == RequestReceived && r.reqHeaderResp != nil {
+		loggerVerbose.Info("Request header response", "obj", r.reqHeaderResp)
+		if err := srv.Send(r.reqHeaderResp); err != nil {
+			loggerVerbose.Error(err, "Error sending response")
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = HeaderRequestResponseComplete
+	}
+	if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil {
+		loggerVerbose.Info("Request body response", "obj", r.reqBodyResp)
+		if err := srv.Send(r.reqBodyResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = BodyRequestResponsesComplete
+		// Dump the response so a new stream message can begin
+		r.reqBodyResp = nil
+	}
+	if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil {
+		// Trailers in requests are not guaranteed
+		if err := srv.Send(r.reqTrailerResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+	}
+	if r.RequestState == ResponseReceived && r.respHeaderResp != nil {
+		loggerVerbose.Info("Response header response", "obj", r.respHeaderResp)
+		if err := srv.Send(r.respHeaderResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = HeaderResponseResponseComplete
+	}
+	if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil {
+		loggerVerbose.Info("Response body response", "obj", r.respBodyResp)
+		if err := srv.Send(r.respBodyResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+
+		body := r.respBodyResp.Response.(*extProcPb.ProcessingResponse_ResponseBody)
+		if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() {
+			r.RequestState = BodyResponseResponsesComplete
+		}
+		// Dump the response so a new stream message can begin
+		r.respBodyResp = nil
+	}
+	if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil {
+		// Trailers in responses are not guaranteed
+		if err := srv.Send(r.respTrailerResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+	}
+	return nil
+}
+
+type StreamingRequestContext struct {
+	TargetPod                 string
+	TargetEndpoint            string
+	Model                     string
+	ResolvedTargetModel       string
+	RequestState              StreamRequestState
+	RequestReceivedTimestamp  time.Time
+	ResponseCompleteTimestamp time.Time
+	RequestSize               int
+	Usage                     Usage
+	ResponseSize              int
+	ResponseComplete          bool
+	ResponseStatusCode        string
+
+	modelServerStreaming bool
+
+	reqHeaderResp  *extProcPb.ProcessingResponse
+	reqBodyResp    *extProcPb.ProcessingResponse
+	reqTrailerResp *extProcPb.ProcessingResponse
+
+	respHeaderResp  *extProcPb.ProcessingResponse
+	respBodyResp    *extProcPb.ProcessingResponse
+	respTrailerResp *extProcPb.ProcessingResponse
+}
+
+type StreamRequestState int
+
+const (
+	RequestReceived                  StreamRequestState = 0
+	HeaderRequestResponseComplete    StreamRequestState = 1
+	BodyRequestResponsesComplete     StreamRequestState = 2
+	TrailerRequestResponsesComplete  StreamRequestState = 3
+	ResponseReceived                 StreamRequestState = 4
+	HeaderResponseResponseComplete   StreamRequestState = 5
+	BodyResponseResponsesComplete    StreamRequestState = 6
+	TrailerResponseResponsesComplete StreamRequestState = 7
+)
+
+// HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling.
+func (s *StreamingServer) HandleRequestBody(
+	ctx context.Context,
+	reqCtx *StreamingRequestContext,
+	req *extProcPb.ProcessingRequest,
+	requestBodyMap map[string]interface{},
+) (*StreamingRequestContext, error) {
+	var requestBodyBytes []byte
+	logger := log.FromContext(ctx)
+	loggerVerbose := logger.V(logutil.VERBOSE)
+	loggerVerbose.Info("Handling request body")
+
+	// Resolve target models.
+	model, ok := requestBodyMap["model"].(string)
+	if !ok {
+		return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"}
+	}
+	loggerVerbose.Info("Model requested", "model", model)
+	modelName := model
+
+	// NOTE: The nil checking for the modelObject means that we DO allow passthrough currently.
+	// This might be a security risk in the future where adapters not registered in the InferenceModel
+	// are able to be requested by using their distinct name.
+	modelObj := s.datastore.ModelGet(model)
+	if modelObj == nil {
+		return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)}
+	}
+	if len(modelObj.Spec.TargetModels) > 0 {
+		modelName = datastore.RandomWeightedDraw(logger, modelObj, 0)
+		if modelName == "" {
+			return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)}
+		}
+	}
+	llmReq := &scheduling.LLMRequest{
+		Model:               model,
+		ResolvedTargetModel: modelName,
+		Critical:            datastore.IsCritical(modelObj),
+	}
+	loggerVerbose.Info("LLM request assembled", "request", llmReq)
+
+	var err error
+	// Update target models in the body.
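+	// Only the "model" field is rewritten here; the rest of the client payload is
+	// preserved as-is when the body is re-marshaled below.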
+ if llmReq.Model != llmReq.ResolvedTargetModel { + requestBodyMap["model"] = llmReq.ResolvedTargetModel + } + + requestBodyBytes, err = json.Marshal(requestBodyMap) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") + return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} + } + loggerVerbose.Info("Updated request body marshalled", "body", string(requestBodyBytes)) + + target, err := s.scheduler.Schedule(ctx, llmReq) + if err != nil { + return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} + } + targetPod := target.GetPod() + + // Insert target endpoint to instruct Envoy to route requests to the specified target pod. + // Attach the port number + pool, err := s.datastore.PoolGet() + if err != nil { + return reqCtx, err + } + endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + + logger.V(logutil.DEFAULT).Info("Request handled", + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) + + reqCtx.Model = llmReq.Model + reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel + reqCtx.RequestSize = len(requestBodyBytes) + reqCtx.TargetPod = targetPod.NamespacedName.String() + reqCtx.TargetEndpoint = endpoint + + headers := []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: s.destinationEndpointHintKey, + RawValue: []byte(endpoint), + }, + }, + // We need to update the content length header if the body is mutated, see Envoy doc: + // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(len(requestBodyBytes))), + }, + }, + } + // Print headers for debugging + for _, header := range headers { + logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) + } + + targetEndpointValue := &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + } + dynamicMetadata := targetEndpointValue + if s.destinationEndpointHintMetadataNamespace != "" { + // If a namespace is defined, wrap the selected endpoint with that. + dynamicMetadata = &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: targetEndpointValue, + }, + }, + }, + } + } + + reqCtx.reqHeaderResp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: headers, + }, + }, + }, + }, + DynamicMetadata: dynamicMetadata, + } + reqCtx.reqBodyResp = &extProcPb.ProcessingResponse{ + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. 
+ Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: requestBodyBytes, + EndOfStream: true, + }, + }, + }, + }, + }, + }, + } + return reqCtx, nil +} + +// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling. +func (s *StreamingServer) HandleResponseBody( + ctx context.Context, + reqCtx *StreamingRequestContext, + response map[string]interface{}, +) (*StreamingRequestContext, error) { + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing HandleResponseBody") + responseBytes, err := json.Marshal(response) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") + return reqCtx, err + } + if response["usage"] != nil { + usg := response["usage"].(map[string]interface{}) + usage := Usage{ + PromptTokens: int(usg["prompt_tokens"].(float64)), + CompletionTokens: int(usg["completion_tokens"].(float64)), + TotalTokens: int(usg["total_tokens"].(float64)), + } + reqCtx.Usage = usage + loggerVerbose.Info("Response generated", "usage", reqCtx.Usage) + } + reqCtx.ResponseSize = len(responseBytes) + // ResponseComplete is to indicate the response is complete. In non-streaming + // case, it will be set to be true once the response is processed; in + // streaming case, it will be set to be true once the last chunk is processed. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) + // will add the processing for streaming case. + reqCtx.ResponseComplete = true + + reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: responseBytes, + EndOfStream: true, + }, + }, + }, + }, + }, + }, + } + return reqCtx, nil +} diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/epp/metrics/metrics.go similarity index 62% rename from pkg/ext-proc/metrics/metrics.go rename to pkg/epp/metrics/metrics.go index 8cb7bd27..e86ca901 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -1,19 +1,39 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package metrics import ( + "context" "sync" "time" compbasemetrics "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( InferenceModelComponent = "inference_model" + InferencePoolComponent = "inference_pool" ) var ( + // Inference Model Metrics requestCounter = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ Subsystem: InferenceModelComponent, @@ -24,13 +44,25 @@ var ( []string{"model_name", "target_model_name"}, ) + requestErrCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: InferenceModelComponent, + Name: "request_error_total", + Help: "Counter of inference model requests errors broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name", "error_code"}, + ) + requestLatencies = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ Subsystem: InferenceModelComponent, Name: "request_duration_seconds", Help: "Inference model response latency distribution in seconds for each model and target model.", - Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, - 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + Buckets: []float64{ + 0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600, + }, StabilityLevel: compbasemetrics.ALPHA, }, []string{"model_name", "target_model_name"}, @@ -88,6 +120,27 @@ var ( }, []string{"model_name", "target_model_name"}, ) + + // Inference Pool Metrics + inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec( + &compbasemetrics.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "average_kv_cache_utilization", + Help: "The average kv cache utilization for an inference server pool.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name"}, + ) + + inferencePoolAvgQueueSize = compbasemetrics.NewGaugeVec( + &compbasemetrics.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "average_queue_size", + Help: "The average number of requests pending in the model server queue.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name"}, + ) ) var registerMetrics sync.Once @@ -96,11 +149,15 @@ var registerMetrics sync.Once func Register() { registerMetrics.Do(func() { legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestErrCounter) legacyregistry.MustRegister(requestLatencies) legacyregistry.MustRegister(requestSizes) legacyregistry.MustRegister(responseSizes) legacyregistry.MustRegister(inputTokens) legacyregistry.MustRegister(outputTokens) + + legacyregistry.MustRegister(inferencePoolAvgKVCache) + legacyregistry.MustRegister(inferencePoolAvgQueueSize) }) } @@ -109,15 +166,23 @@ func RecordRequestCounter(modelName, targetModelName string) { requestCounter.WithLabelValues(modelName, targetModelName).Inc() } +// RecordRequestErrCounter records the number of error requests. +func RecordRequestErrCounter(modelName, targetModelName string, code string) { + if code != "" { + requestErrCounter.WithLabelValues(modelName, targetModelName, code).Inc() + } +} + // RecordRequestSizes records the request sizes. 
func RecordRequestSizes(modelName, targetModelName string, reqSize int) { requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) } -// RecordRequstLatencies records duration of request. -func RecordRequestLatencies(modelName, targetModelName string, received time.Time, complete time.Time) bool { +// RecordRequestLatencies records duration of request. +func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time) bool { if !complete.After(received) { - klog.Errorf("request latency value error for model name %v, target model name %v: complete time %v is before received time %v", modelName, targetModelName, complete, received) + log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Request latency values are invalid", + "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received) return false } elapsedSeconds := complete.Sub(received).Seconds() @@ -143,3 +208,11 @@ func RecordOutputTokens(modelName, targetModelName string, size int) { outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size)) } } + +func RecordInferencePoolAvgKVCache(name string, utilization float64) { + inferencePoolAvgKVCache.WithLabelValues(name).Set(utilization) +} + +func RecordInferencePoolAvgQueueSize(name string, queueSize float64) { + inferencePoolAvgQueueSize.WithLabelValues(name).Set(queueSize) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go similarity index 52% rename from pkg/ext-proc/metrics/metrics_test.go rename to pkg/epp/metrics/metrics_test.go index 57774b11..c2436bab 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -1,20 +1,44 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package metrics import ( + "context" "os" "testing" "time" "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -const RequestTotalMetric = InferenceModelComponent + "_request_total" -const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" -const RequestSizesMetric = InferenceModelComponent + "_request_sizes" -const ResponseSizesMetric = InferenceModelComponent + "_response_sizes" -const InputTokensMetric = InferenceModelComponent + "_input_tokens" -const OutputTokensMetric = InferenceModelComponent + "_output_tokens" +const ( + RequestTotalMetric = InferenceModelComponent + "_request_total" + RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" + RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceModelComponent + "_request_sizes" + ResponseSizesMetric = InferenceModelComponent + "_response_sizes" + InputTokensMetric = InferenceModelComponent + "_input_tokens" + OutputTokensMetric = InferenceModelComponent + "_output_tokens" + KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" + QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" +) func TestRecordRequestCounterandSizes(t *testing.T) { type requests struct { @@ -81,12 +105,72 @@ func TestRecordRequestCounterandSizes(t *testing.T) { if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { t.Error(err) } + }) + } +} + +func TestRecordRequestErrorCounter(t *testing.T) { + type requests struct { + modelName string + targetModelName string + error string + } + scenarios := []struct { + name string + reqs []requests + invalid bool + }{ + { + name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t11", + error: errutil.ModelServerError, + }, + { + modelName: "m20", + targetModelName: "t20", + error: errutil.InferencePoolResourceExhausted, + }, + }, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + RecordRequestErrCounter(req.modelName, req.targetModelName, req.error) + } + wantRequestErrorCounter, err := os.Open("testdata/request_error_total_metric") + defer func() { + if err := wantRequestErrorCounter.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestErrorCounter, RequestErrorTotalMetric); err != nil { + t.Error(err) + } }) } } func TestRecordRequestLatencies(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) timeBaseline := time.Now() type requests struct { modelName string @@ -98,35 +182,36 @@ func TestRecordRequestLatencies(t *testing.T) { name string reqs []requests invalid bool - }{{ - name: "multiple requests", - reqs: []requests{ - { - modelName: "m10", - targetModelName: "t10", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 10), - }, - { - modelName: "m10", - targetModelName: "t10", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 
1600), - }, - { - modelName: "m10", - targetModelName: "t11", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 60), - }, - { - modelName: "m20", - targetModelName: "t20", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 120), + }{ + { + name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 10), + }, + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1600), + }, + { + modelName: "m10", + targetModelName: "t11", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 60), + }, + { + modelName: "m20", + targetModelName: "t20", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 120), + }, }, }, - }, { name: "invalid elapsed time", reqs: []requests{ @@ -135,14 +220,16 @@ func TestRecordRequestLatencies(t *testing.T) { targetModelName: "t10", receivedTime: timeBaseline.Add(time.Millisecond * 10), completeTime: timeBaseline, - }}, + }, + }, invalid: true, - }} + }, + } Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { - success := RecordRequestLatencies(req.modelName, req.targetModelName, req.receivedTime, req.completeTime) + success := RecordRequestLatencies(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime) if success == scenario.invalid { t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid) } @@ -257,3 +344,52 @@ func TestRecordResponseMetrics(t *testing.T) { }) } } + +func TestInferencePoolMetrics(t *testing.T) { + scenarios := []struct { + name string + poolName string + kvCacheAvg float64 + queueSizeAvg float64 + }{ + { + name: "basic test", + poolName: "p1", + kvCacheAvg: 0.3, + queueSizeAvg: 0.4, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + RecordInferencePoolAvgKVCache(scenario.poolName, scenario.kvCacheAvg) + RecordInferencePoolAvgQueueSize(scenario.poolName, scenario.queueSizeAvg) + + wantKVCache, err := os.Open("testdata/kv_cache_avg_metrics") + defer func() { + if err := wantKVCache.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantKVCache, KVCacheAvgUsageMetric); err != nil { + t.Error(err) + } + + wantQueueSize, err := os.Open("testdata/queue_avg_size_metrics") + defer func() { + if err := wantQueueSize.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantQueueSize, QueueAvgSizeMetric); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/input_tokens_metric b/pkg/epp/metrics/testdata/input_tokens_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/input_tokens_metric rename to pkg/epp/metrics/testdata/input_tokens_metric diff --git a/pkg/epp/metrics/testdata/kv_cache_avg_metrics b/pkg/epp/metrics/testdata/kv_cache_avg_metrics new file mode 100644 index 00000000..99d1a93a --- /dev/null +++ b/pkg/epp/metrics/testdata/kv_cache_avg_metrics @@ -0,0 +1,3 @@ +# HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool. 
+# TYPE inference_pool_average_kv_cache_utilization gauge +inference_pool_average_kv_cache_utilization{name="p1"} 0.3 diff --git a/pkg/ext-proc/metrics/testdata/output_tokens_metric b/pkg/epp/metrics/testdata/output_tokens_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/output_tokens_metric rename to pkg/epp/metrics/testdata/output_tokens_metric diff --git a/pkg/epp/metrics/testdata/queue_avg_size_metrics b/pkg/epp/metrics/testdata/queue_avg_size_metrics new file mode 100644 index 00000000..3605740c --- /dev/null +++ b/pkg/epp/metrics/testdata/queue_avg_size_metrics @@ -0,0 +1,3 @@ +# HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue. +# TYPE inference_pool_average_queue_size gauge +inference_pool_average_queue_size{name="p1"} 0.4 diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/epp/metrics/testdata/request_duration_seconds_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_duration_seconds_metric rename to pkg/epp/metrics/testdata/request_duration_seconds_metric diff --git a/pkg/epp/metrics/testdata/request_error_total_metric b/pkg/epp/metrics/testdata/request_error_total_metric new file mode 100644 index 00000000..31036eb6 --- /dev/null +++ b/pkg/epp/metrics/testdata/request_error_total_metric @@ -0,0 +1,5 @@ +# HELP inference_model_request_error_total [ALPHA] Counter of inference model requests errors broken out for each model and target model. +# TYPE inference_model_request_error_total counter +inference_model_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2 +inference_model_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1 +inference_model_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/epp/metrics/testdata/request_sizes_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_sizes_metric rename to pkg/epp/metrics/testdata/request_sizes_metric diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/epp/metrics/testdata/request_total_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_total_metric rename to pkg/epp/metrics/testdata/request_total_metric diff --git a/pkg/ext-proc/metrics/testdata/response_sizes_metric b/pkg/epp/metrics/testdata/response_sizes_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/response_sizes_metric rename to pkg/epp/metrics/testdata/response_sizes_metric diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go new file mode 100644 index 00000000..cee683c5 --- /dev/null +++ b/pkg/epp/scheduling/filter.go @@ -0,0 +1,248 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package scheduling
+
+import (
+	"errors"
+	"math"
+	"math/rand"
+	"time"
+
+	"github.com/go-logr/logr"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+type Filter interface {
+	Name() string
+	Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error)
+}
+
+// filter applies the current filterFunc, and then recursively applies the next filters depending
+// on the success or failure of the current filterFunc.
+// It can be used to construct a flow-chart style algorithm.
+type filter struct {
+	name   string
+	filter filterFunc
+	// nextOnSuccess filter will be applied after successfully applying the current filter.
+	// The filtered results will be passed to the next filter.
+	nextOnSuccess *filter
+	// nextOnFailure filter will be applied if the current filter fails.
+	// The original input will be passed to the next filter.
+	nextOnFailure *filter
+	// nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the
+	// success or failure of the current filter.
+	// NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil.
+	// However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of
+	// nextOnSuccessOrFailure, in the success and failure scenarios, respectively.
+	nextOnSuccessOrFailure *filter
+}
+
+func (f *filter) Name() string {
+	if f == nil {
+		return "nil"
+	}
+	return f.name
+}
+
+func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+	loggerTrace := logger.V(logutil.TRACE)
+	loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods))
+
+	filtered, err := f.filter(logger, req, pods)
+
+	next := f.nextOnSuccessOrFailure
+	if err == nil && len(filtered) > 0 {
+		if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
+		if f.nextOnSuccess != nil {
+			next = f.nextOnSuccess
+		}
+		loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered))
+		// On success, pass the filtered result to the next filter.
+		return next.Filter(logger, req, filtered)
+	} else {
+		if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil {
+			// No succeeding filters to run, return.
+			return filtered, err
+		}
+		if f.nextOnFailure != nil {
+			next = f.nextOnFailure
+		}
+		loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name())
+		// On failure, pass the initial set of pods to the next filter.
+		return next.Filter(logger, req, pods)
+	}
+}
+
+// filterFunc filters a set of input pods to a subset.
+type filterFunc func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error)
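The `nextOnSuccess`/`nextOnFailure` wiring above forms a small decision tree: a successful filter narrows the candidate set, a failed one retries a fallback branch on the original set. A minimal standalone sketch of that control flow, using simplified stand-in types rather than this package's real API:

```go
package main

import (
	"errors"
	"fmt"
)

// pod and node are simplified stand-ins for illustration only.
type pod struct {
	name  string
	queue int
}

type node struct {
	f             func(pods []pod) ([]pod, error)
	nextOnSuccess *node
	nextOnFailure *node
}

func (n *node) run(pods []pod) ([]pod, error) {
	out, err := n.f(pods)
	if err == nil && len(out) > 0 {
		if n.nextOnSuccess == nil {
			return out, nil
		}
		return n.nextOnSuccess.run(out) // pass the narrowed set forward
	}
	if n.nextOnFailure == nil {
		return out, err
	}
	return n.nextOnFailure.run(pods) // retry the fallback on the original set
}

func main() {
	lowQueue := &node{
		f: func(pods []pod) ([]pod, error) {
			kept := []pod{}
			for _, p := range pods {
				if p.queue < 5 {
					kept = append(kept, p)
				}
			}
			if len(kept) == 0 {
				return nil, errors.New("no pods left")
			}
			return kept, nil
		},
		// If nothing is below the threshold, fall back to accepting everything.
		nextOnFailure: &node{f: func(pods []pod) ([]pod, error) { return pods, nil }},
	}
	got, _ := lowQueue.run([]pod{{"a", 2}, {"b", 9}})
	fmt.Println(got) // [{a 2}]
}
```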
+// toFilterFunc is a helper function to convert a per-pod predicate into a filterFunc.
+func toFilterFunc(pp podPredicate) filterFunc {
+	return func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+		filtered := []backendmetrics.PodMetrics{}
+		for _, pod := range pods {
+			pass := pp(req, pod)
+			if pass {
+				filtered = append(filtered, pod)
+			}
+		}
+		if len(filtered) == 0 {
+			return nil, errors.New("no pods left")
+		}
+		return filtered, nil
+	}
+}
+
+// leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range
+// (max-min) by the number of pods, and finds the pods that fall into the first range.
+// For example, with queue sizes {0, 3, 10} across 3 pods, the first range is [0, 0+(10-0)/3] =
+// [0, 3], so the pods with queue sizes 0 and 3 are kept.
+// The intuition is that if there are multiple pods that share similar queue size in the low range,
+// we should consider them all instead of the absolute minimum one. This worked better than picking
+// the least one as it gives more choices for the next filter, which on aggregate gave better
+// results.
+// TODO: Compare this strategy with other strategies such as top K.
+func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+	min := math.MaxInt
+	max := 0
+	filtered := []backendmetrics.PodMetrics{}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().WaitingQueueSize <= min {
+			min = pod.GetMetrics().WaitingQueueSize
+		}
+		if pod.GetMetrics().WaitingQueueSize >= max {
+			max = pod.GetMetrics().WaitingQueueSize
+		}
+	}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) {
+			filtered = append(filtered, pod)
+		}
+	}
+	return filtered, nil
+}
+
+func lowQueueingPodPredicate(_ *LLMRequest, pod backendmetrics.PodMetrics) bool {
+	return pod.GetMetrics().WaitingQueueSize < queueingThresholdLoRA
+}
+
+// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
+// (max-min) by the number of pods, and finds the pods that fall into the first range.
+// For example, with KV cache utilization {0.0, 0.3, 1.0} across 3 pods, the first range is
+// [0.0, (1.0-0.0)/3] = [0.0, 0.33], so the pods at 0.0 and 0.3 are kept.
+// The intuition is that if there are multiple pods that share similar KV cache in the low range, we
+// should consider them all instead of the absolute minimum one. This worked better than picking the
+// least one as it gives more choices for the next filter, which on aggregate gave better results.
+// TODO: Compare this strategy with other strategies such as top K.
+func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+	min := math.MaxFloat64
+	var max float64 = 0
+	filtered := []backendmetrics.PodMetrics{}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().KVCacheUsagePercent <= min {
+			min = pod.GetMetrics().KVCacheUsagePercent
+		}
+		if pod.GetMetrics().KVCacheUsagePercent >= max {
+			max = pod.GetMetrics().KVCacheUsagePercent
+		}
+	}
+
+	for _, pod := range pods {
+		if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) {
+			filtered = append(filtered, pod)
+		}
+	}
+	return filtered, nil
+}
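The band selection used by both functions above reduces to a few lines once the metrics plumbing is stripped away. A self-contained sketch over plain ints (it assumes a non-empty input, as the callers do):

```go
package main

import "fmt"

// leastQueuingBand keeps every value in the first band [min, min+(max-min)/n],
// mirroring leastQueuingFilterFunc above.
func leastQueuingBand(queues []int) []int {
	lo, hi := queues[0], queues[0]
	for _, q := range queues {
		if q < lo {
			lo = q
		}
		if q > hi {
			hi = q
		}
	}
	band := lo + (hi-lo)/len(queues)
	kept := []int{}
	for _, q := range queues {
		if q <= band {
			kept = append(kept, q)
		}
	}
	return kept
}

func main() {
	// Matches the "least queuing" test case below: {0, 3, 10} keeps {0, 3}.
	fmt.Println(leastQueuingBand([]int{0, 3, 10})) // [0 3]
}
```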
+
+// podPredicate is a filter function to check whether a pod is desired.
+type podPredicate func(req *LLMRequest, pod backendmetrics.PodMetrics) bool
+
+// We consider serving an adapter low cost if the adapter is active in the model server, or the
+// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by
+// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to
+// a single pod. This gave good performance in our initial benchmarking results in the scenario
+// where # of lora slots > # of lora adapters.
+func lowLoRACostPredicate(req *LLMRequest, pod backendmetrics.PodMetrics) bool {
+	_, ok := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel]
+	return ok || len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels
+}
+
+// loRASoftAffinityFilter implements a pod selection strategy that prioritizes pods
+// with existing LoRA model affinity while allowing for load balancing through randomization.
+//
+// The function works by:
+// 1. Separating pods into two groups: those with target model affinity and those with available capacity
+// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing
+// 3. Falling back to whatever group has pods if one group is empty
+//
+// Parameters:
+//   - logger: Logger interface for diagnostic output
+//   - req: LLM request containing the resolved target model
+//   - pods: Slice of pod metrics to filter
+//
+// Returns:
+//   - Filtered slice of pod metrics based on affinity and availability
+//   - Error if any issues occur during filtering
+func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+	// Pre-allocate slices with estimated capacity
+	filteredAffinity := make([]backendmetrics.PodMetrics, 0, len(pods))
+	filteredAvailable := make([]backendmetrics.PodMetrics, 0, len(pods))
+
+	// Categorize pods based on affinity and availability
+	for _, pod := range pods {
+		if _, exists := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel]; exists {
+			filteredAffinity = append(filteredAffinity, pod)
+		} else if len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels {
+			filteredAvailable = append(filteredAvailable, pod)
+		}
+	}
+
+	// Seed a local math/rand generator; the choice only balances load, so it does
+	// not need to be cryptographically secure.
+	randSource := rand.NewSource(time.Now().UnixNano())
+	randGen := rand.New(randSource)
+
+	// If both groups have pods, use probability to select which group to return
+	if len(filteredAffinity) > 0 && len(filteredAvailable) > 0 {
+		if randGen.Float64() < loraAffinityThreshold {
+			return filteredAffinity, nil
+		}
+		return filteredAvailable, nil
+	}
+
+	// Return whichever group has pods
+	if len(filteredAffinity) > 0 {
+		return filteredAffinity, nil
+	}
+
+	return filteredAvailable, nil
+}
+
+func criticalRequestPredicate(req *LLMRequest, _ backendmetrics.PodMetrics) bool {
+	return req.Critical
+}
+
+func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate {
+	return func(req *LLMRequest, pod backendmetrics.PodMetrics) bool {
+		return pod.GetMetrics().WaitingQueueSize <= queueThreshold && pod.GetMetrics().KVCacheUsagePercent <= kvCacheThreshold
+	}
+}
diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go
new file mode 100644
index 00000000..62ffe7f2
--- /dev/null
+++ b/pkg/epp/scheduling/filter_test.go
@@ -0,0 +1,540 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "errors" + "testing" + + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + "k8s.io/apimachinery/pkg/types" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func TestFilter(t *testing.T) { + logger := logutil.NewTestLogger() + + tests := []struct { + name string + req *LLMRequest + input []*backendmetrics.FakePodMetrics + output []*backendmetrics.FakePodMetrics + err bool + filter *filter + }{ + { + name: "simple filter without successor, failure", + filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { + return nil, errors.New("filter error") + }}, + err: true, + }, + { + name: "default filter, critical request", + filter: defaultFilter, + req: &LLMRequest{ + Model: "critical", + ResolvedTargetModel: "critical", + Critical: true, + }, + // pod2 will be picked because it has relatively low queue size, with the requested + // model being active, and has low KV cache. + input: []*backendmetrics.FakePodMetrics{ + { + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, + output: []*backendmetrics.FakePodMetrics{ + { + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + }, + }, + { + name: "default filter, sheddable request, accepted", + filter: defaultFilter, + req: &LLMRequest{ + Model: "sheddable", + ResolvedTargetModel: "sheddable", + Critical: false, + }, + // pod1 will be picked because it has capacity for the sheddable request. 
+			input: []*backendmetrics.FakePodMetrics{
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    0,
+						KVCacheUsagePercent: 0.2,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+							"bar": 1,
+						},
+					},
+				},
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    3,
+						KVCacheUsagePercent: 0.1,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo":      1,
+							"critical": 1,
+						},
+					},
+				},
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    10,
+						KVCacheUsagePercent: 0.2,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+						},
+					},
+				},
+			},
+			output: []*backendmetrics.FakePodMetrics{
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    0,
+						KVCacheUsagePercent: 0.2,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+							"bar": 1,
+						},
+					},
+				},
+			},
+		},
+		{
+			name:   "default filter, sheddable request, dropped",
+			filter: defaultFilter,
+			req: &LLMRequest{
+				Model:               "sheddable",
+				ResolvedTargetModel: "sheddable",
+				Critical:            false,
+			},
+			// All pods have higher KV cache than the threshold, so the sheddable request will be
+			// dropped.
+			input: []*backendmetrics.FakePodMetrics{
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    10,
+						KVCacheUsagePercent: 0.9,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+							"bar": 1,
+						},
+					},
+				},
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    3,
+						KVCacheUsagePercent: 0.85,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo":      1,
+							"critical": 1,
+						},
+					},
+				},
+				{
+					Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}},
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize:    10,
+						KVCacheUsagePercent: 0.85,
+						MaxActiveModels:     2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+						},
+					},
+				},
+			},
+			output: []*backendmetrics.FakePodMetrics{},
+			err:    true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got, err := test.filter.Filter(logger, test.req, toInterface(test.input))
+			if test.err != (err != nil) {
+				t.Errorf("Unexpected error, got %v, want %v", err, test.err)
+			}
+
+			if diff := cmp.Diff(test.output, toStruct(got)); diff != "" {
+				t.Errorf("Unexpected output (-want +got): %v", diff)
+			}
+		})
+	}
+}
+
+func TestFilterFunc(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	tests := []struct {
+		name   string
+		f      filterFunc
+		req    *LLMRequest
+		input  []*backendmetrics.FakePodMetrics
+		output []*backendmetrics.FakePodMetrics
+		err    bool
+	}{
+		{
+			name:   "least queuing empty input",
+			f:      leastQueuingFilterFunc,
+			input:  []*backendmetrics.FakePodMetrics{},
+			output: []*backendmetrics.FakePodMetrics{},
+		},
+		{
+			name: "least queuing",
+			f:    leastQueuingFilterFunc,
+			input: []*backendmetrics.FakePodMetrics{
+				{
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize: 0,
+					},
+				},
+				{
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize: 3,
+					},
+				},
+				{
+					Metrics: &backendmetrics.Metrics{
+						WaitingQueueSize: 10,
+					},
+				},
+			},
+			output: []*backendmetrics.FakePodMetrics{
+				{
+					Metrics: &backendmetrics.Metrics{
+
WaitingQueueSize: 0, + }, + }, + { + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + }, + }, + }, + }, + { + name: "least kv cache empty input", + f: leastKVCacheFilterFunc, + input: []*backendmetrics.FakePodMetrics{}, + output: []*backendmetrics.FakePodMetrics{}, + }, + { + name: "least kv cache", + f: leastKVCacheFilterFunc, + input: []*backendmetrics.FakePodMetrics{ + { + Metrics: &backendmetrics.Metrics{ + KVCacheUsagePercent: 0, + }, + }, + { + Metrics: &backendmetrics.Metrics{ + KVCacheUsagePercent: 0.3, + }, + }, + { + Metrics: &backendmetrics.Metrics{ + KVCacheUsagePercent: 1.0, + }, + }, + }, + output: []*backendmetrics.FakePodMetrics{ + { + Metrics: &backendmetrics.Metrics{ + KVCacheUsagePercent: 0, + }, + }, + { + Metrics: &backendmetrics.Metrics{ + KVCacheUsagePercent: 0.3, + }, + }, + }, + }, + { + name: "noQueueAndLessThanKVCacheThresholdPredicate", + f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), + input: []*backendmetrics.FakePodMetrics{ + { + // This pod should be returned. + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0, + }, + }, + { + // Queue is non zero, despite low kv cache, should not return. + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.3, + }, + }, + { + // High kv cache despite zero queue, should not return + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 1.0, + }, + }, + }, + output: []*backendmetrics.FakePodMetrics{ + { + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0, + }, + }, + }, + }, + { + name: "low LoRA cost", + f: toFilterFunc(lowLoRACostPredicate), + req: &LLMRequest{ + Model: "model", + ResolvedTargetModel: "model", + }, + input: []*backendmetrics.FakePodMetrics{ + // ActiveModels include input model, should be returned. + { + Metrics: &backendmetrics.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "model": 1, + }, + }, + }, + // Input model is not active, however the server has room to load another adapter. + { + Metrics: &backendmetrics.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "another-model": 1, + }, + }, + }, + // Input is not active, and the server has reached max active models. 
+				{
+					Metrics: &backendmetrics.Metrics{
+						MaxActiveModels: 2,
+						ActiveModels: map[string]int{
+							"foo": 1,
+							"bar": 1,
+						},
+					},
+				},
+			},
+			output: []*backendmetrics.FakePodMetrics{
+				{
+					Metrics: &backendmetrics.Metrics{
+						MaxActiveModels: 2,
+						ActiveModels: map[string]int{
+							"model": 1,
+						},
+					},
+				},
+				{
+					Metrics: &backendmetrics.Metrics{
+						MaxActiveModels: 2,
+						ActiveModels: map[string]int{
+							"another-model": 1,
+						},
+					},
+				},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got, err := test.f(logger, test.req, toInterface(test.input))
+			if test.err != (err != nil) {
+				t.Errorf("Unexpected error, got %v, want %v", err, test.err)
+			}
+
+			if diff := cmp.Diff(test.output, toStruct(got)); diff != "" {
+				t.Errorf("Unexpected output (-want +got): %v", diff)
+			}
+		})
+	}
+}
+
+// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function
+// properly distributes requests according to the loraAffinityThreshold.
+func TestLoRASoftAffinityDistribution(t *testing.T) {
+	logger := logutil.NewTestLogger()
+
+	const (
+		testModelName     = "test-model"
+		testAffinityModel = "test-affinity-model"
+		numIterations     = 10000
+		tolerancePercent  = 5.0 // Allow 5% tolerance from expected distribution
+	)
+
+	// Create a test request and pods
+	req := &LLMRequest{
+		Model:               testAffinityModel,
+		ResolvedTargetModel: testAffinityModel,
+	}
+
+	// Test setup: One affinity pod and one available pod
+	pods := []*backendmetrics.FakePodMetrics{
+		{
+			Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}},
+			Metrics: &backendmetrics.Metrics{
+				MaxActiveModels: 2,
+				ActiveModels: map[string]int{
+					testAffinityModel: 1,
+				},
+			},
+		},
+		{
+			Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}},
+			Metrics: &backendmetrics.Metrics{
+				MaxActiveModels: 2,
+				ActiveModels:    map[string]int{},
+			},
+		},
+	}
+
+	// Run the filter function multiple times and count the results
+	affinityCount := 0
+	availableCount := 0
+
+	// Use the actual loraAffinityThreshold as defined in the original code
+	// This test should work with whatever value is set there
+	expectedAffinityPercent := loraAffinityThreshold * 100
+	expectedAvailablePercent := 100 - expectedAffinityPercent
+	for i := 0; i < numIterations; i++ {
+		result, err := loRASoftAffinityFilter(logger, req, toInterface(pods))
+		if err != nil {
+			t.Fatalf("Unexpected error: %v", err)
+		}
+
+		// Check which type of pod was returned
+		if len(result) != 1 {
+			t.Fatalf("Expected exactly one pod in result, got %d", len(result))
+		}
+
+		// Identify if the returned pod is the affinity pod or available pod
+		if _, exists := result[0].GetMetrics().ActiveModels[testAffinityModel]; exists {
+			affinityCount++
+		} else {
+			availableCount++
+		}
+	}
+
+	// Calculate the actual percentages
+	actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100
+	actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100
+
+	// Check if the distribution matches the expected threshold within tolerance.
+	// Both sets of bounds are derived from the expected split; deriving them from
+	// the observed values would make the checks below vacuously true.
+	affinityLowerBound := expectedAffinityPercent - tolerancePercent
+	affinityUpperBound := expectedAffinityPercent + tolerancePercent
+
+	availableLowerBound := expectedAvailablePercent - tolerancePercent
+	availableUpperBound := expectedAvailablePercent + tolerancePercent
+
+	t.Logf("Distribution results over %d iterations:", numIterations)
+	t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold)
+	t.Logf("Actual affinity percent: %.2f%% (%d out of %d)",
+		actualAffinityPercent, affinityCount, numIterations)
+	t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations)
+
+	if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound {
+		t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
+			actualAffinityPercent, affinityLowerBound, affinityUpperBound)
+	}
+	if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound {
+		t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%",
+			actualAvailablePercent, availableLowerBound, availableUpperBound)
+	}
+}
+
+func toInterface(input []*backendmetrics.FakePodMetrics) []backendmetrics.PodMetrics {
+	output := []backendmetrics.PodMetrics{}
+	for _, i := range input {
+		output = append(output, i)
+	}
+	return output
+}
+
+func toStruct(input []backendmetrics.PodMetrics) []*backendmetrics.FakePodMetrics {
+	if input == nil {
+		return nil
+	}
+	output := []*backendmetrics.FakePodMetrics{}
+	for _, i := range input {
+		output = append(output, i.(*backendmetrics.FakePodMetrics))
+	}
+	return output
+}
diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go
similarity index 51%
rename from pkg/ext-proc/scheduling/scheduler.go
rename to pkg/epp/scheduling/scheduler.go
index 9fc3e663..82410787 100644
--- a/pkg/ext-proc/scheduling/scheduler.go
+++ b/pkg/epp/scheduling/scheduler.go
@@ -1,15 +1,33 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
 // Package scheduling implements request scheduling algorithms.
 package scheduling
 
 import (
+	"context"
 	"fmt"
 	"math/rand"
 
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-	"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend"
-	logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging"
-	klog "k8s.io/klog/v2"
+	"github.com/go-logr/logr"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
 const (
@@ -19,8 +37,11 @@ const (
 	queueThresholdCritical = 5
 	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
 	// the threshold for queued requests to be considered low below which we can prioritize LoRA affinity.
-	// The value of 50 is arrived heuristicically based on experiments.
-	queueingThresholdLoRA = 50
+	// The value of 128 was arrived at heuristically based on experiments.
+	queueingThresholdLoRA = 128
+	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable.
+	// loRAAffinityThreshold is the probability with which we prefer a pod that already has
+	// LoRA affinity over a pod that lacks affinity but has room to load more LoRA adapters.
+	loraAffinityThreshold = 0.999
 )
 
 var (
@@ -37,7 +58,7 @@ var (
 			filter: leastQueuingFilterFunc,
 			nextOnSuccessOrFailure: &filter{
 				name:   "low cost LoRA",
-				filter: toFilterFunc(lowLoRACostPredicate),
+				filter: loRASoftAffinityFilter,
 				nextOnSuccessOrFailure: &filter{
 					name:   "least KV cache percent",
 					filter: leastKVCacheFilterFunc,
@@ -59,14 +80,9 @@ var (
 		name:   "low queueing filter",
 		filter: toFilterFunc((lowQueueingPodPredicate)),
 		nextOnSuccess: &filter{
-			name:   "affinity LoRA",
-			filter: toFilterFunc(loRAAffinityPredicate),
-			nextOnSuccess: queueAndKVCacheFilter,
-			nextOnFailure: &filter{
-				name:                   "can accept LoRA Adapter",
-				filter:                 toFilterFunc(canAcceptNewLoraPredicate),
-				nextOnSuccessOrFailure: queueAndKVCacheFilter,
-			},
+			name:                   "affinity LoRA",
+			filter:                 loRASoftAffinityFilter,
+			nextOnSuccessOrFailure: queueAndKVCacheFilter,
 		},
 		nextOnFailure: queueLoRAAndKVCacheFilter,
 	}
@@ -82,43 +98,39 @@ var (
 	// request to make room for critical requests.
 	nextOnFailure: &filter{
 		name: "drop request",
-		filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) {
-			klog.Infof("Dropping request %v", req)
-			return []*backend.PodMetrics{}, status.Errorf(
-				codes.ResourceExhausted, "dropping request due to limited backend resources")
+		filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) {
+			logger.V(logutil.DEFAULT).Info("Request dropped", "request", req)
+			return []backendmetrics.PodMetrics{}, errutil.Error{
+				Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources",
+			}
 		},
 	},
 }
 )
 
-func NewScheduler(pmp PodMetricsProvider) *Scheduler {
-
+func NewScheduler(datastore datastore.Datastore) *Scheduler {
 	return &Scheduler{
-		podMetricsProvider: pmp,
-		filter:             defaultFilter,
+		datastore: datastore,
+		filter:    defaultFilter,
 	}
 }
 
 type Scheduler struct {
-	podMetricsProvider PodMetricsProvider
-	filter             Filter
-}
-
-// PodMetricsProvider is an interface to provide set of pods in the backend and information such as
-// metrics.
-type PodMetricsProvider interface {
-	AllPodMetrics() []*backend.PodMetrics
+	datastore datastore.Datastore
+	filter    Filter
 }
 
 // Schedule finds the target pod based on metrics and the requested lora adapter.
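The hunk below reworks `Schedule` to take a context and return the selected `PodMetrics`. A hedged usage sketch of the new API; the caller function and model names are hypothetical, and only `NewScheduler`, `LLMRequest`, `Schedule`, and `GetMetrics` are taken from this diff:

```go
package example

import (
	"context"

	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling"
)

// pickEndpoint is a hypothetical caller; ds is assumed to be a populated datastore.
func pickEndpoint(ctx context.Context, ds datastore.Datastore) error {
	sched := scheduling.NewScheduler(ds)
	req := &scheduling.LLMRequest{
		Model:               "tweet-summary",        // hypothetical model name
		ResolvedTargetModel: "tweet-summary-lora-1", // hypothetical adapter
		Critical:            false,                  // sheddable
	}
	target, err := sched.Schedule(ctx, req)
	if err != nil {
		// Sheddable requests can be dropped with the canonical code
		// InferencePoolResourceExhausted when the pool is saturated.
		return err
	}
	_ = target.GetMetrics() // the caller would route to this pod's address
	return nil
}
```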
-func (s *Scheduler) Schedule(req *LLMRequest) (targetPod backend.Pod, err error) {
-	klog.V(logutil.VERBOSE).Infof("request: %v; metrics: %+v", req, s.podMetricsProvider.AllPodMetrics())
-	pods, err := s.filter.Filter(req, s.podMetricsProvider.AllPodMetrics())
+func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backendmetrics.PodMetrics, err error) {
+	logger := log.FromContext(ctx).WithValues("request", req)
+	podMetrics := s.datastore.PodGetAll()
+	logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", podMetrics)
+	pods, err := s.filter.Filter(logger, req, podMetrics)
 	if err != nil || len(pods) == 0 {
-		return backend.Pod{}, fmt.Errorf(
+		return nil, fmt.Errorf(
 			"failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err)
 	}
-	klog.V(logutil.VERBOSE).Infof("Going to randomly select a pod from the candidates: %+v", pods)
+	logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods)
 	i := rand.Intn(len(pods))
-	return pods[i].Pod, nil
+	return pods[i], nil
 }
diff --git a/pkg/epp/scheduling/types.go b/pkg/epp/scheduling/types.go
new file mode 100644
index 00000000..29e6648d
--- /dev/null
+++ b/pkg/epp/scheduling/types.go
@@ -0,0 +1,27 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package scheduling
+
+// LLMRequest is a structured representation of the fields we parse out of the request body.
+type LLMRequest struct {
+	Model string
+	// TargetModels is a map of target model name to weight.
+	TargetModels map[string]int
+	// ResolvedTargetModel is the final target model after traffic split.
+	ResolvedTargetModel string
+	Critical            bool
+}
diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go
new file mode 100644
index 00000000..46694f7b
--- /dev/null
+++ b/pkg/epp/server/controller_manager.go
@@ -0,0 +1,80 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package server + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +var scheme = runtime.NewScheme() + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) +} + +// NewDefaultManager creates a new controller manager with default configuration. +func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { + defaultOpts := ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + ByObject: map[client.Object]cache.ByObject{ + &corev1.Pod{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + &v1alpha2.InferencePool{}: { + Namespaces: map[string]cache.Config{ + namespace: { + FieldSelector: fields.SelectorFromSet(fields.Set{ + "metadata.name": name, + }), + }, + }, + }, + &v1alpha2.InferenceModel{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + }, + }, + } + return NewManagerWithOptions(restConfig, defaultOpts) +} + +// NewManagerWithOptions creates a new controller manager with injectable options. +func NewManagerWithOptions(restConfig *rest.Config, opts manager.Options) (ctrl.Manager, error) { + manager, err := ctrl.NewManager(restConfig, opts) + if err != nil { + return nil, fmt.Errorf("failed to create controller manager: %v", err) + } + return manager, nil +} diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go new file mode 100644 index 00000000..a6c9f1d3 --- /dev/null +++ b/pkg/epp/server/runserver.go @@ -0,0 +1,164 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "context" + "crypto/tls" + "fmt" + "time" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" +) + +// ExtProcServerRunner provides methods to manage an external process server. 
+type ExtProcServerRunner struct {
+	GrpcPort                                 int
+	DestinationEndpointHintMetadataNamespace string
+	DestinationEndpointHintKey               string
+	PoolName                                 string
+	PoolNamespace                            string
+	Datastore                                datastore.Datastore
+	SecureServing                            bool
+	CertPath                                 string
+	UseStreaming                             bool
+	RefreshPrometheusMetricsInterval         time.Duration
+
+	// This should only be used in tests. We won't need this once we don't inject metrics in the tests.
+	// TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup
+	TestPodMetricsClient *backendmetrics.FakePodMetricsClient
+}
+
+// Default values for CLI flags in main
+const (
+	DefaultGrpcPort                                 = 9002                             // default for --grpcPort
+	DefaultDestinationEndpointHintMetadataNamespace = "envoy.lb"                       // default for --destinationEndpointHintMetadataNamespace
+	DefaultDestinationEndpointHintKey               = "x-gateway-destination-endpoint" // default for --destinationEndpointHintKey
+	DefaultPoolName                                 = ""                               // required but no default
+	DefaultPoolNamespace                            = "default"                        // default for --poolNamespace
+	DefaultRefreshMetricsInterval                   = 50 * time.Millisecond            // default for --refreshMetricsInterval
+	DefaultRefreshPrometheusMetricsInterval         = 5 * time.Second                  // default for --refreshPrometheusMetricsInterval
+	DefaultSecureServing                            = true                             // default for --secureServing
+)
+
+func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
+	return &ExtProcServerRunner{
+		GrpcPort:                   DefaultGrpcPort,
+		DestinationEndpointHintKey: DefaultDestinationEndpointHintKey,
+		DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace,
+		PoolName:      DefaultPoolName,
+		PoolNamespace: DefaultPoolNamespace,
+		SecureServing: DefaultSecureServing,
+		// Datastore can be assigned later.
+	}
+}
+
+// SetupWithManager sets up the runner with the given manager.
+func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
+	// Create the controllers and register them with the manager
+	if err := (&controller.InferencePoolReconciler{
+		Datastore: r.Datastore,
+		Client:    mgr.GetClient(),
+		PoolNamespacedName: types.NamespacedName{
+			Name:      r.PoolName,
+			Namespace: r.PoolNamespace,
+		},
+		Record: mgr.GetEventRecorderFor("InferencePool"),
+	}).SetupWithManager(mgr); err != nil {
+		return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err)
+	}
+
+	if err := (&controller.InferenceModelReconciler{
+		Datastore: r.Datastore,
+		Client:    mgr.GetClient(),
+		PoolNamespacedName: types.NamespacedName{
+			Name:      r.PoolName,
+			Namespace: r.PoolNamespace,
+		},
+		Record: mgr.GetEventRecorderFor("InferenceModel"),
+	}).SetupWithManager(ctx, mgr); err != nil {
+		return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err)
+	}
+
+	if err := (&controller.PodReconciler{
+		Datastore: r.Datastore,
+		Client:    mgr.GetClient(),
+		Record:    mgr.GetEventRecorderFor("pod"),
+	}).SetupWithManager(mgr); err != nil {
+		return fmt.Errorf("failed setting up PodReconciler: %w", err)
+	}
+	return nil
+}
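The internal `runnable.NoLeaderElection` helper used below is not shown in this diff; a sketch of what such a wrapper typically looks like with controller-runtime (an assumption about its shape, not the repo's actual implementation):

```go
package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/manager"
)

// leaderless wraps a Runnable and reports that it must run on every replica,
// not only on the elected leader.
type leaderless struct{ manager.Runnable }

func (leaderless) NeedLeaderElection() bool { return false }

func noLeaderElection(r manager.Runnable) manager.Runnable { return leaderless{r} }

// addServer registers a long-running server with the manager so it starts on
// all replicas regardless of leader election.
func addServer(mgr manager.Manager) error {
	srv := manager.RunnableFunc(func(ctx context.Context) error {
		<-ctx.Done() // stand-in for serving until shutdown
		return nil
	})
	return mgr.Add(noLeaderElection(srv))
}
```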
+// AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server.
+// The runnable implements LeaderElectionRunnable with leader election disabled.
+func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable {
+	return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error {
+		backendmetrics.StartMetricsLogger(ctx, r.Datastore, r.RefreshPrometheusMetricsInterval)
+		var srv *grpc.Server
+		if r.SecureServing {
+			var cert tls.Certificate
+			var err error
+			if r.CertPath != "" {
+				cert, err = tls.LoadX509KeyPair(r.CertPath+"/tls.crt", r.CertPath+"/tls.key")
+			} else {
+				// No cert path was provided, so create a self-signed certificate.
+				cert, err = tlsutil.CreateSelfSignedTLSCertificate(logger)
+			}
+			if err != nil {
+				logger.Error(err, "Failed to load or create TLS certificate")
+				return err
+			}
+
+			// Create TLS-based credentials and init the server.
+			creds := credentials.NewTLS(&tls.Config{
+				Certificates: []tls.Certificate{cert},
+			})
+			srv = grpc.NewServer(grpc.Creds(creds))
+		} else {
+			srv = grpc.NewServer()
+		}
+		var extProcServer extProcPb.ExternalProcessorServer
+		if r.UseStreaming {
+			logger.Info("Using streaming extproc server")
+			extProcServer = handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore)
+		} else {
+			logger.Info("Using standard extproc server")
+			extProcServer = handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore)
+		}
+		extProcPb.RegisterExternalProcessorServer(
+			srv,
+			extProcServer,
+		)
+
+		// Forward to the gRPC runnable.
+		return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx)
+	}))
+}
diff --git a/pkg/epp/server/runserver_test.go b/pkg/epp/server/runserver_test.go
new file mode 100644
index 00000000..b02688c5
--- /dev/null
+++ b/pkg/epp/server/runserver_test.go
@@ -0,0 +1,38 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package server_test
+
+import (
+	"testing"
+
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+func TestRunnable(t *testing.T) {
+	// Make sure AsRunnable() does not use leader election.
+	runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger())
+	r, ok := runner.(manager.LeaderElectionRunnable)
+	if !ok {
+		t.Fatal("runner is not LeaderElectionRunnable")
+	}
+	if r.NeedLeaderElection() {
+		t.Error("runner returned NeedLeaderElection = true, expected false")
+	}
+}
diff --git a/pkg/epp/util/error/error.go b/pkg/epp/util/error/error.go
new file mode 100644
index 00000000..2f9c992c
--- /dev/null
+++ b/pkg/epp/util/error/error.go
@@ -0,0 +1,34 @@
+package error
+
+import (
+	"fmt"
+)
+
+// Error is an error struct for errors returned by the epp server.
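The `Error` type defined just below carries a canonical code string; the intended pattern is to wrap a failure in `errutil.Error` and branch on `CanonicalCode` upstream. A short usage sketch, grounded in the API from this file (the `classify` helper is hypothetical):

```go
package example

import (
	"fmt"

	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
)

// classify maps a canonical error code to a response action.
func classify(err error) string {
	switch errutil.CanonicalCode(err) {
	case errutil.InferencePoolResourceExhausted:
		return "retryable: pool saturated"
	case errutil.BadRequest:
		return "client error"
	default:
		return "internal error"
	}
}

func demo() {
	err := errutil.Error{
		Code: errutil.InferencePoolResourceExhausted,
		Msg:  "dropping request due to limited backend resources",
	}
	fmt.Println(classify(err)) // retryable: pool saturated
	fmt.Println(err.Error())   // inference gateway: InferencePoolResourceExhausted - ...
}
```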
+type Error struct { + Code string + Msg string +} + +const ( + Unknown = "Unknown" + BadRequest = "BadRequest" + Internal = "Internal" + ModelServerError = "ModelServerError" + BadConfiguration = "BadConfiguration" + InferencePoolResourceExhausted = "InferencePoolResourceExhausted" +) + +// Error returns a string version of the error. +func (e Error) Error() string { + return fmt.Sprintf("inference gateway: %s - %s", e.Code, e.Msg) +} + +// CanonicalCode returns the error's ErrorCode. +func CanonicalCode(err error) string { + e, ok := err.(Error) + if ok { + return e.Code + } + return Unknown +} diff --git a/pkg/epp/util/logging/fatal.go b/pkg/epp/util/logging/fatal.go new file mode 100644 index 00000000..d8a9a937 --- /dev/null +++ b/pkg/epp/util/logging/fatal.go @@ -0,0 +1,31 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logging + +import ( + "os" + + "github.com/go-logr/logr" +) + +// Fatal calls logger.Error followed by os.Exit(1). +// +// This is a utility function and should not be used in production code! +func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...interface{}) { + logger.Error(err, msg, keysAndValues...) + os.Exit(1) +} diff --git a/pkg/epp/util/logging/logger.go b/pkg/epp/util/logging/logger.go new file mode 100644 index 00000000..5e6ed88d --- /dev/null +++ b/pkg/epp/util/logging/logger.go @@ -0,0 +1,36 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logging + +import ( + "context" + + "github.com/go-logr/logr" + uberzap "go.uber.org/zap" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +// NewTestLogger creates a new Zap logger using the dev mode. +func NewTestLogger() logr.Logger { + return zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller())) +} + +// NewTestLoggerIntoContext creates a new Zap logger using the dev mode and inserts it into the given context. +func NewTestLoggerIntoContext(ctx context.Context) context.Context { + return log.IntoContext(ctx, zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller()))) +} diff --git a/pkg/epp/util/logging/logging_const.go b/pkg/epp/util/logging/logging_const.go new file mode 100644 index 00000000..823ab28b --- /dev/null +++ b/pkg/epp/util/logging/logging_const.go @@ -0,0 +1,24 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package logging + +const ( + DEFAULT = 2 + VERBOSE = 3 + DEBUG = 4 + TRACE = 5 +) diff --git a/pkg/epp/util/testing/diff.go b/pkg/epp/util/testing/diff.go new file mode 100644 index 00000000..34b0b8ca --- /dev/null +++ b/pkg/epp/util/testing/diff.go @@ -0,0 +1,27 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testing + +import ( + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +func DiffModelLists(want, got []*v1alpha2.InferenceModel) string { + return cmp.Diff(want, got, cmpopts.SortSlices(func(a, b *v1alpha2.InferenceModel) bool { return a.Name < b.Name })) +} diff --git a/pkg/epp/util/testing/request.go b/pkg/epp/util/testing/request.go new file mode 100644 index 00000000..30772ad5 --- /dev/null +++ b/pkg/epp/util/testing/request.go @@ -0,0 +1,67 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package testing
+
+import (
+	"encoding/json"
+
+	envoyCorev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
+	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	"github.com/go-logr/logr"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+)
+
+func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest {
+	j := map[string]interface{}{
+		"model":       model,
+		"prompt":      prompt,
+		"max_tokens":  100,
+		"temperature": 0,
+	}
+
+	llmReq, err := json.Marshal(j)
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to marshal LLM request")
+	}
+	req := &extProcPb.ProcessingRequest{
+		Request: &extProcPb.ProcessingRequest_RequestBody{
+			RequestBody: &extProcPb.HttpBody{Body: llmReq, EndOfStream: true},
+		},
+	}
+	return req
+}
+
+func GenerateStreamedRequestSet(logger logr.Logger, prompt, model string) []*extProcPb.ProcessingRequest {
+	requests := []*extProcPb.ProcessingRequest{}
+	headerReq := &extProcPb.ProcessingRequest{
+		Request: &extProcPb.ProcessingRequest_RequestHeaders{
+			RequestHeaders: &extProcPb.HttpHeaders{
+				Headers: &envoyCorev3.HeaderMap{
+					Headers: []*envoyCorev3.HeaderValue{
+						{
+							Key:   "hi",
+							Value: "mom",
+						},
+					},
+				},
+			},
+		},
+	}
+	requests = append(requests, headerReq)
+	requests = append(requests, GenerateRequest(logger, prompt, model))
+	return requests
+}
diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go
new file mode 100644
index 00000000..c4018631
--- /dev/null
+++ b/pkg/epp/util/testing/wrappers.go
@@ -0,0 +1,193 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package testing
+
+import (
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+)
+
+// PodWrapper wraps a Pod.
+type PodWrapper struct {
+	corev1.Pod
+}
+
+func FromBase(pod *corev1.Pod) *PodWrapper {
+	return &PodWrapper{
+		Pod: *pod,
+	}
+}
+
+// MakePod creates a wrapper for a Pod.
+func MakePod(podName string) *PodWrapper {
+	return &PodWrapper{
+		corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec:   corev1.PodSpec{},
+			Status: corev1.PodStatus{},
+		},
+	}
+}
+
+// Complete sets the necessary fields for a Pod so that it is not rejected by the apiserver.
+func (p *PodWrapper) Complete() *PodWrapper {
+	if p.Pod.Namespace == "" {
+		p.Namespace("default")
+	}
+	p.Spec.Containers = []corev1.Container{
+		{
+			Name:  "mock-vllm",
+			Image: "mock-vllm:latest",
+		},
+	}
+	return p
+}
+
+func (p *PodWrapper) Namespace(ns string) *PodWrapper {
+	p.ObjectMeta.Namespace = ns
+	return p
+}
+
+// Labels sets the pod labels.
+func (p *PodWrapper) Labels(labels map[string]string) *PodWrapper {
+	p.ObjectMeta.Labels = labels
+	return p
+}
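These wrappers compose fluently; a short usage sketch inside this package (names and values are illustrative, every method is defined in this file):

```go
// buildFixtures returns a schedulable test Pod and a matching InferenceModel.
func buildFixtures() (*corev1.Pod, *v1alpha2.InferenceModel) {
	pod := MakePod("pod1").
		Namespace("default").
		Labels(map[string]string{"app": "vllm"}).
		ReadyCondition().
		IP("10.0.0.7").
		Complete().
		ObjRef()

	model := MakeInferenceModel("tweet-summary").
		Namespace("default").
		ModelName("tweet-summary").
		PoolName("my-pool").
		ObjRef()

	return pod, model
}
```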
diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go
new file mode 100644
index 00000000..c4018631
--- /dev/null
+++ b/pkg/epp/util/testing/wrappers.go
@@ -0,0 +1,193 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package testing
+
+import (
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+)
+
+// PodWrapper wraps a Pod.
+type PodWrapper struct {
+	corev1.Pod
+}
+
+func FromBase(pod *corev1.Pod) *PodWrapper {
+	return &PodWrapper{
+		Pod: *pod,
+	}
+}
+
+// MakePod creates a wrapper for a Pod.
+func MakePod(podName string) *PodWrapper {
+	return &PodWrapper{
+		corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: podName,
+			},
+			Spec:   corev1.PodSpec{},
+			Status: corev1.PodStatus{},
+		},
+	}
+}
+
+// Complete sets the fields required for the apiserver to accept the Pod.
+func (p *PodWrapper) Complete() *PodWrapper {
+	if p.Pod.Namespace == "" {
+		p.Namespace("default")
+	}
+	p.Spec.Containers = []corev1.Container{
+		{
+			Name:  "mock-vllm",
+			Image: "mock-vllm:latest",
+		},
+	}
+	return p
+}
+
+func (p *PodWrapper) Namespace(ns string) *PodWrapper {
+	p.ObjectMeta.Namespace = ns
+	return p
+}
+
+// Labels sets the pod labels.
+func (p *PodWrapper) Labels(labels map[string]string) *PodWrapper {
+	p.ObjectMeta.Labels = labels
+	return p
+}
+
+// ReadyCondition sets the PodReady=true condition.
+func (p *PodWrapper) ReadyCondition() *PodWrapper {
+	p.Status.Conditions = []corev1.PodCondition{{
+		Type:   corev1.PodReady,
+		Status: corev1.ConditionTrue,
+	}}
+	return p
+}
+
+func (p *PodWrapper) IP(ip string) *PodWrapper {
+	p.Status.PodIP = ip
+	return p
+}
+
+func (p *PodWrapper) DeletionTimestamp() *PodWrapper {
+	now := metav1.Now()
+	p.ObjectMeta.DeletionTimestamp = &now
+	p.ObjectMeta.Finalizers = []string{"finalizer"}
+	return p
+}
+
+// ObjRef returns the wrapped Pod.
+func (p *PodWrapper) ObjRef() *corev1.Pod {
+	return &p.Pod
+}
+
+// InferenceModelWrapper wraps an InferenceModel.
+type InferenceModelWrapper struct {
+	v1alpha2.InferenceModel
+}
+
+// MakeInferenceModel creates a wrapper for an InferenceModel.
+func MakeInferenceModel(name string) *InferenceModelWrapper {
+	return &InferenceModelWrapper{
+		v1alpha2.InferenceModel{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: name,
+			},
+			Spec: v1alpha2.InferenceModelSpec{},
+		},
+	}
+}
+
+func (m *InferenceModelWrapper) Namespace(ns string) *InferenceModelWrapper {
+	m.ObjectMeta.Namespace = ns
+	return m
+}
+
+// ObjRef returns the wrapped InferenceModel.
+func (m *InferenceModelWrapper) ObjRef() *v1alpha2.InferenceModel {
+	return &m.InferenceModel
+}
+
+func (m *InferenceModelWrapper) ModelName(modelName string) *InferenceModelWrapper {
+	m.Spec.ModelName = modelName
+	return m
+}
+
+func (m *InferenceModelWrapper) PoolName(poolName string) *InferenceModelWrapper {
+	m.Spec.PoolRef = v1alpha2.PoolObjectReference{Name: v1alpha2.ObjectName(poolName)}
+	return m
+}
+
+func (m *InferenceModelWrapper) Criticality(criticality v1alpha2.Criticality) *InferenceModelWrapper {
+	m.Spec.Criticality = &criticality
+	return m
+}
+
+func (m *InferenceModelWrapper) DeletionTimestamp() *InferenceModelWrapper {
+	now := metav1.Now()
+	m.ObjectMeta.DeletionTimestamp = &now
+	m.ObjectMeta.Finalizers = []string{"finalizer"}
+	return m
+}
+
+func (m *InferenceModelWrapper) CreationTimestamp(t metav1.Time) *InferenceModelWrapper {
+	m.ObjectMeta.CreationTimestamp = t
+	return m
+}
+
+// InferencePoolWrapper wraps an InferencePool.
+type InferencePoolWrapper struct {
+	v1alpha2.InferencePool
+}
+
+// MakeInferencePool creates a wrapper for an InferencePool.
+func MakeInferencePool(name string) *InferencePoolWrapper {
+	return &InferencePoolWrapper{
+		v1alpha2.InferencePool{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: name,
+			},
+			Spec: v1alpha2.InferencePoolSpec{},
+		},
+	}
+}
+
+func (m *InferencePoolWrapper) Namespace(ns string) *InferencePoolWrapper {
+	m.ObjectMeta.Namespace = ns
+	return m
+}
+
+func (m *InferencePoolWrapper) Selector(selector map[string]string) *InferencePoolWrapper {
+	s := make(map[v1alpha2.LabelKey]v1alpha2.LabelValue)
+	for k, v := range selector {
+		s[v1alpha2.LabelKey(k)] = v1alpha2.LabelValue(v)
+	}
+	m.Spec.Selector = s
+	return m
+}
+
+func (m *InferencePoolWrapper) TargetPortNumber(p int32) *InferencePoolWrapper {
+	m.Spec.TargetPortNumber = p
+	return m
+}
+
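A brief usage sketch (not part of the diff itself) showing how the fluent wrappers and `DiffModelLists` compose in a test; every name and value below is illustrative.

```go
package testing

import (
	"testing"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

func TestWrapperComposition(t *testing.T) {
	// Build a pool, a model that references it, and a ready pod that
	// matches the pool selector.
	pool := MakeInferencePool("test-pool").
		Namespace("default").
		Selector(map[string]string{"app": "vllm"}).
		TargetPortNumber(8000).
		ObjRef()

	model := MakeInferenceModel("food-review").
		Namespace("default").
		ModelName("food-review").
		PoolName(pool.Name).
		ObjRef()

	pod := MakePod("pod1").Complete().ReadyCondition().IP("10.0.0.1").ObjRef()
	if pod.Status.PodIP != "10.0.0.1" {
		t.Fatalf("unexpected pod IP: %s", pod.Status.PodIP)
	}

	// DiffModelLists sorts by name before diffing, so order doesn't matter.
	want := []*v1alpha2.InferenceModel{model}
	if diff := DiffModelLists(want, []*v1alpha2.InferenceModel{model}); diff != "" {
		t.Errorf("unexpected diff (-want +got):\n%s", diff)
	}
}
```

+// ObjRef returns the wrapped InferencePool.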
+func (m *InferencePoolWrapper) ObjRef() *v1alpha2.InferencePool { + return &m.InferencePool +} diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go deleted file mode 100644 index 627ddbe5..00000000 --- a/pkg/ext-proc/backend/datastore.go +++ /dev/null @@ -1,211 +0,0 @@ -package backend - -import ( - "context" - "errors" - "math/rand" - "sync" - "time" - - "github.com/google/go-cmp/cmp" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/informers" - informersv1 "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - clientset "k8s.io/client-go/kubernetes" - listersv1 "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/tools/cache" - "k8s.io/klog/v2" -) - -func NewK8sDataStore(options ...K8sDatastoreOption) *K8sDatastore { - store := &K8sDatastore{ - poolMu: sync.RWMutex{}, - InferenceModels: &sync.Map{}, - } - - store.podListerFactory = store.createPodLister - for _, opt := range options { - opt(store) - } - return store -} - -// The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) -type K8sDatastore struct { - client kubernetes.Interface - // poolMu is used to synchronize access to the inferencePool. - poolMu sync.RWMutex - inferencePool *v1alpha1.InferencePool - podListerFactory PodListerFactory - podLister *PodLister - InferenceModels *sync.Map -} - -type K8sDatastoreOption func(*K8sDatastore) -type PodListerFactory func(*v1alpha1.InferencePool) *PodLister - -// WithPods can be used in tests to override the pods. -func WithPodListerFactory(factory PodListerFactory) K8sDatastoreOption { - return func(store *K8sDatastore) { - store.podListerFactory = factory - } -} - -type PodLister struct { - Lister listersv1.PodLister - sharedInformer informers.SharedInformerFactory -} - -func (l *PodLister) listEverything() ([]*corev1.Pod, error) { - return l.Lister.List(labels.Everything()) - -} - -func (ds *K8sDatastore) SetClient(client kubernetes.Interface) { - ds.client = client -} - -func (ds *K8sDatastore) setInferencePool(pool *v1alpha1.InferencePool) { - ds.poolMu.Lock() - defer ds.poolMu.Unlock() - - if ds.inferencePool != nil && cmp.Equal(ds.inferencePool.Spec.Selector, pool.Spec.Selector) { - // Pool updated, but the selector stayed the same, so no need to change the informer. - ds.inferencePool = pool - return - } - - // New pool or selector updated. - ds.inferencePool = pool - - if ds.podLister != nil && ds.podLister.sharedInformer != nil { - // Shutdown the old informer async since this takes a few seconds. - go func() { - ds.podLister.sharedInformer.Shutdown() - }() - } - - if ds.podListerFactory != nil { - // Create a new informer with the new selector. 
- ds.podLister = ds.podListerFactory(ds.inferencePool) - if ds.podLister != nil && ds.podLister.sharedInformer != nil { - ctx := context.Background() - ds.podLister.sharedInformer.Start(ctx.Done()) - ds.podLister.sharedInformer.WaitForCacheSync(ctx.Done()) - } - } -} - -func (ds *K8sDatastore) getInferencePool() (*v1alpha1.InferencePool, error) { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() - if !ds.HasSynced() { - return nil, errors.New("InferencePool is not initialized in data store") - } - return ds.inferencePool, nil -} - -func (ds *K8sDatastore) createPodLister(pool *v1alpha1.InferencePool) *PodLister { - if ds.client == nil { - return nil - } - klog.V(logutil.DEFAULT).Infof("Creating informer for pool %v", pool.Name) - selectorSet := make(map[string]string) - for k, v := range pool.Spec.Selector { - selectorSet[string(k)] = string(v) - } - - newPodInformer := func(cs clientset.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - informer := informersv1.NewFilteredPodInformer(cs, pool.Namespace, resyncPeriod, cache.Indexers{}, func(options *metav1.ListOptions) { - options.LabelSelector = labels.SelectorFromSet(selectorSet).String() - }) - err := informer.SetTransform(func(obj interface{}) (interface{}, error) { - // Remove unnecessary fields to improve memory footprint. - if accessor, err := meta.Accessor(obj); err == nil { - if accessor.GetManagedFields() != nil { - accessor.SetManagedFields(nil) - } - } - return obj, nil - }) - if err != nil { - klog.Errorf("Failed to set pod transformer: %v", err) - } - return informer - } - // 0 means we disable resyncing, it is not really useful to resync every hour (the controller-runtime default), - // if things go wrong in the watch, no one will wait for an hour for things to get fixed. - // As precedence, kube-scheduler also disables this since it is expensive to list all pods from the api-server regularly. - resyncPeriod := time.Duration(0) - sharedInformer := informers.NewSharedInformerFactory(ds.client, resyncPeriod) - sharedInformer.InformerFor(&v1.Pod{}, newPodInformer) - - return &PodLister{ - Lister: sharedInformer.Core().V1().Pods().Lister(), - sharedInformer: sharedInformer, - } -} - -func (ds *K8sDatastore) getPods() ([]*corev1.Pod, error) { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() - if !ds.HasSynced() { - return nil, errors.New("InferencePool is not initialized in datastore") - } - pods, err := ds.podLister.listEverything() - if err != nil { - return nil, err - } - return pods, nil -} - -func (s *K8sDatastore) FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) { - infModel, ok := s.InferenceModels.Load(modelName) - if ok { - returnModel = infModel.(*v1alpha1.InferenceModel) - } - return -} - -// HasSynced returns true if InferencePool is set in the data store. 
-func (ds *K8sDatastore) HasSynced() bool { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() - return ds.inferencePool != nil -} - -func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string { - var weights int32 - - source := rand.NewSource(rand.Int63()) - if seed > 0 { - source = rand.NewSource(seed) - } - r := rand.New(source) - for _, model := range model.Spec.TargetModels { - weights += *model.Weight - } - klog.V(logutil.VERBOSE).Infof("Weights for Model(%v) total to: %v", model.Name, weights) - randomVal := r.Int31n(weights) - for _, model := range model.Spec.TargetModels { - if randomVal < *model.Weight { - return model.Name - } - randomVal -= *model.Weight - } - return "" -} - -func IsCritical(model *v1alpha1.InferenceModel) bool { - if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha1.Critical { - return true - } - return false -} diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go deleted file mode 100644 index 323b3bb0..00000000 --- a/pkg/ext-proc/backend/datastore_test.go +++ /dev/null @@ -1,133 +0,0 @@ -package backend - -import ( - "testing" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestHasSynced(t *testing.T) { - tests := []struct { - name string - inferencePool *v1alpha1.InferencePool - hasSynced bool - }{ - { - name: "Ready when InferencePool exists in data store", - inferencePool: &v1alpha1.InferencePool{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-pool", - Namespace: "default", - }, - }, - hasSynced: true, - }, - { - name: "Not ready when InferencePool is nil in data store", - inferencePool: nil, - hasSynced: false, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - datastore := NewK8sDataStore() - // Set the inference pool - if tt.inferencePool != nil { - datastore.setInferencePool(tt.inferencePool) - } - // Check if the data store has been initialized - hasSynced := datastore.HasSynced() - if hasSynced != tt.hasSynced { - t.Errorf("IsInitialized() = %v, want %v", hasSynced, tt.hasSynced) - } - }) - } -} - -func TestRandomWeightedDraw(t *testing.T) { - tests := []struct { - name string - model *v1alpha1.InferenceModel - want string - }{ - { - name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ - { - Name: "canary", - Weight: pointer(50), - }, - { - Name: "v1", - Weight: pointer(50), - }, - }, - }, - }, - want: "canary", - }, - { - name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ - { - Name: "canary", - Weight: pointer(25), - }, - { - Name: "v1.1", - Weight: pointer(55), - }, - { - Name: "v1", - Weight: pointer(50), - }, - }, - }, - }, - want: "v1", - }, - { - name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ - { - Name: "canary", - Weight: pointer(20), - }, - { - Name: "v1.1", - Weight: pointer(20), - }, - { - Name: "v1", - Weight: pointer(10), - }, - }, - }, - }, - want: "v1.1", - }, - } - var seedVal int64 = 420 - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - for range 10000 { - model := RandomWeightedDraw(test.model, seedVal) - if model != test.want { - t.Errorf("Model returned!: %v", model) - break - } - } - }) - } -} - -func pointer(v int32) *int32 { - return &v -} diff --git 
a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go deleted file mode 100644 index 63f20db6..00000000 --- a/pkg/ext-proc/backend/fake.go +++ /dev/null @@ -1,29 +0,0 @@ -package backend - -import ( - "context" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - klog "k8s.io/klog/v2" -) - -type FakePodMetricsClient struct { - Err map[string]error - Res map[string]*PodMetrics -} - -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) { - if err, ok := f.Err[pod.Name]; ok { - return nil, err - } - klog.V(1).Infof("pod: %+v\n existing: %+v \n new: %+v \n", pod, existing, f.Res[pod.Name]) - return f.Res[pod.Name], nil -} - -type FakeDataStore struct { - Res map[string]*v1alpha1.InferenceModel -} - -func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) { - return fds.Res[modelName] -} diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go deleted file mode 100644 index 3164e098..00000000 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ /dev/null @@ -1,56 +0,0 @@ -package backend - -import ( - "context" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/tools/record" - "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -type InferenceModelReconciler struct { - client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - Datastore *K8sDatastore - PoolNamespacedName types.NamespacedName -} - -func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.Namespace != c.PoolNamespacedName.Namespace { - return ctrl.Result{}, nil - } - klog.V(1).Infof("reconciling InferenceModel %v", req.NamespacedName) - - service := &v1alpha1.InferenceModel{} - if err := c.Get(ctx, req.NamespacedName, service); err != nil { - klog.Error(err, "unable to get InferencePool") - return ctrl.Result{}, err - } - - c.updateDatastore(service) - return ctrl.Result{}, nil -} - -func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&v1alpha1.InferenceModel{}). - Complete(c) -} - -func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { - if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { - klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolNamespacedName.Name) - klog.V(1).Infof("Adding/Updating inference model: %v", infModel.Spec.ModelName) - c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) - return - } - klog.V(logutil.DEFAULT).Infof("Removing/Not adding inference model: %v", infModel.Spec.ModelName) - // If we get here. The model is not relevant to this pool, remove. 
- c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) -} diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go deleted file mode 100644 index 117766b9..00000000 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ /dev/null @@ -1,169 +0,0 @@ -package backend - -import ( - "sync" - "testing" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" -) - -var ( - service1 = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "fake model1", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service", - }, - } - service1Modified = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "fake model1", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-poolio"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service", - }, - } - service2 = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "fake model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service-2", - }, - } -) - -func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { - tests := []struct { - name string - datastore *K8sDatastore - incomingService *v1alpha1.InferenceModel - wantInferenceModels *sync.Map - }{ - { - name: "No Services registered; valid, new service incoming.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }, - InferenceModels: &sync.Map{}, - }, - incomingService: service1, - wantInferenceModels: populateServiceMap(service1), - }, - { - name: "Removing existing service.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }, - InferenceModels: populateServiceMap(service1), - }, - incomingService: service1Modified, - wantInferenceModels: populateServiceMap(), - }, - { - name: "Unrelated service, do nothing.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }, - InferenceModels: populateServiceMap(service1), - }, - incomingService: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "fake model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-poolio"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "unrelated-service", - }, - }, - wantInferenceModels: populateServiceMap(service1), - }, - { - name: "Add to existing", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }, - InferenceModels: populateServiceMap(service1), - }, - incomingService: service2, - wantInferenceModels: 
populateServiceMap(service1, service2), - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - InferenceModelReconciler := &InferenceModelReconciler{ - Datastore: test.datastore, - PoolNamespacedName: types.NamespacedName{Name: test.datastore.inferencePool.Name}, - } - InferenceModelReconciler.updateDatastore(test.incomingService) - - if ok := mapsEqual(InferenceModelReconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok { - t.Error("Maps are not equal") - } - }) - } -} - -func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { - returnVal := &sync.Map{} - - for _, service := range services { - returnVal.Store(service.Spec.ModelName, service) - } - return returnVal -} - -func mapsEqual(map1, map2 *sync.Map) bool { - equal := true - - map1.Range(func(k, v any) bool { - if _, ok := map2.Load(k); !ok { - equal = false - return false - } - return true - }) - map2.Range(func(k, v any) bool { - if _, ok := map1.Load(k); !ok { - equal = false - return false - } - return true - }) - - return equal -} diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go deleted file mode 100644 index 0c2ae75f..00000000 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ /dev/null @@ -1,56 +0,0 @@ -package backend - -import ( - "context" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/tools/record" - klog "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -// InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources -// This implementation is just used for reading & maintaining data sync. The Gateway implementation -// will have the proper controller that will create/manage objects on behalf of the server pool. -type InferencePoolReconciler struct { - client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - PoolNamespacedName types.NamespacedName - Datastore *K8sDatastore -} - -func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.NamespacedName.Name != c.PoolNamespacedName.Name || req.NamespacedName.Namespace != c.PoolNamespacedName.Namespace { - return ctrl.Result{}, nil - } - klog.V(1).Info("reconciling InferencePool", req.NamespacedName) - - serverPool := &v1alpha1.InferencePool{} - if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { - klog.Error(err, "unable to get InferencePool") - return ctrl.Result{}, err - } - - c.updateDatastore(serverPool) - - return ctrl.Result{}, nil -} - -func (c *InferencePoolReconciler) updateDatastore(serverPool *v1alpha1.InferencePool) { - pool, _ := c.Datastore.getInferencePool() - if pool == nil || - serverPool.ObjectMeta.ResourceVersion != pool.ObjectMeta.ResourceVersion { - klog.Infof("Updating inference pool to %v/%v", serverPool.ObjectMeta.Namespace, serverPool.ObjectMeta.Name) - c.Datastore.setInferencePool(serverPool) - } -} - -func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&v1alpha1.InferencePool{}). 
- Complete(c) -} diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go b/pkg/ext-proc/backend/inferencepool_reconciler_test.go deleted file mode 100644 index f03c31cb..00000000 --- a/pkg/ext-proc/backend/inferencepool_reconciler_test.go +++ /dev/null @@ -1,85 +0,0 @@ -package backend - -import ( - "reflect" - "testing" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -var ( - pool1 = &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "50", - }, - } - // Different name, same RV doesn't really make sense, but helps with testing the - // updateStore impl which relies on the equality of RVs alone. - modPool1SameRV = &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool-mod", - ResourceVersion: "50", - }, - } - modPool1DiffRV = &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool-mod", - ResourceVersion: "51", - }, - } -) - -func TestUpdateDatastore_InferencePoolReconciler(t *testing.T) { - tests := []struct { - name string - datastore *K8sDatastore - incomingPool *v1alpha1.InferencePool - wantPool *v1alpha1.InferencePool - }{ - { - name: "InferencePool not set, should set InferencePool", - datastore: &K8sDatastore{}, - incomingPool: pool1.DeepCopy(), - wantPool: pool1, - }, - { - name: "InferencePool set, matching RVs, do nothing", - datastore: &K8sDatastore{ - inferencePool: pool1.DeepCopy(), - }, - incomingPool: modPool1SameRV.DeepCopy(), - wantPool: pool1, - }, - { - name: "InferencePool set, differing RVs, re-set InferencePool", - datastore: &K8sDatastore{ - inferencePool: pool1.DeepCopy(), - }, - incomingPool: modPool1DiffRV.DeepCopy(), - wantPool: modPool1DiffRV, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - inferencePoolReconciler := &InferencePoolReconciler{Datastore: test.datastore} - inferencePoolReconciler.updateDatastore(test.incomingPool) - - gotPool := inferencePoolReconciler.Datastore.inferencePool - if !reflect.DeepEqual(gotPool, test.wantPool) { - t.Errorf("Unexpected InferencePool: want %#v, got: %#v", test.wantPool, gotPool) - } - }) - } -} diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go deleted file mode 100644 index d6ccf85f..00000000 --- a/pkg/ext-proc/backend/provider.go +++ /dev/null @@ -1,219 +0,0 @@ -package backend - -import ( - "context" - "fmt" - "math/rand" - "strconv" - "sync" - "time" - - "go.uber.org/multierr" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - corev1 "k8s.io/api/core/v1" - klog "k8s.io/klog/v2" -) - -const ( - fetchMetricsTimeout = 5 * time.Second -) - -func NewProvider(pmc PodMetricsClient, datastore *K8sDatastore) *Provider { - p := &Provider{ - podMetrics: sync.Map{}, - pmc: pmc, - datastore: datastore, - } - return p -} - -// Provider provides backend pods and information such as metrics. 
-type Provider struct { - // key: PodName, value: *PodMetrics - // TODO: change to use NamespacedName once we support multi-tenant inferencePools - podMetrics sync.Map - pmc PodMetricsClient - datastore *K8sDatastore -} - -type PodMetricsClient interface { - FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) -} - -func (p *Provider) AllPodMetrics() []*PodMetrics { - res := []*PodMetrics{} - fn := func(k, v any) bool { - res = append(res, v.(*PodMetrics)) - return true - } - p.podMetrics.Range(fn) - return res -} - -func (p *Provider) UpdatePodMetrics(pod Pod, pm *PodMetrics) { - p.podMetrics.Store(pod.Name, pm) -} - -func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) { - val, ok := p.podMetrics.Load(pod.Name) - if ok { - return val.(*PodMetrics), true - } - return nil, false -} - -func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duration) error { - p.refreshPodsOnce() - - if err := p.refreshMetricsOnce(); err != nil { - klog.Errorf("Failed to init metrics: %v", err) - } - - klog.Infof("Initialized pods and metrics: %+v", p.AllPodMetrics()) - - // periodically refresh pods - go func() { - for { - time.Sleep(refreshPodsInterval) - p.refreshPodsOnce() - } - }() - - // periodically refresh metrics - go func() { - for { - time.Sleep(refreshMetricsInterval) - if err := p.refreshMetricsOnce(); err != nil { - klog.V(logutil.TRACE).Infof("Failed to refresh metrics: %v", err) - } - } - }() - - // Periodically print out the pods and metrics for DEBUGGING. - if klog.V(logutil.DEBUG).Enabled() { - go func() { - for { - time.Sleep(5 * time.Second) - klog.Infof("===DEBUG: Current Pods and metrics: %+v", p.AllPodMetrics()) - } - }() - } - - return nil -} - -// refreshPodsOnce lists pods and updates keys in the podMetrics map. -// Note this function doesn't update the PodMetrics value, it's done separately. -func (p *Provider) refreshPodsOnce() { - pods, err := p.datastore.getPods() - if err != nil { - klog.V(logutil.DEFAULT).Infof("Couldn't list pods: %v", err) - p.podMetrics.Clear() - return - } - pool, _ := p.datastore.getInferencePool() - // revision is used to track which entries we need to remove in the next iteration that removes - // metrics for pods that don't exist anymore. Otherwise we have to build a map of the listed pods, - // which is not efficient. Revision can be any random id as long as it is different from the last - // refresh, so it should be very reliable (as reliable as the probability of randomly picking two - // different numbers from range 0 - maxInt). - revision := rand.Int() - ready := 0 - for _, pod := range pods { - if !podIsReady(pod) { - continue - } - // a ready pod - ready++ - if val, ok := p.podMetrics.Load(pod.Name); ok { - // pod already exists - pm := val.(*PodMetrics) - pm.revision = revision - continue - } - // new pod, add to the store for probing - new := &PodMetrics{ - Pod: Pod{ - Name: pod.Name, - Address: pod.Status.PodIP + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)), - }, - Metrics: Metrics{ - ActiveModels: make(map[string]int), - }, - revision: revision, - } - p.podMetrics.Store(pod.Name, new) - } - - klog.V(logutil.DEFAULT).Infof("Pods in pool %s/%s with selector %v: total=%v ready=%v", - pool.Namespace, pool.Name, pool.Spec.Selector, len(pods), ready) - - // remove pods that don't exist any more. 
- mergeFn := func(k, v any) bool { - pm := v.(*PodMetrics) - if pm.revision != revision { - p.podMetrics.Delete(pm.Pod.Name) - } - return true - } - p.podMetrics.Range(mergeFn) -} - -func podIsReady(pod *corev1.Pod) bool { - if pod.DeletionTimestamp != nil { - return false - } - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - return condition.Status == corev1.ConditionTrue - } - } - return false -} - -func (p *Provider) refreshMetricsOnce() error { - ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) - defer cancel() - start := time.Now() - defer func() { - d := time.Since(start) - // TODO: add a metric instead of logging - klog.V(logutil.TRACE).Infof("Refreshed metrics in %v", d) - }() - var wg sync.WaitGroup - errCh := make(chan error) - processOnePod := func(key, value any) bool { - klog.V(logutil.TRACE).Infof("Processing pod %v and metric %v", key, value) - existing := value.(*PodMetrics) - pod := existing.Pod - wg.Add(1) - go func() { - defer wg.Done() - updated, err := p.pmc.FetchMetrics(ctx, pod, existing) - if err != nil { - errCh <- fmt.Errorf("failed to parse metrics from %s: %v", pod, err) - return - } - p.UpdatePodMetrics(pod, updated) - klog.V(logutil.TRACE).Infof("Updated metrics for pod %s: %v", pod, updated.Metrics) - }() - return true - } - p.podMetrics.Range(processOnePod) - - // Wait for metric collection for all pods to complete and close the error channel in a - // goroutine so this is unblocking, allowing the code to proceed to the error collection code - // below. - // Note we couldn't use a buffered error channel with a size because the size of the podMetrics - // sync.Map is unknown beforehand. - go func() { - wg.Wait() - close(errCh) - }() - - var errs error - for err := range errCh { - errs = multierr.Append(errs, err) - } - return errs -} diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go deleted file mode 100644 index 9159ba48..00000000 --- a/pkg/ext-proc/backend/provider_test.go +++ /dev/null @@ -1,181 +0,0 @@ -package backend - -import ( - "errors" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testingutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" - corev1 "k8s.io/api/core/v1" -) - -var ( - pod1 = &PodMetrics{ - Pod: Pod{Name: "pod1", Address: "address1:9009"}, - Metrics: Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - } - pod2 = &PodMetrics{ - Pod: Pod{Name: "pod2", Address: "address2:9009"}, - Metrics: Metrics{ - WaitingQueueSize: 1, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo1": 1, - "bar1": 1, - }, - }, - } -) - -func TestProvider(t *testing.T) { - allPodsLister := &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{ - testingutil.MakePod(pod1.Pod.Name).SetReady().SetPodIP("address1").Obj(), - testingutil.MakePod(pod2.Pod.Name).SetReady().SetPodIP("address2").Obj(), - }, - } - allPodsMetricsClient := &FakePodMetricsClient{ - Res: map[string]*PodMetrics{ - pod1.Pod.Name: pod1, - pod2.Pod.Name: pod2, - }, - } - - tests := []struct { - name string - initPodMetrics []*PodMetrics - lister *testingutil.FakePodLister - pmc PodMetricsClient - step func(*Provider) - want []*PodMetrics - }{ - { - name: "Init without refreshing pods", - 
initPodMetrics: []*PodMetrics{pod1, pod2}, - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1, pod2}, - }, - { - name: "Fetching all success", - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1, pod2}, - }, - { - name: "Fetch metrics error", - lister: allPodsLister, - pmc: &FakePodMetricsClient{ - Err: map[string]error{ - pod2.Pod.Name: errors.New("injected error"), - }, - Res: map[string]*PodMetrics{ - pod1.Pod.Name: pod1, - }, - }, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{ - pod1, - // Failed to fetch pod2 metrics so it remains the default values. - { - Pod: pod2.Pod, - Metrics: Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - MaxActiveModels: 0, - ActiveModels: map[string]int{}, - }, - }, - }, - }, - { - name: "A new pod added", - initPodMetrics: []*PodMetrics{pod2}, - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1, pod2}, - }, - { - name: "A pod removed", - initPodMetrics: []*PodMetrics{pod1, pod2}, - lister: &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{ - testingutil.MakePod(pod2.Pod.Name).SetReady().SetPodIP("address2").Obj(), - }, - }, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod2}, - }, - { - name: "A pod removed, another added", - initPodMetrics: []*PodMetrics{pod1}, - lister: &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{ - testingutil.MakePod(pod1.Pod.Name).SetReady().SetPodIP("address1").Obj(), - }, - }, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1}, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - datastore := NewK8sDataStore(WithPodListerFactory( - func(pool *v1alpha1.InferencePool) *PodLister { - return &PodLister{ - Lister: test.lister, - } - })) - datastore.setInferencePool(&v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{TargetPortNumber: 9009}, - }) - p := NewProvider(test.pmc, datastore) - for _, m := range test.initPodMetrics { - p.UpdatePodMetrics(m.Pod, m) - } - test.step(p) - metrics := p.AllPodMetrics() - lessFunc := func(a, b *PodMetrics) bool { - return a.String() < b.String() - } - if diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(lessFunc), - cmpopts.IgnoreFields(PodMetrics{}, "revision")); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } - }) - } -} diff --git a/pkg/ext-proc/backend/types.go b/pkg/ext-proc/backend/types.go deleted file mode 100644 index d375e4ec..00000000 --- a/pkg/ext-proc/backend/types.go +++ /dev/null @@ -1,54 +0,0 @@ -// Package backend is a library to interact with backend model servers such as probing metrics. -package backend - -import "fmt" - -type PodSet map[Pod]bool - -type Pod struct { - Name string - Address string -} - -func (p Pod) String() string { - return p.Name + ":" + p.Address -} - -type Metrics struct { - // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. - ActiveModels map[string]int - // MaxActiveModels is the maximum number of models that can be loaded to GPU. 
- MaxActiveModels int - RunningQueueSize int - WaitingQueueSize int - KVCacheUsagePercent float64 - KvCacheMaxTokenCapacity int -} - -type PodMetrics struct { - Pod - Metrics - revision int -} - -func (pm *PodMetrics) String() string { - return fmt.Sprintf("Pod: %+v; Metrics: %+v", pm.Pod, pm.Metrics) -} - -func (pm *PodMetrics) Clone() *PodMetrics { - cm := make(map[string]int, len(pm.ActiveModels)) - for k, v := range pm.ActiveModels { - cm[k] = v - } - clone := &PodMetrics{ - Pod: pm.Pod, - Metrics: Metrics{ - ActiveModels: cm, - RunningQueueSize: pm.RunningQueueSize, - WaitingQueueSize: pm.WaitingQueueSize, - KVCacheUsagePercent: pm.KVCacheUsagePercent, - KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity, - }, - } - return clone -} diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go deleted file mode 100644 index d98f4602..00000000 --- a/pkg/ext-proc/handlers/request.go +++ /dev/null @@ -1,158 +0,0 @@ -package handlers - -import ( - "encoding/json" - "errors" - "fmt" - "strconv" - - configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "google.golang.org/protobuf/types/known/structpb" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - klog "k8s.io/klog/v2" -) - -// HandleRequestBody handles body of the request to the backend server, such as parsing the "model" -// parameter. -// Envoy sends the request body to ext proc before sending the request to the backend server. -func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).Infof("Handling request body") - - // Unmarshal request body (must be JSON). - v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) - var rb map[string]interface{} - if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { - klog.Errorf("Error unmarshaling request body: %v", err) - return nil, fmt.Errorf("error unmarshaling request body: %v", err) - } - klog.V(logutil.VERBOSE).Infof("Request body: %v", rb) - - // Resolve target models. - model, ok := rb["model"].(string) - if !ok { - return nil, errors.New("model not found in request") - } - klog.V(logutil.VERBOSE).Infof("Model requested: %v", model) - modelName := model - - // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. - // This might be a security risk in the future where adapters not registered in the InferenceModel - // are able to be requested by using their distinct name. - modelObj := s.datastore.FetchModelData(model) - if modelObj == nil { - return nil, fmt.Errorf("error finding a model object in InferenceModel for input %v", model) - } - if len(modelObj.Spec.TargetModels) > 0 { - modelName = backend.RandomWeightedDraw(modelObj, 0) - if modelName == "" { - return nil, fmt.Errorf("error getting target model name for model %v", modelObj.Name) - } - } - llmReq := &scheduling.LLMRequest{ - Model: model, - ResolvedTargetModel: modelName, - Critical: backend.IsCritical(modelObj), - } - klog.V(logutil.VERBOSE).Infof("LLM Request: %+v", llmReq) - - requestBody := v.RequestBody.Body - var err error - // Update target models in the body. 
- if llmReq.Model != llmReq.ResolvedTargetModel { - rb["model"] = llmReq.ResolvedTargetModel - requestBody, err = json.Marshal(rb) - if err != nil { - klog.Errorf("Error marshaling request body: %v", err) - return nil, fmt.Errorf("error marshaling request body: %v", err) - } - klog.V(logutil.VERBOSE).Infof("Updated body: %v", string(requestBody)) - } - - targetPod, err := s.scheduler.Schedule(llmReq) - if err != nil { - return nil, fmt.Errorf("failed to find target pod: %w", err) - } - klog.V(logutil.VERBOSE).Infof("Selected target model %v in target pod: %v\n", llmReq.ResolvedTargetModel, targetPod) - - reqCtx.Model = llmReq.Model - reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel - reqCtx.RequestSize = len(v.RequestBody.Body) - reqCtx.TargetPod = targetPod - - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. - headers := []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: s.targetEndpointKey, - RawValue: []byte(targetPod.Address), - }, - }, - // We need to update the content length header if the body is mutated, see Envoy doc: - // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(len(requestBody))), - }, - }, - } - // Print headers for debugging - for _, header := range headers { - klog.V(logutil.VERBOSE).Infof("[request_body] Header Key: %s, Header Value: %s\n", header.Header.Key, header.Header.RawValue) - } - - resp := &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: headers, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: requestBody, - }, - }, - }, - }, - }, - DynamicMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.targetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: targetPod.Address, - }, - }, - }, - }, - } - return resp, nil -} - -func HandleRequestHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) *extProcPb.ProcessingResponse { - klog.V(logutil.VERBOSE).Info("Handling request headers ...") - r := req.Request - h := r.(*extProcPb.ProcessingRequest_RequestHeaders) - klog.V(logutil.VERBOSE).Infof("Headers: %+v\n", h) - - resp := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - // Set `clear_route_cache = true` to force Envoy to recompute the target cluster - // based on the new "target-pod" header. - // See https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto#service-ext-proc-v3-commonresponse. 
- ClearRouteCache: true, - }, - }, - }, - } - - return resp -} diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go deleted file mode 100644 index 172249b6..00000000 --- a/pkg/ext-proc/handlers/server.go +++ /dev/null @@ -1,147 +0,0 @@ -package handlers - -import ( - "io" - "time" - - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - klog "k8s.io/klog/v2" -) - -func NewServer(pp PodProvider, scheduler Scheduler, targetEndpointKey string, datastore ModelDataStore) *Server { - return &Server{ - scheduler: scheduler, - podProvider: pp, - targetEndpointKey: targetEndpointKey, - datastore: datastore, - } -} - -// Server implements the Envoy external processing server. -// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto -type Server struct { - scheduler Scheduler - podProvider PodProvider - // The key of the header to specify the target pod address. This value needs to match Envoy - // configuration. - targetEndpointKey string - datastore ModelDataStore -} - -type Scheduler interface { - Schedule(b *scheduling.LLMRequest) (targetPod backend.Pod, err error) -} - -// PodProvider is an interface to provide set of pods in the backend and information such as metrics. -type PodProvider interface { - GetPodMetrics(pod backend.Pod) (*backend.PodMetrics, bool) - UpdatePodMetrics(pod backend.Pod, pm *backend.PodMetrics) -} - -type ModelDataStore interface { - FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) -} - -func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { - klog.V(logutil.VERBOSE).Info("Processing") - ctx := srv.Context() - // Create request context to share states during life time of an HTTP request. - // See https://github.com/envoyproxy/envoy/issues/17540. - reqCtx := &RequestContext{} - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - req, err := srv.Recv() - if err == io.EOF { - return nil - } - if err != nil { - // This error occurs very frequently, though it doesn't seem to have any impact. - // TODO Figure out if we can remove this noise. 
- klog.V(logutil.VERBOSE).Infof("cannot receive stream request: %v", err) - return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) - } - - var resp *extProcPb.ProcessingResponse - switch v := req.Request.(type) { - case *extProcPb.ProcessingRequest_RequestHeaders: - reqCtx.RequestReceivedTimestamp = time.Now() - resp = HandleRequestHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).Infof("Request context after HandleRequestHeaders: %+v", reqCtx) - case *extProcPb.ProcessingRequest_RequestBody: - resp, err = s.HandleRequestBody(reqCtx, req) - if err == nil { - metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) - metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) - } - klog.V(logutil.VERBOSE).Infof("Request context after HandleRequestBody: %+v", reqCtx) - case *extProcPb.ProcessingRequest_ResponseHeaders: - resp, err = s.HandleResponseHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).Infof("Request context after HandleResponseHeaders: %+v", reqCtx) - case *extProcPb.ProcessingRequest_ResponseBody: - resp, err = s.HandleResponseBody(reqCtx, req) - if err == nil && reqCtx.ResponseComplete { - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens) - } - klog.V(logutil.VERBOSE).Infof("Request context after HandleResponseBody: %+v", reqCtx) - default: - klog.Errorf("Unknown Request type %+v", v) - return status.Error(codes.Unknown, "unknown request type") - } - if err != nil { - klog.Errorf("failed to process request: %v", err) - switch status.Code(err) { - // This code can be returned by scheduler when there is no capacity for sheddable - // requests. - case codes.ResourceExhausted: - resp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_TooManyRequests, - }, - }, - }, - } - default: - return status.Errorf(status.Code(err), "failed to handle request: %v", err) - } - } - - klog.V(logutil.VERBOSE).Infof("response: %v", resp) - if err := srv.Send(resp); err != nil { - klog.Errorf("send error %v", err) - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - } -} - -// RequestContext stores context information during the life time of an HTTP request. 
-type RequestContext struct { - TargetPod backend.Pod - Model string - ResolvedTargetModel string - RequestReceivedTimestamp time.Time - ResponseCompleteTimestamp time.Time - RequestSize int - Response Response - ResponseSize int - ResponseComplete bool -} diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go deleted file mode 100644 index 62527d06..00000000 --- a/pkg/ext-proc/health.go +++ /dev/null @@ -1,29 +0,0 @@ -package main - -import ( - "context" - - "google.golang.org/grpc/codes" - healthPb "google.golang.org/grpc/health/grpc_health_v1" - "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - klog "k8s.io/klog/v2" -) - -type healthServer struct { - datastore *backend.K8sDatastore -} - -func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { - if !s.datastore.HasSynced() { - klog.Infof("gRPC health check not serving: %s", in.String()) - return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil - } - klog.V(logutil.DEBUG).Infof("gRPC health check serving: %s", in.String()) - return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil -} - -func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { - return status.Error(codes.Unimplemented, "Watch is not implemented") -} diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go deleted file mode 100644 index 98b7e6ca..00000000 --- a/pkg/ext-proc/main.go +++ /dev/null @@ -1,214 +0,0 @@ -package main - -import ( - "context" - "flag" - "fmt" - "net" - "net/http" - "strconv" - - "github.com/prometheus/client_golang/prometheus/promhttp" - "google.golang.org/grpc" - healthPb "google.golang.org/grpc/health/grpc_health_v1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/kubernetes" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" - "k8s.io/component-base/metrics/legacyregistry" - klog "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/metrics/filters" -) - -const ( - defaultMetricsEndpoint = "/metrics" -) - -var ( - grpcPort = flag.Int( - "grpcPort", - runserver.DefaultGrpcPort, - "The gRPC port used for communicating with Envoy proxy") - grpcHealthPort = flag.Int( - "grpcHealthPort", - 9003, - "The port used for gRPC liveness and readiness probes") - metricsPort = flag.Int( - "metricsPort", 9090, "The metrics port") - targetEndpointKey = flag.String( - "targetEndpointKey", - runserver.DefaultTargetEndpointKey, - "Header key used by Envoy to route to the appropriate pod. 
This must match Envoy configuration.") - poolName = flag.String( - "poolName", - runserver.DefaultPoolName, - "Name of the InferencePool this Endpoint Picker is associated with.") - poolNamespace = flag.String( - "poolNamespace", - runserver.DefaultPoolNamespace, - "Namespace of the InferencePool this Endpoint Picker is associated with.") - refreshPodsInterval = flag.Duration( - "refreshPodsInterval", - runserver.DefaultRefreshPodsInterval, - "interval to refresh pods") - refreshMetricsInterval = flag.Duration( - "refreshMetricsInterval", - runserver.DefaultRefreshMetricsInterval, - "interval to refresh metrics") - - scheme = runtime.NewScheme() -) - -func init() { - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha1.AddToScheme(scheme)) -} - -func main() { - klog.InitFlags(nil) - flag.Parse() - - ctrl.SetLogger(klog.TODO()) - cfg, err := ctrl.GetConfig() - if err != nil { - klog.Fatalf("Failed to get rest config: %v", err) - } - // Validate flags - if err := validateFlags(); err != nil { - klog.Fatalf("Failed to validate flags: %v", err) - } - - // Print all flag values - flags := "Flags: " - flag.VisitAll(func(f *flag.Flag) { - flags += fmt.Sprintf("%s=%v; ", f.Name, f.Value) - }) - klog.Info(flags) - - datastore := backend.NewK8sDataStore() - - serverRunner := &runserver.ExtProcServerRunner{ - GrpcPort: *grpcPort, - TargetEndpointKey: *targetEndpointKey, - PoolName: *poolName, - PoolNamespace: *poolNamespace, - RefreshPodsInterval: *refreshPodsInterval, - RefreshMetricsInterval: *refreshMetricsInterval, - Scheme: scheme, - Config: ctrl.GetConfigOrDie(), - Datastore: datastore, - } - serverRunner.Setup() - - k8sClient, err := kubernetes.NewForConfigAndClient(cfg, serverRunner.Manager.GetHTTPClient()) - if err != nil { - klog.Fatalf("Failed to create client: %v", err) - } - datastore.SetClient(k8sClient) - - // Start health and ext-proc servers in goroutines - healthSvr := startHealthServer(datastore, *grpcHealthPort) - extProcSvr := serverRunner.Start(&vllm.PodMetricsClientImpl{}) - // Start metrics handler - metricsSvr := startMetricsHandler(*metricsPort, cfg) - - // Start manager, blocking - serverRunner.StartManager() - - // Gracefully shutdown servers - if healthSvr != nil { - klog.Info("Health server shutting down") - healthSvr.GracefulStop() - } - if extProcSvr != nil { - klog.Info("Ext-proc server shutting down") - extProcSvr.GracefulStop() - } - if metricsSvr != nil { - klog.Info("Metrics server shutting down") - if err := metricsSvr.Shutdown(context.Background()); err != nil { - klog.Infof("Metrics server Shutdown: %v", err) - } - } - - klog.Info("All components shutdown") -} - -// startHealthServer starts the gRPC health probe server in a goroutine. -func startHealthServer(ds *backend.K8sDatastore, port int) *grpc.Server { - svr := grpc.NewServer() - healthPb.RegisterHealthServer(svr, &healthServer{datastore: ds}) - - go func() { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) - if err != nil { - klog.Fatalf("Health server failed to listen: %v", err) - } - klog.Infof("Health server listening on port: %d", port) - - // Blocking and will return when shutdown is complete. 
- if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped { - klog.Fatalf("Health server failed: %v", err) - } - klog.Info("Health server shutting down") - }() - return svr -} - -func startMetricsHandler(port int, cfg *rest.Config) *http.Server { - metrics.Register() - - var svr *http.Server - go func() { - klog.Info("Starting metrics HTTP handler ...") - - mux := http.NewServeMux() - mux.Handle(defaultMetricsEndpoint, metricsHandlerWithAuthenticationAndAuthorization(cfg)) - - svr = &http.Server{ - Addr: net.JoinHostPort("", strconv.Itoa(port)), - Handler: mux, - } - if err := svr.ListenAndServe(); err != http.ErrServerClosed { - klog.Fatalf("failed to start metrics HTTP handler: %v", err) - } - }() - return svr -} - -func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) http.Handler { - h := promhttp.HandlerFor( - legacyregistry.DefaultGatherer, - promhttp.HandlerOpts{}, - ) - httpClient, err := rest.HTTPClientFor(cfg) - if err != nil { - klog.Fatalf("failed to create http client for metrics auth: %v", err) - } - - filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) - if err != nil { - klog.Fatalf("failed to create metrics filter for auth: %v", err) - } - metricsLogger := klog.LoggerWithValues(klog.NewKlogr(), "path", defaultMetricsEndpoint) - metricsAuthHandler, err := filter(metricsLogger, h) - if err != nil { - klog.Fatalf("failed to create metrics auth handler: %v", err) - } - return metricsAuthHandler -} - -func validateFlags() error { - if *poolName == "" { - return fmt.Errorf("required %q flag not set", "poolName") - } - - return nil -} diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go deleted file mode 100644 index d431b076..00000000 --- a/pkg/ext-proc/scheduling/filter.go +++ /dev/null @@ -1,188 +0,0 @@ -package scheduling - -import ( - "errors" - "math" - - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - klog "k8s.io/klog/v2" -) - -type Filter interface { - Name() string - Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) -} - -// filter applies current filterFunc, and then recursively applies next filters depending success or -// failure of the current filterFunc. -// It can be used to construct a flow chart algorithm. -type filter struct { - name string - filter filterFunc - // nextOnSuccess filter will be applied after successfully applying the current filter. - // The filtered results will be passed to the next filter. - nextOnSuccess *filter - // nextOnFailure filter will be applied if current filter fails. - // The original input will be passed to the next filter. - nextOnFailure *filter - // nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the - // success or failure of the current filter. - // NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. - // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of - // nextOnSuccessOrFailure, in the success and failure scenarios, respectively. 
- nextOnSuccessOrFailure *filter -} - -func (f *filter) Name() string { - if f == nil { - return "nil" - } - return f.name -} - -func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - klog.V(logutil.VERBOSE).Infof("Running filter %q on request %v with %v pods", f.name, req, len(pods)) - - filtered, err := f.filter(req, pods) - - next := f.nextOnSuccessOrFailure - if err == nil && len(filtered) > 0 { - if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil { - // No succeeding filters to run, return. - return filtered, err - } - if f.nextOnSuccess != nil { - next = f.nextOnSuccess - } - klog.V(logutil.VERBOSE).Infof("onSuccess %q -> %q, filtered: %v", f.name, next.Name(), len(filtered)) - // On success, pass the filtered result to the next filter. - return next.Filter(req, filtered) - } else { - if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil { - // No succeeding filters to run, return. - return filtered, err - } - if f.nextOnFailure != nil { - next = f.nextOnFailure - } - klog.V(logutil.VERBOSE).Infof("onFailure %q -> %q", f.name, next.Name()) - // On failure, pass the initial set of pods to the next filter. - return next.Filter(req, pods) - } -} - -// filterFunc filters a set of input pods to a subset. -type filterFunc func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) - -// toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. -func toFilterFunc(pp podPredicate) filterFunc { - return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - filtered := []*backend.PodMetrics{} - for _, pod := range pods { - pass := pp(req, pod) - if pass { - filtered = append(filtered, pod) - } - } - if len(filtered) == 0 { - return nil, errors.New("no pods left") - } - return filtered, nil - } -} - -// leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range -// (max-min) by the number of pods, and finds the pods that fall into the first range. -// The intuition is that if there are multiple pods that share similar queue size in the low range, -// we should consider them all instead of the absolute minimum one. This worked better than picking -// the least one as it gives more choices for the next filter, which on aggregate gave better -// results. -// TODO: Compare this strategy with other strategies such as top K. -func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - min := math.MaxInt - max := 0 - filtered := []*backend.PodMetrics{} - - for _, pod := range pods { - if pod.WaitingQueueSize <= min { - min = pod.WaitingQueueSize - } - if pod.WaitingQueueSize >= max { - max = pod.WaitingQueueSize - } - } - - for _, pod := range pods { - if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) { - filtered = append(filtered, pod) - } - } - return filtered, nil -} - -func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool { - return pod.WaitingQueueSize < queueingThresholdLoRA -} - -// leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range -// (max-min) by the number of pods, and finds the pods that fall into the first range. -// The intuition is that if there are multiple pods that share similar KV cache in the low range, we -// should consider them all instead of the absolute minimum one. 
This worked better than picking the -// least one as it gives more choices for the next filter, which on aggregate gave better results. -// TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - min := math.MaxFloat64 - var max float64 = 0 - filtered := []*backend.PodMetrics{} - - for _, pod := range pods { - if pod.KVCacheUsagePercent <= min { - min = pod.KVCacheUsagePercent - } - if pod.KVCacheUsagePercent >= max { - max = pod.KVCacheUsagePercent - } - } - - for _, pod := range pods { - if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { - filtered = append(filtered, pod) - } - } - return filtered, nil -} - -// podPredicate is a filter function to check whether a pod is desired. -type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool - -// We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by -// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to -// a single pod. This gave good performance in our initial benchmarking results in the scenario -// where # of lora slots > # of lora adapters. -func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok || len(pod.ActiveModels) < pod.MaxActiveModels -} - -// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested. -func loRAAffinityPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok -} - -// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
-func canAcceptNewLoraPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { - return len(pod.ActiveModels) < pod.MaxActiveModels -} - -func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { - return req.Critical -} - -func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate { - return func(req *LLMRequest, pod *backend.PodMetrics) bool { - return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold - } -} diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go deleted file mode 100644 index 34731d15..00000000 --- a/pkg/ext-proc/scheduling/filter_test.go +++ /dev/null @@ -1,409 +0,0 @@ -package scheduling - -import ( - "errors" - "testing" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" -) - -func TestFilter(t *testing.T) { - tests := []struct { - name string - req *LLMRequest - input []*backend.PodMetrics - output []*backend.PodMetrics - err bool - filter *filter - }{ - { - name: "simple filter without successor, failure", - filter: &filter{filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - return nil, errors.New("filter error") - }}, - err: true, - }, - { - name: "default filter, critical request", - filter: defaultFilter, - req: &LLMRequest{ - Model: "critical", - ResolvedTargetModel: "critical", - Critical: true, - }, - // pod2 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - input: []*backend.PodMetrics{ - { - Pod: backend.Pod{Name: "pod1"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod2"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod3"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Pod: backend.Pod{Name: "pod2"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - }, - }, - { - name: "default filter, sheddable request, accepted", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // pod1 will be picked because it has capacity for the sheddable request. 
- input: []*backend.PodMetrics{ - { - Pod: backend.Pod{Name: "pod1"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod2"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod3"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Pod: backend.Pod{Name: "pod1"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - }, - }, - { - name: "default filter, sheddable request, dropped", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // All pods have higher KV cache thant the threshold, so the sheddable request will be - // dropped. - input: []*backend.PodMetrics{ - { - Pod: backend.Pod{Name: "pod1"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod2"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: backend.Pod{Name: "pod3"}, - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backend.PodMetrics{}, - err: true, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, err := test.filter.Filter(test.req, test.input) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } - - if diff := cmp.Diff(test.output, got, cmpopts.IgnoreFields(backend.PodMetrics{}, "revision")); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } - }) - } -} - -func TestFilterFunc(t *testing.T) { - tests := []struct { - name string - f filterFunc - req *LLMRequest - input []*backend.PodMetrics - output []*backend.PodMetrics - err bool - }{ - { - name: "least queuing empty input", - f: leastQueuingFilterFunc, - input: []*backend.PodMetrics{}, - output: []*backend.PodMetrics{}, - }, - { - name: "least queuing", - f: leastQueuingFilterFunc, - input: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - }, - }, - { - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - }, - }, - { - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - }, - }, - { - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - }, - }, - }, - }, - { - name: "least kv cache empty input", - f: leastKVCacheFilterFunc, - input: []*backend.PodMetrics{}, - output: []*backend.PodMetrics{}, - }, - { - name: "least kv cache", - f: leastKVCacheFilterFunc, - input: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - KVCacheUsagePercent: 0, - }, - }, - { - Metrics: backend.Metrics{ - KVCacheUsagePercent: 0.3, - }, - }, - { - Metrics: backend.Metrics{ - KVCacheUsagePercent: 
1.0, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - KVCacheUsagePercent: 0, - }, - }, - { - Metrics: backend.Metrics{ - KVCacheUsagePercent: 0.3, - }, - }, - }, - }, - { - name: "noQueueAndLessThanKVCacheThresholdPredicate", - f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), - input: []*backend.PodMetrics{ - { - // This pod should be returned. - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - }, - }, - { - // Queue is non zero, despite low kv cache, should not return. - Metrics: backend.Metrics{ - WaitingQueueSize: 1, - KVCacheUsagePercent: 0.3, - }, - }, - { - // High kv cache despite zero queue, should not return - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 1.0, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - }, - }, - }, - }, - { - name: "low LoRA cost", - f: toFilterFunc(lowLoRACostPredicate), - req: &LLMRequest{ - Model: "model", - ResolvedTargetModel: "model", - }, - input: []*backend.PodMetrics{ - // ActiveModels include input model, should be returned. - { - Metrics: backend.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - // Input model is not active, however the server has room to load another adapter. - { - Metrics: backend.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - // Input is not active, and the server has reached max active models. - { - Metrics: backend.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - }, - output: []*backend.PodMetrics{ - { - Metrics: backend.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - { - Metrics: backend.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - got, err := test.f(test.req, test.input) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } - - if diff := cmp.Diff(test.output, got, cmpopts.IgnoreFields(backend.PodMetrics{}, "revision")); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } - }) - } -} diff --git a/pkg/ext-proc/scheduling/types.go b/pkg/ext-proc/scheduling/types.go deleted file mode 100644 index cfb9d3b8..00000000 --- a/pkg/ext-proc/scheduling/types.go +++ /dev/null @@ -1,11 +0,0 @@ -package scheduling - -// LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. -type LLMRequest struct { - Model string - // Target models is a map of target model name to weight. - TargetModels map[string]int - // Resolved target model is the final target model after traffic split. 
- ResolvedTargetModel string - Critical bool -} diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go deleted file mode 100644 index 981dab11..00000000 --- a/pkg/ext-proc/server/runserver.go +++ /dev/null @@ -1,137 +0,0 @@ -package server - -import ( - "fmt" - "net" - "time" - - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "google.golang.org/grpc" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" - klog "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" -) - -// ExtProcServerRunner provides methods to manage an external process server. -type ExtProcServerRunner struct { - GrpcPort int - TargetEndpointKey string - PoolName string - PoolNamespace string - RefreshPodsInterval time.Duration - RefreshMetricsInterval time.Duration - Scheme *runtime.Scheme - Config *rest.Config - Datastore *backend.K8sDatastore - Manager ctrl.Manager -} - -// Default values for CLI flags in main -const ( - DefaultGrpcPort = 9002 // default for --grpcPort - DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey - DefaultPoolName = "" // required but no default - DefaultPoolNamespace = "default" // default for --poolNamespace - DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval - DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval -) - -func NewDefaultExtProcServerRunner() *ExtProcServerRunner { - return &ExtProcServerRunner{ - GrpcPort: DefaultGrpcPort, - TargetEndpointKey: DefaultTargetEndpointKey, - PoolName: DefaultPoolName, - PoolNamespace: DefaultPoolNamespace, - RefreshPodsInterval: DefaultRefreshPodsInterval, - RefreshMetricsInterval: DefaultRefreshMetricsInterval, - // Scheme, Config, and Datastore can be assigned later. - } -} - -// Setup creates the reconcilers for pools and models and starts the manager. -func (r *ExtProcServerRunner) Setup() { - // Create a new manager to manage controllers - mgr, err := ctrl.NewManager(r.Config, ctrl.Options{Scheme: r.Scheme}) - if err != nil { - klog.Fatalf("Failed to create controller manager: %v", err) - } - r.Manager = mgr - - // Create the controllers and register them with the manager - if err := (&backend.InferencePoolReconciler{ - Datastore: r.Datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferencePool"), - }).SetupWithManager(mgr); err != nil { - klog.Fatalf("Failed setting up InferencePoolReconciler: %v", err) - } - - if err := (&backend.InferenceModelReconciler{ - Datastore: r.Datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferenceModel"), - }).SetupWithManager(mgr); err != nil { - klog.Fatalf("Failed setting up InferenceModelReconciler: %v", err) - } -} - -// Start starts the Envoy external processor server in a goroutine. 
-func (r *ExtProcServerRunner) Start( - podMetricsClient backend.PodMetricsClient, -) *grpc.Server { - svr := grpc.NewServer() - - go func() { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", r.GrpcPort)) - if err != nil { - klog.Fatalf("Ext-proc server failed to listen: %v", err) - } - klog.Infof("Ext-proc server listening on port: %d", r.GrpcPort) - - // Initialize backend provider - pp := backend.NewProvider(podMetricsClient, r.Datastore) - if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval); err != nil { - klog.Fatalf("Failed to initialize backend provider: %v", err) - } - - // Register ext_proc handlers - extProcPb.RegisterExternalProcessorServer( - svr, - handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetEndpointKey, r.Datastore), - ) - - // Blocking and will return when shutdown is complete. - if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped { - klog.Fatalf("Ext-proc server failed: %v", err) - } - klog.Info("Ext-proc server shutting down") - }() - return svr -} - -func (r *ExtProcServerRunner) StartManager() { - if r.Manager == nil { - klog.Fatalf("Runner has no manager setup to run: %v", r) - } - // Start the controller manager. Blocking and will return when shutdown is complete. - klog.Infof("Starting controller manager") - if err := r.Manager.Start(ctrl.SetupSignalHandler()); err != nil { - klog.Fatalf("Error starting controller manager: %v", err) - } - klog.Info("Controller manager shutting down") -} diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go deleted file mode 100644 index 9ff61d8b..00000000 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ /dev/null @@ -1,110 +0,0 @@ -package main - -import ( - "flag" - "fmt" - "os" - "time" - - "github.com/bojand/ghz/printer" - "github.com/bojand/ghz/runner" - "github.com/jhump/protoreflect/desc" - "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" - klog "k8s.io/klog/v2" -) - -var ( - svrAddr = flag.String("server_address", fmt.Sprintf("localhost:%d", runserver.DefaultGrpcPort), "Address of the ext proc server") - totalRequests = flag.Int("total_requests", 100000, "number of requests to be sent for load test") - // Flags when running a local ext proc server. 
- numFakePods = flag.Int("num_fake_pods", 200, "number of fake pods when running a local ext proc server") - numModelsPerPod = flag.Int("num_models_per_pod", 5, "number of fake models per pod when running a local ext proc server") - localServer = flag.Bool("local_server", true, "whether to start a local ext proc server") - refreshPodsInterval = flag.Duration("refreshPodsInterval", 10*time.Second, "interval to refresh pods") - refreshMetricsInterval = flag.Duration("refreshMetricsInterval", 50*time.Millisecond, "interval to refresh metrics") -) - -const ( - port = runserver.DefaultGrpcPort -) - -func main() { - klog.InitFlags(nil) - flag.Parse() - - if *localServer { - test.StartExtProc(port, *refreshPodsInterval, *refreshMetricsInterval, fakePods(), fakeModels()) - time.Sleep(time.Second) // wait until server is up - klog.Info("Server started") - } - - report, err := runner.Run( - "envoy.service.ext_proc.v3.ExternalProcessor.Process", - *svrAddr, - runner.WithInsecure(true), - runner.WithBinaryDataFunc(generateRequest), - runner.WithTotalRequests(uint(*totalRequests)), - ) - if err != nil { - klog.Fatal(err) - } - - printer := printer.ReportPrinter{ - Out: os.Stdout, - Report: report, - } - - printer.Print("summary") -} - -func generateRequest(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { - numModels := *numFakePods * (*numModelsPerPod) - req := test.GenerateRequest(modelName(int(callData.RequestNumber) % numModels)) - data, err := proto.Marshal(req) - if err != nil { - klog.Fatal("marshaling error: ", err) - } - return data -} - -func fakeModels() map[string]*v1alpha1.InferenceModel { - models := map[string]*v1alpha1.InferenceModel{} - for i := range *numFakePods { - for j := range *numModelsPerPod { - m := modelName(i*(*numModelsPerPod) + j) - models[m] = &v1alpha1.InferenceModel{Spec: v1alpha1.InferenceModelSpec{ModelName: m}} - } - } - - return models -} - -func fakePods() []*backend.PodMetrics { - pms := make([]*backend.PodMetrics, 0, *numFakePods) - for i := 0; i < *numFakePods; i++ { - metrics := fakeMetrics(i) - pod := test.FakePod(i) - pms = append(pms, &backend.PodMetrics{Pod: pod, Metrics: metrics}) - } - - return pms -} - -// fakeMetrics adds numModelsPerPod number of adapters to the pod metrics. 
-func fakeMetrics(podNumber int) backend.Metrics { - metrics := backend.Metrics{ - ActiveModels: make(map[string]int), - } - for i := 0; i < *numModelsPerPod; i++ { - metrics.ActiveModels[modelName(podNumber*(*numModelsPerPod)+i)] = 0 - } - return metrics -} - -func modelName(i int) string { - return fmt.Sprintf("adapter-%v", i) -} diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go deleted file mode 100644 index a9dc4efa..00000000 --- a/pkg/ext-proc/test/utils.go +++ /dev/null @@ -1,83 +0,0 @@ -package test - -import ( - "encoding/json" - "fmt" - "net" - "time" - - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "google.golang.org/grpc" - "google.golang.org/grpc/reflection" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - klog "k8s.io/klog/v2" -) - -func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { - ps := make(backend.PodSet) - pms := make(map[string]*backend.PodMetrics) - for _, pod := range pods { - ps[pod.Pod] = true - pms[pod.Pod.Name] = pod - } - pmc := &backend.FakePodMetricsClient{Res: pms} - pp := backend.NewProvider(pmc, backend.NewK8sDataStore()) - if err := pp.Init(refreshPodsInterval, refreshMetricsInterval); err != nil { - klog.Fatalf("failed to initialize: %v", err) - } - return startExtProc(port, pp, models) -} - -// startExtProc starts an extProc server with fake pods. -func startExtProc(port int, pp *backend.Provider, models map[string]*v1alpha1.InferenceModel) *grpc.Server { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) - if err != nil { - klog.Fatalf("failed to listen: %v", err) - } - - s := grpc.NewServer() - - extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(pp, scheduling.NewScheduler(pp), "target-pod", &backend.FakeDataStore{Res: models})) - - klog.Infof("Starting gRPC server on port :%v", port) - reflection.Register(s) - go func() { - err := s.Serve(lis) - if err != nil { - klog.Fatalf("Ext-proc failed with the err: %v", err) - } - }() - return s -} - -func GenerateRequest(model string) *extProcPb.ProcessingRequest { - j := map[string]interface{}{ - "model": model, - "prompt": "hello", - "max_tokens": 100, - "temperature": 0, - } - - llmReq, err := json.Marshal(j) - if err != nil { - klog.Fatal(err) - } - req := &extProcPb.ProcessingRequest{ - Request: &extProcPb.ProcessingRequest_RequestBody{ - RequestBody: &extProcPb.HttpBody{Body: llmReq}, - }, - } - return req -} - -func FakePod(index int) backend.Pod { - address := fmt.Sprintf("address-%v", index) - pod := backend.Pod{ - Name: fmt.Sprintf("pod-%v", index), - Address: address, - } - return pod -} diff --git a/pkg/ext-proc/util/logging/logging_const.go b/pkg/ext-proc/util/logging/logging_const.go deleted file mode 100644 index a6131d18..00000000 --- a/pkg/ext-proc/util/logging/logging_const.go +++ /dev/null @@ -1,8 +0,0 @@ -package logging - -const ( - DEFAULT = 2 - VERBOSE = 3 - DEBUG = 4 - TRACE = 5 -) diff --git a/pkg/ext-proc/util/testing/lister.go b/pkg/ext-proc/util/testing/lister.go deleted file mode 100644 index 023f30a1..00000000 --- a/pkg/ext-proc/util/testing/lister.go +++ /dev/null @@ -1,19 +0,0 @@ -package 
testing - -import ( - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/labels" - listersv1 "k8s.io/client-go/listers/core/v1" -) - -type FakePodLister struct { - PodsList []*v1.Pod -} - -func (l *FakePodLister) List(selector labels.Selector) (ret []*v1.Pod, err error) { - return l.PodsList, nil -} - -func (l *FakePodLister) Pods(namespace string) listersv1.PodNamespaceLister { - panic("not implemented") -} diff --git a/pkg/ext-proc/util/testing/wrappers.go b/pkg/ext-proc/util/testing/wrappers.go deleted file mode 100644 index 7b593bbd..00000000 --- a/pkg/ext-proc/util/testing/wrappers.go +++ /dev/null @@ -1,38 +0,0 @@ -package testing - -import ( - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// PodWrapper wraps a Pod inside. -type PodWrapper struct{ corev1.Pod } - -// MakePod creates a Pod wrapper. -func MakePod(name string) *PodWrapper { - return &PodWrapper{ - corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - }, - } -} - -// Obj returns the inner Pod. -func (p *PodWrapper) Obj() *corev1.Pod { - return &p.Pod -} - -func (p *PodWrapper) SetReady() *PodWrapper { - p.Status.Conditions = []corev1.PodCondition{{ - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }} - return p -} - -func (p *PodWrapper) SetPodIP(podIP string) *PodWrapper { - p.Status.PodIP = podIP - return p -} diff --git a/pkg/manifests/gateway/patch_policy.yaml b/pkg/manifests/gateway/patch_policy.yaml deleted file mode 100644 index 4a556b44..00000000 --- a/pkg/manifests/gateway/patch_policy.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyPatchPolicy -metadata: - name: custom-response-patch-policy - namespace: default -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: inference-gateway - type: JSONPatch - jsonPatches: - # Necessary to create a cluster of the type: ORIGINAL_DST to allow for - # direct pod scheduling. Which is heavily utilized in our scheduling. - # Specifically the field `original_dst_lb_config` allows us to enable - # `use_http_header` and `http_header_name`. 
- # Source: https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/cluster/v3/cluster.proto - - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" - name: original_destination_cluster - operation: - op: add - path: "" - value: - name: original_destination_cluster - type: ORIGINAL_DST - original_dst_lb_config: - use_http_header: true - http_header_name: "x-gateway-destination-endpoint" - connect_timeout: 1000s - lb_policy: CLUSTER_PROVIDED - dns_lookup_family: V4_ONLY - circuit_breakers: - thresholds: - - max_connections: 40000 - max_pending_requests: 40000 - max_requests: 40000 - - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" - name: default/inference-gateway/llm-gw - operation: - op: replace - path: "/virtual_hosts/0/routes/0/route/cluster" - value: original_destination_cluster diff --git a/pkg/manifests/inferencemodel.yaml b/pkg/manifests/inferencemodel.yaml deleted file mode 100644 index 0085a89d..00000000 --- a/pkg/manifests/inferencemodel.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferenceModel -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencemodel-sample -spec: - modelName: tweet-summary - criticality: Critical - poolRef: - # this is the default val: - group: inference.networking.x-k8s.io - # this is the default val: - kind: InferencePool - name: vllm-llama2-7b-pool - targetModels: - - name: tweet-summary-0 - weight: 50 - - name: tweet-summary-1 - weight: 50 diff --git a/pkg/scheduling.md b/pkg/scheduling.md deleted file mode 100644 index 99223ad2..00000000 --- a/pkg/scheduling.md +++ /dev/null @@ -1,5 +0,0 @@ -## Scheduling Package in Ext Proc -The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request. - -# Flowchart -Scheduling Algorithm \ No newline at end of file diff --git a/site-src/concepts/api-overview.md b/site-src/concepts/api-overview.md index 94e76251..9c5c0416 100644 --- a/site-src/concepts/api-overview.md +++ b/site-src/concepts/api-overview.md @@ -1,7 +1,7 @@ # API Overview -## Bakcground -The Gateway API Inference Extension project is an extension of the Kubernetes Gateway API for serving Generative AI models on Kubernetes. Gateway API Inference Extension facilitates standardization of APIs for Kubernetes cluster operators and developers running generative AI inference, while allowing flexibility for underlying gateway implementations (such as Envoy Proxy) to iterate on mechanisms for optimized serving of models. +## Background +The Gateway API Inference Extension project is an extension of the Kubernetes Gateway API for serving Generative AI models on Kubernetes. Gateway API Inference Extension facilitates standardization of APIs for Kubernetes cluster operators and developers running generative AI inference, while allowing flexibility for underlying gateway implementations (such as Envoy Proxy) to iterate on mechanisms for optimized serving of models. Overview of API integration @@ -9,8 +9,8 @@ The Gateway API Inference Extension project is an extension of the Kubernetes Ga ### InferencePool -InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. 
Within the broader Gateway API resource model, this resource is considered a "backend". In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. With InferenceModel, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool.md) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). +InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. Within the broader Gateway API resource model, this resource is considered a "backend". In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. With InferenceModel, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). ### InferenceModel -An InferenceModel represents a model or adapter, and configuration associated with that model. This resource enables you to configure the relative criticality of a model, and allows you to seamlessly translate the requested model name to one or more backend model names. Multiple InferenceModels can be attached to an InferencePool. For more information on this resource, refer to our [InferenceModel documentation](/api-types/inferencemodel.md) or go directly to the [InferenceModel spec](/reference/spec/#inferencemodel). +An InferenceModel represents a model or adapter, and configuration associated with that model. This resource enables you to configure the relative criticality of a model, and allows you to seamlessly translate the requested model name to one or more backend model names. Multiple InferenceModels can be attached to an InferencePool. For more information on this resource, refer to our [InferenceModel documentation](/api-types/inferencemodel) or go directly to the [InferenceModel spec](/reference/spec/#inferencemodel). diff --git a/site-src/concepts/roles-and-personas.md b/site-src/concepts/roles-and-personas.md index b11f43eb..0746adbf 100644 --- a/site-src/concepts/roles-and-personas.md +++ b/site-src/concepts/roles-and-personas.md @@ -1,10 +1,10 @@ # Roles and Personas -Before diving into the details of the API, decriptions of the personas these APIs were designed for will help convey the thought process of the API design. +Before diving into the details of the API, descriptions of the personas these APIs were designed for will help convey the thought process of the API design. ## Inference Platform Admin -The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads. Including handling Ops for: +The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads, including handling Ops for: - Hardware - Model Server @@ -15,7 +15,7 @@ The Inference Platform Admin creates and manages the infrastructure necessary to ## Inference Workload Owner -An Inference Workload Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). 
This includes:
+An Inference Workload Owner persona owns and manages one or many Generative AI Workloads (LLM focused *currently*). This includes:
 
 - Defining criticality
 - Managing fine-tunes
diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md
new file mode 100644
index 00000000..9ce8c3a4
--- /dev/null
+++ b/site-src/guides/adapter-rollout.md
@@ -0,0 +1,133 @@
+# Adapter Rollout
+
+The goal of this guide is to demonstrate how to roll out a new adapter version.
+
+## **Prerequisites**
+
+Follow the steps in the [main guide](index.md).
+
+
+## **Safely roll out the v2 adapter**
+
+### Load the new adapter version to the model servers
+
+This guide leverages the LoRA syncer sidecar to dynamically manage adapters within a vLLM deployment, enabling users to add or remove them through a shared ConfigMap.
+
+
+Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version.
+
+
+```bash
+   kubectl edit configmap vllm-llama2-7b-adapters
+```
+
+Change the ConfigMap to match the following (note the new entry under models):
+
+```yaml
+   apiVersion: v1
+   kind: ConfigMap
+   metadata:
+     name: vllm-llama2-7b-adapters
+   data:
+     configmap.yaml: |
+       vLLMLoRAConfig:
+         name: vllm-llama2-7b-adapters
+         port: 8000
+         ensureExist:
+           models:
+           - base-model: meta-llama/Llama-2-7b-hf
+             id: tweet-summary-1
+             source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+           - base-model: meta-llama/Llama-2-7b-hf
+             id: tweet-summary-2
+             source: mahimairaja/tweet-summarization-llama-2-finetuned
+```
+
+The new adapter version is applied to the model servers live, without requiring a restart.
+
+
+### Direct traffic to the new adapter version
+
+Modify the InferenceModel to configure a canary rollout with traffic splitting. In this example, 10% of traffic for the tweet-summary model will be sent to the new ***tweet-summary-2*** adapter.
+
+
+```bash
+   kubectl edit inferencemodel tweet-summary
+```
+
+Change the targetModels list in InferenceModel to match the following:
+
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-1
+    weight: 90
+  - name: tweet-summary-2
+    weight: 10
+```
+
+The above configuration means one in every ten requests should be sent to the new version. Try it out:
+
+1. Get the gateway IP:
+```bash
+IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081
+```
+
+2. Send a few requests as follows:
+```bash
+curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+"model": "tweet-summary",
+"prompt": "Write as if you were a critic: San Francisco",
+"max_tokens": 100,
+"temperature": 0
+}'
+```
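+
+You can also get a rough sense of the 90/10 split from the responses themselves. The sketch below assumes the model server echoes the served backend model name in the `model` field of each completion response (standard OpenAI-style behavior, but worth verifying in your setup):
+
+```bash
+# Send 20 requests and tally which target model served each one;
+# roughly 2 of the 20 should report tweet-summary-2.
+for i in $(seq 1 20); do
+  curl -s ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+  "model": "tweet-summary",
+  "prompt": "Write as if you were a critic: San Francisco",
+  "max_tokens": 10,
+  "temperature": 0
+  }' | grep -o '"model":[^,]*'
+done | sort | uniq -c
+```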
+
+### Finish the rollout
+
+
+Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter.
+
+```yaml
+apiVersion: inference.networking.x-k8s.io/v1alpha1
+kind: InferenceModel
+metadata:
+  name: inferencemodel-sample
+spec:
+  modelName: tweet-summary
+  criticality: Critical
+  poolRef:
+    name: vllm-llama2-7b-pool
+  targetModels:
+  - name: tweet-summary-2
+    weight: 100
+```
+
+Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list:
+
+```yaml
+   apiVersion: v1
+   kind: ConfigMap
+   metadata:
+     name: vllm-llama2-7b-adapters
+   data:
+     configmap.yaml: |
+       vLLMLoRAConfig:
+         name: vllm-llama2-7b-adapters
+         port: 8000
+         ensureExist:
+           models:
+           - base-model: meta-llama/Llama-2-7b-hf
+             id: tweet-summary-2
+             source: mahimairaja/tweet-summarization-llama-2-finetuned
+         ensureNotExist:
+           models:
+           - base-model: meta-llama/Llama-2-7b-hf
+             id: tweet-summary-1
+             source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm
+```
+
+With this, all requests should be served by the new adapter version.
diff --git a/site-src/guides/index.md b/site-src/guides/index.md
index 92f6412a..d175a62d 100644
--- a/site-src/guides/index.md
+++ b/site-src/guides/index.md
@@ -1,3 +1,135 @@
 # Getting started with Gateway API Inference Extension
 
-TODO
\ No newline at end of file
+This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running!
+
+## **Prerequisites**
+  - Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher
+  - A cluster with:
+    - Support for services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running).
+      For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer).
+
+## **Steps**
+
+### Deploy Sample Model Server
+
+   This quickstart guide contains two options for setting up a model server:
+
+   1. GPU-based model server.
+      Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf).
+
+   1. CPU-based model server (not using GPUs).
+      Requirements: a Hugging Face access token that grants access to the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
+
+   Choose one of these options and follow the steps below. Please do not deploy both, as the deployments have the same name and will override each other.
+
+#### GPU-Based Model Server
+
+   For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed.
+   Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model.
+   Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
+   ```bash
+   kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/gpu-deployment.yaml
+   ```
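+
+   Before moving on, you can watch the model server pods become ready; the pod names come from whichever deployment manifest you applied, and the same check works for the CPU option below:
+
+   ```bash
+   # All vLLM replicas should eventually report Running and READY.
+   kubectl get pods -w
+   ```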
+
+#### CPU-Based Model Server
+
+   Create a Hugging Face secret to download the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). Ensure that the token grants access to this model.
+   Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway.
+   ```bash
+   kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Qwen
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/cpu-deployment.yaml
+   ```
+
+### Install the Inference Extension CRDs
+
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml
+   ```
+
+### Deploy InferenceModel
+
+   Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1`
+   [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server.
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/inferencemodel.yaml
+   ```
+
+### Update Envoy Gateway Config to enable Patch Policy
+
+   Our custom LLM Gateway ext-proc is patched into the existing Envoy Gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run:
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/enable_patch_policy.yaml
+   kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system
+   ```
+   Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again.
+
+### Deploy Gateway
+
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/gateway.yaml
+   ```
+   > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy is very useful.***
+
+   Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status:
+   ```bash
+   $ kubectl get gateway inference-gateway
+   NAME                CLASS               ADDRESS         PROGRAMMED   AGE
+   inference-gateway   inference-gateway                   True         22s
+   ```
+
+### Deploy the Inference Extension and InferencePool
+
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/ext_proc.yaml
+   ```
+
+### Deploy Envoy Gateway Custom Policies
+
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/extension_policy.yaml
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/patch_policy.yaml
+   ```
+   > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further.
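+
+   It can help to confirm that Envoy Gateway accepted the patch policy before sending traffic. A quick check, assuming the manifest keeps the `custom-response-patch-policy` name and that your Envoy Gateway version reports an `Accepted` condition in the policy status (both assumptions worth verifying):
+
+   ```bash
+   # The policy should eventually report an Accepted condition in its status.
+   kubectl get envoypatchpolicy custom-response-patch-policy -o yaml | grep -B1 -A3 'type: Accepted'
+   ```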
+
+### **OPTIONALLY**: Apply Traffic Policy
+
+   For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors.
+
+   ```bash
+   kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/traffic_policy.yaml
+   ```
+
+### Try it out
+
+   Wait until the gateway is ready.
+
+   ```bash
+   IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
+   PORT=8081
+
+   curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{
+   "model": "tweet-summary",
+   "prompt": "Write as if you were a critic: San Francisco",
+   "max_tokens": 100,
+   "temperature": 0
+   }'
+   ```
+
+### Cleanup
+
+   The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide.
+   Please be careful not to delete resources you'd like to keep.
+   ```bash
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/traffic_policy.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/extension_policy.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/patch_policy.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/ext_proc.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/gateway.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/inferencemodel.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found
+   kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found
+   kubectl delete secret hf-token --ignore-not-found
+   ```
\ No newline at end of file
diff --git a/pkg/ext-proc/metrics/README.md b/site-src/guides/metrics.md
similarity index 51%
rename from pkg/ext-proc/metrics/README.md
rename to site-src/guides/metrics.md
index 1094bc23..f793734d 100644
--- a/pkg/ext-proc/metrics/README.md
+++ b/site-src/guides/metrics.md
@@ -1,10 +1,6 @@
-# Documentation
+# Metrics
 
-This documentation is the current state of exposed metrics.
-
-## Table of Contents
-* [Exposed Metrics](#exposed-metrics)
-* [Scrape Metrics](#scrape-metrics)
+This guide describes the current state of exposed metrics and how to scrape them.
 
 ## Requirements
 
@@ -38,14 +34,17 @@ spec:
 ## Exposed metrics
 
-| Metric name | Metric Type | Description | Labels | Status |
-| ------------|--------------| ----------- | ------ | ------ |
-| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
-| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| **Metric name** | **Metric Type** | **Description** | **Labels** | **Status** |
+|:---------------------------------------------|:-----------------|:------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:------------|
+| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_error_total | Counter | The counter of request errors broken out for each model. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name> <br> `target_model_name`=<target-model-name> | ALPHA |
+| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
+| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA |
 
 ## Scrape Metrics
diff --git a/site-src/implementations.md b/site-src/implementations.md
index e2238827..89acb436 100644
--- a/site-src/implementations.md
+++ b/site-src/implementations.md
@@ -3,14 +3,15 @@
 This project has several implementations that are planned or in progress:
 
 * [Envoy Gateway][1]
-* [Gloo k8sgateway][2]
+* [Kgateway][2]
 * [Google Kubernetes Engine][3]
 
 [1]:#envoy-gateway
-[2]:#gloo-k8sgateway
+[2]:#kgateway
 [3]:#google-kubernetes-engine
 
 ## Envoy Gateway
+
 [Envoy Gateway][eg-home] is an [Envoy][envoy-org] subproject for managing
 Envoy-based application gateways. The supported APIs and fields of the Gateway
 API are outlined [here][eg-supported]. Use the [quickstart][eg-quickstart] to
@@ -24,15 +25,15 @@ Issue](https://github.com/envoyproxy/gateway/issues/4423).
 [eg-supported]:https://gateway.envoyproxy.io/docs/tasks/quickstart/
 [eg-quickstart]:https://gateway.envoyproxy.io/docs/tasks/quickstart
 
-## Gloo k8sgateway
+## Kgateway
 
-[Gloo k8sgateway](https://k8sgateway.io/) is a feature-rich, Kubernetes-native
-ingress controller and next-generation API gateway. Gloo k8sgateway brings the
+[Kgateway](https://kgateway.dev/) is a feature-rich, Kubernetes-native
+ingress controller and next-generation API gateway. Kgateway brings the
 full power and community support of Gateway API to its existing control-plane
 implementation.
 
 Progress towards supporting this project is tracked with a [GitHub
-Issue](https://github.com/k8sgateway/k8sgateway/issues/10411).
+Issue](https://github.com/kgateway-dev/kgateway/issues/10411).
 
 ## Google Kubernetes Engine
 
@@ -53,4 +54,3 @@ Issue](https://github.com/GoogleCloudPlatform/gke-gateway-api/issues/20).
 [gke-gateway]:https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api
 [gke-gateway-deploy]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways
 [gke-multi-cluster-gateway]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-multi-cluster-gateways
-
diff --git a/test/e2e/README.md b/test/e2e/epp/README.md
similarity index 100%
rename from test/e2e/README.md
rename to test/e2e/epp/README.md
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go
similarity index 94%
rename from test/e2e/e2e_suite_test.go
rename to test/e2e/epp/e2e_suite_test.go
index 019e858a..e7685c48 100644
--- a/test/e2e/e2e_suite_test.go
+++ b/test/e2e/epp/e2e_suite_test.go
@@ -14,7 +14,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -package e2e +package epp import ( "context" @@ -26,8 +26,6 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -40,6 +38,8 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + infextv1a2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) const ( @@ -67,19 +67,19 @@ const ( // inferExtName is the name of the inference extension test resources. inferExtName = "inference-gateway-ext-proc" // clientManifest is the manifest for the client test resources. - clientManifest = "../testdata/client.yaml" + clientManifest = "../../testdata/client.yaml" // modelServerManifest is the manifest for the model server test resources. - modelServerManifest = "../../pkg/manifests/vllm/deployment.yaml" + modelServerManifest = "../../../config/manifests/vllm/gpu-deployment.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. - modelServerSecretManifest = "../testdata/model-secret.yaml" + modelServerSecretManifest = "../../testdata/model-secret.yaml" // inferPoolManifest is the manifest for the inference pool CRD. - inferPoolManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml" + inferPoolManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml" // inferModelManifest is the manifest for the inference model CRD. - inferModelManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" + inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../pkg/manifests/ext_proc.yaml" + inferExtManifest = "../../../config/manifests/ext_proc.yaml" // envoyManifest is the manifest for the envoy proxy test resources. - envoyManifest = "../testdata/envoy.yaml" + envoyManifest = "../../testdata/envoy.yaml" ) var ( @@ -136,7 +136,7 @@ func setupSuite() { err = apiextv1.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1a1.AddToScheme(scheme) + err = infextv1a2.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) cli, err = client.New(cfg, client.Options{Scheme: scheme}) diff --git a/test/e2e/e2e_test.go b/test/e2e/epp/e2e_test.go similarity index 91% rename from test/e2e/e2e_test.go rename to test/e2e/epp/e2e_test.go index 8e5968fc..f5cfaf24 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package e2e +package epp import ( "fmt" @@ -24,10 +24,10 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) var _ = ginkgo.Describe("InferencePool", func() { @@ -95,8 +95,8 @@ var _ = ginkgo.Describe("InferencePool", func() { }) // newInferenceModel creates an InferenceModel in the given namespace for testutils. -func newInferenceModel(ns string) *infextv1a1.InferenceModel { - targets := []infextv1a1.TargetModel{ +func newInferenceModel(ns string) *v1alpha2.InferenceModel { + targets := []v1alpha2.TargetModel{ { Name: modelName + "-0", Weight: ptr.To(int32(50)), @@ -107,7 +107,7 @@ func newInferenceModel(ns string) *infextv1a1.InferenceModel { }, } return testutils.MakeModelWrapper("inferencemodel-sample", ns). - SetCriticality(infextv1a1.Critical). + SetCriticality(v1alpha2.Critical). SetModelName(modelName). SetPoolRef(modelServerName). SetTargetModels(targets). diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go new file mode 100644 index 00000000..be8b2721 --- /dev/null +++ b/test/integration/bbr/hermetic_test.go @@ -0,0 +1,173 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package bbr contains integration tests for the body-based routing extension. 
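+// The tests below start the body-based-routing ext_proc server in-process and
+// drive it over a plain gRPC client, so no Envoy instance or Kubernetes
+// cluster is required.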
+package bbr + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/protobuf/testing/protocmp" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const port = runserver.DefaultGrpcPort + +var logger = logutil.NewTestLogger().V(logutil.VERBOSE) + +func TestBodyBasedRouting(t *testing.T) { + tests := []struct { + name string + req *extProcPb.ProcessingRequest + wantHeaders []*configPb.HeaderValueOption + wantErr bool + }{ + { + name: "success adding model parameter to header", + req: generateRequest(logger, "llama"), + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("llama"), + }, + }, + }, + wantErr: false, + }, + { + name: "no model parameter", + req: generateRequest(logger, ""), + wantHeaders: []*configPb.HeaderValueOption{}, + wantErr: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer() + t.Cleanup(cleanup) + + want := &extProcPb.ProcessingResponse{} + if len(test.wantHeaders) > 0 { + want.Response = &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: test.wantHeaders, + }, + ClearRouteCache: true, + }, + }, + } + } else { + want.Response = &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{}, + } + } + + res, err := sendRequest(t, client, test.req) + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + }) + } +} + +func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + serverCtx, stopServer := context.WithCancel(context.Background()) + serverRunner := runserver.NewDefaultExtProcServerRunner() + serverRunner.SecureServing = false + + go func() { + if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { + logutil.Fatal(logger, err, "Failed to start ext-proc server") + } + }() + + address := fmt.Sprintf("localhost:%v", port) + // Create a grpc connection + conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + logutil.Fatal(logger, err, "Failed to connect", "address", address) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) + if err != nil { + logutil.Fatal(logger, err, "Failed to create client") + } + return client, func() { + cancel() + conn.Close() + stopServer() + + // wait a little until the goroutines actually exit + time.Sleep(5 * time.Second) + } +} + +func generateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequest { + j := map[string]interface{}{ + "prompt": "test1", + "max_tokens": 100, + "temperature": 0, + } + if model != "" { + j["model"] = model + } + + llmReq, err := 
json.Marshal(j)
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to marshal LLM request")
+	}
+	req := &extProcPb.ProcessingRequest{
+		Request: &extProcPb.ProcessingRequest_RequestBody{
+			RequestBody: &extProcPb.HttpBody{Body: llmReq},
+		},
+	}
+	return req
+}
+
+func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
+	t.Logf("Sending request: %v", req)
+	if err := client.Send(req); err != nil {
+		t.Logf("Failed to send request %+v: %v", req, err)
+		return nil, err
+	}
+
+	res, err := client.Recv()
+	if err != nil {
+		t.Logf("Failed to receive: %v", err)
+		return nil, err
+	}
+	t.Logf("Received response %+v", res)
+	return res, err
+}
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
new file mode 100644
index 00000000..7dc9bdb8
--- /dev/null
+++ b/test/integration/epp/hermetic_test.go
@@ -0,0 +1,1754 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package epp contains integration tests for the ext proc while faking the backend pods.
+package epp
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
+	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
+	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
+	"github.com/google/go-cmp/cmp"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+	"github.com/stretchr/testify/assert"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/protobuf/testing/protocmp"
+	"google.golang.org/protobuf/types/known/structpb"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	k8syaml "k8s.io/apimachinery/pkg/util/yaml"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	"k8s.io/component-base/metrics/legacyregistry"
+	metricsutils "k8s.io/component-base/metrics/testutil"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/cache"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/config"
+	"sigs.k8s.io/controller-runtime/pkg/envtest"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
+	runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server"
+	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
+ utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + "sigs.k8s.io/yaml" +) + +const ( + port = runserver.DefaultGrpcPort + metricsPort = 8888 +) + +var ( + serverRunner *runserver.ExtProcServerRunner + k8sClient k8sclient.Client + testEnv *envtest.Environment + scheme = runtime.NewScheme() + logger = logutil.NewTestLogger().V(logutil.VERBOSE) +) + +func TestMain(m *testing.M) { + cleanup := BeforeSuite() + code := m.Run() + cleanup() + os.Exit(code) +} + +func TestKubeInferenceModelRequest(t *testing.T) { + tests := []struct { + name string + req *extProcPb.ProcessingRequest + pods map[backendmetrics.Pod]*backendmetrics.Metrics + wantHeaders []*configPb.HeaderValueOption + wantMetadata *structpb.Struct + wantBody []byte + wantMetrics string + wantErr bool + immediateResponse *extProcPb.ImmediateResponse + }{ + { + name: "select lower queue and kv cache, no active lora", + req: utiltesting.GenerateRequest(logger, "test1", "my-model"), + // pod-1 will be picked because it has relatively low queue size and low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.2, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.2:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 + `, + wantErr: false, + }, + { + name: "select active lora, low queue", + req: utiltesting.GenerateRequest(logger, "test2", "sql-lora"), + // pod-1 will be picked because it has relatively low queue size, with the requested + // model being active, and has low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.2:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + }, + { + name: "select no lora despite active model, avoid excessive queue size", + req: utiltesting.GenerateRequest(logger, "test3", "sql-lora"), + // pod-2 will be picked despite it NOT having the requested model being active + // as it's above the affinity for queue size. Also is critical, so we should + // still honor request despite all queues > 5 + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 200, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.3:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.3:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + }, + { + name: "noncritical and all models past threshold, shed request", + req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), + // no pods will be picked as all models are either above kv threshold, + // queue threshold, or both. 
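+			// (Capacity thresholds are inferred from these fixtures rather than
+			// asserted by the test: a waiting queue above ~5 or KV cache
+			// utilization above ~0.8 appears to mark a pod as saturated, so every
+			// pod here fails at least one check.)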
+ pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{}, + wantMetadata: &structpb.Struct{}, + wantBody: []byte(""), + wantErr: false, + immediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, + }, + }, + wantMetrics: "", + }, + { + name: "noncritical, but one server has capacity, do not shed", + req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.1:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(t, test.pods, false) + t.Cleanup(cleanup) + want := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: test.wantHeaders, + }, + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_Body{ + Body: test.wantBody, + }, + }, + }, + }, + }, + DynamicMetadata: test.wantMetadata, + } + res, err := sendRequest(t, client, test.req) + + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if test.immediateResponse != nil { + want = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: test.immediateResponse, + }, + } + } + if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + + if test.wantMetrics != "" { + if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { + t.Error(err) + } + } + + legacyregistry.Reset() + }) + } +} + +func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { + tests := []struct { + name string + requests []*extProcPb.ProcessingRequest + pods map[backendmetrics.Pod]*backendmetrics.Metrics + wantResponses []*extProcPb.ProcessingResponse + wantMetrics string + wantErr bool + immediateResponse *extProcPb.ImmediateResponse + }{ + // Request flow tests + { + name: "select lower queue and kv cache, no active lora", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test1", "my-model"), + // pod-1 will be picked because it has relatively low queue size and low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.2, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "select active lora, low queue", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test2", "sql-lora"), + // pod-1 will be picked because it has relatively low queue size, with the requested + // model being active, and has low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "select no lora despite active model, avoid excessive queue size", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test3", "sql-lora"), + // pod-2 will be picked despite it NOT having the requested model being active + // as it's above the affinity for queue size. Also is critical, so we should + // still honor request despite all queues > 5 + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 200, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.3:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.3:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "noncritical and all models past threshold, shed request", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), + // no pods will be picked as all models are either above kv threshold, + // queue threshold, or both. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantErr: false, + wantMetrics: "", + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, + }, + }, + }, + }, + }, + }, + { + name: "noncritical, but one server has capacity, do not shed", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"), + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.1:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "body sent over multiple requests, noncritical, but one server has capacity, do not shed", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.1:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "inferencemodel's modelName is not translated, passthrough", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"direct-"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("model\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(74)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"direct-model\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + // Response flow tests + { + name: "responsebody sent over multiple requests, content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + Value: "application/json", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: 
"responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + Value: "application/json", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte(""), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + RawValue: []byte("text/event-stream"), + }, + { + Key: "status", + RawValue: []byte("200"), + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: 
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte("data: [DONE]"), + EndOfStream: true}, + }, + }, + }, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + 
EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("data: [DONE]"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(t, test.pods, true) + t.Cleanup(cleanup) + responses, err := streamedRequest(t, client, test.requests, len(test.wantResponses)) + + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if diff := cmp.Diff(test.wantResponses, responses, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + + if test.wantMetrics != "" { + if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { + t.Error(err) + } + } + + 
legacyregistry.Reset() + }) + } +} + +func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + // Reconfigure the TestPodMetricsClient. + res := map[types.NamespacedName]*backendmetrics.Metrics{} + for pod, metrics := range podAndMetrics { + res[pod.NamespacedName] = metrics + } + serverRunner.TestPodMetricsClient.SetRes(res) + serverRunner.UseStreaming = streamed + + serverCtx, stopServer := context.WithCancel(context.Background()) + + // TODO: this should be consistent with the inference pool + podLabels := map[string]string{ + "app": "vllm-llama2-7b-pool", + } + + for pod := range podAndMetrics { + pod := utiltesting.MakePod(pod.NamespacedName.Name). + Namespace(pod.NamespacedName.Namespace). + ReadyCondition(). + Labels(podLabels). + IP(pod.Address). + Complete(). + ObjRef() + + copy := pod.DeepCopy() + if err := k8sClient.Create(context.Background(), copy); err != nil { + logutil.Fatal(logger, err, "Failed to create pod", "pod", pod) + } + + // since no pod controllers deployed in fake environment, we manually update pod status + copy.Status = pod.Status + if err := k8sClient.Status().Update(context.Background(), copy); err != nil { + logutil.Fatal(logger, err, "Failed to update pod status", "pod", pod) + } + } + go func() { + if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { + logutil.Fatal(logger, err, "Failed to start ext-proc server") + } + }() + + // check if all pods are synced to datastore + assert.EventuallyWithT(t, func(t *assert.CollectT) { + assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podAndMetrics), "Datastore not synced") + }, 10*time.Second, time.Second) + + address := fmt.Sprintf("localhost:%v", port) + // Create a grpc connection + conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + logutil.Fatal(logger, err, "Failed to connect", "address", address) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) + if err != nil { + logutil.Fatal(logger, err, "Failed to create client") + } + return client, func() { + cancel() + conn.Close() + stopServer() + + // clear created pods + for pod := range podAndMetrics { + pod := utiltesting.MakePod(pod.NamespacedName.Name). 
+			Namespace(pod.NamespacedName.Namespace).Complete().ObjRef()
+
+			if err := k8sClient.Delete(context.Background(), pod); err != nil {
+				logutil.Fatal(logger, err, "Failed to delete pod", "pod", pod)
+			}
+		}
+		// wait a little until the goroutines actually exit
+		time.Sleep(5 * time.Second)
+	}
+}
+
+func fakePod(index int) backendmetrics.Pod {
+	return backendmetrics.Pod{
+		NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"},
+		Address:        fmt.Sprintf("192.168.1.%d", index+1),
+	}
+}
+
+// BeforeSuite sets up the test environment and returns a cleanup function.
+func BeforeSuite() func() {
+	// Set up the envtest mock k8s API server.
+	testEnv = &envtest.Environment{
+		CRDDirectoryPaths:     []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
+		ErrorIfCRDPathMissing: true,
+	}
+	cfg, err := testEnv.Start()
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to start test environment", "config", cfg)
+	}
+
+	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+	utilruntime.Must(v1alpha2.AddToScheme(scheme))
+
+	k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme})
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to start k8s Client")
+	} else if k8sClient == nil {
+		logutil.Fatal(logger, nil, "No error, but returned kubernetes client is nil", "config", cfg)
+	}
+
+	// Init runtime.
+	ctrl.SetLogger(logger)
+
+	mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama2-7b-pool"))
+	if err != nil {
+		logutil.Fatal(logger, err, "Failed to create controller manager")
+	}
+
+	if err := registerMetricsHandler(mgr, metricsPort); err != nil {
+		logutil.Fatal(logger, err, "Failed to register metrics handler")
+	}
+
+	serverRunner = runserver.NewDefaultExtProcServerRunner()
+	serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{}
+	pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond)
+	// Adjust from defaults
+	serverRunner.PoolName = "vllm-llama2-7b-pool"
+	serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf)
+	serverRunner.SecureServing = false
+
+	if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil {
+		logutil.Fatal(logger, err, "Failed to setup server runner")
+	}
+
+	// Start the controller manager in a goroutine so it does not block.
+	go func() {
+		if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+			logutil.Fatal(logger, err, "Failed to start manager")
+		}
+	}()
+
+	logger.Info("Setting up hermetic ExtProc server")
+
+	// Unmarshal the test InferenceModel and InferencePool objects from the manifest file.
+	manifestsPath := filepath.Join("..", "..", "testdata", "inferencepool-with-model-hermetic.yaml")
+	docs, err := readDocuments(manifestsPath)
+	if err != nil {
+		logutil.Fatal(logger, err, "Can't read object manifests", "path", manifestsPath)
+	}
+
+	for _, doc := range docs {
+		inferenceModel := &v1alpha2.InferenceModel{}
+		if err = yaml.Unmarshal(doc, inferenceModel); err != nil {
+			logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc)
+		}
+		if inferenceModel.Kind == "InferenceModel" {
+			logger.Info("Creating inference model", "model", inferenceModel)
+			if err := k8sClient.Create(context.Background(), inferenceModel); err != nil {
+				logutil.Fatal(logger, err, "Unable to create inferenceModel", "modelName", inferenceModel.Name)
+			}
+		}
+	}
+	for _, doc := range docs {
+		inferencePool := &v1alpha2.InferencePool{}
+		if err = yaml.Unmarshal(doc, inferencePool); err != nil {
+			logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc)
+		}
+		if inferencePool.Kind == "InferencePool" {
+			logger.Info("Creating inference pool", "pool", inferencePool)
+			if err := k8sClient.Create(context.Background(), inferencePool); err != nil {
+				logutil.Fatal(logger, err, "Unable to create inferencePool", "poolName", inferencePool.Name)
+			}
+		}
+	}
+
+	// Note: there is no *testing.T in BeforeSuite, so nil is passed to
+	// assert.Eventually; if the datastore never syncs this panics rather than
+	// failing gracefully.
+	assert.Eventually(nil, func() bool {
+		modelExist := serverRunner.Datastore.ModelGet("my-model")
+		synced := serverRunner.Datastore.PoolHasSynced() && modelExist != nil
+		return synced
+	}, 10*time.Second, 10*time.Millisecond)
+
+	return func() {
+		_ = testEnv.Stop()
+		_ = k8sClient.DeleteAllOf(context.Background(), &v1alpha2.InferencePool{})
+		_ = k8sClient.DeleteAllOf(context.Background(), &v1alpha2.InferenceModel{})
+	}
+}
+
+func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
+	t.Logf("Sending request: %v", req)
+	if err := client.Send(req); err != nil {
+		t.Logf("Failed to send request %+v: %v", req, err)
+		return nil, err
+	}
+
+	res, err := client.Recv()
+	if err != nil {
+		t.Logf("Failed to receive: %v", err)
+		return nil, err
+	}
+	t.Logf("Received response %+v", res)
+	return res, err
+}
+
+func streamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) {
+	for _, req := range requests {
+		t.Logf("Sending request: %v", req)
+		if err := client.Send(req); err != nil {
+			t.Logf("Failed to send request %+v: %v", req, err)
+			return nil, err
+		}
+		// Brief pause so the server goroutines populate the internal pipe channels
+		// in order. Without it, a goroutine from a subsequent request can write to
+		// the pipe before a previous chunk. This only happens because everything
+		// runs in memory; it would not occur in a real environment with non-zero latency.
+		time.Sleep(1 * time.Millisecond)
+	}
+	responses := []*extProcPb.ProcessingResponse{}
+
+	// Crude timeout: if fewer than the expected number of responses arrive
+	// within ten seconds, bail out and fail.
+	var simpleTimeout bool
+	go func() {
+		time.Sleep(10 * time.Second)
+		simpleTimeout = true
+	}()
+
+	for range expectedResponses {
+		if simpleTimeout {
+			break
+		}
+		res, err := client.Recv()
+		if err != nil && err != io.EOF {
+			t.Logf("Failed to receive: %v", err)
+			return nil, err
+		}
+		t.Logf("Received response %+v", res)
+		responses = append(responses, res)
+	}
+	return responses, nil
+}
+
+// readDocuments reads documents from file.
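+// Each YAML document is returned as a raw byte slice; documents are split on
+// the standard "---" separator by the Kubernetes YAML reader.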
+func readDocuments(fp string) ([][]byte, error) { + b, err := os.ReadFile(fp) + if err != nil { + return nil, err + } + + docs := [][]byte{} + reader := k8syaml.NewYAMLReader(bufio.NewReader(bytes.NewReader(b))) + for { + // Read document + doc, err := reader.Read() + if err != nil { + if errors.Is(err, io.EOF) { + break + } + return nil, err + } + docs = append(docs, doc) + } + return docs, nil +} + +func makeMetadata(endpoint string) *structpb.Struct { + return &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultDestinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultDestinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + }, + }, + }, + }, + } +} + +// registerMetricsHandler is a simplified version of metrics endpoint handler +// without Authentication for integration tests. +func registerMetricsHandler(mgr manager.Manager, port int) error { + metrics.Register() + + // Init HTTP server. + h := promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + promhttp.HandlerOpts{}, + ) + + mux := http.NewServeMux() + mux.Handle("/metrics", h) + + srv := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + + if err := mgr.Add(&manager.Server{ + Name: "metrics", + Server: srv, + }); err != nil { + return err + } + return nil +} + +// inject options that allow multiple test runs to run +// https://github.com/kubernetes-sigs/controller-runtime/issues/2937 +func managerTestOptions(namespace, name string) ctrl.Options { + return ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + ByObject: map[client.Object]cache.ByObject{ + &corev1.Pod{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + &v1alpha2.InferencePool{}: { + Namespaces: map[string]cache.Config{ + namespace: { + FieldSelector: fields.SelectorFromSet(fields.Set{ + "metadata.name": name, + }), + }, + }, + }, + &v1alpha2.InferenceModel{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + }, + }, + Controller: config.Controller{ + SkipNameValidation: boolPointer(true), + }, + } +} + +func boolPointer(b bool) *bool { + return &b +} diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go deleted file mode 100644 index 3dfe28f7..00000000 --- a/test/integration/hermetic_test.go +++ /dev/null @@ -1,475 +0,0 @@ -// Package test contains e2e tests for the ext proc while faking the backend pods. 
-package integration - -import ( - "bufio" - "bytes" - "context" - "errors" - "flag" - "fmt" - "io" - "log" - "os" - "path/filepath" - "strconv" - "testing" - "time" - - configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/google/go-cmp/cmp" - "google.golang.org/grpc" - "google.golang.org/grpc/credentials/insecure" - "google.golang.org/protobuf/testing/protocmp" - "google.golang.org/protobuf/types/known/structpb" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - extprocutils "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" - testingutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" - klog "k8s.io/klog/v2" - k8sclient "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/envtest" - "sigs.k8s.io/yaml" -) - -const ( - port = runserver.DefaultGrpcPort -) - -var ( - serverRunner *runserver.ExtProcServerRunner - k8sClient k8sclient.Client - testEnv *envtest.Environment - scheme = runtime.NewScheme() -) - -func SKIPTestHandleRequestBody(t *testing.T) { - tests := []struct { - name string - req *extProcPb.ProcessingRequest - pods []*backend.PodMetrics - models map[string]*v1alpha1.InferenceModel - wantHeaders []*configPb.HeaderValueOption - wantBody []byte - wantErr bool - }{ - { - name: "success", - req: extprocutils.GenerateRequest("my-model"), - models: map[string]*v1alpha1.InferenceModel{ - "my-model": { - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "my-model", - TargetModels: []v1alpha1.TargetModel{ - { - Name: "my-model-v1", - Weight: pointer(100), - }, - }, - }, - }, - }, - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. 
- pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "my-model-v1": 1, - }, - }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("pod-1:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("73"), - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-v1\",\"prompt\":\"hello\",\"temperature\":0}"), - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpServer(t, test.pods, test.models) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, - }, - }, - }, - }, - }, - } - res, err := sendRequest(t, client, test.req) - - if (err != nil) != test.wantErr { - t.Fatalf("Unexpected error, got %v, want %v", err, test.wantErr) - } - - if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) - } - }) - } - -} - -func TestKubeInferenceModelRequest(t *testing.T) { - tests := []struct { - name string - req *extProcPb.ProcessingRequest - wantHeaders []*configPb.HeaderValueOption - wantMetadata *structpb.Struct - wantBody []byte - wantErr bool - }{ - { - name: "success", - req: extprocutils.GenerateRequest("sql-lora"), - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. 
- wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("pod-1:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultTargetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: "pod-1:8000", - }, - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"), - wantErr: false, - }, - } - - metrics := []*backend.Metrics{ - { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - }, - { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - } - - // Set up global k8sclient and extproc server runner with test environment config - podMetrics := BeforeSuit(metrics) - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, podMetrics) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, - }, - }, - }, - }, - }, - DynamicMetadata: test.wantMetadata, - } - res, err := sendRequest(t, client, test.req) - - if err != nil { - if !test.wantErr { - t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) - } - } else if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) - } - }) - } -} - -func setUpServer(t *testing.T, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - t.Logf("Setting up ExtProc server") - server := extprocutils.StartExtProc(port, time.Second, time.Second, pods, models) - - address := fmt.Sprintf("localhost:%v", port) - // Create a grpc connection - conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - log.Fatalf("Failed to connect to %v: %v", address, err) - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) - if err != nil { - log.Fatalf("Failed to create client: %v", err) - } - return client, func() { - cancel() - conn.Close() - server.GracefulStop() - } -} - -func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - t.Logf("Setting up hermetic ExtProc server") - klog.InitFlags(nil) - flag.Parse() - // Configure klog verbosity levels to print ext proc logs. 
- _ = flag.Lookup("v").Value.Set("3") - - // Unmarshal CRDs from file into structs - manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") - docs, err := readDocuments(manifestsPath) - if err != nil { - log.Fatalf("Can't read object manifests at path %v, %v", manifestsPath, err) - } - - for _, doc := range docs { - inferenceModel := &v1alpha1.InferenceModel{} - if err = yaml.Unmarshal(doc, inferenceModel); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) - } - if inferenceModel.Kind == "InferenceModel" { - t.Logf("Creating inference model: %+v", inferenceModel) - if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - log.Fatalf("unable to create inferenceModel %v: %v", inferenceModel.Name, err) - } - } - } - inferencePool := &v1alpha1.InferencePool{} - for _, doc := range docs { - if err = yaml.Unmarshal(doc, inferencePool); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) - } - if inferencePool.Kind == "InferencePool" { - t.Logf("Creating inference pool: %+v", inferencePool) - if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - log.Fatalf("unable to create inferencePool %v: %v", inferencePool.Name, err) - } - // expecting a single inferencepool - break - } - } - - ps := make(backend.PodSet) - pms := make(map[string]*backend.PodMetrics) - for _, pod := range pods { - ps[pod.Pod] = true - pms[pod.Pod.Name] = pod - } - pmc := &backend.FakePodMetricsClient{Res: pms} - server := serverRunner.Start(pmc) - if err != nil { - log.Fatalf("Ext-proc failed with the err: %v", err) - } - - // Wait the reconciler to populate the datastore. - time.Sleep(10 * time.Second) - - address := fmt.Sprintf("localhost:%v", port) - // Create a grpc connection - conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - log.Fatalf("Failed to connect to %v: %v", address, err) - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) - if err != nil { - log.Fatalf("Failed to create client: %v", err) - } - return client, func() { - cancel() - conn.Close() - server.GracefulStop() - } -} - -// Sets up a test environment and returns the runner struct -func BeforeSuit(metrics []*backend.Metrics) []*backend.PodMetrics { - // Set up mock k8s API Client - testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, - ErrorIfCRDPathMissing: true, - } - cfg, err := testEnv.Start() - - if err != nil { - log.Fatalf("Failed to start test environment, cfg: %v error: %v", cfg, err) - } - - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha1.AddToScheme(scheme)) - - k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme}) - if err != nil { - log.Fatalf("Failed to start k8s Client: %v", err) - } else if k8sClient == nil { - log.Fatalf("No error, but returned kubernetes client is nil, cfg: %v", cfg) - } - - podMetrics := []*backend.PodMetrics{} - fakeLister := &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{}, - } - for i, m := range metrics { - podName := "pod-" + strconv.Itoa(i) - pod := testingutil.MakePod(podName).SetReady().SetPodIP(podName).Obj() - fakeLister.PodsList = append(fakeLister.PodsList, pod) - podMetrics = append(podMetrics, &backend.PodMetrics{ - Pod: backend.Pod{ - Name: pod.Name, - Address: pod.Status.PodIP + ":8000", - }, - Metrics: *m, - }) - } - - 
serverRunner = runserver.NewDefaultExtProcServerRunner()
-	// Adjust from defaults
-	serverRunner.PoolName = "vllm-llama2-7b-pool"
-	serverRunner.Scheme = scheme
-	serverRunner.Config = cfg
-	serverRunner.Datastore = backend.NewK8sDataStore(backend.WithPodListerFactory(
-		func(pool *v1alpha1.InferencePool) *backend.PodLister {
-			klog.V(1).Infof("Setting the fake lister %v", len(fakeLister.PodsList))
-			return &backend.PodLister{
-				Lister: fakeLister,
-			}
-		}))
-
-	serverRunner.Setup()
-
-	// Start the controller manager in go routine, not blocking
-	go func() {
-		serverRunner.StartManager()
-	}()
-
-	// Wait the reconcilers to populate the datastore.
-	time.Sleep(5 * time.Second)
-	return podMetrics
-}
-
-func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) {
-	t.Logf("Sending request: %v", req)
-	if err := client.Send(req); err != nil {
-		t.Logf("Failed to send request %+v: %v", req, err)
-		return nil, err
-	}
-
-	res, err := client.Recv()
-	if err != nil {
-		t.Logf("Failed to receive: %v", err)
-		return nil, err
-	}
-	t.Logf("Received request %+v", res)
-	return res, err
-}
-
-// readDocuments reads documents from file.
-func readDocuments(fp string) ([][]byte, error) {
-	b, err := os.ReadFile(fp)
-	if err != nil {
-		return nil, err
-	}
-
-	docs := [][]byte{}
-	reader := k8syaml.NewYAMLReader(bufio.NewReader(bytes.NewReader(b)))
-	for {
-		// Read document
-		doc, err := reader.Read()
-		if err != nil {
-			if errors.Is(err, io.EOF) {
-				break
-			}
-			return nil, err
-		}
-		docs = append(docs, doc)
-	}
-	return docs, nil
-}
-func pointer(v int32) *int32 {
-	return &v
-}
diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml
index 700eb24c..ffb8add7 100644
--- a/test/testdata/envoy.yaml
+++ b/test/testdata/envoy.yaml
@@ -169,6 +169,15 @@ data:
           max_pending_requests: 40000
           max_requests: 40000
           max_retries: 1024
+          # This ensures that Envoy accepts untrusted certificates. We tried to explicitly
+          # set TrustChainVerification to ACCEPT_UNTRUSTED, but that didn't work;
+          # what worked was setting the common_tls_context to empty.
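+          # (An empty validation_context configures no trusted CA, so Envoy skips verifying the upstream certificate.)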
+ transport_socket: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + common_tls_context: + validation_context: typed_extension_protocol_options: envoy.extensions.upstreams.http.v3.HttpProtocolOptions: "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions @@ -219,7 +228,7 @@ spec: - "--service-node" - "$(ENVOY_POD_NAME)" - "--log-level" - - "debug" + - "trace" - "--cpuset-threads" - "--drain-strategy" - "immediate" diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml index a07e0f35..36b6e539 100644 --- a/test/testdata/inferencepool-with-model-hermetic.yaml +++ b/test/testdata/inferencepool-with-model-hermetic.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: name: vllm-llama2-7b-pool @@ -10,7 +10,7 @@ spec: extensionRef: name: epp --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: name: inferencemodel-sample @@ -23,3 +23,41 @@ spec: targetModels: - name: sql-lora-1fdg2 weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-sheddable + namespace: default +spec: + modelName: sql-lora-sheddable + poolRef: + name: vllm-llama2-7b-pool + targetModels: + - name: sql-lora-1fdg3 + weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-generic + namespace: default +spec: + modelName: my-model + criticality: Critical + poolRef: + name: vllm-llama2-7b-pool + targetModels: + - name: my-model-12345 + weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-direct-model-name + namespace: default +spec: + modelName: direct-model + criticality: Critical + poolRef: + name: vllm-llama2-7b-pool \ No newline at end of file diff --git a/test/utils/utils.go b/test/utils/utils.go index 337599c3..1ec0fbaa 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -24,7 +24,6 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -37,6 +36,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // DeleteClusterResources deletes all cluster-scoped objects the tests typically create. 
@@ -106,11 +106,11 @@ func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &infextv1a1.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = cli.DeleteAllOf(ctx, &v1alpha2.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = cli.DeleteAllOf(ctx, &v1alpha2.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -132,7 +132,7 @@ func DeleteInferenceModelResources(ctx context.Context, cli client.Client, ns st if ns == "" { return nil } - err := cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err := cli.DeleteAllOf(ctx, &v1alpha2.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go index 12ff856a..867118c1 100644 --- a/test/utils/wrappers.go +++ b/test/utils/wrappers.go @@ -17,26 +17,26 @@ limitations under the License. package utils import ( - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferenceModelWrapper wraps an InferenceModel. type InferenceModelWrapper struct { - infextv1a1.InferenceModel + v1alpha2.InferenceModel } // MakeModelWrapper creates a wrapper for an MakeModelWrapper. func MakeModelWrapper(name, ns string) *InferenceModelWrapper { return &InferenceModelWrapper{ - infextv1a1.InferenceModel{ + v1alpha2.InferenceModel{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: ns, }, - Spec: infextv1a1.InferenceModelSpec{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "", - PoolRef: infextv1a1.PoolObjectReference{}, + PoolRef: v1alpha2.PoolObjectReference{}, }, }, } @@ -49,7 +49,7 @@ func (m *InferenceModelWrapper) SetModelName(name string) *InferenceModelWrapper } // SetCriticality sets the value of the inferenceModel.spec.criticality. -func (m *InferenceModelWrapper) SetCriticality(level infextv1a1.Criticality) *InferenceModelWrapper { +func (m *InferenceModelWrapper) SetCriticality(level v1alpha2.Criticality) *InferenceModelWrapper { m.Spec.Criticality = &level return m } @@ -57,22 +57,22 @@ func (m *InferenceModelWrapper) SetCriticality(level infextv1a1.Criticality) *In // SetPoolRef sets the value of the inferenceModel.spec.poolRef using defaults // for group/kind and name as the PoolObjectReference name. func (m *InferenceModelWrapper) SetPoolRef(name string) *InferenceModelWrapper { - ref := infextv1a1.PoolObjectReference{ - Group: infextv1a1.GroupVersion.Group, + ref := v1alpha2.PoolObjectReference{ + Group: v1alpha2.Group(v1alpha2.GroupVersion.Group), Kind: "inferencepools", - Name: name, + Name: v1alpha2.ObjectName(name), } m.Spec.PoolRef = ref return m } // SetTargetModels sets the value of the inferenceModel.spec.targetModels. 
-func (m *InferenceModelWrapper) SetTargetModels(models []infextv1a1.TargetModel) *InferenceModelWrapper {
+func (m *InferenceModelWrapper) SetTargetModels(models []v1alpha2.TargetModel) *InferenceModelWrapper {
 	m.Spec.TargetModels = models
 	return m
 }
 
 // Obj returns the inner InferenceModel.
-func (m *InferenceModelWrapper) Obj() *infextv1a1.InferenceModel {
+func (m *InferenceModelWrapper) Obj() *v1alpha2.InferenceModel {
 	return &m.InferenceModel
 }
diff --git a/tools/dashboards/README.md b/tools/dashboards/README.md
index c8258b63..7be2a5b8 100644
--- a/tools/dashboards/README.md
+++ b/tools/dashboards/README.md
@@ -4,7 +4,7 @@ This documentation provides instructions for setting up grafana dashboards to se
 
 ## Requirements
 
-Please follow [metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/ext-proc/metrics) page to configure the proxy to enable all metrics.
+Please follow the [metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics) page to configure the proxy to enable all metrics.
 
 ## Load Inference Extension dashboard into Grafana
 
@@ -21,6 +21,7 @@ If you run the inferece gateway with [Google Managed Prometheus](https://cloud.g
 Please configure the `scrape_interval` of your prometheus configuration to lower than `15s`, `rate` function returns empty string if data falls too apart. See https://www.robustperception.io/what-range-should-i-use-with-rate/ for more details.
 
 Example:
+
 ```
 global:
   scrape_interval: 5s
diff --git a/tools/dashboards/inference_gateway.json b/tools/dashboards/inference_gateway.json
index 3af66703..cf00420d 100644
--- a/tools/dashboards/inference_gateway.json
+++ b/tools/dashboards/inference_gateway.json
@@ -28,7 +28,7 @@
       },
       "gridPos": {
         "h": 3,
-        "w": 23,
+        "w": 20,
         "x": 0,
         "y": 0
       },
@@ -39,10 +39,10 @@
         "showLineNumbers": false,
         "showMiniMap": false
       },
-      "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/ext-proc/metrics for more details of underlying metrics used in the dashboard.",
+      "content": "# Inference Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of the underlying metrics used in the dashboard.",
       "mode": "markdown"
     },
-    "pluginVersion": "11.5.0",
+    "pluginVersion": "11.5.2",
     "title": "",
     "type": "text"
   },
@@ -54,15 +54,15 @@
       "x": 0,
       "y": 3
     },
-    "id": 3,
+    "id": 15,
     "panels": [],
-    "title": "Inference Model",
+    "title": "Inference Pool",
     "type": "row"
   },
   {
     "datasource": {
       "type": "prometheus",
-      "uid": "${DS_PROMETHEUS}"
+      "uid": "deap2an4eadc0d"
     },
     "fieldConfig": {
       "defaults": {
@@ -125,7 +125,7 @@
       "x": 0,
       "y": 4
     },
-    "id": 1,
+    "id": 16,
     "options": {
       "legend": {
         "calcs": [],
         "displayMode": "list",
         "placement": "bottom",
         "showLegend": true
       },
       "tooltip": {
         "hideZeros": false,
         "mode": "single",
         "sort": "none"
       }
     },
-    "pluginVersion": "11.5.0",
+    "pluginVersion": "11.5.2",
    "targets": [
       {
-        "datasource": {
-          "type": "prometheus",
-          "uid": "${DS_PROMETHEUS}"
-        },
         "disableTextWrap": false,
         "editorMode": "builder",
-        "exemplar": false,
-        "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))",
+        "expr": "sum by(name) (inference_pool_average_kv_cache_utilization)",
         "fullMetaSearch": false,
         "includeNullMetadata": true,
-        "interval": "",
         "legendFormat": "__auto",
         "range": true,
         "refId": "A",
         "useBackend": false
       }
     ],
-    "title": "Request / s",
+    "title": "Average KV Cache Utilization",
     "type": "timeseries"
   },
   {
     "datasource": {
       "type": "prometheus",
-      "uid": 
"${DS_PROMETHEUS}" + "uid": "deap2an4eadc0d" }, "fieldConfig": { "defaults": { @@ -228,7 +222,7 @@ "x": 10, "y": 4 }, - "id": 2, + "id": 17, "options": { "legend": { "calcs": [], @@ -242,55 +236,36 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", + "expr": "sum by(name) (inference_pool_average_queue_size)", "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", + "includeNullMetadata": true, + "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false } ], - "title": "E2E Request Latency", + "title": "Average Queue Size", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 3, + "panels": [], + "title": "Inference Model", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -353,11 +328,11 @@ }, "gridPos": { "h": 8, - "w": 10, + "w": 20, "x": 0, - "y": 12 + "y": 13 }, - "id": 6, + "id": 2, "options": { "legend": { "calcs": [], @@ -371,12 +346,12 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "95%", @@ -391,7 +366,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -407,7 +382,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -417,7 +392,7 @@ "useBackend": false } ], - "title": "Request Size", + "title": "E2E Request Latency", "type": "timeseries" }, { @@ -483,10 +458,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 10, - "y": 12 + "x": 0, + "y": 21 }, - "id": 7, + "id": 1, "options": { "legend": { 
"calcs": [], @@ -500,35 +475,8 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -536,17 +484,18 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "exemplar": false, + "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))", "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", "range": true, - "refId": "C", + "refId": "A", "useBackend": false } ], - "title": "Response Size", + "title": "Request / s", "type": "timeseries" }, { @@ -612,10 +561,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 0, - "y": 20 + "x": 10, + "y": 21 }, - "id": 8, + "id": 18, "options": { "legend": { "calcs": [], @@ -629,19 +578,8 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -649,33 +587,18 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", + "exemplar": false, + "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_model_request_error_total[$__rate_interval]))", "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", "range": true, - "refId": "C", + "refId": "A", "useBackend": false } ], - "title": "Input Token Count", + "title": "Request Error / s", "type": "timeseries" }, { @@ -741,10 +664,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 10, - "y": 20 + "x": 0, + "y": 29 }, - "id": 9, + "id": 6, "options": { "legend": { "calcs": [], @@ -758,12 +681,12 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, 
"editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "95%", @@ -778,7 +701,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -794,7 +717,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -804,22 +727,9 @@ "useBackend": false } ], - "title": "Output Token Count", + "title": "Request Size", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 10, - "panels": [], - "title": "vLLM", - "type": "row" - }, { "datasource": { "type": "prometheus", @@ -881,12 +791,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 0, + "x": 10, "y": 29 }, - "id": 14, + "id": 7, "options": { "legend": { "calcs": [], @@ -900,15 +810,15 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(model_name) (rate(vllm:prompt_tokens_total[$__rate_interval]))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "Prompt Tokens/Sec", + "includeNullMetadata": false, + "legendFormat": "95%", "range": true, "refId": "A", "useBackend": false @@ -920,17 +830,33 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(model_name) (rate(vllm:generation_tokens_total[$__rate_interval]))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, - "legendFormat": "Generation Tokens/Sec", + "includeNullMetadata": false, + "legendFormat": "90%", "range": true, "refId": "B", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false } ], - "title": "Token Throughput", + "title": "Response Size", "type": "timeseries" }, { @@ -994,12 +920,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 10, - "y": 29 + "x": 0, + "y": 37 }, - "id": 11, + "id": 8, "options": { "legend": { "calcs": [], @@ -1013,14 +939,14 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) 
(rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "95%", "range": true, "refId": "A", @@ -1033,10 +959,10 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "90%", "range": true, "refId": "B", @@ -1049,17 +975,17 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "50%", "range": true, "refId": "C", "useBackend": false } ], - "title": "E2E Request Latency", + "title": "Input Token Count", "type": "timeseries" }, { @@ -1123,12 +1049,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 0, - "y": 36 + "x": 10, + "y": 37 }, - "id": 13, + "id": 9, "options": { "legend": { "calcs": [], @@ -1142,14 +1068,14 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "95%", "range": true, "refId": "A", @@ -1162,10 +1088,10 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "90%", "range": true, "refId": "B", @@ -1178,147 +1104,532 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "50%", "range": true, "refId": "C", "useBackend": false } ], - "title": "Time Per Output Token Latency", + "title": "Output Token Count", "type": "timeseries" }, { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "id": 10, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - 
"axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 52 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(model_name) (rate(vllm:prompt_tokens_total[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Prompt Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 10, - "x": 10, - "y": 36 - }, - "id": 12, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(model_name) (rate(vllm:generation_tokens_total[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.0", - "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 52 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "E2E Request Latency", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, 
+ "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 59 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Time Per Output Token Latency", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 59 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" } ], - "title": "Time To First Token Latency", - "type": "timeseries" + "title": "vLLM", + "type": "row" } ], "preload": false, @@ -1350,6 +1661,6 @@ "timezone": "browser", "title": "Inference Gateway", "uid": "aeap3g4ujefb4b", - "version": 16, + "version": 20, "weekStart": "" }