From 89eacc719fe31ace669c421a828c4e37ff0b7581 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Thu, 6 Feb 2025 14:33:55 -0700 Subject: [PATCH 01/96] Revert "Replace EndpointSlice reconciler with pod list backed by informer (#271)" (#301) This reverts commit 9298849a6b39b2e636c0cfbbbfc00a762f6bfd81. --- .golangci.yml | 1 + pkg/ext-proc/backend/datastore.go | 132 ++---------- .../backend/endpointslice_reconciler.go | 109 ++++++++++ .../backend/endpointslice_reconcilier_test.go | 202 ++++++++++++++++++ pkg/ext-proc/backend/fake.go | 10 +- .../backend/inferencemodel_reconciler_test.go | 21 -- .../backend/inferencepool_reconciler.go | 1 + pkg/ext-proc/backend/provider.go | 85 ++------ pkg/ext-proc/backend/provider_test.go | 143 ++++--------- pkg/ext-proc/backend/types.go | 1 - pkg/ext-proc/health.go | 3 +- pkg/ext-proc/main.go | 26 ++- pkg/ext-proc/scheduling/filter_test.go | 5 +- pkg/ext-proc/server/runserver.go | 31 ++- pkg/ext-proc/test/utils.go | 6 +- pkg/ext-proc/util/testing/lister.go | 19 -- pkg/ext-proc/util/testing/wrappers.go | 38 ---- pkg/manifests/ext_proc.yaml | 2 + pkg/manifests/vllm/deployment.yaml | 13 ++ test/e2e/e2e_suite_test.go | 5 + test/integration/hermetic_test.go | 94 ++++---- 21 files changed, 499 insertions(+), 448 deletions(-) create mode 100644 pkg/ext-proc/backend/endpointslice_reconciler.go create mode 100644 pkg/ext-proc/backend/endpointslice_reconcilier_test.go delete mode 100644 pkg/ext-proc/util/testing/lister.go delete mode 100644 pkg/ext-proc/util/testing/wrappers.go diff --git a/.golangci.yml b/.golangci.yml index 2ad3b93d..1462bcc7 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -14,6 +14,7 @@ linters: - dupword - durationcheck - fatcontext + - gci - ginkgolinter - gocritic - govet diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index 627ddbe5..b466a2ed 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -1,26 +1,13 @@ 
package backend import ( - "context" "errors" "math/rand" "sync" - "time" - "github.com/google/go-cmp/cmp" "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/meta" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/client-go/informers" - informersv1 "k8s.io/client-go/informers/core/v1" - "k8s.io/client-go/kubernetes" - clientset "k8s.io/client-go/kubernetes" - listersv1 "k8s.io/client-go/listers/core/v1" - "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" ) @@ -28,9 +15,8 @@ func NewK8sDataStore(options ...K8sDatastoreOption) *K8sDatastore { store := &K8sDatastore{ poolMu: sync.RWMutex{}, InferenceModels: &sync.Map{}, + pods: &sync.Map{}, } - - store.podListerFactory = store.createPodLister for _, opt := range options { opt(store) } @@ -39,68 +25,29 @@ func NewK8sDataStore(options ...K8sDatastoreOption) *K8sDatastore { // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type K8sDatastore struct { - client kubernetes.Interface // poolMu is used to synchronize access to the inferencePool. - poolMu sync.RWMutex - inferencePool *v1alpha1.InferencePool - podListerFactory PodListerFactory - podLister *PodLister - InferenceModels *sync.Map + poolMu sync.RWMutex + inferencePool *v1alpha1.InferencePool + InferenceModels *sync.Map + pods *sync.Map } type K8sDatastoreOption func(*K8sDatastore) -type PodListerFactory func(*v1alpha1.InferencePool) *PodLister // WithPods can be used in tests to override the pods. 
-func WithPodListerFactory(factory PodListerFactory) K8sDatastoreOption { +func WithPods(pods []*PodMetrics) K8sDatastoreOption { return func(store *K8sDatastore) { - store.podListerFactory = factory + store.pods = &sync.Map{} + for _, pod := range pods { + store.pods.Store(pod.Pod, true) + } } } -type PodLister struct { - Lister listersv1.PodLister - sharedInformer informers.SharedInformerFactory -} - -func (l *PodLister) listEverything() ([]*corev1.Pod, error) { - return l.Lister.List(labels.Everything()) - -} - -func (ds *K8sDatastore) SetClient(client kubernetes.Interface) { - ds.client = client -} - func (ds *K8sDatastore) setInferencePool(pool *v1alpha1.InferencePool) { ds.poolMu.Lock() defer ds.poolMu.Unlock() - - if ds.inferencePool != nil && cmp.Equal(ds.inferencePool.Spec.Selector, pool.Spec.Selector) { - // Pool updated, but the selector stayed the same, so no need to change the informer. - ds.inferencePool = pool - return - } - - // New pool or selector updated. ds.inferencePool = pool - - if ds.podLister != nil && ds.podLister.sharedInformer != nil { - // Shutdown the old informer async since this takes a few seconds. - go func() { - ds.podLister.sharedInformer.Shutdown() - }() - } - - if ds.podListerFactory != nil { - // Create a new informer with the new selector. 
- ds.podLister = ds.podListerFactory(ds.inferencePool) - if ds.podLister != nil && ds.podLister.sharedInformer != nil { - ctx := context.Background() - ds.podLister.sharedInformer.Start(ctx.Done()) - ds.podLister.sharedInformer.WaitForCacheSync(ctx.Done()) - } - } } func (ds *K8sDatastore) getInferencePool() (*v1alpha1.InferencePool, error) { @@ -112,58 +59,13 @@ func (ds *K8sDatastore) getInferencePool() (*v1alpha1.InferencePool, error) { return ds.inferencePool, nil } -func (ds *K8sDatastore) createPodLister(pool *v1alpha1.InferencePool) *PodLister { - if ds.client == nil { - return nil - } - klog.V(logutil.DEFAULT).Infof("Creating informer for pool %v", pool.Name) - selectorSet := make(map[string]string) - for k, v := range pool.Spec.Selector { - selectorSet[string(k)] = string(v) - } - - newPodInformer := func(cs clientset.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - informer := informersv1.NewFilteredPodInformer(cs, pool.Namespace, resyncPeriod, cache.Indexers{}, func(options *metav1.ListOptions) { - options.LabelSelector = labels.SelectorFromSet(selectorSet).String() - }) - err := informer.SetTransform(func(obj interface{}) (interface{}, error) { - // Remove unnecessary fields to improve memory footprint. - if accessor, err := meta.Accessor(obj); err == nil { - if accessor.GetManagedFields() != nil { - accessor.SetManagedFields(nil) - } - } - return obj, nil - }) - if err != nil { - klog.Errorf("Failed to set pod transformer: %v", err) - } - return informer - } - // 0 means we disable resyncing, it is not really useful to resync every hour (the controller-runtime default), - // if things go wrong in the watch, no one will wait for an hour for things to get fixed. - // As precedence, kube-scheduler also disables this since it is expensive to list all pods from the api-server regularly. 
- resyncPeriod := time.Duration(0) - sharedInformer := informers.NewSharedInformerFactory(ds.client, resyncPeriod) - sharedInformer.InformerFor(&v1.Pod{}, newPodInformer) - - return &PodLister{ - Lister: sharedInformer.Core().V1().Pods().Lister(), - sharedInformer: sharedInformer, - } -} - -func (ds *K8sDatastore) getPods() ([]*corev1.Pod, error) { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() - if !ds.HasSynced() { - return nil, errors.New("InferencePool is not initialized in datastore") - } - pods, err := ds.podLister.listEverything() - if err != nil { - return nil, err - } - return pods, nil +func (ds *K8sDatastore) GetPodIPs() []string { + var ips []string + ds.pods.Range(func(name, pod any) bool { + ips = append(ips, pod.(*corev1.Pod).Status.PodIP) + return true + }) + return ips } func (s *K8sDatastore) FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) { diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go new file mode 100644 index 00000000..a2a9790f --- /dev/null +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -0,0 +1,109 @@ +package backend + +import ( + "context" + "strconv" + "time" + + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + discoveryv1 "k8s.io/api/discovery/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + klog "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" +) + +var ( + serviceOwnerLabel = "kubernetes.io/service-name" +) + +type EndpointSliceReconciler struct { + client.Client + Scheme *runtime.Scheme + Record record.EventRecorder + ServiceName string + Zone string + Datastore *K8sDatastore +} + +func (c *EndpointSliceReconciler) Reconcile(ctx 
context.Context, req ctrl.Request) (ctrl.Result, error) { + inferencePool, err := c.Datastore.getInferencePool() + if err != nil { + klog.V(logutil.DEFAULT).Infof("Skipping reconciling EndpointSlice because the InferencePool is not available yet: %v", err) + return ctrl.Result{Requeue: true, RequeueAfter: time.Second}, nil + } + + klog.V(logutil.DEFAULT).Info("Reconciling EndpointSlice ", req.NamespacedName) + + endpointSlice := &discoveryv1.EndpointSlice{} + if err := c.Get(ctx, req.NamespacedName, endpointSlice); err != nil { + klog.Errorf("Unable to get EndpointSlice: %v", err) + return ctrl.Result{}, err + } + c.updateDatastore(endpointSlice, inferencePool) + + return ctrl.Result{}, nil +} + +// TODO: Support multiple endpointslices for a single service +func (c *EndpointSliceReconciler) updateDatastore( + slice *discoveryv1.EndpointSlice, + inferencePool *v1alpha1.InferencePool) { + podMap := make(map[Pod]bool) + + for _, endpoint := range slice.Endpoints { + klog.V(logutil.DEFAULT).Infof("Zone: %v \n endpoint: %+v \n", c.Zone, endpoint) + if c.validPod(endpoint) { + pod := Pod{ + Name: endpoint.TargetRef.Name, + Address: endpoint.Addresses[0] + ":" + strconv.Itoa(int(inferencePool.Spec.TargetPortNumber)), + } + podMap[pod] = true + klog.V(logutil.DEFAULT).Infof("Storing pod %v", pod) + c.Datastore.pods.Store(pod, true) + } + } + + removeOldPods := func(k, v any) bool { + pod, ok := k.(Pod) + if !ok { + klog.Errorf("Unable to cast key to Pod: %v", k) + return false + } + if _, ok := podMap[pod]; !ok { + klog.V(logutil.DEFAULT).Infof("Removing pod %v", pod) + c.Datastore.pods.Delete(pod) + } + return true + } + c.Datastore.pods.Range(removeOldPods) +} + +func (c *EndpointSliceReconciler) SetupWithManager(mgr ctrl.Manager) error { + ownsEndPointSlice := func(object client.Object) bool { + // Check if the object is an EndpointSlice + endpointSlice, ok := object.(*discoveryv1.EndpointSlice) + if !ok { + return false + } + + gotLabel := 
endpointSlice.ObjectMeta.Labels[serviceOwnerLabel] + wantLabel := c.ServiceName + return gotLabel == wantLabel + } + + return ctrl.NewControllerManagedBy(mgr). + For(&discoveryv1.EndpointSlice{}, + builder.WithPredicates(predicate.NewPredicateFuncs(ownsEndPointSlice))). + Complete(c) +} + +func (c *EndpointSliceReconciler) validPod(endpoint discoveryv1.Endpoint) bool { + validZone := c.Zone == "" || c.Zone != "" && *endpoint.Zone == c.Zone + return validZone && *endpoint.Conditions.Ready + +} diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go new file mode 100644 index 00000000..e3c927ba --- /dev/null +++ b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go @@ -0,0 +1,202 @@ +package backend + +import ( + "sync" + "testing" + + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + v1 "k8s.io/api/core/v1" + discoveryv1 "k8s.io/api/discovery/v1" +) + +var ( + basePod1 = Pod{Name: "pod1"} + basePod2 = Pod{Name: "pod2"} + basePod3 = Pod{Name: "pod3"} +) + +func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { + tests := []struct { + name string + datastore *K8sDatastore + incomingSlice *discoveryv1.EndpointSlice + wantPods *sync.Map + }{ + { + name: "Add new pod", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + }, + }, + }, + incomingSlice: &discoveryv1.EndpointSlice{ + Endpoints: []discoveryv1.Endpoint{ + { + TargetRef: &v1.ObjectReference{ + Name: "pod1", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + Name: "pod2", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + 
Name: "pod3", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + }, + }, + wantPods: populateMap(basePod1, basePod2, basePod3), + }, + { + name: "New pod, but its not ready yet. Do not add.", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + }, + }, + }, + incomingSlice: &discoveryv1.EndpointSlice{ + Endpoints: []discoveryv1.Endpoint{ + { + TargetRef: &v1.ObjectReference{ + Name: "pod1", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + Name: "pod2", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + Name: "pod3", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: new(bool), + }, + Addresses: []string{"0.0.0.0"}, + }, + }, + }, + wantPods: populateMap(basePod1, basePod2), + }, + { + name: "Existing pod not ready, new pod added, and is ready", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + }, + }, + }, + incomingSlice: &discoveryv1.EndpointSlice{ + Endpoints: []discoveryv1.Endpoint{ + { + TargetRef: &v1.ObjectReference{ + Name: "pod1", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: new(bool), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + Name: "pod2", + }, + Zone: new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + { + TargetRef: &v1.ObjectReference{ + Name: "pod3", + }, + Zone: 
new(string), + Conditions: discoveryv1.EndpointConditions{ + Ready: truePointer(), + }, + Addresses: []string{"0.0.0.0"}, + }, + }, + }, + wantPods: populateMap(basePod3, basePod2), + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + endpointSliceReconciler := &EndpointSliceReconciler{Datastore: test.datastore, Zone: ""} + endpointSliceReconciler.updateDatastore(test.incomingSlice, test.datastore.inferencePool) + + if mapsEqual(endpointSliceReconciler.Datastore.pods, test.wantPods) { + t.Errorf("Unexpected output pod mismatch. \n Got %v \n Want: %v \n", + endpointSliceReconciler.Datastore.pods, + test.wantPods) + } + }) + } +} + +func mapsEqual(map1, map2 *sync.Map) bool { + equal := true + + map1.Range(func(k, v any) bool { + if _, ok := map2.Load(k); !ok { + equal = false + return false + } + return true + }) + map2.Range(func(k, v any) bool { + if _, ok := map1.Load(k); !ok { + equal = false + return false + } + return true + }) + + return equal +} + +func truePointer() *bool { + primitivePointersAreSilly := true + return &primitivePointersAreSilly +} diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 63f20db6..c4545497 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -8,16 +8,16 @@ import ( ) type FakePodMetricsClient struct { - Err map[string]error - Res map[string]*PodMetrics + Err map[Pod]error + Res map[Pod]*PodMetrics } func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) { - if err, ok := f.Err[pod.Name]; ok { + if err, ok := f.Err[pod]; ok { return nil, err } - klog.V(1).Infof("pod: %+v\n existing: %+v \n new: %+v \n", pod, existing, f.Res[pod.Name]) - return f.Res[pod.Name], nil + klog.V(1).Infof("pod: %+v\n existing: %+v \n new: %+v \n", pod, existing, f.Res[pod]) + return f.Res[pod], nil } type FakeDataStore struct { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go 
b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 117766b9..5609ca53 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -146,24 +146,3 @@ func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { } return returnVal } - -func mapsEqual(map1, map2 *sync.Map) bool { - equal := true - - map1.Range(func(k, v any) bool { - if _, ok := map2.Load(k); !ok { - equal = false - return false - } - return true - }) - map2.Range(func(k, v any) bool { - if _, ok := map1.Load(k); !ok { - equal = false - return false - } - return true - }) - - return equal -} diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index 0c2ae75f..35a41f8f 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -21,6 +21,7 @@ type InferencePoolReconciler struct { Record record.EventRecorder PoolNamespacedName types.NamespacedName Datastore *K8sDatastore + Zone string } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index d6ccf85f..8bf67257 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -3,14 +3,11 @@ package backend import ( "context" "fmt" - "math/rand" - "strconv" "sync" "time" "go.uber.org/multierr" logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - corev1 "k8s.io/api/core/v1" klog "k8s.io/klog/v2" ) @@ -29,8 +26,7 @@ func NewProvider(pmc PodMetricsClient, datastore *K8sDatastore) *Provider { // Provider provides backend pods and information such as metrics. 
type Provider struct { - // key: PodName, value: *PodMetrics - // TODO: change to use NamespacedName once we support multi-tenant inferencePools + // key: Pod, value: *PodMetrics podMetrics sync.Map pmc PodMetricsClient datastore *K8sDatastore @@ -51,11 +47,11 @@ func (p *Provider) AllPodMetrics() []*PodMetrics { } func (p *Provider) UpdatePodMetrics(pod Pod, pm *PodMetrics) { - p.podMetrics.Store(pod.Name, pm) + p.podMetrics.Store(pod, pm) } func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) { - val, ok := p.podMetrics.Load(pod.Name) + val, ok := p.podMetrics.Load(pod) if ok { return val.(*PodMetrics), true } @@ -105,70 +101,31 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duratio // refreshPodsOnce lists pods and updates keys in the podMetrics map. // Note this function doesn't update the PodMetrics value, it's done separately. func (p *Provider) refreshPodsOnce() { - pods, err := p.datastore.getPods() - if err != nil { - klog.V(logutil.DEFAULT).Infof("Couldn't list pods: %v", err) - p.podMetrics.Clear() - return - } - pool, _ := p.datastore.getInferencePool() - // revision is used to track which entries we need to remove in the next iteration that removes - // metrics for pods that don't exist anymore. Otherwise we have to build a map of the listed pods, - // which is not efficient. Revision can be any random id as long as it is different from the last - // refresh, so it should be very reliable (as reliable as the probability of randomly picking two - // different numbers from range 0 - maxInt). 
- revision := rand.Int() - ready := 0 - for _, pod := range pods { - if !podIsReady(pod) { - continue - } - // a ready pod - ready++ - if val, ok := p.podMetrics.Load(pod.Name); ok { - // pod already exists - pm := val.(*PodMetrics) - pm.revision = revision - continue - } - // new pod, add to the store for probing - new := &PodMetrics{ - Pod: Pod{ - Name: pod.Name, - Address: pod.Status.PodIP + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)), - }, - Metrics: Metrics{ - ActiveModels: make(map[string]int), - }, - revision: revision, + // merge new pods with cached ones. + // add new pod to the map + addNewPods := func(k, v any) bool { + pod := k.(Pod) + if _, ok := p.podMetrics.Load(pod); !ok { + new := &PodMetrics{ + Pod: pod, + Metrics: Metrics{ + ActiveModels: make(map[string]int), + }, + } + p.podMetrics.Store(pod, new) } - p.podMetrics.Store(pod.Name, new) + return true } - - klog.V(logutil.DEFAULT).Infof("Pods in pool %s/%s with selector %v: total=%v ready=%v", - pool.Namespace, pool.Name, pool.Spec.Selector, len(pods), ready) - // remove pods that don't exist any more. 
mergeFn := func(k, v any) bool { - pm := v.(*PodMetrics) - if pm.revision != revision { - p.podMetrics.Delete(pm.Pod.Name) + pod := k.(Pod) + if _, ok := p.datastore.pods.Load(pod); !ok { + p.podMetrics.Delete(pod) } return true } p.podMetrics.Range(mergeFn) -} - -func podIsReady(pod *corev1.Pod) bool { - if pod.DeletionTimestamp != nil { - return false - } - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - return condition.Status == corev1.ConditionTrue - } - } - return false + p.datastore.pods.Range(addNewPods) } func (p *Provider) refreshMetricsOnce() error { @@ -184,8 +141,8 @@ func (p *Provider) refreshMetricsOnce() error { errCh := make(chan error) processOnePod := func(key, value any) bool { klog.V(logutil.TRACE).Infof("Processing pod %v and metric %v", key, value) + pod := key.(Pod) existing := value.(*PodMetrics) - pod := existing.Pod wg.Add(1) go func() { defer wg.Done() diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index 9159ba48..ad231f57 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -2,18 +2,17 @@ package backend import ( "errors" + "sync" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testingutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" - corev1 "k8s.io/api/core/v1" ) var ( pod1 = &PodMetrics{ - Pod: Pod{Name: "pod1", Address: "address1:9009"}, + Pod: Pod{Name: "pod1"}, Metrics: Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -25,7 +24,7 @@ var ( }, } pod2 = &PodMetrics{ - Pod: Pod{Name: "pod2", Address: "address2:9009"}, + Pod: Pod{Name: "pod2"}, Metrics: Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.2, @@ -39,67 +38,44 @@ var ( ) func TestProvider(t *testing.T) { - allPodsLister := &testingutil.FakePodLister{ - PodsList: 
[]*corev1.Pod{ - testingutil.MakePod(pod1.Pod.Name).SetReady().SetPodIP("address1").Obj(), - testingutil.MakePod(pod2.Pod.Name).SetReady().SetPodIP("address2").Obj(), - }, - } - allPodsMetricsClient := &FakePodMetricsClient{ - Res: map[string]*PodMetrics{ - pod1.Pod.Name: pod1, - pod2.Pod.Name: pod2, - }, - } - tests := []struct { - name string - initPodMetrics []*PodMetrics - lister *testingutil.FakePodLister - pmc PodMetricsClient - step func(*Provider) - want []*PodMetrics + name string + pmc PodMetricsClient + datastore *K8sDatastore + initErr bool + want []*PodMetrics }{ { - name: "Init without refreshing pods", - initPodMetrics: []*PodMetrics{pod1, pod2}, - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - _ = p.refreshMetricsOnce() + name: "Init success", + datastore: &K8sDatastore{ + pods: populateMap(pod1.Pod, pod2.Pod), }, - want: []*PodMetrics{pod1, pod2}, - }, - { - name: "Fetching all success", - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() + pmc: &FakePodMetricsClient{ + Res: map[Pod]*PodMetrics{ + pod1.Pod: pod1, + pod2.Pod: pod2, + }, }, want: []*PodMetrics{pod1, pod2}, }, { - name: "Fetch metrics error", - lister: allPodsLister, + name: "Fetch metrics error", pmc: &FakePodMetricsClient{ - Err: map[string]error{ - pod2.Pod.Name: errors.New("injected error"), + Err: map[Pod]error{ + pod2.Pod: errors.New("injected error"), }, - Res: map[string]*PodMetrics{ - pod1.Pod.Name: pod1, + Res: map[Pod]*PodMetrics{ + pod1.Pod: pod1, }, }, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() + datastore: &K8sDatastore{ + pods: populateMap(pod1.Pod, pod2.Pod), }, want: []*PodMetrics{ pod1, // Failed to fetch pod2 metrics so it remains the default values. 
{ - Pod: pod2.Pod, + Pod: Pod{Name: "pod2"}, Metrics: Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, @@ -109,73 +85,30 @@ func TestProvider(t *testing.T) { }, }, }, - { - name: "A new pod added", - initPodMetrics: []*PodMetrics{pod2}, - lister: allPodsLister, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1, pod2}, - }, - { - name: "A pod removed", - initPodMetrics: []*PodMetrics{pod1, pod2}, - lister: &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{ - testingutil.MakePod(pod2.Pod.Name).SetReady().SetPodIP("address2").Obj(), - }, - }, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod2}, - }, - { - name: "A pod removed, another added", - initPodMetrics: []*PodMetrics{pod1}, - lister: &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{ - testingutil.MakePod(pod1.Pod.Name).SetReady().SetPodIP("address1").Obj(), - }, - }, - pmc: allPodsMetricsClient, - step: func(p *Provider) { - p.refreshPodsOnce() - _ = p.refreshMetricsOnce() - }, - want: []*PodMetrics{pod1}, - }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - datastore := NewK8sDataStore(WithPodListerFactory( - func(pool *v1alpha1.InferencePool) *PodLister { - return &PodLister{ - Lister: test.lister, - } - })) - datastore.setInferencePool(&v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{TargetPortNumber: 9009}, - }) - p := NewProvider(test.pmc, datastore) - for _, m := range test.initPodMetrics { - p.UpdatePodMetrics(m.Pod, m) + p := NewProvider(test.pmc, test.datastore) + err := p.Init(time.Millisecond, time.Millisecond) + if test.initErr != (err != nil) { + t.Fatalf("Unexpected error, got: %v, want: %v", err, test.initErr) } - test.step(p) metrics := p.AllPodMetrics() lessFunc := func(a, b *PodMetrics) bool { return a.String() < b.String() } - if diff := cmp.Diff(test.want, metrics, 
cmpopts.SortSlices(lessFunc), - cmpopts.IgnoreFields(PodMetrics{}, "revision")); diff != "" { + if diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(lessFunc)); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) } } + +func populateMap(pods ...Pod) *sync.Map { + newMap := &sync.Map{} + for _, pod := range pods { + newMap.Store(pod, true) + } + return newMap +} diff --git a/pkg/ext-proc/backend/types.go b/pkg/ext-proc/backend/types.go index d375e4ec..7e399fed 100644 --- a/pkg/ext-proc/backend/types.go +++ b/pkg/ext-proc/backend/types.go @@ -28,7 +28,6 @@ type Metrics struct { type PodMetrics struct { Pod Metrics - revision int } func (pm *PodMetrics) String() string { diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index 62527d06..488851eb 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -7,7 +7,6 @@ import ( healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" ) @@ -20,7 +19,7 @@ func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckReques klog.Infof("gRPC health check not serving: %s", in.String()) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil } - klog.V(logutil.DEBUG).Infof("gRPC health check serving: %s", in.String()) + klog.Infof("gRPC health check serving: %s", in.String()) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 98b7e6ca..a783aa2c 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -18,7 +18,6 @@ import ( runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "k8s.io/apimachinery/pkg/runtime" utilruntime 
"k8s.io/apimachinery/pkg/util/runtime" - "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/component-base/metrics/legacyregistry" @@ -54,6 +53,14 @@ var ( "poolNamespace", runserver.DefaultPoolNamespace, "Namespace of the InferencePool this Endpoint Picker is associated with.") + serviceName = flag.String( + "serviceName", + runserver.DefaultServiceName, + "Name of the Service that will be used to read EndpointSlices from") + zone = flag.String( + "zone", + runserver.DefaultZone, + "The zone that this instance is created in. Will be passed to the corresponding endpointSlice. ") refreshPodsInterval = flag.Duration( "refreshPodsInterval", runserver.DefaultRefreshPodsInterval, @@ -99,6 +106,8 @@ func main() { TargetEndpointKey: *targetEndpointKey, PoolName: *poolName, PoolNamespace: *poolNamespace, + ServiceName: *serviceName, + Zone: *zone, RefreshPodsInterval: *refreshPodsInterval, RefreshMetricsInterval: *refreshMetricsInterval, Scheme: scheme, @@ -107,15 +116,12 @@ func main() { } serverRunner.Setup() - k8sClient, err := kubernetes.NewForConfigAndClient(cfg, serverRunner.Manager.GetHTTPClient()) - if err != nil { - klog.Fatalf("Failed to create client: %v", err) - } - datastore.SetClient(k8sClient) - // Start health and ext-proc servers in goroutines healthSvr := startHealthServer(datastore, *grpcHealthPort) - extProcSvr := serverRunner.Start(&vllm.PodMetricsClientImpl{}) + extProcSvr := serverRunner.Start( + datastore, + &vllm.PodMetricsClientImpl{}, + ) // Start metrics handler metricsSvr := startMetricsHandler(*metricsPort, cfg) @@ -210,5 +216,9 @@ func validateFlags() error { return fmt.Errorf("required %q flag not set", "poolName") } + if *serviceName == "" { + return fmt.Errorf("required %q flag not set", "serviceName") + } + return nil } diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index 34731d15..d88f437c 100644 --- 
a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -5,7 +5,6 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) @@ -207,7 +206,7 @@ func TestFilter(t *testing.T) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got, cmpopts.IgnoreFields(backend.PodMetrics{}, "revision")); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -401,7 +400,7 @@ func TestFilterFunc(t *testing.T) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got, cmpopts.IgnoreFields(backend.PodMetrics{}, "revision")); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 981dab11..1c9c1b2e 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -23,12 +23,14 @@ type ExtProcServerRunner struct { TargetEndpointKey string PoolName string PoolNamespace string + ServiceName string + Zone string RefreshPodsInterval time.Duration RefreshMetricsInterval time.Duration Scheme *runtime.Scheme Config *rest.Config Datastore *backend.K8sDatastore - Manager ctrl.Manager + manager ctrl.Manager } // Default values for CLI flags in main @@ -37,6 +39,8 @@ const ( DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey DefaultPoolName = "" // required but no default DefaultPoolNamespace = "default" // default for --poolNamespace + DefaultServiceName = "" // required but no default + DefaultZone = "" // default for --zone DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval DefaultRefreshMetricsInterval = 50 * time.Millisecond 
// default for --refreshMetricsInterval ) @@ -47,20 +51,22 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { TargetEndpointKey: DefaultTargetEndpointKey, PoolName: DefaultPoolName, PoolNamespace: DefaultPoolNamespace, + ServiceName: DefaultServiceName, + Zone: DefaultZone, RefreshPodsInterval: DefaultRefreshPodsInterval, RefreshMetricsInterval: DefaultRefreshMetricsInterval, // Scheme, Config, and Datastore can be assigned later. } } -// Setup creates the reconcilers for pools and models and starts the manager. +// Setup creates the reconcilers for pools, models, and endpointSlices and starts the manager. func (r *ExtProcServerRunner) Setup() { // Create a new manager to manage controllers mgr, err := ctrl.NewManager(r.Config, ctrl.Options{Scheme: r.Scheme}) if err != nil { klog.Fatalf("Failed to create controller manager: %v", err) } - r.Manager = mgr + r.manager = mgr // Create the controllers and register them with the manager if err := (&backend.InferencePoolReconciler{ @@ -88,10 +94,22 @@ func (r *ExtProcServerRunner) Setup() { }).SetupWithManager(mgr); err != nil { klog.Fatalf("Failed setting up InferenceModelReconciler: %v", err) } + + if err := (&backend.EndpointSliceReconciler{ + Datastore: r.Datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Record: mgr.GetEventRecorderFor("endpointslice"), + ServiceName: r.ServiceName, + Zone: r.Zone, + }).SetupWithManager(mgr); err != nil { + klog.Fatalf("Failed setting up EndpointSliceReconciler: %v", err) + } } // Start starts the Envoy external processor server in a goroutine. 
func (r *ExtProcServerRunner) Start( + podDatastore *backend.K8sDatastore, podMetricsClient backend.PodMetricsClient, ) *grpc.Server { svr := grpc.NewServer() @@ -104,7 +122,7 @@ func (r *ExtProcServerRunner) Start( klog.Infof("Ext-proc server listening on port: %d", r.GrpcPort) // Initialize backend provider - pp := backend.NewProvider(podMetricsClient, r.Datastore) + pp := backend.NewProvider(podMetricsClient, podDatastore) if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval); err != nil { klog.Fatalf("Failed to initialize backend provider: %v", err) } @@ -125,12 +143,13 @@ func (r *ExtProcServerRunner) Start( } func (r *ExtProcServerRunner) StartManager() { - if r.Manager == nil { + if r.manager == nil { klog.Fatalf("Runner has no manager setup to run: %v", r) } // Start the controller manager. Blocking and will return when shutdown is complete. klog.Infof("Starting controller manager") - if err := r.Manager.Start(ctrl.SetupSignalHandler()); err != nil { + mgr := r.manager + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { klog.Fatalf("Error starting controller manager: %v", err) } klog.Info("Controller manager shutting down") diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index a9dc4efa..63972849 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -18,13 +18,13 @@ import ( func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { ps := make(backend.PodSet) - pms := make(map[string]*backend.PodMetrics) + pms := make(map[backend.Pod]*backend.PodMetrics) for _, pod := range pods { ps[pod.Pod] = true - pms[pod.Pod.Name] = pod + pms[pod.Pod] = pod } pmc := &backend.FakePodMetricsClient{Res: pms} - pp := backend.NewProvider(pmc, backend.NewK8sDataStore()) + pp := backend.NewProvider(pmc, backend.NewK8sDataStore(backend.WithPods(pods))) if err := pp.Init(refreshPodsInterval, 
refreshMetricsInterval); err != nil { klog.Fatalf("failed to initialize: %v", err) } diff --git a/pkg/ext-proc/util/testing/lister.go b/pkg/ext-proc/util/testing/lister.go deleted file mode 100644 index 023f30a1..00000000 --- a/pkg/ext-proc/util/testing/lister.go +++ /dev/null @@ -1,19 +0,0 @@ -package testing - -import ( - v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/labels" - listersv1 "k8s.io/client-go/listers/core/v1" -) - -type FakePodLister struct { - PodsList []*v1.Pod -} - -func (l *FakePodLister) List(selector labels.Selector) (ret []*v1.Pod, err error) { - return l.PodsList, nil -} - -func (l *FakePodLister) Pods(namespace string) listersv1.PodNamespaceLister { - panic("not implemented") -} diff --git a/pkg/ext-proc/util/testing/wrappers.go b/pkg/ext-proc/util/testing/wrappers.go deleted file mode 100644 index 7b593bbd..00000000 --- a/pkg/ext-proc/util/testing/wrappers.go +++ /dev/null @@ -1,38 +0,0 @@ -package testing - -import ( - corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// PodWrapper wraps a Pod inside. -type PodWrapper struct{ corev1.Pod } - -// MakePod creates a Pod wrapper. -func MakePod(name string) *PodWrapper { - return &PodWrapper{ - corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - }, - }, - } -} - -// Obj returns the inner Pod. 
-func (p *PodWrapper) Obj() *corev1.Pod { - return &p.Pod -} - -func (p *PodWrapper) SetReady() *PodWrapper { - p.Status.Conditions = []corev1.PodCondition{{ - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }} - return p -} - -func (p *PodWrapper) SetPodIP(podIP string) *PodWrapper { - p.Status.PodIP = podIP - return p -} diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index b9b860dc..4e82779e 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -77,6 +77,8 @@ spec: - "vllm-llama2-7b-pool" - -v - "3" + - -serviceName + - "vllm-llama2-7b-pool" - -grpcPort - "9002" - -grpcHealthPort diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml index 1f5073e9..4af0891d 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -1,3 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama2-7b-pool +spec: + selector: + app: vllm-llama2-7b-pool + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + type: ClusterIP +--- apiVersion: apps/v1 kind: Deployment metadata: diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 019e858a..c2c1ea92 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -245,6 +245,11 @@ func createModelServer(k8sClient client.Client, secretPath, deployPath string) { // Wait for the deployment to be available. testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval) + + // Wait for the service to exist. + testutils.EventuallyExists(ctx, func() error { + return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: modelServerName}, &corev1.Service{}) + }, existsTimeout, interval) } // createEnvoy creates the envoy proxy resources used for testing from the given filePath. 
diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 3dfe28f7..95ad4908 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -12,7 +12,6 @@ import ( "log" "os" "path/filepath" - "strconv" "testing" "time" @@ -27,8 +26,6 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" extprocutils "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" - testingutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" @@ -116,7 +113,7 @@ func SKIPTestHandleRequestBody(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("pod-1:8000"), + RawValue: []byte("address-1"), }, }, { @@ -182,7 +179,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("pod-1:8000"), + RawValue: []byte("address-1"), }, }, { @@ -196,7 +193,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ runserver.DefaultTargetEndpointKey: { Kind: &structpb.Value_StringValue{ - StringValue: "pod-1:8000", + StringValue: "address-1", }, }, }, @@ -206,38 +203,47 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, } - metrics := []*backend.Metrics{ + pods := []*backend.PodMetrics{ { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + Pod: extprocutils.FakePod(0), + Metrics: backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, }, }, { - WaitingQueueSize: 0, - 
KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, + Pod: extprocutils.FakePod(1), + Metrics: backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, }, }, { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, + Pod: extprocutils.FakePod(2), + Metrics: backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, }, }, } // Set up global k8sclient and extproc server runner with test environment config - podMetrics := BeforeSuit(metrics) + BeforeSuit() for _, test := range tests { t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, podMetrics) + client, cleanup := setUpHermeticServer(t, pods) t.Cleanup(cleanup) want := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_RequestBody{ @@ -318,8 +324,8 @@ func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extPr } } } - inferencePool := &v1alpha1.InferencePool{} for _, doc := range docs { + inferencePool := &v1alpha1.InferencePool{} if err = yaml.Unmarshal(doc, inferencePool); err != nil { log.Fatalf("Can't unmarshal object: %v", doc) } @@ -328,19 +334,18 @@ func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extPr if err := k8sClient.Create(context.Background(), inferencePool); err != nil { log.Fatalf("unable to create inferencePool %v: %v", inferencePool.Name, err) } - // expecting a single inferencepool - break } } ps := make(backend.PodSet) - pms := make(map[string]*backend.PodMetrics) + pms := make(map[backend.Pod]*backend.PodMetrics) for _, pod := range pods { ps[pod.Pod] = true - pms[pod.Pod.Name] = pod + pms[pod.Pod] = pod } pmc := &backend.FakePodMetricsClient{Res: pms} - server := serverRunner.Start(pmc) + + server := serverRunner.Start(backend.NewK8sDataStore(backend.WithPods(pods)), pmc) if 
err != nil { log.Fatalf("Ext-proc failed with the err: %v", err) } @@ -368,7 +373,7 @@ func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extPr } // Sets up a test environment and returns the runner struct -func BeforeSuit(metrics []*backend.Metrics) []*backend.PodMetrics { +func BeforeSuit() { // Set up mock k8s API Client testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, @@ -390,35 +395,12 @@ func BeforeSuit(metrics []*backend.Metrics) []*backend.PodMetrics { log.Fatalf("No error, but returned kubernetes client is nil, cfg: %v", cfg) } - podMetrics := []*backend.PodMetrics{} - fakeLister := &testingutil.FakePodLister{ - PodsList: []*corev1.Pod{}, - } - for i, m := range metrics { - podName := "pod-" + strconv.Itoa(i) - pod := testingutil.MakePod(podName).SetReady().SetPodIP(podName).Obj() - fakeLister.PodsList = append(fakeLister.PodsList, pod) - podMetrics = append(podMetrics, &backend.PodMetrics{ - Pod: backend.Pod{ - Name: pod.Name, - Address: pod.Status.PodIP + ":8000", - }, - Metrics: *m, - }) - } - serverRunner = runserver.NewDefaultExtProcServerRunner() // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" serverRunner.Scheme = scheme serverRunner.Config = cfg - serverRunner.Datastore = backend.NewK8sDataStore(backend.WithPodListerFactory( - func(pool *v1alpha1.InferencePool) *backend.PodLister { - klog.V(1).Infof("Setting the fake lister %v", len(fakeLister.PodsList)) - return &backend.PodLister{ - Lister: fakeLister, - } - })) + serverRunner.Datastore = backend.NewK8sDataStore() serverRunner.Setup() @@ -426,10 +408,6 @@ func BeforeSuit(metrics []*backend.Metrics) []*backend.PodMetrics { go func() { serverRunner.StartManager() }() - - // Wait the reconcilers to populate the datastore. 
- time.Sleep(5 * time.Second) - return podMetrics } func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { From ce2785c3c7a71270603ff07225ba8b2f1e14e063 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Thu, 6 Feb 2025 15:51:55 -0700 Subject: [PATCH 02/96] Fixing small linter complaints (#302) --- api/v1alpha1/inferencemodel_types.go | 8 ++++---- api/v1alpha1/inferencepool_types.go | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go index 3661820d..f171c10e 100644 --- a/api/v1alpha1/inferencemodel_types.go +++ b/api/v1alpha1/inferencemodel_types.go @@ -202,7 +202,7 @@ type InferenceModelConditionType string type InferenceModelConditionReason string const ( - // This condition indicates if the model config is accepted, and if not, why. + // ModelConditionAccepted indicates if the model config is accepted, and if not, why. // // Possible reasons for this condition to be True are: // @@ -218,14 +218,14 @@ const ( // ModelConditionAccepted InferenceModelConditionType = "Accepted" - // Desired state. Model conforms to the state of the pool. + // ModelReasonAccepted is the desired state. Model conforms to the state of the pool. ModelReasonAccepted InferenceModelConditionReason = "Accepted" - // This reason is used when a given ModelName already exists within the pool. + // ModelReasonNameInUse is used when a given ModelName already exists within the pool. // Details about naming conflict resolution are on the ModelName field itself. ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse" - // This reason is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. + // ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. 
ModelReasonPending InferenceModelConditionReason = "Pending" ) diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go index 61a3764d..b4c95d40 100644 --- a/api/v1alpha1/inferencepool_types.go +++ b/api/v1alpha1/inferencepool_types.go @@ -207,7 +207,7 @@ type InferencePoolConditionType string type InferencePoolConditionReason string const ( - // This condition indicates if the pool is ready to accept traffic, and if not, why. + // PoolConditionReady indicates if the pool is ready to accept traffic, and if not, why. // // Possible reasons for this condition to be True are: // @@ -223,13 +223,13 @@ const ( // PoolConditionReady InferencePoolConditionType = "Ready" - // Desired state. The pool and its components are initialized and ready for traffic. + // PoolReasonReady is the desired state. The pool and its components are initialized and ready for traffic. PoolReasonReady InferencePoolConditionReason = "Ready" - // This reason is used when the EPP has not yet passed health checks, or has started failing them. + // PoolReasonEPPNotHealthy is used when the EPP has not yet passed health checks, or has started failing them. PoolReasonEPPNotHealthy InferencePoolConditionReason = "EndpointPickerNotHealthy" - // This reason is the initial state, and indicates that the controller has not yet reconciled this pool. + // PoolReasonPending is the initial state, and indicates that the controller has not yet reconciled this pool. PoolReasonPending InferencePoolConditionReason = "Pending" ) From 3ff0af85f9341468c6f4b1e2923dd8f9b413b7e2 Mon Sep 17 00:00:00 2001 From: BenjaminBraunDev Date: Thu, 6 Feb 2025 16:19:55 -0800 Subject: [PATCH 03/96] In hermetic test, add additional test cases and move k8sClient object creation so it's called once for all tests (#278) * Add 2 new test cases to hermetic integration test. Move k8sclient API setup to BeforeSuit() so it is set up once for all test cases. 
Add getter function to scheduling to reference queue threshold for lora affinity inside integration tests. * remove vestigial unit test from hermetic test, minor change to comments, remove unreachable error check. * Add test-case for sheddable that is not shed, fix nits and rename the non-lora test case to use a different model name. * Fix small typo. --- test/integration/hermetic_test.go | 436 +++++++++++------- .../inferencepool-with-model-hermetic.yaml | 27 ++ 2 files changed, 296 insertions(+), 167 deletions(-) diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 95ad4908..b52cc9d7 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -17,6 +17,7 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/google/go-cmp/cmp" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -47,32 +48,73 @@ var ( scheme = runtime.NewScheme() ) -func SKIPTestHandleRequestBody(t *testing.T) { +func TestKubeInferenceModelRequest(t *testing.T) { tests := []struct { - name string - req *extProcPb.ProcessingRequest - pods []*backend.PodMetrics - models map[string]*v1alpha1.InferenceModel - wantHeaders []*configPb.HeaderValueOption - wantBody []byte - wantErr bool + name string + req *extProcPb.ProcessingRequest + pods []*backend.PodMetrics + wantHeaders []*configPb.HeaderValueOption + wantMetadata *structpb.Struct + wantBody []byte + wantErr bool + immediateResponse *extProcPb.ImmediateResponse }{ { - name: "success", + name: "select lower queue and kv cache, no active lora", req: extprocutils.GenerateRequest("my-model"), - models: map[string]*v1alpha1.InferenceModel{ - "my-model": { - Spec: v1alpha1.InferenceModelSpec{ - ModelName: "my-model", - TargetModels: []v1alpha1.TargetModel{ - { - Name: "my-model-v1", - 
Weight: pointer(100), - }, + // pod-1 will be picked because it has relatively low queue size and low KV cache. + pods: []*backend.PodMetrics{ + { + Pod: extprocutils.FakePod(0), + Metrics: backend.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.2, + }, + }, + { + Pod: extprocutils.FakePod(1), + Metrics: backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + }, + }, + { + Pod: extprocutils.FakePod(2), + Metrics: backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultTargetEndpointKey, + RawValue: []byte("address-1"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultTargetEndpointKey: { + Kind: &structpb.Value_StringValue{ + StringValue: "address-1", }, }, }, }, + wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"hello\",\"temperature\":0}"), + wantErr: false, + }, + { + name: "select active lora, low queue", + req: extprocutils.GenerateRequest("sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. 
pods: []*backend.PodMetrics{ @@ -93,8 +135,8 @@ func SKIPTestHandleRequestBody(t *testing.T) { WaitingQueueSize: 0, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ - "foo": 1, - "my-model-v1": 1, + "foo": 1, + "sql-lora-1fdg2": 1, }, }, }, @@ -119,67 +161,67 @@ func SKIPTestHandleRequestBody(t *testing.T) { { Header: &configPb.HeaderValue{ Key: "Content-Length", - RawValue: []byte("73"), + RawValue: []byte("76"), }, }, }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-v1\",\"prompt\":\"hello\",\"temperature\":0}"), - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpServer(t, test.pods, test.models) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, - }, - }, + wantMetadata: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultTargetEndpointKey: { + Kind: &structpb.Value_StringValue{ + StringValue: "address-1", }, }, }, - } - res, err := sendRequest(t, client, test.req) - - if (err != nil) != test.wantErr { - t.Fatalf("Unexpected error, got %v, want %v", err, test.wantErr) - } - - if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) - } - }) - } - -} - -func TestKubeInferenceModelRequest(t *testing.T) { - tests := []struct { - name string - req *extProcPb.ProcessingRequest - wantHeaders []*configPb.HeaderValueOption - wantMetadata *structpb.Struct - wantBody []byte - wantErr bool - }{ + }, + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"), + wantErr: false, + }, { - name: "success", + name: "select no lora 
despite active model, avoid excessive queue size", req: extprocutils.GenerateRequest("sql-lora"), - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. + // pod-2 will be picked despite it NOT having the requested model being active + // as it's above the affinity for queue size. Also is critical, so we should + // still honor request despite all queues > 5 + pods: []*backend.PodMetrics{ + { + Pod: extprocutils.FakePod(0), + Metrics: backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(1), + Metrics: backend.Metrics{ + WaitingQueueSize: 50, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(2), + Metrics: backend.Metrics{ + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("address-1"), + RawValue: []byte("address-2"), }, }, { @@ -193,7 +235,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ runserver.DefaultTargetEndpointKey: { Kind: &structpb.Value_StringValue{ - StringValue: "address-1", + StringValue: "address-2", }, }, }, @@ -201,40 +243,122 @@ func TestKubeInferenceModelRequest(t *testing.T) { wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"), wantErr: false, }, - } - - pods := []*backend.PodMetrics{ { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + name: "noncritical and all models past threshold, shed request", + req: 
extprocutils.GenerateRequest("sql-lora-sheddable"), + // no pods will be picked as all models are either above kv threshold, + // queue threshold, or both. + pods: []*backend.PodMetrics{ + { + Pod: extprocutils.FakePod(0), + Metrics: backend.Metrics{ + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(1), + Metrics: backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(2), + Metrics: backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, }, }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, + wantHeaders: []*configPb.HeaderValueOption{}, + wantMetadata: &structpb.Struct{}, + wantBody: []byte(""), + wantErr: false, + immediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, }, }, }, { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, + name: "noncritical, but one server has capacity, do not shed", + req: extprocutils.GenerateRequest("sql-lora-sheddable"), + // pod 0 will be picked as all other models are above threshold + pods: []*backend.PodMetrics{ + { + Pod: extprocutils.FakePod(0), + Metrics: backend.Metrics{ + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(1), + Metrics: backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + 
"sql-lora-1fdg3": 1, + }, + }, + }, + { + Pod: extprocutils.FakePod(2), + Metrics: backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultTargetEndpointKey, + RawValue: []byte("address-0"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, }, }, + wantMetadata: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultTargetEndpointKey: { + Kind: &structpb.Value_StringValue{ + StringValue: "address-0", + }, + }, + }, + }, + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"hello\",\"temperature\":0}"), + wantErr: false, }, } @@ -243,7 +367,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, pods) + client, cleanup := setUpHermeticServer(test.pods) t.Cleanup(cleanup) want := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_RequestBody{ @@ -264,78 +388,24 @@ func TestKubeInferenceModelRequest(t *testing.T) { } res, err := sendRequest(t, client, test.req) - if err != nil { - if !test.wantErr { - t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if test.immediateResponse != nil { + want = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: test.immediateResponse, + }, } - } else if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + } + if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { t.Errorf("Unexpected response, (-want +got): %v", diff) } }) } } -func setUpServer(t *testing.T, pods 
[]*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - t.Logf("Setting up ExtProc server") - server := extprocutils.StartExtProc(port, time.Second, time.Second, pods, models) - - address := fmt.Sprintf("localhost:%v", port) - // Create a grpc connection - conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) - if err != nil { - log.Fatalf("Failed to connect to %v: %v", address, err) - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) - if err != nil { - log.Fatalf("Failed to create client: %v", err) - } - return client, func() { - cancel() - conn.Close() - server.GracefulStop() - } -} - -func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - t.Logf("Setting up hermetic ExtProc server") - klog.InitFlags(nil) - flag.Parse() - // Configure klog verbosity levels to print ext proc logs. 
- _ = flag.Lookup("v").Value.Set("3") - - // Unmarshal CRDs from file into structs - manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") - docs, err := readDocuments(manifestsPath) - if err != nil { - log.Fatalf("Can't read object manifests at path %v, %v", manifestsPath, err) - } - - for _, doc := range docs { - inferenceModel := &v1alpha1.InferenceModel{} - if err = yaml.Unmarshal(doc, inferenceModel); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) - } - if inferenceModel.Kind == "InferenceModel" { - t.Logf("Creating inference model: %+v", inferenceModel) - if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - log.Fatalf("unable to create inferenceModel %v: %v", inferenceModel.Name, err) - } - } - } - for _, doc := range docs { - inferencePool := &v1alpha1.InferencePool{} - if err = yaml.Unmarshal(doc, inferencePool); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) - } - if inferencePool.Kind == "InferencePool" { - t.Logf("Creating inference pool: %+v", inferencePool) - if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - log.Fatalf("unable to create inferencePool %v: %v", inferencePool.Name, err) - } - } - } +func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { ps := make(backend.PodSet) pms := make(map[backend.Pod]*backend.PodMetrics) @@ -346,9 +416,6 @@ func setUpHermeticServer(t *testing.T, pods []*backend.PodMetrics) (client extPr pmc := &backend.FakePodMetricsClient{Res: pms} server := serverRunner.Start(backend.NewK8sDataStore(backend.WithPods(pods)), pmc) - if err != nil { - log.Fatalf("Ext-proc failed with the err: %v", err) - } // Wait the reconciler to populate the datastore. 
time.Sleep(10 * time.Second) @@ -408,6 +475,44 @@ func BeforeSuit() { go func() { serverRunner.StartManager() }() + + klog.Info("Setting up hermetic ExtProc server") + klog.InitFlags(nil) + flag.Parse() + // Configure klog verbosity levels to print ext proc logs. + _ = flag.Lookup("v").Value.Set("3") + + // Unmarshal CRDs from file into structs + manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") + docs, err := readDocuments(manifestsPath) + if err != nil { + log.Fatalf("Can't read object manifests at path %v, %v", manifestsPath, err) + } + + for _, doc := range docs { + inferenceModel := &v1alpha1.InferenceModel{} + if err = yaml.Unmarshal(doc, inferenceModel); err != nil { + log.Fatalf("Can't unmarshal object: %v", doc) + } + if inferenceModel.Kind == "InferenceModel" { + klog.Infof("Creating inference model: %+v", inferenceModel) + if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { + log.Fatalf("unable to create inferenceModel %v: %v", inferenceModel.Name, err) + } + } + } + for _, doc := range docs { + inferencePool := &v1alpha1.InferencePool{} + if err = yaml.Unmarshal(doc, inferencePool); err != nil { + log.Fatalf("Can't unmarshal object: %v", doc) + } + if inferencePool.Kind == "InferencePool" { + klog.Infof("Creating inference pool: %+v", inferencePool) + if err := k8sClient.Create(context.Background(), inferencePool); err != nil { + log.Fatalf("unable to create inferencePool %v: %v", inferencePool.Name, err) + } + } + } } func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { @@ -448,6 +553,3 @@ func readDocuments(fp string) ([][]byte, error) { } return docs, nil } -func pointer(v int32) *int32 { - return &v -} diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml index a07e0f35..372a8512 100644 --- 
a/test/testdata/inferencepool-with-model-hermetic.yaml +++ b/test/testdata/inferencepool-with-model-hermetic.yaml @@ -23,3 +23,30 @@ spec: targetModels: - name: sql-lora-1fdg2 weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: InferenceModel +metadata: + name: inferencemodel-sheddable + namespace: default +spec: + modelName: sql-lora-sheddable + poolRef: + name: vllm-llama2-7b-pool + targetModels: + - name: sql-lora-1fdg3 + weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: InferenceModel +metadata: + name: inferencemodel-generic + namespace: default +spec: + modelName: my-model + criticality: Critical + poolRef: + name: vllm-llama2-7b-pool + targetModels: + - name: my-model-12345 + weight: 100 From 71496247598d26526259419f4da006ca7aeb351e Mon Sep 17 00:00:00 2001 From: Jeff Luo Date: Mon, 10 Feb 2025 10:55:57 -0500 Subject: [PATCH 04/96] [Metrics] Add average kv cache and waiting queue size metrics for (#304) inference pool --- pkg/ext-proc/backend/provider.go | 38 +++++++++++- pkg/ext-proc/backend/provider_test.go | 2 +- pkg/ext-proc/main.go | 27 ++++---- pkg/ext-proc/metrics/README.md | 2 + pkg/ext-proc/metrics/metrics.go | 34 +++++++++++ pkg/ext-proc/metrics/metrics_test.go | 52 ++++++++++++++++ .../metrics/testdata/kv_cache_avg_metrics | 3 + .../metrics/testdata/queue_avg_size_metrics | 3 + pkg/ext-proc/server/runserver.go | 61 ++++++++++--------- pkg/ext-proc/test/benchmark/benchmark.go | 13 ++-- pkg/ext-proc/test/utils.go | 4 +- 11 files changed, 189 insertions(+), 50 deletions(-) create mode 100644 pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics create mode 100644 pkg/ext-proc/metrics/testdata/queue_avg_size_metrics diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index 8bf67257..a9165e8f 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -7,6 +7,7 @@ import ( "time" "go.uber.org/multierr" + 
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" ) @@ -58,7 +59,7 @@ func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) { return nil, false } -func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duration) error { +func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { p.refreshPodsOnce() if err := p.refreshMetricsOnce(); err != nil { @@ -85,6 +86,14 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval time.Duratio } }() + // Periodically flush prometheus metrics for inference pool + go func() { + for { + time.Sleep(refreshPrometheusMetricsInterval) + p.flushPrometheusMetricsOnce() + } + }() + // Periodically print out the pods and metrics for DEBUGGING. if klog.V(logutil.DEBUG).Enabled() { go func() { @@ -174,3 +183,30 @@ func (p *Provider) refreshMetricsOnce() error { } return errs } + +func (p *Provider) flushPrometheusMetricsOnce() { + klog.V(logutil.DEBUG).Infof("Flushing Prometheus Metrics") + + pool, _ := p.datastore.getInferencePool() + if pool == nil { + // No inference pool or not initialize. 
+ return + } + + var kvCacheTotal float64 + var queueTotal int + + podMetrics := p.AllPodMetrics() + if len(podMetrics) == 0 { + return + } + + for _, pod := range podMetrics { + kvCacheTotal += pod.KVCacheUsagePercent + queueTotal += pod.WaitingQueueSize + } + + podTotalCount := len(podMetrics) + metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount)) + metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal)/float64(podTotalCount)) // divide as floats: integer division would truncate the average +} diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index ad231f57..ddd7f0d6 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -90,7 +90,7 @@ func TestProvider(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { p := NewProvider(test.pmc, test.datastore) - err := p.Init(time.Millisecond, time.Millisecond) + err := p.Init(time.Millisecond, time.Millisecond, time.Millisecond) if test.initErr != (err != nil) { t.Fatalf("Unexpected error, got: %v, want: %v", err, test.initErr) } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index a783aa2c..e126b6dd 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -69,6 +69,10 @@ var ( "refreshMetricsInterval", runserver.DefaultRefreshMetricsInterval, "interval to refresh metrics") + refreshPrometheusMetricsInterval = flag.Duration( + "refreshPrometheusMetricsInterval", + runserver.DefaultRefreshPrometheusMetricsInterval, + "interval to flush prometheus metrics") scheme = runtime.NewScheme() ) @@ -102,17 +106,18 @@ func main() { datastore := backend.NewK8sDataStore() serverRunner := &runserver.ExtProcServerRunner{ - GrpcPort: *grpcPort, - TargetEndpointKey: *targetEndpointKey, - PoolName: *poolName, - PoolNamespace: *poolNamespace, - ServiceName: *serviceName, - Zone: *zone, - RefreshPodsInterval: *refreshPodsInterval, - RefreshMetricsInterval: *refreshMetricsInterval, - Scheme: scheme, - Config: ctrl.GetConfigOrDie(), -
Datastore: datastore, + GrpcPort: *grpcPort, + TargetEndpointKey: *targetEndpointKey, + PoolName: *poolName, + PoolNamespace: *poolNamespace, + ServiceName: *serviceName, + Zone: *zone, + RefreshPodsInterval: *refreshPodsInterval, + RefreshMetricsInterval: *refreshMetricsInterval, + RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, + Scheme: scheme, + Config: ctrl.GetConfigOrDie(), + Datastore: datastore, } serverRunner.Setup() diff --git a/pkg/ext-proc/metrics/README.md b/pkg/ext-proc/metrics/README.md index 1094bc23..8adfd94e 100644 --- a/pkg/ext-proc/metrics/README.md +++ b/pkg/ext-proc/metrics/README.md @@ -46,6 +46,8 @@ spec: | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | | inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | | inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA | +| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA | ## Scrape Metrics diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index 8cb7bd27..7bdc8436 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -11,9 +11,11 @@ import ( const ( InferenceModelComponent = "inference_model" + InferencePoolComponent = "inference_pool" ) var ( + // Inference Model Metrics requestCounter = compbasemetrics.NewCounterVec( &compbasemetrics.CounterOpts{ Subsystem: InferenceModelComponent, @@ -88,6 +90,27 @@ var ( }, []string{"model_name", "target_model_name"}, ) + + // Inference Pool Metrics + inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec( + &compbasemetrics.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "average_kv_cache_utilization", + Help: "The average kv cache utilization for an inference server pool.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name"}, + ) + + inferencePoolAvgQueueSize = compbasemetrics.NewGaugeVec( + &compbasemetrics.GaugeOpts{ + Subsystem: InferencePoolComponent, + Name: "average_queue_size", + Help: "The average number of requests pending in the model server queue.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"name"}, + ) ) var registerMetrics sync.Once @@ -101,6 +124,9 @@ func Register() { legacyregistry.MustRegister(responseSizes) legacyregistry.MustRegister(inputTokens) legacyregistry.MustRegister(outputTokens) + + legacyregistry.MustRegister(inferencePoolAvgKVCache) + legacyregistry.MustRegister(inferencePoolAvgQueueSize) }) } @@ -143,3 +169,11 @@ func RecordOutputTokens(modelName, targetModelName string, size int) { 
outputTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size)) } } + +func RecordInferencePoolAvgKVCache(name string, utilization float64) { + inferencePoolAvgKVCache.WithLabelValues(name).Set(utilization) +} + +func RecordInferencePoolAvgQueueSize(name string, queueSize float64) { + inferencePoolAvgQueueSize.WithLabelValues(name).Set(queueSize) +} diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index 57774b11..348f707e 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -15,6 +15,8 @@ const RequestSizesMetric = InferenceModelComponent + "_request_sizes" const ResponseSizesMetric = InferenceModelComponent + "_response_sizes" const InputTokensMetric = InferenceModelComponent + "_input_tokens" const OutputTokensMetric = InferenceModelComponent + "_output_tokens" +const KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" +const QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" func TestRecordRequestCounterandSizes(t *testing.T) { type requests struct { @@ -257,3 +259,53 @@ func TestRecordResponseMetrics(t *testing.T) { }) } } + +func TestInferencePoolMetrics(t *testing.T) { + scenarios := []struct { + name string + poolName string + kvCacheAvg float64 + queueSizeAvg float64 + }{ + { + name: "basic test", + poolName: "p1", + kvCacheAvg: 0.3, + queueSizeAvg: 0.4, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + + RecordInferencePoolAvgKVCache(scenario.poolName, scenario.kvCacheAvg) + RecordInferencePoolAvgQueueSize(scenario.poolName, scenario.queueSizeAvg) + + wantKVCache, err := os.Open("testdata/kv_cache_avg_metrics") + defer func() { + if err := wantKVCache.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantKVCache, KVCacheAvgUsageMetric); err != nil { + 
t.Error(err) + } + + wantQueueSize, err := os.Open("testdata/queue_avg_size_metrics") + defer func() { + if err := wantQueueSize.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantQueueSize, QueueAvgSizeMetric); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics b/pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics new file mode 100644 index 00000000..99d1a93a --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics @@ -0,0 +1,3 @@ +# HELP inference_pool_average_kv_cache_utilization [ALPHA] The average kv cache utilization for an inference server pool. +# TYPE inference_pool_average_kv_cache_utilization gauge +inference_pool_average_kv_cache_utilization{name="p1"} 0.3 diff --git a/pkg/ext-proc/metrics/testdata/queue_avg_size_metrics b/pkg/ext-proc/metrics/testdata/queue_avg_size_metrics new file mode 100644 index 00000000..3605740c --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/queue_avg_size_metrics @@ -0,0 +1,3 @@ +# HELP inference_pool_average_queue_size [ALPHA] The average number of requests pending in the model server queue. +# TYPE inference_pool_average_queue_size gauge +inference_pool_average_queue_size{name="p1"} 0.4 diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 1c9c1b2e..bf666f1f 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -19,42 +19,45 @@ import ( // ExtProcServerRunner provides methods to manage an external process server. 
type ExtProcServerRunner struct { - GrpcPort int - TargetEndpointKey string - PoolName string - PoolNamespace string - ServiceName string - Zone string - RefreshPodsInterval time.Duration - RefreshMetricsInterval time.Duration - Scheme *runtime.Scheme - Config *rest.Config - Datastore *backend.K8sDatastore - manager ctrl.Manager + GrpcPort int + TargetEndpointKey string + PoolName string + PoolNamespace string + ServiceName string + Zone string + RefreshPodsInterval time.Duration + RefreshMetricsInterval time.Duration + RefreshPrometheusMetricsInterval time.Duration + Scheme *runtime.Scheme + Config *rest.Config + Datastore *backend.K8sDatastore + manager ctrl.Manager } // Default values for CLI flags in main const ( - DefaultGrpcPort = 9002 // default for --grpcPort - DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey - DefaultPoolName = "" // required but no default - DefaultPoolNamespace = "default" // default for --poolNamespace - DefaultServiceName = "" // required but no default - DefaultZone = "" // default for --zone - DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval - DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval + DefaultGrpcPort = 9002 // default for --grpcPort + DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey + DefaultPoolName = "" // required but no default + DefaultPoolNamespace = "default" // default for --poolNamespace + DefaultServiceName = "" // required but no default + DefaultZone = "" // default for --zone + DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval + DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval + DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval ) func NewDefaultExtProcServerRunner() *ExtProcServerRunner { return 
&ExtProcServerRunner{ - GrpcPort: DefaultGrpcPort, - TargetEndpointKey: DefaultTargetEndpointKey, - PoolName: DefaultPoolName, - PoolNamespace: DefaultPoolNamespace, - ServiceName: DefaultServiceName, - Zone: DefaultZone, - RefreshPodsInterval: DefaultRefreshPodsInterval, - RefreshMetricsInterval: DefaultRefreshMetricsInterval, + GrpcPort: DefaultGrpcPort, + TargetEndpointKey: DefaultTargetEndpointKey, + PoolName: DefaultPoolName, + PoolNamespace: DefaultPoolNamespace, + ServiceName: DefaultServiceName, + Zone: DefaultZone, + RefreshPodsInterval: DefaultRefreshPodsInterval, + RefreshMetricsInterval: DefaultRefreshMetricsInterval, + RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, // Scheme, Config, and Datastore can be assigned later. } } @@ -123,7 +126,7 @@ func (r *ExtProcServerRunner) Start( // Initialize backend provider pp := backend.NewProvider(podMetricsClient, podDatastore) - if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval); err != nil { + if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { klog.Fatalf("Failed to initialize backend provider: %v", err) } diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index 9ff61d8b..abaeedbb 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -21,11 +21,12 @@ var ( svrAddr = flag.String("server_address", fmt.Sprintf("localhost:%d", runserver.DefaultGrpcPort), "Address of the ext proc server") totalRequests = flag.Int("total_requests", 100000, "number of requests to be sent for load test") // Flags when running a local ext proc server. 
- numFakePods = flag.Int("num_fake_pods", 200, "number of fake pods when running a local ext proc server") - numModelsPerPod = flag.Int("num_models_per_pod", 5, "number of fake models per pod when running a local ext proc server") - localServer = flag.Bool("local_server", true, "whether to start a local ext proc server") - refreshPodsInterval = flag.Duration("refreshPodsInterval", 10*time.Second, "interval to refresh pods") - refreshMetricsInterval = flag.Duration("refreshMetricsInterval", 50*time.Millisecond, "interval to refresh metrics") + numFakePods = flag.Int("num_fake_pods", 200, "number of fake pods when running a local ext proc server") + numModelsPerPod = flag.Int("num_models_per_pod", 5, "number of fake models per pod when running a local ext proc server") + localServer = flag.Bool("local_server", true, "whether to start a local ext proc server") + refreshPodsInterval = flag.Duration("refreshPodsInterval", 10*time.Second, "interval to refresh pods") + refreshMetricsInterval = flag.Duration("refreshMetricsInterval", 50*time.Millisecond, "interval to refresh metrics via polling pods") + refreshPrometheusMetricsInterval = flag.Duration("refreshPrometheusMetricsInterval", 5*time.Second, "interval to flush prometheus metrics") ) const ( @@ -37,7 +38,7 @@ func main() { flag.Parse() if *localServer { - test.StartExtProc(port, *refreshPodsInterval, *refreshMetricsInterval, fakePods(), fakeModels()) + test.StartExtProc(port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) time.Sleep(time.Second) // wait until server is up klog.Info("Server started") } diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index 63972849..98793b95 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -16,7 +16,7 @@ import ( klog "k8s.io/klog/v2" ) -func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval time.Duration, pods []*backend.PodMetrics, models 
map[string]*v1alpha1.InferenceModel) *grpc.Server { +func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { ps := make(backend.PodSet) pms := make(map[backend.Pod]*backend.PodMetrics) for _, pod := range pods { @@ -25,7 +25,7 @@ func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval time.Dur } pmc := &backend.FakePodMetricsClient{Res: pms} pp := backend.NewProvider(pmc, backend.NewK8sDataStore(backend.WithPods(pods))) - if err := pp.Init(refreshPodsInterval, refreshMetricsInterval); err != nil { + if err := pp.Init(refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { klog.Fatalf("failed to initialize: %v", err) } return startExtProc(port, pp, models) From 836ef57d27cb4424d5e87f12ea2bde0aec4646a1 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Mon, 10 Feb 2025 12:49:57 -0700 Subject: [PATCH 05/96] Move getting started guide to docs site (#308) * Link to v0.1.0 getting started guide * Moving getting started guide to the site * site doesnt support markdown syntax for ordered lists, making explicit * fiddling with mkdocs syntax --- pkg/README.md | 95 +--------------------------------------- site-src/guides/index.md | 87 +++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 95 deletions(-) diff --git a/pkg/README.md b/pkg/README.md index 04ebfde2..b53ef777 100644 --- a/pkg/README.md +++ b/pkg/README.md @@ -1,96 +1,3 @@ ## Quickstart -This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! 
- -### Requirements - - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - - A cluster with: - - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, - you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. - -### Steps - -1. **Deploy Sample Model Server** - - Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. - Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. - ```bash - kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml - ``` - -1. **Install the Inference Extension CRDs:** - - ```sh - kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd - ``` - -1. **Deploy InferenceModel** - - Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` - [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml - ``` - -1. **Update Envoy Gateway Config to enable Patch Policy** - - Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. 
To do this, simply run: - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml - kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system - ``` - Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. - -1. **Deploy Gateway** - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml - ``` - > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** - - Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: - ```bash - $ kubectl get gateway inference-gateway - NAME CLASS ADDRESS PROGRAMMED AGE - inference-gateway inference-gateway True 22s - ``` - -1. **Deploy the Inference Extension and InferencePool** - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml - ``` - -1. **Deploy Envoy Gateway Custom Policies** - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml - ``` - > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. - -1. 
**OPTIONALLY**: Apply Traffic Policy - - For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. - - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml - ``` - -1. **Try it out** - - Wait until the gateway is ready. - - ```bash - IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') - PORT=8081 - - curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ - "model": "tweet-summary", - "prompt": "Write as if you were a critic: San Francisco", - "max_tokens": 100, - "temperature": 0 - }' - ``` \ No newline at end of file +Please refer to our Getting started guide here: https://gateway-api-inference-extension.sigs.k8s.io/guides/ \ No newline at end of file diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 92f6412a..e4cbec6f 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -1,3 +1,88 @@ # Getting started with Gateway API Inference Extension -TODO \ No newline at end of file +This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! + +### Requirements + - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher + - A cluster with: + - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, + you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). + - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. + +### Steps + +1. 
**Deploy Sample Model Server** + + Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. + Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. + ```bash + kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml + ``` +1. **Install the Inference Extension CRDs:** + + ```sh + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml + +1. **Deploy InferenceModel** + + Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` + [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml + ``` +1. **Update Envoy Gateway Config to enable Patch Policy** + + Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml + kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system + ``` + Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. +1. 
**Deploy Gateway** + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml + ``` + > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** + + Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: + ```bash + $ kubectl get gateway inference-gateway + NAME CLASS ADDRESS PROGRAMMED AGE + inference-gateway inference-gateway True 22s + ``` +1. **Deploy the Inference Extension and InferencePool** + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml + ``` +1. **Deploy Envoy Gateway Custom Policies** + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml + ``` + > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. +1. **OPTIONALLY**: Apply Traffic Policy + + For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. + + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml + ``` +1. **Try it out** + + Wait until the gateway is ready. 
+ + ```bash + IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') + PORT=8081 + + curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "tweet-summary", + "prompt": "Write as if you were a critic: San Francisco", + "max_tokens": 100, + "temperature": 0 + }' + ``` \ No newline at end of file From 6dd58f2546a1a4109bb836611db6634928f74d01 Mon Sep 17 00:00:00 2001 From: Tim Flannagan Date: Mon, 10 Feb 2025 15:05:57 -0500 Subject: [PATCH 06/96] site-source: Fix 'Bakcground' misspell in API concepts page (#309) Signed-off-by: timflannagan --- site-src/concepts/api-overview.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site-src/concepts/api-overview.md b/site-src/concepts/api-overview.md index 94e76251..6c4c9ecd 100644 --- a/site-src/concepts/api-overview.md +++ b/site-src/concepts/api-overview.md @@ -1,7 +1,7 @@ # API Overview -## Bakcground -The Gateway API Inference Extension project is an extension of the Kubernetes Gateway API for serving Generative AI models on Kubernetes. Gateway API Inference Extension facilitates standardization of APIs for Kubernetes cluster operators and developers running generative AI inference, while allowing flexibility for underlying gateway implementations (such as Envoy Proxy) to iterate on mechanisms for optimized serving of models. +## Background +The Gateway API Inference Extension project is an extension of the Kubernetes Gateway API for serving Generative AI models on Kubernetes. Gateway API Inference Extension facilitates standardization of APIs for Kubernetes cluster operators and developers running generative AI inference, while allowing flexibility for underlying gateway implementations (such as Envoy Proxy) to iterate on mechanisms for optimized serving of models. 
Overview of API integration From d74eefa69e573481ff88ab7732f3ac40a1121e38 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:31:58 -0700 Subject: [PATCH 07/96] Mkdocs fixes (#314) * Link to v0.1.0 getting started guide * Moving getting started guide to the site * site doesnt support markdown syntax for ordered lists, making explicit * fiddling with mkdocs syntax * mkdocs fixes --- site-src/concepts/api-overview.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site-src/concepts/api-overview.md b/site-src/concepts/api-overview.md index 6c4c9ecd..9c5c0416 100644 --- a/site-src/concepts/api-overview.md +++ b/site-src/concepts/api-overview.md @@ -9,8 +9,8 @@ The Gateway API Inference Extension project is an extension of the Kubernetes Ga ### InferencePool -InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. Within the broader Gateway API resource model, this resource is considered a "backend". In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. With InferenceModel, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool.md) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). +InferencePool represents a set of Inference-focused Pods and an extension that will be used to route to them. Within the broader Gateway API resource model, this resource is considered a "backend". In practice, that means that you'd replace a Kubernetes Service with an InferencePool. This resource has some similarities to Service (a way to select Pods and specify a port), but has some unique capabilities. 
With InferenceModel, you can configure a routing extension as well as inference-specific routing optimizations. For more information on this resource, refer to our [InferencePool documentation](/api-types/inferencepool) or go directly to the [InferencePool spec](/reference/spec/#inferencepool). ### InferenceModel -An InferenceModel represents a model or adapter, and configuration associated with that model. This resource enables you to configure the relative criticality of a model, and allows you to seamlessly translate the requested model name to one or more backend model names. Multiple InferenceModels can be attached to an InferencePool. For more information on this resource, refer to our [InferenceModel documentation](/api-types/inferencemodel.md) or go directly to the [InferenceModel spec](/reference/spec/#inferencemodel). +An InferenceModel represents a model or adapter, and configuration associated with that model. This resource enables you to configure the relative criticality of a model, and allows you to seamlessly translate the requested model name to one or more backend model names. Multiple InferenceModels can be attached to an InferencePool. For more information on this resource, refer to our [InferenceModel documentation](/api-types/inferencemodel) or go directly to the [InferenceModel spec](/reference/spec/#inferencemodel). From 5ad2888423ee795b3fbed1ec218eb2656e3e9bb5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Feb 2025 14:39:57 -0800 Subject: [PATCH 08/96] Bump google.golang.org/protobuf from 1.36.4 to 1.36.5 (#315) Bumps google.golang.org/protobuf from 1.36.4 to 1.36.5. --- updated-dependencies: - dependency-name: google.golang.org/protobuf dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 8dd59e3e..d774d6bd 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 google.golang.org/grpc v1.70.0 - google.golang.org/protobuf v1.36.4 + google.golang.org/protobuf v1.36.5 k8s.io/api v0.32.1 k8s.io/apiextensions-apiserver v0.32.1 k8s.io/apimachinery v0.32.1 diff --git a/go.sum b/go.sum index 6d1cd8bd..803ed988 100644 --- a/go.sum +++ b/go.sum @@ -329,8 +329,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU= google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= -google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM= -google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= +google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From b5ffb664e20289b728a660ee50f3f63f9b35a827 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Mon, 10 Feb 2025 23:37:56 +0000 Subject: [PATCH 09/96] Remove gci linter (#317) --- .golangci.yml | 1 - 1 file changed, 1 deletion(-) 
diff --git a/.golangci.yml b/.golangci.yml index 1462bcc7..2ad3b93d 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -14,7 +14,6 @@ linters: - dupword - durationcheck - fatcontext - - gci - ginkgolinter - gocritic - govet From 6c22d92eb4594e1d560740de892432b87778e2f3 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Mon, 10 Feb 2025 18:47:56 -0500 Subject: [PATCH 10/96] Adds ErrorNotFound Handling for Reconciler (#286) Signed-off-by: Daneyon Hansen --- .../backend/inferencemodel_reconciler.go | 34 +++-- .../backend/inferencemodel_reconciler_test.go | 133 ++++++++++++++++-- 2 files changed, 138 insertions(+), 29 deletions(-) diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 3164e098..1c1d2278 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -5,6 +5,7 @@ import ( "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" @@ -25,32 +26,37 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque if req.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } - klog.V(1).Infof("reconciling InferenceModel %v", req.NamespacedName) - - service := &v1alpha1.InferenceModel{} - if err := c.Get(ctx, req.NamespacedName, service); err != nil { - klog.Error(err, "unable to get InferencePool") + klog.V(1).Infof("Reconciling InferenceModel %v", req.NamespacedName) + + infModel := &v1alpha1.InferenceModel{} + if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { + if errors.IsNotFound(err) { + klog.V(1).Infof("InferenceModel %v not found. 
Removing from datastore since object must be deleted", req.NamespacedName) + c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) + return ctrl.Result{}, nil + } + klog.Error(err, "Unable to get InferenceModel") return ctrl.Result{}, err } - c.updateDatastore(service) + c.updateDatastore(infModel) return ctrl.Result{}, nil } -func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&v1alpha1.InferenceModel{}). - Complete(c) -} - func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolNamespacedName.Name) - klog.V(1).Infof("Adding/Updating inference model: %v", infModel.Spec.ModelName) + klog.V(1).Infof("Adding/Updating InferenceModel: %v", infModel.Spec.ModelName) c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) return } - klog.V(logutil.DEFAULT).Infof("Removing/Not adding inference model: %v", infModel.Spec.ModelName) + klog.V(logutil.DEFAULT).Infof("Removing/Not adding InferenceModel: %v", infModel.Spec.ModelName) // If we get here. The model is not relevant to this pool, remove. c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) } + +func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&v1alpha1.InferenceModel{}). 
+ Complete(c) +} diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 5609ca53..45669a30 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -1,16 +1,22 @@ package backend import ( + "context" "sync" "testing" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) var ( - service1 = &v1alpha1.InferenceModel{ + infModel1 = &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ ModelName: "fake model1", PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, @@ -19,7 +25,7 @@ var ( Name: "test-service", }, } - service1Modified = &v1alpha1.InferenceModel{ + infModel1Modified = &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ ModelName: "fake model1", PoolRef: v1alpha1.PoolObjectReference{Name: "test-poolio"}, @@ -28,7 +34,7 @@ var ( Name: "test-service", }, } - service2 = &v1alpha1.InferenceModel{ + infModel2 = &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ ModelName: "fake model", PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, @@ -60,8 +66,8 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, InferenceModels: &sync.Map{}, }, - incomingService: service1, - wantInferenceModels: populateServiceMap(service1), + incomingService: infModel1, + wantInferenceModels: populateServiceMap(infModel1), }, { name: "Removing existing service.", @@ -75,9 +81,9 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(service1), + InferenceModels: populateServiceMap(infModel1), }, - incomingService: service1Modified, + 
incomingService: infModel1Modified, wantInferenceModels: populateServiceMap(), }, { @@ -92,7 +98,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(service1), + InferenceModels: populateServiceMap(infModel1), }, incomingService: &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ @@ -103,7 +109,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { Name: "unrelated-service", }, }, - wantInferenceModels: populateServiceMap(service1), + wantInferenceModels: populateServiceMap(infModel1), }, { name: "Add to existing", @@ -117,27 +123,124 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(service1), + InferenceModels: populateServiceMap(infModel1), }, - incomingService: service2, - wantInferenceModels: populateServiceMap(service1, service2), + incomingService: infModel2, + wantInferenceModels: populateServiceMap(infModel1, infModel2), }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - InferenceModelReconciler := &InferenceModelReconciler{ + reconciler := &InferenceModelReconciler{ Datastore: test.datastore, PoolNamespacedName: types.NamespacedName{Name: test.datastore.inferencePool.Name}, } - InferenceModelReconciler.updateDatastore(test.incomingService) + reconciler.updateDatastore(test.incomingService) - if ok := mapsEqual(InferenceModelReconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok { + if ok := mapsEqual(reconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok { t.Error("Maps are not equal") } }) } } +func TestReconcile_ResourceNotFound(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = v1alpha1.AddToScheme(scheme) + + // Create a fake client with no InferenceModel objects. 
+ fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() + + // Create a minimal datastore. + datastore := &K8sDatastore{ + InferenceModels: &sync.Map{}, + inferencePool: &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }, + } + + // Create the reconciler. + reconciler := &InferenceModelReconciler{ + Client: fakeClient, + Scheme: scheme, + Record: record.NewFakeRecorder(10), + Datastore: datastore, + PoolNamespacedName: types.NamespacedName{Name: "test-pool"}, + } + + // Create a request for a non-existent resource. + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "non-existent-model", Namespace: "default"}} + + // Call Reconcile. + result, err := reconciler.Reconcile(context.Background(), req) + if err != nil { + t.Fatalf("expected no error when resource is not found, got %v", err) + } + + // Check that no requeue is requested. + if result.Requeue || result.RequeueAfter != 0 { + t.Errorf("expected no requeue, got %+v", result) + } +} + +func TestReconcile_ResourceExists(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = v1alpha1.AddToScheme(scheme) + + // Create an InferenceModel object. + existingModel := &v1alpha1.InferenceModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "existing-model", + Namespace: "default", + }, + Spec: v1alpha1.InferenceModelSpec{ + ModelName: "fake-model", + PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + }, + } + + // Create a fake client with the existing model. + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() + + // Create a minimal datastore. + datastore := &K8sDatastore{ + InferenceModels: &sync.Map{}, + inferencePool: &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }, + } + + // Create the reconciler. 
+ reconciler := &InferenceModelReconciler{ + Client: fakeClient, + Scheme: scheme, + Record: record.NewFakeRecorder(10), + Datastore: datastore, + PoolNamespacedName: types.NamespacedName{Name: "test-pool", Namespace: "default"}, + } + + // Create a request for the existing resource. + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "existing-model", Namespace: "default"}} + + // Call Reconcile. + result, err := reconciler.Reconcile(context.Background(), req) + if err != nil { + t.Fatalf("expected no error when resource exists, got %v", err) + } + + // Check that no requeue is requested. + if result.Requeue || result.RequeueAfter != 0 { + t.Errorf("expected no requeue, got %+v", result) + } + + // Verify that the datastore was updated. + if _, ok := datastore.InferenceModels.Load(existingModel.Spec.ModelName); !ok { + t.Errorf("expected datastore to contain model %q", existingModel.Spec.ModelName) + } +} + func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { returnVal := &sync.Map{} From d808d559535356375d07128351a55447a38fd4d8 Mon Sep 17 00:00:00 2001 From: Tim Flannagan Date: Tue, 11 Feb 2025 14:39:58 -0500 Subject: [PATCH 11/96] site-src: Replace k8sgateway with kgateway & fix spelling in roles-and-personas.md (#311) * site-src: Fix spelling/grammar issues in roles-and-personas.md Signed-off-by: timflannagan * site-src: Replace k8sgateway with kgateway in implementations.md Signed-off-by: timflannagan * site-src: Use the correct GW API naming Signed-off-by: timflannagan --------- Signed-off-by: timflannagan --- site-src/concepts/roles-and-personas.md | 6 +++--- site-src/implementations.md | 14 +++++++------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/site-src/concepts/roles-and-personas.md b/site-src/concepts/roles-and-personas.md index b11f43eb..0746adbf 100644 --- a/site-src/concepts/roles-and-personas.md +++ b/site-src/concepts/roles-and-personas.md @@ -1,10 +1,10 @@ # Roles and Personas -Before diving 
into the details of the API, decriptions of the personas these APIs were designed for will help convey the thought process of the API design. +Before diving into the details of the API, descriptions of the personas these APIs were designed for will help convey the thought process of the API design. ## Inference Platform Admin -The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads. Including handling Ops for: +The Inference Platform Admin creates and manages the infrastructure necessary to run LLM workloads, including handling Ops for: - Hardware - Model Server @@ -15,7 +15,7 @@ The Inference Platform Admin creates and manages the infrastructure necessary to ## Inference Workload Owner -An Inference Workload Owner persona owns and manages 1 or many Generative AI Workloads (LLM focused *currently*). This includes: +An Inference Workload Owner persona owns and manages one or many Generative AI Workloads (LLM focused *currently*). This includes: - Defining criticality - Managing fine-tunes diff --git a/site-src/implementations.md b/site-src/implementations.md index e2238827..89acb436 100644 --- a/site-src/implementations.md +++ b/site-src/implementations.md @@ -3,14 +3,15 @@ This project has several implementations that are planned or in progress: * [Envoy Gateway][1] -* [Gloo k8sgateway][2] +* [Kgateway][2] * [Google Kubernetes Engine][3] [1]:#envoy-gateway -[2]:#gloo-k8sgateway +[2]:#kgateway [3]:#google-kubernetes-engine ## Envoy Gateway + [Envoy Gateway][eg-home] is an [Envoy][envoy-org] subproject for managing Envoy-based application gateways. The supported APIs and fields of the Gateway API are outlined [here][eg-supported]. Use the [quickstart][eg-quickstart] to @@ -24,15 +25,15 @@ Issue](https://github.com/envoyproxy/gateway/issues/4423). 
[eg-supported]:https://gateway.envoyproxy.io/docs/tasks/quickstart/ [eg-quickstart]:https://gateway.envoyproxy.io/docs/tasks/quickstart -## Gloo k8sgateway +## Kgateway -[Gloo k8sgateway](https://k8sgateway.io/) is a feature-rich, Kubernetes-native -ingress controller and next-generation API gateway. Gloo k8sgateway brings the +[Kgateway](https://kgateway.dev/) is a feature-rich, Kubernetes-native +ingress controller and next-generation API gateway. Kgateway brings the full power and community support of Gateway API to its existing control-plane implementation. Progress towards supporting this project is tracked with a [GitHub -Issue](https://github.com/k8sgateway/k8sgateway/issues/10411). +Issue](https://github.com/kgateway-dev/kgateway/issues/10411). ## Google Kubernetes Engine @@ -53,4 +54,3 @@ Issue](https://github.com/GoogleCloudPlatform/gke-gateway-api/issues/20). [gke-gateway]:https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api [gke-gateway-deploy]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways [gke-multi-cluster-gateway]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-multi-cluster-gateways - From bdcfcf0e65442465a89842729bc23e361bb0b6ce Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Tue, 11 Feb 2025 17:07:58 -0500 Subject: [PATCH 12/96] Fix: Go Mod Imports (#318) * Replaces default API group name with inference.networking.x-k8s.io Signed-off-by: Daneyon Hansen * Renames go mod prefix to sigs.k8s.io Signed-off-by: Daneyon Hansen --------- Signed-off-by: Daneyon Hansen --- api/doc.go | 17 +++++++++ api/v1alpha1/doc.go | 23 ++++++++++++ .../api/v1alpha1/extension.go | 2 +- .../api/v1alpha1/extensionconnection.go | 2 +- .../api/v1alpha1/inferencemodel.go | 2 +- .../api/v1alpha1/inferencemodelspec.go | 2 +- .../api/v1alpha1/inferencepool.go | 2 +- .../api/v1alpha1/inferencepoolspec.go | 2 +- client-go/applyconfiguration/utils.go | 8 ++--- client-go/clientset/versioned/clientset.go | 16 ++++----- 
.../versioned/fake/clientset_generated.go | 14 ++++---- .../clientset/versioned/fake/register.go | 4 +-- .../clientset/versioned/scheme/register.go | 4 +-- .../typed/api/v1alpha1/api_client.go | 36 +++++++++---------- .../api/v1alpha1/fake/fake_api_client.go | 10 +++--- .../api/v1alpha1/fake/fake_inferencemodel.go | 10 +++--- .../api/v1alpha1/fake/fake_inferencepool.go | 10 +++--- .../typed/api/v1alpha1/inferencemodel.go | 8 ++--- .../typed/api/v1alpha1/inferencepool.go | 8 ++--- .../externalversions/api/interface.go | 4 +-- .../api/v1alpha1/inferencemodel.go | 12 +++---- .../api/v1alpha1/inferencepool.go | 12 +++---- .../api/v1alpha1/interface.go | 2 +- .../informers/externalversions/factory.go | 10 +++--- .../informers/externalversions/generic.go | 8 ++--- .../internalinterfaces/factory_interfaces.go | 2 +- .../listers/api/v1alpha1/inferencemodel.go | 2 +- .../listers/api/v1alpha1/inferencepool.go | 2 +- go.mod | 2 +- hack/update-codegen.sh | 2 +- pkg/ext-proc/backend/datastore.go | 4 +-- pkg/ext-proc/backend/datastore_test.go | 2 +- .../backend/endpointslice_reconciler.go | 4 +-- .../backend/endpointslice_reconcilier_test.go | 2 +- pkg/ext-proc/backend/fake.go | 2 +- .../backend/inferencemodel_reconciler.go | 4 +-- .../backend/inferencemodel_reconciler_test.go | 2 +- .../backend/inferencepool_reconciler.go | 2 +- .../backend/inferencepool_reconciler_test.go | 2 +- pkg/ext-proc/backend/provider.go | 4 +-- pkg/ext-proc/backend/vllm/metrics.go | 4 +-- pkg/ext-proc/backend/vllm/metrics_test.go | 2 +- pkg/ext-proc/handlers/request.go | 6 ++-- pkg/ext-proc/handlers/response.go | 2 +- pkg/ext-proc/handlers/server.go | 10 +++--- pkg/ext-proc/health.go | 2 +- pkg/ext-proc/main.go | 10 +++--- pkg/ext-proc/scheduling/filter.go | 4 +-- pkg/ext-proc/scheduling/filter_test.go | 2 +- pkg/ext-proc/scheduling/scheduler.go | 4 +-- pkg/ext-proc/server/runserver.go | 6 ++-- pkg/ext-proc/test/benchmark/benchmark.go | 8 ++--- pkg/ext-proc/test/utils.go | 8 ++--- 
test/e2e/e2e_suite_test.go | 4 +-- test/e2e/e2e_test.go | 4 +-- test/integration/hermetic_test.go | 8 ++--- test/utils/utils.go | 2 +- test/utils/wrappers.go | 2 +- 58 files changed, 197 insertions(+), 157 deletions(-) create mode 100644 api/doc.go create mode 100644 api/v1alpha1/doc.go diff --git a/api/doc.go b/api/doc.go new file mode 100644 index 00000000..c91adb92 --- /dev/null +++ b/api/doc.go @@ -0,0 +1,17 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package api diff --git a/api/v1alpha1/doc.go b/api/v1alpha1/doc.go new file mode 100644 index 00000000..8e970ced --- /dev/null +++ b/api/v1alpha1/doc.go @@ -0,0 +1,23 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the +// inference.networking.x-k8s.io API group. 
+// +// +k8s:openapi-gen=true +// +kubebuilder:object:generate=true +// +groupName=inference.networking.x-k8s.io +package v1alpha1 diff --git a/client-go/applyconfiguration/api/v1alpha1/extension.go b/client-go/applyconfiguration/api/v1alpha1/extension.go index 27807448..4213af88 100644 --- a/client-go/applyconfiguration/api/v1alpha1/extension.go +++ b/client-go/applyconfiguration/api/v1alpha1/extension.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // ExtensionApplyConfiguration represents a declarative configuration of the Extension type for use diff --git a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go b/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go index be9eeaa1..ff8752a9 100644 --- a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go +++ b/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go @@ -18,7 +18,7 @@ limitations under the License. 
package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // ExtensionConnectionApplyConfiguration represents a declarative configuration of the ExtensionConnection type for use diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go index b6201467..d2a5b2b4 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go @@ -39,7 +39,7 @@ func InferenceModel(name, namespace string) *InferenceModelApplyConfiguration { b.WithName(name) b.WithNamespace(namespace) b.WithKind("InferenceModel") - b.WithAPIVersion("api/v1alpha1") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha1") return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go index 9bbdda06..2b1a4cbf 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go @@ -18,7 +18,7 @@ limitations under the License. 
package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferenceModelSpecApplyConfiguration represents a declarative configuration of the InferenceModelSpec type for use diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go b/client-go/applyconfiguration/api/v1alpha1/inferencepool.go index a7f3ed6d..2940143e 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepool.go @@ -39,7 +39,7 @@ func InferencePool(name, namespace string) *InferencePoolApplyConfiguration { b.WithName(name) b.WithNamespace(namespace) b.WithKind("InferencePool") - b.WithAPIVersion("api/v1alpha1") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha1") return b } diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go index e132f74b..5f69a154 100644 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go +++ b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 1a71b674..677fa6e3 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -18,19 +18,19 @@ limitations under the License. 
package applyconfiguration import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - internal "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" testing "k8s.io/client-go/testing" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + internal "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" ) // ForKind returns an apply configuration type for the given GroupVersionKind, or nil if no // apply configuration type exists for the given GroupVersionKind. func ForKind(kind schema.GroupVersionKind) interface{} { switch kind { - // Group=api, Version=v1alpha1 + // Group=inference.networking.x-k8s.io, Version=v1alpha1 case v1alpha1.SchemeGroupVersion.WithKind("EndpointPickerConfig"): return &apiv1alpha1.EndpointPickerConfigApplyConfiguration{} case v1alpha1.SchemeGroupVersion.WithKind("Extension"): diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index 18e3236a..b7ebc1d8 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -21,26 +21,26 @@ import ( fmt "fmt" http "net/http" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" ) type Interface interface { Discovery() discovery.DiscoveryInterface - ApiV1alpha1() 
apiv1alpha1.ApiV1alpha1Interface + InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface } // Clientset contains the clients for groups. type Clientset struct { *discovery.DiscoveryClient - apiV1alpha1 *apiv1alpha1.ApiV1alpha1Client + inferenceV1alpha1 *inferencev1alpha1.InferenceV1alpha1Client } -// ApiV1alpha1 retrieves the ApiV1alpha1Client -func (c *Clientset) ApiV1alpha1() apiv1alpha1.ApiV1alpha1Interface { - return c.apiV1alpha1 +// InferenceV1alpha1 retrieves the InferenceV1alpha1Client +func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { + return c.inferenceV1alpha1 } // Discovery retrieves the DiscoveryClient @@ -87,7 +87,7 @@ func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, var cs Clientset var err error - cs.apiV1alpha1, err = apiv1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) + cs.inferenceV1alpha1, err = inferencev1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) if err != nil { return nil, err } @@ -112,7 +112,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. func New(c rest.Interface) *Clientset { var cs Clientset - cs.apiV1alpha1 = apiv1alpha1.New(c) + cs.inferenceV1alpha1 = inferencev1alpha1.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index dda29ec6..1e54db31 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -18,15 +18,15 @@ limitations under the License. 
package fake import ( - applyconfiguration "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" - clientset "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" - fakeapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" fakediscovery "k8s.io/client-go/discovery/fake" "k8s.io/client-go/testing" + applyconfiguration "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" + clientset "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" + fakeinferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" ) // NewSimpleClientset returns a clientset that will respond with the provided objects. @@ -115,7 +115,7 @@ var ( _ testing.FakeClient = &Clientset{} ) -// ApiV1alpha1 retrieves the ApiV1alpha1Client -func (c *Clientset) ApiV1alpha1() apiv1alpha1.ApiV1alpha1Interface { - return &fakeapiv1alpha1.FakeApiV1alpha1{Fake: &c.Fake} +// InferenceV1alpha1 retrieves the InferenceV1alpha1Client +func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { + return &fakeinferencev1alpha1.FakeInferenceV1alpha1{Fake: &c.Fake} } diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index f252a096..b72a8ce3 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -18,19 +18,19 @@ limitations under the License. 
package fake import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - apiv1alpha1.AddToScheme, + inferencev1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 6e243827..c4c06158 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -18,19 +18,19 @@ limitations under the License. package scheme import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" + inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var Scheme = runtime.NewScheme() var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - apiv1alpha1.AddToScheme, + inferencev1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. 
This allows composition diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go index 84a4a0bb..8cc8a643 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go @@ -20,34 +20,34 @@ package v1alpha1 import ( http "net/http" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" rest "k8s.io/client-go/rest" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) -type ApiV1alpha1Interface interface { +type InferenceV1alpha1Interface interface { RESTClient() rest.Interface InferenceModelsGetter InferencePoolsGetter } -// ApiV1alpha1Client is used to interact with features provided by the api group. -type ApiV1alpha1Client struct { +// InferenceV1alpha1Client is used to interact with features provided by the inference.networking.x-k8s.io group. +type InferenceV1alpha1Client struct { restClient rest.Interface } -func (c *ApiV1alpha1Client) InferenceModels(namespace string) InferenceModelInterface { +func (c *InferenceV1alpha1Client) InferenceModels(namespace string) InferenceModelInterface { return newInferenceModels(c, namespace) } -func (c *ApiV1alpha1Client) InferencePools(namespace string) InferencePoolInterface { +func (c *InferenceV1alpha1Client) InferencePools(namespace string) InferencePoolInterface { return newInferencePools(c, namespace) } -// NewForConfig creates a new ApiV1alpha1Client for the given config. +// NewForConfig creates a new InferenceV1alpha1Client for the given config. // NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), // where httpClient was generated with rest.HTTPClientFor(c). 
-func NewForConfig(c *rest.Config) (*ApiV1alpha1Client, error) { +func NewForConfig(c *rest.Config) (*InferenceV1alpha1Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err @@ -59,9 +59,9 @@ func NewForConfig(c *rest.Config) (*ApiV1alpha1Client, error) { return NewForConfigAndClient(&config, httpClient) } -// NewForConfigAndClient creates a new ApiV1alpha1Client for the given config and http client. +// NewForConfigAndClient creates a new InferenceV1alpha1Client for the given config and http client. // Note the http client provided takes precedence over the configured transport values. -func NewForConfigAndClient(c *rest.Config, h *http.Client) (*ApiV1alpha1Client, error) { +func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha1Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err @@ -70,12 +70,12 @@ func NewForConfigAndClient(c *rest.Config, h *http.Client) (*ApiV1alpha1Client, if err != nil { return nil, err } - return &ApiV1alpha1Client{client}, nil + return &InferenceV1alpha1Client{client}, nil } -// NewForConfigOrDie creates a new ApiV1alpha1Client for the given config and +// NewForConfigOrDie creates a new InferenceV1alpha1Client for the given config and // panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *ApiV1alpha1Client { +func NewForConfigOrDie(c *rest.Config) *InferenceV1alpha1Client { client, err := NewForConfig(c) if err != nil { panic(err) @@ -83,9 +83,9 @@ func NewForConfigOrDie(c *rest.Config) *ApiV1alpha1Client { return client } -// New creates a new ApiV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *ApiV1alpha1Client { - return &ApiV1alpha1Client{c} +// New creates a new InferenceV1alpha1Client for the given RESTClient. 
+func New(c rest.Interface) *InferenceV1alpha1Client { + return &InferenceV1alpha1Client{c} } func setConfigDefaults(config *rest.Config) error { @@ -103,7 +103,7 @@ func setConfigDefaults(config *rest.Config) error { // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. -func (c *ApiV1alpha1Client) RESTClient() rest.Interface { +func (c *InferenceV1alpha1Client) RESTClient() rest.Interface { if c == nil { return nil } diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go index d5dbc1a8..1dee0f20 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go @@ -18,26 +18,26 @@ limitations under the License. package fake import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" rest "k8s.io/client-go/rest" testing "k8s.io/client-go/testing" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" ) -type FakeApiV1alpha1 struct { +type FakeInferenceV1alpha1 struct { *testing.Fake } -func (c *FakeApiV1alpha1) InferenceModels(namespace string) v1alpha1.InferenceModelInterface { +func (c *FakeInferenceV1alpha1) InferenceModels(namespace string) v1alpha1.InferenceModelInterface { return newFakeInferenceModels(c, namespace) } -func (c *FakeApiV1alpha1) InferencePools(namespace string) v1alpha1.InferencePoolInterface { +func (c *FakeInferenceV1alpha1) InferencePools(namespace string) v1alpha1.InferencePoolInterface { return newFakeInferencePools(c, namespace) } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. 
-func (c *FakeApiV1alpha1) RESTClient() rest.Interface { +func (c *FakeInferenceV1alpha1) RESTClient() rest.Interface { var ret *rest.RESTClient return ret } diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go index e33b311d..44007ae7 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go @@ -18,19 +18,19 @@ limitations under the License. package fake import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" gentype "k8s.io/client-go/gentype" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + typedapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" ) // fakeInferenceModels implements InferenceModelInterface type fakeInferenceModels struct { *gentype.FakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration] - Fake *FakeApiV1alpha1 + Fake *FakeInferenceV1alpha1 } -func newFakeInferenceModels(fake *FakeApiV1alpha1, namespace string) typedapiv1alpha1.InferenceModelInterface { +func newFakeInferenceModels(fake *FakeInferenceV1alpha1, namespace string) typedapiv1alpha1.InferenceModelInterface { return &fakeInferenceModels{ gentype.NewFakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration]( fake.Fake, diff --git 
a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go index 92bc5cbe..cd0764aa 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go @@ -18,19 +18,19 @@ limitations under the License. package fake import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" gentype "k8s.io/client-go/gentype" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + typedapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" ) // fakeInferencePools implements InferencePoolInterface type fakeInferencePools struct { *gentype.FakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration] - Fake *FakeApiV1alpha1 + Fake *FakeInferenceV1alpha1 } -func newFakeInferencePools(fake *FakeApiV1alpha1, namespace string) typedapiv1alpha1.InferencePoolInterface { +func newFakeInferencePools(fake *FakeInferenceV1alpha1, namespace string) typedapiv1alpha1.InferencePoolInterface { return &fakeInferencePools{ gentype.NewFakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration]( fake.Fake, diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go index 1f5315ad..4c7c5941 100644 --- 
a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go @@ -20,13 +20,13 @@ package v1alpha1 import ( context "context" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" gentype "k8s.io/client-go/gentype" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + applyconfigurationapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) // InferenceModelsGetter has a method to return a InferenceModelInterface. 
@@ -59,7 +59,7 @@ type inferenceModels struct { } // newInferenceModels returns a InferenceModels -func newInferenceModels(c *ApiV1alpha1Client, namespace string) *inferenceModels { +func newInferenceModels(c *InferenceV1alpha1Client, namespace string) *inferenceModels { return &inferenceModels{ gentype.NewClientWithListAndApply[*apiv1alpha1.InferenceModel, *apiv1alpha1.InferenceModelList, *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration]( "inferencemodels", diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go index 46a2b378..9af91801 100644 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go +++ b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go @@ -20,13 +20,13 @@ package v1alpha1 import ( context "context" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" gentype "k8s.io/client-go/gentype" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + applyconfigurationapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" ) // InferencePoolsGetter has a method to return a InferencePoolInterface. 
@@ -59,7 +59,7 @@ type inferencePools struct { } // newInferencePools returns a InferencePools -func newInferencePools(c *ApiV1alpha1Client, namespace string) *inferencePools { +func newInferencePools(c *InferenceV1alpha1Client, namespace string) *inferencePools { return &inferencePools{ gentype.NewClientWithListAndApply[*apiv1alpha1.InferencePool, *apiv1alpha1.InferencePoolList, *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration]( "inferencepools", diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 6ca4f9da..fbf5ba09 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -18,8 +18,8 @@ limitations under the License. package api import ( - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to each of this group's versions. 
diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go index f887ff4a..a1522e48 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go @@ -21,14 +21,14 @@ import ( context "context" time "time" - gatewayapiinferenceextensionapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" ) // InferenceModelInformer provides access to a shared informer and lister for @@ -61,13 +61,13 @@ func NewFilteredInferenceModelInformer(client versioned.Interface, namespace str if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferenceModels(namespace).List(context.TODO(), options) + return client.InferenceV1alpha1().InferenceModels(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return 
client.ApiV1alpha1().InferenceModels(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha1().InferenceModels(namespace).Watch(context.TODO(), options) }, }, &gatewayapiinferenceextensionapiv1alpha1.InferenceModel{}, diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go index 2311a025..27f2d29e 100644 --- a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go +++ b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go @@ -21,14 +21,14 @@ import ( context "context" time "time" - gatewayapiinferenceextensionapiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" ) // InferencePoolInformer provides access to a shared informer and lister for @@ -61,13 +61,13 @@ func NewFilteredInferencePoolInformer(client versioned.Interface, namespace stri if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferencePools(namespace).List(context.TODO(), options) + return 
client.InferenceV1alpha1().InferencePools(namespace).List(context.TODO(), options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } - return client.ApiV1alpha1().InferencePools(namespace).Watch(context.TODO(), options) + return client.InferenceV1alpha1().InferencePools(namespace).Watch(context.TODO(), options) }, }, &gatewayapiinferenceextensionapiv1alpha1.InferencePool{}, diff --git a/client-go/informers/externalversions/api/v1alpha1/interface.go b/client-go/informers/externalversions/api/v1alpha1/interface.go index 9ba07025..3ea6d988 100644 --- a/client-go/informers/externalversions/api/v1alpha1/interface.go +++ b/client-go/informers/externalversions/api/v1alpha1/interface.go @@ -18,7 +18,7 @@ limitations under the License. package v1alpha1 import ( - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to all the informers in this group version. 
diff --git a/client-go/informers/externalversions/factory.go b/client-go/informers/externalversions/factory.go index 39c96068..c06ea464 100644 --- a/client-go/informers/externalversions/factory.go +++ b/client-go/informers/externalversions/factory.go @@ -22,13 +22,13 @@ import ( sync "sync" time "time" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - api "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api" - internalinterfaces "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + api "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // SharedInformerOption defines the functional option type for SharedInformerFactory. @@ -253,9 +253,9 @@ type SharedInformerFactory interface { // client. 
InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer - Api() api.Interface + Inference() api.Interface } -func (f *sharedInformerFactory) Api() api.Interface { +func (f *sharedInformerFactory) Inference() api.Interface { return api.New(f, f.namespace, f.tweakListOptions) } diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index a5f15f73..672998f5 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -20,9 +20,9 @@ package externalversions import ( fmt "fmt" - v1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" + v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // GenericInformer is type of SharedIndexInformer which will locate and delegate to other @@ -51,11 +51,11 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=api, Version=v1alpha1 + // Group=inference.networking.x-k8s.io, Version=v1alpha1 case v1alpha1.SchemeGroupVersion.WithResource("inferencemodels"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Api().V1alpha1().InferenceModels().Informer()}, nil + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha1().InferenceModels().Informer()}, nil case v1alpha1.SchemeGroupVersion.WithResource("inferencepools"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Api().V1alpha1().InferencePools().Informer()}, nil + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha1().InferencePools().Informer()}, nil } diff --git 
a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go index 488aca6f..5b70862a 100644 --- a/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go +++ b/client-go/informers/externalversions/internalinterfaces/factory_interfaces.go @@ -20,10 +20,10 @@ package internalinterfaces import ( time "time" - versioned "inference.networking.x-k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" cache "k8s.io/client-go/tools/cache" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" ) // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. diff --git a/client-go/listers/api/v1alpha1/inferencemodel.go b/client-go/listers/api/v1alpha1/inferencemodel.go index b0c33b61..b4342842 100644 --- a/client-go/listers/api/v1alpha1/inferencemodel.go +++ b/client-go/listers/api/v1alpha1/inferencemodel.go @@ -18,10 +18,10 @@ limitations under the License. package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" labels "k8s.io/apimachinery/pkg/labels" listers "k8s.io/client-go/listers" cache "k8s.io/client-go/tools/cache" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferenceModelLister helps list InferenceModels. diff --git a/client-go/listers/api/v1alpha1/inferencepool.go b/client-go/listers/api/v1alpha1/inferencepool.go index 0b0c1d6e..387daf39 100644 --- a/client-go/listers/api/v1alpha1/inferencepool.go +++ b/client-go/listers/api/v1alpha1/inferencepool.go @@ -18,10 +18,10 @@ limitations under the License. 
package v1alpha1 import ( - apiv1alpha1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" labels "k8s.io/apimachinery/pkg/labels" listers "k8s.io/client-go/listers" cache "k8s.io/client-go/tools/cache" + apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferencePoolLister helps list InferencePools. diff --git a/go.mod b/go.mod index d774d6bd..c89080ae 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module inference.networking.x-k8s.io/gateway-api-inference-extension +module sigs.k8s.io/gateway-api-inference-extension go 1.23.0 diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index cfe75f81..c825507b 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -23,7 +23,7 @@ echo "$SCRIPT_ROOT script" CODEGEN_PKG=${2:-bin} echo $CODEGEN_PKG source "${CODEGEN_PKG}/kube_codegen.sh" -THIS_PKG="inference.networking.x-k8s.io/gateway-api-inference-extension" +THIS_PKG="sigs.k8s.io/gateway-api-inference-extension" kube::codegen::gen_helpers \ diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index b466a2ed..3208be26 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -5,10 +5,10 @@ import ( "math/rand" "sync" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" corev1 "k8s.io/api/core/v1" "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func NewK8sDataStore(options ...K8sDatastoreOption) *K8sDatastore { diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go index 323b3bb0..0fc5da1a 100644 --- a/pkg/ext-proc/backend/datastore_test.go +++ b/pkg/ext-proc/backend/datastore_test.go @@ -3,8 +3,8 @@ package backend import ( "testing" - 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) func TestHasSynced(t *testing.T) { diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go index a2a9790f..ebc182b8 100644 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ b/pkg/ext-proc/backend/endpointslice_reconciler.go @@ -5,8 +5,6 @@ import ( "strconv" "time" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" discoveryv1 "k8s.io/api/discovery/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" @@ -15,6 +13,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) var ( diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go index e3c927ba..9a3d55d8 100644 --- a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go +++ b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go @@ -4,9 +4,9 @@ import ( "sync" "testing" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" v1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var ( diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index c4545497..8c028b77 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -3,8 +3,8 @@ package backend import ( "context" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" klog "k8s.io/klog/v2" + 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) type FakePodMetricsClient struct { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 1c1d2278..02394baa 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -3,8 +3,6 @@ package backend import ( "context" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -12,6 +10,8 @@ import ( "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type InferenceModelReconciler struct { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 45669a30..d0f6c36d 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -10,9 +10,9 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var ( diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index 35a41f8f..b4cba202 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -3,13 +3,13 @@ package backend import ( "context" - 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go b/pkg/ext-proc/backend/inferencepool_reconciler_test.go index f03c31cb..f16524a5 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler_test.go @@ -4,8 +4,8 @@ import ( "reflect" "testing" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var ( diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index a9165e8f..68043d93 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -7,9 +7,9 @@ import ( "time" "go.uber.org/multierr" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 8800868a..e3693960 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -12,9 +12,9 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - 
logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go index e3c1449d..3d4225e8 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/ext-proc/backend/vllm/metrics_test.go @@ -7,7 +7,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) func TestPromToPodMetrics(t *testing.T) { diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index d98f4602..17278025 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -9,10 +9,10 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) // HandleRequestBody handles body of the request to the backend server, such as parsing the "model" diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index 3b8a9946..34a7219a 
100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -6,8 +6,8 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) // HandleResponseHeaders processes response headers from the backend model server. diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index 172249b6..f27c9a15 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -8,12 +8,12 @@ import ( envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func NewServer(pp PodProvider, scheduler Scheduler, targetEndpointKey string, datastore ModelDataStore) *Server { diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index 488851eb..764992b2 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -6,8 +6,8 @@ 
import ( "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) type healthServer struct { diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index e126b6dd..634c3581 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -11,11 +11,6 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -24,6 +19,11 @@ import ( klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" ) const ( diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index d431b076..fc016882 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -4,9 +4,9 @@ import ( "errors" "math" - 
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type Filter interface { diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index d88f437c..224dc83f 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -5,7 +5,7 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" ) func TestFilter(t *testing.T) { diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 9fc3e663..ca896c5a 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -7,9 +7,9 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - logutil "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index bf666f1f..affb4b6c 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -7,14 +7,14 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/grpc" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - 
"inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/rest" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" ) // ExtProcServerRunner provides methods to manage an external process server. diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index abaeedbb..f18782d6 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -10,11 +10,11 @@ import ( "github.com/bojand/ghz/runner" "github.com/jhump/protoreflect/desc" "google.golang.org/protobuf/proto" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" ) var ( diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index 98793b95..b91672fa 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -9,11 +9,11 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/grpc" "google.golang.org/grpc/reflection" - 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" klog "k8s.io/klog/v2" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" ) func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index c2c1ea92..4a0dd2a8 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -26,8 +26,6 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -40,6 +38,8 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) const ( diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 8e5968fc..087097a7 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -24,10 +24,10 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - testutils "inference.networking.x-k8s.io/gateway-api-inference-extension/test/utils" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" + infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) var _ = ginkgo.Describe("InferencePool", func() { diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index b52cc9d7..e94be1a0 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -23,10 +23,6 @@ import ( "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/structpb" - "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" - "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - runserver "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - extprocutils "inference.networking.x-k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" @@ -34,6 +30,10 @@ import ( klog "k8s.io/klog/v2" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" + extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" "sigs.k8s.io/yaml" ) diff --git a/test/utils/utils.go b/test/utils/utils.go index 337599c3..777eadd8 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -24,7 +24,6 @@ import ( "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" - infextv1a1 
"inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -37,6 +36,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" "sigs.k8s.io/controller-runtime/pkg/client" + infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // DeleteClusterResources deletes all cluster-scoped objects the tests typically create. diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go index 12ff856a..668a5adc 100644 --- a/test/utils/wrappers.go +++ b/test/utils/wrappers.go @@ -17,8 +17,8 @@ limitations under the License. package utils import ( - infextv1a1 "inference.networking.x-k8s.io/gateway-api-inference-extension/api/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) // InferenceModelWrapper wraps an InferenceModel. From 70b5c84dd2f46592e8ea0d60d34e995c28d19ea7 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Tue, 11 Feb 2025 17:08:05 -0500 Subject: [PATCH 13/96] Updates EPP Deployment and Release Doc/Script (#322) * Changes EPP ImagePullPolicy Signed-off-by: Daneyon Hansen * Updates release doc and script Signed-off-by: Daneyon Hansen --------- Signed-off-by: Daneyon Hansen --- .github/ISSUE_TEMPLATE/new-release.md | 7 ++++--- hack/release-quickstart.sh | 19 +++++++++++-------- pkg/manifests/ext_proc.yaml | 1 + 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md index 6ed3df8c..ceca9f5f 100644 --- a/.github/ISSUE_TEMPLATE/new-release.md +++ b/.github/ISSUE_TEMPLATE/new-release.md @@ -34,10 +34,10 @@ This document defines the process for releasing Gateway API Inference Extension. export RC=1 ``` -4. The vLLM image tag defaults to `v0.7.1` for a release. Optionally, change the vLLM image tag. For example: +4. The vLLM image tag defaults to `0.7.2` for a release. 
Optionally, change the vLLM image tag. For example: ```shell - export VLLM=0.7.2 + export VLLM=0.7.3 ``` ## Release Process @@ -114,7 +114,8 @@ This document defines the process for releasing Gateway API Inference Extension. 9. Pushing the tag triggers Prow to build and publish the container image to the [staging registry][]. 10. Submit a PR against [k8s.io][] to add the staging image tag and SHA to [`k8s-staging-gateway-api-inference-extension/images.yaml`][yaml]. This will - promote the image to the production registry. **Note:** Add a link to this issue when the PR is merged. + promote the image to the production registry, e.g. `registry.k8s.io/gateway-api-inference-extension/epp:v${MAJOR}.${MINOR}.0`. + **Note:** Add a link to this issue when the PR is merged. 11. Test the steps in the tagged quickstart guide after the PR merges, for example: `https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/v0.1.0-rc.1/pkg/README.md`. 12. Create a [new release][]: 1. Choose the tag that you created for the release. diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index b156b160..f4701508 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -15,8 +15,8 @@ else RELEASE_TAG="v${MAJOR}.${MINOR}.0-rc.${RC}" fi -# vLLM image version (default to 0.7.1 if not defined) -VLLM="${VLLM:-0.7.1}" +# vLLM image version (default to 0.7.2 if not defined) +VLLM="${VLLM:-0.7.2}" echo "Using release tag: ${RELEASE_TAG}" echo "Using vLLM image version: ${VLLM}" @@ -41,12 +41,15 @@ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-in EXT_PROC="pkg/manifests/ext_proc.yaml" echo "Updating ${EXT_PROC} ..." -# Update any image reference for the EPP container. 
-# For images from registry.k8s.io: -sed -i.bak -E "s|(registry\.k8s\.io/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EXT_PROC" -# In case there is still any reference from us-central1-docker.pkg.dev: +# Update the EPP container tag. sed -i.bak -E "s|(us-central1-docker\.pkg\.dev/k8s-staging-images/gateway-api-inference-extension/epp:)[^\"[:space:]]+|\1${RELEASE_TAG}|g" "$EXT_PROC" +# Update the EPP container image pull policy. +sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inference-extension\/epp/ { n; s/Always/IfNotPresent/ }' "$EXT_PROC" + +# Update the EPP container registry. +sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EXT_PROC" + # ----------------------------------------------------------------------------- # Update pkg/manifests/vllm/deployment.yaml # ----------------------------------------------------------------------------- @@ -54,10 +57,10 @@ VLLM_DEPLOY="pkg/manifests/vllm/deployment.yaml" echo "Updating ${VLLM_DEPLOY} ..." # Update the vLLM image version -sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1${VLLM}|g" "$VLLM_DEPLOY" +sed -i.bak -E "s|(vllm/vllm-openai:)[^\"[:space:]]+|\1v${VLLM}|g" "$VLLM_DEPLOY" # Also change the imagePullPolicy from Always to IfNotPresent on lines containing the vLLM image. 
-sed -i.bak "/vllm\/vllm-openai/ s/Always/IfNotPresent/g" "$VLLM_DEPLOY" +sed -i.bak '/vllm\/vllm-openai/ { n; s/Always/IfNotPresent/ }' "$VLLM_DEPLOY" # ----------------------------------------------------------------------------- # Stage the changes diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index 4e82779e..a7dc7678 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -72,6 +72,7 @@ spec: containers: - name: inference-gateway-ext-proc image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always args: - -poolName - "vllm-llama2-7b-pool" From 4a8f04c614faabf07f6378ae01056d14b12e6f8b Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Wed, 12 Feb 2025 17:56:22 +0000 Subject: [PATCH 14/96] Delete InferenceModels from the datastore when deletionTimestamp is set (#319) * Delete InferenceModels from the datastore when deletionTimestamp is set * Update pkg/ext-proc/backend/inferencemodel_reconciler_test.go --- .../backend/inferencemodel_reconciler.go | 4 ++ .../backend/inferencemodel_reconciler_test.go | 60 +++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 02394baa..f0a13941 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -37,6 +37,10 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque } klog.Error(err, "Unable to get InferenceModel") return ctrl.Result{}, err + } else if !infModel.DeletionTimestamp.IsZero() { + klog.V(1).Infof("InferenceModel %v is marked for deletion. 
Removing from datastore", req.NamespacedName) + c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) + return ctrl.Result{}, nil } c.updateDatastore(infModel) diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index d0f6c36d..415358b2 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -184,6 +184,66 @@ func TestReconcile_ResourceNotFound(t *testing.T) { } } +func TestReconcile_ModelMarkedForDeletion(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = v1alpha1.AddToScheme(scheme) + + // Create an InferenceModel object. + now := metav1.Now() + existingModel := &v1alpha1.InferenceModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: "existing-model", + Namespace: "default", + DeletionTimestamp: &now, + Finalizers: []string{"finalizer"}, + }, + Spec: v1alpha1.InferenceModelSpec{ + ModelName: "fake-model", + PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + }, + } + + // Create a fake client with the existing model. + fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() + + // Create a minimal datastore. + datastore := &K8sDatastore{ + InferenceModels: &sync.Map{}, + inferencePool: &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }, + } + + // Create the reconciler. + reconciler := &InferenceModelReconciler{ + Client: fakeClient, + Scheme: scheme, + Record: record.NewFakeRecorder(10), + Datastore: datastore, + PoolNamespacedName: types.NamespacedName{Name: "test-pool", Namespace: "default"}, + } + + // Create a request for the existing resource. + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "existing-model", Namespace: "default"}} + + // Call Reconcile. 
+ result, err := reconciler.Reconcile(context.Background(), req) + if err != nil { + t.Fatalf("expected no error when resource exists, got %v", err) + } + + // Check that no requeue is requested. + if result.Requeue || result.RequeueAfter != 0 { + t.Errorf("expected no requeue, got %+v", result) + } + + // Verify that the datastore was not updated. + if _, ok := datastore.InferenceModels.Load(existingModel.Spec.ModelName); ok { + t.Errorf("expected datastore to not contain model %q", existingModel.Spec.ModelName) + } +} + func TestReconcile_ResourceExists(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() From 242b73e4cf6da33a09d96002ab6bd08936ec6855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Wed, 12 Feb 2025 23:50:21 +0100 Subject: [PATCH 15/96] Actually init logging using Zap (#267) Controllers typically use Zap these days. The only potential issue is that the flags are not compatible. This is somehow mitigated by supporting -v explicitly. --- go.mod | 3 ++- pkg/ext-proc/main.go | 31 +++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c89080ae..d8b143ec 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/prometheus/common v0.62.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 + go.uber.org/zap v1.27.0 google.golang.org/grpc v1.70.0 google.golang.org/protobuf v1.36.5 k8s.io/api v0.32.1 @@ -62,6 +63,7 @@ require ( github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect @@ -114,7 +116,6 @@ require ( go.opentelemetry.io/otel/sdk v1.32.0 // indirect go.opentelemetry.io/otel/trace v1.32.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect - 
go.uber.org/zap v1.27.0 // indirect golang.org/x/crypto v0.32.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect golang.org/x/mod v0.22.0 // indirect diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 634c3581..6bdaae66 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -9,6 +9,8 @@ import ( "strconv" "github.com/prometheus/client_golang/prometheus/promhttp" + uberzap "go.uber.org/zap" + "go.uber.org/zap/zapcore" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" "k8s.io/apimachinery/pkg/runtime" @@ -18,12 +20,14 @@ import ( "k8s.io/component-base/metrics/legacyregistry" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( @@ -73,6 +77,7 @@ var ( "refreshPrometheusMetricsInterval", runserver.DefaultRefreshPrometheusMetricsInterval, "interval to flush prometheus metrics") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") scheme = runtime.NewScheme() ) @@ -83,10 +88,13 @@ func init() { } func main() { - klog.InitFlags(nil) + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) flag.Parse() + initLogging(&opts) - ctrl.SetLogger(klog.TODO()) cfg, err := ctrl.GetConfig() if err != nil { klog.Fatalf("Failed to get rest config: %v", err) @@ -152,6 +160,25 @@ func main() { klog.Info("All components shutdown") } +func initLogging(opts *zap.Options) { + // Unless -zap-log-level is explicitly set, use -v + useV 
:= true + flag.Visit(func(f *flag.Flag) { + if f.Name == "zap-log-level" { + useV = false + } + }) + if useV { + // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level + lvl := -1 * (*logVerbosity) + opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) + } + + logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) + ctrl.SetLogger(logger) + klog.SetLogger(logger) +} + // startHealthServer starts the gRPC health probe server in a goroutine. func startHealthServer(ds *backend.K8sDatastore, port int) *grpc.Server { svr := grpc.NewServer() From 0662f1f391c4c3ec331a05d9f2ebed2bb5a845fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Thu, 13 Feb 2025 05:54:20 +0100 Subject: [PATCH 16/96] Remove fatal logging in executable code (#265) All Fatal log call are removed. Also all runnable components are now managed by controller-runtime, implementing manager.Runnable interface. --- pkg/ext-proc/internal/runnable/grpc.go | 52 +++++++ .../internal/runnable/leader_election.go | 31 ++++ pkg/ext-proc/main.go | 140 +++++++++--------- pkg/ext-proc/server/runserver.go | 75 +++++----- pkg/ext-proc/server/runserver_test.go | 21 +++ test/integration/hermetic_test.go | 22 ++- 6 files changed, 229 insertions(+), 112 deletions(-) create mode 100644 pkg/ext-proc/internal/runnable/grpc.go create mode 100644 pkg/ext-proc/internal/runnable/leader_election.go create mode 100644 pkg/ext-proc/server/runserver_test.go diff --git a/pkg/ext-proc/internal/runnable/grpc.go b/pkg/ext-proc/internal/runnable/grpc.go new file mode 100644 index 00000000..a619f788 --- /dev/null +++ b/pkg/ext-proc/internal/runnable/grpc.go @@ -0,0 +1,52 @@ +package runnable + +import ( + "context" + "fmt" + "net" + + "google.golang.org/grpc" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +// GRPCServer converts the given gRPC server into a runnable. 
+// The server name is just being used for logging. +func GRPCServer(name string, srv *grpc.Server, port int) manager.Runnable { + return manager.RunnableFunc(func(ctx context.Context) error { + // Use "name" key as that is what manager.Server does as well. + log := ctrl.Log.WithValues("name", name) + log.Info("gRPC server starting") + + // Start listening. + lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) + if err != nil { + log.Error(err, "gRPC server failed to listen") + return err + } + + log.Info("gRPC server listening", "port", port) + + // Shutdown on context closed. + // Terminate the server on context closed. + // Make sure the goroutine does not leak. + doneCh := make(chan struct{}) + defer close(doneCh) + go func() { + select { + case <-ctx.Done(): + log.Info("gRPC server shutting down") + srv.GracefulStop() + case <-doneCh: + } + }() + + // Keep serving until terminated. + if err := srv.Serve(lis); err != nil && err != grpc.ErrServerStopped { + log.Error(err, "gRPC server failed") + return err + } + log.Info("gRPC server terminated") + return nil + }) +} diff --git a/pkg/ext-proc/internal/runnable/leader_election.go b/pkg/ext-proc/internal/runnable/leader_election.go new file mode 100644 index 00000000..00dfc782 --- /dev/null +++ b/pkg/ext-proc/internal/runnable/leader_election.go @@ -0,0 +1,31 @@ +package runnable + +import "sigs.k8s.io/controller-runtime/pkg/manager" + +type leaderElection struct { + manager.Runnable + needsLeaderElection bool +} + +// LeaderElection wraps the given runnable to implement manager.LeaderElectionRunnable. +func LeaderElection(runnable manager.Runnable, needsLeaderElection bool) manager.Runnable { + return &leaderElection{ + Runnable: runnable, + needsLeaderElection: needsLeaderElection, + } +} + +// RequireLeaderElection wraps the given runnable, marking it as requiring leader election. 
+func RequireLeaderElection(runnable manager.Runnable) manager.Runnable { + return LeaderElection(runnable, true) +} + +// NoLeaderElection wraps the given runnable, marking it as not requiring leader election. +func NoLeaderElection(runnable manager.Runnable) manager.Runnable { + return LeaderElection(runnable, false) +} + +// NeedLeaderElection implements the manager.LeaderElectionRunnable interface. +func (r *leaderElection) NeedLeaderElection() bool { + return r.needsLeaderElection +} diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 6bdaae66..d51435ac 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -1,11 +1,11 @@ package main import ( - "context" "flag" "fmt" "net" "net/http" + "os" "strconv" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -21,10 +21,12 @@ import ( klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" @@ -88,6 +90,12 @@ func init() { } func main() { + if err := run(); err != nil { + os.Exit(1) + } +} + +func run() error { opts := zap.Options{ Development: true, } @@ -97,11 +105,13 @@ func main() { cfg, err := ctrl.GetConfig() if err != nil { - klog.Fatalf("Failed to get rest config: %v", err) + klog.ErrorS(err, "Failed to get rest config") + return err } // Validate flags if err := validateFlags(); err != nil { - klog.Fatalf("Failed to validate flags: %v", err) + 
klog.ErrorS(err, "Failed to validate flags") + return err } // Print all flag values @@ -127,37 +137,30 @@ func main() { Config: ctrl.GetConfigOrDie(), Datastore: datastore, } - serverRunner.Setup() - - // Start health and ext-proc servers in goroutines - healthSvr := startHealthServer(datastore, *grpcHealthPort) - extProcSvr := serverRunner.Start( - datastore, - &vllm.PodMetricsClientImpl{}, - ) - // Start metrics handler - metricsSvr := startMetricsHandler(*metricsPort, cfg) - - // Start manager, blocking - serverRunner.StartManager() + if err := serverRunner.Setup(); err != nil { + klog.ErrorS(err, "Failed to setup ext-proc server") + return err + } + mgr := serverRunner.Manager - // Gracefully shutdown servers - if healthSvr != nil { - klog.Info("Health server shutting down") - healthSvr.GracefulStop() + // Register health server. + if err := registerHealthServer(mgr, datastore, *grpcHealthPort); err != nil { + return err } - if extProcSvr != nil { - klog.Info("Ext-proc server shutting down") - extProcSvr.GracefulStop() + + // Register ext-proc server. + if err := mgr.Add(serverRunner.AsRunnable(datastore, &vllm.PodMetricsClientImpl{})); err != nil { + klog.ErrorS(err, "Failed to register ext-proc server") + return err } - if metricsSvr != nil { - klog.Info("Metrics server shutting down") - if err := metricsSvr.Shutdown(context.Background()); err != nil { - klog.Infof("Metrics server Shutdown: %v", err) - } + + // Register metrics handler. + if err := registerMetricsHandler(mgr, *metricsPort, cfg); err != nil { + return err } - klog.Info("All components shutdown") + // Start the manager. + return serverRunner.StartManager(ctrl.SetupSignalHandler()) } func initLogging(opts *zap.Options) { @@ -179,68 +182,69 @@ func initLogging(opts *zap.Options) { klog.SetLogger(logger) } -// startHealthServer starts the gRPC health probe server in a goroutine. 
-func startHealthServer(ds *backend.K8sDatastore, port int) *grpc.Server { - svr := grpc.NewServer() - healthPb.RegisterHealthServer(svr, &healthServer{datastore: ds}) - - go func() { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) - if err != nil { - klog.Fatalf("Health server failed to listen: %v", err) - } - klog.Infof("Health server listening on port: %d", port) - - // Blocking and will return when shutdown is complete. - if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped { - klog.Fatalf("Health server failed: %v", err) - } - klog.Info("Health server shutting down") - }() - return svr +// registerHealthServer adds the Health gRPC server as a Runnable to the given manager. +func registerHealthServer(mgr manager.Manager, ds *backend.K8sDatastore, port int) error { + srv := grpc.NewServer() + healthPb.RegisterHealthServer(srv, &healthServer{datastore: ds}) + if err := mgr.Add( + runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { + klog.ErrorS(err, "Failed to register health server") + return err + } + return nil } -func startMetricsHandler(port int, cfg *rest.Config) *http.Server { +// registerMetricsHandler adds the metrics HTTP handler as a Runnable to the given manager. +func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { metrics.Register() - var svr *http.Server - go func() { - klog.Info("Starting metrics HTTP handler ...") + // Init HTTP server. 
+ h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) + if err != nil { + return err + } + + mux := http.NewServeMux() + mux.Handle(defaultMetricsEndpoint, h) - mux := http.NewServeMux() - mux.Handle(defaultMetricsEndpoint, metricsHandlerWithAuthenticationAndAuthorization(cfg)) + srv := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } - svr = &http.Server{ - Addr: net.JoinHostPort("", strconv.Itoa(port)), - Handler: mux, - } - if err := svr.ListenAndServe(); err != http.ErrServerClosed { - klog.Fatalf("failed to start metrics HTTP handler: %v", err) - } - }() - return svr + if err := mgr.Add(&manager.Server{ + Name: "metrics", + Server: srv, + }); err != nil { + klog.ErrorS(err, "Failed to register metrics HTTP handler") + return err + } + return nil } -func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) http.Handler { +func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Handler, error) { h := promhttp.HandlerFor( legacyregistry.DefaultGatherer, promhttp.HandlerOpts{}, ) httpClient, err := rest.HTTPClientFor(cfg) if err != nil { - klog.Fatalf("failed to create http client for metrics auth: %v", err) + klog.ErrorS(err, "Failed to create http client for metrics auth") + return nil, err } filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) if err != nil { - klog.Fatalf("failed to create metrics filter for auth: %v", err) + klog.ErrorS(err, "Failed to create metrics filter for auth") + return nil, err } metricsLogger := klog.LoggerWithValues(klog.NewKlogr(), "path", defaultMetricsEndpoint) metricsAuthHandler, err := filter(metricsLogger, h) if err != nil { - klog.Fatalf("failed to create metrics auth handler: %v", err) + klog.ErrorS(err, "Failed to create metrics auth handler") + return nil, err } - return metricsAuthHandler + return metricsAuthHandler, nil } func validateFlags() error { diff --git a/pkg/ext-proc/server/runserver.go 
b/pkg/ext-proc/server/runserver.go index affb4b6c..71499e8f 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -1,8 +1,9 @@ package server import ( + "context" + "errors" "fmt" - "net" "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" @@ -12,8 +13,10 @@ import ( "k8s.io/client-go/rest" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" ) @@ -31,7 +34,7 @@ type ExtProcServerRunner struct { Scheme *runtime.Scheme Config *rest.Config Datastore *backend.K8sDatastore - manager ctrl.Manager + Manager ctrl.Manager } // Default values for CLI flags in main @@ -63,13 +66,13 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { } // Setup creates the reconcilers for pools, models, and endpointSlices and starts the manager. 
-func (r *ExtProcServerRunner) Setup() { +func (r *ExtProcServerRunner) Setup() error { // Create a new manager to manage controllers mgr, err := ctrl.NewManager(r.Config, ctrl.Options{Scheme: r.Scheme}) if err != nil { - klog.Fatalf("Failed to create controller manager: %v", err) + return fmt.Errorf("failed to create controller manager: %w", err) } - r.manager = mgr + r.Manager = mgr // Create the controllers and register them with the manager if err := (&backend.InferencePoolReconciler{ @@ -82,7 +85,7 @@ func (r *ExtProcServerRunner) Setup() { }, Record: mgr.GetEventRecorderFor("InferencePool"), }).SetupWithManager(mgr); err != nil { - klog.Fatalf("Failed setting up InferencePoolReconciler: %v", err) + return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err) } if err := (&backend.InferenceModelReconciler{ @@ -95,7 +98,7 @@ func (r *ExtProcServerRunner) Setup() { }, Record: mgr.GetEventRecorderFor("InferenceModel"), }).SetupWithManager(mgr); err != nil { - klog.Fatalf("Failed setting up InferenceModelReconciler: %v", err) + return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } if err := (&backend.EndpointSliceReconciler{ @@ -106,54 +109,50 @@ func (r *ExtProcServerRunner) Setup() { ServiceName: r.ServiceName, Zone: r.Zone, }).SetupWithManager(mgr); err != nil { - klog.Fatalf("Failed setting up EndpointSliceReconciler: %v", err) + return fmt.Errorf("failed setting up EndpointSliceReconciler: %v", err) } + return nil } -// Start starts the Envoy external processor server in a goroutine. -func (r *ExtProcServerRunner) Start( +// AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server. +// The runnable implements LeaderElectionRunnable with leader election disabled. 
+func (r *ExtProcServerRunner) AsRunnable( podDatastore *backend.K8sDatastore, podMetricsClient backend.PodMetricsClient, -) *grpc.Server { - svr := grpc.NewServer() - - go func() { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", r.GrpcPort)) - if err != nil { - klog.Fatalf("Ext-proc server failed to listen: %v", err) - } - klog.Infof("Ext-proc server listening on port: %d", r.GrpcPort) - +) manager.Runnable { + return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { // Initialize backend provider pp := backend.NewProvider(podMetricsClient, podDatastore) if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { - klog.Fatalf("Failed to initialize backend provider: %v", err) + klog.ErrorS(err, "Failed to initialize backend provider") + return err } - // Register ext_proc handlers + // Init the server. + srv := grpc.NewServer() extProcPb.RegisterExternalProcessorServer( - svr, + srv, handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetEndpointKey, r.Datastore), ) - // Blocking and will return when shutdown is complete. - if err := svr.Serve(lis); err != nil && err != grpc.ErrServerStopped { - klog.Fatalf("Ext-proc server failed: %v", err) - } - klog.Info("Ext-proc server shutting down") - }() - return svr + // Forward to the gRPC runnable. + return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) + })) } -func (r *ExtProcServerRunner) StartManager() { - if r.manager == nil { - klog.Fatalf("Runner has no manager setup to run: %v", r) +func (r *ExtProcServerRunner) StartManager(ctx context.Context) error { + if r.Manager == nil { + err := errors.New("runner manager is not set") + klog.ErrorS(err, "Runner has no manager setup to run") + return err } + // Start the controller manager. Blocking and will return when shutdown is complete. 
- klog.Infof("Starting controller manager") - mgr := r.manager - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { - klog.Fatalf("Error starting controller manager: %v", err) + klog.InfoS("Controller manager starting") + if err := r.Manager.Start(ctx); err != nil { + klog.ErrorS(err, "Error starting controller manager") + return err } - klog.Info("Controller manager shutting down") + klog.InfoS("Controller manager terminated") + return nil } diff --git a/pkg/ext-proc/server/runserver_test.go b/pkg/ext-proc/server/runserver_test.go new file mode 100644 index 00000000..df2081aa --- /dev/null +++ b/pkg/ext-proc/server/runserver_test.go @@ -0,0 +1,21 @@ +package server_test + +import ( + "testing" + + "sigs.k8s.io/controller-runtime/pkg/manager" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" +) + +func TestRunnable(t *testing.T) { + // Make sure AsRunnable() does not use leader election. + runner := server.NewDefaultExtProcServerRunner().AsRunnable(nil, nil) + r, ok := runner.(manager.LeaderElectionRunnable) + if !ok { + t.Fatal("runner is not LeaderElectionRunnable") + } + if r.NeedLeaderElection() { + t.Error("runner returned NeedLeaderElection = true, expected false") + } +} diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index e94be1a0..74c9f049 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -28,6 +28,7 @@ import ( k8syaml "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" klog "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" @@ -406,7 +407,6 @@ func TestKubeInferenceModelRequest(t *testing.T) { } func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - ps := make(backend.PodSet) pms := 
make(map[backend.Pod]*backend.PodMetrics) for _, pod := range pods { @@ -415,7 +415,14 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP } pmc := &backend.FakePodMetricsClient{Res: pms} - server := serverRunner.Start(backend.NewK8sDataStore(backend.WithPods(pods)), pmc) + serverCtx, stopServer := context.WithCancel(context.Background()) + go func() { + if err := serverRunner.AsRunnable( + backend.NewK8sDataStore(backend.WithPods(pods)), pmc, + ).Start(serverCtx); err != nil { + log.Fatalf("Failed to start ext-proc server: %v", err) + } + }() // Wait the reconciler to populate the datastore. time.Sleep(10 * time.Second) @@ -435,7 +442,7 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP return client, func() { cancel() conn.Close() - server.GracefulStop() + stopServer() } } @@ -447,7 +454,6 @@ func BeforeSuit() { ErrorIfCRDPathMissing: true, } cfg, err := testEnv.Start() - if err != nil { log.Fatalf("Failed to start test environment, cfg: %v error: %v", cfg, err) } @@ -469,11 +475,15 @@ func BeforeSuit() { serverRunner.Config = cfg serverRunner.Datastore = backend.NewK8sDataStore() - serverRunner.Setup() + if err := serverRunner.Setup(); err != nil { + log.Fatalf("Failed to start server runner: %v", err) + } // Start the controller manager in go routine, not blocking go func() { - serverRunner.StartManager() + if err := serverRunner.StartManager(ctrl.SetupSignalHandler()); err != nil { + log.Fatalf("Failed to start manager: %v", err) + } }() klog.Info("Setting up hermetic ExtProc server") From db21e9eaaa6222b15a624df1f9881e3f20009a37 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Thu, 13 Feb 2025 13:10:21 -0500 Subject: [PATCH 17/96] feat: Adds e2e test script (#294) * feat: Adds e2e test script Signed-off-by: Daneyon Hansen * Docs the e2e test script Signed-off-by: Daneyon Hansen --------- Signed-off-by: Daneyon Hansen --- hack/test-e2e.sh | 137 +++++++++++++++++++++++++++++++++++++++++++++++ 1 
file changed, 137 insertions(+) create mode 100755 hack/test-e2e.sh diff --git a/hack/test-e2e.sh b/hack/test-e2e.sh new file mode 100755 index 00000000..716e626a --- /dev/null +++ b/hack/test-e2e.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# +# This script verifies end-to-end connectivity for an example inference extension test environment based on +# resources from the quickstart guide or e2e test framework. It can optionally launch a "curl" client pod to +# run these tests within the cluster. +# +# USAGE: ./hack/test-e2e.sh +# +# OPTIONAL ENVIRONMENT VARIABLES: +# - TIME: The duration (in seconds) for which the test will run. Defaults to 1 second. +# - CURL_POD: If set to "true", the script will use a Kubernetes pod named "curl" for making requests. +# - IP: Override the detected IP address. If not provided, the script attempts to use a Gateway based on +# the quickstart guide or an Envoy service IP based on the e2e test framework. +# - PORT: Override the detected port. If not provided, the script attempts to use a Gateway based on the +# quickstart guide or an Envoy service port based on the e2e test framework. +# +# WHAT THE SCRIPT DOES: +# 1. Determines if there is a Gateway named "inference-gateway" in the "default" namespace. If found, it extracts the IP +# address and port from the Gateway's "llm-gw" listener. Otherwise, it falls back to the Envoy service in the "default" namespace. +# 2. Optionally checks for (or creates) a "curl" pod, ensuring it is ready to execute requests. +# 3. Loops for $TIME seconds, sending requests every 5 seconds to the /v1/completions endpoint to confirm successful connectivity. + +set -euo pipefail + +# Determine the directory of this script and build an absolute path to client.yaml. +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +CLIENT_YAML="$SCRIPT_DIR/../test/testdata/client.yaml" + +# TIME is the amount of time, in seconds, to run the test. 
+TIME=${TIME:-1} +# Optionally use a client curl pod for executing the curl command. +CURL_POD=${CURL_POD:-false} + +check_resource_exists() { + local type=$1 + local name=$2 + local namespace=$3 + + if kubectl get "$type" "$name" -n "$namespace" &>/dev/null; then + return 0 + else + return 1 + fi +} + +check_pod_ready() { + local pod_name=$1 + local namespace=$2 + # Check the Ready condition using jsonpath. Default to False if not found. + local ready_status + ready_status=$(kubectl get pod "$pod_name" -n "$namespace" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "False") + if [[ "$ready_status" == "True" ]]; then + return 0 + else + return 1 + fi +} + +# Try to get the Gateway's IP and the port from the listener named "llm-gw" if it exists. +if check_resource_exists "gateway" "inference-gateway" "default"; then + GATEWAY_IP=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.status.addresses[0].value}') + # Use JSONPath to select the port from the listener with name "llm-gw" + GATEWAY_PORT=$(kubectl get gateway inference-gateway -n default -o jsonpath='{.spec.listeners[?(@.name=="llm-gw")].port}') +else + GATEWAY_IP="" + GATEWAY_PORT="" +fi + +if [[ -n "$GATEWAY_IP" && -n "$GATEWAY_PORT" ]]; then + echo "Using Gateway inference-gateway IP and port from listener 'llm-gw'." + IP=${IP:-$GATEWAY_IP} + PORT=${PORT:-$GATEWAY_PORT} +else + echo "Gateway inference-gateway not found or missing IP/port. Falling back to Envoy service." + # Ensure the Envoy service exists. + if ! check_resource_exists "svc" "envoy" "default"; then + echo "Error: Envoy service not found in namespace 'default'." + exit 1 + fi + IP=${IP:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.clusterIP}')} + PORT=${PORT:-$(kubectl get svc envoy -n default -o jsonpath='{.spec.ports[0].port}')} +fi + +# Optionally verify that the curl pod exists and is ready. +if [[ "$CURL_POD" == "true" ]]; then + if ! 
check_resource_exists "pod" "curl" "default"; then + echo "Pod 'curl' not found in namespace 'default'. Applying client.yaml from $CLIENT_YAML..." + kubectl apply -f "$CLIENT_YAML" + fi + echo "Waiting for pod 'curl' to be ready..." + # Retry every 5 seconds for up to 30 seconds (6 attempts) + for i in {1..6}; do + if check_pod_ready "curl" "default"; then + echo "Pod 'curl' is now ready." + break + fi + echo "Retry attempt $i: Pod 'curl' not ready; waiting 5 seconds..." + sleep 5 + done + + if ! check_pod_ready "curl" "default"; then + echo "Error: Pod 'curl' is still not ready in namespace 'default' after 30 seconds." + exit 1 + fi +fi + +# Validate that we have a non-empty IP and PORT. +if [[ -z "$IP" ]]; then + echo "Error: Unable to determine a valid IP from either Gateway or Envoy service." + exit 1 +fi + +if [[ -z "$PORT" ]]; then + echo "Error: Unable to determine a valid port from either Gateway or Envoy service." + exit 1 +fi + +echo "Using IP: $IP" +echo "Using PORT: $PORT" + +# Run the test for the specified duration. 
+end=$((SECONDS + TIME)) +if [[ "$CURL_POD" == "true" ]]; then + while [ $SECONDS -lt $end ]; do + kubectl exec po/curl -- curl -i "$IP:$PORT/v1/completions" \ + -H 'Content-Type: application/json' \ + -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}' + sleep 5 + done +else + while [ $SECONDS -lt $end ]; do + curl -i "$IP:$PORT/v1/completions" \ + -H 'Content-Type: application/json' \ + -d '{"model": "tweet-summary","prompt": "Write as if you were a critic: San Francisco","max_tokens": 100,"temperature": 0}' + sleep 5 + done +fi From 46541d0e92a5050a194b6aaf71a80fcb3b9c3fc8 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Thu, 13 Feb 2025 12:02:20 -0700 Subject: [PATCH 18/96] Replacing endpointSlice Reconciler with a direct Pod Reconciler (#300) * reversion to pod reconciliation * adding ready check and unit tests * updating test * ablating unnecessary func * embedding ready status into update so non-ready pods are deleted * scrubbing serviceName & zone as they are obsolete * implementing pod cache flushing logic * Renaming file so merge confilcts can find the diffs easier * cleaning up messy merge conflict * nil checking short circuit * Listing fixes * feedback cleanup * log formatting and removing pods if not found * removing err to provent perma-reconciliation * removing dev image ref * cleaning up err logic --- pkg/ext-proc/backend/datastore.go | 42 ++++ .../backend/endpointslice_reconciler.go | 109 ---------- .../backend/endpointslice_reconcilier_test.go | 202 ------------------ .../backend/inferencemodel_reconciler_test.go | 21 ++ .../backend/inferencepool_reconciler.go | 12 +- pkg/ext-proc/backend/pod_reconciler.go | 80 +++++++ pkg/ext-proc/backend/pod_reconciler_test.go | 168 +++++++++++++++ pkg/ext-proc/main.go | 14 -- pkg/ext-proc/server/runserver.go | 18 +- pkg/manifests/ext_proc.yaml | 2 - 10 files changed, 324 insertions(+), 344 
deletions(-) delete mode 100644 pkg/ext-proc/backend/endpointslice_reconciler.go delete mode 100644 pkg/ext-proc/backend/endpointslice_reconcilier_test.go create mode 100644 pkg/ext-proc/backend/pod_reconciler.go create mode 100644 pkg/ext-proc/backend/pod_reconciler_test.go diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index 3208be26..be3c7f0b 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -1,12 +1,16 @@ package backend import ( + "context" "errors" "math/rand" + "strconv" "sync" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/labels" "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -111,3 +115,41 @@ func IsCritical(model *v1alpha1.InferenceModel) bool { } return false } + +func (ds *K8sDatastore) LabelsMatch(podLabels map[string]string) bool { + poolSelector := selectorFromInferencePoolSelector(ds.inferencePool.Spec.Selector) + podSet := labels.Set(podLabels) + return poolSelector.Matches(podSet) +} + +func (ds *K8sDatastore) flushPodsAndRefetch(ctx context.Context, ctrlClient client.Client, newServerPool *v1alpha1.InferencePool) { + podList := &corev1.PodList{} + if err := ctrlClient.List(ctx, podList, &client.ListOptions{ + LabelSelector: selectorFromInferencePoolSelector(newServerPool.Spec.Selector), + Namespace: newServerPool.Namespace, + }); err != nil { + klog.Error(err, "error listing clients") + } + ds.pods.Clear() + + for _, k8sPod := range podList.Items { + pod := Pod{ + Name: k8sPod.Name, + Address: k8sPod.Status.PodIP + ":" + strconv.Itoa(int(newServerPool.Spec.TargetPortNumber)), + } + ds.pods.Store(pod, true) + } + +} + +func selectorFromInferencePoolSelector(selector map[v1alpha1.LabelKey]v1alpha1.LabelValue) labels.Selector { + return labels.SelectorFromSet(stripLabelKeyAliasFromLabelMap(selector)) +} + 
+func stripLabelKeyAliasFromLabelMap(labels map[v1alpha1.LabelKey]v1alpha1.LabelValue) map[string]string { + outMap := make(map[string]string) + for k, v := range labels { + outMap[string(k)] = string(v) + } + return outMap +} diff --git a/pkg/ext-proc/backend/endpointslice_reconciler.go b/pkg/ext-proc/backend/endpointslice_reconciler.go deleted file mode 100644 index ebc182b8..00000000 --- a/pkg/ext-proc/backend/endpointslice_reconciler.go +++ /dev/null @@ -1,109 +0,0 @@ -package backend - -import ( - "context" - "strconv" - "time" - - discoveryv1 "k8s.io/api/discovery/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/client-go/tools/record" - klog "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" -) - -var ( - serviceOwnerLabel = "kubernetes.io/service-name" -) - -type EndpointSliceReconciler struct { - client.Client - Scheme *runtime.Scheme - Record record.EventRecorder - ServiceName string - Zone string - Datastore *K8sDatastore -} - -func (c *EndpointSliceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - inferencePool, err := c.Datastore.getInferencePool() - if err != nil { - klog.V(logutil.DEFAULT).Infof("Skipping reconciling EndpointSlice because the InferencePool is not available yet: %v", err) - return ctrl.Result{Requeue: true, RequeueAfter: time.Second}, nil - } - - klog.V(logutil.DEFAULT).Info("Reconciling EndpointSlice ", req.NamespacedName) - - endpointSlice := &discoveryv1.EndpointSlice{} - if err := c.Get(ctx, req.NamespacedName, endpointSlice); err != nil { - klog.Errorf("Unable to get EndpointSlice: %v", err) - return ctrl.Result{}, err - } - c.updateDatastore(endpointSlice, inferencePool) - - return ctrl.Result{}, nil -} - 
-// TODO: Support multiple endpointslices for a single service -func (c *EndpointSliceReconciler) updateDatastore( - slice *discoveryv1.EndpointSlice, - inferencePool *v1alpha1.InferencePool) { - podMap := make(map[Pod]bool) - - for _, endpoint := range slice.Endpoints { - klog.V(logutil.DEFAULT).Infof("Zone: %v \n endpoint: %+v \n", c.Zone, endpoint) - if c.validPod(endpoint) { - pod := Pod{ - Name: endpoint.TargetRef.Name, - Address: endpoint.Addresses[0] + ":" + strconv.Itoa(int(inferencePool.Spec.TargetPortNumber)), - } - podMap[pod] = true - klog.V(logutil.DEFAULT).Infof("Storing pod %v", pod) - c.Datastore.pods.Store(pod, true) - } - } - - removeOldPods := func(k, v any) bool { - pod, ok := k.(Pod) - if !ok { - klog.Errorf("Unable to cast key to Pod: %v", k) - return false - } - if _, ok := podMap[pod]; !ok { - klog.V(logutil.DEFAULT).Infof("Removing pod %v", pod) - c.Datastore.pods.Delete(pod) - } - return true - } - c.Datastore.pods.Range(removeOldPods) -} - -func (c *EndpointSliceReconciler) SetupWithManager(mgr ctrl.Manager) error { - ownsEndPointSlice := func(object client.Object) bool { - // Check if the object is an EndpointSlice - endpointSlice, ok := object.(*discoveryv1.EndpointSlice) - if !ok { - return false - } - - gotLabel := endpointSlice.ObjectMeta.Labels[serviceOwnerLabel] - wantLabel := c.ServiceName - return gotLabel == wantLabel - } - - return ctrl.NewControllerManagedBy(mgr). - For(&discoveryv1.EndpointSlice{}, - builder.WithPredicates(predicate.NewPredicateFuncs(ownsEndPointSlice))). 
- Complete(c) -} - -func (c *EndpointSliceReconciler) validPod(endpoint discoveryv1.Endpoint) bool { - validZone := c.Zone == "" || c.Zone != "" && *endpoint.Zone == c.Zone - return validZone && *endpoint.Conditions.Ready - -} diff --git a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go b/pkg/ext-proc/backend/endpointslice_reconcilier_test.go deleted file mode 100644 index 9a3d55d8..00000000 --- a/pkg/ext-proc/backend/endpointslice_reconcilier_test.go +++ /dev/null @@ -1,202 +0,0 @@ -package backend - -import ( - "sync" - "testing" - - v1 "k8s.io/api/core/v1" - discoveryv1 "k8s.io/api/discovery/v1" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -var ( - basePod1 = Pod{Name: "pod1"} - basePod2 = Pod{Name: "pod2"} - basePod3 = Pod{Name: "pod3"} -) - -func TestUpdateDatastore_EndpointSliceReconciler(t *testing.T) { - tests := []struct { - name string - datastore *K8sDatastore - incomingSlice *discoveryv1.EndpointSlice - wantPods *sync.Map - }{ - { - name: "Add new pod", - datastore: &K8sDatastore{ - pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - }, - }, - }, - incomingSlice: &discoveryv1.EndpointSlice{ - Endpoints: []discoveryv1.Endpoint{ - { - TargetRef: &v1.ObjectReference{ - Name: "pod1", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod2", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod3", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - }, - }, - wantPods: populateMap(basePod1, basePod2, basePod3), - }, - { - name: "New pod, but its not ready yet. 
Do not add.", - datastore: &K8sDatastore{ - pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - }, - }, - }, - incomingSlice: &discoveryv1.EndpointSlice{ - Endpoints: []discoveryv1.Endpoint{ - { - TargetRef: &v1.ObjectReference{ - Name: "pod1", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod2", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod3", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: new(bool), - }, - Addresses: []string{"0.0.0.0"}, - }, - }, - }, - wantPods: populateMap(basePod1, basePod2), - }, - { - name: "Existing pod not ready, new pod added, and is ready", - datastore: &K8sDatastore{ - pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - }, - }, - }, - incomingSlice: &discoveryv1.EndpointSlice{ - Endpoints: []discoveryv1.Endpoint{ - { - TargetRef: &v1.ObjectReference{ - Name: "pod1", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: new(bool), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod2", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - { - TargetRef: &v1.ObjectReference{ - Name: "pod3", - }, - Zone: new(string), - Conditions: discoveryv1.EndpointConditions{ - Ready: truePointer(), - }, - Addresses: []string{"0.0.0.0"}, - }, - }, - }, - wantPods: populateMap(basePod3, basePod2), - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - 
endpointSliceReconciler := &EndpointSliceReconciler{Datastore: test.datastore, Zone: ""} - endpointSliceReconciler.updateDatastore(test.incomingSlice, test.datastore.inferencePool) - - if mapsEqual(endpointSliceReconciler.Datastore.pods, test.wantPods) { - t.Errorf("Unexpected output pod mismatch. \n Got %v \n Want: %v \n", - endpointSliceReconciler.Datastore.pods, - test.wantPods) - } - }) - } -} - -func mapsEqual(map1, map2 *sync.Map) bool { - equal := true - - map1.Range(func(k, v any) bool { - if _, ok := map2.Load(k); !ok { - equal = false - return false - } - return true - }) - map2.Range(func(k, v any) bool { - if _, ok := map1.Load(k); !ok { - equal = false - return false - } - return true - }) - - return equal -} - -func truePointer() *bool { - primitivePointersAreSilly := true - return &primitivePointersAreSilly -} diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 415358b2..c5ef8d14 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -309,3 +309,24 @@ func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { } return returnVal } + +func mapsEqual(map1, map2 *sync.Map) bool { + equal := true + + map1.Range(func(k, v any) bool { + if _, ok := map2.Load(k); !ok { + equal = false + return false + } + return true + }) + map2.Range(func(k, v any) bool { + if _, ok := map1.Load(k); !ok { + equal = false + return false + } + return true + }) + + return equal +} diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index b4cba202..fd15ebc3 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -2,6 +2,7 @@ package backend import ( "context" + "reflect" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -21,7 +22,6 @@ type InferencePoolReconciler 
struct { Record record.EventRecorder PoolNamespacedName types.NamespacedName Datastore *K8sDatastore - Zone string } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -32,11 +32,15 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques serverPool := &v1alpha1.InferencePool{} if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { - klog.Error(err, "unable to get InferencePool") + klog.Error(err, ": unable to get InferencePool") return ctrl.Result{}, err } - - c.updateDatastore(serverPool) + if c.Datastore.inferencePool == nil || !reflect.DeepEqual(serverPool.Spec.Selector, c.Datastore.inferencePool.Spec.Selector) { + c.updateDatastore(serverPool) + c.Datastore.flushPodsAndRefetch(ctx, c.Client, serverPool) + } else { + c.updateDatastore(serverPool) + } return ctrl.Result{}, nil } diff --git a/pkg/ext-proc/backend/pod_reconciler.go b/pkg/ext-proc/backend/pod_reconciler.go new file mode 100644 index 00000000..60d014ce --- /dev/null +++ b/pkg/ext-proc/backend/pod_reconciler.go @@ -0,0 +1,80 @@ +package backend + +import ( + "context" + "strconv" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" +) + +type PodReconciler struct { + client.Client + Datastore *K8sDatastore + Scheme *runtime.Scheme + Record record.EventRecorder +} + +func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + inferencePool, err := c.Datastore.getInferencePool() + if err != nil { + klog.V(logutil.DEFAULT).Infof("Skipping reconciling Pod because the InferencePool is not available yet: %v", err) + // When the 
inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. + return ctrl.Result{}, nil + } else if inferencePool.Namespace != req.Namespace { + return ctrl.Result{}, nil + } + + klog.V(logutil.VERBOSE).Info("reconciling Pod", req.NamespacedName) + + pod := &corev1.Pod{} + if err := c.Get(ctx, req.NamespacedName, pod); err != nil { + klog.Error(err, ": unable to get pod") + if apierrors.IsNotFound(err) { + c.Datastore.pods.Delete(pod) + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + c.updateDatastore(pod, inferencePool) + + return ctrl.Result{}, nil +} + +func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Pod{}). + Complete(c) +} + +func (c *PodReconciler) updateDatastore(k8sPod *corev1.Pod, inferencePool *v1alpha1.InferencePool) { + pod := Pod{ + Name: k8sPod.Name, + Address: k8sPod.Status.PodIP + ":" + strconv.Itoa(int(inferencePool.Spec.TargetPortNumber)), + } + if !k8sPod.DeletionTimestamp.IsZero() || !c.Datastore.LabelsMatch(k8sPod.ObjectMeta.Labels) || !podIsReady(k8sPod) { + c.Datastore.pods.Delete(pod) + } else { + c.Datastore.pods.Store(pod, true) + } +} + +func podIsReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + if condition.Status == corev1.ConditionTrue { + return true + } + break + } + } + return false +} diff --git a/pkg/ext-proc/backend/pod_reconciler_test.go b/pkg/ext-proc/backend/pod_reconciler_test.go new file mode 100644 index 00000000..42d6d8e4 --- /dev/null +++ b/pkg/ext-proc/backend/pod_reconciler_test.go @@ -0,0 +1,168 @@ +package backend + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" +) + +var ( + basePod1 = Pod{Name: "pod1", Address: 
":8000"} + basePod2 = Pod{Name: "pod2", Address: ":8000"} + basePod3 = Pod{Name: "pod3", Address: ":8000"} +) + +func TestUpdateDatastore_PodReconciler(t *testing.T) { + tests := []struct { + name string + datastore *K8sDatastore + incomingPod *corev1.Pod + wantPods []string + }{ + { + name: "Add new pod", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + Labels: map[string]string{ + "some-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + wantPods: []string{basePod1.Name, basePod2.Name, basePod3.Name}, + }, + { + name: "New pod, not ready, valid selector", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod3", + Labels: map[string]string{ + "some-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + }, + }, + }, + }, + wantPods: []string{basePod1.Name, basePod2.Name}, + }, + { + name: "Remove pod that does not match selector", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + 
ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Labels: map[string]string{ + "some-wrong-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + wantPods: []string{basePod2.Name}, + }, + { + name: "Remove pod that is not ready", + datastore: &K8sDatastore{ + pods: populateMap(basePod1, basePod2), + inferencePool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Labels: map[string]string{ + "some-wrong-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionFalse, + }, + }, + }, + }, + wantPods: []string{basePod2.Name}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + podReconciler := &PodReconciler{Datastore: test.datastore} + podReconciler.updateDatastore(test.incomingPod, test.datastore.inferencePool) + var gotPods []string + test.datastore.pods.Range(func(k, v any) bool { + pod := k.(Pod) + if v != nil { + gotPods = append(gotPods, pod.Name) + } + return true + }) + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { + t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) + } + }) + } +} diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index d51435ac..30b87299 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -59,14 +59,6 @@ var ( "poolNamespace", runserver.DefaultPoolNamespace, "Namespace of the InferencePool this Endpoint Picker is associated with.") - serviceName = flag.String( - "serviceName", - runserver.DefaultServiceName, - "Name of the Service that will be used to read EndpointSlices from") - zone = flag.String( - 
"zone", - runserver.DefaultZone, - "The zone that this instance is created in. Will be passed to the corresponding endpointSlice. ") refreshPodsInterval = flag.Duration( "refreshPodsInterval", runserver.DefaultRefreshPodsInterval, @@ -128,8 +120,6 @@ func run() error { TargetEndpointKey: *targetEndpointKey, PoolName: *poolName, PoolNamespace: *poolNamespace, - ServiceName: *serviceName, - Zone: *zone, RefreshPodsInterval: *refreshPodsInterval, RefreshMetricsInterval: *refreshMetricsInterval, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, @@ -252,9 +242,5 @@ func validateFlags() error { return fmt.Errorf("required %q flag not set", "poolName") } - if *serviceName == "" { - return fmt.Errorf("required %q flag not set", "serviceName") - } - return nil } diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 71499e8f..d7d4c71a 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -26,8 +26,6 @@ type ExtProcServerRunner struct { TargetEndpointKey string PoolName string PoolNamespace string - ServiceName string - Zone string RefreshPodsInterval time.Duration RefreshMetricsInterval time.Duration RefreshPrometheusMetricsInterval time.Duration @@ -43,8 +41,6 @@ const ( DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey DefaultPoolName = "" // required but no default DefaultPoolNamespace = "default" // default for --poolNamespace - DefaultServiceName = "" // required but no default - DefaultZone = "" // default for --zone DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval @@ -56,8 +52,6 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { TargetEndpointKey: DefaultTargetEndpointKey, PoolName: 
DefaultPoolName, PoolNamespace: DefaultPoolNamespace, - ServiceName: DefaultServiceName, - Zone: DefaultZone, RefreshPodsInterval: DefaultRefreshPodsInterval, RefreshMetricsInterval: DefaultRefreshMetricsInterval, RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, @@ -101,13 +95,11 @@ func (r *ExtProcServerRunner) Setup() error { return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } - if err := (&backend.EndpointSliceReconciler{ - Datastore: r.Datastore, - Scheme: mgr.GetScheme(), - Client: mgr.GetClient(), - Record: mgr.GetEventRecorderFor("endpointslice"), - ServiceName: r.ServiceName, - Zone: r.Zone, + if err := (&backend.PodReconciler{ + Datastore: r.Datastore, + Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Record: mgr.GetEventRecorderFor("pod"), }).SetupWithManager(mgr); err != nil { return fmt.Errorf("failed setting up EndpointSliceReconciler: %v", err) } diff --git a/pkg/manifests/ext_proc.yaml b/pkg/manifests/ext_proc.yaml index a7dc7678..49145d24 100644 --- a/pkg/manifests/ext_proc.yaml +++ b/pkg/manifests/ext_proc.yaml @@ -78,8 +78,6 @@ spec: - "vllm-llama2-7b-pool" - -v - "3" - - -serviceName - - "vllm-llama2-7b-pool" - -grpcPort - "9002" - -grpcHealthPort From 5bc8fcdd64de5a9125e028c1763596c49d4084bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Thu, 13 Feb 2025 20:20:21 +0100 Subject: [PATCH 19/96] Move manager from runserver to main (#331) The manager setup logic is now moved to main. runserver package does not manage the manager any more. This establishes a clear separation of concerns. 
--- pkg/ext-proc/main.go | 34 +++++++++++++++++++---------- pkg/ext-proc/server/runserver.go | 36 +++---------------------------- test/integration/hermetic_test.go | 13 +++++++---- 3 files changed, 35 insertions(+), 48 deletions(-) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 30b87299..968d09f5 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -95,11 +95,6 @@ func run() error { flag.Parse() initLogging(&opts) - cfg, err := ctrl.GetConfig() - if err != nil { - klog.ErrorS(err, "Failed to get rest config") - return err - } // Validate flags if err := validateFlags(); err != nil { klog.ErrorS(err, "Failed to validate flags") @@ -115,6 +110,20 @@ func run() error { datastore := backend.NewK8sDataStore() + // Init runtime. + cfg, err := ctrl.GetConfig() + if err != nil { + klog.ErrorS(err, "Failed to get rest config") + return err + } + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) + if err != nil { + klog.ErrorS(err, "Failed to create controller manager", "config", cfg) + return err + } + + // Setup runner. serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, TargetEndpointKey: *targetEndpointKey, @@ -123,15 +132,12 @@ func run() error { RefreshPodsInterval: *refreshPodsInterval, RefreshMetricsInterval: *refreshMetricsInterval, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, - Scheme: scheme, - Config: ctrl.GetConfigOrDie(), Datastore: datastore, } - if err := serverRunner.Setup(); err != nil { + if err := serverRunner.SetupWithManager(mgr); err != nil { klog.ErrorS(err, "Failed to setup ext-proc server") return err } - mgr := serverRunner.Manager // Register health server. if err := registerHealthServer(mgr, datastore, *grpcHealthPort); err != nil { @@ -149,8 +155,14 @@ func run() error { return err } - // Start the manager. - return serverRunner.StartManager(ctrl.SetupSignalHandler()) + // Start the manager. This blocks until a signal is received. 
+ klog.InfoS("Controller manager starting") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + klog.ErrorS(err, "Error starting controller manager") + return err + } + klog.InfoS("Controller manager terminated") + return nil } func initLogging(opts *zap.Options) { diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index d7d4c71a..2d92e412 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -2,15 +2,12 @@ package server import ( "context" - "errors" "fmt" "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/grpc" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/rest" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -29,10 +26,7 @@ type ExtProcServerRunner struct { RefreshPodsInterval time.Duration RefreshMetricsInterval time.Duration RefreshPrometheusMetricsInterval time.Duration - Scheme *runtime.Scheme - Config *rest.Config Datastore *backend.K8sDatastore - Manager ctrl.Manager } // Default values for CLI flags in main @@ -55,19 +49,12 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { RefreshPodsInterval: DefaultRefreshPodsInterval, RefreshMetricsInterval: DefaultRefreshMetricsInterval, RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, - // Scheme, Config, and Datastore can be assigned later. + // Datastore can be assigned later. } } -// Setup creates the reconcilers for pools, models, and endpointSlices and starts the manager. -func (r *ExtProcServerRunner) Setup() error { - // Create a new manager to manage controllers - mgr, err := ctrl.NewManager(r.Config, ctrl.Options{Scheme: r.Scheme}) - if err != nil { - return fmt.Errorf("failed to create controller manager: %w", err) - } - r.Manager = mgr - +// SetupWithManager sets up the runner with the given manager. 
+func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { // Create the controllers and register them with the manager if err := (&backend.InferencePoolReconciler{ Datastore: r.Datastore, @@ -131,20 +118,3 @@ func (r *ExtProcServerRunner) AsRunnable( return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) })) } - -func (r *ExtProcServerRunner) StartManager(ctx context.Context) error { - if r.Manager == nil { - err := errors.New("runner manager is not set") - klog.ErrorS(err, "Runner has no manager setup to run") - return err - } - - // Start the controller manager. Blocking and will return when shutdown is complete. - klog.InfoS("Controller manager starting") - if err := r.Manager.Start(ctx); err != nil { - klog.ErrorS(err, "Error starting controller manager") - return err - } - klog.InfoS("Controller manager terminated") - return nil -} diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 74c9f049..ff018f28 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -468,20 +468,25 @@ func BeforeSuit() { log.Fatalf("No error, but returned kubernetes client is nil, cfg: %v", cfg) } + // Init runtime. 
+ mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) + if err != nil { + klog.ErrorS(err, "Failed to create controller manager") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) + } + serverRunner = runserver.NewDefaultExtProcServerRunner() // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" - serverRunner.Scheme = scheme - serverRunner.Config = cfg serverRunner.Datastore = backend.NewK8sDataStore() - if err := serverRunner.Setup(); err != nil { + if err := serverRunner.SetupWithManager(mgr); err != nil { log.Fatalf("Failed to start server runner: %v", err) } // Start the controller manager in go routine, not blocking go func() { - if err := serverRunner.StartManager(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { log.Fatalf("Failed to start manager: %v", err) } }() From cdf3533105b34f92783c13c1b8a7dced580b15f2 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Thu, 13 Feb 2025 18:20:20 -0500 Subject: [PATCH 20/96] Adds image-load and kind-load Make targets (#288) Signed-off-by: Daneyon Hansen --- Makefile | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 83de8dd1..b7654ed7 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,9 @@ ifdef IMAGE_EXTRA_TAG IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG) endif +# The name of the kind cluster to use for the "kind-load" target. +KIND_CLUSTER ?= kind + ##@ General # The help target prints out all targets with their descriptions organized @@ -132,28 +135,42 @@ verify: vet fmt-verify manifests generate ci-lint # Build the container image .PHONY: image-local-build -image-local-build: +image-local-build: ## Build the EPP image using Docker Buildx for local development. 
BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use) $(MAKE) image-build PUSH=$(PUSH) + $(MAKE) image-build LOAD=$(LOAD) $(DOCKER_BUILDX_CMD) rm $$BUILDER .PHONY: image-local-push -image-local-push: PUSH=--push +image-local-push: PUSH=--push ## Build the EPP image for local development and push it to $IMAGE_REPO. image-local-push: image-local-build +.PHONY: image-local-load +image-local-load: LOAD=--load ## Build the EPP image for local development and load it in the local Docker registry. +image-local-load: image-local-build + .PHONY: image-build -image-build: +image-build: ## Build the EPP image using Docker Buildx. $(IMAGE_BUILD_CMD) -t $(IMAGE_TAG) \ --platform=$(PLATFORMS) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ $(PUSH) \ + $(LOAD) \ $(IMAGE_BUILD_EXTRA_OPTS) ./ .PHONY: image-push -image-push: PUSH=--push +image-push: PUSH=--push ## Build the EPP image and push it to $IMAGE_REPO. image-push: image-build +.PHONY: image-load +image-load: LOAD=--load ## Build the EPP image and load it in the local Docker registry. +image-load: image-build + +.PHONY: image-kind +image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default). + kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER) + ##@ Docs .PHONY: build-docs From ef9b92fbf1fb44c85232ab701aeb6cab42ef4ed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Fri, 14 Feb 2025 00:54:20 +0100 Subject: [PATCH 21/96] Use structured logging (#330) * Use structured logging All logging calls are rewritten to use structured logging. 
* test/integration: Use logutil.Fatal --- pkg/ext-proc/backend/datastore.go | 2 +- pkg/ext-proc/backend/fake.go | 3 +- .../backend/inferencemodel_reconciler.go | 18 +++++---- .../backend/inferencepool_reconciler.go | 8 ++-- pkg/ext-proc/backend/provider.go | 19 +++++----- pkg/ext-proc/backend/vllm/metrics.go | 14 +++---- pkg/ext-proc/handlers/request.go | 22 +++++------ pkg/ext-proc/handlers/response.go | 8 ++-- pkg/ext-proc/handlers/server.go | 20 +++++----- pkg/ext-proc/health.go | 5 ++- pkg/ext-proc/main.go | 6 +-- pkg/ext-proc/metrics/metrics.go | 12 ++++-- pkg/ext-proc/scheduling/filter.go | 6 +-- pkg/ext-proc/scheduling/scheduler.go | 7 ++-- pkg/ext-proc/test/benchmark/benchmark.go | 15 ++++++-- pkg/ext-proc/test/utils.go | 11 +++--- pkg/ext-proc/util/logging/fatal.go | 11 ++++++ test/integration/hermetic_test.go | 37 +++++++++---------- 18 files changed, 128 insertions(+), 96 deletions(-) create mode 100644 pkg/ext-proc/util/logging/fatal.go diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index be3c7f0b..a54833bc 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -98,7 +98,7 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string { for _, model := range model.Spec.TargetModels { weights += *model.Weight } - klog.V(logutil.VERBOSE).Infof("Weights for Model(%v) total to: %v", model.Name, weights) + klog.V(logutil.VERBOSE).InfoS("Weights for model computed", "model", model.Name, "weights", weights) randomVal := r.Int31n(weights) for _, model := range model.Spec.TargetModels { if randomVal < *model.Weight { diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 8c028b77..7ab8a464 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -5,6 +5,7 @@ import ( klog "k8s.io/klog/v2" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) 
type FakePodMetricsClient struct { @@ -16,7 +17,7 @@ func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existi if err, ok := f.Err[pod]; ok { return nil, err } - klog.V(1).Infof("pod: %+v\n existing: %+v \n new: %+v \n", pod, existing, f.Res[pod]) + klog.V(logutil.VERBOSE).InfoS("Fetching metrics for pod", "pod", pod, "existing", existing, "new", f.Res[pod]) return f.Res[pod], nil } diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index f0a13941..72ea063e 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -26,19 +26,21 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque if req.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } - klog.V(1).Infof("Reconciling InferenceModel %v", req.NamespacedName) + + klogV := klog.V(logutil.DEFAULT) + klogV.InfoS("Reconciling InferenceModel", "name", req.NamespacedName) infModel := &v1alpha1.InferenceModel{} if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { if errors.IsNotFound(err) { - klog.V(1).Infof("InferenceModel %v not found. Removing from datastore since object must be deleted", req.NamespacedName) + klogV.InfoS("InferenceModel not found. Removing from datastore since object must be deleted", "name", req.NamespacedName) c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) return ctrl.Result{}, nil } - klog.Error(err, "Unable to get InferenceModel") + klogV.ErrorS(err, "Unable to get InferenceModel", "name", req.NamespacedName) return ctrl.Result{}, err } else if !infModel.DeletionTimestamp.IsZero() { - klog.V(1).Infof("InferenceModel %v is marked for deletion. Removing from datastore", req.NamespacedName) + klogV.InfoS("InferenceModel is marked for deletion. 
Removing from datastore", "name", req.NamespacedName) c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) return ctrl.Result{}, nil } @@ -48,13 +50,15 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque } func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { + klogV := klog.V(logutil.DEFAULT) + if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { - klog.V(1).Infof("Incoming pool ref %v, server pool name: %v", infModel.Spec.PoolRef, c.PoolNamespacedName.Name) - klog.V(1).Infof("Adding/Updating InferenceModel: %v", infModel.Spec.ModelName) + klogV.InfoS("Updating datastore", "poolRef", infModel.Spec.PoolRef, "serverPoolName", c.PoolNamespacedName) + klogV.InfoS("Adding/Updating InferenceModel", "modelName", infModel.Spec.ModelName) c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) return } - klog.V(logutil.DEFAULT).Infof("Removing/Not adding InferenceModel: %v", infModel.Spec.ModelName) + klogV.InfoS("Removing/Not adding InferenceModel", "modelName", infModel.Spec.ModelName) // If we get here. The model is not relevant to this pool, remove. 
c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) } diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index fd15ebc3..9504b4e0 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -11,6 +11,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) // InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources @@ -28,11 +29,12 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques if req.NamespacedName.Name != c.PoolNamespacedName.Name || req.NamespacedName.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } - klog.V(1).Info("reconciling InferencePool", req.NamespacedName) + klogV := klog.V(logutil.DEFAULT) + klogV.InfoS("Reconciling InferencePool", "name", req.NamespacedName) serverPool := &v1alpha1.InferencePool{} if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { - klog.Error(err, ": unable to get InferencePool") + klogV.ErrorS(err, "Unable to get InferencePool", "name", req.NamespacedName) return ctrl.Result{}, err } if c.Datastore.inferencePool == nil || !reflect.DeepEqual(serverPool.Spec.Selector, c.Datastore.inferencePool.Spec.Selector) { @@ -49,7 +51,7 @@ func (c *InferencePoolReconciler) updateDatastore(serverPool *v1alpha1.Inference pool, _ := c.Datastore.getInferencePool() if pool == nil || serverPool.ObjectMeta.ResourceVersion != pool.ObjectMeta.ResourceVersion { - klog.Infof("Updating inference pool to %v/%v", serverPool.ObjectMeta.Namespace, serverPool.ObjectMeta.Name) + klog.V(logutil.DEFAULT).InfoS("Updating inference pool", "target", klog.KMetadata(&serverPool.ObjectMeta)) c.Datastore.setInferencePool(serverPool) } } diff --git 
a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index 68043d93..d64b80b3 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -63,10 +63,10 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshProm p.refreshPodsOnce() if err := p.refreshMetricsOnce(); err != nil { - klog.Errorf("Failed to init metrics: %v", err) + klog.ErrorS(err, "Failed to init metrics") } - klog.Infof("Initialized pods and metrics: %+v", p.AllPodMetrics()) + klog.InfoS("Initialized pods and metrics", "metrics", p.AllPodMetrics()) // periodically refresh pods go func() { @@ -81,7 +81,7 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshProm for { time.Sleep(refreshMetricsInterval) if err := p.refreshMetricsOnce(); err != nil { - klog.V(logutil.TRACE).Infof("Failed to refresh metrics: %v", err) + klog.V(logutil.TRACE).ErrorS(err, "Failed to refresh metrics") } } }() @@ -95,11 +95,11 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshProm }() // Periodically print out the pods and metrics for DEBUGGING. 
- if klog.V(logutil.DEBUG).Enabled() { + if klogV := klog.V(logutil.DEBUG); klogV.Enabled() { go func() { for { time.Sleep(5 * time.Second) - klog.Infof("===DEBUG: Current Pods and metrics: %+v", p.AllPodMetrics()) + klogV.InfoS("Current Pods and metrics gathered", "metrics", p.AllPodMetrics()) } }() } @@ -138,18 +138,19 @@ func (p *Provider) refreshPodsOnce() { } func (p *Provider) refreshMetricsOnce() error { + klogV := klog.V(logutil.TRACE) ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) defer cancel() start := time.Now() defer func() { d := time.Since(start) // TODO: add a metric instead of logging - klog.V(logutil.TRACE).Infof("Refreshed metrics in %v", d) + klogV.InfoS("Metrics refreshed", "duration", d) }() var wg sync.WaitGroup errCh := make(chan error) processOnePod := func(key, value any) bool { - klog.V(logutil.TRACE).Infof("Processing pod %v and metric %v", key, value) + klogV.InfoS("Pod and metric being processed", "pod", key, "metric", value) pod := key.(Pod) existing := value.(*PodMetrics) wg.Add(1) @@ -161,7 +162,7 @@ func (p *Provider) refreshMetricsOnce() error { return } p.UpdatePodMetrics(pod, updated) - klog.V(logutil.TRACE).Infof("Updated metrics for pod %s: %v", pod, updated.Metrics) + klogV.InfoS("Updated metrics for pod", "pod", pod, "metrics", updated.Metrics) }() return true } @@ -185,7 +186,7 @@ func (p *Provider) refreshMetricsOnce() error { } func (p *Provider) flushPrometheusMetricsOnce() { - klog.V(logutil.DEBUG).Infof("Flushing Prometheus Metrics") + klog.V(logutil.DEBUG).InfoS("Flushing Prometheus Metrics") pool, _ := p.datastore.getInferencePool() if pool == nil { diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index e3693960..4c3804ce 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -32,8 +32,7 @@ const ( KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" ) -type PodMetricsClientImpl struct { 
-} +type PodMetricsClientImpl struct{} // FetchMetrics fetches metrics from a given pod. func (p *PodMetricsClientImpl) FetchMetrics( @@ -46,11 +45,12 @@ func (p *PodMetricsClientImpl) FetchMetrics( url := fmt.Sprintf("http://%s/metrics", pod.Address) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { + klog.V(logutil.DEFAULT).ErrorS(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - klog.Errorf("failed to fetch metrics from %s: %v", pod, err) + klog.V(logutil.DEFAULT).ErrorS(err, "Failed to fetch metrics", "pod", pod) return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err) } defer func() { @@ -58,7 +58,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( }() if resp.StatusCode != http.StatusOK { - klog.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode) + klog.V(logutil.DEFAULT).ErrorS(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode) return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode) } @@ -138,7 +138,7 @@ func promToPodMetrics( func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] if !ok { - klog.Warningf("metric family %q not found", LoraRequestInfoMetricName) + klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } var latestTs float64 @@ -157,7 +157,7 @@ func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metr func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { mf, ok := metricFamilies[metricName] if !ok { - klog.Warningf("metric family %q not found", metricName) + 
klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", metricName) return nil, fmt.Errorf("metric family %q not found", metricName) } if len(mf.GetMetric()) == 0 { @@ -171,6 +171,6 @@ func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName str latest = m } } - klog.V(logutil.TRACE).Infof("Got metric value %+v for metric %v", latest, metricName) + klog.V(logutil.TRACE).InfoS("Metric value selected", "value", latest, "metric", metricName) return latest, nil } diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 17278025..a36f7ae3 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -19,23 +19,24 @@ import ( // parameter. // Envoy sends the request body to ext proc before sending the request to the backend server. func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).Infof("Handling request body") + klogV := klog.V(logutil.VERBOSE) + klogV.InfoS("Handling request body") // Unmarshal request body (must be JSON). v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) var rb map[string]interface{} if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { - klog.Errorf("Error unmarshaling request body: %v", err) + klog.V(logutil.DEFAULT).ErrorS(err, "Error unmarshaling request body") return nil, fmt.Errorf("error unmarshaling request body: %v", err) } - klog.V(logutil.VERBOSE).Infof("Request body: %v", rb) + klogV.InfoS("Request body unmarshalled", "body", rb) // Resolve target models. model, ok := rb["model"].(string) if !ok { return nil, errors.New("model not found in request") } - klog.V(logutil.VERBOSE).Infof("Model requested: %v", model) + klogV.InfoS("Model requested", "model", model) modelName := model // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. 
@@ -56,7 +57,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces ResolvedTargetModel: modelName, Critical: backend.IsCritical(modelObj), } - klog.V(logutil.VERBOSE).Infof("LLM Request: %+v", llmReq) + klogV.InfoS("LLM request assembled", "request", llmReq) requestBody := v.RequestBody.Body var err error @@ -65,17 +66,17 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces rb["model"] = llmReq.ResolvedTargetModel requestBody, err = json.Marshal(rb) if err != nil { - klog.Errorf("Error marshaling request body: %v", err) + klog.V(logutil.DEFAULT).ErrorS(err, "Error marshaling request body") return nil, fmt.Errorf("error marshaling request body: %v", err) } - klog.V(logutil.VERBOSE).Infof("Updated body: %v", string(requestBody)) + klogV.InfoS("Updated request body marshalled", "body", string(requestBody)) } targetPod, err := s.scheduler.Schedule(llmReq) if err != nil { return nil, fmt.Errorf("failed to find target pod: %w", err) } - klog.V(logutil.VERBOSE).Infof("Selected target model %v in target pod: %v\n", llmReq.ResolvedTargetModel, targetPod) + klogV.InfoS("Target model and pod selected", "model", llmReq.ResolvedTargetModel, "pod", targetPod) reqCtx.Model = llmReq.Model reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel @@ -101,7 +102,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces } // Print headers for debugging for _, header := range headers { - klog.V(logutil.VERBOSE).Infof("[request_body] Header Key: %s, Header Value: %s\n", header.Header.Key, header.Header.RawValue) + klog.V(logutil.DEBUG).InfoS("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) } resp := &extProcPb.ProcessingResponse{ @@ -136,10 +137,9 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces } func HandleRequestHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) *extProcPb.ProcessingResponse { - 
klog.V(logutil.VERBOSE).Info("Handling request headers ...") r := req.Request h := r.(*extProcPb.ProcessingRequest_RequestHeaders) - klog.V(logutil.VERBOSE).Infof("Headers: %+v\n", h) + klog.V(logutil.VERBOSE).InfoS("Handling request headers", "headers", h) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_RequestHeaders{ diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index 34a7219a..012b0b8d 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -12,9 +12,9 @@ import ( // HandleResponseHeaders processes response headers from the backend model server. func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).Info("Processing ResponseHeaders") + klog.V(logutil.VERBOSE).InfoS("Processing ResponseHeaders") h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders) - klog.V(logutil.VERBOSE).Infof("Headers before: %+v\n", h) + klog.V(logutil.VERBOSE).InfoS("Headers before", "headers", h) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseHeaders{ @@ -66,7 +66,7 @@ func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.Pr } }*/ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).Info("Processing HandleResponseBody") + klog.V(logutil.VERBOSE).InfoS("Processing HandleResponseBody") body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody) res := Response{} @@ -81,7 +81,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) // will add the processing for streaming case. 
reqCtx.ResponseComplete = true - klog.V(logutil.VERBOSE).Infof("Response: %+v", res) + klog.V(logutil.VERBOSE).InfoS("Response generated", "response", res) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseBody{ diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index f27c9a15..a3cfcada 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -51,7 +51,7 @@ type ModelDataStore interface { } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { - klog.V(logutil.VERBOSE).Info("Processing") + klog.V(logutil.VERBOSE).InfoS("Processing") ctx := srv.Context() // Create request context to share states during life time of an HTTP request. // See https://github.com/envoyproxy/envoy/issues/17540. @@ -71,7 +71,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { if err != nil { // This error occurs very frequently, though it doesn't seem to have any impact. // TODO Figure out if we can remove this noise. 
- klog.V(logutil.VERBOSE).Infof("cannot receive stream request: %v", err) + klog.V(logutil.VERBOSE).ErrorS(err, "Cannot receive stream request") return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) } @@ -80,17 +80,17 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { case *extProcPb.ProcessingRequest_RequestHeaders: reqCtx.RequestReceivedTimestamp = time.Now() resp = HandleRequestHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).Infof("Request context after HandleRequestHeaders: %+v", reqCtx) + klog.V(logutil.VERBOSE).InfoS("Request context after HandleRequestHeaders", "context", reqCtx) case *extProcPb.ProcessingRequest_RequestBody: resp, err = s.HandleRequestBody(reqCtx, req) if err == nil { metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) } - klog.V(logutil.VERBOSE).Infof("Request context after HandleRequestBody: %+v", reqCtx) + klog.V(logutil.VERBOSE).InfoS("Request context after HandleRequestBody", "context", reqCtx) case *extProcPb.ProcessingRequest_ResponseHeaders: resp, err = s.HandleResponseHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).Infof("Request context after HandleResponseHeaders: %+v", reqCtx) + klog.V(logutil.VERBOSE).InfoS("Request context after HandleResponseHeaders", "context", reqCtx) case *extProcPb.ProcessingRequest_ResponseBody: resp, err = s.HandleResponseBody(reqCtx, req) if err == nil && reqCtx.ResponseComplete { @@ -100,13 +100,13 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens) metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens) } - klog.V(logutil.VERBOSE).Infof("Request context after HandleResponseBody: %+v", reqCtx) + klog.V(logutil.VERBOSE).InfoS("Request context after 
HandleResponseBody", "context", reqCtx) default: - klog.Errorf("Unknown Request type %+v", v) + klog.V(logutil.DEFAULT).ErrorS(nil, "Unknown Request type", "request", v) return status.Error(codes.Unknown, "unknown request type") } if err != nil { - klog.Errorf("failed to process request: %v", err) + klog.V(logutil.DEFAULT).ErrorS(err, "Failed to process request", "request", req) switch status.Code(err) { // This code can be returned by scheduler when there is no capacity for sheddable // requests. @@ -125,9 +125,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { } } - klog.V(logutil.VERBOSE).Infof("response: %v", resp) + klog.V(logutil.VERBOSE).InfoS("Response generated", "response", resp) if err := srv.Send(resp); err != nil { - klog.Errorf("send error %v", err) + klog.V(logutil.DEFAULT).ErrorS(err, "Send failed") return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index 764992b2..aabb150d 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -8,6 +8,7 @@ import ( "google.golang.org/grpc/status" klog "k8s.io/klog/v2" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type healthServer struct { @@ -16,10 +17,10 @@ type healthServer struct { func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { if !s.datastore.HasSynced() { - klog.Infof("gRPC health check not serving: %s", in.String()) + klog.V(logutil.VERBOSE).InfoS("gRPC health check not serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil } - klog.Infof("gRPC health check serving: %s", in.String()) + klog.V(logutil.VERBOSE).InfoS("gRPC health check serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: 
healthPb.HealthCheckResponse_SERVING}, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 968d09f5..06c77af3 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -102,11 +102,11 @@ func run() error { } // Print all flag values - flags := "Flags: " + flags := make(map[string]any) flag.VisitAll(func(f *flag.Flag) { - flags += fmt.Sprintf("%s=%v; ", f.Name, f.Value) + flags[f.Name] = f.Value }) - klog.Info(flags) + klog.InfoS("Flags processed", "flags", flags) datastore := backend.NewK8sDataStore() diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index 7bdc8436..1412af6e 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -7,6 +7,7 @@ import ( compbasemetrics "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" klog "k8s.io/klog/v2" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( @@ -31,8 +32,10 @@ var ( Subsystem: InferenceModelComponent, Name: "request_duration_seconds", Help: "Inference model response latency distribution in seconds for each model and target model.", - Buckets: []float64{0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, - 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600}, + Buckets: []float64{ + 0.005, 0.025, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.25, 1.5, 2, 3, + 4, 5, 6, 8, 10, 15, 20, 30, 45, 60, 120, 180, 240, 300, 360, 480, 600, 900, 1200, 1800, 2700, 3600, + }, StabilityLevel: compbasemetrics.ALPHA, }, []string{"model_name", "target_model_name"}, @@ -140,10 +143,11 @@ func RecordRequestSizes(modelName, targetModelName string, reqSize int) { requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) } -// RecordRequstLatencies records duration of request. +// RecordRequestLatencies records duration of request. 
func RecordRequestLatencies(modelName, targetModelName string, received time.Time, complete time.Time) bool { if !complete.After(received) { - klog.Errorf("request latency value error for model name %v, target model name %v: complete time %v is before received time %v", modelName, targetModelName, complete, received) + klog.V(logutil.DEFAULT).ErrorS(nil, "Request latency values are invalid", + "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received) return false } elapsedSeconds := complete.Sub(received).Seconds() diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index fc016882..ac7a287c 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -42,7 +42,7 @@ func (f *filter) Name() string { } func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - klog.V(logutil.VERBOSE).Infof("Running filter %q on request %v with %v pods", f.name, req, len(pods)) + klog.V(logutil.VERBOSE).InfoS("Running a filter", "name", f.Name(), "request", req, "podCount", len(pods)) filtered, err := f.filter(req, pods) @@ -55,7 +55,7 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend if f.nextOnSuccess != nil { next = f.nextOnSuccess } - klog.V(logutil.VERBOSE).Infof("onSuccess %q -> %q, filtered: %v", f.name, next.Name(), len(filtered)) + klog.V(logutil.VERBOSE).InfoS("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) // On success, pass the filtered result to the next filter. 
return next.Filter(req, filtered) } else { @@ -66,7 +66,7 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend if f.nextOnFailure != nil { next = f.nextOnFailure } - klog.V(logutil.VERBOSE).Infof("onFailure %q -> %q", f.name, next.Name()) + klog.V(logutil.VERBOSE).InfoS("Filter failed", "filter", f.Name(), "next", next.Name()) // On failure, pass the initial set of pods to the next filter. return next.Filter(req, pods) } diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index ca896c5a..50564898 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -83,7 +83,7 @@ var ( nextOnFailure: &filter{ name: "drop request", filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - klog.Infof("Dropping request %v", req) + klog.V(logutil.DEFAULT).InfoS("Request dropped", "request", req) return []*backend.PodMetrics{}, status.Errorf( codes.ResourceExhausted, "dropping request due to limited backend resources") }, @@ -92,7 +92,6 @@ var ( ) func NewScheduler(pmp PodMetricsProvider) *Scheduler { - return &Scheduler{ podMetricsProvider: pmp, filter: defaultFilter, @@ -112,13 +111,13 @@ type PodMetricsProvider interface { // Schedule finds the target pod based on metrics and the requested lora adapter. 
func (s *Scheduler) Schedule(req *LLMRequest) (targetPod backend.Pod, err error) { - klog.V(logutil.VERBOSE).Infof("request: %v; metrics: %+v", req, s.podMetricsProvider.AllPodMetrics()) + klog.V(logutil.VERBOSE).InfoS("Scheduling a request", "request", req, "metrics", s.podMetricsProvider.AllPodMetrics()) pods, err := s.filter.Filter(req, s.podMetricsProvider.AllPodMetrics()) if err != nil || len(pods) == 0 { return backend.Pod{}, fmt.Errorf( "failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } - klog.V(logutil.VERBOSE).Infof("Going to randomly select a pod from the candidates: %+v", pods) + klog.V(logutil.VERBOSE).InfoS("Selecting a random pod from the candidates", "candidatePods", pods) i := rand.Intn(len(pods)) return pods[i].Pod, nil } diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index f18782d6..c83dbcb9 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -15,6 +15,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) var ( @@ -34,13 +35,19 @@ const ( ) func main() { + if err := run(); err != nil { + os.Exit(1) + } +} + +func run() error { klog.InitFlags(nil) flag.Parse() if *localServer { test.StartExtProc(port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) time.Sleep(time.Second) // wait until server is up - klog.Info("Server started") + klog.InfoS("Server started") } report, err := runner.Run( @@ -51,7 +58,8 @@ func main() { runner.WithTotalRequests(uint(*totalRequests)), ) if err != nil { - klog.Fatal(err) + klog.ErrorS(err, "Runner failed") + return err } printer := printer.ReportPrinter{ @@ -60,6 +68,7 @@ 
func main() { } printer.Print("summary") + return nil } func generateRequest(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { @@ -67,7 +76,7 @@ func generateRequest(mtd *desc.MethodDescriptor, callData *runner.CallData) []by req := test.GenerateRequest(modelName(int(callData.RequestNumber) % numModels)) data, err := proto.Marshal(req) if err != nil { - klog.Fatal("marshaling error: ", err) + logutil.Fatal(err, "Failed to marshal request", "request", req) } return data } diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index b91672fa..4c000722 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -14,6 +14,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { @@ -26,7 +27,7 @@ func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refresh pmc := &backend.FakePodMetricsClient{Res: pms} pp := backend.NewProvider(pmc, backend.NewK8sDataStore(backend.WithPods(pods))) if err := pp.Init(refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { - klog.Fatalf("failed to initialize: %v", err) + logutil.Fatal(err, "Failed to initialize") } return startExtProc(port, pp, models) } @@ -35,19 +36,19 @@ func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refresh func startExtProc(port int, pp *backend.Provider, models map[string]*v1alpha1.InferenceModel) *grpc.Server { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) if err != nil { - klog.Fatalf("failed to listen: %v", err) + logutil.Fatal(err, 
"Failed to listen", "port", port) } s := grpc.NewServer() extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(pp, scheduling.NewScheduler(pp), "target-pod", &backend.FakeDataStore{Res: models})) - klog.Infof("Starting gRPC server on port :%v", port) + klog.InfoS("gRPC server starting", "port", port) reflection.Register(s) go func() { err := s.Serve(lis) if err != nil { - klog.Fatalf("Ext-proc failed with the err: %v", err) + logutil.Fatal(err, "Ext-proc failed with the err") } }() return s @@ -63,7 +64,7 @@ func GenerateRequest(model string) *extProcPb.ProcessingRequest { llmReq, err := json.Marshal(j) if err != nil { - klog.Fatal(err) + logutil.Fatal(err, "Failed to unmarshal LLM request") } req := &extProcPb.ProcessingRequest{ Request: &extProcPb.ProcessingRequest_RequestBody{ diff --git a/pkg/ext-proc/util/logging/fatal.go b/pkg/ext-proc/util/logging/fatal.go new file mode 100644 index 00000000..65926824 --- /dev/null +++ b/pkg/ext-proc/util/logging/fatal.go @@ -0,0 +1,11 @@ +package logging + +import "k8s.io/klog/v2" + +// Fatal calls klog.ErrorS followed by klog.FlushAndExit(1). +// +// This is a utility function and should not be used in production code! +func Fatal(err error, msg string, keysAndValues ...interface{}) { + klog.ErrorS(err, msg, keysAndValues...) 
+ klog.FlushAndExit(klog.ExitFlushTimeout, 1) +} diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index ff018f28..13cddfdf 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -9,7 +9,6 @@ import ( "flag" "fmt" "io" - "log" "os" "path/filepath" "testing" @@ -35,6 +34,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" "sigs.k8s.io/yaml" ) @@ -420,7 +420,7 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP if err := serverRunner.AsRunnable( backend.NewK8sDataStore(backend.WithPods(pods)), pmc, ).Start(serverCtx); err != nil { - log.Fatalf("Failed to start ext-proc server: %v", err) + logutil.Fatal(err, "Failed to start ext-proc server") } }() @@ -431,13 +431,13 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP // Create a grpc connection conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { - log.Fatalf("Failed to connect to %v: %v", address, err) + logutil.Fatal(err, "Failed to connect", "address", address) } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) if err != nil { - log.Fatalf("Failed to create client: %v", err) + logutil.Fatal(err, "Failed to create client") } return client, func() { cancel() @@ -455,7 +455,7 @@ func BeforeSuit() { } cfg, err := testEnv.Start() if err != nil { - log.Fatalf("Failed to start test environment, cfg: %v error: %v", cfg, err) + logutil.Fatal(err, "Failed to start test environment", "config", cfg) } utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -463,16 +463,15 @@ func 
BeforeSuit() { k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme}) if err != nil { - log.Fatalf("Failed to start k8s Client: %v", err) + logutil.Fatal(err, "Failed to start k8s Client") } else if k8sClient == nil { - log.Fatalf("No error, but returned kubernetes client is nil, cfg: %v", cfg) + logutil.Fatal(nil, "No error, but returned kubernetes client is nil", "config", cfg) } // Init runtime. mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) if err != nil { - klog.ErrorS(err, "Failed to create controller manager") - klog.FlushAndExit(klog.ExitFlushTimeout, 1) + logutil.Fatal(err, "Failed to create controller manager") } serverRunner = runserver.NewDefaultExtProcServerRunner() @@ -481,17 +480,17 @@ func BeforeSuit() { serverRunner.Datastore = backend.NewK8sDataStore() if err := serverRunner.SetupWithManager(mgr); err != nil { - log.Fatalf("Failed to start server runner: %v", err) + logutil.Fatal(err, "Failed to setup server runner") } // Start the controller manager in go routine, not blocking go func() { if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { - log.Fatalf("Failed to start manager: %v", err) + logutil.Fatal(err, "Failed to start manager") } }() - klog.Info("Setting up hermetic ExtProc server") + klog.InfoS("Setting up hermetic ExtProc server") klog.InitFlags(nil) flag.Parse() // Configure klog verbosity levels to print ext proc logs. 
@@ -501,30 +500,30 @@ func BeforeSuit() { manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") docs, err := readDocuments(manifestsPath) if err != nil { - log.Fatalf("Can't read object manifests at path %v, %v", manifestsPath, err) + logutil.Fatal(err, "Can't read object manifests", "path", manifestsPath) } for _, doc := range docs { inferenceModel := &v1alpha1.InferenceModel{} if err = yaml.Unmarshal(doc, inferenceModel); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) + logutil.Fatal(err, "Can't unmarshal object", "document", doc) } if inferenceModel.Kind == "InferenceModel" { - klog.Infof("Creating inference model: %+v", inferenceModel) + klog.InfoS("Creating inference model", "model", inferenceModel) if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - log.Fatalf("unable to create inferenceModel %v: %v", inferenceModel.Name, err) + logutil.Fatal(err, "Unable to create inferenceModel", "modelName", inferenceModel.Name) } } } for _, doc := range docs { inferencePool := &v1alpha1.InferencePool{} if err = yaml.Unmarshal(doc, inferencePool); err != nil { - log.Fatalf("Can't unmarshal object: %v", doc) + logutil.Fatal(err, "Can't unmarshal object", "document", doc) } if inferencePool.Kind == "InferencePool" { - klog.Infof("Creating inference pool: %+v", inferencePool) + klog.InfoS("Creating inference pool", "pool", inferencePool) if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - log.Fatalf("unable to create inferencePool %v: %v", inferencePool.Name, err) + logutil.Fatal(err, "Unable to create inferencePool", "poolName", inferencePool.Name) } } } From 8233946981074610b26193be2b51d1313820005b Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Fri, 14 Feb 2025 17:58:21 +0000 Subject: [PATCH 22/96] Add TLS support with self-signed certificate. 
(#335) --- pkg/ext-proc/main.go | 10 ++- pkg/ext-proc/server/runserver.go | 82 ++++++++++++++++++++++++- pkg/manifests/gateway/patch_policy.yaml | 14 +++++ test/integration/hermetic_test.go | 1 + test/testdata/envoy.yaml | 11 +++- 5 files changed, 114 insertions(+), 4 deletions(-) diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 06c77af3..8f4cd8e7 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -71,7 +71,13 @@ var ( "refreshPrometheusMetricsInterval", runserver.DefaultRefreshPrometheusMetricsInterval, "interval to flush prometheus metrics") - logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + secureServing = flag.Bool( + "secureServing", runserver.DefaultSecureServing, "Enables secure serving. Defaults to true.") + certPath = flag.String( + "certPath", "", "The path to the certificate for secure serving. The certificate and private key files "+ + "are assumed to be named tls.crt and tls.key, respectively. 
If not set, and secureServing is enabled, "+ + "then a self-signed certificate is used.") scheme = runtime.NewScheme() ) @@ -133,6 +139,8 @@ func run() error { RefreshMetricsInterval: *refreshMetricsInterval, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, Datastore: datastore, + SecureServing: *secureServing, + CertPath: *certPath, } if err := serverRunner.SetupWithManager(mgr); err != nil { klog.ErrorS(err, "Failed to setup ext-proc server") diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 2d92e412..ed260b04 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -2,11 +2,19 @@ package server import ( "context" + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" "fmt" + "math/big" "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/grpc" + "google.golang.org/grpc/credentials" "k8s.io/apimachinery/pkg/types" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" @@ -27,6 +35,8 @@ type ExtProcServerRunner struct { RefreshMetricsInterval time.Duration RefreshPrometheusMetricsInterval time.Duration Datastore *backend.K8sDatastore + SecureServing bool + CertPath string } // Default values for CLI flags in main @@ -38,6 +48,7 @@ const ( DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval + DefaultSecureServing = true // default for --secureServing ) func NewDefaultExtProcServerRunner() *ExtProcServerRunner { @@ -49,6 +60,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { RefreshPodsInterval: DefaultRefreshPodsInterval, RefreshMetricsInterval: DefaultRefreshMetricsInterval, RefreshPrometheusMetricsInterval: 
DefaultRefreshPrometheusMetricsInterval, + SecureServing: DefaultSecureServing, // Datastore can be assigned later. } } @@ -107,8 +119,29 @@ func (r *ExtProcServerRunner) AsRunnable( return err } - // Init the server. - srv := grpc.NewServer() + var srv *grpc.Server + if r.SecureServing { + var cert tls.Certificate + var err error + if r.CertPath != "" { + cert, err = tls.LoadX509KeyPair(r.CertPath+"/tls.crt", r.CertPath+"/tls.key") + } else { + // Create tls based credential. + cert, err = createSelfSignedTLSCertificate() + } + if err != nil { + klog.ErrorS(err, "Failed to create self signed certificate") + return err + } + + creds := credentials.NewTLS(&tls.Config{ + Certificates: []tls.Certificate{cert}, + }) + // Init the server. + srv = grpc.NewServer(grpc.Creds(creds)) + } else { + srv = grpc.NewServer() + } extProcPb.RegisterExternalProcessorServer( srv, handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetEndpointKey, r.Datastore), @@ -118,3 +151,48 @@ func (r *ExtProcServerRunner) AsRunnable( return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) })) } + +func createSelfSignedTLSCertificate() (tls.Certificate, error) { + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + if err != nil { + klog.ErrorS(err, "Failed to create serial number for self-signed cert") + return tls.Certificate{}, err + } + now := time.Now() + notBefore := now.UTC() + template := x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + Organization: []string{"Inference Ext"}, + }, + NotBefore: notBefore, + NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + } + + priv, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + klog.ErrorS(err, "Failed to generate key for self-signed cert") + 
return tls.Certificate{}, err + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + if err != nil { + klog.ErrorS(err, "Failed to create self-signed certificate") + return tls.Certificate{}, err + } + + certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) + + privBytes, err := x509.MarshalPKCS8PrivateKey(priv) + if err != nil { + klog.ErrorS(err, "Failed to marshal private key for self-signed certificate") + return tls.Certificate{}, err + } + keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) + + return tls.X509KeyPair(certBytes, keyBytes) +} diff --git a/pkg/manifests/gateway/patch_policy.yaml b/pkg/manifests/gateway/patch_policy.yaml index 4a556b44..ae4fb6d8 100644 --- a/pkg/manifests/gateway/patch_policy.yaml +++ b/pkg/manifests/gateway/patch_policy.yaml @@ -35,6 +35,20 @@ spec: max_pending_requests: 40000 max_requests: 40000 + # This ensures that envoy accepts untrusted certificates. We tried to explicitly + # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work + # and what worked is setting the common_tls_context to empty. 
+ - type: "type.googleapis.com/envoy.config.cluster.v3.Cluster" + name: "envoyextensionpolicy/default/ext-proc-policy/extproc/0" + operation: + op: add + path: "/transport_socket" + value: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" + common_tls_context: {} + - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" name: default/inference-gateway/llm-gw operation: diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 13cddfdf..6424663b 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -478,6 +478,7 @@ func BeforeSuit() { // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" serverRunner.Datastore = backend.NewK8sDataStore() + serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(mgr); err != nil { logutil.Fatal(err, "Failed to setup server runner") diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml index 700eb24c..ffb8add7 100644 --- a/test/testdata/envoy.yaml +++ b/test/testdata/envoy.yaml @@ -169,6 +169,15 @@ data: max_pending_requests: 40000 max_requests: 40000 max_retries: 1024 + # This ensures that envoy accepts untrusted certificates. We tried to explicitly + # set TrustChainVerification to ACCEPT_UNSTRUSTED, but that actually didn't work + # and what worked is setting the common_tls_context to empty. 
+ transport_socket: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + common_tls_context: + validation_context: typed_extension_protocol_options: envoy.extensions.upstreams.http.v3.HttpProtocolOptions: "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions @@ -219,7 +228,7 @@ spec: - "--service-node" - "$(ENVOY_POD_NAME)" - "--log-level" - - "debug" + - "trace" - "--cpuset-threads" - "--drain-strategy" - "immediate" From 88c20f186dc9fc1eb1650592404064c7d689df46 Mon Sep 17 00:00:00 2001 From: Kunjan Date: Fri, 14 Feb 2025 13:22:21 -0800 Subject: [PATCH 23/96] Lora syncer docs (#320) * Integrate dynamic-lora-sidecar into main guide and add makefile, cloudbuild to build and publish lora-syncer image Signed-off-by: Kunjan * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan * Update site-src/guides/dynamic-lora.md Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> * Update site-src/guides/dynamic-lora.md Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan * Adds image-load and kind-load Make targets (#288) Signed-off-by: Daneyon Hansen * Add makefile and cloudbuild file to build and push lora-syncer Signed-off-by: Kunjan * Add build targets for lora syncer Signed-off-by: Kunjan * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review * Apply suggestions from code review --------- Signed-off-by: Kunjan Signed-off-by: Daneyon Hansen Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Co-authored-by: Daneyon Hansen --- Makefile | 32 ++++ cloudbuild.yaml | 8 + .../vllm/deployment-with-syncer.yaml | 145 
++++++++++++++++++ pkg/manifests/vllm/deployment.yaml | 37 +---- site-src/guides/dynamic-lora.md | 93 +++++++++++ site-src/guides/index.md | 4 + 6 files changed, 284 insertions(+), 35 deletions(-) create mode 100644 pkg/manifests/vllm/deployment-with-syncer.yaml create mode 100644 site-src/guides/dynamic-lora.md diff --git a/Makefile b/Makefile index b7654ed7..1d8fc531 100644 --- a/Makefile +++ b/Makefile @@ -26,11 +26,16 @@ PLATFORMS ?= linux/amd64 DOCKER_BUILDX_CMD ?= docker buildx IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build IMAGE_BUILD_EXTRA_OPTS ?= +SYNCER_IMAGE_BUILD_EXTRA_OPTS ?= IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension IMAGE_NAME := epp IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME) IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG) +SYNCER_IMAGE_NAME := lora-syncer +SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME) +SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG) + BASE_IMAGE ?= gcr.io/distroless/base-debian10 BUILDER_IMAGE ?= golang:1.23-alpine ifdef GO_VERSION @@ -39,9 +44,11 @@ endif ifdef EXTRA_TAG IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG) +SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG) endif ifdef IMAGE_EXTRA_TAG IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG) +SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG) endif # The name of the kind cluster to use for the "kind-load" target. @@ -171,6 +178,31 @@ image-load: image-build image-kind: image-build ## Build the EPP image and load it to kind cluster $KIND_CLUSTER ("kind" by default). 
kind load docker-image $(IMAGE_TAG) --name $(KIND_CLUSTER) +##@ Lora Syncer + +.PHONY: syncer-image-local-build +syncer-image-local-build: + BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use) + $(MAKE) image-build PUSH=$(PUSH) + $(DOCKER_BUILDX_CMD) rm $$BUILDER + +.PHONY: syncer-image-local-push +syncer-image-local-push: PUSH=--push +syncer-image-local-push: syncer-image-local-build + +.PHONY: syncer-image-build +syncer-image-build: + $ cd $(CURDIR)/tools/dynamic-lora-sidecar && $(IMAGE_BUILD_CMD) -t $(SYNCER_IMAGE_TAG) \ + --platform=$(PLATFORMS) \ + --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ + $(PUSH) \ + $(SYNCER_IMAGE_BUILD_EXTRA_OPTS) ./ + +.PHONY: syncer-image-push +syncer-image-push: PUSH=--push +syncer-image-push: syncer-image-build + ##@ Docs .PHONY: build-docs diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 2da147f4..40e45923 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,6 +12,14 @@ steps: - GIT_TAG=$_GIT_TAG - EXTRA_TAG=$_PULL_BASE_REF - DOCKER_BUILDX_CMD=/buildx-entrypoint + - name: lora-adapter-syncer + entrypoint: make + args: + - syncer-image-push + env: + - GIT_TAG=$_GIT_TAG + - EXTRA_TAG=$_PULL_BASE_REF + - DOCKER_BUILDX_CMD=/buildx-entrypoint substitutions: # _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and # can be used as a substitution diff --git a/pkg/manifests/vllm/deployment-with-syncer.yaml b/pkg/manifests/vllm/deployment-with-syncer.yaml new file mode 100644 index 00000000..d6110f4b --- /dev/null +++ b/pkg/manifests/vllm/deployment-with-syncer.yaml @@ -0,0 +1,145 @@ +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama2-7b-pool +spec: + selector: + app: vllm-llama2-7b-pool + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama2-7b-pool +spec: + replicas: 3 + selector: + matchLabels: + app: vllm-llama2-7b-pool + template: + metadata: + 
labels: + app: vllm-llama2-7b-pool + spec: + containers: + - name: lora + image: "vllm/vllm-openai:latest" + imagePullPolicy: Always + command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "meta-llama/Llama-2-7b-hf" + - "--tensor-parallel-size" + - "1" + - "--port" + - "8000" + - "--enable-lora" + - "--max-loras" + - "4" + - "--max-cpu-loras" + - "12" + - "--lora-modules" + - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + ports: + - containerPort: 8000 + name: http + protocol: TCP + livenessProbe: + failureThreshold: 240 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 600 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + limits: + nvidia.com/gpu: 1 + requests: + nvidia.com/gpu: 1 + volumeMounts: + - mountPath: /data + name: data + - mountPath: /dev/shm + name: shm + - name: adapters + mountPath: "/adapters" + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: # DO NOT USE subPath + - name: config-volume + mountPath: /config + restartPolicy: Always + schedulerName: default-scheduler + terminationGracePeriodSeconds: 30 + volumes: + - name: data + emptyDir: {} + - name: shm + 
emptyDir: + medium: Memory + - name: adapters + emptyDir: {} + - name: config-volume + configMap: + name: dynamic-lora-config + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: dynamic-lora-config +data: + configmap.yaml: | + vLLMLoRAConfig: + name: sql-loras-llama + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-0 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + ensureNotExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm \ No newline at end of file diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml index 4af0891d..1d115f4d 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -43,18 +43,8 @@ spec: - "--max-cpu-loras" - "12" - "--lora-modules" - - "sql-lora=/adapters/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/" - - "tweet-summary=/adapters/hub/models--vineetsharma--qlora-adapter-Llama-2-7b-hf-TweetSumm/snapshots/796337d8e866318c59e38f16416e3ecd11fe5403" - - 'sql-lora-0=/adapters/yard1/llama-2-7b-sql-lora-test_0' - - 'sql-lora-1=/adapters/yard1/llama-2-7b-sql-lora-test_1' - - 'sql-lora-2=/adapters/yard1/llama-2-7b-sql-lora-test_2' - - 'sql-lora-3=/adapters/yard1/llama-2-7b-sql-lora-test_3' - - 'sql-lora-4=/adapters/yard1/llama-2-7b-sql-lora-test_4' - - 'tweet-summary-0=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_0' - - 'tweet-summary-1=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_1' - - 'tweet-summary-2=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_2' - - 'tweet-summary-3=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_3' - - 
'tweet-summary-4=/adapters/vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm_4' + - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' + - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' env: - name: PORT value: "8000" @@ -99,29 +89,6 @@ spec: name: shm - name: adapters mountPath: "/adapters" - initContainers: - - name: adapter-loader - image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo - command: ["python"] - args: - - ./pull_adapters.py - - --adapter - - yard1/llama-2-7b-sql-lora-test - - --adapter - - vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - - --duplicate-count - - "5" - env: - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token - key: token - - name: HF_HOME - value: /adapters - volumeMounts: - - name: adapters - mountPath: "/adapters" restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 diff --git a/site-src/guides/dynamic-lora.md b/site-src/guides/dynamic-lora.md new file mode 100644 index 00000000..ef3c2b0f --- /dev/null +++ b/site-src/guides/dynamic-lora.md @@ -0,0 +1,93 @@ +# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm + +The goal of this guide is to get a single InferencePool running with vLLM and demonstrate use of dynamic lora updating! + +### Requirements + - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher + - A cluster with: + - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, + you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). + - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. + +### Steps + +1. 
**Deploy Sample VLLM Model Server with dynamic lora update enabled and dynamic lora syncer sidecar** + [Redeploy the vLLM deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/deployment-with-syncer.yaml) + +The rest of the steps are the same as the [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md) + + +### Safely rollout v2 adapter + +1. Update the LoRA syncer ConfigMap to make the new adapter version available on the model servers. + +```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: dynamic-lora-config + data: + configmap.yaml: | + vLLMLoRAConfig: + name: sql-loras-llama + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-0 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm +2. Configure a canary rollout with traffic split using LLMService. In this example, 40% of traffic for tweet-summary model will be sent to the ***tweet-summary-2*** adapter. + +```yaml +model: + name: tweet-summary + targetModels: + targetModelName: tweet-summary-0 + weight: 20 + targetModelName: tweet-summary-1 + weight: 40 + targetModelName: tweet-summary-2 + weight: 40 + +``` + +3. Finish rollout by setting the traffic to the new version 100%. +```yaml +model: + name: tweet-summary + targetModels: + targetModelName: tweet-summary-2 + weight: 100 +``` + +4. Remove v1 from dynamic lora configmap. 
+```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: dynamic-lora-config + data: + configmap.yaml: | + vLLMLoRAConfig: + name: sql-loras-llama + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + ensureNotExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: gs://[HUGGING FACE PATH] + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-0 + source: gs://[HUGGING FACE PATH] +``` diff --git a/site-src/guides/index.md b/site-src/guides/index.md index e4cbec6f..2cc971c6 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -19,6 +19,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml ``` + + + + 1. 
**Install the Inference Extension CRDs:** ```sh From 918b96f8463273c7562a2ce80b156f7ebc3e5454 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Sat, 15 Feb 2025 00:10:20 +0000 Subject: [PATCH 24/96] Fix cloudbuild rule for the LoRA syncer image (#339) --- cloudbuild.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 40e45923..9b345c18 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -12,7 +12,7 @@ steps: - GIT_TAG=$_GIT_TAG - EXTRA_TAG=$_PULL_BASE_REF - DOCKER_BUILDX_CMD=/buildx-entrypoint - - name: lora-adapter-syncer + - name: gcr.io/k8s-testimages/gcb-docker-gcloud:v20220830-45cbff55bc entrypoint: make args: - syncer-image-push From 5114a5523a730a5b2003c2e9ca506762c4eaf4d6 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Sat, 15 Feb 2025 12:30:21 -0500 Subject: [PATCH 25/96] fix: Corrects release branch naming (#333) Signed-off-by: Daneyon Hansen --- .github/ISSUE_TEMPLATE/new-release.md | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md index ceca9f5f..be569844 100644 --- a/.github/ISSUE_TEMPLATE/new-release.md +++ b/.github/ISSUE_TEMPLATE/new-release.md @@ -34,7 +34,7 @@ This document defines the process for releasing Gateway API Inference Extension. export RC=1 ``` -4. The vLLM image tag defaults to `0.7.2` for a release. Optionally, change the vLLM image tag. For example: +4. The vLLM image tag defaults to `v0.7.2` for a release. Set the `VLLM` environment variable if a newer [tag][vllm-tag] has been published. For example: ```shell export VLLM=0.7.3 @@ -45,16 +45,25 @@ This document defines the process for releasing Gateway API Inference Extension. 1. If needed, clone the Gateway API Inference Extension [repo][repo]. 
```shell - git clone https://github.com/kubernetes-sigs/gateway-api-inference-extension.git -b main + git clone -o ${REMOTE} https://github.com/kubernetes-sigs/gateway-api-inference-extension.git ``` 2. If you already have the repo cloned, ensure it’s up-to-date and your local branch is clean. -3. Create a new release branch from the `main` branch. The release branch should be named `release-v${MAJOR}.${MINOR}`, e.g. `release-v0.1`. +3. Release Branch Handling: + - For a Release Candidate: + Create a new release branch from the `main` branch. The branch should be named `release-${MAJOR}.${MINOR}`, for example, `release-0.1`: - ```shell - git checkout -b release-v${MAJOR}.${MINOR} - ``` + ```shell + git checkout -b release-${MAJOR}.${MINOR} + ``` + + - For a Major or Minor Release: + A release branch should already exist. In this case, check out the existing branch: + + ```shell + git checkout -b release-${MAJOR}.${MINOR} ${REMOTE}/release-${MAJOR}.${MINOR} + ``` 4. Update release-specific content, generate release artifacts, and stage the changes. @@ -79,7 +88,7 @@ This document defines the process for releasing Gateway API Inference Extension. 6. Push your release branch to the Gateway API Inference Extension remote. ```shell - git push ${REMOTE} release-v${MAJOR}.${MINOR} + git push ${REMOTE} release-${MAJOR}.${MINOR} ``` 7. Tag the head of your release branch with the number. @@ -149,3 +158,4 @@ Use the following steps to announce the release. 
[k8s.io]: https://github.com/kubernetes/k8s.io [yaml]: https://github.com/kubernetes/k8s.io/blob/main/registry.k8s.io/images/k8s-staging-gateway-api-inference-extension/images.yaml [issue]: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/new/choose +[vllm-tag]: https://hub.docker.com/r/vllm/vllm-openai/tags From 6b42ab8e932da14840ee3a46a5a73a856a0938cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Sat, 15 Feb 2025 20:06:21 +0100 Subject: [PATCH 26/96] Use contextual logging (#337) * Use contextual logging All possible direct klog calls are removed. Instead logr.Logger is loaded from the context or passed around as an argument. * Fix log levels * server: Handle context canceled * pod_reconciler: Use TRACE log level Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> * pod_reconciler: Don't log pod not found err --------- Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> --- docs/dev.md | 80 ++++++++++------- go.mod | 2 +- pkg/ext-proc/backend/datastore.go | 10 +-- pkg/ext-proc/backend/datastore_test.go | 4 +- pkg/ext-proc/backend/fake.go | 4 +- .../backend/inferencemodel_reconciler.go | 26 +++--- .../backend/inferencemodel_reconciler_test.go | 5 +- .../backend/inferencepool_reconciler.go | 18 ++-- .../backend/inferencepool_reconciler_test.go | 5 +- pkg/ext-proc/backend/pod_reconciler.go | 9 +- pkg/ext-proc/backend/provider.go | 34 ++++---- pkg/ext-proc/backend/provider_test.go | 5 +- pkg/ext-proc/backend/vllm/metrics.go | 33 ++++--- pkg/ext-proc/backend/vllm/metrics_test.go | 5 +- pkg/ext-proc/handlers/request.go | 43 ++++++---- pkg/ext-proc/handlers/response.go | 26 ++++-- pkg/ext-proc/handlers/response_test.go | 7 +- pkg/ext-proc/handlers/server.go | 41 +++++---- pkg/ext-proc/health.go | 7 +- pkg/ext-proc/main.go | 49 ++++++----- pkg/ext-proc/metrics/metrics.go | 7 +- pkg/ext-proc/metrics/metrics_test.go | 86 ++++++++++--------- pkg/ext-proc/scheduling/filter.go | 
27 +++--- pkg/ext-proc/scheduling/filter_test.go | 12 ++- pkg/ext-proc/scheduling/scheduler.go | 17 ++-- pkg/ext-proc/server/runserver.go | 21 ++--- pkg/ext-proc/server/runserver_test.go | 3 +- pkg/ext-proc/test/benchmark/benchmark.go | 35 +++++--- pkg/ext-proc/test/utils.go | 28 +++--- pkg/ext-proc/util/logging/fatal.go | 14 +-- pkg/ext-proc/util/logging/logger.go | 20 +++++ test/integration/hermetic_test.go | 54 ++++++------ 32 files changed, 436 insertions(+), 301 deletions(-) create mode 100644 pkg/ext-proc/util/logging/logger.go diff --git a/docs/dev.md b/docs/dev.md index efd2023a..2af39668 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -1,27 +1,33 @@ - ## Logging +We use `logr.Logger` interface for logging everywhere. +The logger instance is loaded from `context.Context` or passed around as an argument directly. +This is aligned with contextual logging as explained in [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md). + +In other words, we explicitly don't use `klog` global logging calls. +Using `klog` log value helpers like `klog.KObj` is just fine. + ### Change log verbosity -We use the `k8s.io/klog/v2` package to manage logging. We generally follow the [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md), which states "the practical default level is V(2). Developers and QE environments may wish to run at V(3) or V(4)". -To configure logging verbosity, specify the `v` flag such as `--v=2`. +To configure logging verbosity, specify the `v` flag such as `--v=2`. 
### Add logs The [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md) has the following definitions: -* `klog.V(0).InfoS` = `klog.InfoS` - Generally useful for this to **always** be visible to a cluster operator -* `klog.V(1).InfoS` - A reasonable default log level if you don't want verbosity. -* `klog.V(2).InfoS` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems. -* `klog.V(3).InfoS` - Extended information about changes -* `klog.V(4).InfoS` - Debug level verbosity -* `klog.V(5).InfoS` - Trace level verbosity +- `logger.V(0).Info` = `logger.Info` - Generally useful for this to **always** be visible to a cluster operator +- `logger.V(1).Info` - A reasonable default log level if you don't want verbosity. +- `logger.V(2).Info` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems. +- `logger.V(3).Info` - Extended information about changes +- `logger.V(4).Info` - Debug level verbosity +- `logger.V(5).Info` - Trace level verbosity We choose to simplify to the following 3 common levels. + ``` const( DEFAULT=2 @@ -33,34 +39,46 @@ const( The guidelines are written in the context of a k8s controller. Our [ext-proc](../pkg/ext-proc/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows: -1. The server startup process and configuration. - * `klog.InfoS` Logging at the `V(0)` verbosity level is generally welcome here as this is only logged once at startup, and provides useful info for debugging. +1. The server startup process and configuration. 
+ + - `logger.Info` Logging at the `V(0)` verbosity level is generally welcome here as this is only logged once at startup, and provides useful info for debugging. 2. Reconciler loops. The reconciler loops watch for CR changes such as the `InferenceModel` CR. And given changes in these CRs significantly affect the behavior of the extension, we recommend using v=1 verbosity level as default, and sparsely use higher verbosity levels. - - * `klog.V(DEFAULT).InfoS` - * Default log level in the reconcilers. - * Information about config (listening on X, watching Y) - * Errors that repeat frequently that relate to conditions that can be corrected (e.g., inference model not initialized yet) - * System state changing (adding/removing objects in the data store) - * `V(VERBOSE)` and above: Use your best judgement. + + - `logger.V(DEFAULT)` + - Default log level in the reconcilers. + - Information about config (listening on X, watching Y) + - Errors that repeat frequently that relate to conditions that can be corrected (e.g., inference model not initialized yet) + - System state changing (adding/removing objects in the data store) + - `logger.V(VERBOSE)` and above: Use your best judgement. 3. Inference request handling. These requests are expected to be much higher volume than the control flow in the reconcilers and therefore we should be mindful of log spamming. We recommend using v=2 to log important info about a request, such as the HTTP response code, and higher verbosity levels for less important info. - * `klog.V(DEFAULT).InfoS` - * Logging the status code of an HTTP request - * Important decision making such as picking the target model, target pod - * `klog.V(VERBOSE).InfoS` - * Detailed request scheduling algorithm operations, such as running the filtering logic - * `V(DEBUG)` and above: Use your best judgement. 
+ - `logger.V(DEFAULT)` + - Logging the status code of an HTTP request + - Important decision making such as picking the target model, target pod + - `logger.V(VERBOSE)` + - Detailed request scheduling algorithm operations, such as running the filtering logic + - `logger.V(DEBUG)` and above: Use your best judgement. 4. Metric scraping loops. These loops run at a very high frequency, and logs can be very spammy if not handled properly. - * `klog.V(TRACE).InfoS` - * Transient errors/warnings, such as failure to get response from a pod. - * Important state changes, such as updating a metric. -5. Misc + - `logger.V(TRACE)` + - Transient errors/warnings, such as failure to get response from a pod. + - Important state changes, such as updating a metric. + +5. Misc 1. Periodic (every 5s) debug loop which prints the current pods and metrics. - * `klog.WarningS` If the metrics are not fresh enough, which indicates an error occurred during the metric scraping loop. - * `klog.V(DEBUG).InfoS` - * This is very important to debug the request scheduling algorithm, and yet not spammy compared to the metric scraping loop logs. \ No newline at end of file + - `logger.V(DEFAULT).Error` If the metrics are not fresh enough, which indicates an error occurred during the metric scraping loop. + - `logger.V(DEBUG)` + - This is very important to debug the request scheduling algorithm, and yet not spammy compared to the metric scraping loop logs. + +### Passing Logger Around + +You can pass around a `context.Context` that contains a logger or a `logr.Logger` instance directly. +You need to make the call which one to use. Passing a `context.Context` is more standard, +on the other hand you then need to call `log.FromContext` everywhere. + +As `logger.V` calls are cumulative, i.e. `logger.V(2).V(3)` results in `logger.V(5)`, +a logger should be passed around with no verbosity level set so that `logger.V(DEFAULT)` +actually uses `DEFAULT` verbosity level. 
diff --git a/go.mod b/go.mod index d8b143ec..a59a28cc 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/bojand/ghz v0.120.0 github.com/elastic/crd-ref-docs v0.1.0 github.com/envoyproxy/go-control-plane/envoy v1.32.4 + github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.6.0 github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.2 @@ -61,7 +62,6 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index a54833bc..a75e7e43 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -7,10 +7,11 @@ import ( "strconv" "sync" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" - "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -87,7 +88,7 @@ func (ds *K8sDatastore) HasSynced() bool { return ds.inferencePool != nil } -func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string { +func RandomWeightedDraw(logger logr.Logger, model *v1alpha1.InferenceModel, seed int64) string { var weights int32 source := rand.NewSource(rand.Int63()) @@ -98,7 +99,7 @@ func RandomWeightedDraw(model *v1alpha1.InferenceModel, seed int64) string { for _, model := range model.Spec.TargetModels { weights += *model.Weight } - klog.V(logutil.VERBOSE).InfoS("Weights for model computed", "model", model.Name, "weights", weights) + logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", 
weights) randomVal := r.Int31n(weights) for _, model := range model.Spec.TargetModels { if randomVal < *model.Weight { @@ -128,7 +129,7 @@ func (ds *K8sDatastore) flushPodsAndRefetch(ctx context.Context, ctrlClient clie LabelSelector: selectorFromInferencePoolSelector(newServerPool.Spec.Selector), Namespace: newServerPool.Namespace, }); err != nil { - klog.Error(err, "error listing clients") + log.FromContext(ctx).V(logutil.DEFAULT).Error(err, "Failed to list clients") } ds.pods.Clear() @@ -139,7 +140,6 @@ func (ds *K8sDatastore) flushPodsAndRefetch(ctx context.Context, ctrlClient clie } ds.pods.Store(pod, true) } - } func selectorFromInferencePoolSelector(selector map[v1alpha1.LabelKey]v1alpha1.LabelValue) labels.Selector { diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go index 0fc5da1a..9f74226a 100644 --- a/pkg/ext-proc/backend/datastore_test.go +++ b/pkg/ext-proc/backend/datastore_test.go @@ -5,6 +5,7 @@ import ( v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func TestHasSynced(t *testing.T) { @@ -46,6 +47,7 @@ func TestHasSynced(t *testing.T) { } func TestRandomWeightedDraw(t *testing.T) { + logger := logutil.NewTestLogger() tests := []struct { name string model *v1alpha1.InferenceModel @@ -118,7 +120,7 @@ func TestRandomWeightedDraw(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { for range 10000 { - model := RandomWeightedDraw(test.model, seedVal) + model := RandomWeightedDraw(logger, test.model, seedVal) if model != test.want { t.Errorf("Model returned!: %v", model) break diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 7ab8a464..2c0757db 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -3,7 +3,7 @@ package backend import ( "context" - klog "k8s.io/klog/v2" + 
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -17,7 +17,7 @@ func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existi if err, ok := f.Err[pod]; ok { return nil, err } - klog.V(logutil.VERBOSE).InfoS("Fetching metrics for pod", "pod", pod, "existing", existing, "new", f.Res[pod]) + log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "pod", pod, "existing", existing, "new", f.Res[pod]) return f.Res[pod], nil } diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 72ea063e..4959845c 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -3,13 +3,14 @@ package backend import ( "context" + "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" - "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -27,38 +28,39 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, nil } - klogV := klog.V(logutil.DEFAULT) - klogV.InfoS("Reconciling InferenceModel", "name", req.NamespacedName) + logger := log.FromContext(ctx) + loggerDefault := logger.V(logutil.DEFAULT) + loggerDefault.Info("Reconciling InferenceModel", "name", req.NamespacedName) infModel := &v1alpha1.InferenceModel{} if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { if errors.IsNotFound(err) { - klogV.InfoS("InferenceModel not found. 
Removing from datastore since object must be deleted", "name", req.NamespacedName) + loggerDefault.Info("InferenceModel not found. Removing from datastore since object must be deleted", "name", req.NamespacedName) c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) return ctrl.Result{}, nil } - klogV.ErrorS(err, "Unable to get InferenceModel", "name", req.NamespacedName) + loggerDefault.Error(err, "Unable to get InferenceModel", "name", req.NamespacedName) return ctrl.Result{}, err } else if !infModel.DeletionTimestamp.IsZero() { - klogV.InfoS("InferenceModel is marked for deletion. Removing from datastore", "name", req.NamespacedName) + loggerDefault.Info("InferenceModel is marked for deletion. Removing from datastore", "name", req.NamespacedName) c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) return ctrl.Result{}, nil } - c.updateDatastore(infModel) + c.updateDatastore(logger, infModel) return ctrl.Result{}, nil } -func (c *InferenceModelReconciler) updateDatastore(infModel *v1alpha1.InferenceModel) { - klogV := klog.V(logutil.DEFAULT) +func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel *v1alpha1.InferenceModel) { + loggerDefault := logger.V(logutil.DEFAULT) if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { - klogV.InfoS("Updating datastore", "poolRef", infModel.Spec.PoolRef, "serverPoolName", c.PoolNamespacedName) - klogV.InfoS("Adding/Updating InferenceModel", "modelName", infModel.Spec.ModelName) + loggerDefault.Info("Updating datastore", "poolRef", infModel.Spec.PoolRef, "serverPoolName", c.PoolNamespacedName) + loggerDefault.Info("Adding/Updating InferenceModel", "modelName", infModel.Spec.ModelName) c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) return } - klogV.InfoS("Removing/Not adding InferenceModel", "modelName", infModel.Spec.ModelName) + loggerDefault.Info("Removing/Not adding InferenceModel", "modelName", infModel.Spec.ModelName) // If we get here. 
The model is not relevant to this pool, remove. c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) } diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index c5ef8d14..4e195818 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -13,6 +13,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) var ( @@ -46,6 +47,8 @@ var ( ) func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { name string datastore *K8sDatastore @@ -135,7 +138,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { Datastore: test.datastore, PoolNamespacedName: types.NamespacedName{Name: test.datastore.inferencePool.Name}, } - reconciler.updateDatastore(test.incomingService) + reconciler.updateDatastore(logger, test.incomingService) if ok := mapsEqual(reconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok { t.Error("Maps are not equal") diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index 9504b4e0..e44a278a 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -4,12 +4,14 @@ import ( "context" "reflect" + "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -29,29 +31,31 @@ func 
(c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques if req.NamespacedName.Name != c.PoolNamespacedName.Name || req.NamespacedName.Namespace != c.PoolNamespacedName.Namespace { return ctrl.Result{}, nil } - klogV := klog.V(logutil.DEFAULT) - klogV.InfoS("Reconciling InferencePool", "name", req.NamespacedName) + + logger := log.FromContext(ctx) + loggerDefault := logger.V(logutil.DEFAULT) + loggerDefault.Info("Reconciling InferencePool", "name", req.NamespacedName) serverPool := &v1alpha1.InferencePool{} if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { - klogV.ErrorS(err, "Unable to get InferencePool", "name", req.NamespacedName) + loggerDefault.Error(err, "Unable to get InferencePool", "name", req.NamespacedName) return ctrl.Result{}, err } if c.Datastore.inferencePool == nil || !reflect.DeepEqual(serverPool.Spec.Selector, c.Datastore.inferencePool.Spec.Selector) { - c.updateDatastore(serverPool) + c.updateDatastore(logger, serverPool) c.Datastore.flushPodsAndRefetch(ctx, c.Client, serverPool) } else { - c.updateDatastore(serverPool) + c.updateDatastore(logger, serverPool) } return ctrl.Result{}, nil } -func (c *InferencePoolReconciler) updateDatastore(serverPool *v1alpha1.InferencePool) { +func (c *InferencePoolReconciler) updateDatastore(logger logr.Logger, serverPool *v1alpha1.InferencePool) { pool, _ := c.Datastore.getInferencePool() if pool == nil || serverPool.ObjectMeta.ResourceVersion != pool.ObjectMeta.ResourceVersion { - klog.V(logutil.DEFAULT).InfoS("Updating inference pool", "target", klog.KMetadata(&serverPool.ObjectMeta)) + logger.V(logutil.DEFAULT).Info("Updating inference pool", "target", klog.KMetadata(&serverPool.ObjectMeta)) c.Datastore.setInferencePool(serverPool) } } diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go b/pkg/ext-proc/backend/inferencepool_reconciler_test.go index f16524a5..1da7d61b 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler_test.go +++ 
b/pkg/ext-proc/backend/inferencepool_reconciler_test.go @@ -6,6 +6,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) var ( @@ -41,6 +42,8 @@ var ( ) func TestUpdateDatastore_InferencePoolReconciler(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { name string datastore *K8sDatastore @@ -74,7 +77,7 @@ func TestUpdateDatastore_InferencePoolReconciler(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { inferencePoolReconciler := &InferencePoolReconciler{Datastore: test.datastore} - inferencePoolReconciler.updateDatastore(test.incomingPool) + inferencePoolReconciler.updateDatastore(logger, test.incomingPool) gotPool := inferencePoolReconciler.Datastore.inferencePool if !reflect.DeepEqual(gotPool, test.wantPool) { diff --git a/pkg/ext-proc/backend/pod_reconciler.go b/pkg/ext-proc/backend/pod_reconciler.go index 60d014ce..b914ea8d 100644 --- a/pkg/ext-proc/backend/pod_reconciler.go +++ b/pkg/ext-proc/backend/pod_reconciler.go @@ -8,9 +8,9 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/record" - "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -23,24 +23,25 @@ type PodReconciler struct { } func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + logger := log.FromContext(ctx) inferencePool, err := c.Datastore.getInferencePool() if err != nil { - klog.V(logutil.DEFAULT).Infof("Skipping reconciling Pod because the InferencePool is not available yet: %v", err) + logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the 
InferencePool is not available yet", "error", err) // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. return ctrl.Result{}, nil } else if inferencePool.Namespace != req.Namespace { return ctrl.Result{}, nil } - klog.V(logutil.VERBOSE).Info("reconciling Pod", req.NamespacedName) + logger.V(logutil.VERBOSE).Info("Pod being reconciled", "name", req.NamespacedName) pod := &corev1.Pod{} if err := c.Get(ctx, req.NamespacedName, pod); err != nil { - klog.Error(err, ": unable to get pod") if apierrors.IsNotFound(err) { c.Datastore.pods.Delete(pod) return ctrl.Result{}, nil } + logger.V(logutil.DEFAULT).Error(err, "Unable to get pod", "name", req.NamespacedName) return ctrl.Result{}, err } diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index d64b80b3..ce738986 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -6,8 +6,8 @@ import ( "sync" "time" + "github.com/go-logr/logr" "go.uber.org/multierr" - klog "k8s.io/klog/v2" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -59,14 +59,14 @@ func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) { return nil, false } -func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { +func (p *Provider) Init(logger logr.Logger, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { p.refreshPodsOnce() - if err := p.refreshMetricsOnce(); err != nil { - klog.ErrorS(err, "Failed to init metrics") + if err := p.refreshMetricsOnce(logger); err != nil { + logger.Error(err, "Failed to init metrics") } - klog.InfoS("Initialized pods and metrics", "metrics", p.AllPodMetrics()) + logger.Info("Initialized pods and metrics", "metrics", p.AllPodMetrics()) // periodically refresh pods go func() 
{ @@ -80,8 +80,8 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshProm go func() { for { time.Sleep(refreshMetricsInterval) - if err := p.refreshMetricsOnce(); err != nil { - klog.V(logutil.TRACE).ErrorS(err, "Failed to refresh metrics") + if err := p.refreshMetricsOnce(logger); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics") } } }() @@ -90,16 +90,16 @@ func (p *Provider) Init(refreshPodsInterval, refreshMetricsInterval, refreshProm go func() { for { time.Sleep(refreshPrometheusMetricsInterval) - p.flushPrometheusMetricsOnce() + p.flushPrometheusMetricsOnce(logger) } }() // Periodically print out the pods and metrics for DEBUGGING. - if klogV := klog.V(logutil.DEBUG); klogV.Enabled() { + if logger := logger.V(logutil.DEBUG); logger.Enabled() { go func() { for { time.Sleep(5 * time.Second) - klogV.InfoS("Current Pods and metrics gathered", "metrics", p.AllPodMetrics()) + logger.Info("Current Pods and metrics gathered", "metrics", p.AllPodMetrics()) } }() } @@ -137,20 +137,20 @@ func (p *Provider) refreshPodsOnce() { p.datastore.pods.Range(addNewPods) } -func (p *Provider) refreshMetricsOnce() error { - klogV := klog.V(logutil.TRACE) +func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { + loggerTrace := logger.V(logutil.TRACE) ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) defer cancel() start := time.Now() defer func() { d := time.Since(start) // TODO: add a metric instead of logging - klogV.InfoS("Metrics refreshed", "duration", d) + loggerTrace.Info("Metrics refreshed", "duration", d) }() var wg sync.WaitGroup errCh := make(chan error) processOnePod := func(key, value any) bool { - klogV.InfoS("Pod and metric being processed", "pod", key, "metric", value) + loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value) pod := key.(Pod) existing := value.(*PodMetrics) wg.Add(1) @@ -162,7 +162,7 @@ func (p *Provider) refreshMetricsOnce() error 
{ return } p.UpdatePodMetrics(pod, updated) - klogV.InfoS("Updated metrics for pod", "pod", pod, "metrics", updated.Metrics) + loggerTrace.Info("Updated metrics for pod", "pod", pod, "metrics", updated.Metrics) }() return true } @@ -185,8 +185,8 @@ func (p *Provider) refreshMetricsOnce() error { return errs } -func (p *Provider) flushPrometheusMetricsOnce() { - klog.V(logutil.DEBUG).InfoS("Flushing Prometheus Metrics") +func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { + logger.V(logutil.DEBUG).Info("Flushing Prometheus Metrics") pool, _ := p.datastore.getInferencePool() if pool == nil { diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index ddd7f0d6..95575046 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -8,6 +8,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) var ( @@ -38,6 +39,8 @@ var ( ) func TestProvider(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { name string pmc PodMetricsClient @@ -90,7 +93,7 @@ func TestProvider(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { p := NewProvider(test.pmc, test.datastore) - err := p.Init(time.Millisecond, time.Millisecond, time.Millisecond) + err := p.Init(logger, time.Millisecond, time.Millisecond, time.Millisecond) if test.initErr != (err != nil) { t.Fatalf("Unexpected error, got: %v, want: %v", err, test.initErr) } diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 4c3804ce..4558a664 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -9,10 +9,11 @@ import ( "strings" "time" + "github.com/go-logr/logr" dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" - klog "k8s.io/klog/v2" + 
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -40,17 +41,20 @@ func (p *PodMetricsClientImpl) FetchMetrics( pod backend.Pod, existing *backend.PodMetrics, ) (*backend.PodMetrics, error) { + logger := log.FromContext(ctx) + loggerDefault := logger.V(logutil.DEFAULT) + // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. url := fmt.Sprintf("http://%s/metrics", pod.Address) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) + loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Failed to fetch metrics", "pod", pod) + loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod) return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err) } defer func() { @@ -58,7 +62,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( }() if resp.StatusCode != http.StatusOK { - klog.V(logutil.DEFAULT).ErrorS(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode) + loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode) return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode) } @@ -67,35 +71,36 @@ func (p *PodMetricsClientImpl) FetchMetrics( if err != nil { return nil, err } - return promToPodMetrics(metricFamilies, existing) + return promToPodMetrics(logger, metricFamilies, existing) } // promToPodMetrics updates internal pod metrics with scraped 
prometheus metrics. // A combined error is returned if errors occur in one or more metric processing. // it returns a new PodMetrics pointer which can be used to atomically update the pod metrics map. func promToPodMetrics( + logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, existing *backend.PodMetrics, ) (*backend.PodMetrics, error) { var errs error updated := existing.Clone() - runningQueueSize, err := getLatestMetric(metricFamilies, RunningQueueSizeMetricName) + runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName) errs = multierr.Append(errs, err) if err == nil { updated.RunningQueueSize = int(runningQueueSize.GetGauge().GetValue()) } - waitingQueueSize, err := getLatestMetric(metricFamilies, WaitingQueueSizeMetricName) + waitingQueueSize, err := getLatestMetric(logger, metricFamilies, WaitingQueueSizeMetricName) errs = multierr.Append(errs, err) if err == nil { updated.WaitingQueueSize = int(waitingQueueSize.GetGauge().GetValue()) } - cachePercent, err := getLatestMetric(metricFamilies, KVCacheUsagePercentMetricName) + cachePercent, err := getLatestMetric(logger, metricFamilies, KVCacheUsagePercentMetricName) errs = multierr.Append(errs, err) if err == nil { updated.KVCacheUsagePercent = cachePercent.GetGauge().GetValue() } - loraMetrics, _, err := getLatestLoraMetric(metricFamilies) + loraMetrics, _, err := getLatestLoraMetric(logger, metricFamilies) errs = multierr.Append(errs, err) /* TODO: uncomment once this is available in vllm. kvCap, _, err := getGaugeLatestValue(metricFamilies, KvCacheMaxTokenCapacityMetricName) @@ -135,10 +140,10 @@ func promToPodMetrics( // reason its specially fetched is because each label key value pair permutation generates new series // and only most recent is useful. The value of each series is the creation timestamp so we can // retrieve the latest by sorting the value. 
-func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { +func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] if !ok { - klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", LoraRequestInfoMetricName) + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } var latestTs float64 @@ -154,10 +159,10 @@ func getLatestLoraMetric(metricFamilies map[string]*dto.MetricFamily) (*dto.Metr // getLatestMetric gets the latest metric of a family. This should be used to get the latest Gauge metric. // Since vllm doesn't set the timestamp in metric, this metric essentially gets the first metric. -func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { +func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, metricName string) (*dto.Metric, error) { mf, ok := metricFamilies[metricName] if !ok { - klog.V(logutil.DEFAULT).ErrorS(nil, "Metric family not found", "name", metricName) + logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName) return nil, fmt.Errorf("metric family %q not found", metricName) } if len(mf.GetMetric()) == 0 { @@ -171,6 +176,6 @@ func getLatestMetric(metricFamilies map[string]*dto.MetricFamily, metricName str latest = m } } - klog.V(logutil.TRACE).InfoS("Metric value selected", "value", latest, "metric", metricName) + logger.V(logutil.TRACE).Info("Metric value selected", "value", latest, "metric", metricName) return latest, nil } diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go index 3d4225e8..0a718cd7 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ 
b/pkg/ext-proc/backend/vllm/metrics_test.go @@ -8,9 +8,12 @@ import ( "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func TestPromToPodMetrics(t *testing.T) { + logger := logutil.NewTestLogger() + testCases := []struct { name string metricFamilies map[string]*dto.MetricFamily @@ -219,7 +222,7 @@ func TestPromToPodMetrics(t *testing.T) { } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - updated, err := promToPodMetrics(tc.metricFamilies, tc.initialPodMetrics) + updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics) if tc.expectedErr != nil { assert.Error(t, err) } else { diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index a36f7ae3..8ce2956f 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -1,6 +1,7 @@ package handlers import ( + "context" "encoding/json" "errors" "fmt" @@ -9,7 +10,7 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" @@ -18,25 +19,30 @@ import ( // HandleRequestBody handles body of the request to the backend server, such as parsing the "model" // parameter. // Envoy sends the request body to ext proc before sending the request to the backend server. 
-func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klogV := klog.V(logutil.VERBOSE) - klogV.InfoS("Handling request body") +func (s *Server) HandleRequestBody( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) (*extProcPb.ProcessingResponse, error) { + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Handling request body") // Unmarshal request body (must be JSON). v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) var rb map[string]interface{} if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Error unmarshaling request body") + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") return nil, fmt.Errorf("error unmarshaling request body: %v", err) } - klogV.InfoS("Request body unmarshalled", "body", rb) + loggerVerbose.Info("Request body unmarshalled", "body", rb) // Resolve target models. model, ok := rb["model"].(string) if !ok { return nil, errors.New("model not found in request") } - klogV.InfoS("Model requested", "model", model) + loggerVerbose.Info("Model requested", "model", model) modelName := model // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. 
@@ -47,7 +53,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces return nil, fmt.Errorf("error finding a model object in InferenceModel for input %v", model) } if len(modelObj.Spec.TargetModels) > 0 { - modelName = backend.RandomWeightedDraw(modelObj, 0) + modelName = backend.RandomWeightedDraw(logger, modelObj, 0) if modelName == "" { return nil, fmt.Errorf("error getting target model name for model %v", modelObj.Name) } @@ -57,7 +63,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces ResolvedTargetModel: modelName, Critical: backend.IsCritical(modelObj), } - klogV.InfoS("LLM request assembled", "request", llmReq) + loggerVerbose.Info("LLM request assembled", "request", llmReq) requestBody := v.RequestBody.Body var err error @@ -66,17 +72,18 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces rb["model"] = llmReq.ResolvedTargetModel requestBody, err = json.Marshal(rb) if err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Error marshaling request body") + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") return nil, fmt.Errorf("error marshaling request body: %v", err) } - klogV.InfoS("Updated request body marshalled", "body", string(requestBody)) + loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) } - targetPod, err := s.scheduler.Schedule(llmReq) + targetPod, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { return nil, fmt.Errorf("failed to find target pod: %w", err) } - klogV.InfoS("Target model and pod selected", "model", llmReq.ResolvedTargetModel, "pod", targetPod) + logger.V(logutil.DEFAULT).Info("Request handled", + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) reqCtx.Model = llmReq.Model reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel @@ -102,7 +109,7 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces } // Print 
headers for debugging for _, header := range headers { - klog.V(logutil.DEBUG).InfoS("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) + logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) } resp := &extProcPb.ProcessingResponse{ @@ -136,10 +143,14 @@ func (s *Server) HandleRequestBody(reqCtx *RequestContext, req *extProcPb.Proces return resp, nil } -func HandleRequestHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) *extProcPb.ProcessingResponse { +func HandleRequestHeaders( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) *extProcPb.ProcessingResponse { r := req.Request h := r.(*extProcPb.ProcessingRequest_RequestHeaders) - klog.V(logutil.VERBOSE).InfoS("Handling request headers", "headers", h) + log.FromContext(ctx).V(logutil.VERBOSE).Info("Handling request headers", "headers", h) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_RequestHeaders{ diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index 012b0b8d..06da8106 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -1,20 +1,26 @@ package handlers import ( + "context" "encoding/json" "fmt" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) // HandleResponseHeaders processes response headers from the backend model server. 
-func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).InfoS("Processing ResponseHeaders") +func (s *Server) HandleResponseHeaders( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) (*extProcPb.ProcessingResponse, error) { + loggerVerbose := log.FromContext(ctx).V(logutil.VERBOSE) + loggerVerbose.Info("Processing ResponseHeaders") h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders) - klog.V(logutil.VERBOSE).InfoS("Headers before", "headers", h) + loggerVerbose.Info("Headers before", "headers", h) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseHeaders{ @@ -65,8 +71,14 @@ func (s *Server) HandleResponseHeaders(reqCtx *RequestContext, req *extProcPb.Pr "completion_tokens": 100 } }*/ -func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - klog.V(logutil.VERBOSE).InfoS("Processing HandleResponseBody") +func (s *Server) HandleResponseBody( + ctx context.Context, + reqCtx *RequestContext, + req *extProcPb.ProcessingRequest, +) (*extProcPb.ProcessingResponse, error) { + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing HandleResponseBody") body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody) res := Response{} @@ -81,7 +93,7 @@ func (s *Server) HandleResponseBody(reqCtx *RequestContext, req *extProcPb.Proce // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) // will add the processing for streaming case. 
reqCtx.ResponseComplete = true - klog.V(logutil.VERBOSE).InfoS("Response generated", "response", res) + loggerVerbose.Info("Response generated", "response", res) resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseBody{ diff --git a/pkg/ext-proc/handlers/response_test.go b/pkg/ext-proc/handlers/response_test.go index df338066..67875e05 100644 --- a/pkg/ext-proc/handlers/response_test.go +++ b/pkg/ext-proc/handlers/response_test.go @@ -1,10 +1,12 @@ package handlers import ( + "context" "testing" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/google/go-cmp/cmp" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( @@ -34,6 +36,8 @@ const ( ) func TestHandleResponseBody(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + tests := []struct { name string req *extProcPb.ProcessingRequest_ResponseBody @@ -70,8 +74,7 @@ func TestHandleResponseBody(t *testing.T) { t.Run(test.name, func(t *testing.T) { server := &Server{} reqCtx := &RequestContext{} - _, err := server.HandleResponseBody(reqCtx, &extProcPb.ProcessingRequest{Request: test.req}) - + _, err := server.HandleResponseBody(ctx, reqCtx, &extProcPb.ProcessingRequest{Request: test.req}) if err != nil { if !test.wantErr { t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr) diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index a3cfcada..6be747da 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -1,6 +1,8 @@ package handlers import ( + "context" + "errors" "io" "time" @@ -8,7 +10,7 @@ import ( envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" @@ -37,7 +39,7 @@ type Server struct { } type Scheduler interface { - Schedule(b *scheduling.LLMRequest) (targetPod backend.Pod, err error) + Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backend.Pod, err error) } // PodProvider is an interface to provide set of pods in the backend and information such as metrics. @@ -51,8 +53,11 @@ type ModelDataStore interface { } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { - klog.V(logutil.VERBOSE).InfoS("Processing") ctx := srv.Context() + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing") + // Create request context to share states during life time of an HTTP request. // See https://github.com/envoyproxy/envoy/issues/17540. reqCtx := &RequestContext{} @@ -65,13 +70,13 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { } req, err := srv.Recv() - if err == io.EOF { + if err == io.EOF || errors.Is(err, context.Canceled) { return nil } if err != nil { // This error occurs very frequently, though it doesn't seem to have any impact. // TODO Figure out if we can remove this noise. 
- klog.V(logutil.VERBOSE).ErrorS(err, "Cannot receive stream request") + loggerVerbose.Error(err, "Cannot receive stream request") return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) } @@ -79,34 +84,34 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { switch v := req.Request.(type) { case *extProcPb.ProcessingRequest_RequestHeaders: reqCtx.RequestReceivedTimestamp = time.Now() - resp = HandleRequestHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).InfoS("Request context after HandleRequestHeaders", "context", reqCtx) + resp = HandleRequestHeaders(ctx, reqCtx, req) + loggerVerbose.Info("Request context after HandleRequestHeaders", "context", reqCtx) case *extProcPb.ProcessingRequest_RequestBody: - resp, err = s.HandleRequestBody(reqCtx, req) + resp, err = s.HandleRequestBody(ctx, reqCtx, req) if err == nil { metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) } - klog.V(logutil.VERBOSE).InfoS("Request context after HandleRequestBody", "context", reqCtx) + loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx) case *extProcPb.ProcessingRequest_ResponseHeaders: - resp, err = s.HandleResponseHeaders(reqCtx, req) - klog.V(logutil.VERBOSE).InfoS("Request context after HandleResponseHeaders", "context", reqCtx) + resp, err = s.HandleResponseHeaders(ctx, reqCtx, req) + loggerVerbose.Info("Request context after HandleResponseHeaders", "context", reqCtx) case *extProcPb.ProcessingRequest_ResponseBody: - resp, err = s.HandleResponseBody(reqCtx, req) + resp, err = s.HandleResponseBody(ctx, reqCtx, req) if err == nil && reqCtx.ResponseComplete { reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) + metrics.RecordRequestLatencies(ctx, reqCtx.Model, 
reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.PromptTokens) metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Response.Usage.CompletionTokens) } - klog.V(logutil.VERBOSE).InfoS("Request context after HandleResponseBody", "context", reqCtx) + loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx) default: - klog.V(logutil.DEFAULT).ErrorS(nil, "Unknown Request type", "request", v) + logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v) return status.Error(codes.Unknown, "unknown request type") } if err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Failed to process request", "request", req) + logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) switch status.Code(err) { // This code can be returned by scheduler when there is no capacity for sheddable // requests. 
@@ -125,9 +130,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { } } - klog.V(logutil.VERBOSE).InfoS("Response generated", "response", resp) + loggerVerbose.Info("Response generated", "response", resp) if err := srv.Send(resp); err != nil { - klog.V(logutil.DEFAULT).ErrorS(err, "Send failed") + logger.V(logutil.DEFAULT).Error(err, "Send failed") return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index aabb150d..8b684d39 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -3,24 +3,25 @@ package main import ( "context" + "github.com/go-logr/logr" "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" - klog "k8s.io/klog/v2" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type healthServer struct { + logger logr.Logger datastore *backend.K8sDatastore } func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { if !s.datastore.HasSynced() { - klog.V(logutil.VERBOSE).InfoS("gRPC health check not serving", "service", in.Service) + s.logger.V(logutil.VERBOSE).Info("gRPC health check not serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil } - klog.V(logutil.VERBOSE).InfoS("gRPC health check serving", "service", in.Service) + s.logger.V(logutil.VERBOSE).Info("gRPC health check serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 8f4cd8e7..ba593d7d 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -8,6 +8,7 @@ import ( "os" "strconv" + "github.com/go-logr/logr" 
"github.com/prometheus/client_golang/prometheus/promhttp" uberzap "go.uber.org/zap" "go.uber.org/zap/zapcore" @@ -18,7 +19,6 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/component-base/metrics/legacyregistry" - klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -79,7 +79,8 @@ var ( "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") - scheme = runtime.NewScheme() + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") ) func init() { @@ -103,7 +104,7 @@ func run() error { // Validate flags if err := validateFlags(); err != nil { - klog.ErrorS(err, "Failed to validate flags") + setupLog.Error(err, "Failed to validate flags") return err } @@ -112,20 +113,20 @@ func run() error { flag.VisitAll(func(f *flag.Flag) { flags[f.Name] = f.Value }) - klog.InfoS("Flags processed", "flags", flags) + setupLog.Info("Flags processed", "flags", flags) datastore := backend.NewK8sDataStore() // Init runtime. cfg, err := ctrl.GetConfig() if err != nil { - klog.ErrorS(err, "Failed to get rest config") + setupLog.Error(err, "Failed to get rest config") return err } mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) if err != nil { - klog.ErrorS(err, "Failed to create controller manager", "config", cfg) + setupLog.Error(err, "Failed to create controller manager", "config", cfg) return err } @@ -143,18 +144,20 @@ func run() error { CertPath: *certPath, } if err := serverRunner.SetupWithManager(mgr); err != nil { - klog.ErrorS(err, "Failed to setup ext-proc server") + setupLog.Error(err, "Failed to setup ext-proc server") return err } // Register health server. 
- if err := registerHealthServer(mgr, datastore, *grpcHealthPort); err != nil { + if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), datastore, *grpcHealthPort); err != nil { return err } // Register ext-proc server. - if err := mgr.Add(serverRunner.AsRunnable(datastore, &vllm.PodMetricsClientImpl{})); err != nil { - klog.ErrorS(err, "Failed to register ext-proc server") + if err := mgr.Add(serverRunner.AsRunnable( + ctrl.Log.WithName("ext-proc"), datastore, &vllm.PodMetricsClientImpl{}, + )); err != nil { + setupLog.Error(err, "Failed to register ext-proc server") return err } @@ -164,12 +167,12 @@ func run() error { } // Start the manager. This blocks until a signal is received. - klog.InfoS("Controller manager starting") + setupLog.Info("Controller manager starting") if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { - klog.ErrorS(err, "Error starting controller manager") + setupLog.Error(err, "Error starting controller manager") return err } - klog.InfoS("Controller manager terminated") + setupLog.Info("Controller manager terminated") return nil } @@ -189,16 +192,18 @@ func initLogging(opts *zap.Options) { logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) ctrl.SetLogger(logger) - klog.SetLogger(logger) } // registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 
-func registerHealthServer(mgr manager.Manager, ds *backend.K8sDatastore, port int) error { +func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds *backend.K8sDatastore, port int) error { srv := grpc.NewServer() - healthPb.RegisterHealthServer(srv, &healthServer{datastore: ds}) + healthPb.RegisterHealthServer(srv, &healthServer{ + logger: logger, + datastore: ds, + }) if err := mgr.Add( runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { - klog.ErrorS(err, "Failed to register health server") + setupLog.Error(err, "Failed to register health server") return err } return nil @@ -226,7 +231,7 @@ func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) err Name: "metrics", Server: srv, }); err != nil { - klog.ErrorS(err, "Failed to register metrics HTTP handler") + setupLog.Error(err, "Failed to register metrics HTTP handler") return err } return nil @@ -239,19 +244,19 @@ func metricsHandlerWithAuthenticationAndAuthorization(cfg *rest.Config) (http.Ha ) httpClient, err := rest.HTTPClientFor(cfg) if err != nil { - klog.ErrorS(err, "Failed to create http client for metrics auth") + setupLog.Error(err, "Failed to create http client for metrics auth") return nil, err } filter, err := filters.WithAuthenticationAndAuthorization(cfg, httpClient) if err != nil { - klog.ErrorS(err, "Failed to create metrics filter for auth") + setupLog.Error(err, "Failed to create metrics filter for auth") return nil, err } - metricsLogger := klog.LoggerWithValues(klog.NewKlogr(), "path", defaultMetricsEndpoint) + metricsLogger := ctrl.Log.WithName("metrics").WithValues("path", defaultMetricsEndpoint) metricsAuthHandler, err := filter(metricsLogger, h) if err != nil { - klog.ErrorS(err, "Failed to create metrics auth handler") + setupLog.Error(err, "Failed to create metrics auth handler") return nil, err } return metricsAuthHandler, nil diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index 
1412af6e..e3226f47 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -1,12 +1,13 @@ package metrics import ( + "context" "sync" "time" compbasemetrics "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -144,9 +145,9 @@ func RecordRequestSizes(modelName, targetModelName string, reqSize int) { } // RecordRequestLatencies records duration of request. -func RecordRequestLatencies(modelName, targetModelName string, received time.Time, complete time.Time) bool { +func RecordRequestLatencies(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time) bool { if !complete.After(received) { - klog.V(logutil.DEFAULT).ErrorS(nil, "Request latency values are invalid", + log.FromContext(ctx).V(logutil.DEFAULT).Error(nil, "Request latency values are invalid", "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received) return false } diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index 348f707e..d24afdb1 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -1,22 +1,26 @@ package metrics import ( + "context" "os" "testing" "time" "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) -const RequestTotalMetric = InferenceModelComponent + "_request_total" -const RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" -const RequestSizesMetric = InferenceModelComponent + "_request_sizes" -const ResponseSizesMetric = InferenceModelComponent + "_response_sizes" -const InputTokensMetric = InferenceModelComponent + "_input_tokens" -const OutputTokensMetric = 
InferenceModelComponent + "_output_tokens" -const KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" -const QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" +const ( + RequestTotalMetric = InferenceModelComponent + "_request_total" + RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceModelComponent + "_request_sizes" + ResponseSizesMetric = InferenceModelComponent + "_response_sizes" + InputTokensMetric = InferenceModelComponent + "_input_tokens" + OutputTokensMetric = InferenceModelComponent + "_output_tokens" + KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" + QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" +) func TestRecordRequestCounterandSizes(t *testing.T) { type requests struct { @@ -83,12 +87,12 @@ func TestRecordRequestCounterandSizes(t *testing.T) { if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestSizes, RequestSizesMetric); err != nil { t.Error(err) } - }) } } func TestRecordRequestLatencies(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) timeBaseline := time.Now() type requests struct { modelName string @@ -100,35 +104,36 @@ func TestRecordRequestLatencies(t *testing.T) { name string reqs []requests invalid bool - }{{ - name: "multiple requests", - reqs: []requests{ - { - modelName: "m10", - targetModelName: "t10", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 10), - }, - { - modelName: "m10", - targetModelName: "t10", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 1600), - }, - { - modelName: "m10", - targetModelName: "t11", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 60), - }, - { - modelName: "m20", - targetModelName: "t20", - receivedTime: timeBaseline, - completeTime: timeBaseline.Add(time.Millisecond * 120), + }{ + { + 
name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 10), + }, + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1600), + }, + { + modelName: "m10", + targetModelName: "t11", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 60), + }, + { + modelName: "m20", + targetModelName: "t20", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 120), + }, }, }, - }, { name: "invalid elapsed time", reqs: []requests{ @@ -137,14 +142,16 @@ func TestRecordRequestLatencies(t *testing.T) { targetModelName: "t10", receivedTime: timeBaseline.Add(time.Millisecond * 10), completeTime: timeBaseline, - }}, + }, + }, invalid: true, - }} + }, + } Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { for _, req := range scenario.reqs { - success := RecordRequestLatencies(req.modelName, req.targetModelName, req.receivedTime, req.completeTime) + success := RecordRequestLatencies(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime) if success == scenario.invalid { t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid) } @@ -277,7 +284,6 @@ func TestInferencePoolMetrics(t *testing.T) { Register() for _, scenario := range scenarios { t.Run(scenario.name, func(t *testing.T) { - RecordInferencePoolAvgKVCache(scenario.poolName, scenario.kvCacheAvg) RecordInferencePoolAvgQueueSize(scenario.poolName, scenario.queueSizeAvg) diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index ac7a287c..e028c59a 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -4,14 +4,14 @@ import ( "errors" "math" - klog "k8s.io/klog/v2" + "github.com/go-logr/logr" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type Filter interface { Name() string - Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) + Filter(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) } // filter applies current filterFunc, and then recursively applies next filters depending success or @@ -41,10 +41,11 @@ func (f *filter) Name() string { return f.name } -func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - klog.V(logutil.VERBOSE).InfoS("Running a filter", "name", f.Name(), "request", req, "podCount", len(pods)) +func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + loggerTrace := logger.V(logutil.TRACE) + loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods)) - filtered, err := f.filter(req, pods) + filtered, err := f.filter(logger, req, pods) next := f.nextOnSuccessOrFailure if err == nil && len(filtered) > 0 { @@ -55,9 +56,9 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend if f.nextOnSuccess != nil { next = f.nextOnSuccess } - klog.V(logutil.VERBOSE).InfoS("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) + loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) // On success, pass the filtered result to the next filter. - return next.Filter(req, filtered) + return next.Filter(logger, req, filtered) } else { if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil { // No succeeding filters to run, return. 
@@ -66,18 +67,18 @@ func (f *filter) Filter(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend if f.nextOnFailure != nil { next = f.nextOnFailure } - klog.V(logutil.VERBOSE).InfoS("Filter failed", "filter", f.Name(), "next", next.Name()) + loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name()) // On failure, pass the initial set of pods to the next filter. - return next.Filter(req, pods) + return next.Filter(logger, req, pods) } } // filterFunc filters a set of input pods to a subset. -type filterFunc func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) +type filterFunc func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. func toFilterFunc(pp podPredicate) filterFunc { - return func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + return func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { filtered := []*backend.PodMetrics{} for _, pod := range pods { pass := pp(req, pod) @@ -99,7 +100,7 @@ func toFilterFunc(pp podPredicate) filterFunc { // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. -func leastQueuingFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { +func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { min := math.MaxInt max := 0 filtered := []*backend.PodMetrics{} @@ -131,7 +132,7 @@ func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool { // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. 
// TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { +func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { min := math.MaxFloat64 var max float64 = 0 filtered := []*backend.PodMetrics{} diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index 224dc83f..ee1a8c33 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -4,11 +4,15 @@ import ( "errors" "testing" + "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func TestFilter(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { name string req *LLMRequest @@ -19,7 +23,7 @@ func TestFilter(t *testing.T) { }{ { name: "simple filter without successor, failure", - filter: &filter{filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { return nil, errors.New("filter error") }}, err: true, @@ -201,7 +205,7 @@ func TestFilter(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := test.filter.Filter(test.req, test.input) + got, err := test.filter.Filter(logger, test.req, test.input) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } @@ -214,6 +218,8 @@ func TestFilter(t *testing.T) { } func TestFilterFunc(t *testing.T) { + logger := logutil.NewTestLogger() + tests := []struct { name string f filterFunc @@ -395,7 +401,7 @@ func TestFilterFunc(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, 
err := test.f(test.req, test.input) + got, err := test.f(logger, test.req, test.input) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 50564898..16cf90b8 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -2,12 +2,14 @@ package scheduling import ( + "context" "fmt" "math/rand" + "github.com/go-logr/logr" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -82,8 +84,8 @@ var ( // request to make room for critical requests. nextOnFailure: &filter{ name: "drop request", - filter: func(req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - klog.V(logutil.DEFAULT).InfoS("Request dropped", "request", req) + filter: func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) return []*backend.PodMetrics{}, status.Errorf( codes.ResourceExhausted, "dropping request due to limited backend resources") }, @@ -110,14 +112,15 @@ type PodMetricsProvider interface { } // Schedule finds the target pod based on metrics and the requested lora adapter. 
-func (s *Scheduler) Schedule(req *LLMRequest) (targetPod backend.Pod, err error) { - klog.V(logutil.VERBOSE).InfoS("Scheduling a request", "request", req, "metrics", s.podMetricsProvider.AllPodMetrics()) - pods, err := s.filter.Filter(req, s.podMetricsProvider.AllPodMetrics()) +func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backend.Pod, err error) { + logger := log.FromContext(ctx).WithValues("request", req) + logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", s.podMetricsProvider.AllPodMetrics()) + pods, err := s.filter.Filter(logger, req, s.podMetricsProvider.AllPodMetrics()) if err != nil || len(pods) == 0 { return backend.Pod{}, fmt.Errorf( "failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } - klog.V(logutil.VERBOSE).InfoS("Selecting a random pod from the candidates", "candidatePods", pods) + logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods) i := rand.Intn(len(pods)) return pods[i].Pod, nil } diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index ed260b04..fb9741d2 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -13,10 +13,10 @@ import ( "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "k8s.io/apimachinery/pkg/types" - klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" @@ -108,14 +108,15 @@ func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { // AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server. // The runnable implements LeaderElectionRunnable with leader election disabled. 
func (r *ExtProcServerRunner) AsRunnable( + logger logr.Logger, podDatastore *backend.K8sDatastore, podMetricsClient backend.PodMetricsClient, ) manager.Runnable { return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { // Initialize backend provider pp := backend.NewProvider(podMetricsClient, podDatastore) - if err := pp.Init(r.RefreshPodsInterval, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { - klog.ErrorS(err, "Failed to initialize backend provider") + if err := pp.Init(logger.WithName("provider"), r.RefreshPodsInterval, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { + logger.Error(err, "Failed to initialize backend provider") return err } @@ -127,10 +128,10 @@ func (r *ExtProcServerRunner) AsRunnable( cert, err = tls.LoadX509KeyPair(r.CertPath+"/tls.crt", r.CertPath+"/tls.key") } else { // Create tls based credential. - cert, err = createSelfSignedTLSCertificate() + cert, err = createSelfSignedTLSCertificate(logger) } if err != nil { - klog.ErrorS(err, "Failed to create self signed certificate") + logger.Error(err, "Failed to create self signed certificate") return err } @@ -152,11 +153,11 @@ func (r *ExtProcServerRunner) AsRunnable( })) } -func createSelfSignedTLSCertificate() (tls.Certificate, error) { +func createSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) if err != nil { - klog.ErrorS(err, "Failed to create serial number for self-signed cert") + logger.Error(err, "Failed to create serial number for self-signed cert") return tls.Certificate{}, err } now := time.Now() @@ -175,13 +176,13 @@ func createSelfSignedTLSCertificate() (tls.Certificate, error) { priv, err := rsa.GenerateKey(rand.Reader, 4096) if err != nil { - klog.ErrorS(err, "Failed to generate key for self-signed cert") + logger.Error(err, "Failed to generate key 
for self-signed cert") return tls.Certificate{}, err } derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) if err != nil { - klog.ErrorS(err, "Failed to create self-signed certificate") + logger.Error(err, "Failed to create self-signed certificate") return tls.Certificate{}, err } @@ -189,7 +190,7 @@ func createSelfSignedTLSCertificate() (tls.Certificate, error) { privBytes, err := x509.MarshalPKCS8PrivateKey(priv) if err != nil { - klog.ErrorS(err, "Failed to marshal private key for self-signed certificate") + logger.Error(err, "Failed to marshal private key for self-signed certificate") return tls.Certificate{}, err } keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) diff --git a/pkg/ext-proc/server/runserver_test.go b/pkg/ext-proc/server/runserver_test.go index df2081aa..1badb8fd 100644 --- a/pkg/ext-proc/server/runserver_test.go +++ b/pkg/ext-proc/server/runserver_test.go @@ -6,11 +6,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) func TestRunnable(t *testing.T) { // Make sure AsRunnable() does not use leader election. 
- runner := server.NewDefaultExtProcServerRunner().AsRunnable(nil, nil) + runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger(), nil, nil) r, ok := runner.(manager.LeaderElectionRunnable) if !ok { t.Fatal("runner is not LeaderElectionRunnable") diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index c83dbcb9..9eca2edc 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -8,9 +8,11 @@ import ( "github.com/bojand/ghz/printer" "github.com/bojand/ghz/runner" + "github.com/go-logr/logr" "github.com/jhump/protoreflect/desc" + uberzap "go.uber.org/zap" "google.golang.org/protobuf/proto" - klog "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" @@ -41,24 +43,29 @@ func main() { } func run() error { - klog.InitFlags(nil) + opts := zap.Options{ + Development: true, + } + opts.BindFlags(flag.CommandLine) flag.Parse() + logger := zap.New(zap.UseFlagOptions(&opts), zap.RawZapOpts(uberzap.AddCaller())) + if *localServer { - test.StartExtProc(port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) + test.StartExtProc(logger, port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) time.Sleep(time.Second) // wait until server is up - klog.InfoS("Server started") + logger.Info("Server started") } report, err := runner.Run( "envoy.service.ext_proc.v3.ExternalProcessor.Process", *svrAddr, runner.WithInsecure(true), - runner.WithBinaryDataFunc(generateRequest), + runner.WithBinaryDataFunc(generateRequestFunc(logger)), runner.WithTotalRequests(uint(*totalRequests)), ) if err != nil { - klog.ErrorS(err, "Runner failed") + 
logger.Error(err, "Runner failed") return err } @@ -71,14 +78,16 @@ func run() error { return nil } -func generateRequest(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { - numModels := *numFakePods * (*numModelsPerPod) - req := test.GenerateRequest(modelName(int(callData.RequestNumber) % numModels)) - data, err := proto.Marshal(req) - if err != nil { - logutil.Fatal(err, "Failed to marshal request", "request", req) +func generateRequestFunc(logger logr.Logger) func(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { + return func(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { + numModels := *numFakePods * (*numModelsPerPod) + req := test.GenerateRequest(logger, modelName(int(callData.RequestNumber)%numModels)) + data, err := proto.Marshal(req) + if err != nil { + logutil.Fatal(logger, err, "Failed to marshal request", "request", req) + } + return data } - return data } func fakeModels() map[string]*v1alpha1.InferenceModel { diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index 4c000722..cb99a36b 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -7,9 +7,9 @@ import ( "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" "google.golang.org/grpc" "google.golang.org/grpc/reflection" - klog "k8s.io/klog/v2" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" @@ -17,7 +17,13 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) -func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel) *grpc.Server { +func StartExtProc( + logger logr.Logger, + port int, + refreshPodsInterval, refreshMetricsInterval, 
refreshPrometheusMetricsInterval time.Duration, + pods []*backend.PodMetrics, + models map[string]*v1alpha1.InferenceModel, +) *grpc.Server { ps := make(backend.PodSet) pms := make(map[backend.Pod]*backend.PodMetrics) for _, pod := range pods { @@ -26,35 +32,35 @@ func StartExtProc(port int, refreshPodsInterval, refreshMetricsInterval, refresh } pmc := &backend.FakePodMetricsClient{Res: pms} pp := backend.NewProvider(pmc, backend.NewK8sDataStore(backend.WithPods(pods))) - if err := pp.Init(refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { - logutil.Fatal(err, "Failed to initialize") + if err := pp.Init(logger, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { + logutil.Fatal(logger, err, "Failed to initialize") } - return startExtProc(port, pp, models) + return startExtProc(logger, port, pp, models) } // startExtProc starts an extProc server with fake pods. -func startExtProc(port int, pp *backend.Provider, models map[string]*v1alpha1.InferenceModel) *grpc.Server { +func startExtProc(logger logr.Logger, port int, pp *backend.Provider, models map[string]*v1alpha1.InferenceModel) *grpc.Server { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) if err != nil { - logutil.Fatal(err, "Failed to listen", "port", port) + logutil.Fatal(logger, err, "Failed to listen", "port", port) } s := grpc.NewServer() extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(pp, scheduling.NewScheduler(pp), "target-pod", &backend.FakeDataStore{Res: models})) - klog.InfoS("gRPC server starting", "port", port) + logger.Info("gRPC server starting", "port", port) reflection.Register(s) go func() { err := s.Serve(lis) if err != nil { - logutil.Fatal(err, "Ext-proc failed with the err") + logutil.Fatal(logger, err, "Ext-proc failed with the err") } }() return s } -func GenerateRequest(model string) *extProcPb.ProcessingRequest { +func GenerateRequest(logger logr.Logger, model string) 
*extProcPb.ProcessingRequest { j := map[string]interface{}{ "model": model, "prompt": "hello", @@ -64,7 +70,7 @@ func GenerateRequest(model string) *extProcPb.ProcessingRequest { llmReq, err := json.Marshal(j) if err != nil { - logutil.Fatal(err, "Failed to unmarshal LLM request") + logutil.Fatal(logger, err, "Failed to unmarshal LLM request") } req := &extProcPb.ProcessingRequest{ Request: &extProcPb.ProcessingRequest_RequestBody{ diff --git a/pkg/ext-proc/util/logging/fatal.go b/pkg/ext-proc/util/logging/fatal.go index 65926824..1f85b450 100644 --- a/pkg/ext-proc/util/logging/fatal.go +++ b/pkg/ext-proc/util/logging/fatal.go @@ -1,11 +1,15 @@ package logging -import "k8s.io/klog/v2" +import ( + "os" -// Fatal calls klog.ErrorS followed by klog.FlushAndExit(1). + "github.com/go-logr/logr" +) + +// Fatal calls logger.Error followed by os.Exit(1). // // This is a utility function and should not be used in production code! -func Fatal(err error, msg string, keysAndValues ...interface{}) { - klog.ErrorS(err, msg, keysAndValues...) - klog.FlushAndExit(klog.ExitFlushTimeout, 1) +func Fatal(logger logr.Logger, err error, msg string, keysAndValues ...interface{}) { + logger.Error(err, msg, keysAndValues...) + os.Exit(1) } diff --git a/pkg/ext-proc/util/logging/logger.go b/pkg/ext-proc/util/logging/logger.go new file mode 100644 index 00000000..086a012f --- /dev/null +++ b/pkg/ext-proc/util/logging/logger.go @@ -0,0 +1,20 @@ +package logging + +import ( + "context" + + "github.com/go-logr/logr" + uberzap "go.uber.org/zap" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" +) + +// NewTestLogger creates a new Zap logger using the dev mode. +func NewTestLogger() logr.Logger { + return zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller())) +} + +// NewTestLoggerIntoContext creates a new Zap logger using the dev mode and inserts it into the given context. 
+func NewTestLoggerIntoContext(ctx context.Context) context.Context { + return log.IntoContext(ctx, zap.New(zap.UseDevMode(true), zap.RawZapOpts(uberzap.AddCaller()))) +} diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 6424663b..a99b6bd7 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -6,7 +6,6 @@ import ( "bytes" "context" "errors" - "flag" "fmt" "io" "os" @@ -26,7 +25,6 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -47,6 +45,7 @@ var ( k8sClient k8sclient.Client testEnv *envtest.Environment scheme = runtime.NewScheme() + logger = logutil.NewTestLogger().V(logutil.VERBOSE) ) func TestKubeInferenceModelRequest(t *testing.T) { @@ -62,7 +61,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }{ { name: "select lower queue and kv cache, no active lora", - req: extprocutils.GenerateRequest("my-model"), + req: extprocutils.GenerateRequest(logger, "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. pods: []*backend.PodMetrics{ { @@ -115,7 +114,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select active lora, low queue", - req: extprocutils.GenerateRequest("sql-lora"), + req: extprocutils.GenerateRequest(logger, "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. 
pods: []*backend.PodMetrics{ @@ -180,7 +179,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select no lora despite active model, avoid excessive queue size", - req: extprocutils.GenerateRequest("sql-lora"), + req: extprocutils.GenerateRequest(logger, "sql-lora"), // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. Also is critical, so we should // still honor request despite all queues > 5 @@ -246,7 +245,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical and all models past threshold, shed request", - req: extprocutils.GenerateRequest("sql-lora-sheddable"), + req: extprocutils.GenerateRequest(logger, "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. pods: []*backend.PodMetrics{ @@ -297,7 +296,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical, but one server has capacity, do not shed", - req: extprocutils.GenerateRequest("sql-lora-sheddable"), + req: extprocutils.GenerateRequest(logger, "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold pods: []*backend.PodMetrics{ { @@ -418,9 +417,9 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP serverCtx, stopServer := context.WithCancel(context.Background()) go func() { if err := serverRunner.AsRunnable( - backend.NewK8sDataStore(backend.WithPods(pods)), pmc, + logger.WithName("ext-proc"), backend.NewK8sDataStore(backend.WithPods(pods)), pmc, ).Start(serverCtx); err != nil { - logutil.Fatal(err, "Failed to start ext-proc server") + logutil.Fatal(logger, err, "Failed to start ext-proc server") } }() @@ -431,13 +430,13 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP // Create a grpc connection conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) if err != nil { - 
logutil.Fatal(err, "Failed to connect", "address", address) + logutil.Fatal(logger, err, "Failed to connect", "address", address) } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) client, err = extProcPb.NewExternalProcessorClient(conn).Process(ctx) if err != nil { - logutil.Fatal(err, "Failed to create client") + logutil.Fatal(logger, err, "Failed to create client") } return client, func() { cancel() @@ -455,7 +454,7 @@ func BeforeSuit() { } cfg, err := testEnv.Start() if err != nil { - logutil.Fatal(err, "Failed to start test environment", "config", cfg) + logutil.Fatal(logger, err, "Failed to start test environment", "config", cfg) } utilruntime.Must(clientgoscheme.AddToScheme(scheme)) @@ -463,15 +462,16 @@ func BeforeSuit() { k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme}) if err != nil { - logutil.Fatal(err, "Failed to start k8s Client") + logutil.Fatal(logger, err, "Failed to start k8s Client") } else if k8sClient == nil { - logutil.Fatal(nil, "No error, but returned kubernetes client is nil", "config", cfg) + logutil.Fatal(logger, nil, "No error, but returned kubernetes client is nil", "config", cfg) } // Init runtime. 
+ ctrl.SetLogger(logger) mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) if err != nil { - logutil.Fatal(err, "Failed to create controller manager") + logutil.Fatal(logger, err, "Failed to create controller manager") } serverRunner = runserver.NewDefaultExtProcServerRunner() @@ -481,50 +481,46 @@ func BeforeSuit() { serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(mgr); err != nil { - logutil.Fatal(err, "Failed to setup server runner") + logutil.Fatal(logger, err, "Failed to setup server runner") } // Start the controller manager in go routine, not blocking go func() { if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { - logutil.Fatal(err, "Failed to start manager") + logutil.Fatal(logger, err, "Failed to start manager") } }() - klog.InfoS("Setting up hermetic ExtProc server") - klog.InitFlags(nil) - flag.Parse() - // Configure klog verbosity levels to print ext proc logs. - _ = flag.Lookup("v").Value.Set("3") + logger.Info("Setting up hermetic ExtProc server") // Unmarshal CRDs from file into structs manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") docs, err := readDocuments(manifestsPath) if err != nil { - logutil.Fatal(err, "Can't read object manifests", "path", manifestsPath) + logutil.Fatal(logger, err, "Can't read object manifests", "path", manifestsPath) } for _, doc := range docs { inferenceModel := &v1alpha1.InferenceModel{} if err = yaml.Unmarshal(doc, inferenceModel); err != nil { - logutil.Fatal(err, "Can't unmarshal object", "document", doc) + logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } if inferenceModel.Kind == "InferenceModel" { - klog.InfoS("Creating inference model", "model", inferenceModel) + logger.Info("Creating inference model", "model", inferenceModel) if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - logutil.Fatal(err, "Unable to create inferenceModel", "modelName", inferenceModel.Name) + 
logutil.Fatal(logger, err, "Unable to create inferenceModel", "modelName", inferenceModel.Name) } } } for _, doc := range docs { inferencePool := &v1alpha1.InferencePool{} if err = yaml.Unmarshal(doc, inferencePool); err != nil { - logutil.Fatal(err, "Can't unmarshal object", "document", doc) + logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } if inferencePool.Kind == "InferencePool" { - klog.InfoS("Creating inference pool", "pool", inferencePool) + logger.Info("Creating inference pool", "pool", inferencePool) if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - logutil.Fatal(err, "Unable to create inferencePool", "poolName", inferencePool.Name) + logutil.Fatal(logger, err, "Unable to create inferencePool", "poolName", inferencePool.Name) } } } From bc5eac67bd3f70bd87210a660b8f6edd5bc2e6f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Feb 2025 15:48:12 -0800 Subject: [PATCH 27/96] Bump the kubernetes group with 6 updates (#351) Bumps the kubernetes group with 6 updates: | Package | From | To | | --- | --- | --- | | [k8s.io/api](https://github.com/kubernetes/api) | `0.32.1` | `0.32.2` | | [k8s.io/apiextensions-apiserver](https://github.com/kubernetes/apiextensions-apiserver) | `0.32.1` | `0.32.2` | | [k8s.io/apimachinery](https://github.com/kubernetes/apimachinery) | `0.32.1` | `0.32.2` | | [k8s.io/client-go](https://github.com/kubernetes/client-go) | `0.32.1` | `0.32.2` | | [k8s.io/code-generator](https://github.com/kubernetes/code-generator) | `0.32.1` | `0.32.2` | | [k8s.io/component-base](https://github.com/kubernetes/component-base) | `0.32.1` | `0.32.2` | Updates `k8s.io/api` from 0.32.1 to 0.32.2 - [Commits](https://github.com/kubernetes/api/compare/v0.32.1...v0.32.2) Updates `k8s.io/apiextensions-apiserver` from 0.32.1 to 0.32.2 - [Release notes](https://github.com/kubernetes/apiextensions-apiserver/releases) - 
[Commits](https://github.com/kubernetes/apiextensions-apiserver/compare/v0.32.1...v0.32.2) Updates `k8s.io/apimachinery` from 0.32.1 to 0.32.2 - [Commits](https://github.com/kubernetes/apimachinery/compare/v0.32.1...v0.32.2) Updates `k8s.io/client-go` from 0.32.1 to 0.32.2 - [Changelog](https://github.com/kubernetes/client-go/blob/master/CHANGELOG.md) - [Commits](https://github.com/kubernetes/client-go/compare/v0.32.1...v0.32.2) Updates `k8s.io/code-generator` from 0.32.1 to 0.32.2 - [Commits](https://github.com/kubernetes/code-generator/compare/v0.32.1...v0.32.2) Updates `k8s.io/component-base` from 0.32.1 to 0.32.2 - [Commits](https://github.com/kubernetes/component-base/compare/v0.32.1...v0.32.2) --- updated-dependencies: - dependency-name: k8s.io/api dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/apiextensions-apiserver dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/apimachinery dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/client-go dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/code-generator dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/component-base dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index a59a28cc..2c590489 100644 --- a/go.mod +++ b/go.mod @@ -21,12 +21,12 @@ require ( go.uber.org/zap v1.27.0 google.golang.org/grpc v1.70.0 google.golang.org/protobuf v1.36.5 - k8s.io/api v0.32.1 - k8s.io/apiextensions-apiserver v0.32.1 - k8s.io/apimachinery v0.32.1 - k8s.io/client-go v0.32.1 - k8s.io/code-generator v0.32.1 - k8s.io/component-base v0.32.1 + k8s.io/api v0.32.2 + k8s.io/apiextensions-apiserver v0.32.2 + k8s.io/apimachinery v0.32.2 + k8s.io/client-go v0.32.2 + k8s.io/code-generator v0.32.2 + k8s.io/component-base v0.32.2 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.1 @@ -135,7 +135,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.32.1 // indirect + k8s.io/apiserver v0.32.2 // indirect k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect diff --git a/go.sum b/go.sum index 803ed988..f10f9a31 100644 --- a/go.sum +++ b/go.sum @@ -347,20 +347,20 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= -k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= -k8s.io/apiextensions-apiserver v0.32.1 
h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= -k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= -k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= -k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/apiserver v0.32.1 h1:oo0OozRos66WFq87Zc5tclUX2r0mymoVHRq8JmR7Aak= -k8s.io/apiserver v0.32.1/go.mod h1:UcB9tWjBY7aryeI5zAgzVJB/6k7E97bkr1RgqDz0jPw= -k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= -k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= -k8s.io/code-generator v0.32.1 h1:4lw1kFNDuFYXquTkB7Sl5EwPMUP2yyW9hh6BnFfRZFY= -k8s.io/code-generator v0.32.1/go.mod h1:zaILfm00CVyP/6/pJMJ3zxRepXkxyDfUV5SNG4CjZI4= -k8s.io/component-base v0.32.1 h1:/5IfJ0dHIKBWysGV0yKTFfacZ5yNV1sulPh3ilJjRZk= -k8s.io/component-base v0.32.1/go.mod h1:j1iMMHi/sqAHeG5z+O9BFNCF698a1u0186zkjMZQ28w= +k8s.io/api v0.32.2 h1:bZrMLEkgizC24G9eViHGOPbW+aRo9duEISRIJKfdJuw= +k8s.io/api v0.32.2/go.mod h1:hKlhk4x1sJyYnHENsrdCWw31FEmCijNGPJO5WzHiJ6Y= +k8s.io/apiextensions-apiserver v0.32.2 h1:2YMk285jWMk2188V2AERy5yDwBYrjgWYggscghPCvV4= +k8s.io/apiextensions-apiserver v0.32.2/go.mod h1:GPwf8sph7YlJT3H6aKUWtd0E+oyShk/YHWQHf/OOgCA= +k8s.io/apimachinery v0.32.2 h1:yoQBR9ZGkA6Rgmhbp/yuT9/g+4lxtsGYwW6dR6BDPLQ= +k8s.io/apimachinery v0.32.2/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apiserver v0.32.2 h1:WzyxAu4mvLkQxwD9hGa4ZfExo3yZZaYzoYvvVDlM6vw= +k8s.io/apiserver v0.32.2/go.mod h1:PEwREHiHNU2oFdte7BjzA1ZyjWjuckORLIK/wLV5goM= +k8s.io/client-go v0.32.2 h1:4dYCD4Nz+9RApM2b/3BtVvBHw54QjMFUl1OLcJG5yOA= +k8s.io/client-go v0.32.2/go.mod h1:fpZ4oJXclZ3r2nDOv+Ux3XcJutfrwjKTCHz2H3sww94= +k8s.io/code-generator v0.32.2 h1:CIvyPrLWP7cMgrqval2qYT839YAwCDeSvGfXgWSNpHQ= +k8s.io/code-generator v0.32.2/go.mod h1:plh7bWk7JztAUkHM4zpbdy0KOMdrhsePcZL2HLWFH7Y= +k8s.io/component-base v0.32.2 h1:1aUL5Vdmu7qNo4ZsE+569PV5zFatM9hl+lb3dEea2zU= 
+k8s.io/component-base v0.32.2/go.mod h1:PXJ61Vx9Lg+P5mS8TLd7bCIr+eMJRQTyXe8KvkrvJq0= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 h1:si3PfKm8dDYxgfbeA6orqrtLkvvIeH8UqffFJDl0bz4= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= From 5c67d108b4e758b9cfa20f17a879954dd43704d4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 18 Feb 2025 08:14:27 -0800 Subject: [PATCH 28/96] Bump sigs.k8s.io/controller-runtime from 0.20.1 to 0.20.2 (#352) Bumps [sigs.k8s.io/controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) from 0.20.1 to 0.20.2. - [Release notes](https://github.com/kubernetes-sigs/controller-runtime/releases) - [Changelog](https://github.com/kubernetes-sigs/controller-runtime/blob/main/RELEASE.md) - [Commits](https://github.com/kubernetes-sigs/controller-runtime/compare/v0.20.1...v0.20.2) --- updated-dependencies: - dependency-name: sigs.k8s.io/controller-runtime dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 2c590489..25daf027 100644 --- a/go.mod +++ b/go.mod @@ -29,7 +29,7 @@ require ( k8s.io/component-base v0.32.2 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20241210054802-24370beab758 - sigs.k8s.io/controller-runtime v0.20.1 + sigs.k8s.io/controller-runtime v0.20.2 sigs.k8s.io/structured-merge-diff/v4 v4.5.0 sigs.k8s.io/yaml v1.4.0 ) @@ -57,7 +57,7 @@ require ( github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect - github.com/evanphx/json-patch/v5 v5.9.0 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fatih/color v1.16.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect diff --git a/go.sum b/go.sum index f10f9a31..2d54aba2 100644 --- a/go.sum +++ b/go.sum @@ -59,8 +59,8 @@ github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfU github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= -github.com/evanphx/json-patch/v5 v5.9.0 h1:kcBlZQbplgElYIlo/n1hJbls2z/1awpXxpRi0/FOJfg= -github.com/evanphx/json-patch/v5 v5.9.0/go.mod h1:VNkHZ/282BpEyt/tObQO8s5CMPmYYq14uClGH4abBuQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod 
h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= @@ -371,8 +371,8 @@ k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJ k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.20.1 h1:JbGMAG/X94NeM3xvjenVUaBjy6Ui4Ogd/J5ZtjZnHaE= -sigs.k8s.io/controller-runtime v0.20.1/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU= +sigs.k8s.io/controller-runtime v0.20.2 h1:/439OZVxoEc02psi1h4QO3bHzTgu49bb347Xp4gW1pc= +sigs.k8s.io/controller-runtime v0.20.2/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= From 755b81bfd70fbc6cb3343ab6e33ce3c7d33c0f49 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 18 Feb 2025 17:22:28 +0000 Subject: [PATCH 29/96] Fixes to the adapter rollouts guide (#338) * Polishing to the adapter rollouts guide * Make all guides use the same deployment so that we can till one story as the user navigates through the guides * Addressed comments --- mkdocs.yml | 1 + pkg/manifests/inferencemodel.yaml | 11 +- .../vllm/deployment-with-syncer.yaml | 145 ------------------ pkg/manifests/vllm/deployment.yaml | 49 ++++-- site-src/guides/adapter-rollout.md | 133 ++++++++++++++++ site-src/guides/dynamic-lora.md | 93 ----------- site-src/guides/index.md | 33 ++-- 7 files changed, 187 insertions(+), 278 deletions(-) 
delete mode 100644 pkg/manifests/vllm/deployment-with-syncer.yaml create mode 100644 site-src/guides/adapter-rollout.md delete mode 100644 site-src/guides/dynamic-lora.md diff --git a/mkdocs.yml b/mkdocs.yml index c9bc30e0..a024c16d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -56,6 +56,7 @@ nav: - Guides: - User Guides: - Getting started: guides/index.md + - Adapter Rollout: guides/adapter-rollout.md - Implementer's Guide: guides/implementers.md - Reference: - API Reference: reference/spec.md diff --git a/pkg/manifests/inferencemodel.yaml b/pkg/manifests/inferencemodel.yaml index 0085a89d..2a292c16 100644 --- a/pkg/manifests/inferencemodel.yaml +++ b/pkg/manifests/inferencemodel.yaml @@ -1,21 +1,12 @@ apiVersion: inference.networking.x-k8s.io/v1alpha1 kind: InferenceModel metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize name: inferencemodel-sample spec: modelName: tweet-summary criticality: Critical poolRef: - # this is the default val: - group: inference.networking.x-k8s.io - # this is the default val: - kind: InferencePool name: vllm-llama2-7b-pool targetModels: - - name: tweet-summary-0 - weight: 50 - name: tweet-summary-1 - weight: 50 + weight: 100 diff --git a/pkg/manifests/vllm/deployment-with-syncer.yaml b/pkg/manifests/vllm/deployment-with-syncer.yaml deleted file mode 100644 index d6110f4b..00000000 --- a/pkg/manifests/vllm/deployment-with-syncer.yaml +++ /dev/null @@ -1,145 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: vllm-llama2-7b-pool -spec: - selector: - app: vllm-llama2-7b-pool - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - type: ClusterIP ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-llama2-7b-pool -spec: - replicas: 3 - selector: - matchLabels: - app: vllm-llama2-7b-pool - template: - metadata: - labels: - app: vllm-llama2-7b-pool - spec: - containers: - - name: lora - image: "vllm/vllm-openai:latest" - imagePullPolicy: Always - command: ["python3", "-m", 
"vllm.entrypoints.openai.api_server"] - args: - - "--model" - - "meta-llama/Llama-2-7b-hf" - - "--tensor-parallel-size" - - "1" - - "--port" - - "8000" - - "--enable-lora" - - "--max-loras" - - "4" - - "--max-cpu-loras" - - "12" - - "--lora-modules" - - '{"name": "tweet-summary-0", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - - '{"name": "tweet-summary-1", "path": "vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm", "base_model_name": "llama-2"}' - env: - - name: PORT - value: "8000" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token - key: token - - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING - value: "true" - ports: - - containerPort: 8000 - name: http - protocol: TCP - livenessProbe: - failureThreshold: 240 - httpGet: - path: /health - port: http - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 600 - httpGet: - path: /health - port: http - scheme: HTTP - initialDelaySeconds: 5 - periodSeconds: 5 - successThreshold: 1 - timeoutSeconds: 1 - resources: - limits: - nvidia.com/gpu: 1 - requests: - nvidia.com/gpu: 1 - volumeMounts: - - mountPath: /data - name: data - - mountPath: /dev/shm - name: shm - - name: adapters - mountPath: "/adapters" - initContainers: - - name: lora-adapter-syncer - tty: true - stdin: true - image: us-central1-docker.pkg.dev/ahg-gke-dev/jobset2/lora-syncer:6dc97be - restartPolicy: Always - imagePullPolicy: Always - env: - - name: DYNAMIC_LORA_ROLLOUT_CONFIG - value: "/config/configmap.yaml" - volumeMounts: # DO NOT USE subPath - - name: config-volume - mountPath: /config - restartPolicy: Always - schedulerName: default-scheduler - terminationGracePeriodSeconds: 30 - volumes: - - name: data - emptyDir: {} - - name: shm - emptyDir: - medium: Memory - - name: adapters - emptyDir: {} - - name: config-volume - configMap: - name: dynamic-lora-config - ---- - -apiVersion: v1 -kind: 
ConfigMap -metadata: - name: dynamic-lora-config -data: - configmap.yaml: | - vLLMLoRAConfig: - name: sql-loras-llama - port: 8000 - ensureExist: - models: - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-0 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-1 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - ensureNotExist: - models: - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-2 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm \ No newline at end of file diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml index 1d115f4d..a54d99b3 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -1,16 +1,3 @@ -apiVersion: v1 -kind: Service -metadata: - name: vllm-llama2-7b-pool -spec: - selector: - app: vllm-llama2-7b-pool - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - type: ClusterIP ---- apiVersion: apps/v1 kind: Deployment metadata: @@ -39,7 +26,7 @@ spec: - "8000" - "--enable-lora" - "--max-loras" - - "4" + - "2" - "--max-cpu-loras" - "12" - "--lora-modules" @@ -53,6 +40,8 @@ spec: secretKeyRef: name: hf-token key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" ports: - containerPort: 8000 name: http @@ -89,6 +78,19 @@ spec: name: shm - name: adapters mountPath: "/adapters" + initContainers: + - name: lora-adapter-syncer + tty: true + stdin: true + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/lora-syncer:main + restartPolicy: Always + imagePullPolicy: Always + env: + - name: DYNAMIC_LORA_ROLLOUT_CONFIG + value: "/config/configmap.yaml" + volumeMounts: # DO NOT USE subPath, dynamic configmap updates don't work on subPaths + - name: config-volume + mountPath: /config restartPolicy: Always schedulerName: default-scheduler terminationGracePeriodSeconds: 30 @@ -100,3 +102,22 @@ spec: medium: Memory - name: adapters 
emptyDir: {} + - name: config-volume + configMap: + name: vllm-llama2-7b-adapters +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama2-7b-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama2-7b + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md new file mode 100644 index 00000000..9ce8c3a4 --- /dev/null +++ b/site-src/guides/adapter-rollout.md @@ -0,0 +1,133 @@ +# Adapter Rollout + +The goal of this guide is to demonstrate how to rollout a new adapter version. + +## **Prerequisites** + +Follow the steps in the [main guide](index.md) + + +## **Safely rollout v2 adapter** + +### Load the new adapter version to the model servers + +This guide leverages the LoRA syncer sidecar to dynamically manage adapters within a vLLM deployment, enabling users to add or remove them through a shared ConfigMap. + + +Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version. + + +```bash + kubectl edit configmap vllm-llama2-7b-adapters +``` + +Change the ConfigMap to match the following (note the new entry under models): + +```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: vllm-llama2-7b-adapters + data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama2-7b-adapters + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: mahimairaja/tweet-summarization-llama-2-finetuned +``` + +The new adapter version is applied to the model servers live, without requiring a restart. + + +### Direct traffic to the new adapter version + +Modify the InferenceModel to configure a canary rollout with traffic splitting. 
In this example, 10% of traffic for the tweet-summary model will be sent to the new ***tweet-summary-2*** adapter. + + +```bash + kubectl edit inferencemodel inferencemodel-sample +``` + +Change the targetModels list in InferenceModel to match the following: + + +```yaml +apiVersion: inference.networking.x-k8s.io/v1alpha1 +kind: InferenceModel +metadata: + name: inferencemodel-sample +spec: + modelName: tweet-summary + criticality: Critical + poolRef: + name: vllm-llama2-7b-pool + targetModels: + - name: tweet-summary-1 + weight: 90 + - name: tweet-summary-2 + weight: 10 + +``` + +The above configuration means one in every ten requests should be sent to the new version. Try it out: + +1. Get the gateway IP: +```bash +IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081 +``` + +2. Send a few requests as follows: +```bash +curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ +"model": "tweet-summary", +"prompt": "Write as if you were a critic: San Francisco", +"max_tokens": 100, +"temperature": 0 +}' +``` + +### Finish the rollout + + +Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter. 
+ +```yaml +spec: + modelName: tweet-summary + targetModels: + - name: tweet-summary-2 + weight: 100 +``` + +Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list: + +```yaml + apiVersion: v1 + kind: ConfigMap + metadata: + name: vllm-llama2-7b-adapters + data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama2-7b-adapters + port: 8000 + ensureExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-2 + source: mahimairaja/tweet-summarization-llama-2-finetuned + ensureNotExist: + models: + - base-model: meta-llama/Llama-2-7b-hf + id: tweet-summary-1 + source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm +``` + +With this, all requests should be served by the new adapter version. diff --git a/site-src/guides/dynamic-lora.md b/site-src/guides/dynamic-lora.md deleted file mode 100644 index ef3c2b0f..00000000 --- a/site-src/guides/dynamic-lora.md +++ /dev/null @@ -1,93 +0,0 @@ -# Getting started with Gateway API Inference Extension with Dynamic lora updates on vllm - -The goal of this guide is to get a single InferencePool running with vLLM and demonstrate use of dynamic lora updating! - -### Requirements - - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - - A cluster with: - - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, - you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. - -### Steps - -1. 
**Deploy Sample VLLM Model Server with dynamic lora update enabled and dynamic lora syncer sidecar ** - [Redeploy the vLLM deployment with Dynamic lora adapter enabled and Lora syncer sidecar and configmap](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/pkg/manifests/vllm/dynamic-lora-sidecar/deployment.yaml) - -Rest of the steps are same as [general setup](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/site-src/guides/index.md) - - -### Safely rollout v2 adapter - -1. Update the LoRA syncer ConfigMap to make the new adapter version available on the model servers. - -```yaml - apiVersion: v1 - kind: ConfigMap - metadata: - name: dynamic-lora-config - data: - configmap.yaml: | - vLLMLoRAConfig: - name: sql-loras-llama - port: 8000 - ensureExist: - models: - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-0 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-1 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-2 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm -2. Configure a canary rollout with traffic split using LLMService. In this example, 40% of traffic for tweet-summary model will be sent to the ***tweet-summary-2*** adapter . - -```yaml -model: - name: tweet-summary - targetModels: - targetModelName: tweet-summary-0 - weight: 20 - targetModelName: tweet-summary-1 - weight: 40 - targetModelName: tweet-summary-2 - weight: 40 - -``` - -3. Finish rollout by setting the traffic to the new version 100%. -```yaml -model: - name: tweet-summary - targetModels: - targetModelName: tweet-summary-2 - weight: 100 -``` - -4. Remove v1 from dynamic lora configmap. 
-```yaml - apiVersion: v1 - kind: ConfigMap - metadata: - name: dynamic-lora-config - data: - configmap.yaml: | - vLLMLoRAConfig: - name: sql-loras-llama - port: 8000 - ensureExist: - models: - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-2 - source: vineetsharma/qlora-adapter-Llama-2-7b-hf-TweetSumm - ensureNotExist: - models: - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-1 - source: gs://[HUGGING FACE PATH] - - base-model: meta-llama/Llama-2-7b-hf - id: tweet-summary-0 - source: gs://[HUGGING FACE PATH] -``` diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 2cc971c6..b9c38d87 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -2,16 +2,16 @@ This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! -### Requirements +## **Prerequisites** - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - A cluster with: - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. -### Steps +## **Steps** -1. **Deploy Sample Model Server** +### Deploy Sample Model Server Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 
@@ -20,22 +20,20 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml ``` +### Install the Inference Extension CRDs - - -1. **Install the Inference Extension CRDs:** - - ```sh + ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml -1. **Deploy InferenceModel** +### Deploy InferenceModel Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml ``` -1. **Update Envoy Gateway Config to enable Patch Policy** + +### Update Envoy Gateway Config to enable Patch Policy** Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: ```bash @@ -43,7 +41,8 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system ``` Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. -1. **Deploy Gateway** + +### Deploy Gateway ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml @@ -56,26 +55,28 @@ This quickstart guide is intended for engineers familiar with k8s and model serv NAME CLASS ADDRESS PROGRAMMED AGE inference-gateway inference-gateway True 22s ``` -1. 
**Deploy the Inference Extension and InferencePool** +### Deploy the Inference Extension and InferencePool ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml ``` -1. **Deploy Envoy Gateway Custom Policies** +### Deploy Envoy Gateway Custom Policies ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml ``` > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. -1. **OPTIONALLY**: Apply Traffic Policy + +### **OPTIONALLY**: Apply Traffic Policy For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml ``` -1. **Try it out** + +### Try it out Wait until the gateway is ready. @@ -89,4 +90,4 @@ This quickstart guide is intended for engineers familiar with k8s and model serv "max_tokens": 100, "temperature": 0 }' - ``` \ No newline at end of file + ``` From 5705c5825b3e1e91567a89c39543ba6e98af0fc3 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 18 Feb 2025 21:34:26 +0000 Subject: [PATCH 30/96] Consolidating all storage behind datastore (#350) * Removed the intermediate cache in provider, and consolidating all storage behind datastore. * Fixed the provider test and covered the pool deletion events. 
* Don't store the port number with the pods * Address pod ip address updates * rename PodFlushAll to PodResyncAll * Addressed first round of comments * Addressed more comments * Adding a comment --- pkg/ext-proc/backend/datastore.go | 256 +++++++++----- pkg/ext-proc/backend/datastore_test.go | 6 +- pkg/ext-proc/backend/fake.go | 13 +- .../backend/inferencemodel_reconciler.go | 10 +- .../backend/inferencemodel_reconciler_test.go | 52 +-- .../backend/inferencepool_reconciler.go | 42 ++- .../backend/inferencepool_reconciler_test.go | 189 +++++++---- pkg/ext-proc/backend/pod_reconciler.go | 31 +- pkg/ext-proc/backend/pod_reconciler_test.go | 161 +++++++-- pkg/ext-proc/backend/provider.go | 134 +++----- pkg/ext-proc/backend/provider_test.go | 95 ++++-- pkg/ext-proc/backend/types.go | 21 +- pkg/ext-proc/backend/vllm/metrics.go | 11 +- pkg/ext-proc/handlers/request.go | 21 +- pkg/ext-proc/handlers/server.go | 24 +- pkg/ext-proc/health.go | 4 +- pkg/ext-proc/main.go | 16 +- pkg/ext-proc/scheduling/filter_test.go | 23 +- pkg/ext-proc/scheduling/scheduler.go | 27 +- pkg/ext-proc/server/runserver.go | 17 +- pkg/ext-proc/server/runserver_test.go | 2 +- pkg/ext-proc/test/benchmark/benchmark.go | 12 +- pkg/ext-proc/test/utils.go | 52 ++- pkg/ext-proc/util/testing/wrappers.go | 50 +++ pkg/manifests/vllm/deployment.yaml | 2 +- test/integration/hermetic_test.go | 320 ++++++++---------- 26 files changed, 935 insertions(+), 656 deletions(-) create mode 100644 pkg/ext-proc/util/testing/wrappers.go diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/backend/datastore.go index a75e7e43..6b8483d3 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/backend/datastore.go @@ -4,142 +4,209 @@ import ( "context" "errors" "math/rand" - "strconv" "sync" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) -func NewK8sDataStore(options ...K8sDatastoreOption) *K8sDatastore { - store := &K8sDatastore{ - poolMu: sync.RWMutex{}, - InferenceModels: &sync.Map{}, - pods: &sync.Map{}, - } - for _, opt := range options { - opt(store) +// The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) +type Datastore interface { + // InferencePool operations + PoolSet(pool *v1alpha1.InferencePool) + PoolGet() (*v1alpha1.InferencePool, error) + PoolHasSynced() bool + PoolLabelsMatch(podLabels map[string]string) bool + + // InferenceModel operations + ModelSet(infModel *v1alpha1.InferenceModel) + ModelGet(modelName string) (*v1alpha1.InferenceModel, bool) + ModelDelete(modelName string) + + // PodMetrics operations + PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool + PodUpdateMetricsIfExist(namespacedName types.NamespacedName, m *Metrics) bool + PodGet(namespacedName types.NamespacedName) (*PodMetrics, bool) + PodDelete(namespacedName types.NamespacedName) + PodResyncAll(ctx context.Context, ctrlClient client.Client) + PodGetAll() []*PodMetrics + PodDeleteAll() // This is only for testing. + PodRange(f func(key, value any) bool) + + // Clears the store state, happens when the pool gets deleted. + Clear() +} + +func NewDatastore() Datastore { + store := &datastore{ + poolMu: sync.RWMutex{}, + models: &sync.Map{}, + pods: &sync.Map{}, } return store } -// The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) -type K8sDatastore struct { +type datastore struct { // poolMu is used to synchronize access to the inferencePool. 
- poolMu sync.RWMutex - inferencePool *v1alpha1.InferencePool - InferenceModels *sync.Map - pods *sync.Map + poolMu sync.RWMutex + pool *v1alpha1.InferencePool + models *sync.Map + // key: types.NamespacedName, value: *PodMetrics + pods *sync.Map } -type K8sDatastoreOption func(*K8sDatastore) - -// WithPods can be used in tests to override the pods. -func WithPods(pods []*PodMetrics) K8sDatastoreOption { - return func(store *K8sDatastore) { - store.pods = &sync.Map{} - for _, pod := range pods { - store.pods.Store(pod.Pod, true) - } - } +func (ds *datastore) Clear() { + ds.poolMu.Lock() + defer ds.poolMu.Unlock() + ds.pool = nil + ds.models.Clear() + ds.pods.Clear() } -func (ds *K8sDatastore) setInferencePool(pool *v1alpha1.InferencePool) { +// /// InferencePool APIs /// +func (ds *datastore) PoolSet(pool *v1alpha1.InferencePool) { ds.poolMu.Lock() defer ds.poolMu.Unlock() - ds.inferencePool = pool + ds.pool = pool } -func (ds *K8sDatastore) getInferencePool() (*v1alpha1.InferencePool, error) { +func (ds *datastore) PoolGet() (*v1alpha1.InferencePool, error) { ds.poolMu.RLock() defer ds.poolMu.RUnlock() - if !ds.HasSynced() { + if !ds.PoolHasSynced() { return nil, errors.New("InferencePool is not initialized in data store") } - return ds.inferencePool, nil + return ds.pool, nil } -func (ds *K8sDatastore) GetPodIPs() []string { - var ips []string - ds.pods.Range(func(name, pod any) bool { - ips = append(ips, pod.(*corev1.Pod).Status.PodIP) - return true - }) - return ips +func (ds *datastore) PoolHasSynced() bool { + ds.poolMu.RLock() + defer ds.poolMu.RUnlock() + return ds.pool != nil +} + +func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool { + poolSelector := selectorFromInferencePoolSelector(ds.pool.Spec.Selector) + podSet := labels.Set(podLabels) + return poolSelector.Matches(podSet) } -func (s *K8sDatastore) FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) { - infModel, ok := s.InferenceModels.Load(modelName) +// /// 
InferenceModel APIs /// +func (ds *datastore) ModelSet(infModel *v1alpha1.InferenceModel) { + ds.models.Store(infModel.Spec.ModelName, infModel) +} + +func (ds *datastore) ModelGet(modelName string) (*v1alpha1.InferenceModel, bool) { + infModel, ok := ds.models.Load(modelName) if ok { - returnModel = infModel.(*v1alpha1.InferenceModel) + return infModel.(*v1alpha1.InferenceModel), true } - return + return nil, false } -// HasSynced returns true if InferencePool is set in the data store. -func (ds *K8sDatastore) HasSynced() bool { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() - return ds.inferencePool != nil +func (ds *datastore) ModelDelete(modelName string) { + ds.models.Delete(modelName) } -func RandomWeightedDraw(logger logr.Logger, model *v1alpha1.InferenceModel, seed int64) string { - var weights int32 - - source := rand.NewSource(rand.Int63()) - if seed > 0 { - source = rand.NewSource(seed) - } - r := rand.New(source) - for _, model := range model.Spec.TargetModels { - weights += *model.Weight +// /// Pods/endpoints APIs /// +func (ds *datastore) PodUpdateMetricsIfExist(namespacedName types.NamespacedName, m *Metrics) bool { + if val, ok := ds.pods.Load(namespacedName); ok { + existing := val.(*PodMetrics) + existing.Metrics = *m + return true } - logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights) - randomVal := r.Int31n(weights) - for _, model := range model.Spec.TargetModels { - if randomVal < *model.Weight { - return model.Name - } - randomVal -= *model.Weight + return false +} + +func (ds *datastore) PodGet(namespacedName types.NamespacedName) (*PodMetrics, bool) { + val, ok := ds.pods.Load(namespacedName) + if ok { + return val.(*PodMetrics), true } - return "" + return nil, false } -func IsCritical(model *v1alpha1.InferenceModel) bool { - if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha1.Critical { +func (ds *datastore) PodGetAll() []*PodMetrics { + res := []*PodMetrics{} + fn := 
func(k, v any) bool { + res = append(res, v.(*PodMetrics)) return true } - return false + ds.pods.Range(fn) + return res } -func (ds *K8sDatastore) LabelsMatch(podLabels map[string]string) bool { - poolSelector := selectorFromInferencePoolSelector(ds.inferencePool.Spec.Selector) - podSet := labels.Set(podLabels) - return poolSelector.Matches(podSet) +func (ds *datastore) PodRange(f func(key, value any) bool) { + ds.pods.Range(f) +} + +func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { + ds.pods.Delete(namespacedName) +} + +func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { + new := &PodMetrics{ + Pod: Pod{ + NamespacedName: types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, + }, + Address: pod.Status.PodIP, + }, + Metrics: Metrics{ + ActiveModels: make(map[string]int), + }, + } + existing, ok := ds.pods.Load(new.NamespacedName) + if !ok { + ds.pods.Store(new.NamespacedName, new) + return true + } + + // Update pod properties if anything changed. + existing.(*PodMetrics).Pod = new.Pod + return false } -func (ds *K8sDatastore) flushPodsAndRefetch(ctx context.Context, ctrlClient client.Client, newServerPool *v1alpha1.InferencePool) { +func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client) { + // Pool must exist to invoke this function. 
+ pool, _ := ds.PoolGet() podList := &corev1.PodList{} if err := ctrlClient.List(ctx, podList, &client.ListOptions{ - LabelSelector: selectorFromInferencePoolSelector(newServerPool.Spec.Selector), - Namespace: newServerPool.Namespace, + LabelSelector: selectorFromInferencePoolSelector(pool.Spec.Selector), + Namespace: pool.Namespace, }); err != nil { log.FromContext(ctx).V(logutil.DEFAULT).Error(err, "Failed to list clients") + return } - ds.pods.Clear() - for _, k8sPod := range podList.Items { - pod := Pod{ - Name: k8sPod.Name, - Address: k8sPod.Status.PodIP + ":" + strconv.Itoa(int(newServerPool.Spec.TargetPortNumber)), + activePods := make(map[string]bool) + for _, pod := range podList.Items { + if podIsReady(&pod) { + activePods[pod.Name] = true + ds.PodUpdateOrAddIfNotExist(&pod) } - ds.pods.Store(pod, true) } + + // Remove pods that don't exist or not ready any more. + deleteFn := func(k, v any) bool { + pm := v.(*PodMetrics) + if exist := activePods[pm.NamespacedName.Name]; !exist { + ds.pods.Delete(pm.NamespacedName) + } + return true + } + ds.pods.Range(deleteFn) +} + +func (ds *datastore) PodDeleteAll() { + ds.pods.Clear() } func selectorFromInferencePoolSelector(selector map[v1alpha1.LabelKey]v1alpha1.LabelValue) labels.Selector { @@ -153,3 +220,32 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha1.LabelKey]v1alpha1.LabelV } return outMap } + +func RandomWeightedDraw(logger logr.Logger, model *v1alpha1.InferenceModel, seed int64) string { + var weights int32 + + source := rand.NewSource(rand.Int63()) + if seed > 0 { + source = rand.NewSource(seed) + } + r := rand.New(source) + for _, model := range model.Spec.TargetModels { + weights += *model.Weight + } + logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights) + randomVal := r.Int31n(weights) + for _, model := range model.Spec.TargetModels { + if randomVal < *model.Weight { + return model.Name + } + randomVal -= *model.Weight + } + return "" +} + +func 
IsCritical(model *v1alpha1.InferenceModel) bool { + if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha1.Critical { + return true + } + return false +} diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/backend/datastore_test.go index 9f74226a..b44de0a5 100644 --- a/pkg/ext-proc/backend/datastore_test.go +++ b/pkg/ext-proc/backend/datastore_test.go @@ -32,13 +32,13 @@ func TestHasSynced(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - datastore := NewK8sDataStore() + datastore := NewDatastore() // Set the inference pool if tt.inferencePool != nil { - datastore.setInferencePool(tt.inferencePool) + datastore.PoolSet(tt.inferencePool) } // Check if the data store has been initialized - hasSynced := datastore.HasSynced() + hasSynced := datastore.PoolHasSynced() if hasSynced != tt.hasSynced { t.Errorf("IsInitialized() = %v, want %v", hasSynced, tt.hasSynced) } diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 2c0757db..dfb520ef 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -3,22 +3,23 @@ package backend import ( "context" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type FakePodMetricsClient struct { - Err map[Pod]error - Res map[Pod]*PodMetrics + Err map[types.NamespacedName]error + Res map[types.NamespacedName]*PodMetrics } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) { - if err, ok := f.Err[pod]; ok { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *PodMetrics) (*PodMetrics, error) { + if err, ok := f.Err[existing.NamespacedName]; ok { return nil, err } - log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "pod", pod, "existing", existing, "new", 
f.Res[pod]) - return f.Res[pod], nil + log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "existing", existing, "new", f.Res[existing.NamespacedName]) + return f.Res[existing.NamespacedName], nil } type FakeDataStore struct { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/backend/inferencemodel_reconciler.go index 4959845c..884e6b7e 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler.go @@ -19,7 +19,7 @@ type InferenceModelReconciler struct { client.Client Scheme *runtime.Scheme Record record.EventRecorder - Datastore *K8sDatastore + Datastore Datastore PoolNamespacedName types.NamespacedName } @@ -36,14 +36,14 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { if errors.IsNotFound(err) { loggerDefault.Info("InferenceModel not found. Removing from datastore since object must be deleted", "name", req.NamespacedName) - c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) + c.Datastore.ModelDelete(infModel.Spec.ModelName) return ctrl.Result{}, nil } loggerDefault.Error(err, "Unable to get InferenceModel", "name", req.NamespacedName) return ctrl.Result{}, err } else if !infModel.DeletionTimestamp.IsZero() { loggerDefault.Info("InferenceModel is marked for deletion. 
Removing from datastore", "name", req.NamespacedName) - c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) + c.Datastore.ModelDelete(infModel.Spec.ModelName) return ctrl.Result{}, nil } @@ -57,12 +57,12 @@ func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { loggerDefault.Info("Updating datastore", "poolRef", infModel.Spec.PoolRef, "serverPoolName", c.PoolNamespacedName) loggerDefault.Info("Adding/Updating InferenceModel", "modelName", infModel.Spec.ModelName) - c.Datastore.InferenceModels.Store(infModel.Spec.ModelName, infModel) + c.Datastore.ModelSet(infModel) return } loggerDefault.Info("Removing/Not adding InferenceModel", "modelName", infModel.Spec.ModelName) // If we get here. The model is not relevant to this pool, remove. - c.Datastore.InferenceModels.Delete(infModel.Spec.ModelName) + c.Datastore.ModelDelete(infModel.Spec.ModelName) } func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go index 4e195818..5afe3b5a 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencemodel_reconciler_test.go @@ -51,14 +51,14 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { tests := []struct { name string - datastore *K8sDatastore + datastore *datastore incomingService *v1alpha1.InferenceModel wantInferenceModels *sync.Map }{ { name: "No Services registered; valid, new service incoming.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ + datastore: &datastore{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, @@ -67,15 +67,15 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: 
&sync.Map{}, + models: &sync.Map{}, }, incomingService: infModel1, wantInferenceModels: populateServiceMap(infModel1), }, { name: "Removing existing service.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ + datastore: &datastore{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, @@ -84,15 +84,15 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(infModel1), + models: populateServiceMap(infModel1), }, incomingService: infModel1Modified, wantInferenceModels: populateServiceMap(), }, { name: "Unrelated service, do nothing.", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ + datastore: &datastore{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, @@ -101,7 +101,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(infModel1), + models: populateServiceMap(infModel1), }, incomingService: &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ @@ -116,8 +116,8 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, { name: "Add to existing", - datastore: &K8sDatastore{ - inferencePool: &v1alpha1.InferencePool{ + datastore: &datastore{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, @@ -126,7 +126,7 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { ResourceVersion: "Old and boring", }, }, - InferenceModels: populateServiceMap(infModel1), + models: populateServiceMap(infModel1), }, incomingService: infModel2, wantInferenceModels: populateServiceMap(infModel1, infModel2), @@ -136,11 +136,11 @@ func 
TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { t.Run(test.name, func(t *testing.T) { reconciler := &InferenceModelReconciler{ Datastore: test.datastore, - PoolNamespacedName: types.NamespacedName{Name: test.datastore.inferencePool.Name}, + PoolNamespacedName: types.NamespacedName{Name: test.datastore.pool.Name}, } reconciler.updateDatastore(logger, test.incomingService) - if ok := mapsEqual(reconciler.Datastore.InferenceModels, test.wantInferenceModels); !ok { + if ok := mapsEqual(test.datastore.models, test.wantInferenceModels); !ok { t.Error("Maps are not equal") } }) @@ -156,9 +156,9 @@ func TestReconcile_ResourceNotFound(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() // Create a minimal datastore. - datastore := &K8sDatastore{ - InferenceModels: &sync.Map{}, - inferencePool: &v1alpha1.InferencePool{ + datastore := &datastore{ + models: &sync.Map{}, + pool: &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }, } @@ -211,9 +211,9 @@ func TestReconcile_ModelMarkedForDeletion(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. - datastore := &K8sDatastore{ - InferenceModels: &sync.Map{}, - inferencePool: &v1alpha1.InferencePool{ + datastore := &datastore{ + models: &sync.Map{}, + pool: &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }, } @@ -242,7 +242,7 @@ func TestReconcile_ModelMarkedForDeletion(t *testing.T) { } // Verify that the datastore was not updated. 
- if _, ok := datastore.InferenceModels.Load(existingModel.Spec.ModelName); ok { + if _, exist := datastore.ModelGet(existingModel.Spec.ModelName); exist { t.Errorf("expected datastore to not contain model %q", existingModel.Spec.ModelName) } } @@ -268,9 +268,9 @@ func TestReconcile_ResourceExists(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. - datastore := &K8sDatastore{ - InferenceModels: &sync.Map{}, - inferencePool: &v1alpha1.InferencePool{ + datastore := &datastore{ + models: &sync.Map{}, + pool: &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }, } @@ -299,7 +299,7 @@ func TestReconcile_ResourceExists(t *testing.T) { } // Verify that the datastore was updated. - if _, ok := datastore.InferenceModels.Load(existingModel.Spec.ModelName); !ok { + if _, exist := datastore.ModelGet(existingModel.Spec.ModelName); !exist { t.Errorf("expected datastore to contain model %q", existingModel.Spec.ModelName) } } diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/backend/inferencepool_reconciler.go index e44a278a..6f52862e 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler.go @@ -4,11 +4,10 @@ import ( "context" "reflect" - "github.com/go-logr/logr" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" - klog "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -24,7 +23,7 @@ type InferencePoolReconciler struct { Scheme *runtime.Scheme Record record.EventRecorder PoolNamespacedName types.NamespacedName - Datastore *K8sDatastore + Datastore Datastore } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -37,26 +36,39 @@ func (c 
*InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques loggerDefault.Info("Reconciling InferencePool", "name", req.NamespacedName) serverPool := &v1alpha1.InferencePool{} + if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { + if errors.IsNotFound(err) { + loggerDefault.Info("InferencePool not found. Clearing the datastore", "name", req.NamespacedName) + c.Datastore.Clear() + return ctrl.Result{}, nil + } loggerDefault.Error(err, "Unable to get InferencePool", "name", req.NamespacedName) return ctrl.Result{}, err - } - if c.Datastore.inferencePool == nil || !reflect.DeepEqual(serverPool.Spec.Selector, c.Datastore.inferencePool.Spec.Selector) { - c.updateDatastore(logger, serverPool) - c.Datastore.flushPodsAndRefetch(ctx, c.Client, serverPool) - } else { - c.updateDatastore(logger, serverPool) + } else if !serverPool.DeletionTimestamp.IsZero() { + loggerDefault.Info("InferencePool is marked for deletion. Clearing the datastore", "name", req.NamespacedName) + c.Datastore.Clear() + return ctrl.Result{}, nil } + c.updateDatastore(ctx, serverPool) + return ctrl.Result{}, nil } -func (c *InferencePoolReconciler) updateDatastore(logger logr.Logger, serverPool *v1alpha1.InferencePool) { - pool, _ := c.Datastore.getInferencePool() - if pool == nil || - serverPool.ObjectMeta.ResourceVersion != pool.ObjectMeta.ResourceVersion { - logger.V(logutil.DEFAULT).Info("Updating inference pool", "target", klog.KMetadata(&serverPool.ObjectMeta)) - c.Datastore.setInferencePool(serverPool) +func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha1.InferencePool) { + logger := log.FromContext(ctx) + oldPool, err := c.Datastore.PoolGet() + c.Datastore.PoolSet(newPool) + if err != nil || !reflect.DeepEqual(newPool.Spec.Selector, oldPool.Spec.Selector) { + logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", newPool.Spec.Selector) + // A full resync is required to address two cases: + // 1) At startup, 
the pod events may get processed before the pool is synced with the datastore, + // and hence they will not be added to the store since pool selector is not known yet + // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need + // to resync the whole pool: remove pods in the store that don't match the new selector and add + // the ones that may have existed already to the store. + c.Datastore.PodResyncAll(ctx, c.Client) } } diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go b/pkg/ext-proc/backend/inferencepool_reconciler_test.go index 1da7d61b..b6403489 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler_test.go +++ b/pkg/ext-proc/backend/inferencepool_reconciler_test.go @@ -1,88 +1,153 @@ package backend import ( - "reflect" + "context" "testing" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" ) var ( - pool1 = &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, + selector_v1 = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v1"} + selector_v2 = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v2"} + pool1 = &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "50", + Name: "pool1", + Namespace: "pool1-ns", }, - } - // Different name, same RV doesn't really make sense, but helps with 
testing the - // updateStore impl which relies on the equality of RVs alone. - modPool1SameRV = &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool-mod", - ResourceVersion: "50", + Selector: selector_v1, + TargetPortNumber: 8080, }, } - modPool1DiffRV = &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, + pool2 = &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool-mod", - ResourceVersion: "51", + Name: "pool2", + Namespace: "pool2-ns", }, } + pods = []corev1.Pod{ + // Two ready pods matching pool1 + utiltesting.MakePod("pod1", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), + utiltesting.MakePod("pod2", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), + // A not ready pod matching pool1 + utiltesting.MakePod("pod3", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).Obj(), + // A pod not matching pool1 namespace + utiltesting.MakePod("pod4", "pool2-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), + // A ready pod matching pool1 with a new selector + utiltesting.MakePod("pod5", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v2)).ReadyCondition().Obj(), + } ) -func TestUpdateDatastore_InferencePoolReconciler(t *testing.T) { - logger := logutil.NewTestLogger() +func TestReconcile_InferencePoolReconciler(t *testing.T) { + // The best practice is to use table-driven tests, however in this scaenario it seems + // more logical to do a single test with steps that depend on each other. 
- tests := []struct { - name string - datastore *K8sDatastore - incomingPool *v1alpha1.InferencePool - wantPool *v1alpha1.InferencePool - }{ - { - name: "InferencePool not set, should set InferencePool", - datastore: &K8sDatastore{}, - incomingPool: pool1.DeepCopy(), - wantPool: pool1, - }, - { - name: "InferencePool set, matching RVs, do nothing", - datastore: &K8sDatastore{ - inferencePool: pool1.DeepCopy(), - }, - incomingPool: modPool1SameRV.DeepCopy(), - wantPool: pool1, - }, - { - name: "InferencePool set, differing RVs, re-set InferencePool", - datastore: &K8sDatastore{ - inferencePool: pool1.DeepCopy(), - }, - incomingPool: modPool1DiffRV.DeepCopy(), - wantPool: modPool1DiffRV, - }, + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + _ = v1alpha1.AddToScheme(scheme) + + // Create a fake client with the pool and the pods. + initialObjects := []client.Object{pool1, pool2} + for i := range pods { + initialObjects = append(initialObjects, &pods[i]) } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initialObjects...). + Build() + + // Create a request for the existing resource. + namespacedName := types.NamespacedName{Name: pool1.Name, Namespace: pool1.Namespace} + req := ctrl.Request{NamespacedName: namespacedName} + ctx := context.Background() - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - inferencePoolReconciler := &InferencePoolReconciler{Datastore: test.datastore} - inferencePoolReconciler.updateDatastore(logger, test.incomingPool) + datastore := NewDatastore() + inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore} + + // Step 1: Inception, only ready pods matching pool1 are added to the store. 
+ if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + if diff := diffPool(datastore, pool1, []string{"pod1", "pod2"}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } + + // Step 2: A reconcile on pool2 should not change anything. + if _, err := inferencePoolReconciler.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: pool2.Name, Namespace: pool2.Namespace}}); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + if diff := diffPool(datastore, pool1, []string{"pod1", "pod2"}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } - gotPool := inferencePoolReconciler.Datastore.inferencePool - if !reflect.DeepEqual(gotPool, test.wantPool) { - t.Errorf("Unexpected InferencePool: want %#v, got: %#v", test.wantPool, gotPool) - } - }) + // Step 3: update the pool selector to include more pods + newPool1 := &v1alpha1.InferencePool{} + if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { + t.Errorf("Unexpected pool get error: %v", err) + } + newPool1.Spec.Selector = selector_v2 + if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil { + t.Errorf("Unexpected pool update error: %v", err) + } + + if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + if diff := diffPool(datastore, newPool1, []string{"pod5"}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } + + // Step 4: update the pool port + if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { + t.Errorf("Unexpected pool get error: %v", err) + } + newPool1.Spec.TargetPortNumber = 9090 + if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil { + t.Errorf("Unexpected pool update error: %v", err) + } + if _, err := inferencePoolReconciler.Reconcile(ctx, 
req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + if diff := diffPool(datastore, newPool1, []string{"pod5"}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } + + // Step 5: delete the pool to trigger a datastore clear + if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { + t.Errorf("Unexpected pool get error: %v", err) + } + if err := fakeClient.Delete(ctx, newPool1, &client.DeleteOptions{}); err != nil { + t.Errorf("Unexpected pool delete error: %v", err) + } + if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + if diff := diffPool(datastore, nil, []string{}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } +} + +func diffPool(datastore Datastore, wantPool *v1alpha1.InferencePool, wantPods []string) string { + gotPool, _ := datastore.PoolGet() + if diff := cmp.Diff(wantPool, gotPool); diff != "" { + return diff + } + gotPods := []string{} + for _, pm := range datastore.PodGetAll() { + gotPods = append(gotPods, pm.NamespacedName.Name) } + return cmp.Diff(wantPods, gotPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })) } diff --git a/pkg/ext-proc/backend/pod_reconciler.go b/pkg/ext-proc/backend/pod_reconciler.go index b914ea8d..8705ce83 100644 --- a/pkg/ext-proc/backend/pod_reconciler.go +++ b/pkg/ext-proc/backend/pod_reconciler.go @@ -2,29 +2,29 @@ package backend import ( "context" - "strconv" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type 
PodReconciler struct { client.Client - Datastore *K8sDatastore + Datastore Datastore Scheme *runtime.Scheme Record record.EventRecorder } func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - inferencePool, err := c.Datastore.getInferencePool() + inferencePool, err := c.Datastore.PoolGet() if err != nil { logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet", "error", err) // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. @@ -38,15 +38,14 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R pod := &corev1.Pod{} if err := c.Get(ctx, req.NamespacedName, pod); err != nil { if apierrors.IsNotFound(err) { - c.Datastore.pods.Delete(pod) + c.Datastore.PodDelete(req.NamespacedName) return ctrl.Result{}, nil } logger.V(logutil.DEFAULT).Error(err, "Unable to get pod", "name", req.NamespacedName) return ctrl.Result{}, err } - c.updateDatastore(pod, inferencePool) - + c.updateDatastore(logger, pod) return ctrl.Result{}, nil } @@ -56,15 +55,17 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(c) } -func (c *PodReconciler) updateDatastore(k8sPod *corev1.Pod, inferencePool *v1alpha1.InferencePool) { - pod := Pod{ - Name: k8sPod.Name, - Address: k8sPod.Status.PodIP + ":" + strconv.Itoa(int(inferencePool.Spec.TargetPortNumber)), - } - if !k8sPod.DeletionTimestamp.IsZero() || !c.Datastore.LabelsMatch(k8sPod.ObjectMeta.Labels) || !podIsReady(k8sPod) { - c.Datastore.pods.Delete(pod) +func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { + namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} + if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { + logger.V(logutil.DEFAULT).Info("Pod removed or not added", "name", namespacedName) + 
c.Datastore.PodDelete(namespacedName) } else { - c.Datastore.pods.Store(pod, true) + if c.Datastore.PodUpdateOrAddIfNotExist(pod) { + logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) + } else { + logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) + } } } diff --git a/pkg/ext-proc/backend/pod_reconciler_test.go b/pkg/ext-proc/backend/pod_reconciler_test.go index 42d6d8e4..cc7381f6 100644 --- a/pkg/ext-proc/backend/pod_reconciler_test.go +++ b/pkg/ext-proc/backend/pod_reconciler_test.go @@ -1,33 +1,43 @@ package backend import ( + "context" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" ) var ( - basePod1 = Pod{Name: "pod1", Address: ":8000"} - basePod2 = Pod{Name: "pod2", Address: ":8000"} - basePod3 = Pod{Name: "pod3", Address: ":8000"} + basePod1 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1"}} + basePod2 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} + basePod3 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3"}} + basePod11 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11"}} ) func TestUpdateDatastore_PodReconciler(t *testing.T) { + now := metav1.Now() tests := []struct { name string - datastore *K8sDatastore + datastore Datastore incomingPod *corev1.Pod - wantPods []string + wantPods []Pod + req *ctrl.Request }{ { name: "Add new pod", - datastore: &K8sDatastore{ + datastore: &datastore{ pods: 
populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ @@ -38,10 +48,76 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", + Name: basePod3.NamespacedName.Name, + Labels: map[string]string{ + "some-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + PodIP: basePod3.Address, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + wantPods: []Pod{basePod1.Pod, basePod2.Pod, basePod3.Pod}, + }, + { + name: "Update pod1 address", + datastore: &datastore{ + pods: populateMap(basePod1, basePod2), + pool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: basePod11.NamespacedName.Name, + Labels: map[string]string{ + "some-key": "some-val", + }, + }, + Status: corev1.PodStatus{ + PodIP: basePod11.Address, + Conditions: []corev1.PodCondition{ + { + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + wantPods: []Pod{basePod11.Pod, basePod2.Pod}, + }, + { + name: "Delete pod with DeletionTimestamp", + datastore: &datastore{ + pods: populateMap(basePod1, basePod2), + pool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + incomingPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", Labels: map[string]string{ "some-key": "some-val", }, + DeletionTimestamp: &now, + Finalizers: []string{"finalizer"}, }, Status: corev1.PodStatus{ Conditions: []corev1.PodCondition{ @@ 
-52,13 +128,29 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []string{basePod1.Name, basePod2.Name, basePod3.Name}, + wantPods: []Pod{basePod2.Pod}, + }, + { + name: "Delete notfound pod", + datastore: &datastore{ + pods: populateMap(basePod1, basePod2), + pool: &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", + }, + }, + }, + }, + req: &ctrl.Request{NamespacedName: types.NamespacedName{Name: "pod1"}}, + wantPods: []Pod{basePod2.Pod}, }, { name: "New pod, not ready, valid selector", - datastore: &K8sDatastore{ + datastore: &datastore{ pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ @@ -83,13 +175,13 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []string{basePod1.Name, basePod2.Name}, + wantPods: []Pod{basePod1.Pod, basePod2.Pod}, }, { name: "Remove pod that does not match selector", - datastore: &K8sDatastore{ + datastore: &datastore{ pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ @@ -114,13 +206,13 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []string{basePod2.Name}, + wantPods: []Pod{basePod2.Pod}, }, { name: "Remove pod that is not ready", - datastore: &K8sDatastore{ + datastore: &datastore{ pods: populateMap(basePod1, basePod2), - inferencePool: &v1alpha1.InferencePool{ + pool: &v1alpha1.InferencePool{ Spec: v1alpha1.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ @@ -145,22 +237,41 @@ func 
TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []string{basePod2.Name}, + wantPods: []Pod{basePod2.Pod}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - podReconciler := &PodReconciler{Datastore: test.datastore} - podReconciler.updateDatastore(test.incomingPod, test.datastore.inferencePool) - var gotPods []string - test.datastore.pods.Range(func(k, v any) bool { - pod := k.(Pod) + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + initialObjects := []client.Object{} + if test.incomingPod != nil { + initialObjects = append(initialObjects, test.incomingPod) + } + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initialObjects...). + Build() + + podReconciler := &PodReconciler{Client: fakeClient, Datastore: test.datastore} + namespacedName := types.NamespacedName{Name: test.incomingPod.Name, Namespace: test.incomingPod.Namespace} + if test.req == nil { + test.req = &ctrl.Request{NamespacedName: namespacedName} + } + if _, err := podReconciler.Reconcile(context.Background(), *test.req); err != nil { + t.Errorf("Unexpected InferencePool reconcile error: %v", err) + } + + var gotPods []Pod + test.datastore.PodRange(func(k, v any) bool { + pod := v.(*PodMetrics) if v != nil { - gotPods = append(gotPods, pod.Name) + gotPods = append(gotPods, pod.Pod) } return true }) - if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })) { + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b Pod) bool { return a.NamespacedName.String() < b.NamespacedName.String() })) { t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) } }) diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index ce738986..bb575d19 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -8,6 +8,7 @@ import ( "github.com/go-logr/logr" "go.uber.org/multierr" + 
"sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -16,72 +17,38 @@ const ( fetchMetricsTimeout = 5 * time.Second ) -func NewProvider(pmc PodMetricsClient, datastore *K8sDatastore) *Provider { +func NewProvider(pmc PodMetricsClient, datastore Datastore) *Provider { p := &Provider{ - podMetrics: sync.Map{}, - pmc: pmc, - datastore: datastore, + pmc: pmc, + datastore: datastore, } return p } // Provider provides backend pods and information such as metrics. type Provider struct { - // key: Pod, value: *PodMetrics - podMetrics sync.Map - pmc PodMetricsClient - datastore *K8sDatastore + pmc PodMetricsClient + datastore Datastore } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, pod Pod, existing *PodMetrics) (*PodMetrics, error) + FetchMetrics(ctx context.Context, existing *PodMetrics) (*PodMetrics, error) } -func (p *Provider) AllPodMetrics() []*PodMetrics { - res := []*PodMetrics{} - fn := func(k, v any) bool { - res = append(res, v.(*PodMetrics)) - return true - } - p.podMetrics.Range(fn) - return res -} - -func (p *Provider) UpdatePodMetrics(pod Pod, pm *PodMetrics) { - p.podMetrics.Store(pod, pm) -} - -func (p *Provider) GetPodMetrics(pod Pod) (*PodMetrics, bool) { - val, ok := p.podMetrics.Load(pod) - if ok { - return val.(*PodMetrics), true - } - return nil, false -} - -func (p *Provider) Init(logger logr.Logger, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { - p.refreshPodsOnce() - - if err := p.refreshMetricsOnce(logger); err != nil { - logger.Error(err, "Failed to init metrics") - } - - logger.Info("Initialized pods and metrics", "metrics", p.AllPodMetrics()) - - // periodically refresh pods - go func() { - for { - time.Sleep(refreshPodsInterval) - p.refreshPodsOnce() - } - }() - +func (p *Provider) Init(ctx context.Context, 
refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { // periodically refresh metrics + logger := log.FromContext(ctx) go func() { for { - time.Sleep(refreshMetricsInterval) - if err := p.refreshMetricsOnce(logger); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics") + select { + case <-ctx.Done(): + logger.V(logutil.DEFAULT).Info("Shutting down metrics prober") + return + default: + time.Sleep(refreshMetricsInterval) + if err := p.refreshMetricsOnce(logger); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics") + } } } }() @@ -89,8 +56,14 @@ func (p *Provider) Init(logger logr.Logger, refreshPodsInterval, refreshMetricsI // Periodically flush prometheus metrics for inference pool go func() { for { - time.Sleep(refreshPrometheusMetricsInterval) - p.flushPrometheusMetricsOnce(logger) + select { + case <-ctx.Done(): + logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") + return + default: + time.Sleep(refreshPrometheusMetricsInterval) + p.flushPrometheusMetricsOnce(logger) + } } }() @@ -98,8 +71,14 @@ func (p *Provider) Init(logger logr.Logger, refreshPodsInterval, refreshMetricsI if logger := logger.V(logutil.DEBUG); logger.Enabled() { go func() { for { - time.Sleep(5 * time.Second) - logger.Info("Current Pods and metrics gathered", "metrics", p.AllPodMetrics()) + select { + case <-ctx.Done(): + logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") + return + default: + time.Sleep(5 * time.Second) + logger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll()) + } } }() } @@ -107,36 +86,6 @@ func (p *Provider) Init(logger logr.Logger, refreshPodsInterval, refreshMetricsI return nil } -// refreshPodsOnce lists pods and updates keys in the podMetrics map. -// Note this function doesn't update the PodMetrics value, it's done separately. -func (p *Provider) refreshPodsOnce() { - // merge new pods with cached ones. 
- // add new pod to the map - addNewPods := func(k, v any) bool { - pod := k.(Pod) - if _, ok := p.podMetrics.Load(pod); !ok { - new := &PodMetrics{ - Pod: pod, - Metrics: Metrics{ - ActiveModels: make(map[string]int), - }, - } - p.podMetrics.Store(pod, new) - } - return true - } - // remove pods that don't exist any more. - mergeFn := func(k, v any) bool { - pod := k.(Pod) - if _, ok := p.datastore.pods.Load(pod); !ok { - p.podMetrics.Delete(pod) - } - return true - } - p.podMetrics.Range(mergeFn) - p.datastore.pods.Range(addNewPods) -} - func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { loggerTrace := logger.V(logutil.TRACE) ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) @@ -151,22 +100,21 @@ func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { errCh := make(chan error) processOnePod := func(key, value any) bool { loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value) - pod := key.(Pod) existing := value.(*PodMetrics) wg.Add(1) go func() { defer wg.Done() - updated, err := p.pmc.FetchMetrics(ctx, pod, existing) + updated, err := p.pmc.FetchMetrics(ctx, existing) if err != nil { - errCh <- fmt.Errorf("failed to parse metrics from %s: %v", pod, err) + errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err) return } - p.UpdatePodMetrics(pod, updated) - loggerTrace.Info("Updated metrics for pod", "pod", pod, "metrics", updated.Metrics) + p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics) + loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics) }() return true } - p.podMetrics.Range(processOnePod) + p.datastore.PodRange(processOnePod) // Wait for metric collection for all pods to complete and close the error channel in a // goroutine so this is unblocking, allowing the code to proceed to the error collection code @@ -188,7 +136,7 @@ func (p *Provider) refreshMetricsOnce(logger 
logr.Logger) error { func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { logger.V(logutil.DEBUG).Info("Flushing Prometheus Metrics") - pool, _ := p.datastore.getInferencePool() + pool, _ := p.datastore.PoolGet() if pool == nil { // No inference pool or not initialize. return @@ -197,7 +145,7 @@ func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { var kvCacheTotal float64 var queueTotal int - podMetrics := p.AllPodMetrics() + podMetrics := p.datastore.PodGetAll() if len(podMetrics) == 0 { return } diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index 95575046..2aa2c213 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -1,6 +1,7 @@ package backend import ( + "context" "errors" "sync" "testing" @@ -8,12 +9,17 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "github.com/stretchr/testify/assert" + "k8s.io/apimachinery/pkg/types" ) var ( pod1 = &PodMetrics{ - Pod: Pod{Name: "pod1"}, + Pod: Pod{ + NamespacedName: types.NamespacedName{ + Name: "pod1", + }, + }, Metrics: Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -25,7 +31,11 @@ var ( }, } pod2 = &PodMetrics{ - Pod: Pod{Name: "pod2"}, + Pod: Pod{ + NamespacedName: types.NamespacedName{ + Name: "pod2", + }, + }, Metrics: Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.2, @@ -39,51 +49,65 @@ var ( ) func TestProvider(t *testing.T) { - logger := logutil.NewTestLogger() - tests := []struct { name string pmc PodMetricsClient - datastore *K8sDatastore - initErr bool + datastore Datastore want []*PodMetrics }{ { - name: "Init success", - datastore: &K8sDatastore{ - pods: populateMap(pod1.Pod, pod2.Pod), + name: "Probing metrics success", + pmc: &FakePodMetricsClient{ + Res: map[types.NamespacedName]*PodMetrics{ + pod1.NamespacedName: pod1, + pod2.NamespacedName: pod2, + }, }, 
+ datastore: &datastore{ + pods: populateMap(pod1, pod2), + }, + want: []*PodMetrics{ + pod1, + pod2, + }, + }, + { + name: "Only pods in the datastore are probed", pmc: &FakePodMetricsClient{ - Res: map[Pod]*PodMetrics{ - pod1.Pod: pod1, - pod2.Pod: pod2, + Res: map[types.NamespacedName]*PodMetrics{ + pod1.NamespacedName: pod1, + pod2.NamespacedName: pod2, }, }, - want: []*PodMetrics{pod1, pod2}, + datastore: &datastore{ + pods: populateMap(pod1), + }, + want: []*PodMetrics{ + pod1, + }, }, { - name: "Fetch metrics error", + name: "Probing metrics error", pmc: &FakePodMetricsClient{ - Err: map[Pod]error{ - pod2.Pod: errors.New("injected error"), + Err: map[types.NamespacedName]error{ + pod2.NamespacedName: errors.New("injected error"), }, - Res: map[Pod]*PodMetrics{ - pod1.Pod: pod1, + Res: map[types.NamespacedName]*PodMetrics{ + pod1.NamespacedName: pod1, }, }, - datastore: &K8sDatastore{ - pods: populateMap(pod1.Pod, pod2.Pod), + datastore: &datastore{ + pods: populateMap(pod1, pod2), }, want: []*PodMetrics{ pod1, // Failed to fetch pod2 metrics so it remains the default values. 
{ - Pod: Pod{Name: "pod2"}, + Pod: Pod{NamespacedName: pod2.NamespacedName}, Metrics: Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, MaxActiveModels: 0, - ActiveModels: map[string]int{}, }, }, }, @@ -93,25 +117,24 @@ func TestProvider(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { p := NewProvider(test.pmc, test.datastore) - err := p.Init(logger, time.Millisecond, time.Millisecond, time.Millisecond) - if test.initErr != (err != nil) { - t.Fatalf("Unexpected error, got: %v, want: %v", err, test.initErr) - } - metrics := p.AllPodMetrics() - lessFunc := func(a, b *PodMetrics) bool { - return a.String() < b.String() - } - if diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(lessFunc)); diff != "" { - t.Errorf("Unexpected output (-want +got): %v", diff) - } + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + _ = p.Init(ctx, time.Millisecond, time.Millisecond) + assert.EventuallyWithT(t, func(t *assert.CollectT) { + metrics := test.datastore.PodGetAll() + diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(func(a, b *PodMetrics) bool { + return a.String() < b.String() + })) + assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") + }, 5*time.Second, time.Millisecond) }) } } -func populateMap(pods ...Pod) *sync.Map { +func populateMap(pods ...*PodMetrics) *sync.Map { newMap := &sync.Map{} for _, pod := range pods { - newMap.Store(pod, true) + newMap.Store(pod.NamespacedName, &PodMetrics{Pod: Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) } return newMap } diff --git a/pkg/ext-proc/backend/types.go b/pkg/ext-proc/backend/types.go index 7e399fed..0e02fb09 100644 --- a/pkg/ext-proc/backend/types.go +++ b/pkg/ext-proc/backend/types.go @@ -1,17 +1,15 @@ // Package backend is a library to interact with backend model servers such as probing metrics. 
package backend -import "fmt" +import ( + "fmt" -type PodSet map[Pod]bool + "k8s.io/apimachinery/pkg/types" +) type Pod struct { - Name string - Address string -} - -func (p Pod) String() string { - return p.Name + ":" + p.Address + NamespacedName types.NamespacedName + Address string } type Metrics struct { @@ -31,7 +29,7 @@ type PodMetrics struct { } func (pm *PodMetrics) String() string { - return fmt.Sprintf("Pod: %+v; Metrics: %+v", pm.Pod, pm.Metrics) + return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics) } func (pm *PodMetrics) Clone() *PodMetrics { @@ -40,7 +38,10 @@ func (pm *PodMetrics) Clone() *PodMetrics { cm[k] = v } clone := &PodMetrics{ - Pod: pm.Pod, + Pod: Pod{ + NamespacedName: pm.NamespacedName, + Address: pm.Address, + }, Metrics: Metrics{ ActiveModels: cm, RunningQueueSize: pm.RunningQueueSize, diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 4558a664..3737425d 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -38,7 +38,6 @@ type PodMetricsClientImpl struct{} // FetchMetrics fetches metrics from a given pod. func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, - pod backend.Pod, existing *backend.PodMetrics, ) (*backend.PodMetrics, error) { logger := log.FromContext(ctx) @@ -46,7 +45,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. 
- url := fmt.Sprintf("http://%s/metrics", pod.Address) + url := fmt.Sprintf("http://%s/metrics", existing.Address) req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) @@ -54,16 +53,16 @@ func (p *PodMetricsClientImpl) FetchMetrics( } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod, err) + loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", pod, resp.StatusCode) + loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) + return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) } parser := expfmt.TextParser{} diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 8ce2956f..5edb2e77 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -48,8 +48,8 @@ func (s *Server) HandleRequestBody( // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. // This might be a security risk in the future where adapters not registered in the InferenceModel // are able to be requested by using their distinct name. 
- modelObj := s.datastore.FetchModelData(model) - if modelObj == nil { + modelObj, exist := s.datastore.ModelGet(model) + if !exist { return nil, fmt.Errorf("error finding a model object in InferenceModel for input %v", model) } if len(modelObj.Spec.TargetModels) > 0 { @@ -82,20 +82,29 @@ func (s *Server) HandleRequestBody( if err != nil { return nil, fmt.Errorf("failed to find target pod: %w", err) } + logger.V(logutil.DEFAULT).Info("Request handled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) + // Insert target endpoint to instruct Envoy to route requests to the specified target pod. + // Attach the port number + pool, err := s.datastore.PoolGet() + if err != nil { + return nil, err + } + endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + reqCtx.Model = llmReq.Model reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel reqCtx.RequestSize = len(v.RequestBody.Body) - reqCtx.TargetPod = targetPod + reqCtx.TargetPod = targetPod.NamespacedName.String() + reqCtx.TargetEndpoint = endpoint - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. 
headers := []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: s.targetEndpointKey, - RawValue: []byte(targetPod.Address), + RawValue: []byte(endpoint), }, }, // We need to update the content length header if the body is mutated, see Envoy doc: @@ -134,7 +143,7 @@ func (s *Server) HandleRequestBody( Fields: map[string]*structpb.Value{ s.targetEndpointKey: { Kind: &structpb.Value_StringValue{ - StringValue: targetPod.Address, + StringValue: endpoint, }, }, }, diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index 6be747da..fe00ebeb 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -11,17 +11,15 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) -func NewServer(pp PodProvider, scheduler Scheduler, targetEndpointKey string, datastore ModelDataStore) *Server { +func NewServer(scheduler Scheduler, targetEndpointKey string, datastore backend.Datastore) *Server { return &Server{ scheduler: scheduler, - podProvider: pp, targetEndpointKey: targetEndpointKey, datastore: datastore, } @@ -30,26 +28,15 @@ func NewServer(pp PodProvider, scheduler Scheduler, targetEndpointKey string, da // Server implements the Envoy external processing server. // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto type Server struct { - scheduler Scheduler - podProvider PodProvider + scheduler Scheduler // The key of the header to specify the target pod address. This value needs to match Envoy // configuration. 
targetEndpointKey string - datastore ModelDataStore + datastore backend.Datastore } type Scheduler interface { - Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backend.Pod, err error) -} - -// PodProvider is an interface to provide set of pods in the backend and information such as metrics. -type PodProvider interface { - GetPodMetrics(pod backend.Pod) (*backend.PodMetrics, bool) - UpdatePodMetrics(pod backend.Pod, pm *backend.PodMetrics) -} - -type ModelDataStore interface { - FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) + Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backend.PodMetrics, err error) } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { @@ -140,7 +127,8 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { // RequestContext stores context information during the life time of an HTTP request. type RequestContext struct { - TargetPod backend.Pod + TargetPod string + TargetEndpoint string Model string ResolvedTargetModel string RequestReceivedTimestamp time.Time diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index 8b684d39..59aec348 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -13,11 +13,11 @@ import ( type healthServer struct { logger logr.Logger - datastore *backend.K8sDatastore + datastore backend.Datastore } func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { - if !s.datastore.HasSynced() { + if !s.datastore.PoolHasSynced() { s.logger.V(logutil.VERBOSE).Info("gRPC health check not serving", "service", in.Service) return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_NOT_SERVING}, nil } diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index ba593d7d..8e588673 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -59,10 +59,6 @@ var ( "poolNamespace", runserver.DefaultPoolNamespace, 
"Namespace of the InferencePool this Endpoint Picker is associated with.") - refreshPodsInterval = flag.Duration( - "refreshPodsInterval", - runserver.DefaultRefreshPodsInterval, - "interval to refresh pods") refreshMetricsInterval = flag.Duration( "refreshMetricsInterval", runserver.DefaultRefreshMetricsInterval, @@ -115,8 +111,6 @@ func run() error { }) setupLog.Info("Flags processed", "flags", flags) - datastore := backend.NewK8sDataStore() - // Init runtime. cfg, err := ctrl.GetConfig() if err != nil { @@ -131,17 +125,19 @@ func run() error { } // Setup runner. + datastore := backend.NewDatastore() + provider := backend.NewProvider(&vllm.PodMetricsClientImpl{}, datastore) serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, TargetEndpointKey: *targetEndpointKey, PoolName: *poolName, PoolNamespace: *poolNamespace, - RefreshPodsInterval: *refreshPodsInterval, RefreshMetricsInterval: *refreshMetricsInterval, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, Datastore: datastore, SecureServing: *secureServing, CertPath: *certPath, + Provider: provider, } if err := serverRunner.SetupWithManager(mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc server") @@ -154,9 +150,7 @@ func run() error { } // Register ext-proc server. - if err := mgr.Add(serverRunner.AsRunnable( - ctrl.Log.WithName("ext-proc"), datastore, &vllm.PodMetricsClientImpl{}, - )); err != nil { + if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { setupLog.Error(err, "Failed to register ext-proc server") return err } @@ -195,7 +189,7 @@ func initLogging(opts *zap.Options) { } // registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 
-func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds *backend.K8sDatastore, port int) error { +func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds backend.Datastore, port int) error { srv := grpc.NewServer() healthPb.RegisterHealthServer(srv, &healthServer{ logger: logger, diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index ee1a8c33..9ed781c4 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -6,6 +6,7 @@ import ( "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -40,7 +41,7 @@ func TestFilter(t *testing.T) { // model being active, and has low KV cache. input: []*backend.PodMetrics{ { - Pod: backend.Pod{Name: "pod1"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, Metrics: backend.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -52,7 +53,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod2"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, Metrics: backend.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -64,7 +65,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod3"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, Metrics: backend.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, @@ -77,7 +78,7 @@ func TestFilter(t *testing.T) { }, output: []*backend.PodMetrics{ { - Pod: backend.Pod{Name: "pod2"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, Metrics: backend.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -101,7 +102,7 @@ func TestFilter(t *testing.T) { // pod1 will be picked because it has capacity for the sheddable request. 
input: []*backend.PodMetrics{ { - Pod: backend.Pod{Name: "pod1"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, Metrics: backend.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -113,7 +114,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod2"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, Metrics: backend.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -125,7 +126,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod3"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, Metrics: backend.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, @@ -138,7 +139,7 @@ func TestFilter(t *testing.T) { }, output: []*backend.PodMetrics{ { - Pod: backend.Pod{Name: "pod1"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, Metrics: backend.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -163,7 +164,7 @@ func TestFilter(t *testing.T) { // dropped. 
input: []*backend.PodMetrics{ { - Pod: backend.Pod{Name: "pod1"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, Metrics: backend.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, @@ -175,7 +176,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod2"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, Metrics: backend.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.85, @@ -187,7 +188,7 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{Name: "pod3"}, + Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, Metrics: backend.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.85, diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 16cf90b8..354bd39c 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -93,34 +93,29 @@ var ( } ) -func NewScheduler(pmp PodMetricsProvider) *Scheduler { +func NewScheduler(datastore backend.Datastore) *Scheduler { return &Scheduler{ - podMetricsProvider: pmp, - filter: defaultFilter, + datastore: datastore, + filter: defaultFilter, } } type Scheduler struct { - podMetricsProvider PodMetricsProvider - filter Filter -} - -// PodMetricsProvider is an interface to provide set of pods in the backend and information such as -// metrics. -type PodMetricsProvider interface { - AllPodMetrics() []*backend.PodMetrics + datastore backend.Datastore + filter Filter } // Schedule finds the target pod based on metrics and the requested lora adapter. 
-func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backend.Pod, err error) { +func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backend.PodMetrics, err error) { logger := log.FromContext(ctx).WithValues("request", req) - logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", s.podMetricsProvider.AllPodMetrics()) - pods, err := s.filter.Filter(logger, req, s.podMetricsProvider.AllPodMetrics()) + podMetrics := s.datastore.PodGetAll() + logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", podMetrics) + pods, err := s.filter.Filter(logger, req, podMetrics) if err != nil || len(pods) == 0 { - return backend.Pod{}, fmt.Errorf( + return backend.PodMetrics{}, fmt.Errorf( "failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods) i := rand.Intn(len(pods)) - return pods[i].Pod, nil + return *pods[i], nil } diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index fb9741d2..073c30df 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -31,10 +31,10 @@ type ExtProcServerRunner struct { TargetEndpointKey string PoolName string PoolNamespace string - RefreshPodsInterval time.Duration RefreshMetricsInterval time.Duration RefreshPrometheusMetricsInterval time.Duration - Datastore *backend.K8sDatastore + Datastore backend.Datastore + Provider *backend.Provider SecureServing bool CertPath string } @@ -45,7 +45,6 @@ const ( DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey DefaultPoolName = "" // required but no default DefaultPoolNamespace = "default" // default for --poolNamespace - DefaultRefreshPodsInterval = 10 * time.Second // default for --refreshPodsInterval DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval 
DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval DefaultSecureServing = true // default for --secureServing @@ -57,7 +56,6 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { TargetEndpointKey: DefaultTargetEndpointKey, PoolName: DefaultPoolName, PoolNamespace: DefaultPoolNamespace, - RefreshPodsInterval: DefaultRefreshPodsInterval, RefreshMetricsInterval: DefaultRefreshMetricsInterval, RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, SecureServing: DefaultSecureServing, @@ -107,15 +105,10 @@ func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { // AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server. // The runnable implements LeaderElectionRunnable with leader election disabled. -func (r *ExtProcServerRunner) AsRunnable( - logger logr.Logger, - podDatastore *backend.K8sDatastore, - podMetricsClient backend.PodMetricsClient, -) manager.Runnable { +func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { // Initialize backend provider - pp := backend.NewProvider(podMetricsClient, podDatastore) - if err := pp.Init(logger.WithName("provider"), r.RefreshPodsInterval, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { + if err := r.Provider.Init(ctx, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { logger.Error(err, "Failed to initialize backend provider") return err } @@ -145,7 +138,7 @@ func (r *ExtProcServerRunner) AsRunnable( } extProcPb.RegisterExternalProcessorServer( srv, - handlers.NewServer(pp, scheduling.NewScheduler(pp), r.TargetEndpointKey, r.Datastore), + handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.TargetEndpointKey, r.Datastore), ) // Forward to the gRPC runnable. 
diff --git a/pkg/ext-proc/server/runserver_test.go b/pkg/ext-proc/server/runserver_test.go index 1badb8fd..32af2cd8 100644 --- a/pkg/ext-proc/server/runserver_test.go +++ b/pkg/ext-proc/server/runserver_test.go @@ -11,7 +11,7 @@ import ( func TestRunnable(t *testing.T) { // Make sure AsRunnable() does not use leader election. - runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger(), nil, nil) + runner := server.NewDefaultExtProcServerRunner().AsRunnable(logutil.NewTestLogger()) r, ok := runner.(manager.LeaderElectionRunnable) if !ok { t.Fatal("runner is not LeaderElectionRunnable") diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index 9eca2edc..a48f0465 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -1,6 +1,7 @@ package main import ( + "context" "flag" "fmt" "os" @@ -12,6 +13,7 @@ import ( "github.com/jhump/protoreflect/desc" uberzap "go.uber.org/zap" "google.golang.org/protobuf/proto" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" @@ -48,11 +50,11 @@ func run() error { } opts.BindFlags(flag.CommandLine) flag.Parse() - logger := zap.New(zap.UseFlagOptions(&opts), zap.RawZapOpts(uberzap.AddCaller())) + ctx := log.IntoContext(context.Background(), logger) if *localServer { - test.StartExtProc(logger, port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) + test.StartExtProc(ctx, port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) time.Sleep(time.Second) // wait until server is up logger.Info("Server started") } @@ -81,7 +83,7 @@ func run() error { func generateRequestFunc(logger logr.Logger) func(mtd *desc.MethodDescriptor, callData *runner.CallData) 
[]byte { return func(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { numModels := *numFakePods * (*numModelsPerPod) - req := test.GenerateRequest(logger, modelName(int(callData.RequestNumber)%numModels)) + req := test.GenerateRequest(logger, "hello", modelName(int(callData.RequestNumber)%numModels)) data, err := proto.Marshal(req) if err != nil { logutil.Fatal(logger, err, "Failed to marshal request", "request", req) @@ -105,9 +107,7 @@ func fakeModels() map[string]*v1alpha1.InferenceModel { func fakePods() []*backend.PodMetrics { pms := make([]*backend.PodMetrics, 0, *numFakePods) for i := 0; i < *numFakePods; i++ { - metrics := fakeMetrics(i) - pod := test.FakePod(i) - pms = append(pms, &backend.PodMetrics{Pod: pod, Metrics: metrics}) + pms = append(pms, test.FakePodMetrics(i, fakeMetrics(i))) } return pms diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index cb99a36b..46affae9 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -1,6 +1,7 @@ package test import ( + "context" "encoding/json" "fmt" "net" @@ -10,36 +11,50 @@ import ( "github.com/go-logr/logr" "google.golang.org/grpc" "google.golang.org/grpc/reflection" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" ) func StartExtProc( - logger logr.Logger, + ctx context.Context, port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*backend.PodMetrics, models map[string]*v1alpha1.InferenceModel, ) *grpc.Server { - ps := make(backend.PodSet) - pms := 
make(map[backend.Pod]*backend.PodMetrics) + logger := log.FromContext(ctx) + pms := make(map[types.NamespacedName]*backend.PodMetrics) for _, pod := range pods { - ps[pod.Pod] = true - pms[pod.Pod] = pod + pms[pod.NamespacedName] = pod } pmc := &backend.FakePodMetricsClient{Res: pms} - pp := backend.NewProvider(pmc, backend.NewK8sDataStore(backend.WithPods(pods))) - if err := pp.Init(logger, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { + datastore := backend.NewDatastore() + for _, m := range models { + datastore.ModelSet(m) + } + for _, pm := range pods { + pod := utiltesting.MakePod(pm.NamespacedName.Name, pm.NamespacedName.Namespace). + ReadyCondition(). + IP(pm.Address). + Obj() + datastore.PodUpdateOrAddIfNotExist(&pod) + datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) + } + pp := backend.NewProvider(pmc, datastore) + if err := pp.Init(ctx, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { logutil.Fatal(logger, err, "Failed to initialize") } - return startExtProc(logger, port, pp, models) + return startExtProc(logger, port, datastore) } // startExtProc starts an extProc server with fake pods. 
-func startExtProc(logger logr.Logger, port int, pp *backend.Provider, models map[string]*v1alpha1.InferenceModel) *grpc.Server { +func startExtProc(logger logr.Logger, port int, datastore backend.Datastore) *grpc.Server { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) if err != nil { logutil.Fatal(logger, err, "Failed to listen", "port", port) @@ -47,7 +62,7 @@ func startExtProc(logger logr.Logger, port int, pp *backend.Provider, models map s := grpc.NewServer() - extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(pp, scheduling.NewScheduler(pp), "target-pod", &backend.FakeDataStore{Res: models})) + extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(scheduling.NewScheduler(datastore), "target-pod", datastore)) logger.Info("gRPC server starting", "port", port) reflection.Register(s) @@ -60,10 +75,10 @@ func startExtProc(logger logr.Logger, port int, pp *backend.Provider, models map return s } -func GenerateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequest { +func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest { j := map[string]interface{}{ "model": model, - "prompt": "hello", + "prompt": prompt, "max_tokens": 100, "temperature": 0, } @@ -80,11 +95,14 @@ func GenerateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequ return req } -func FakePod(index int) backend.Pod { +func FakePodMetrics(index int, metrics backend.Metrics) *backend.PodMetrics { address := fmt.Sprintf("address-%v", index) - pod := backend.Pod{ - Name: fmt.Sprintf("pod-%v", index), - Address: address, + pod := backend.PodMetrics{ + Pod: backend.Pod{ + NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index)}, + Address: address, + }, + Metrics: metrics, } - return pod + return &pod } diff --git a/pkg/ext-proc/util/testing/wrappers.go b/pkg/ext-proc/util/testing/wrappers.go new file mode 100644 index 00000000..f9005499 --- /dev/null +++ b/pkg/ext-proc/util/testing/wrappers.go 
@@ -0,0 +1,50 @@ +package testing + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// PodWrapper wraps a Pod. +type PodWrapper struct { + corev1.Pod +} + +// MakePod creates a wrapper for a Pod. +func MakePod(podName, ns string) *PodWrapper { + return &PodWrapper{ + corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: ns, + }, + Spec: corev1.PodSpec{}, + Status: corev1.PodStatus{}, + }, + } +} + +// Labels sets the pod labels. +func (p *PodWrapper) Labels(labels map[string]string) *PodWrapper { + p.ObjectMeta.Labels = labels + return p +} + +// SetReadyCondition sets a PodReay=true condition. +func (p *PodWrapper) ReadyCondition() *PodWrapper { + p.Status.Conditions = []corev1.PodCondition{{ + Type: corev1.PodReady, + Status: corev1.ConditionTrue, + }} + return p +} + +func (p *PodWrapper) IP(ip string) *PodWrapper { + p.Status.PodIP = ip + return p +} + +// Obj returns the wrapped Pod. +func (p *PodWrapper) Obj() corev1.Pod { + return p.Pod +} diff --git a/pkg/manifests/vllm/deployment.yaml b/pkg/manifests/vllm/deployment.yaml index a54d99b3..51689c9f 100644 --- a/pkg/manifests/vllm/deployment.yaml +++ b/pkg/manifests/vllm/deployment.yaml @@ -26,7 +26,7 @@ spec: - "8000" - "--enable-lora" - "--max-loras" - - "2" + - "4" - "--max-cpu-loras" - "12" - "--lora-modules" diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index a99b6bd7..0e30ac69 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -17,11 +17,13 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/google/go-cmp/cmp" + "github.com/stretchr/testify/assert" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/structpb" "k8s.io/apimachinery/pkg/runtime" + 
"k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -33,6 +35,7 @@ import ( runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" "sigs.k8s.io/yaml" ) @@ -61,36 +64,27 @@ func TestKubeInferenceModelRequest(t *testing.T) { }{ { name: "select lower queue and kv cache, no active lora", - req: extprocutils.GenerateRequest(logger, "my-model"), + req: extprocutils.GenerateRequest(logger, "test1", "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.2, - }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - }, - }, + extprocutils.FakePodMetrics(0, backend.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.2, + }), + extprocutils.FakePodMetrics(1, backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + }), + extprocutils.FakePodMetrics(2, backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + }), }, wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("address-1"), + RawValue: []byte("address-1:8000"), }, }, { @@ -104,58 +98,49 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ runserver.DefaultTargetEndpointKey: { Kind: &structpb.Value_StringValue{ - 
StringValue: "address-1", + StringValue: "address-1:8000", }, }, }, }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"hello\",\"temperature\":0}"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), wantErr: false, }, { name: "select active lora, low queue", - req: extprocutils.GenerateRequest(logger, "sql-lora"), + req: extprocutils.GenerateRequest(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, + extprocutils.FakePodMetrics(0, backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, + }), + extprocutils.FakePodMetrics(1, backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, + }), + extprocutils.FakePodMetrics(2, backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, }, - }, + }), }, wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("address-1"), + RawValue: []byte("address-1:8000"), }, }, { @@ -169,59 +154,50 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ 
runserver.DefaultTargetEndpointKey: { Kind: &structpb.Value_StringValue{ - StringValue: "address-1", + StringValue: "address-1:8000", }, }, }, }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), wantErr: false, }, { name: "select no lora despite active model, avoid excessive queue size", - req: extprocutils.GenerateRequest(logger, "sql-lora"), + req: extprocutils.GenerateRequest(logger, "test3", "sql-lora"), // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. Also is critical, so we should // still honor request despite all queues > 5 pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, + extprocutils.FakePodMetrics(0, backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 50, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, + }), + extprocutils.FakePodMetrics(1, backend.Metrics{ + WaitingQueueSize: 50, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, + }), + extprocutils.FakePodMetrics(2, backend.Metrics{ + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, }, - }, + }), }, wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: 
[]byte("address-2"), + RawValue: []byte("address-2:8000"), }, }, { @@ -235,54 +211,45 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ runserver.DefaultTargetEndpointKey: { Kind: &structpb.Value_StringValue{ - StringValue: "address-2", + StringValue: "address-2:8000", }, }, }, }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"hello\",\"temperature\":0}"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), wantErr: false, }, { name: "noncritical and all models past threshold, shed request", - req: extprocutils.GenerateRequest(logger, "sql-lora-sheddable"), + req: extprocutils.GenerateRequest(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, + extprocutils.FakePodMetrics(0, backend.Metrics{ + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, + }), + extprocutils.FakePodMetrics(1, backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, + }), + extprocutils.FakePodMetrics(2, backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 
1, + "sql-lora-1fdg3": 1, }, - }, + }), }, wantHeaders: []*configPb.HeaderValueOption{}, wantMetadata: &structpb.Struct{}, @@ -296,49 +263,40 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical, but one server has capacity, do not shed", - req: extprocutils.GenerateRequest(logger, "sql-lora-sheddable"), + req: extprocutils.GenerateRequest(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold pods: []*backend.PodMetrics{ - { - Pod: extprocutils.FakePod(0), - Metrics: backend.Metrics{ - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, + extprocutils.FakePodMetrics(0, backend.Metrics{ + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, }, - }, - { - Pod: extprocutils.FakePod(1), - Metrics: backend.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, + }), + extprocutils.FakePodMetrics(1, backend.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, }, - }, - { - Pod: extprocutils.FakePod(2), - Metrics: backend.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, + }), + extprocutils.FakePodMetrics(2, backend.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, }, - }, + }), }, wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ Key: runserver.DefaultTargetEndpointKey, - RawValue: []byte("address-0"), + RawValue: []byte("address-0:8000"), }, }, { @@ -352,18 +310,19 @@ func TestKubeInferenceModelRequest(t *testing.T) { Fields: map[string]*structpb.Value{ runserver.DefaultTargetEndpointKey: { Kind: 
&structpb.Value_StringValue{ - StringValue: "address-0", + StringValue: "address-0:8000", }, }, }, }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"hello\",\"temperature\":0}"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), wantErr: false, }, } // Set up global k8sclient and extproc server runner with test environment config - BeforeSuit() + cleanup := BeforeSuit(t) + defer cleanup() for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -405,27 +364,30 @@ func TestKubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - ps := make(backend.PodSet) - pms := make(map[backend.Pod]*backend.PodMetrics) - for _, pod := range pods { - ps[pod.Pod] = true - pms[pod.Pod] = pod +func setUpHermeticServer(podMetrics []*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + pms := make(map[types.NamespacedName]*backend.PodMetrics) + for _, pm := range podMetrics { + pms[pm.NamespacedName] = pm } pmc := &backend.FakePodMetricsClient{Res: pms} serverCtx, stopServer := context.WithCancel(context.Background()) go func() { - if err := serverRunner.AsRunnable( - logger.WithName("ext-proc"), backend.NewK8sDataStore(backend.WithPods(pods)), pmc, - ).Start(serverCtx); err != nil { + serverRunner.Datastore.PodDeleteAll() + for _, pm := range podMetrics { + pod := utiltesting.MakePod(pm.NamespacedName.Name, pm.NamespacedName.Namespace). + ReadyCondition(). + IP(pm.Address). 
+ Obj() + serverRunner.Datastore.PodUpdateOrAddIfNotExist(&pod) + serverRunner.Datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) + } + serverRunner.Provider = backend.NewProvider(pmc, serverRunner.Datastore) + if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { logutil.Fatal(logger, err, "Failed to start ext-proc server") } }() - // Wait the reconciler to populate the datastore. - time.Sleep(10 * time.Second) - address := fmt.Sprintf("localhost:%v", port) // Create a grpc connection conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) @@ -442,11 +404,13 @@ func setUpHermeticServer(pods []*backend.PodMetrics) (client extProcPb.ExternalP cancel() conn.Close() stopServer() + // wait a little until the goroutines actually exit + time.Sleep(5 * time.Second) } } // Sets up a test environment and returns the runner struct -func BeforeSuit() { +func BeforeSuit(t *testing.T) func() { // Set up mock k8s API Client testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, @@ -477,7 +441,7 @@ func BeforeSuit() { serverRunner = runserver.NewDefaultExtProcServerRunner() // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" - serverRunner.Datastore = backend.NewK8sDataStore() + serverRunner.Datastore = backend.NewDatastore() serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(mgr); err != nil { @@ -524,6 +488,16 @@ func BeforeSuit() { } } } + + assert.EventuallyWithT(t, func(t *assert.CollectT) { + _, modelExist := serverRunner.Datastore.ModelGet("my-model") + synced := serverRunner.Datastore.PoolHasSynced() && modelExist + assert.True(t, synced, "Timeout waiting for the pool and models to sync") + }, 10*time.Second, 10*time.Millisecond) + + return func() { + _ = testEnv.Stop() + } } func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req 
*extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { From 21d0c1372c37f3b14a125a87ce9611ab695a31d3 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 19 Feb 2025 13:30:27 +0200 Subject: [PATCH 31/96] fixed a typo - close a bash markdown (#364) Signed-off-by: Nir Rozenbaum --- site-src/guides/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index b9c38d87..34fff20c 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -24,6 +24,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml + ``` ### Deploy InferenceModel From c998e50585a39e2304d82e639b233573b2de3d48 Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Wed, 19 Feb 2025 22:54:29 +0800 Subject: [PATCH 32/96] Added controller and datastore package (#363) * Added controller and datastore package * Fix lint --- pkg/ext-proc/backend/fake.go | 5 +- pkg/ext-proc/backend/provider.go | 9 +- pkg/ext-proc/backend/provider_test.go | 52 +++--- pkg/ext-proc/backend/vllm/metrics.go | 10 +- pkg/ext-proc/backend/vllm/metrics_test.go | 14 +- .../inferencemodel_reconciler.go | 5 +- .../inferencemodel_reconciler_test.go | 146 +++++++---------- .../inferencepool_reconciler.go | 5 +- .../inferencepool_reconciler_test.go | 25 +-- .../{backend => controller}/pod_reconciler.go | 5 +- .../pod_reconciler_test.go | 149 ++++++++---------- .../{backend => datastore}/datastore.go | 30 +++- .../{backend => datastore}/datastore_test.go | 2 +- pkg/ext-proc/{backend => datastore}/types.go | 4 +- pkg/ext-proc/handlers/request.go | 6 +- pkg/ext-proc/handlers/server.go | 8 +- pkg/ext-proc/health.go | 4 +- pkg/ext-proc/main.go | 5 +- pkg/ext-proc/scheduling/filter.go | 34 ++-- pkg/ext-proc/scheduling/filter_test.go | 130 +++++++-------- pkg/ext-proc/scheduling/scheduler.go | 14 +- 
pkg/ext-proc/server/runserver.go | 10 +- pkg/ext-proc/test/benchmark/benchmark.go | 10 +- pkg/ext-proc/test/utils.go | 15 +- test/integration/hermetic_test.go | 49 +++--- 25 files changed, 369 insertions(+), 377 deletions(-) rename pkg/ext-proc/{backend => controller}/inferencemodel_reconciler.go (95%) rename pkg/ext-proc/{backend => controller}/inferencemodel_reconciler_test.go (74%) rename pkg/ext-proc/{backend => controller}/inferencepool_reconciler.go (96%) rename pkg/ext-proc/{backend => controller}/inferencepool_reconciler_test.go (84%) rename pkg/ext-proc/{backend => controller}/pod_reconciler.go (95%) rename pkg/ext-proc/{backend => controller}/pod_reconciler_test.go (57%) rename pkg/ext-proc/{backend => datastore}/datastore.go (91%) rename pkg/ext-proc/{backend => datastore}/datastore_test.go (99%) rename pkg/ext-proc/{backend => datastore}/types.go (91%) diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index dfb520ef..2ddf2932 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -6,15 +6,16 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type FakePodMetricsClient struct { Err map[types.NamespacedName]error - Res map[types.NamespacedName]*PodMetrics + Res map[types.NamespacedName]*datastore.PodMetrics } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *PodMetrics) (*PodMetrics, error) { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *datastore.PodMetrics) (*datastore.PodMetrics, error) { if err, ok := f.Err[existing.NamespacedName]; ok { return nil, err } diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index bb575d19..103659db 100644 --- a/pkg/ext-proc/backend/provider.go +++ 
b/pkg/ext-proc/backend/provider.go @@ -9,6 +9,7 @@ import ( "github.com/go-logr/logr" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -17,7 +18,7 @@ const ( fetchMetricsTimeout = 5 * time.Second ) -func NewProvider(pmc PodMetricsClient, datastore Datastore) *Provider { +func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider { p := &Provider{ pmc: pmc, datastore: datastore, @@ -28,11 +29,11 @@ func NewProvider(pmc PodMetricsClient, datastore Datastore) *Provider { // Provider provides backend pods and information such as metrics. type Provider struct { pmc PodMetricsClient - datastore Datastore + datastore datastore.Datastore } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, existing *PodMetrics) (*PodMetrics, error) + FetchMetrics(ctx context.Context, existing *datastore.PodMetrics) (*datastore.PodMetrics, error) } func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { @@ -100,7 +101,7 @@ func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { errCh := make(chan error) processOnePod := func(key, value any) bool { loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value) - existing := value.(*PodMetrics) + existing := value.(*datastore.PodMetrics) wg.Add(1) go func() { defer wg.Done() diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index 2aa2c213..95936f7e 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -11,16 +11,17 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/types" + 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" ) var ( - pod1 = &PodMetrics{ - Pod: Pod{ + pod1 = &datastore.PodMetrics{ + Pod: datastore.Pod{ NamespacedName: types.NamespacedName{ Name: "pod1", }, }, - Metrics: Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -30,13 +31,13 @@ var ( }, }, } - pod2 = &PodMetrics{ - Pod: Pod{ + pod2 = &datastore.PodMetrics{ + Pod: datastore.Pod{ NamespacedName: types.NamespacedName{ Name: "pod2", }, }, - Metrics: Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -52,21 +53,19 @@ func TestProvider(t *testing.T) { tests := []struct { name string pmc PodMetricsClient - datastore Datastore - want []*PodMetrics + datastore datastore.Datastore + want []*datastore.PodMetrics }{ { name: "Probing metrics success", pmc: &FakePodMetricsClient{ - Res: map[types.NamespacedName]*PodMetrics{ + Res: map[types.NamespacedName]*datastore.PodMetrics{ pod1.NamespacedName: pod1, pod2.NamespacedName: pod2, }, }, - datastore: &datastore{ - pods: populateMap(pod1, pod2), - }, - want: []*PodMetrics{ + datastore: datastore.NewFakeDatastore(populateMap(pod1, pod2), nil, nil), + want: []*datastore.PodMetrics{ pod1, pod2, }, @@ -74,15 +73,13 @@ func TestProvider(t *testing.T) { { name: "Only pods in the datastore are probed", pmc: &FakePodMetricsClient{ - Res: map[types.NamespacedName]*PodMetrics{ + Res: map[types.NamespacedName]*datastore.PodMetrics{ pod1.NamespacedName: pod1, pod2.NamespacedName: pod2, }, }, - datastore: &datastore{ - pods: populateMap(pod1), - }, - want: []*PodMetrics{ + datastore: datastore.NewFakeDatastore(populateMap(pod1), nil, nil), + want: []*datastore.PodMetrics{ pod1, }, }, @@ -92,19 +89,18 @@ func TestProvider(t *testing.T) { Err: map[types.NamespacedName]error{ pod2.NamespacedName: errors.New("injected error"), }, - Res: map[types.NamespacedName]*PodMetrics{ + Res: 
map[types.NamespacedName]*datastore.PodMetrics{ pod1.NamespacedName: pod1, }, }, - datastore: &datastore{ - pods: populateMap(pod1, pod2), - }, - want: []*PodMetrics{ + datastore: datastore.NewFakeDatastore(populateMap(pod1, pod2), nil, nil), + + want: []*datastore.PodMetrics{ pod1, // Failed to fetch pod2 metrics so it remains the default values. { - Pod: Pod{NamespacedName: pod2.NamespacedName}, - Metrics: Metrics{ + Pod: datastore.Pod{NamespacedName: pod2.NamespacedName}, + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, MaxActiveModels: 0, @@ -122,7 +118,7 @@ func TestProvider(t *testing.T) { _ = p.Init(ctx, time.Millisecond, time.Millisecond) assert.EventuallyWithT(t, func(t *assert.CollectT) { metrics := test.datastore.PodGetAll() - diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(func(a, b *PodMetrics) bool { + diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(func(a, b *datastore.PodMetrics) bool { return a.String() < b.String() })) assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") @@ -131,10 +127,10 @@ func TestProvider(t *testing.T) { } } -func populateMap(pods ...*PodMetrics) *sync.Map { +func populateMap(pods ...*datastore.PodMetrics) *sync.Map { newMap := &sync.Map{} for _, pod := range pods { - newMap.Store(pod.NamespacedName, &PodMetrics{Pod: Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) + newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) } return newMap } diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 3737425d..4785e484 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -14,7 +14,7 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -38,8 +38,8 @@ type PodMetricsClientImpl struct{} // FetchMetrics fetches metrics from a given pod. func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, - existing *backend.PodMetrics, -) (*backend.PodMetrics, error) { + existing *datastore.PodMetrics, +) (*datastore.PodMetrics, error) { logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) @@ -79,8 +79,8 @@ func (p *PodMetricsClientImpl) FetchMetrics( func promToPodMetrics( logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, - existing *backend.PodMetrics, -) (*backend.PodMetrics, error) { + existing *datastore.PodMetrics, +) (*datastore.PodMetrics, error) { var errs error updated := existing.Clone() runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName) diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go index 0a718cd7..23121ad5 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/ext-proc/backend/vllm/metrics_test.go @@ -7,7 +7,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -17,9 +17,9 @@ func TestPromToPodMetrics(t *testing.T) { testCases := []struct { name string metricFamilies map[string]*dto.MetricFamily - expectedMetrics *backend.Metrics + expectedMetrics *datastore.Metrics expectedErr error - initialPodMetrics *backend.PodMetrics + initialPodMetrics *datastore.PodMetrics }{ { name: "all metrics available", @@ -107,7 +107,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &backend.Metrics{ 
+ expectedMetrics: &datastore.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -117,7 +117,7 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 2, }, - initialPodMetrics: &backend.PodMetrics{}, + initialPodMetrics: &datastore.PodMetrics{}, expectedErr: nil, }, { @@ -206,7 +206,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &backend.Metrics{ + expectedMetrics: &datastore.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -216,7 +216,7 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 0, }, - initialPodMetrics: &backend.PodMetrics{}, + initialPodMetrics: &datastore.PodMetrics{}, expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, } diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler.go b/pkg/ext-proc/controller/inferencemodel_reconciler.go similarity index 95% rename from pkg/ext-proc/backend/inferencemodel_reconciler.go rename to pkg/ext-proc/controller/inferencemodel_reconciler.go index 884e6b7e..a4622988 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler.go +++ b/pkg/ext-proc/controller/inferencemodel_reconciler.go @@ -1,4 +1,4 @@ -package backend +package controller import ( "context" @@ -12,6 +12,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -19,7 +20,7 @@ type InferenceModelReconciler struct { client.Client Scheme *runtime.Scheme Record record.EventRecorder - Datastore Datastore + Datastore datastore.Datastore PoolNamespacedName types.NamespacedName } diff --git a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go b/pkg/ext-proc/controller/inferencemodel_reconciler_test.go similarity index 74% rename from 
pkg/ext-proc/backend/inferencemodel_reconciler_test.go rename to pkg/ext-proc/controller/inferencemodel_reconciler_test.go index 5afe3b5a..c3ebb646 100644 --- a/pkg/ext-proc/backend/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/controller/inferencemodel_reconciler_test.go @@ -1,4 +1,4 @@ -package backend +package controller import ( "context" @@ -13,6 +13,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -51,58 +52,50 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { tests := []struct { name string - datastore *datastore + datastore datastore.Datastore incomingService *v1alpha1.InferenceModel wantInferenceModels *sync.Map }{ { name: "No Services registered; valid, new service incoming.", - datastore: &datastore{ - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, + datastore: datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, - models: &sync.Map{}, - }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + ResourceVersion: "Old and boring", + }, + }), + incomingService: infModel1, wantInferenceModels: populateServiceMap(infModel1), }, { name: "Removing existing service.", - datastore: &datastore{ - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, + datastore: datastore.NewFakeDatastore(nil, 
populateServiceMap(infModel1), &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, - models: populateServiceMap(infModel1), - }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + ResourceVersion: "Old and boring", + }, + }), incomingService: infModel1Modified, wantInferenceModels: populateServiceMap(), }, { name: "Unrelated service, do nothing.", - datastore: &datastore{ - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, + datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, - models: populateServiceMap(infModel1), - }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + ResourceVersion: "Old and boring", + }, + }), incomingService: &v1alpha1.InferenceModel{ Spec: v1alpha1.InferenceModelSpec{ ModelName: "fake model", @@ -116,33 +109,38 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, { name: "Add to existing", - datastore: &datastore{ - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, + datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, }, - models: populateServiceMap(infModel1), - }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pool", + ResourceVersion: "Old and boring", + }, + }), incomingService: infModel2, wantInferenceModels: populateServiceMap(infModel1, infModel2), }, } 
for _, test := range tests { t.Run(test.name, func(t *testing.T) { + pool, err := test.datastore.PoolGet() + if err != nil { + t.Fatalf("failed to get pool: %v", err) + } reconciler := &InferenceModelReconciler{ Datastore: test.datastore, - PoolNamespacedName: types.NamespacedName{Name: test.datastore.pool.Name}, + PoolNamespacedName: types.NamespacedName{Name: pool.Name}, } reconciler.updateDatastore(logger, test.incomingService) - if ok := mapsEqual(test.datastore.models, test.wantInferenceModels); !ok { - t.Error("Maps are not equal") - } + test.wantInferenceModels.Range(func(k, v any) bool { + _, exist := test.datastore.ModelGet(k.(string)) + if !exist { + t.Fatalf("failed to get model %s", k) + } + return true + }) }) } } @@ -156,12 +154,9 @@ func TestReconcile_ResourceNotFound(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() // Create a minimal datastore. - datastore := &datastore{ - models: &sync.Map{}, - pool: &v1alpha1.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }, - } + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }) // Create the reconciler. reconciler := &InferenceModelReconciler{ @@ -211,12 +206,9 @@ func TestReconcile_ModelMarkedForDeletion(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. - datastore := &datastore{ - models: &sync.Map{}, - pool: &v1alpha1.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }, - } + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }) // Create the reconciler. reconciler := &InferenceModelReconciler{ @@ -268,12 +260,9 @@ func TestReconcile_ResourceExists(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. 
- datastore := &datastore{ - models: &sync.Map{}, - pool: &v1alpha1.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }, - } + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, + }) // Create the reconciler. reconciler := &InferenceModelReconciler{ @@ -312,24 +301,3 @@ func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { } return returnVal } - -func mapsEqual(map1, map2 *sync.Map) bool { - equal := true - - map1.Range(func(k, v any) bool { - if _, ok := map2.Load(k); !ok { - equal = false - return false - } - return true - }) - map2.Range(func(k, v any) bool { - if _, ok := map1.Load(k); !ok { - equal = false - return false - } - return true - }) - - return equal -} diff --git a/pkg/ext-proc/backend/inferencepool_reconciler.go b/pkg/ext-proc/controller/inferencepool_reconciler.go similarity index 96% rename from pkg/ext-proc/backend/inferencepool_reconciler.go rename to pkg/ext-proc/controller/inferencepool_reconciler.go index 6f52862e..5c9e4969 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler.go +++ b/pkg/ext-proc/controller/inferencepool_reconciler.go @@ -1,4 +1,4 @@ -package backend +package controller import ( "context" @@ -12,6 +12,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -23,7 +24,7 @@ type InferencePoolReconciler struct { Scheme *runtime.Scheme Record record.EventRecorder PoolNamespacedName types.NamespacedName - Datastore Datastore + Datastore datastore.Datastore } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/ext-proc/backend/inferencepool_reconciler_test.go 
b/pkg/ext-proc/controller/inferencepool_reconciler_test.go similarity index 84% rename from pkg/ext-proc/backend/inferencepool_reconciler_test.go rename to pkg/ext-proc/controller/inferencepool_reconciler_test.go index b6403489..ec2fdfe1 100644 --- a/pkg/ext-proc/backend/inferencepool_reconciler_test.go +++ b/pkg/ext-proc/controller/inferencepool_reconciler_test.go @@ -1,4 +1,4 @@ -package backend +package controller import ( "context" @@ -15,19 +15,20 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" ) var ( - selector_v1 = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v1"} - selector_v2 = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v2"} + selector_v1 = map[string]string{"app": "vllm_v1"} + selector_v2 = map[string]string{"app": "vllm_v2"} pool1 = &v1alpha1.InferencePool{ ObjectMeta: metav1.ObjectMeta{ Name: "pool1", Namespace: "pool1-ns", }, Spec: v1alpha1.InferencePoolSpec{ - Selector: selector_v1, + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v1"}, TargetPortNumber: 8080, }, } @@ -39,14 +40,14 @@ var ( } pods = []corev1.Pod{ // Two ready pods matching pool1 - utiltesting.MakePod("pod1", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), - utiltesting.MakePod("pod2", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), + utiltesting.MakePod("pod1", "pool1-ns").Labels(selector_v1).ReadyCondition().Obj(), + utiltesting.MakePod("pod2", "pool1-ns").Labels(selector_v1).ReadyCondition().Obj(), // A not ready pod matching pool1 - utiltesting.MakePod("pod3", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).Obj(), + utiltesting.MakePod("pod3", "pool1-ns").Labels(selector_v1).Obj(), 
// A pod not matching pool1 namespace - utiltesting.MakePod("pod4", "pool2-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v1)).ReadyCondition().Obj(), + utiltesting.MakePod("pod4", "pool2-ns").Labels(selector_v1).ReadyCondition().Obj(), // A ready pod matching pool1 with a new selector - utiltesting.MakePod("pod5", "pool1-ns").Labels(stripLabelKeyAliasFromLabelMap(selector_v2)).ReadyCondition().Obj(), + utiltesting.MakePod("pod5", "pool1-ns").Labels(selector_v2).ReadyCondition().Obj(), } ) @@ -74,7 +75,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { req := ctrl.Request{NamespacedName: namespacedName} ctx := context.Background() - datastore := NewDatastore() + datastore := datastore.NewDatastore() inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore} // Step 1: Inception, only ready pods matching pool1 are added to the store. @@ -98,7 +99,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) } - newPool1.Spec.Selector = selector_v2 + newPool1.Spec.Selector = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v2"} if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil { t.Errorf("Unexpected pool update error: %v", err) } @@ -140,7 +141,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { } } -func diffPool(datastore Datastore, wantPool *v1alpha1.InferencePool, wantPods []string) string { +func diffPool(datastore datastore.Datastore, wantPool *v1alpha1.InferencePool, wantPods []string) string { gotPool, _ := datastore.PoolGet() if diff := cmp.Diff(wantPool, gotPool); diff != "" { return diff diff --git a/pkg/ext-proc/backend/pod_reconciler.go b/pkg/ext-proc/controller/pod_reconciler.go similarity index 95% rename from pkg/ext-proc/backend/pod_reconciler.go rename to 
pkg/ext-proc/controller/pod_reconciler.go index 8705ce83..209d2ca7 100644 --- a/pkg/ext-proc/backend/pod_reconciler.go +++ b/pkg/ext-proc/controller/pod_reconciler.go @@ -1,4 +1,4 @@ -package backend +package controller import ( "context" @@ -12,12 +12,13 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type PodReconciler struct { client.Client - Datastore Datastore + Datastore datastore.Datastore Scheme *runtime.Scheme Record record.EventRecorder } diff --git a/pkg/ext-proc/backend/pod_reconciler_test.go b/pkg/ext-proc/controller/pod_reconciler_test.go similarity index 57% rename from pkg/ext-proc/backend/pod_reconciler_test.go rename to pkg/ext-proc/controller/pod_reconciler_test.go index cc7381f6..b146745a 100644 --- a/pkg/ext-proc/backend/pod_reconciler_test.go +++ b/pkg/ext-proc/controller/pod_reconciler_test.go @@ -1,7 +1,8 @@ -package backend +package controller import ( "context" + "sync" "testing" "github.com/google/go-cmp/cmp" @@ -15,37 +16,35 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" ) var ( - basePod1 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1"}} - basePod2 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} - basePod3 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3"}} - basePod11 = &PodMetrics{Pod: Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11"}} + basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, 
Address: "address-1"}} + basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} + basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3"}} + basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11"}} ) func TestUpdateDatastore_PodReconciler(t *testing.T) { now := metav1.Now() tests := []struct { name string - datastore Datastore + datastore datastore.Datastore incomingPod *corev1.Pod - wantPods []Pod + wantPods []datastore.Pod req *ctrl.Request }{ { name: "Add new pod", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: basePod3.NamespacedName.Name, @@ -63,21 +62,18 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod1.Pod, basePod2.Pod, basePod3.Pod}, + wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod, basePod3.Pod}, }, { name: "Update pod1 address", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: 
map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: basePod11.NamespacedName.Name, @@ -95,21 +91,18 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod11.Pod, basePod2.Pod}, + wantPods: []datastore.Pod{basePod11.Pod, basePod2.Pod}, }, { name: "Delete pod with DeletionTimestamp", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod1", @@ -128,37 +121,31 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod2.Pod}, + wantPods: []datastore.Pod{basePod2.Pod}, }, { name: "Delete notfound pod", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), req: &ctrl.Request{NamespacedName: types.NamespacedName{Name: "pod1"}}, - wantPods: []Pod{basePod2.Pod}, + wantPods: []datastore.Pod{basePod2.Pod}, }, { name: "New pod, not ready, valid selector", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - 
pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod3", @@ -175,21 +162,18 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod1.Pod, basePod2.Pod}, + wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod}, }, { name: "Remove pod that does not match selector", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod1", @@ -206,21 +190,18 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod2.Pod}, + wantPods: []datastore.Pod{basePod2.Pod}, }, { name: "Remove pod that is not ready", - datastore: &datastore{ - pods: populateMap(basePod1, basePod2), - pool: &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ - "some-key": "some-val", - }, + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ + Spec: v1alpha1.InferencePoolSpec{ + TargetPortNumber: int32(8000), + 
Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + "some-key": "some-val", }, }, - }, + }), incomingPod: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod1", @@ -237,7 +218,7 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }, - wantPods: []Pod{basePod2.Pod}, + wantPods: []datastore.Pod{basePod2.Pod}, }, } for _, test := range tests { @@ -263,17 +244,25 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - var gotPods []Pod + var gotPods []datastore.Pod test.datastore.PodRange(func(k, v any) bool { - pod := v.(*PodMetrics) + pod := v.(*datastore.PodMetrics) if v != nil { gotPods = append(gotPods, pod.Pod) } return true }) - if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b Pod) bool { return a.NamespacedName.String() < b.NamespacedName.String() })) { + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b datastore.Pod) bool { return a.NamespacedName.String() < b.NamespacedName.String() })) { t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) } }) } } + +func populateMap(pods ...*datastore.PodMetrics) *sync.Map { + newMap := &sync.Map{} + for _, pod := range pods { + newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) + } + return newMap +} diff --git a/pkg/ext-proc/backend/datastore.go b/pkg/ext-proc/datastore/datastore.go similarity index 91% rename from pkg/ext-proc/backend/datastore.go rename to pkg/ext-proc/datastore/datastore.go index 6b8483d3..f85f9014 100644 --- a/pkg/ext-proc/backend/datastore.go +++ b/pkg/ext-proc/datastore/datastore.go @@ -1,4 +1,4 @@ -package backend +package datastore import ( "context" @@ -52,6 +52,21 @@ func NewDatastore() Datastore { return store } +// Used for test only +func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha1.InferencePool) Datastore { + store := NewDatastore() + if pods != nil { + 
store.(*datastore).pods = pods + } + if models != nil { + store.(*datastore).models = models + } + if pool != nil { + store.(*datastore).pool = pool + } + return store +} + type datastore struct { // poolMu is used to synchronize access to the inferencePool. poolMu sync.RWMutex @@ -249,3 +264,16 @@ func IsCritical(model *v1alpha1.InferenceModel) bool { } return false } + +// TODO: move out to share with pod_reconciler.go +func podIsReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + if condition.Status == corev1.ConditionTrue { + return true + } + break + } + } + return false +} diff --git a/pkg/ext-proc/backend/datastore_test.go b/pkg/ext-proc/datastore/datastore_test.go similarity index 99% rename from pkg/ext-proc/backend/datastore_test.go rename to pkg/ext-proc/datastore/datastore_test.go index b44de0a5..6c5874df 100644 --- a/pkg/ext-proc/backend/datastore_test.go +++ b/pkg/ext-proc/datastore/datastore_test.go @@ -1,4 +1,4 @@ -package backend +package datastore import ( "testing" diff --git a/pkg/ext-proc/backend/types.go b/pkg/ext-proc/datastore/types.go similarity index 91% rename from pkg/ext-proc/backend/types.go rename to pkg/ext-proc/datastore/types.go index 0e02fb09..221c6630 100644 --- a/pkg/ext-proc/backend/types.go +++ b/pkg/ext-proc/datastore/types.go @@ -1,5 +1,5 @@ -// Package backend is a library to interact with backend model servers such as probing metrics. -package backend +// Package datastore is a library to interact with backend model servers such as probing metrics. 
+package datastore import ( "fmt" diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 5edb2e77..b3ef08e0 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -11,7 +11,7 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -53,7 +53,7 @@ func (s *Server) HandleRequestBody( return nil, fmt.Errorf("error finding a model object in InferenceModel for input %v", model) } if len(modelObj.Spec.TargetModels) > 0 { - modelName = backend.RandomWeightedDraw(logger, modelObj, 0) + modelName = datastore.RandomWeightedDraw(logger, modelObj, 0) if modelName == "" { return nil, fmt.Errorf("error getting target model name for model %v", modelObj.Name) } @@ -61,7 +61,7 @@ func (s *Server) HandleRequestBody( llmReq := &scheduling.LLMRequest{ Model: model, ResolvedTargetModel: modelName, - Critical: backend.IsCritical(modelObj), + Critical: datastore.IsCritical(modelObj), } loggerVerbose.Info("LLM request assembled", "request", llmReq) diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index fe00ebeb..05de0c42 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -11,13 +11,13 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) -func NewServer(scheduler Scheduler, targetEndpointKey string, datastore backend.Datastore) *Server { +func NewServer(scheduler Scheduler, targetEndpointKey string, datastore datastore.Datastore) *Server { return &Server{ scheduler: scheduler, targetEndpointKey: targetEndpointKey, @@ -32,11 +32,11 @@ type Server struct { // The key of the header to specify the target pod address. This value needs to match Envoy // configuration. targetEndpointKey string - datastore backend.Datastore + datastore datastore.Datastore } type Scheduler interface { - Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backend.PodMetrics, err error) + Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod datastore.PodMetrics, err error) } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { diff --git a/pkg/ext-proc/health.go b/pkg/ext-proc/health.go index 59aec348..525440cb 100644 --- a/pkg/ext-proc/health.go +++ b/pkg/ext-proc/health.go @@ -7,13 +7,13 @@ import ( "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type healthServer struct { logger logr.Logger - datastore backend.Datastore + datastore datastore.Datastore } func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { diff --git a/pkg/ext-proc/main.go b/pkg/ext-proc/main.go index 8e588673..d43f2c57 100644 --- a/pkg/ext-proc/main.go +++ b/pkg/ext-proc/main.go @@ -26,6 +26,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" @@ -125,7 +126,7 @@ func run() error { } // Setup runner. - datastore := backend.NewDatastore() + datastore := datastore.NewDatastore() provider := backend.NewProvider(&vllm.PodMetricsClientImpl{}, datastore) serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, @@ -189,7 +190,7 @@ func initLogging(opts *zap.Options) { } // registerHealthServer adds the Health gRPC server as a Runnable to the given manager. -func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds backend.Datastore, port int) error { +func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore.Datastore, port int) error { srv := grpc.NewServer() healthPb.RegisterHealthServer(srv, &healthServer{ logger: logger, diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index e028c59a..4d53e720 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -5,13 +5,13 @@ import ( "math" "github.com/go-logr/logr" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) type Filter interface { Name() string - Filter(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) + Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) } // filter applies current filterFunc, and then recursively applies next filters depending success or @@ 
-41,7 +41,7 @@ func (f *filter) Name() string { return f.name } -func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { +func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { loggerTrace := logger.V(logutil.TRACE) loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods)) @@ -74,12 +74,12 @@ func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*backend.Pod } // filterFunc filters a set of input pods to a subset. -type filterFunc func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) +type filterFunc func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. func toFilterFunc(pp podPredicate) filterFunc { - return func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { - filtered := []*backend.PodMetrics{} + return func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + filtered := []*datastore.PodMetrics{} for _, pod := range pods { pass := pp(req, pod) if pass { @@ -100,10 +100,10 @@ func toFilterFunc(pp podPredicate) filterFunc { // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. 
-func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { +func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { min := math.MaxInt max := 0 - filtered := []*backend.PodMetrics{} + filtered := []*datastore.PodMetrics{} for _, pod := range pods { if pod.WaitingQueueSize <= min { @@ -122,7 +122,7 @@ func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend return filtered, nil } -func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool { +func lowQueueingPodPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { return pod.WaitingQueueSize < queueingThresholdLoRA } @@ -132,10 +132,10 @@ func lowQueueingPodPredicate(_ *LLMRequest, pod *backend.PodMetrics) bool { // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. // TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { +func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { min := math.MaxFloat64 var max float64 = 0 - filtered := []*backend.PodMetrics{} + filtered := []*datastore.PodMetrics{} for _, pod := range pods { if pod.KVCacheUsagePercent <= min { @@ -155,35 +155,35 @@ func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*backend } // podPredicate is a filter function to check whether a pod is desired. 
-type podPredicate func(req *LLMRequest, pod *backend.PodMetrics) bool +type podPredicate func(req *LLMRequest, pod *datastore.PodMetrics) bool // We consider serving an adapter low cost it the adapter is active in the model server, or the // model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by // spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to // a single pod. This gave good performance in our initial benchmarking results in the scenario // where # of lora slots > # of lora adapters. -func lowLoRACostPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { +func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { _, ok := pod.ActiveModels[req.ResolvedTargetModel] return ok || len(pod.ActiveModels) < pod.MaxActiveModels } // loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested. -func loRAAffinityPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { +func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { _, ok := pod.ActiveModels[req.ResolvedTargetModel] return ok } // canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
-func canAcceptNewLoraPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { +func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { return len(pod.ActiveModels) < pod.MaxActiveModels } -func criticalRequestPredicate(req *LLMRequest, pod *backend.PodMetrics) bool { +func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { return req.Critical } func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate { - return func(req *LLMRequest, pod *backend.PodMetrics) bool { + return func(req *LLMRequest, pod *datastore.PodMetrics) bool { return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold } } diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index 9ed781c4..b2ae4b89 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -7,7 +7,7 @@ import ( "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -17,14 +17,14 @@ func TestFilter(t *testing.T) { tests := []struct { name string req *LLMRequest - input []*backend.PodMetrics - output []*backend.PodMetrics + input []*datastore.PodMetrics + output []*datastore.PodMetrics err bool filter *filter }{ { name: "simple filter without successor, failure", - filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { return nil, errors.New("filter error") }}, err: true, @@ -39,10 +39,10 @@ func TestFilter(t *testing.T) { }, // pod2 will be picked because it 
has relatively low queue size, with the requested // model being active, and has low KV cache. - input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -53,8 +53,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -65,8 +65,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -76,10 +76,10 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -100,10 +100,10 @@ func TestFilter(t *testing.T) { Critical: false, }, // pod1 will be picked because it has capacity for the sheddable request. 
- input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -114,8 +114,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -126,8 +126,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -137,10 +137,10 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -162,10 +162,10 @@ func TestFilter(t *testing.T) { }, // All pods have higher KV cache thant the threshold, so the sheddable request will be // dropped. 
- input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, MaxActiveModels: 2, @@ -176,8 +176,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.85, MaxActiveModels: 2, @@ -188,8 +188,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: backend.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: backend.Metrics{ + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.85, MaxActiveModels: 2, @@ -199,7 +199,7 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*backend.PodMetrics{}, + output: []*datastore.PodMetrics{}, err: true, }, } @@ -225,44 +225,44 @@ func TestFilterFunc(t *testing.T) { name string f filterFunc req *LLMRequest - input []*backend.PodMetrics - output []*backend.PodMetrics + input []*datastore.PodMetrics + output []*datastore.PodMetrics err bool }{ { name: "least queuing empty input", f: leastQueuingFilterFunc, - input: []*backend.PodMetrics{}, - output: []*backend.PodMetrics{}, + input: []*datastore.PodMetrics{}, + output: []*datastore.PodMetrics{}, }, { name: "least queuing", f: leastQueuingFilterFunc, - input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 3, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 10, }, }, }, - output: 
[]*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 3, }, }, @@ -271,37 +271,37 @@ func TestFilterFunc(t *testing.T) { { name: "least kv cache empty input", f: leastKVCacheFilterFunc, - input: []*backend.PodMetrics{}, - output: []*backend.PodMetrics{}, + input: []*datastore.PodMetrics{}, + output: []*datastore.PodMetrics{}, }, { name: "least kv cache", f: leastKVCacheFilterFunc, - input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ KVCacheUsagePercent: 0, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ KVCacheUsagePercent: 0.3, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ KVCacheUsagePercent: 1.0, }, }, }, - output: []*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ KVCacheUsagePercent: 0, }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ KVCacheUsagePercent: 0.3, }, }, @@ -310,32 +310,32 @@ func TestFilterFunc(t *testing.T) { { name: "noQueueAndLessThanKVCacheThresholdPredicate", f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), - input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ { // This pod should be returned. - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, }, }, { // Queue is non zero, despite low kv cache, should not return. 
- Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.3, }, }, { // High kv cache despite zero queue, should not return - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 1.0, }, }, }, - output: []*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, }, @@ -349,10 +349,10 @@ func TestFilterFunc(t *testing.T) { Model: "model", ResolvedTargetModel: "model", }, - input: []*backend.PodMetrics{ + input: []*datastore.PodMetrics{ // ActiveModels include input model, should be returned. { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "model": 1, @@ -361,7 +361,7 @@ func TestFilterFunc(t *testing.T) { }, // Input model is not active, however the server has room to load another adapter. { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "another-model": 1, @@ -370,7 +370,7 @@ func TestFilterFunc(t *testing.T) { }, // Input is not active, and the server has reached max active models. 
{ - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "foo": 1, @@ -379,9 +379,9 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*backend.PodMetrics{ + output: []*datastore.PodMetrics{ { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "model": 1, @@ -389,7 +389,7 @@ func TestFilterFunc(t *testing.T) { }, }, { - Metrics: backend.Metrics{ + Metrics: datastore.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "another-model": 1, diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 354bd39c..1e56fee3 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -10,7 +10,7 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -84,16 +84,16 @@ var ( // request to make room for critical requests. 
nextOnFailure: &filter{ name: "drop request", - filter: func(logger logr.Logger, req *LLMRequest, pods []*backend.PodMetrics) ([]*backend.PodMetrics, error) { + filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) - return []*backend.PodMetrics{}, status.Errorf( + return []*datastore.PodMetrics{}, status.Errorf( codes.ResourceExhausted, "dropping request due to limited backend resources") }, }, } ) -func NewScheduler(datastore backend.Datastore) *Scheduler { +func NewScheduler(datastore datastore.Datastore) *Scheduler { return &Scheduler{ datastore: datastore, filter: defaultFilter, @@ -101,18 +101,18 @@ func NewScheduler(datastore backend.Datastore) *Scheduler { } type Scheduler struct { - datastore backend.Datastore + datastore datastore.Datastore filter Filter } // Schedule finds the target pod based on metrics and the requested lora adapter. -func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backend.PodMetrics, err error) { +func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod datastore.PodMetrics, err error) { logger := log.FromContext(ctx).WithValues("request", req) podMetrics := s.datastore.PodGetAll() logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", podMetrics) pods, err := s.filter.Filter(logger, req, podMetrics) if err != nil || len(pods) == 0 { - return backend.PodMetrics{}, fmt.Errorf( + return datastore.PodMetrics{}, fmt.Errorf( "failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods) diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 073c30df..7b0209a6 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -20,6 +20,8 @@ import ( ctrl 
"sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/controller" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" @@ -33,7 +35,7 @@ type ExtProcServerRunner struct { PoolNamespace string RefreshMetricsInterval time.Duration RefreshPrometheusMetricsInterval time.Duration - Datastore backend.Datastore + Datastore datastore.Datastore Provider *backend.Provider SecureServing bool CertPath string @@ -66,7 +68,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { // SetupWithManager sets up the runner with the given manager. func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { // Create the controllers and register them with the manager - if err := (&backend.InferencePoolReconciler{ + if err := (&controller.InferencePoolReconciler{ Datastore: r.Datastore, Scheme: mgr.GetScheme(), Client: mgr.GetClient(), @@ -79,7 +81,7 @@ func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err) } - if err := (&backend.InferenceModelReconciler{ + if err := (&controller.InferenceModelReconciler{ Datastore: r.Datastore, Scheme: mgr.GetScheme(), Client: mgr.GetClient(), @@ -92,7 +94,7 @@ func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } - if err := (&backend.PodReconciler{ + if err := (&controller.PodReconciler{ Datastore: r.Datastore, Scheme: mgr.GetScheme(), Client: mgr.GetClient(), diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index 
a48f0465..3820998d 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -16,7 +16,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" @@ -104,8 +104,8 @@ func fakeModels() map[string]*v1alpha1.InferenceModel { return models } -func fakePods() []*backend.PodMetrics { - pms := make([]*backend.PodMetrics, 0, *numFakePods) +func fakePods() []*datastore.PodMetrics { + pms := make([]*datastore.PodMetrics, 0, *numFakePods) for i := 0; i < *numFakePods; i++ { pms = append(pms, test.FakePodMetrics(i, fakeMetrics(i))) } @@ -114,8 +114,8 @@ func fakePods() []*backend.PodMetrics { } // fakeMetrics adds numModelsPerPod number of adapters to the pod metrics. 
-func fakeMetrics(podNumber int) backend.Metrics { - metrics := backend.Metrics{ +func fakeMetrics(podNumber int) datastore.Metrics { + metrics := datastore.Metrics{ ActiveModels: make(map[string]int), } for i := 0; i < *numModelsPerPod; i++ { diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index 46affae9..a2d833e0 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -15,6 +15,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" @@ -25,16 +26,16 @@ func StartExtProc( ctx context.Context, port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, - pods []*backend.PodMetrics, + pods []*datastore.PodMetrics, models map[string]*v1alpha1.InferenceModel, ) *grpc.Server { logger := log.FromContext(ctx) - pms := make(map[types.NamespacedName]*backend.PodMetrics) + pms := make(map[types.NamespacedName]*datastore.PodMetrics) for _, pod := range pods { pms[pod.NamespacedName] = pod } pmc := &backend.FakePodMetricsClient{Res: pms} - datastore := backend.NewDatastore() + datastore := datastore.NewDatastore() for _, m := range models { datastore.ModelSet(m) } @@ -54,7 +55,7 @@ func StartExtProc( } // startExtProc starts an extProc server with fake pods. 
-func startExtProc(logger logr.Logger, port int, datastore backend.Datastore) *grpc.Server { +func startExtProc(logger logr.Logger, port int, datastore datastore.Datastore) *grpc.Server { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) if err != nil { logutil.Fatal(logger, err, "Failed to listen", "port", port) @@ -95,10 +96,10 @@ func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.Proces return req } -func FakePodMetrics(index int, metrics backend.Metrics) *backend.PodMetrics { +func FakePodMetrics(index int, metrics datastore.Metrics) *datastore.PodMetrics { address := fmt.Sprintf("address-%v", index) - pod := backend.PodMetrics{ - Pod: backend.Pod{ + pod := datastore.PodMetrics{ + Pod: datastore.Pod{ NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index)}, Address: address, }, diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 0e30ac69..89fc02d7 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -32,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" @@ -55,7 +56,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { tests := []struct { name string req *extProcPb.ProcessingRequest - pods []*backend.PodMetrics + pods []*datastore.PodMetrics wantHeaders []*configPb.HeaderValueOption wantMetadata *structpb.Struct wantBody []byte @@ -66,16 +67,16 @@ func TestKubeInferenceModelRequest(t *testing.T) { name: "select lower queue and kv cache, no active lora", req: extprocutils.GenerateRequest(logger, "test1", 
"my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. - pods: []*backend.PodMetrics{ - extprocutils.FakePodMetrics(0, backend.Metrics{ + pods: []*datastore.PodMetrics{ + extprocutils.FakePodMetrics(0, datastore.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.2, }), - extprocutils.FakePodMetrics(1, backend.Metrics{ + extprocutils.FakePodMetrics(1, datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.1, }), - extprocutils.FakePodMetrics(2, backend.Metrics{ + extprocutils.FakePodMetrics(2, datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, }), @@ -111,8 +112,8 @@ func TestKubeInferenceModelRequest(t *testing.T) { req: extprocutils.GenerateRequest(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. - pods: []*backend.PodMetrics{ - extprocutils.FakePodMetrics(0, backend.Metrics{ + pods: []*datastore.PodMetrics{ + extprocutils.FakePodMetrics(0, datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -120,7 +121,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, }, }), - extprocutils.FakePodMetrics(1, backend.Metrics{ + extprocutils.FakePodMetrics(1, datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ @@ -128,7 +129,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg2": 1, }, }), - extprocutils.FakePodMetrics(2, backend.Metrics{ + extprocutils.FakePodMetrics(2, datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -168,8 +169,8 @@ func TestKubeInferenceModelRequest(t *testing.T) { // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. 
Also is critical, so we should // still honor request despite all queues > 5 - pods: []*backend.PodMetrics{ - extprocutils.FakePodMetrics(0, backend.Metrics{ + pods: []*datastore.PodMetrics{ + extprocutils.FakePodMetrics(0, datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -177,7 +178,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, }, }), - extprocutils.FakePodMetrics(1, backend.Metrics{ + extprocutils.FakePodMetrics(1, datastore.Metrics{ WaitingQueueSize: 50, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ @@ -185,7 +186,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg2": 1, }, }), - extprocutils.FakePodMetrics(2, backend.Metrics{ + extprocutils.FakePodMetrics(2, datastore.Metrics{ WaitingQueueSize: 6, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -224,8 +225,8 @@ func TestKubeInferenceModelRequest(t *testing.T) { req: extprocutils.GenerateRequest(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. 
- pods: []*backend.PodMetrics{ - extprocutils.FakePodMetrics(0, backend.Metrics{ + pods: []*datastore.PodMetrics{ + extprocutils.FakePodMetrics(0, datastore.Metrics{ WaitingQueueSize: 6, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -234,7 +235,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg3": 1, }, }), - extprocutils.FakePodMetrics(1, backend.Metrics{ + extprocutils.FakePodMetrics(1, datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.85, ActiveModels: map[string]int{ @@ -242,7 +243,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg3": 1, }, }), - extprocutils.FakePodMetrics(2, backend.Metrics{ + extprocutils.FakePodMetrics(2, datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, ActiveModels: map[string]int{ @@ -265,8 +266,8 @@ func TestKubeInferenceModelRequest(t *testing.T) { name: "noncritical, but one server has capacity, do not shed", req: extprocutils.GenerateRequest(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold - pods: []*backend.PodMetrics{ - extprocutils.FakePodMetrics(0, backend.Metrics{ + pods: []*datastore.PodMetrics{ + extprocutils.FakePodMetrics(0, datastore.Metrics{ WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -275,7 +276,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg3": 1, }, }), - extprocutils.FakePodMetrics(1, backend.Metrics{ + extprocutils.FakePodMetrics(1, datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.85, ActiveModels: map[string]int{ @@ -283,7 +284,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "sql-lora-1fdg3": 1, }, }), - extprocutils.FakePodMetrics(2, backend.Metrics{ + extprocutils.FakePodMetrics(2, datastore.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, ActiveModels: map[string]int{ @@ -364,8 +365,8 @@ func TestKubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(podMetrics 
[]*backend.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - pms := make(map[types.NamespacedName]*backend.PodMetrics) +func setUpHermeticServer(podMetrics []*datastore.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + pms := make(map[types.NamespacedName]*datastore.PodMetrics) for _, pm := range podMetrics { pms[pm.NamespacedName] = pm } @@ -441,7 +442,7 @@ func BeforeSuit(t *testing.T) func() { serverRunner = runserver.NewDefaultExtProcServerRunner() // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" - serverRunner.Datastore = backend.NewDatastore() + serverRunner.Datastore = datastore.NewDatastore() serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(mgr); err != nil { From cab2472a1238dcdd8993e5a6c5e3e00686555852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Wed, 19 Feb 2025 16:14:27 +0100 Subject: [PATCH 33/96] Move pkg/ext-proc -> cmd/ext-proc (#368) * Move pkg/ext-proc -> cmd/ext-proc * Rework Dockerfile - Cache dependencies - Upload only the files needed --- Dockerfile | 11 +++++++++-- {pkg => cmd}/ext-proc/health.go | 0 {pkg => cmd}/ext-proc/main.go | 2 +- go.mod | 2 +- {pkg/ext-proc/internal => internal}/runnable/grpc.go | 0 .../internal => internal}/runnable/leader_election.go | 0 pkg/ext-proc/server/runserver.go | 2 +- 7 files changed, 12 insertions(+), 5 deletions(-) rename {pkg => cmd}/ext-proc/health.go (100%) rename {pkg => cmd}/ext-proc/main.go (99%) rename {pkg/ext-proc/internal => internal}/runnable/grpc.go (100%) rename {pkg/ext-proc/internal => internal}/runnable/leader_election.go (100%) diff --git a/Dockerfile b/Dockerfile index e854e133..5d6f08a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,10 +9,17 @@ ENV CGO_ENABLED=0 ENV GOOS=linux ENV GOARCH=amd64 +# Dependencies WORKDIR /src -COPY . . 
-WORKDIR /src/pkg/ext-proc +COPY go.mod go.sum ./ RUN go mod download + +# Sources +COPY cmd ./cmd +COPY pkg ./pkg +COPY internal ./internal +COPY api ./api +WORKDIR /src/cmd/ext-proc RUN go build -o /ext-proc ## Multistage deploy diff --git a/pkg/ext-proc/health.go b/cmd/ext-proc/health.go similarity index 100% rename from pkg/ext-proc/health.go rename to cmd/ext-proc/health.go diff --git a/pkg/ext-proc/main.go b/cmd/ext-proc/main.go similarity index 99% rename from pkg/ext-proc/main.go rename to cmd/ext-proc/main.go index d43f2c57..fa4f5b4c 100644 --- a/pkg/ext-proc/main.go +++ b/cmd/ext-proc/main.go @@ -24,10 +24,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" diff --git a/go.mod b/go.mod index 25daf027..ca4a1633 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,6 @@ require ( k8s.io/client-go v0.32.2 k8s.io/code-generator v0.32.2 k8s.io/component-base v0.32.2 - k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.2 sigs.k8s.io/structured-merge-diff/v4 v4.5.0 @@ -137,6 +136,7 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.32.2 // indirect k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect + k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect 
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect sigs.k8s.io/controller-tools v0.14.0 // indirect diff --git a/pkg/ext-proc/internal/runnable/grpc.go b/internal/runnable/grpc.go similarity index 100% rename from pkg/ext-proc/internal/runnable/grpc.go rename to internal/runnable/grpc.go diff --git a/pkg/ext-proc/internal/runnable/leader_election.go b/internal/runnable/leader_election.go similarity index 100% rename from pkg/ext-proc/internal/runnable/leader_election.go rename to internal/runnable/leader_election.go diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index 7b0209a6..eb6b2cf7 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -19,11 +19,11 @@ import ( "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/controller" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" ) From 435a40d25b2e60fef5bac9188d03a712ecb77f53 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 19 Feb 2025 17:36:27 +0200 Subject: [PATCH 34/96] added license header to all .go files (#370) Signed-off-by: Nir Rozenbaum --- cmd/ext-proc/health.go | 16 ++++++++++++++++ cmd/ext-proc/main.go | 16 ++++++++++++++++ pkg/ext-proc/backend/fake.go | 16 ++++++++++++++++ pkg/ext-proc/backend/provider.go | 16 ++++++++++++++++ pkg/ext-proc/backend/provider_test.go | 16 ++++++++++++++++ pkg/ext-proc/backend/vllm/metrics.go | 16 ++++++++++++++++ pkg/ext-proc/backend/vllm/metrics_test.go | 16 ++++++++++++++++ 
.../controller/inferencemodel_reconciler.go | 16 ++++++++++++++++ .../controller/inferencemodel_reconciler_test.go | 16 ++++++++++++++++ .../controller/inferencepool_reconciler.go | 16 ++++++++++++++++ .../controller/inferencepool_reconciler_test.go | 16 ++++++++++++++++ pkg/ext-proc/controller/pod_reconciler.go | 16 ++++++++++++++++ pkg/ext-proc/controller/pod_reconciler_test.go | 16 ++++++++++++++++ pkg/ext-proc/datastore/datastore.go | 16 ++++++++++++++++ pkg/ext-proc/datastore/datastore_test.go | 16 ++++++++++++++++ pkg/ext-proc/datastore/types.go | 16 ++++++++++++++++ pkg/ext-proc/handlers/request.go | 16 ++++++++++++++++ pkg/ext-proc/handlers/response.go | 16 ++++++++++++++++ pkg/ext-proc/handlers/response_test.go | 16 ++++++++++++++++ pkg/ext-proc/handlers/server.go | 16 ++++++++++++++++ pkg/ext-proc/metrics/metrics.go | 16 ++++++++++++++++ pkg/ext-proc/metrics/metrics_test.go | 16 ++++++++++++++++ pkg/ext-proc/scheduling/filter.go | 16 ++++++++++++++++ pkg/ext-proc/scheduling/filter_test.go | 16 ++++++++++++++++ pkg/ext-proc/scheduling/scheduler.go | 16 ++++++++++++++++ pkg/ext-proc/scheduling/types.go | 16 ++++++++++++++++ pkg/ext-proc/server/runserver.go | 16 ++++++++++++++++ pkg/ext-proc/server/runserver_test.go | 16 ++++++++++++++++ pkg/ext-proc/test/benchmark/benchmark.go | 16 ++++++++++++++++ pkg/ext-proc/test/utils.go | 16 ++++++++++++++++ pkg/ext-proc/util/logging/fatal.go | 16 ++++++++++++++++ pkg/ext-proc/util/logging/logger.go | 16 ++++++++++++++++ pkg/ext-proc/util/logging/logging_const.go | 16 ++++++++++++++++ pkg/ext-proc/util/testing/wrappers.go | 16 ++++++++++++++++ test/integration/hermetic_test.go | 16 ++++++++++++++++ 35 files changed, 560 insertions(+) diff --git a/cmd/ext-proc/health.go b/cmd/ext-proc/health.go index 525440cb..26a58df8 100644 --- a/cmd/ext-proc/health.go +++ b/cmd/ext-proc/health.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package main import ( diff --git a/cmd/ext-proc/main.go b/cmd/ext-proc/main.go index fa4f5b4c..047a1fa7 100644 --- a/cmd/ext-proc/main.go +++ b/cmd/ext-proc/main.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package main import ( diff --git a/pkg/ext-proc/backend/fake.go b/pkg/ext-proc/backend/fake.go index 2ddf2932..2de34c16 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/ext-proc/backend/fake.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + package backend import ( diff --git a/pkg/ext-proc/backend/provider.go b/pkg/ext-proc/backend/provider.go index 103659db..974319f7 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/ext-proc/backend/provider.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package backend import ( diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/ext-proc/backend/provider_test.go index 95936f7e..7736dd8d 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/ext-proc/backend/provider_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package backend import ( diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 4785e484..2fd03172 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // Package vllm provides vllm specific pod metrics implementation. package vllm diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/ext-proc/backend/vllm/metrics_test.go index 23121ad5..1c9d5448 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/ext-proc/backend/vllm/metrics_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package vllm import ( diff --git a/pkg/ext-proc/controller/inferencemodel_reconciler.go b/pkg/ext-proc/controller/inferencemodel_reconciler.go index a4622988..cca05fce 100644 --- a/pkg/ext-proc/controller/inferencemodel_reconciler.go +++ b/pkg/ext-proc/controller/inferencemodel_reconciler.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package controller import ( diff --git a/pkg/ext-proc/controller/inferencemodel_reconciler_test.go b/pkg/ext-proc/controller/inferencemodel_reconciler_test.go index c3ebb646..583f5f75 100644 --- a/pkg/ext-proc/controller/inferencemodel_reconciler_test.go +++ b/pkg/ext-proc/controller/inferencemodel_reconciler_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package controller import ( diff --git a/pkg/ext-proc/controller/inferencepool_reconciler.go b/pkg/ext-proc/controller/inferencepool_reconciler.go index 5c9e4969..b2cd01c0 100644 --- a/pkg/ext-proc/controller/inferencepool_reconciler.go +++ b/pkg/ext-proc/controller/inferencepool_reconciler.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package controller import ( diff --git a/pkg/ext-proc/controller/inferencepool_reconciler_test.go b/pkg/ext-proc/controller/inferencepool_reconciler_test.go index ec2fdfe1..925cb236 100644 --- a/pkg/ext-proc/controller/inferencepool_reconciler_test.go +++ b/pkg/ext-proc/controller/inferencepool_reconciler_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package controller import ( diff --git a/pkg/ext-proc/controller/pod_reconciler.go b/pkg/ext-proc/controller/pod_reconciler.go index 209d2ca7..871e1da5 100644 --- a/pkg/ext-proc/controller/pod_reconciler.go +++ b/pkg/ext-proc/controller/pod_reconciler.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package controller import ( diff --git a/pkg/ext-proc/controller/pod_reconciler_test.go b/pkg/ext-proc/controller/pod_reconciler_test.go index b146745a..2e62be28 100644 --- a/pkg/ext-proc/controller/pod_reconciler_test.go +++ b/pkg/ext-proc/controller/pod_reconciler_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package controller import ( diff --git a/pkg/ext-proc/datastore/datastore.go b/pkg/ext-proc/datastore/datastore.go index f85f9014..76b61e77 100644 --- a/pkg/ext-proc/datastore/datastore.go +++ b/pkg/ext-proc/datastore/datastore.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package datastore import ( diff --git a/pkg/ext-proc/datastore/datastore_test.go b/pkg/ext-proc/datastore/datastore_test.go index 6c5874df..f32d8d77 100644 --- a/pkg/ext-proc/datastore/datastore_test.go +++ b/pkg/ext-proc/datastore/datastore_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package datastore import ( diff --git a/pkg/ext-proc/datastore/types.go b/pkg/ext-proc/datastore/types.go index 221c6630..c21a3d2b 100644 --- a/pkg/ext-proc/datastore/types.go +++ b/pkg/ext-proc/datastore/types.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // Package datastore is a library to interact with backend model servers such as probing metrics. package datastore diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index b3ef08e0..7f6178d6 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package handlers import ( diff --git a/pkg/ext-proc/handlers/response.go b/pkg/ext-proc/handlers/response.go index 06da8106..afe7549b 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package handlers import ( diff --git a/pkg/ext-proc/handlers/response_test.go b/pkg/ext-proc/handlers/response_test.go index 67875e05..dbb7e700 100644 --- a/pkg/ext-proc/handlers/response_test.go +++ b/pkg/ext-proc/handlers/response_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package handlers import ( diff --git a/pkg/ext-proc/handlers/server.go b/pkg/ext-proc/handlers/server.go index 05de0c42..a5274275 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + package handlers import ( diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index e3226f47..a396f4ae 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package metrics import ( diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index d24afdb1..cf638b93 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package metrics import ( diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/ext-proc/scheduling/filter.go index 4d53e720..36691a73 100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/ext-proc/scheduling/filter.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package scheduling import ( diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/ext-proc/scheduling/filter_test.go index b2ae4b89..01909fea 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/ext-proc/scheduling/filter_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package scheduling import ( diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 1e56fee3..49402fb3 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // Package scheduling implements request scheduling algorithms. package scheduling diff --git a/pkg/ext-proc/scheduling/types.go b/pkg/ext-proc/scheduling/types.go index cfb9d3b8..29e6648d 100644 --- a/pkg/ext-proc/scheduling/types.go +++ b/pkg/ext-proc/scheduling/types.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package scheduling // LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. diff --git a/pkg/ext-proc/server/runserver.go b/pkg/ext-proc/server/runserver.go index eb6b2cf7..795b242d 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/ext-proc/server/runserver.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package server import ( diff --git a/pkg/ext-proc/server/runserver_test.go b/pkg/ext-proc/server/runserver_test.go index 32af2cd8..438dc096 100644 --- a/pkg/ext-proc/server/runserver_test.go +++ b/pkg/ext-proc/server/runserver_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package server_test import ( diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/ext-proc/test/benchmark/benchmark.go index 3820998d..dc06a27a 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/ext-proc/test/benchmark/benchmark.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + package main import ( diff --git a/pkg/ext-proc/test/utils.go b/pkg/ext-proc/test/utils.go index a2d833e0..ef83c932 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/ext-proc/test/utils.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package test import ( diff --git a/pkg/ext-proc/util/logging/fatal.go b/pkg/ext-proc/util/logging/fatal.go index 1f85b450..d8a9a937 100644 --- a/pkg/ext-proc/util/logging/fatal.go +++ b/pkg/ext-proc/util/logging/fatal.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package logging import ( diff --git a/pkg/ext-proc/util/logging/logger.go b/pkg/ext-proc/util/logging/logger.go index 086a012f..5e6ed88d 100644 --- a/pkg/ext-proc/util/logging/logger.go +++ b/pkg/ext-proc/util/logging/logger.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package logging import ( diff --git a/pkg/ext-proc/util/logging/logging_const.go b/pkg/ext-proc/util/logging/logging_const.go index a6131d18..823ab28b 100644 --- a/pkg/ext-proc/util/logging/logging_const.go +++ b/pkg/ext-proc/util/logging/logging_const.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package logging const ( diff --git a/pkg/ext-proc/util/testing/wrappers.go b/pkg/ext-proc/util/testing/wrappers.go index f9005499..7c9a2939 100644 --- a/pkg/ext-proc/util/testing/wrappers.go +++ b/pkg/ext-proc/util/testing/wrappers.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package testing import ( diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 89fc02d7..18efe7bf 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -1,3 +1,19 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + // Package test contains e2e tests for the ext proc while faking the backend pods. 
package integration From 0f67df5bcf2df64cf54b87f73a15262d89925310 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Thu, 20 Feb 2025 00:00:27 +0800 Subject: [PATCH 35/96] fix inference extension not correctly scrape pod metrics (#366) Signed-off-by: Kuromesi --- pkg/ext-proc/backend/vllm/metrics.go | 2 +- pkg/ext-proc/controller/pod_reconciler_test.go | 10 +++++----- pkg/ext-proc/datastore/datastore.go | 5 ++++- pkg/ext-proc/datastore/types.go | 10 ++++++++++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/ext-proc/backend/vllm/metrics.go index 2fd03172..59a132c8 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/ext-proc/backend/vllm/metrics.go @@ -61,7 +61,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. - url := fmt.Sprintf("http://%s/metrics", existing.Address) + url := existing.BuildScrapeEndpoint() req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) diff --git a/pkg/ext-proc/controller/pod_reconciler_test.go b/pkg/ext-proc/controller/pod_reconciler_test.go index 2e62be28..c87ee54d 100644 --- a/pkg/ext-proc/controller/pod_reconciler_test.go +++ b/pkg/ext-proc/controller/pod_reconciler_test.go @@ -36,10 +36,10 @@ import ( ) var ( - basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1"}} - basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} - basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3"}} - basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: 
types.NamespacedName{Name: "pod1"}, Address: "address-11"}} + basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1", ScrapePath: "/metrics", ScrapePort: 8000}} + basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2", ScrapePath: "/metrics", ScrapePort: 8000}} + basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3", ScrapePath: "/metrics", ScrapePort: 8000}} + basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11", ScrapePath: "/metrics", ScrapePort: 8000}} ) func TestUpdateDatastore_PodReconciler(t *testing.T) { @@ -278,7 +278,7 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { func populateMap(pods ...*datastore.PodMetrics) *sync.Map { newMap := &sync.Map{} for _, pod := range pods { - newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) + newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address, ScrapePort: pod.ScrapePort, ScrapePath: pod.ScrapePath}}) } return newMap } diff --git a/pkg/ext-proc/datastore/datastore.go b/pkg/ext-proc/datastore/datastore.go index 76b61e77..60236496 100644 --- a/pkg/ext-proc/datastore/datastore.go +++ b/pkg/ext-proc/datastore/datastore.go @@ -182,13 +182,16 @@ func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { } func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { + pool, _ := ds.PoolGet() new := &PodMetrics{ Pod: Pod{ NamespacedName: types.NamespacedName{ Name: pod.Name, Namespace: pod.Namespace, }, - Address: pod.Status.PodIP, + Address: pod.Status.PodIP, + ScrapePath: "/metrics", + ScrapePort: pool.Spec.TargetPortNumber, }, Metrics: Metrics{ ActiveModels: 
make(map[string]int), diff --git a/pkg/ext-proc/datastore/types.go b/pkg/ext-proc/datastore/types.go index c21a3d2b..237e98ca 100644 --- a/pkg/ext-proc/datastore/types.go +++ b/pkg/ext-proc/datastore/types.go @@ -26,6 +26,10 @@ import ( type Pod struct { NamespacedName types.NamespacedName Address string + + // metrics scrape options + ScrapePort int32 + ScrapePath string } type Metrics struct { @@ -57,6 +61,8 @@ func (pm *PodMetrics) Clone() *PodMetrics { Pod: Pod{ NamespacedName: pm.NamespacedName, Address: pm.Address, + ScrapePort: pm.ScrapePort, + ScrapePath: pm.ScrapePath, }, Metrics: Metrics{ ActiveModels: cm, @@ -68,3 +74,7 @@ func (pm *PodMetrics) Clone() *PodMetrics { } return clone } + +func (pm *PodMetrics) BuildScrapeEndpoint() string { + return fmt.Sprintf("http://%s:%d%s", pm.Address, pm.ScrapePort, pm.ScrapePath) +} From 9f346734d7e3c5d3b7eec835f606394006471772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Wed, 19 Feb 2025 17:18:27 +0100 Subject: [PATCH 36/96] Move pkg/manifests -> config/manifests (#371) --- {pkg => config}/manifests/ext_proc.yaml | 0 .../manifests/gateway/enable_patch_policy.yaml | 0 .../manifests/gateway/extension_policy.yaml | 0 {pkg => config}/manifests/gateway/gateway.yaml | 0 .../manifests/gateway/patch_policy.yaml | 0 .../manifests/gateway/traffic_policy.yaml | 0 {pkg => config}/manifests/inferencemodel.yaml | 0 {pkg => config}/manifests/vllm/deployment.yaml | 0 hack/release-quickstart.sh | 8 ++++---- site-src/guides/index.md | 16 ++++++++-------- test/e2e/e2e_suite_test.go | 4 ++-- 11 files changed, 14 insertions(+), 14 deletions(-) rename {pkg => config}/manifests/ext_proc.yaml (100%) rename {pkg => config}/manifests/gateway/enable_patch_policy.yaml (100%) rename {pkg => config}/manifests/gateway/extension_policy.yaml (100%) rename {pkg => config}/manifests/gateway/gateway.yaml (100%) rename {pkg => config}/manifests/gateway/patch_policy.yaml (100%) rename {pkg => 
config}/manifests/gateway/traffic_policy.yaml (100%) rename {pkg => config}/manifests/inferencemodel.yaml (100%) rename {pkg => config}/manifests/vllm/deployment.yaml (100%) diff --git a/pkg/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml similarity index 100% rename from pkg/manifests/ext_proc.yaml rename to config/manifests/ext_proc.yaml diff --git a/pkg/manifests/gateway/enable_patch_policy.yaml b/config/manifests/gateway/enable_patch_policy.yaml similarity index 100% rename from pkg/manifests/gateway/enable_patch_policy.yaml rename to config/manifests/gateway/enable_patch_policy.yaml diff --git a/pkg/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml similarity index 100% rename from pkg/manifests/gateway/extension_policy.yaml rename to config/manifests/gateway/extension_policy.yaml diff --git a/pkg/manifests/gateway/gateway.yaml b/config/manifests/gateway/gateway.yaml similarity index 100% rename from pkg/manifests/gateway/gateway.yaml rename to config/manifests/gateway/gateway.yaml diff --git a/pkg/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml similarity index 100% rename from pkg/manifests/gateway/patch_policy.yaml rename to config/manifests/gateway/patch_policy.yaml diff --git a/pkg/manifests/gateway/traffic_policy.yaml b/config/manifests/gateway/traffic_policy.yaml similarity index 100% rename from pkg/manifests/gateway/traffic_policy.yaml rename to config/manifests/gateway/traffic_policy.yaml diff --git a/pkg/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml similarity index 100% rename from pkg/manifests/inferencemodel.yaml rename to config/manifests/inferencemodel.yaml diff --git a/pkg/manifests/vllm/deployment.yaml b/config/manifests/vllm/deployment.yaml similarity index 100% rename from pkg/manifests/vllm/deployment.yaml rename to config/manifests/vllm/deployment.yaml diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index 
f4701508..a21047c3 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -36,9 +36,9 @@ sed -i.bak -E "s|(releases/download/)v[0-9]+\.[0-9]+\.0-rc\.?[0-9]+|\1${RELEASE_ sed -i.bak "s|kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd|kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/${RELEASE_TAG}/manifests.yaml|g" "$README" # ----------------------------------------------------------------------------- -# Update pkg/manifests/ext_proc.yaml +# Update config/manifests/ext_proc.yaml # ----------------------------------------------------------------------------- -EXT_PROC="pkg/manifests/ext_proc.yaml" +EXT_PROC="config/manifests/ext_proc.yaml" echo "Updating ${EXT_PROC} ..." # Update the EPP container tag. @@ -51,9 +51,9 @@ sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inferen sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EXT_PROC" # ----------------------------------------------------------------------------- -# Update pkg/manifests/vllm/deployment.yaml +# Update config/manifests/vllm/deployment.yaml # ----------------------------------------------------------------------------- -VLLM_DEPLOY="pkg/manifests/vllm/deployment.yaml" +VLLM_DEPLOY="config/manifests/vllm/deployment.yaml" echo "Updating ${VLLM_DEPLOY} ..." # Update the vLLM image version diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 34fff20c..4478128f 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -17,7 +17,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 
```bash kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/vllm/deployment.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/deployment.yaml ``` ### Install the Inference Extension CRDs @@ -31,14 +31,14 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/inferencemodel.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml ``` ### Update Envoy Gateway Config to enable Patch Policy** Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. To do this, simply run: ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/enable_patch_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system ``` Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. 
@@ -46,7 +46,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/gateway.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml ``` > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** @@ -59,13 +59,13 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy the Inference Extension and InferencePool ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/ext_proc.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml ``` ### Deploy Envoy Gateway Custom Policies ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/extension_policy.yaml - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/patch_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml ``` > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to 
experiment further. @@ -74,7 +74,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/pkg/manifests/gateway/traffic_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml ``` ### Try it out diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 4a0dd2a8..c4342775 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -69,7 +69,7 @@ const ( // clientManifest is the manifest for the client test resources. clientManifest = "../testdata/client.yaml" // modelServerManifest is the manifest for the model server test resources. - modelServerManifest = "../../pkg/manifests/vllm/deployment.yaml" + modelServerManifest = "../../config/manifests/vllm/deployment.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. modelServerSecretManifest = "../testdata/model-secret.yaml" // inferPoolManifest is the manifest for the inference pool CRD. @@ -77,7 +77,7 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../pkg/manifests/ext_proc.yaml" + inferExtManifest = "../../config/manifests/ext_proc.yaml" // envoyManifest is the manifest for the envoy proxy test resources. 
envoyManifest = "../testdata/envoy.yaml" ) From 6130ee0cd2a4f551002fc6e218615d7bf342cc44 Mon Sep 17 00:00:00 2001 From: Jeff Luo Date: Wed, 19 Feb 2025 13:58:27 -0500 Subject: [PATCH 37/96] [Metrics] Add request error metrics (#269) This change defines some general errors, the list might grow in the future if more finer error types are needed. --- pkg/ext-proc/handlers/request.go | 14 ++-- pkg/ext-proc/handlers/response.go | 40 +++++++++- pkg/ext-proc/handlers/server.go | 58 ++++++++++++-- pkg/ext-proc/metrics/README.md | 1 + pkg/ext-proc/metrics/metrics.go | 18 +++++ pkg/ext-proc/metrics/metrics_test.go | 77 +++++++++++++++++-- .../testdata/request_error_total_metric | 5 ++ pkg/ext-proc/scheduling/scheduler.go | 7 +- pkg/ext-proc/util/error/error.go | 34 ++++++++ 9 files changed, 229 insertions(+), 25 deletions(-) create mode 100644 pkg/ext-proc/metrics/testdata/request_error_total_metric create mode 100644 pkg/ext-proc/util/error/error.go diff --git a/pkg/ext-proc/handlers/request.go b/pkg/ext-proc/handlers/request.go index 7f6178d6..34db206d 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/ext-proc/handlers/request.go @@ -19,7 +19,6 @@ package handlers import ( "context" "encoding/json" - "errors" "fmt" "strconv" @@ -29,6 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -49,14 +49,14 @@ func (s *Server) HandleRequestBody( var rb map[string]interface{} if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") - return nil, fmt.Errorf("error unmarshaling request body: %v", err) + return nil, errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("error unmarshaling 
request body: %v", err)} } loggerVerbose.Info("Request body unmarshalled", "body", rb) // Resolve target models. model, ok := rb["model"].(string) if !ok { - return nil, errors.New("model not found in request") + return nil, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} } loggerVerbose.Info("Model requested", "model", model) modelName := model @@ -66,12 +66,12 @@ func (s *Server) HandleRequestBody( // are able to be requested by using their distinct name. modelObj, exist := s.datastore.ModelGet(model) if !exist { - return nil, fmt.Errorf("error finding a model object in InferenceModel for input %v", model) + return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} } if len(modelObj.Spec.TargetModels) > 0 { modelName = datastore.RandomWeightedDraw(logger, modelObj, 0) if modelName == "" { - return nil, fmt.Errorf("error getting target model name for model %v", modelObj.Name) + return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} } } llmReq := &scheduling.LLMRequest{ @@ -89,14 +89,14 @@ func (s *Server) HandleRequestBody( requestBody, err = json.Marshal(rb) if err != nil { logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return nil, fmt.Errorf("error marshaling request body: %v", err) + return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} } loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) } targetPod, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { - return nil, fmt.Errorf("failed to find target pod: %w", err) + return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} } logger.V(logutil.DEFAULT).Info("Request handled", diff --git a/pkg/ext-proc/handlers/response.go 
b/pkg/ext-proc/handlers/response.go index afe7549b..ed3082c5 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/ext-proc/handlers/response.go @@ -24,6 +24,7 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "sigs.k8s.io/controller-runtime/pkg/log" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -38,6 +39,43 @@ func (s *Server) HandleResponseHeaders( h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders) loggerVerbose.Info("Headers before", "headers", h) + // Example header + // { + // "ResponseHeaders": { + // "headers": [ + // { + // "key": ":status", + // "raw_value": "200" + // }, + // { + // "key": "date", + // "raw_value": "Thu, 30 Jan 2025 18:50:48 GMT" + // }, + // { + // "key": "server", + // "raw_value": "uvicorn" + // }, + // { + // "key": "content-type", + // "raw_value": "text/event-stream; charset=utf-8" + // }, + // { + // "key": "transfer-encoding", + // "raw_value": "chunked" + // } + // ] + // } + // } + for _, header := range h.ResponseHeaders.Headers.GetHeaders() { + if header.Key == "status" { + code := header.RawValue[0] + if string(code) != "200" { + reqCtx.ResponseStatusCode = errutil.ModelServerError + } + break + } + } + resp := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ResponseHeaders{ ResponseHeaders: &extProcPb.HeadersResponse{ @@ -99,7 +137,7 @@ func (s *Server) HandleResponseBody( res := Response{} if err := json.Unmarshal(body.ResponseBody.Body, &res); err != nil { - return nil, fmt.Errorf("unmarshaling response body: %v", err) + return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("unmarshaling response body: %v", err)} } reqCtx.Response = res reqCtx.ResponseSize = len(body.ResponseBody.Body) diff --git a/pkg/ext-proc/handlers/server.go 
b/pkg/ext-proc/handlers/server.go index a5274275..506eaa97 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/ext-proc/handlers/server.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -65,6 +66,18 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { // See https://github.com/envoyproxy/envoy/issues/17540. reqCtx := &RequestContext{} + // Create variable for error handling as each request should only report once for + // error metric. This doesn't cover the error "Cannot receive stream request" because + // such error might happen even the response is processed. + var err error + defer func(error) { + if reqCtx.ResponseStatusCode != "" { + metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode) + } else if err != nil { + metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err)) + } + }(err) + for { select { case <-ctx.Done(): @@ -72,11 +85,11 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { default: } - req, err := srv.Recv() - if err == io.EOF || errors.Is(err, context.Canceled) { + req, recvErr := srv.Recv() + if recvErr == io.EOF || errors.Is(recvErr, context.Canceled) { return nil } - if err != nil { + if recvErr != nil { // This error occurs very frequently, though it doesn't seem to have any impact. // TODO Figure out if we can remove this noise. 
loggerVerbose.Error(err, "Cannot receive stream request") @@ -113,12 +126,13 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v) return status.Error(codes.Unknown, "unknown request type") } + if err != nil { logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) - switch status.Code(err) { + switch errutil.CanonicalCode(err) { // This code can be returned by scheduler when there is no capacity for sheddable // requests. - case codes.ResourceExhausted: + case errutil.InferencePoolResourceExhausted: resp = &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_ImmediateResponse{ ImmediateResponse: &extProcPb.ImmediateResponse{ @@ -128,6 +142,38 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { }, }, } + // This code can be returned by when EPP processes the request and run into server-side errors. + case errutil.Internal: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_InternalServerError, + }, + }, + }, + } + // This code can be returned when users provide invalid json request. 
+ case errutil.BadRequest: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_BadRequest, + }, + }, + }, + } + case errutil.BadConfiguration: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_NotFound, + }, + }, + }, + } default: return status.Errorf(status.Code(err), "failed to handle request: %v", err) } @@ -139,6 +185,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } + } // RequestContext stores context information during the life time of an HTTP request. @@ -153,4 +200,5 @@ type RequestContext struct { Response Response ResponseSize int ResponseComplete bool + ResponseStatusCode string } diff --git a/pkg/ext-proc/metrics/README.md b/pkg/ext-proc/metrics/README.md index 8adfd94e..1f68a0bd 100644 --- a/pkg/ext-proc/metrics/README.md +++ b/pkg/ext-proc/metrics/README.md @@ -41,6 +41,7 @@ spec: | Metric name | Metric Type | Description | Labels | Status | | ------------|--------------| ----------- | ------ | ------ | | inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_request_error_total | Counter | The counter of request errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name>
`error_code`=<error-code> | ALPHA | | inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | | inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/ext-proc/metrics/metrics.go index a396f4ae..cc21d531 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/ext-proc/metrics/metrics.go @@ -44,6 +44,16 @@ var ( []string{"model_name", "target_model_name"}, ) + requestErrCounter = compbasemetrics.NewCounterVec( + &compbasemetrics.CounterOpts{ + Subsystem: InferenceModelComponent, + Name: "request_error_total", + Help: "Counter of inference model requests errors broken out for each model and target model.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name", "error_code"}, + ) + requestLatencies = compbasemetrics.NewHistogramVec( &compbasemetrics.HistogramOpts{ Subsystem: InferenceModelComponent, @@ -139,6 +149,7 @@ var registerMetrics sync.Once func Register() { registerMetrics.Do(func() { legacyregistry.MustRegister(requestCounter) + legacyregistry.MustRegister(requestErrCounter) legacyregistry.MustRegister(requestLatencies) legacyregistry.MustRegister(requestSizes) legacyregistry.MustRegister(responseSizes) @@ -155,6 +166,13 @@ func RecordRequestCounter(modelName, targetModelName string) { requestCounter.WithLabelValues(modelName, targetModelName).Inc() } +// RecordRequestErrCounter records the number of error requests. +func RecordRequestErrCounter(modelName, targetModelName string, code string) { + if code != "" { + requestErrCounter.WithLabelValues(modelName, targetModelName, code).Inc() + } +} + // RecordRequestSizes records the request sizes. 
func RecordRequestSizes(modelName, targetModelName string, reqSize int) { requestSizes.WithLabelValues(modelName, targetModelName).Observe(float64(reqSize)) diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/ext-proc/metrics/metrics_test.go index cf638b93..2e891066 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/ext-proc/metrics/metrics_test.go @@ -24,18 +24,20 @@ import ( "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) const ( - RequestTotalMetric = InferenceModelComponent + "_request_total" - RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" - RequestSizesMetric = InferenceModelComponent + "_request_sizes" - ResponseSizesMetric = InferenceModelComponent + "_response_sizes" - InputTokensMetric = InferenceModelComponent + "_input_tokens" - OutputTokensMetric = InferenceModelComponent + "_output_tokens" - KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" - QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" + RequestTotalMetric = InferenceModelComponent + "_request_total" + RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" + RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceModelComponent + "_request_sizes" + ResponseSizesMetric = InferenceModelComponent + "_response_sizes" + InputTokensMetric = InferenceModelComponent + "_input_tokens" + OutputTokensMetric = InferenceModelComponent + "_output_tokens" + KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" + QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" ) func TestRecordRequestCounterandSizes(t *testing.T) { @@ -107,6 +109,65 @@ func TestRecordRequestCounterandSizes(t *testing.T) { } } +func 
TestRecordRequestErrorCounter(t *testing.T) { + type requests struct { + modelName string + targetModelName string + error string + } + scenarios := []struct { + name string + reqs []requests + invalid bool + }{{ + name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t11", + error: errutil.ModelServerError, + }, + { + modelName: "m20", + targetModelName: "t20", + error: errutil.InferencePoolResourceExhausted, + }, + }, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + RecordRequestErrCounter(req.modelName, req.targetModelName, req.error) + } + + wantRequestErrorCounter, err := os.Open("testdata/request_error_total_metric") + defer func() { + if err := wantRequestErrorCounter.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantRequestErrorCounter, RequestErrorTotalMetric); err != nil { + t.Error(err) + } + }) + } +} + func TestRecordRequestLatencies(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) timeBaseline := time.Now() diff --git a/pkg/ext-proc/metrics/testdata/request_error_total_metric b/pkg/ext-proc/metrics/testdata/request_error_total_metric new file mode 100644 index 00000000..31036eb6 --- /dev/null +++ b/pkg/ext-proc/metrics/testdata/request_error_total_metric @@ -0,0 +1,5 @@ +# HELP inference_model_request_error_total [ALPHA] Counter of inference model requests errors broken out for each model and target model. 
+# TYPE inference_model_request_error_total counter +inference_model_request_error_total{error_code="Internal", model_name="m10",target_model_name="t10"} 2 +inference_model_request_error_total{error_code="ModelServerError", model_name="m10",target_model_name="t11"} 1 +inference_model_request_error_total{error_code="InferencePoolResourceExhausted", model_name="m20",target_model_name="t20"} 1 diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/ext-proc/scheduling/scheduler.go index 49402fb3..b5f2f4f2 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/ext-proc/scheduling/scheduler.go @@ -23,10 +23,9 @@ import ( "math/rand" "github.com/go-logr/logr" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" ) @@ -102,8 +101,8 @@ var ( name: "drop request", filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) - return []*datastore.PodMetrics{}, status.Errorf( - codes.ResourceExhausted, "dropping request due to limited backend resources") + return []*datastore.PodMetrics{}, errutil.Error{ + Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources"} }, }, } diff --git a/pkg/ext-proc/util/error/error.go b/pkg/ext-proc/util/error/error.go new file mode 100644 index 00000000..2f9c992c --- /dev/null +++ b/pkg/ext-proc/util/error/error.go @@ -0,0 +1,34 @@ +package error + +import ( + "fmt" +) + +// Error is an error struct for errors returned by the epp server. 
+type Error struct { + Code string + Msg string +} + +const ( + Unknown = "Unknown" + BadRequest = "BadRequest" + Internal = "Internal" + ModelServerError = "ModelServerError" + BadConfiguration = "BadConfiguration" + InferencePoolResourceExhausted = "InferencePoolResourceExhausted" +) + +// Error returns a string version of the error. +func (e Error) Error() string { + return fmt.Sprintf("inference gateway: %s - %s", e.Code, e.Msg) +} + +// CanonicalCode returns the error's ErrorCode. +func CanonicalCode(err error) string { + e, ok := err.(Error) + if ok { + return e.Code + } + return Unknown +} From 2577f63f6a1c94292393a9750975320f2462485d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Kupka?= Date: Wed, 19 Feb 2025 22:34:27 +0100 Subject: [PATCH 38/96] Rename pkg/ext-proc to pkg/epp (#372) --- Dockerfile | 10 ++-- cmd/{ext-proc => epp}/health.go | 4 +- cmd/{ext-proc => epp}/main.go | 12 ++--- docs/dev.md | 2 +- .../003-endpoint-picker-protocol/README.md | 2 +- pkg/{ext-proc => epp}/backend/fake.go | 4 +- pkg/{ext-proc => epp}/backend/provider.go | 6 +-- .../backend/provider_test.go | 2 +- pkg/{ext-proc => epp}/backend/vllm/metrics.go | 4 +- .../backend/vllm/metrics_test.go | 4 +- .../controller/inferencemodel_reconciler.go | 4 +- .../inferencemodel_reconciler_test.go | 4 +- .../controller/inferencepool_reconciler.go | 4 +- .../inferencepool_reconciler_test.go | 4 +- .../controller/pod_reconciler.go | 4 +- .../controller/pod_reconciler_test.go | 2 +- pkg/{ext-proc => epp}/datastore/datastore.go | 2 +- .../datastore/datastore_test.go | 2 +- pkg/{ext-proc => epp}/datastore/types.go | 0 pkg/{ext-proc => epp}/handlers/request.go | 8 +-- pkg/{ext-proc => epp}/handlers/response.go | 4 +- .../handlers/response_test.go | 2 +- pkg/{ext-proc => epp}/handlers/server.go | 11 ++-- pkg/{ext-proc => epp}/metrics/README.md | 0 pkg/{ext-proc => epp}/metrics/metrics.go | 2 +- pkg/{ext-proc => epp}/metrics/metrics_test.go | 51 ++++++++++--------- 
.../metrics/testdata/input_tokens_metric | 0 .../metrics/testdata/kv_cache_avg_metrics | 0 .../metrics/testdata/output_tokens_metric | 0 .../metrics/testdata/queue_avg_size_metrics | 0 .../testdata/request_duration_seconds_metric | 0 .../testdata/request_error_total_metric | 0 .../metrics/testdata/request_sizes_metric | 0 .../metrics/testdata/request_total_metric | 0 .../metrics/testdata/response_sizes_metric | 0 pkg/{ext-proc => epp}/scheduling/filter.go | 4 +- .../scheduling/filter_test.go | 4 +- pkg/{ext-proc => epp}/scheduling/scheduler.go | 9 ++-- pkg/{ext-proc => epp}/scheduling/types.go | 0 pkg/{ext-proc => epp}/server/runserver.go | 10 ++-- .../server/runserver_test.go | 4 +- .../test/benchmark/benchmark.go | 8 +-- pkg/{ext-proc => epp}/test/utils.go | 12 ++--- pkg/{ext-proc => epp}/util/error/error.go | 0 pkg/{ext-proc => epp}/util/logging/fatal.go | 0 pkg/{ext-proc => epp}/util/logging/logger.go | 0 .../util/logging/logging_const.go | 0 .../util/testing/wrappers.go | 0 test/integration/hermetic_test.go | 12 ++--- tools/dashboards/README.md | 3 +- tools/dashboards/inference_gateway.json | 2 +- 51 files changed, 112 insertions(+), 110 deletions(-) rename cmd/{ext-proc => epp}/health.go (91%) rename cmd/{ext-proc => epp}/main.go (95%) rename pkg/{ext-proc => epp}/backend/fake.go (90%) rename pkg/{ext-proc => epp}/backend/provider.go (95%) rename pkg/{ext-proc => epp}/backend/provider_test.go (98%) rename pkg/{ext-proc => epp}/backend/vllm/metrics.go (97%) rename pkg/{ext-proc => epp}/backend/vllm/metrics_test.go (97%) rename pkg/{ext-proc => epp}/controller/inferencemodel_reconciler.go (95%) rename pkg/{ext-proc => epp}/controller/inferencemodel_reconciler_test.go (98%) rename pkg/{ext-proc => epp}/controller/inferencepool_reconciler.go (96%) rename pkg/{ext-proc => epp}/controller/inferencepool_reconciler_test.go (97%) rename pkg/{ext-proc => epp}/controller/pod_reconciler.go (95%) rename pkg/{ext-proc => epp}/controller/pod_reconciler_test.go (99%) rename 
pkg/{ext-proc => epp}/datastore/datastore.go (98%) rename pkg/{ext-proc => epp}/datastore/datastore_test.go (97%) rename pkg/{ext-proc => epp}/datastore/types.go (100%) rename pkg/{ext-proc => epp}/handlers/request.go (95%) rename pkg/{ext-proc => epp}/handlers/response.go (97%) rename pkg/{ext-proc => epp}/handlers/response_test.go (97%) rename pkg/{ext-proc => epp}/handlers/server.go (95%) rename pkg/{ext-proc => epp}/metrics/README.md (100%) rename pkg/{ext-proc => epp}/metrics/metrics.go (98%) rename pkg/{ext-proc => epp}/metrics/metrics_test.go (93%) rename pkg/{ext-proc => epp}/metrics/testdata/input_tokens_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/kv_cache_avg_metrics (100%) rename pkg/{ext-proc => epp}/metrics/testdata/output_tokens_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/queue_avg_size_metrics (100%) rename pkg/{ext-proc => epp}/metrics/testdata/request_duration_seconds_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/request_error_total_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/request_sizes_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/request_total_metric (100%) rename pkg/{ext-proc => epp}/metrics/testdata/response_sizes_metric (100%) rename pkg/{ext-proc => epp}/scheduling/filter.go (98%) rename pkg/{ext-proc => epp}/scheduling/filter_test.go (98%) rename pkg/{ext-proc => epp}/scheduling/scheduler.go (94%) rename pkg/{ext-proc => epp}/scheduling/types.go (100%) rename pkg/{ext-proc => epp}/server/runserver.go (95%) rename pkg/{ext-proc => epp}/server/runserver_test.go (87%) rename pkg/{ext-proc => epp}/test/benchmark/benchmark.go (93%) rename pkg/{ext-proc => epp}/test/utils.go (88%) rename pkg/{ext-proc => epp}/util/error/error.go (100%) rename pkg/{ext-proc => epp}/util/logging/fatal.go (100%) rename pkg/{ext-proc => epp}/util/logging/logger.go (100%) rename pkg/{ext-proc => epp}/util/logging/logging_const.go (100%) rename pkg/{ext-proc => epp}/util/testing/wrappers.go 
(100%) diff --git a/Dockerfile b/Dockerfile index 5d6f08a5..4adc82e4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ ARG BUILDER_IMAGE=golang:1.23-alpine ARG BASE_IMAGE=gcr.io/distroless/base-debian10 ## Multistage build -FROM ${BUILDER_IMAGE} as builder +FROM ${BUILDER_IMAGE} AS builder ENV CGO_ENABLED=0 ENV GOOS=linux ENV GOARCH=amd64 @@ -19,13 +19,13 @@ COPY cmd ./cmd COPY pkg ./pkg COPY internal ./internal COPY api ./api -WORKDIR /src/cmd/ext-proc -RUN go build -o /ext-proc +WORKDIR /src/cmd/epp +RUN go build -o /epp ## Multistage deploy FROM ${BASE_IMAGE} WORKDIR / -COPY --from=builder /ext-proc /ext-proc +COPY --from=builder /epp /epp -ENTRYPOINT ["/ext-proc"] \ No newline at end of file +ENTRYPOINT ["/epp"] diff --git a/cmd/ext-proc/health.go b/cmd/epp/health.go similarity index 91% rename from cmd/ext-proc/health.go rename to cmd/epp/health.go index 26a58df8..335c0849 100644 --- a/cmd/ext-proc/health.go +++ b/cmd/epp/health.go @@ -23,8 +23,8 @@ import ( "google.golang.org/grpc/codes" healthPb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/status" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type healthServer struct { diff --git a/cmd/ext-proc/main.go b/cmd/epp/main.go similarity index 95% rename from cmd/ext-proc/main.go rename to cmd/epp/main.go index 047a1fa7..a189984b 100644 --- a/cmd/ext-proc/main.go +++ b/cmd/epp/main.go @@ -41,12 +41,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend/vllm" - 
"sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( diff --git a/docs/dev.md b/docs/dev.md index 2af39668..d223ed6a 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -37,7 +37,7 @@ const( ) ``` -The guidelines are written in the context of a k8s controller. Our [ext-proc](../pkg/ext-proc/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows: +The guidelines are written in the context of a k8s controller. Our [epp](../pkg/epp/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows: 1. The server startup process and configuration. diff --git a/docs/proposals/003-endpoint-picker-protocol/README.md b/docs/proposals/003-endpoint-picker-protocol/README.md index 8e96a630..6876135d 100644 --- a/docs/proposals/003-endpoint-picker-protocol/README.md +++ b/docs/proposals/003-endpoint-picker-protocol/README.md @@ -2,7 +2,7 @@ The Endpoint Picker, or EPP, is a core component of the inference extension. Ultimately it's responsible for picking an endpoint from the `InferencePool`. A reference implementation can be -found [here](../../../pkg/ext-proc/). +found [here](../../../pkg/epp/). 
## Proxy Protocol diff --git a/pkg/ext-proc/backend/fake.go b/pkg/epp/backend/fake.go similarity index 90% rename from pkg/ext-proc/backend/fake.go rename to pkg/epp/backend/fake.go index 2de34c16..e81b3817 100644 --- a/pkg/ext-proc/backend/fake.go +++ b/pkg/epp/backend/fake.go @@ -22,8 +22,8 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type FakePodMetricsClient struct { diff --git a/pkg/ext-proc/backend/provider.go b/pkg/epp/backend/provider.go similarity index 95% rename from pkg/ext-proc/backend/provider.go rename to pkg/epp/backend/provider.go index 974319f7..a12f84d5 100644 --- a/pkg/ext-proc/backend/provider.go +++ b/pkg/epp/backend/provider.go @@ -25,9 +25,9 @@ import ( "github.com/go-logr/logr" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( diff --git a/pkg/ext-proc/backend/provider_test.go b/pkg/epp/backend/provider_test.go similarity index 98% rename from pkg/ext-proc/backend/provider_test.go rename to pkg/epp/backend/provider_test.go index 7736dd8d..1e11afe2 100644 --- a/pkg/ext-proc/backend/provider_test.go +++ b/pkg/epp/backend/provider_test.go @@ -27,7 +27,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" 
"github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" ) var ( diff --git a/pkg/ext-proc/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go similarity index 97% rename from pkg/ext-proc/backend/vllm/metrics.go rename to pkg/epp/backend/vllm/metrics.go index 59a132c8..8648e24c 100644 --- a/pkg/ext-proc/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -30,8 +30,8 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( diff --git a/pkg/ext-proc/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go similarity index 97% rename from pkg/ext-proc/backend/vllm/metrics_test.go rename to pkg/epp/backend/vllm/metrics_test.go index 1c9d5448..12aac1a1 100644 --- a/pkg/ext-proc/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/vllm/metrics_test.go @@ -23,8 +23,8 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestPromToPodMetrics(t *testing.T) { diff --git a/pkg/ext-proc/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go similarity index 95% rename from pkg/ext-proc/controller/inferencemodel_reconciler.go rename to 
pkg/epp/controller/inferencemodel_reconciler.go index cca05fce..99a1eb26 100644 --- a/pkg/ext-proc/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -28,8 +28,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type InferenceModelReconciler struct { diff --git a/pkg/ext-proc/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go similarity index 98% rename from pkg/ext-proc/controller/inferencemodel_reconciler_test.go rename to pkg/epp/controller/inferencemodel_reconciler_test.go index 583f5f75..cf94b168 100644 --- a/pkg/ext-proc/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -29,8 +29,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) var ( diff --git a/pkg/ext-proc/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go similarity index 96% rename from pkg/ext-proc/controller/inferencepool_reconciler.go rename to pkg/epp/controller/inferencepool_reconciler.go index b2cd01c0..f2c56991 100644 --- a/pkg/ext-proc/controller/inferencepool_reconciler.go +++ 
b/pkg/epp/controller/inferencepool_reconciler.go @@ -28,8 +28,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // InferencePoolReconciler utilizes the controller runtime to reconcile Instance Gateway resources diff --git a/pkg/ext-proc/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go similarity index 97% rename from pkg/ext-proc/controller/inferencepool_reconciler_test.go rename to pkg/epp/controller/inferencepool_reconciler_test.go index 925cb236..6263fa16 100644 --- a/pkg/ext-proc/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -31,8 +31,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) var ( diff --git a/pkg/ext-proc/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go similarity index 95% rename from pkg/ext-proc/controller/pod_reconciler.go rename to pkg/epp/controller/pod_reconciler.go index 871e1da5..5b0c25c9 100644 --- a/pkg/ext-proc/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -28,8 +28,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" 
"sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type PodReconciler struct { diff --git a/pkg/ext-proc/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go similarity index 99% rename from pkg/ext-proc/controller/pod_reconciler_test.go rename to pkg/epp/controller/pod_reconciler_test.go index c87ee54d..b3869113 100644 --- a/pkg/ext-proc/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -32,7 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" ) var ( diff --git a/pkg/ext-proc/datastore/datastore.go b/pkg/epp/datastore/datastore.go similarity index 98% rename from pkg/ext-proc/datastore/datastore.go rename to pkg/epp/datastore/datastore.go index 60236496..eecea59c 100644 --- a/pkg/ext-proc/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -29,7 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) diff --git a/pkg/ext-proc/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go similarity index 97% rename from pkg/ext-proc/datastore/datastore_test.go rename to 
pkg/epp/datastore/datastore_test.go index f32d8d77..bd5c5020 100644 --- a/pkg/ext-proc/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -21,7 +21,7 @@ import ( v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestHasSynced(t *testing.T) { diff --git a/pkg/ext-proc/datastore/types.go b/pkg/epp/datastore/types.go similarity index 100% rename from pkg/ext-proc/datastore/types.go rename to pkg/epp/datastore/types.go diff --git a/pkg/ext-proc/handlers/request.go b/pkg/epp/handlers/request.go similarity index 95% rename from pkg/ext-proc/handlers/request.go rename to pkg/epp/handlers/request.go index 34db206d..b9ffd0b0 100644 --- a/pkg/ext-proc/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -26,10 +26,10 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // HandleRequestBody handles body of the request to the backend server, such as parsing the "model" diff --git a/pkg/ext-proc/handlers/response.go b/pkg/epp/handlers/response.go similarity index 97% rename from pkg/ext-proc/handlers/response.go rename to 
pkg/epp/handlers/response.go index ed3082c5..f9396acf 100644 --- a/pkg/ext-proc/handlers/response.go +++ b/pkg/epp/handlers/response.go @@ -24,8 +24,8 @@ import ( configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "sigs.k8s.io/controller-runtime/pkg/log" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // HandleResponseHeaders processes response headers from the backend model server. diff --git a/pkg/ext-proc/handlers/response_test.go b/pkg/epp/handlers/response_test.go similarity index 97% rename from pkg/ext-proc/handlers/response_test.go rename to pkg/epp/handlers/response_test.go index dbb7e700..01f02d09 100644 --- a/pkg/ext-proc/handlers/response_test.go +++ b/pkg/epp/handlers/response_test.go @@ -22,7 +22,7 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/google/go-cmp/cmp" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( diff --git a/pkg/ext-proc/handlers/server.go b/pkg/epp/handlers/server.go similarity index 95% rename from pkg/ext-proc/handlers/server.go rename to pkg/epp/handlers/server.go index 506eaa97..2c61118c 100644 --- a/pkg/ext-proc/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -27,11 +27,11 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" 
- errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func NewServer(scheduler Scheduler, targetEndpointKey string, datastore datastore.Datastore) *Server { @@ -185,7 +185,6 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } - } // RequestContext stores context information during the life time of an HTTP request. diff --git a/pkg/ext-proc/metrics/README.md b/pkg/epp/metrics/README.md similarity index 100% rename from pkg/ext-proc/metrics/README.md rename to pkg/epp/metrics/README.md diff --git a/pkg/ext-proc/metrics/metrics.go b/pkg/epp/metrics/metrics.go similarity index 98% rename from pkg/ext-proc/metrics/metrics.go rename to pkg/epp/metrics/metrics.go index cc21d531..e86ca901 100644 --- a/pkg/ext-proc/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -24,7 +24,7 @@ import ( compbasemetrics "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" "sigs.k8s.io/controller-runtime/pkg/log" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( diff --git a/pkg/ext-proc/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go similarity index 93% rename from pkg/ext-proc/metrics/metrics_test.go rename to pkg/epp/metrics/metrics_test.go index 2e891066..c2436bab 100644 --- a/pkg/ext-proc/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -24,8 
+24,8 @@ import ( "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( @@ -119,31 +119,32 @@ func TestRecordRequestErrorCounter(t *testing.T) { name string reqs []requests invalid bool - }{{ - name: "multiple requests", - reqs: []requests{ - { - modelName: "m10", - targetModelName: "t10", - error: errutil.Internal, - }, - { - modelName: "m10", - targetModelName: "t10", - error: errutil.Internal, - }, - { - modelName: "m10", - targetModelName: "t11", - error: errutil.ModelServerError, - }, - { - modelName: "m20", - targetModelName: "t20", - error: errutil.InferencePoolResourceExhausted, + }{ + { + name: "multiple requests", + reqs: []requests{ + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t10", + error: errutil.Internal, + }, + { + modelName: "m10", + targetModelName: "t11", + error: errutil.ModelServerError, + }, + { + modelName: "m20", + targetModelName: "t20", + error: errutil.InferencePoolResourceExhausted, + }, }, }, - }, } Register() for _, scenario := range scenarios { diff --git a/pkg/ext-proc/metrics/testdata/input_tokens_metric b/pkg/epp/metrics/testdata/input_tokens_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/input_tokens_metric rename to pkg/epp/metrics/testdata/input_tokens_metric diff --git a/pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics b/pkg/epp/metrics/testdata/kv_cache_avg_metrics similarity index 100% rename from pkg/ext-proc/metrics/testdata/kv_cache_avg_metrics rename to pkg/epp/metrics/testdata/kv_cache_avg_metrics diff --git a/pkg/ext-proc/metrics/testdata/output_tokens_metric 
b/pkg/epp/metrics/testdata/output_tokens_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/output_tokens_metric rename to pkg/epp/metrics/testdata/output_tokens_metric diff --git a/pkg/ext-proc/metrics/testdata/queue_avg_size_metrics b/pkg/epp/metrics/testdata/queue_avg_size_metrics similarity index 100% rename from pkg/ext-proc/metrics/testdata/queue_avg_size_metrics rename to pkg/epp/metrics/testdata/queue_avg_size_metrics diff --git a/pkg/ext-proc/metrics/testdata/request_duration_seconds_metric b/pkg/epp/metrics/testdata/request_duration_seconds_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_duration_seconds_metric rename to pkg/epp/metrics/testdata/request_duration_seconds_metric diff --git a/pkg/ext-proc/metrics/testdata/request_error_total_metric b/pkg/epp/metrics/testdata/request_error_total_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_error_total_metric rename to pkg/epp/metrics/testdata/request_error_total_metric diff --git a/pkg/ext-proc/metrics/testdata/request_sizes_metric b/pkg/epp/metrics/testdata/request_sizes_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_sizes_metric rename to pkg/epp/metrics/testdata/request_sizes_metric diff --git a/pkg/ext-proc/metrics/testdata/request_total_metric b/pkg/epp/metrics/testdata/request_total_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/request_total_metric rename to pkg/epp/metrics/testdata/request_total_metric diff --git a/pkg/ext-proc/metrics/testdata/response_sizes_metric b/pkg/epp/metrics/testdata/response_sizes_metric similarity index 100% rename from pkg/ext-proc/metrics/testdata/response_sizes_metric rename to pkg/epp/metrics/testdata/response_sizes_metric diff --git a/pkg/ext-proc/scheduling/filter.go b/pkg/epp/scheduling/filter.go similarity index 98% rename from pkg/ext-proc/scheduling/filter.go rename to pkg/epp/scheduling/filter.go index 36691a73..b7881468 
100644 --- a/pkg/ext-proc/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -21,8 +21,8 @@ import ( "math" "github.com/go-logr/logr" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type Filter interface { diff --git a/pkg/ext-proc/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go similarity index 98% rename from pkg/ext-proc/scheduling/filter_test.go rename to pkg/epp/scheduling/filter_test.go index 01909fea..ac765b78 100644 --- a/pkg/ext-proc/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -23,8 +23,8 @@ import ( "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestFilter(t *testing.T) { diff --git a/pkg/ext-proc/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go similarity index 94% rename from pkg/ext-proc/scheduling/scheduler.go rename to pkg/epp/scheduling/scheduler.go index b5f2f4f2..a969948e 100644 --- a/pkg/ext-proc/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -24,9 +24,9 @@ import ( "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + errutil 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) const ( @@ -102,7 +102,8 @@ var ( filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) return []*datastore.PodMetrics{}, errutil.Error{ - Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources"} + Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", + } }, }, } diff --git a/pkg/ext-proc/scheduling/types.go b/pkg/epp/scheduling/types.go similarity index 100% rename from pkg/ext-proc/scheduling/types.go rename to pkg/epp/scheduling/types.go diff --git a/pkg/ext-proc/server/runserver.go b/pkg/epp/server/runserver.go similarity index 95% rename from pkg/ext-proc/server/runserver.go rename to pkg/epp/server/runserver.go index 795b242d..92b7be7f 100644 --- a/pkg/ext-proc/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -36,11 +36,11 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/controller" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" ) // ExtProcServerRunner provides methods 
to manage an external process server. diff --git a/pkg/ext-proc/server/runserver_test.go b/pkg/epp/server/runserver_test.go similarity index 87% rename from pkg/ext-proc/server/runserver_test.go rename to pkg/epp/server/runserver_test.go index 438dc096..b02688c5 100644 --- a/pkg/ext-proc/server/runserver_test.go +++ b/pkg/epp/server/runserver_test.go @@ -21,8 +21,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestRunnable(t *testing.T) { diff --git a/pkg/ext-proc/test/benchmark/benchmark.go b/pkg/epp/test/benchmark/benchmark.go similarity index 93% rename from pkg/ext-proc/test/benchmark/benchmark.go rename to pkg/epp/test/benchmark/benchmark.go index dc06a27a..10987b47 100644 --- a/pkg/ext-proc/test/benchmark/benchmark.go +++ b/pkg/epp/test/benchmark/benchmark.go @@ -32,10 +32,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) var ( diff --git a/pkg/ext-proc/test/utils.go b/pkg/epp/test/utils.go similarity index 88% rename from pkg/ext-proc/test/utils.go rename to pkg/epp/test/utils.go 
index ef83c932..f82084d9 100644 --- a/pkg/ext-proc/test/utils.go +++ b/pkg/epp/test/utils.go @@ -30,12 +30,12 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/scheduling" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) func StartExtProc( diff --git a/pkg/ext-proc/util/error/error.go b/pkg/epp/util/error/error.go similarity index 100% rename from pkg/ext-proc/util/error/error.go rename to pkg/epp/util/error/error.go diff --git a/pkg/ext-proc/util/logging/fatal.go b/pkg/epp/util/logging/fatal.go similarity index 100% rename from pkg/ext-proc/util/logging/fatal.go rename to pkg/epp/util/logging/fatal.go diff --git a/pkg/ext-proc/util/logging/logger.go b/pkg/epp/util/logging/logger.go similarity index 100% rename from pkg/ext-proc/util/logging/logger.go rename to pkg/epp/util/logging/logger.go diff --git a/pkg/ext-proc/util/logging/logging_const.go b/pkg/epp/util/logging/logging_const.go similarity index 100% rename from pkg/ext-proc/util/logging/logging_const.go rename to pkg/epp/util/logging/logging_const.go diff --git a/pkg/ext-proc/util/testing/wrappers.go 
b/pkg/epp/util/testing/wrappers.go similarity index 100% rename from pkg/ext-proc/util/testing/wrappers.go rename to pkg/epp/util/testing/wrappers.go diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 18efe7bf..eb2ca40e 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -47,12 +47,12 @@ import ( k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/datastore" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/server" - extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/test" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/logging" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/ext-proc/util/testing" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" + extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" "sigs.k8s.io/yaml" ) diff --git a/tools/dashboards/README.md b/tools/dashboards/README.md index c8258b63..7be2a5b8 100644 --- a/tools/dashboards/README.md +++ b/tools/dashboards/README.md @@ -4,7 +4,7 @@ This documentation provides instructions for setting up grafana dashboards to se ## Requirements -Please follow [metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/ext-proc/metrics) page to configure the proxy to enable all metrics. 
+Please follow [metrics](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics) page to configure the proxy to enable all metrics. ## Load Inference Extension dashboard into Grafana @@ -21,6 +21,7 @@ If you run the inferece gateway with [Google Managed Prometheus](https://cloud.g Please configure the `scrape_interval` of your prometheus configuration to lower than `15s`, `rate` function returns empty string if data falls too apart. See https://www.robustperception.io/what-range-should-i-use-with-rate/ for more details. Example: + ``` global: scrape_interval: 5s diff --git a/tools/dashboards/inference_gateway.json b/tools/dashboards/inference_gateway.json index 3af66703..4e872739 100644 --- a/tools/dashboards/inference_gateway.json +++ b/tools/dashboards/inference_gateway.json @@ -39,7 +39,7 @@ "showLineNumbers": false, "showMiniMap": false }, - "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/ext-proc/metrics for more details of underlying metrics used in the dashboard.", + "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of underlying metrics used in the dashboard.", "mode": "markdown" }, "pluginVersion": "11.5.0", From a78c768d401e9e444752c89e092e7ba7fc9fc082 Mon Sep 17 00:00:00 2001 From: courageJ Date: Thu, 20 Feb 2025 19:06:27 +0000 Subject: [PATCH 39/96] Move pkg/ext-proc/metrics/README.md -> site-src/guides/metrics.md (#373) * Move pkgepp/metrics/README.md -> site-src/guides/metrics.md * add docs link for metrics.md * update formatting --- mkdocs.yml | 1 + .../README.md => site-src/guides/metrics.md | 30 ++++++++----------- 2 files changed, 14 insertions(+), 17 deletions(-) rename pkg/epp/metrics/README.md => site-src/guides/metrics.md (51%) diff --git a/mkdocs.yml b/mkdocs.yml index a024c16d..8cd3f3fb 100644 --- 
a/mkdocs.yml +++ b/mkdocs.yml @@ -57,6 +57,7 @@ nav: - User Guides: - Getting started: guides/index.md - Adapter Rollout: guides/adapter-rollout.md + - Metrics: guides/metrics.md - Implementer's Guide: guides/implementers.md - Reference: - API Reference: reference/spec.md diff --git a/pkg/epp/metrics/README.md b/site-src/guides/metrics.md similarity index 51% rename from pkg/epp/metrics/README.md rename to site-src/guides/metrics.md index 1f68a0bd..f793734d 100644 --- a/pkg/epp/metrics/README.md +++ b/site-src/guides/metrics.md @@ -1,10 +1,6 @@ -# Documentation +# Metrics -This documentation is the current state of exposed metrics. - -## Table of Contents -* [Exposed Metrics](#exposed-metrics) -* [Scrape Metrics](#scrape-metrics) +This guide describes the current state of exposed metrics and how to scrape them. ## Requirements @@ -38,17 +34,17 @@ spec: ## Exposed metrics -| Metric name | Metric Type | Description | Labels | Status | -| ------------|--------------| ----------- | ------ | ------ | -| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | -| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA | -| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA | +| **Metric name** | **Metric Type** |
**Description**
|
**Labels**
| **Status** | +|:---------------------------------------------|:-----------------|:------------------------------------------------------------------|:-----------------------------------------------------------------------------------|:------------| +| inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_model_output_tokens | Distribution | Distribution of output token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA | +| inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. | `name`=<inference-pool-name> | ALPHA | ## Scrape Metrics From 2913da4595243f36536d444b279a009764dc2abe Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Fri, 21 Feb 2025 05:02:27 +0000 Subject: [PATCH 40/96] Defining an outer metadata struct as part of the extproc endpoint picking protocol (#377) * Defining an outer metadata struct as part of the extproc endpoint picking protocol * Apply suggestions from code review Update the protocol doc based on the suggested edits Co-authored-by: Lior Lieberman * Updated the flag names --------- Co-authored-by: Lior Lieberman --- cmd/epp/main.go | 34 ++++---- .../003-endpoint-picker-protocol/README.md | 24 +++++- pkg/epp/handlers/request.go | 35 +++++--- pkg/epp/handlers/server.go | 16 ++-- pkg/epp/server/runserver.go | 53 ++++++------ pkg/epp/test/utils.go | 2 +- test/integration/hermetic_test.go | 84 ++++++++----------- 7 files changed, 142 insertions(+), 106 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index a189984b..1f76cfab 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -64,10 +64,15 @@ var ( "The port used for gRPC liveness and readiness probes") metricsPort = flag.Int( "metricsPort", 9090, "The metrics port") - targetEndpointKey = flag.String( - "targetEndpointKey", - runserver.DefaultTargetEndpointKey, - "Header key used by Envoy to route to the appropriate pod. This must match Envoy configuration.") + destinationEndpointHintKey = flag.String( + "destinationEndpointHintKey", + runserver.DefaultDestinationEndpointHintKey, + "Header and response metadata key used by Envoy to route to the appropriate pod. 
This must match Envoy configuration.") + destinationEndpointHintMetadataNamespace = flag.String( + "DestinationEndpointHintMetadataNamespace", + runserver.DefaultDestinationEndpointHintMetadataNamespace, + "The key for the outer namespace struct in the metadata field of the extproc response that is used to wrap the"+ + "target endpoint. If not set, then an outer namespace struct should not be created.") poolName = flag.String( "poolName", runserver.DefaultPoolName, @@ -145,16 +150,17 @@ func run() error { datastore := datastore.NewDatastore() provider := backend.NewProvider(&vllm.PodMetricsClientImpl{}, datastore) serverRunner := &runserver.ExtProcServerRunner{ - GrpcPort: *grpcPort, - TargetEndpointKey: *targetEndpointKey, - PoolName: *poolName, - PoolNamespace: *poolNamespace, - RefreshMetricsInterval: *refreshMetricsInterval, - RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, - Datastore: datastore, - SecureServing: *secureServing, - CertPath: *certPath, - Provider: provider, + GrpcPort: *grpcPort, + DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, + DestinationEndpointHintKey: *destinationEndpointHintKey, + PoolName: *poolName, + PoolNamespace: *poolNamespace, + RefreshMetricsInterval: *refreshMetricsInterval, + RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, + Datastore: datastore, + SecureServing: *secureServing, + CertPath: *certPath, + Provider: provider, } if err := serverRunner.SetupWithManager(mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc server") diff --git a/docs/proposals/003-endpoint-picker-protocol/README.md b/docs/proposals/003-endpoint-picker-protocol/README.md index 6876135d..418c0f3c 100644 --- a/docs/proposals/003-endpoint-picker-protocol/README.md +++ b/docs/proposals/003-endpoint-picker-protocol/README.md @@ -11,8 +11,28 @@ This is the protocol between the EPP and the proxy (e.g, Envoy). 
The EPP MUST implement the Envoy [external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol. -For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint, via -adding the `x-gateway-destination-endpoint` HTTP header in the request and as an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response, or otherwise return an error. The EPP MUST not set two different values in the header and the response metadata. +For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via: + +1. Setting the `x-gateway-destination-endpoint` HTTP header to the selected endpoint in `<ip:port>` format. + +2. Setting an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. The metadata entry for the picked endpoint MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`. + +The final metadata necessary would look like: +```go +dynamicMetadata: { + "envoy.lb": { + "x-gateway-destination-endpoint": "<ip:port>" + } +} +``` + +Note: +- If the EPP did not communicate the server endpoint via these two methods, it MUST return an error. +- The EPP MUST NOT set two different values in the header and the inner response metadata value. + +### Why envoy.lb namespace as a default? +The `envoy.lb` namespace is a predefined namespace used for subsetting. 
One common way to use the selected endpoint returned from the server, is [envoy subsets](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/subsets) where host metadata for subset load balancing must be placed under `envoy.lb`. + Setting different value leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence. ## Model Server Protocol diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index b9ffd0b0..c6cfdda2 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -119,7 +119,7 @@ func (s *Server) HandleRequestBody( headers := []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ - Key: s.targetEndpointKey, + Key: s.destinationEndpointHintKey, RawValue: []byte(endpoint), }, }, @@ -137,6 +137,29 @@ func (s *Server) HandleRequestBody( logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) } + targetEndpointValue := &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + } + dynamicMetadata := targetEndpointValue + if s.destinationEndpointHintMetadataNamespace != "" { + // If a namespace is defined, wrap the selected endpoint with that. + dynamicMetadata = &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: targetEndpointValue, + }, + }, + }, + } + } + resp := &extProcPb.ProcessingResponse{ // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header // and as an unstructure ext-proc response metadata key/value pair. 
This enables different integration @@ -155,15 +178,7 @@ func (s *Server) HandleRequestBody( }, }, }, - DynamicMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.targetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: endpoint, - }, - }, - }, - }, + DynamicMetadata: dynamicMetadata, } return resp, nil } diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 2c61118c..9105e8b1 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -34,11 +34,12 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -func NewServer(scheduler Scheduler, targetEndpointKey string, datastore datastore.Datastore) *Server { +func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server { return &Server{ - scheduler: scheduler, - targetEndpointKey: targetEndpointKey, - datastore: datastore, + scheduler: scheduler, + destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace, + destinationEndpointHintKey: destinationEndpointHintKey, + datastore: datastore, } } @@ -48,8 +49,11 @@ type Server struct { scheduler Scheduler // The key of the header to specify the target pod address. This value needs to match Envoy // configuration. - targetEndpointKey string - datastore datastore.Datastore + destinationEndpointHintKey string + // The key acting as the outer namespace struct in the metadata extproc response to communicate + // back the picked endpoints. + destinationEndpointHintMetadataNamespace string + datastore datastore.Datastore } type Scheduler interface { diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 92b7be7f..6e6b68b1 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -45,38 +45,41 @@ import ( // ExtProcServerRunner provides methods to manage an external process server. 
type ExtProcServerRunner struct { - GrpcPort int - TargetEndpointKey string - PoolName string - PoolNamespace string - RefreshMetricsInterval time.Duration - RefreshPrometheusMetricsInterval time.Duration - Datastore datastore.Datastore - Provider *backend.Provider - SecureServing bool - CertPath string + GrpcPort int + DestinationEndpointHintMetadataNamespace string + DestinationEndpointHintKey string + PoolName string + PoolNamespace string + RefreshMetricsInterval time.Duration + RefreshPrometheusMetricsInterval time.Duration + Datastore datastore.Datastore + Provider *backend.Provider + SecureServing bool + CertPath string } // Default values for CLI flags in main const ( - DefaultGrpcPort = 9002 // default for --grpcPort - DefaultTargetEndpointKey = "x-gateway-destination-endpoint" // default for --targetEndpointKey - DefaultPoolName = "" // required but no default - DefaultPoolNamespace = "default" // default for --poolNamespace - DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval - DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval - DefaultSecureServing = true // default for --secureServing + DefaultGrpcPort = 9002 // default for --grpcPort + DefaultDestinationEndpointHintMetadataNamespace = "envoy.lb" // default for --destinationEndpointHintMetadataNamespace + DefaultDestinationEndpointHintKey = "x-gateway-destination-endpoint" // default for --destinationEndpointHintKey + DefaultPoolName = "" // required but no default + DefaultPoolNamespace = "default" // default for --poolNamespace + DefaultRefreshMetricsInterval = 50 * time.Millisecond // default for --refreshMetricsInterval + DefaultRefreshPrometheusMetricsInterval = 5 * time.Second // default for --refreshPrometheusMetricsInterval + DefaultSecureServing = true // default for --secureServing ) func NewDefaultExtProcServerRunner() *ExtProcServerRunner { return &ExtProcServerRunner{ - GrpcPort: 
DefaultGrpcPort, - TargetEndpointKey: DefaultTargetEndpointKey, - PoolName: DefaultPoolName, - PoolNamespace: DefaultPoolNamespace, - RefreshMetricsInterval: DefaultRefreshMetricsInterval, - RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, - SecureServing: DefaultSecureServing, + GrpcPort: DefaultGrpcPort, + DestinationEndpointHintKey: DefaultDestinationEndpointHintKey, + DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace, + PoolName: DefaultPoolName, + PoolNamespace: DefaultPoolNamespace, + RefreshMetricsInterval: DefaultRefreshMetricsInterval, + RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, + SecureServing: DefaultSecureServing, // Datastore can be assigned later. } } @@ -156,7 +159,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } extProcPb.RegisterExternalProcessorServer( srv, - handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.TargetEndpointKey, r.Datastore), + handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore), ) // Forward to the gRPC runnable. 
diff --git a/pkg/epp/test/utils.go b/pkg/epp/test/utils.go index f82084d9..c44d7147 100644 --- a/pkg/epp/test/utils.go +++ b/pkg/epp/test/utils.go @@ -79,7 +79,7 @@ func startExtProc(logger logr.Logger, port int, datastore datastore.Datastore) * s := grpc.NewServer() - extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(scheduling.NewScheduler(datastore), "target-pod", datastore)) + extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(scheduling.NewScheduler(datastore), "", "target-pod", datastore)) logger.Info("gRPC server starting", "port", port) reflection.Register(s) diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index eb2ca40e..91bc71c6 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -100,7 +100,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, + Key: runserver.DefaultDestinationEndpointHintKey, RawValue: []byte("address-1:8000"), }, }, @@ -111,17 +111,9 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultTargetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: "address-1:8000", - }, - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), - wantErr: false, + wantMetadata: makeMetadata("address-1:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), + wantErr: false, }, { name: "select active lora, low queue", @@ -156,7 +148,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, + Key: runserver.DefaultDestinationEndpointHintKey, RawValue: []byte("address-1:8000"), }, 
}, @@ -167,17 +159,9 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultTargetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: "address-1:8000", - }, - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), - wantErr: false, + wantMetadata: makeMetadata("address-1:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), + wantErr: false, }, { name: "select no lora despite active model, avoid excessive queue size", @@ -213,7 +197,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, + Key: runserver.DefaultDestinationEndpointHintKey, RawValue: []byte("address-2:8000"), }, }, @@ -224,17 +208,9 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: &structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultTargetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: "address-2:8000", - }, - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - wantErr: false, + wantMetadata: makeMetadata("address-2:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), + wantErr: false, }, { name: "noncritical and all models past threshold, shed request", @@ -312,7 +288,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ - Key: runserver.DefaultTargetEndpointKey, + Key: runserver.DefaultDestinationEndpointHintKey, RawValue: []byte("address-0:8000"), }, }, @@ -323,17 +299,9 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: 
&structpb.Struct{ - Fields: map[string]*structpb.Value{ - runserver.DefaultTargetEndpointKey: { - Kind: &structpb.Value_StringValue{ - StringValue: "address-0:8000", - }, - }, - }, - }, - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), - wantErr: false, + wantMetadata: makeMetadata("address-0:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), + wantErr: false, }, } @@ -555,3 +523,23 @@ func readDocuments(fp string) ([][]byte, error) { } return docs, nil } + +func makeMetadata(endpoint string) *structpb.Struct { + return &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultDestinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: &structpb.Struct{ + Fields: map[string]*structpb.Value{ + runserver.DefaultDestinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + }, + }, + }, + }, + } +} From 7e3cd457cdcd01339b65861c8e472cf27e6b6e80 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Fri, 21 Feb 2025 13:02:27 -0500 Subject: [PATCH 41/96] Draft a revised README.md (#374) Clarify the point of the project, and use the vernacular of "inference gateway" vs "ai gateway" to more succinctly explain what the distinction is. Move the website up more prominently, and describe in more detail what the immediate requirements are. Create a stub roadmap section. 
Add a medium complexity architecture SVG to the readme --- README.md | 27 +++++++++++++++++-------- docs/inference-gateway-architecture.svg | 1 + 2 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 docs/inference-gateway-architecture.svg diff --git a/README.md b/README.md index a15e9542..89826f0c 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,35 @@ # Gateway API Inference Extension -The Gateway API Inference Extension came out of [wg-serving](https://github.com/kubernetes/community/tree/master/wg-serving) and is sponsored by [SIG Network](https://github.com/kubernetes/community/blob/master/sig-network/README.md#gateway-api-inference-extension). This repo contains: the load balancing algorithm, [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter) code, CRDs, and controllers of the extension. +This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee. -This extension is intented to provide value to multiplexed LLM services on a shared pool of compute. See the [proposal](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/012-llm-instance-gateway) for more info. 
+The inference gateway: + +* Improves the tail latency and throughput of LLM completion requests against Kubernetes-hosted model servers using an extensible request scheduling algorithm that is kv-cache and request cost aware, avoiding evictions or queueing as load increases +* Provides [Kubernetes-native declarative APIs](https://gateway-api-inference-extension.sigs.k8s.io/concepts/api-overview/) to route client model names to use-case specific LoRA adapters and control incremental rollout of new adapter versions, A/B traffic splitting, and safe blue-green base model and model server upgrades +* Adds end to end observability around service objective attainment +* Ensures operational guardrails between different client model names, allowing a platform team to safely serve many different GenAI workloads on the same pool of shared foundation model servers for higher utilization and fewer required accelerators + +![Architecture Diagram](./docs/inference-gateway-architecture.svg) + +It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-endpoint-picker-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon. ## Status -This project is currently in development. +This project is [alpha (0.1 release)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.1.0). It should not be used in production yet. ## Getting Started -Follow this [README](./pkg/README.md) to get the inference-extension up and running on your cluster! +Follow our [Getting Started Guide](./pkg/README.md) to get the inference-extension up and running on your cluster! 
-## End-to-End Tests +See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for detailed API documentation on leveraging our Kubernetes-native declarative APIs -Follow this [README](./test/e2e/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster. +## Roadmap + +Coming soon! -## Website +## End-to-End Tests -Detailed documentation is available on our website: https://gateway-api-inference-extension.sigs.k8s.io/ +Follow this [README](./test/e2e/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster. ## Contributing diff --git a/docs/inference-gateway-architecture.svg b/docs/inference-gateway-architecture.svg new file mode 100644 index 00000000..6c887ebe --- /dev/null +++ b/docs/inference-gateway-architecture.svg @@ -0,0 +1 @@ + \ No newline at end of file From 9bd136a08cd3cf9fa0b170dbc0906e2d9dead676 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Fri, 21 Feb 2025 21:14:26 +0000 Subject: [PATCH 42/96] Add README.md file to the epp pkg (#386) * Polish the epp README.md file * Addressed comments --- ...-flowchart.png => scheduler-flowchart.png} | Bin pkg/epp/README.md | 24 ++++++++++++++++++ pkg/scheduling.md | 5 ---- 3 files changed, 24 insertions(+), 5 deletions(-) rename docs/{schedular-flowchart.png => scheduler-flowchart.png} (100%) create mode 100644 pkg/epp/README.md delete mode 100644 pkg/scheduling.md diff --git a/docs/schedular-flowchart.png b/docs/scheduler-flowchart.png similarity index 100% rename from docs/schedular-flowchart.png rename to docs/scheduler-flowchart.png diff --git a/pkg/epp/README.md b/pkg/epp/README.md new file mode 100644 index 00000000..e3bc26ae --- /dev/null +++ b/pkg/epp/README.md @@ -0,0 +1,24 @@ +# The EndPoint Picker (EPP) +This package provides the reference implementation for the Endpoint Picker (EPP). 
It implements the [extension protocol](../../docs/proposals/003-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension. An EPP instance handles a single `InferencePool` (and so for each `InferencePool`, one must create a dedicated EPP deployment). + + +The Endpoint Picker performs the following core functions: + +- Endpoint Selection + - The EPP determines the appropriate Pod endpoint for the load balancer (LB) to route requests. + - It selects from the pool of ready Pods designated by the assigned InferencePool's [Selector](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/7e3cd457cdcd01339b65861c8e472cf27e6b6e80/api/v1alpha1/inferencepool_types.go#L53) field. + - Endpoint selection is contingent on the request's ModelName matching an `InferenceModel` that references the `InferencePool`. + - Requests with unmatched ModelName values trigger an error response to the proxy. +- Traffic Splitting and ModelName Rewriting + - The EPP facilitates controlled rollouts of new adapter versions by implementing traffic splitting between adapters within the same `InferencePool`, as defined by the `InferenceModel`. + - EPP rewrites the model name in the request to the [target model name](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/7e3cd457cdcd01339b65861c8e472cf27e6b6e80/api/v1alpha1/inferencemodel_types.go#L161) as defined on the `InferenceModel` object. +- Observability + - The EPP generates metrics to enhance observability. + - It reports InferenceModel-level metrics, further broken down by target model. + - Detailed information regarding metrics can be found on the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/). + +## The scheduling algorithm +The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. 
The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request. The following flow chart summarizes the current scheduling algorithm + +# Flowchart +Scheduling Algorithm diff --git a/pkg/scheduling.md b/pkg/scheduling.md deleted file mode 100644 index 99223ad2..00000000 --- a/pkg/scheduling.md +++ /dev/null @@ -1,5 +0,0 @@ -## Scheduling Package in Ext Proc -The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request. - -# Flowchart -Scheduling Algorithm \ No newline at end of file From 616440bd8e29f4c1fcb57c6c62fb45c3a27ada96 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Fri, 21 Feb 2025 21:44:26 +0000 Subject: [PATCH 43/96] split the proxy and model server protocols for easy reference (#387) --- README.md | 2 +- .../README.md | 39 +------------------ .../004-endpoint-picker-protocol/README.md | 35 +++++++++++++++++ 3 files changed, 37 insertions(+), 39 deletions(-) rename docs/proposals/{003-endpoint-picker-protocol => 003-model-server-protocol}/README.md (54%) create mode 100644 docs/proposals/004-endpoint-picker-protocol/README.md diff --git a/README.md b/README.md index 89826f0c..e6730ae4 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ The inference gateway: ![Architecture Diagram](./docs/inference-gateway-architecture.svg) -It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server 
protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-endpoint-picker-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon. +It currently requires a version of vLLM that supports the necessary metrics to predict traffic load which is defined in the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol). Support for Google's Jetstream, nVidia Triton, text-generation-inference, and SGLang is coming soon. ## Status diff --git a/docs/proposals/003-endpoint-picker-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md similarity index 54% rename from docs/proposals/003-endpoint-picker-protocol/README.md rename to docs/proposals/003-model-server-protocol/README.md index 418c0f3c..44ecf4e1 100644 --- a/docs/proposals/003-endpoint-picker-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -1,41 +1,4 @@ -# Endpoint Picker Protocol - -The Endpoint Picker, or EPP, is a core component of the inference extension. Ultimately it's -responsible for picking an endpoint from the `InferencePool`. A reference implementation can be -found [here](../../../pkg/epp/). - -## Proxy Protocol - -This is the protocol between the EPP and the proxy (e.g, Envoy). - -The EPP MUST implement the Envoy -[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol. - -For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via: - -1. Setting the `x-gateway-destination-endpoint` HTTP header to the selected endpoint in format. - -2. Set an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. 
The metadata entry for the picked endpoint MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`. - -The final metadata necessary would look like: -```go -dynamicMetadata: { - "envoy.lb": { - "x-gateway-destination-endpoint": " - } -} -``` - -Note: -- If the EPP did not communicate the server endpoint via these two methods, it MUST return an error. -- The EPP MUST not set two different values in the header and the inner response metadata value. - -### Why envoy.lb namespace as a default? -The `envoy.lb` namesapce is a predefined namespace used for subsetting. One common way to use the selected endpoint returned from the server, is [envoy subsets](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/subsets) where host metadata for subset load balancing must be placed under `envoy.lb`. - -Setting different value leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence. - -## Model Server Protocol +# Model Server Protocol This is the protocol between the EPP and the model servers. diff --git a/docs/proposals/004-endpoint-picker-protocol/README.md b/docs/proposals/004-endpoint-picker-protocol/README.md new file mode 100644 index 00000000..1e27ff0f --- /dev/null +++ b/docs/proposals/004-endpoint-picker-protocol/README.md @@ -0,0 +1,35 @@ +# Endpoint Picker Protocol + +The Endpoint Picker, or EPP, is a core component of the inference extension. Ultimately it's +responsible for picking an endpoint from the `InferencePool`. A reference implementation can be +found [here](../../../pkg/epp/). + +This doc defines the protocol between the EPP and the proxy (e.g, Envoy). + +The EPP MUST implement the Envoy +[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol. 
+ +For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via: + +1. Setting the `x-gateway-destination-endpoint` HTTP header to the selected endpoint in `<ip:port>` format. + +2. Set an unstructured entry in the [dynamic_metadata](https://github.com/envoyproxy/go-control-plane/blob/c19bf63a811c90bf9e02f8e0dc1dcef94931ebb4/envoy/service/ext_proc/v3/external_processor.pb.go#L320) field of the ext-proc response. The metadata entry for the picked endpoint MUST be wrapped with an outer key (which represents the metadata namespace) with a default of `envoy.lb`. + +The final metadata necessary would look like: +```go +dynamicMetadata: { + "envoy.lb": { + "x-gateway-destination-endpoint": "<ip:port>" + } +} +``` + +Note: +- If the EPP did not communicate the server endpoint via these two methods, it MUST return an error. +- The EPP MUST not set two different values in the header and the inner response metadata value. + +## Why envoy.lb namespace as a default? +The `envoy.lb` namespace is a predefined namespace used for subsetting. One common way to use the selected endpoint returned from the server, is [envoy subsets](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/subsets) where host metadata for subset load balancing must be placed under `envoy.lb`. + +Setting different values leads to unpredictable behavior because proxies aren't guaranteed to support both paths, and so this protocol does not define what takes precedence. 
+ From c48a4b280fc649bbeb43e72fc4389d01abe56a6c Mon Sep 17 00:00:00 2001 From: Jeff Luo Date: Fri, 21 Feb 2025 16:44:33 -0500 Subject: [PATCH 44/96] [Metric] Add inference pool and request error metrics to the dashboard (#389) --- tools/dashboards/inference_gateway.json | 905 ++++++++++++++++-------- 1 file changed, 608 insertions(+), 297 deletions(-) diff --git a/tools/dashboards/inference_gateway.json b/tools/dashboards/inference_gateway.json index 4e872739..cf00420d 100644 --- a/tools/dashboards/inference_gateway.json +++ b/tools/dashboards/inference_gateway.json @@ -28,7 +28,7 @@ }, "gridPos": { "h": 3, - "w": 23, + "w": 20, "x": 0, "y": 0 }, @@ -42,7 +42,7 @@ "content": "# Inferece Gateway Dashboard\n\nPlease see https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp/metrics for more details of underlying metrics used in the dashboard.", "mode": "markdown" }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "title": "", "type": "text" }, @@ -54,15 +54,15 @@ "x": 0, "y": 3 }, - "id": 3, + "id": 15, "panels": [], - "title": "Inference Model", + "title": "Inference Pool", "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "deap2an4eadc0d" }, "fieldConfig": { "defaults": { @@ -125,7 +125,7 @@ "x": 0, "y": 4 }, - "id": 1, + "id": 16, "options": { "legend": { "calcs": [], @@ -139,33 +139,27 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, "disableTextWrap": false, "editorMode": "builder", - "exemplar": false, - "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))", + "expr": "sum by(name) (inference_pool_average_kv_cache_utilization)", "fullMetaSearch": false, "includeNullMetadata": true, - "interval": "", "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false } ], - "title": "Request / s", + 
"title": "Average KV Cache Utilization", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "deap2an4eadc0d" }, "fieldConfig": { "defaults": { @@ -228,7 +222,7 @@ "x": 10, "y": 4 }, - "id": 2, + "id": 17, "options": { "legend": { "calcs": [], @@ -242,55 +236,36 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", + "expr": "sum by(name) (inference_pool_average_queue_size)", "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", + "includeNullMetadata": true, + "legendFormat": "__auto", "range": true, "refId": "A", "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false } ], - "title": "E2E Request Latency", + "title": "Average Queue Size", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 3, + "panels": [], + "title": "Inference Model", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -353,11 +328,11 @@ 
}, "gridPos": { "h": 8, - "w": 10, + "w": 20, "x": 0, - "y": 12 + "y": 13 }, - "id": 6, + "id": 2, "options": { "legend": { "calcs": [], @@ -371,12 +346,12 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "95%", @@ -391,7 +366,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -407,7 +382,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_duration_seconds_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -417,7 +392,7 @@ "useBackend": false } ], - "title": "Request Size", + "title": "E2E Request Latency", "type": "timeseries" }, { @@ -483,10 +458,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 10, - "y": 12 + "x": 0, + "y": 21 }, - "id": 7, + "id": 1, "options": { "legend": { "calcs": [], @@ -500,35 +475,8 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - 
"includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -536,17 +484,18 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "exemplar": false, + "expr": "sum by(model_name, target_model_name) (rate(inference_model_request_total{}[$__rate_interval]))", "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", "range": true, - "refId": "C", + "refId": "A", "useBackend": false } ], - "title": "Response Size", + "title": "Request / s", "type": "timeseries" }, { @@ -612,10 +561,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 0, - "y": 20 + "x": 10, + "y": 21 }, - "id": 8, + "id": 18, "options": { "legend": { "calcs": [], @@ -629,19 +578,8 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ - { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false - }, { "datasource": { "type": "prometheus", @@ -649,33 +587,18 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) 
(rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", + "exemplar": false, + "expr": "sum by(error_code, model_name, target_model_name) (rate(inference_model_request_error_total[$__rate_interval]))", "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": false, - "legendFormat": "50%", + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", "range": true, - "refId": "C", + "refId": "A", "useBackend": false } ], - "title": "Input Token Count", + "title": "Request Error / s", "type": "timeseries" }, { @@ -741,10 +664,10 @@ "gridPos": { "h": 8, "w": 10, - "x": 10, - "y": 20 + "x": 0, + "y": 29 }, - "id": 9, + "id": 6, "options": { "legend": { "calcs": [], @@ -758,12 +681,12 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "legendFormat": "95%", @@ -778,7 +701,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -794,7 +717,7 @@ }, 
"disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_request_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -804,22 +727,9 @@ "useBackend": false } ], - "title": "Output Token Count", + "title": "Request Size", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 10, - "panels": [], - "title": "vLLM", - "type": "row" - }, { "datasource": { "type": "prometheus", @@ -881,12 +791,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 0, + "x": 10, "y": 29 }, - "id": 14, + "id": 7, "options": { "legend": { "calcs": [], @@ -900,15 +810,15 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(model_name) (rate(vllm:prompt_tokens_total[$__rate_interval]))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "Prompt Tokens/Sec", + "includeNullMetadata": false, + "legendFormat": "95%", "range": true, "refId": "A", "useBackend": false @@ -920,17 +830,33 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "sum by(model_name) (rate(vllm:generation_tokens_total[$__rate_interval]))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, - "legendFormat": "Generation Tokens/Sec", + "includeNullMetadata": false, + "legendFormat": "90%", "range": true, "refId": "B", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + 
"disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_response_sizes_bucket{}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": false, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false } ], - "title": "Token Throughput", + "title": "Response Size", "type": "timeseries" }, { @@ -994,12 +920,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 10, - "y": 29 + "x": 0, + "y": 37 }, - "id": 11, + "id": 8, "options": { "legend": { "calcs": [], @@ -1013,14 +939,14 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "95%", "range": true, "refId": "A", @@ -1033,10 +959,10 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "90%", "range": true, "refId": "B", @@ -1049,17 +975,17 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_input_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": 
false, "legendFormat": "50%", "range": true, "refId": "C", "useBackend": false } ], - "title": "E2E Request Latency", + "title": "Input Token Count", "type": "timeseries" }, { @@ -1123,12 +1049,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 10, - "x": 0, - "y": 36 + "x": 10, + "y": 37 }, - "id": 13, + "id": 9, "options": { "legend": { "calcs": [], @@ -1142,14 +1068,14 @@ "sort": "none" } }, - "pluginVersion": "11.5.0", + "pluginVersion": "11.5.2", "targets": [ { "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "95%", "range": true, "refId": "A", @@ -1162,10 +1088,10 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "90%", "range": true, "refId": "B", @@ -1178,147 +1104,532 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(inference_model_output_tokens_bucket{}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, - "includeNullMetadata": true, + "includeNullMetadata": false, "legendFormat": "50%", "range": true, "refId": "C", "useBackend": false } ], - "title": "Time Per Output Token Latency", + "title": "Output Token Count", "type": "timeseries" }, { - "datasource": { - "type": 
"prometheus", - "uid": "${DS_PROMETHEUS}" + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" + "id": 10, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 52 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "thresholdsStyle": { - "mode": "off" + 
"tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(model_name) (rate(vllm:prompt_tokens_total[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "Prompt Tokens/Sec", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 10, - "x": 10, - "y": 36 - }, - "id": 12, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(model_name) (rate(vllm:generation_tokens_total[$__rate_interval]))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "Generation Tokens/Sec", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Token Throughput", + "type": "timeseries" }, - "tooltip": { - "hideZeros": false, - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.5.0", - "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "95%", - "range": true, - "refId": "A", - "useBackend": false + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + 
"barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 52 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + 
"hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "E2E Request Latency", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "legendFormat": "90%", - "range": true, - "refId": "B", - "useBackend": false + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 59 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + 
"fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Time Per Output Token Latency", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "legendFormat": "50%", - "range": true, - "refId": "C", - "useBackend": false + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 59 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.5.2", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "95%", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "90%", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "legendFormat": "50%", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "title": "Time To First Token Latency", + "type": "timeseries" } ], - "title": "Time To First Token Latency", - "type": "timeseries" + "title": "vLLM", + "type": "row" } ], "preload": false, @@ 
-1350,6 +1661,6 @@ "timezone": "browser", "title": "Inference Gateway", "uid": "aeap3g4ujefb4b", - "version": 16, + "version": 20, "weekStart": "" } From 432f5ed69402e67c38b6b8a2f0e4d68baf84d701 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:02:25 +0000 Subject: [PATCH 45/96] Use gcr.io/distroless/static:nonroot base image (#384) --- Dockerfile | 4 ++-- Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4adc82e4..312700bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # Dockerfile has specific requirement to put this ARG at the beginning: # https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact -ARG BUILDER_IMAGE=golang:1.23-alpine -ARG BASE_IMAGE=gcr.io/distroless/base-debian10 +ARG BUILDER_IMAGE=golang:1.23 +ARG BASE_IMAGE=gcr.io/distroless/static:nonroot ## Multistage build FROM ${BUILDER_IMAGE} AS builder diff --git a/Makefile b/Makefile index 1d8fc531..8d02a5e8 100644 --- a/Makefile +++ b/Makefile @@ -36,8 +36,8 @@ SYNCER_IMAGE_NAME := lora-syncer SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME) SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG) -BASE_IMAGE ?= gcr.io/distroless/base-debian10 -BUILDER_IMAGE ?= golang:1.23-alpine +BASE_IMAGE ?= gcr.io/distroless/static:nonroot +BUILDER_IMAGE ?= golang:1.23 ifdef GO_VERSION BUILDER_IMAGE = golang:$(GO_VERSION) endif From 2a615e981228aa6ffc2a89219c986ac863dde776 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Sun, 23 Feb 2025 16:06:27 +0800 Subject: [PATCH 46/96] fix context canceled recv error handling (#390) Signed-off-by: Kuromesi --- pkg/epp/handlers/server.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 9105e8b1..3270134b 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -18,7 +18,6 @@ package handlers import ( "context" 
- "errors" "io" "time" @@ -90,7 +89,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { } req, recvErr := srv.Recv() - if recvErr == io.EOF || errors.Is(recvErr, context.Canceled) { + if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled { return nil } if recvErr != nil { From 6ea3ac6b70c7ba9568316eb0bab36a2942989816 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:54:30 +0000 Subject: [PATCH 47/96] Added endpoint picker diagram (#396) --- docs/endpoint-picker.svg | 3 +++ pkg/epp/README.md | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) create mode 100644 docs/endpoint-picker.svg diff --git a/docs/endpoint-picker.svg b/docs/endpoint-picker.svg new file mode 100644 index 00000000..3ec8eed4 --- /dev/null +++ b/docs/endpoint-picker.svg @@ -0,0 +1,3 @@ +Endpoint PickerServiceModelServerL7 Proxy / Gateway InferencePool API Selects - the model servers (the endpoints) - the endpoint picker serviceModel ServerProtocolTrafficExtensionProtocolGateway ControllerClientTrafficConfiguresWatchesWatches InferenceModel API Defines - the model/adapter to serve - the serving objectives for the modelObservabilityMetrics ScrapingObservabilityDashboardsStandard GatewayElementsInference ExtensionElementsInference Gateway \ No newline at end of file diff --git a/pkg/epp/README.md b/pkg/epp/README.md index e3bc26ae..1bf47993 100644 --- a/pkg/epp/README.md +++ b/pkg/epp/README.md @@ -1,8 +1,12 @@ # The EndPoint Picker (EPP) -This package provides the reference implementation for the Endpoint Picker (EPP). It implements the [extension protocol](../../docs/proposals/003-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension. An EPP instance handles a single `InferencePool` (and so for each `InferencePool`, one must create a dedicated EPP deployment). 
+This package provides the reference implementation for the Endpoint Picker (EPP). As demonstrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and interacts with the model servers through the defined [model server protocol](../../docs/proposals/003-model-server-protocol). +![Architecture Diagram](../../docs/endpoint-picker.svg) -The Endpoint Picker performs the following core functions: + +## Core Functions + +An EPP instance handles a single `InferencePool` (and so for each `InferencePool`, one must create a dedicated EPP deployment). It performs the following core functions: - Endpoint Selection - The EPP determines the appropriate Pod endpoint for the load balancer (LB) to route requests. @@ -17,8 +21,8 @@ The Endpoint Picker performs the following core functions: - It reports InferenceModel-level metrics, further broken down by target model. - Detailed information regarding metrics can be found on the [website](https://gateway-api-inference-extension.sigs.k8s.io/guides/metrics/). -## The scheduling algorithm + +## Scheduling Algorithm The scheduling package implements request scheduling algorithms for load balancing requests across backend pods in an inference gateway. The scheduler ensures efficient resource utilization while maintaining low latency and prioritizing critical requests. It applies a series of filters based on metrics and heuristics to select the best pod for a given request.
The following flow chart summarizes the current scheduling algorithm -# Flowchart Scheduling Algorithm From 58335c0b4fb415d44235331bb49ed61d93ce37b1 Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Tue, 25 Feb 2025 00:26:28 +0800 Subject: [PATCH 48/96] Added v1alpha2 api (#398) * copy api v1alpha1 to v1alpha2 * Add nested status * auto generate * use v1alpha2 --- api/v1alpha2/doc.go | 23 ++ api/v1alpha2/groupversion_info.go | 45 ++ api/v1alpha2/inferencemodel_types.go | 235 +++++++++++ api/v1alpha2/inferencepool_types.go | 255 ++++++++++++ api/v1alpha2/zz_generated.deepcopy.go | 384 ++++++++++++++++++ .../api/v1alpha2/endpointpickerconfig.go | 38 ++ .../api/v1alpha2/extension.go | 75 ++++ .../api/v1alpha2/extensionconnection.go | 42 ++ .../api/v1alpha2/extensionreference.go | 65 +++ .../api/v1alpha2/inferencemodel.go | 224 ++++++++++ .../api/v1alpha2/inferencemodelspec.go | 74 ++++ .../api/v1alpha2/inferencemodelstatus.go | 47 +++ .../api/v1alpha2/inferencepool.go | 224 ++++++++++ .../api/v1alpha2/inferencepoolspec.go | 66 +++ .../api/v1alpha2/inferencepoolstatus.go | 43 ++ .../api/v1alpha2/poolobjectreference.go | 56 +++ .../api/v1alpha2/poolstatus.go | 57 +++ .../api/v1alpha2/targetmodel.go | 47 +++ client-go/applyconfiguration/utils.go | 30 ++ client-go/clientset/versioned/clientset.go | 13 + .../versioned/fake/clientset_generated.go | 7 + .../clientset/versioned/fake/register.go | 2 + .../clientset/versioned/scheme/register.go | 2 + .../typed/api/v1alpha2/api_client.go | 111 +++++ .../versioned/typed/api/v1alpha2/doc.go | 19 + .../versioned/typed/api/v1alpha2/fake/doc.go | 19 + .../api/v1alpha2/fake/fake_api_client.go | 43 ++ .../api/v1alpha2/fake/fake_inferencemodel.go | 52 +++ .../api/v1alpha2/fake/fake_inferencepool.go | 52 +++ .../typed/api/v1alpha2/generated_expansion.go | 22 + .../typed/api/v1alpha2/inferencemodel.go | 73 ++++ .../typed/api/v1alpha2/inferencepool.go | 73 ++++ .../externalversions/api/interface.go | 8 + 
.../api/v1alpha2/inferencemodel.go | 89 ++++ .../api/v1alpha2/inferencepool.go | 89 ++++ .../api/v1alpha2/interface.go | 51 +++ .../informers/externalversions/generic.go | 7 + .../api/v1alpha2/expansion_generated.go | 34 ++ .../listers/api/v1alpha2/inferencemodel.go | 69 ++++ .../listers/api/v1alpha2/inferencepool.go | 69 ++++ cmd/epp/main.go | 3 + ...e.networking.x-k8s.io_inferencemodels.yaml | 224 ++++++++++ ...ce.networking.x-k8s.io_inferencepools.yaml | 252 ++++++++++++ pkg/epp/backend/fake.go | 6 +- .../controller/inferencemodel_reconciler.go | 8 +- .../inferencemodel_reconciler_test.go | 78 ++-- .../controller/inferencepool_reconciler.go | 8 +- .../inferencepool_reconciler_test.go | 18 +- pkg/epp/controller/pod_reconciler_test.go | 44 +- pkg/epp/datastore/datastore.go | 34 +- pkg/epp/datastore/datastore_test.go | 26 +- pkg/epp/test/benchmark/benchmark.go | 8 +- pkg/epp/test/utils.go | 4 +- test/e2e/e2e_suite_test.go | 4 +- test/e2e/e2e_test.go | 8 +- test/integration/hermetic_test.go | 8 +- test/utils/utils.go | 8 +- test/utils/wrappers.go | 20 +- 58 files changed, 3554 insertions(+), 141 deletions(-) create mode 100644 api/v1alpha2/doc.go create mode 100644 api/v1alpha2/groupversion_info.go create mode 100644 api/v1alpha2/inferencemodel_types.go create mode 100644 api/v1alpha2/inferencepool_types.go create mode 100644 api/v1alpha2/zz_generated.deepcopy.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/extension.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/extensionconnection.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/extensionreference.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/inferencemodel.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go create mode 100644 
client-go/applyconfiguration/api/v1alpha2/inferencepool.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/poolstatus.go create mode 100644 client-go/applyconfiguration/api/v1alpha2/targetmodel.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/api_client.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/doc.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go create mode 100644 client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go create mode 100644 client-go/informers/externalversions/api/v1alpha2/inferencemodel.go create mode 100644 client-go/informers/externalversions/api/v1alpha2/inferencepool.go create mode 100644 client-go/informers/externalversions/api/v1alpha2/interface.go create mode 100644 client-go/listers/api/v1alpha2/expansion_generated.go create mode 100644 client-go/listers/api/v1alpha2/inferencemodel.go create mode 100644 client-go/listers/api/v1alpha2/inferencepool.go diff --git a/api/v1alpha2/doc.go b/api/v1alpha2/doc.go new file mode 100644 index 00000000..90a35f58 --- /dev/null +++ b/api/v1alpha2/doc.go @@ -0,0 +1,23 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha2 contains API Schema definitions for the +// inference.networking.x-k8s.io API group. +// +// +k8s:openapi-gen=true +// +kubebuilder:object:generate=true +// +groupName=inference.networking.x-k8s.io +package v1alpha2 diff --git a/api/v1alpha2/groupversion_info.go b/api/v1alpha2/groupversion_info.go new file mode 100644 index 00000000..f9eb9b1e --- /dev/null +++ b/api/v1alpha2/groupversion_info.go @@ -0,0 +1,45 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package v1alpha2 contains API Schema definitions for the gateway v1alpha2 API group +// +kubebuilder:object:generate=true +// +groupName=inference.networking.x-k8s.io +package v1alpha2 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha2"} + + // SchemeGroupVersion is alias to GroupVersion for client-go libraries. + // It is required by pkg/client/informers/externalversions/... + SchemeGroupVersion = GroupVersion + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) + +// Resource is required by pkg/client/listers/... +func Resource(resource string) schema.GroupResource { + return GroupVersion.WithResource(resource).GroupResource() +} diff --git a/api/v1alpha2/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go new file mode 100644 index 00000000..9ab1fd86 --- /dev/null +++ b/api/v1alpha2/inferencemodel_types.go @@ -0,0 +1,235 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1alpha2 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// InferenceModel is the Schema for the InferenceModels API. +// +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:storageversion +// +genclient +type InferenceModel struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec InferenceModelSpec `json:"spec,omitempty"` + Status InferenceModelStatus `json:"status,omitempty"` +} + +// InferenceModelList contains a list of InferenceModel. +// +// +kubebuilder:object:root=true +type InferenceModelList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []InferenceModel `json:"items"` +} + +// InferenceModelSpec represents the desired state of a specific model use case. This resource is +// managed by the "Inference Workload Owner" persona. +// +// The Inference Workload Owner persona is someone that trains, verifies, and +// leverages a large language model from a model frontend, drives the lifecycle +// and rollout of new versions of those models, and defines the specific +// performance and latency goals for the model. These workloads are +// expected to operate within an InferencePool sharing compute capacity with other +// InferenceModels, defined by the Inference Platform Admin. +// +// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, +// if the name is reused, an error will be shown on the status of a +// InferenceModel that attempted to reuse. The oldest InferenceModel, based on +// creation timestamp, will be selected to remain valid. In the event of a race +// condition, one will be selected at random. +type InferenceModelSpec struct { + // ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. 
+ // ModelNames must be unique for a referencing InferencePool + // (names can be reused for a different pool in the same cluster). + // The modelName with the oldest creation timestamp is retained, and the incoming + // InferenceModel has its Ready status set to false with a corresponding reason. + // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. + // Names can be reserved without an underlying model configured in the pool. + // This can be done by specifying a target model and setting the weight to zero, + // an error will be returned specifying that no valid target model is found. + // + // +kubebuilder:validation:MaxLength=256 + // +kubebuilder:validation:Required + ModelName string `json:"modelName"` + + // Criticality defines how important it is to serve the model compared to other models referencing the same pool. + // Criticality impacts how traffic is handled in resource constrained situations. It handles this by + // queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will + // fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, + // and the proportionality of fairness will be configurable. + // + // Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field. + // Any implementations that may consume this field may treat an unset value as the 'Standard' range. + // +optional + Criticality *Criticality `json:"criticality,omitempty"` + + // TargetModels allow multiple versions of a model for traffic splitting. + // If not specified, the target model name is defaulted to the modelName parameter. + // modelName is often in reference to a LoRA adapter. 
+ // + // +optional + // +kubebuilder:validation:MaxItems=10 + // +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))" + TargetModels []TargetModel `json:"targetModels,omitempty"` + + // PoolRef is a reference to the inference pool, the pool must exist in the same namespace. + // + // +kubebuilder:validation:Required + PoolRef PoolObjectReference `json:"poolRef"` +} + +// PoolObjectReference identifies an API object within the namespace of the +// referrer. +type PoolObjectReference struct { + // Group is the group of the referent. + // + // +optional + // +kubebuilder:default="inference.networking.x-k8s.io" + // +kubebuilder:validation:MaxLength=253 + // +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` + Group string `json:"group,omitempty"` + + // Kind is kind of the referent. For example "InferencePool". + // + // +optional + // +kubebuilder:default="InferencePool" + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 + // +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` + Kind string `json:"kind,omitempty"` + + // Name is the name of the referent. + // + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=253 + // +kubebuilder:validation:Required + Name string `json:"name"` +} + +// Criticality defines how important it is to serve the model compared to other models. +// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default. +// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. 
+// +kubebuilder:validation:Enum=Critical;Standard;Sheddable +type Criticality string + +const ( + // Critical defines the highest level of criticality. Requests to this band will be shed last. + Critical Criticality = "Critical" + + // Standard defines the base criticality level and is more important than Sheddable but less + // important than Critical. Requests in this band will be shed before critical traffic. + // Most models are expected to fall within this band. + Standard Criticality = "Standard" + + // Sheddable defines the lowest level of criticality. Requests to this band will be shed before + // all other bands. + Sheddable Criticality = "Sheddable" +) + +// TargetModel represents a deployed model or a LoRA adapter. The +// Name field is expected to match the name of the LoRA adapter +// (or base model) as it is registered within the model server. Inference +// Gateway assumes that the model exists on the model server and it's the +// responsibility of the user to validate a correct match. Should a model fail +// to exist at request time, the error is processed by the Inference Gateway +// and emitted on the appropriate InferenceModel object. +type TargetModel struct { + // Name is the name of the adapter or base model, as expected by the ModelServer. + // + // +kubebuilder:validation:MaxLength=253 + // +kubebuilder:validation:Required + Name string `json:"name"` + + // Weight is used to determine the proportion of traffic that should be + // sent to this model when multiple target models are specified. + // + // Weight defines the proportion of requests forwarded to the specified + // model. This is computed as weight/(sum of all weights in this + // TargetModels list). For non-zero values, there may be some epsilon from + // the exact proportion defined here depending on the precision an + // implementation supports. Weight is not a percentage and the sum of + // weights does not need to equal 100. 
+ // + // If a weight is set for any targetModel, it must be set for all targetModels. + // Conversely weights are optional, so long as ALL targetModels do not specify a weight. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Maximum=1000000 + Weight *int32 `json:"weight,omitempty"` +} + +// InferenceModelStatus defines the observed state of InferenceModel +type InferenceModelStatus struct { + // Conditions track the state of the InferenceModel. + // + // Known condition types are: + // + // * "Accepted" + // + // +optional + // +listType=map + // +listMapKey=type + // +kubebuilder:validation:MaxItems=8 + // +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// InferenceModelConditionType is a type of condition for the InferenceModel. +type InferenceModelConditionType string + +// InferenceModelConditionReason is the reason for a given InferenceModelConditionType. +type InferenceModelConditionReason string + +const ( + // ModelConditionAccepted indicates if the model config is accepted, and if not, why. + // + // Possible reasons for this condition to be True are: + // + // * "Accepted" + // + // Possible reasons for this condition to be False are: + // + // * "ModelNameInUse" + // + // Possible reasons for this condition to be Unknown are: + // + // * "Pending" + // + ModelConditionAccepted InferenceModelConditionType = "Accepted" + + // ModelReasonAccepted is the desired state. Model conforms to the state of the pool. + ModelReasonAccepted InferenceModelConditionReason = "Accepted" + + // ModelReasonNameInUse is used when a given ModelName already exists within the pool. + // Details about naming conflict resolution are on the ModelName field itself. 
+ ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse" + + // ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. + ModelReasonPending InferenceModelConditionReason = "Pending" +) + +func init() { + SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{}) +} diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go new file mode 100644 index 00000000..716bfb11 --- /dev/null +++ b/api/v1alpha2/inferencepool_types.go @@ -0,0 +1,255 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// InferencePool is the Schema for the InferencePools API. +// +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:storageversion +// +genclient +type InferencePool struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec InferencePoolSpec `json:"spec,omitempty"` + Status InferencePoolStatus `json:"status,omitempty"` +} + +// InferencePoolList contains a list of InferencePool. 
+// +// +kubebuilder:object:root=true +type InferencePoolList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []InferencePool `json:"items"` +} + +// InferencePoolSpec defines the desired state of InferencePool +type InferencePoolSpec struct { + // Selector defines a map of labels to watch model server pods + // that should be included in the InferencePool. + // In some cases, implementations may translate this field to a Service selector, so this matches the simple + // map used for Service selectors instead of the full Kubernetes LabelSelector type. + // + // +kubebuilder:validation:Required + Selector map[LabelKey]LabelValue `json:"selector"` + + // TargetPortNumber defines the port number to access the selected model servers. + // The number must be in the range 1 to 65535. + // + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + // +kubebuilder:validation:Required + TargetPortNumber int32 `json:"targetPortNumber"` + + // EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint + // picker service that picks endpoints for the requests routed to this pool. + EndpointPickerConfig `json:",inline"` +} + +// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension. +// This type is intended to be a union of mutually exclusive configuration options that we may add in the future. +type EndpointPickerConfig struct { + // Extension configures an endpoint picker as an extension service. + // + // +kubebuilder:validation:Required + ExtensionRef *Extension `json:"extensionRef,omitempty"` +} + +// Extension specifies how to configure an extension that runs the endpoint picker. +type Extension struct { + // Reference is a reference to a service extension. 
+ ExtensionReference `json:",inline"` + + // ExtensionConnection configures the connection between the gateway and the extension. + ExtensionConnection `json:",inline"` +} + +// ExtensionReference is a reference to the extension deployment. +type ExtensionReference struct { + // Group is the group of the referent. + // When unspecified or empty string, core API group is inferred. + // + // +optional + // +kubebuilder:default="" + Group *string `json:"group,omitempty"` + + // Kind is the Kubernetes resource kind of the referent. For example + // "Service". + // + // Defaults to "Service" when not specified. + // + // ExternalName services can refer to CNAME DNS records that may live + // outside of the cluster and as such are difficult to reason about in + // terms of conformance. They also may not be safe to forward to (see + // CVE-2021-25740 for more information). Implementations MUST NOT + // support ExternalName Services. + // + // +optional + // +kubebuilder:default=Service + Kind *string `json:"kind,omitempty"` + + // Name is the name of the referent. + // + // +kubebuilder:validation:Required + Name string `json:"name"` + + // The port number on the service running the extension. When unspecified, implementations SHOULD infer a + // default value of 9002 when the Kind is Service. + // + // +kubebuilder:validation:Minimum=1 + // +kubebuilder:validation:Maximum=65535 + // +optional + PortNumber *int32 `json:"targetPortNumber,omitempty"` +} + +// ExtensionConnection encapsulates options that configure the connection to the extension. +type ExtensionConnection struct { + // Configures how the gateway handles the case when the extension is not responsive. + // Defaults to failClose. + // + // +optional + // +kubebuilder:default="FailClose" + FailureMode *ExtensionFailureMode `json:"failureMode"` +} + +// ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not +// responsive. 
+// +kubebuilder:validation:Enum=FailOpen;FailClose +type ExtensionFailureMode string + +const ( + // FailOpen specifies that the proxy should not drop the request and forward the request to an endpoint of its picking. + FailOpen ExtensionFailureMode = "FailOpen" + // FailClose specifies that the proxy should drop the request. + FailClose ExtensionFailureMode = "FailClose" +) + +// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +// Duplicated as to not take an unexpected dependency on gw's API. +// +// LabelKey is the key of a label. This is used for validation +// of maps. This matches the Kubernetes "qualified name" validation that is used for labels. +// Labels are case sensitive, so: my-label and My-Label are considered distinct. +// +// Valid values include: +// +// * example +// * example.com +// * example.com/path +// * example.com/path.html +// +// Invalid values include: +// +// * example~ - "~" is an invalid character +// * example.com. - can not start or end with "." +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$` +type LabelKey string + +// LabelValue is the value of a label. This is used for validation +// of maps. This matches the Kubernetes label validation rules: +// * must be 63 characters or less (can be empty), +// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), +// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. 
+// +// Valid values include: +// +// * MyValue +// * my.name +// * 123-my-value +// +// +kubebuilder:validation:MinLength=0 +// +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$` +type LabelValue string + +// InferencePoolStatus defines the observed state of InferencePool +type InferencePoolStatus struct { + // Parents is a list of parent resources (usually Gateways) that are + // associated with the route, and the status of the InferencePool with respect to + // each parent. + // + // A maximum of 32 Gateways will be represented in this list. An empty list + // means the route has not been attached to any Gateway. + // + // +kubebuilder:validation:MaxItems=32 + Parents []PoolStatus `json:"parent,omitempty"` +} + +// PoolStatus defines the observed state of InferencePool from a gateway. +type PoolStatus struct { + // GatewayRef indicates the gateway that observed state of InferencePool. + GatewayRef corev1.ObjectReference `json:"parentRef"` + // Conditions track the state of the InferencePool. + // + // Known condition types are: + // + // * "Ready" + // + // +optional + // +listType=map + // +listMapKey=type + // +kubebuilder:validation:MaxItems=8 + // +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// InferencePoolConditionType is a type of condition for the InferencePool +type InferencePoolConditionType string + +// InferencePoolConditionReason is the reason for a given InferencePoolConditionType +type InferencePoolConditionReason string + +const ( + // PoolConditionReady indicates if the pool is ready to accept traffic, and if not, why. 
+ // + // Possible reasons for this condition to be True are: + // + // * "Ready" + // + // Possible reasons for this condition to be False are: + // + // * "EndpointPickerNotHealthy" + // + // Possible reasons for this condition to be Unknown are: + // + // * "Pending" + // + PoolConditionReady InferencePoolConditionType = "Ready" + + // PoolReasonReady is the desired state. The pool and its components are initialized and ready for traffic. + PoolReasonReady InferencePoolConditionReason = "Ready" + + // PoolReasonEPPNotHealthy is used when the EPP has not yet passed health checks, or has started failing them. + PoolReasonEPPNotHealthy InferencePoolConditionReason = "EndpointPickerNotHealthy" + + // PoolReasonPending is the initial state, and indicates that the controller has not yet reconciled this pool. + PoolReasonPending InferencePoolConditionReason = "Pending" +) + +func init() { + SchemeBuilder.Register(&InferencePool{}, &InferencePoolList{}) +} diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go new file mode 100644 index 00000000..9b685969 --- /dev/null +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -0,0 +1,384 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. 
+ +package v1alpha2 + +import ( + "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EndpointPickerConfig) DeepCopyInto(out *EndpointPickerConfig) { + *out = *in + if in.ExtensionRef != nil { + in, out := &in.ExtensionRef, &out.ExtensionRef + *out = new(Extension) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EndpointPickerConfig. +func (in *EndpointPickerConfig) DeepCopy() *EndpointPickerConfig { + if in == nil { + return nil + } + out := new(EndpointPickerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Extension) DeepCopyInto(out *Extension) { + *out = *in + in.ExtensionReference.DeepCopyInto(&out.ExtensionReference) + in.ExtensionConnection.DeepCopyInto(&out.ExtensionConnection) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Extension. +func (in *Extension) DeepCopy() *Extension { + if in == nil { + return nil + } + out := new(Extension) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtensionConnection) DeepCopyInto(out *ExtensionConnection) { + *out = *in + if in.FailureMode != nil { + in, out := &in.FailureMode, &out.FailureMode + *out = new(ExtensionFailureMode) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtensionConnection. 
+func (in *ExtensionConnection) DeepCopy() *ExtensionConnection { + if in == nil { + return nil + } + out := new(ExtensionConnection) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ExtensionReference) DeepCopyInto(out *ExtensionReference) { + *out = *in + if in.Group != nil { + in, out := &in.Group, &out.Group + *out = new(string) + **out = **in + } + if in.Kind != nil { + in, out := &in.Kind, &out.Kind + *out = new(string) + **out = **in + } + if in.PortNumber != nil { + in, out := &in.PortNumber, &out.PortNumber + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtensionReference. +func (in *ExtensionReference) DeepCopy() *ExtensionReference { + if in == nil { + return nil + } + out := new(ExtensionReference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferenceModel) DeepCopyInto(out *InferenceModel) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModel. +func (in *InferenceModel) DeepCopy() *InferenceModel { + if in == nil { + return nil + } + out := new(InferenceModel) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferenceModel) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *InferenceModelList) DeepCopyInto(out *InferenceModelList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]InferenceModel, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelList. +func (in *InferenceModelList) DeepCopy() *InferenceModelList { + if in == nil { + return nil + } + out := new(InferenceModelList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferenceModelList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferenceModelSpec) DeepCopyInto(out *InferenceModelSpec) { + *out = *in + if in.Criticality != nil { + in, out := &in.Criticality, &out.Criticality + *out = new(Criticality) + **out = **in + } + if in.TargetModels != nil { + in, out := &in.TargetModels, &out.TargetModels + *out = make([]TargetModel, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + out.PoolRef = in.PoolRef +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelSpec. +func (in *InferenceModelSpec) DeepCopy() *InferenceModelSpec { + if in == nil { + return nil + } + out := new(InferenceModelSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *InferenceModelStatus) DeepCopyInto(out *InferenceModelStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelStatus. +func (in *InferenceModelStatus) DeepCopy() *InferenceModelStatus { + if in == nil { + return nil + } + out := new(InferenceModelStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePool) DeepCopyInto(out *InferencePool) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePool. +func (in *InferencePool) DeepCopy() *InferencePool { + if in == nil { + return nil + } + out := new(InferencePool) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferencePool) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolList) DeepCopyInto(out *InferencePoolList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]InferencePool, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolList. 
+func (in *InferencePoolList) DeepCopy() *InferencePoolList { + if in == nil { + return nil + } + out := new(InferencePoolList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *InferencePoolList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolSpec) DeepCopyInto(out *InferencePoolSpec) { + *out = *in + if in.Selector != nil { + in, out := &in.Selector, &out.Selector + *out = make(map[LabelKey]LabelValue, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + in.EndpointPickerConfig.DeepCopyInto(&out.EndpointPickerConfig) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolSpec. +func (in *InferencePoolSpec) DeepCopy() *InferencePoolSpec { + if in == nil { + return nil + } + out := new(InferencePoolSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InferencePoolStatus) DeepCopyInto(out *InferencePoolStatus) { + *out = *in + if in.Parents != nil { + in, out := &in.Parents, &out.Parents + *out = make([]PoolStatus, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolStatus. +func (in *InferencePoolStatus) DeepCopy() *InferencePoolStatus { + if in == nil { + return nil + } + out := new(InferencePoolStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *PoolObjectReference) DeepCopyInto(out *PoolObjectReference) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolObjectReference. +func (in *PoolObjectReference) DeepCopy() *PoolObjectReference { + if in == nil { + return nil + } + out := new(PoolObjectReference) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PoolStatus) DeepCopyInto(out *PoolStatus) { + *out = *in + out.GatewayRef = in.GatewayRef + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolStatus. +func (in *PoolStatus) DeepCopy() *PoolStatus { + if in == nil { + return nil + } + out := new(PoolStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TargetModel) DeepCopyInto(out *TargetModel) { + *out = *in + if in.Weight != nil { + in, out := &in.Weight, &out.Weight + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TargetModel. +func (in *TargetModel) DeepCopy() *TargetModel { + if in == nil { + return nil + } + out := new(TargetModel) + in.DeepCopyInto(out) + return out +} diff --git a/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go new file mode 100644 index 00000000..007b8870 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/endpointpickerconfig.go @@ -0,0 +1,38 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +// EndpointPickerConfigApplyConfiguration represents a declarative configuration of the EndpointPickerConfig type for use +// with apply. +type EndpointPickerConfigApplyConfiguration struct { + ExtensionRef *ExtensionApplyConfiguration `json:"extensionRef,omitempty"` +} + +// EndpointPickerConfigApplyConfiguration constructs a declarative configuration of the EndpointPickerConfig type for use with +// apply. +func EndpointPickerConfig() *EndpointPickerConfigApplyConfiguration { + return &EndpointPickerConfigApplyConfiguration{} +} + +// WithExtensionRef sets the ExtensionRef field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ExtensionRef field is set to the value of the last call. +func (b *EndpointPickerConfigApplyConfiguration) WithExtensionRef(value *ExtensionApplyConfiguration) *EndpointPickerConfigApplyConfiguration { + b.ExtensionRef = value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/extension.go b/client-go/applyconfiguration/api/v1alpha2/extension.go new file mode 100644 index 00000000..b3802613 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/extension.go @@ -0,0 +1,75 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// ExtensionApplyConfiguration represents a declarative configuration of the Extension type for use +// with apply. +type ExtensionApplyConfiguration struct { + ExtensionReferenceApplyConfiguration `json:",inline"` + ExtensionConnectionApplyConfiguration `json:",inline"` +} + +// ExtensionApplyConfiguration constructs a declarative configuration of the Extension type for use with +// apply. +func Extension() *ExtensionApplyConfiguration { + return &ExtensionApplyConfiguration{} +} + +// WithGroup sets the Group field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Group field is set to the value of the last call. +func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyConfiguration { + b.ExtensionReferenceApplyConfiguration.Group = &value + return b +} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. 
+func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConfiguration { + b.ExtensionReferenceApplyConfiguration.Kind = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *ExtensionApplyConfiguration) WithName(value string) *ExtensionApplyConfiguration { + b.ExtensionReferenceApplyConfiguration.Name = &value + return b +} + +// WithPortNumber sets the PortNumber field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PortNumber field is set to the value of the last call. +func (b *ExtensionApplyConfiguration) WithPortNumber(value int32) *ExtensionApplyConfiguration { + b.ExtensionReferenceApplyConfiguration.PortNumber = &value + return b +} + +// WithFailureMode sets the FailureMode field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the FailureMode field is set to the value of the last call. +func (b *ExtensionApplyConfiguration) WithFailureMode(value apiv1alpha2.ExtensionFailureMode) *ExtensionApplyConfiguration { + b.ExtensionConnectionApplyConfiguration.FailureMode = &value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go new file mode 100644 index 00000000..2a59b830 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/extensionconnection.go @@ -0,0 +1,42 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// ExtensionConnectionApplyConfiguration represents a declarative configuration of the ExtensionConnection type for use +// with apply. +type ExtensionConnectionApplyConfiguration struct { + FailureMode *apiv1alpha2.ExtensionFailureMode `json:"failureMode,omitempty"` +} + +// ExtensionConnectionApplyConfiguration constructs a declarative configuration of the ExtensionConnection type for use with +// apply. +func ExtensionConnection() *ExtensionConnectionApplyConfiguration { + return &ExtensionConnectionApplyConfiguration{} +} + +// WithFailureMode sets the FailureMode field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the FailureMode field is set to the value of the last call. 
+func (b *ExtensionConnectionApplyConfiguration) WithFailureMode(value apiv1alpha2.ExtensionFailureMode) *ExtensionConnectionApplyConfiguration { + b.FailureMode = &value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/extensionreference.go b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go new file mode 100644 index 00000000..71034710 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go @@ -0,0 +1,65 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +// ExtensionReferenceApplyConfiguration represents a declarative configuration of the ExtensionReference type for use +// with apply. +type ExtensionReferenceApplyConfiguration struct { + Group *string `json:"group,omitempty"` + Kind *string `json:"kind,omitempty"` + Name *string `json:"name,omitempty"` + PortNumber *int32 `json:"targetPortNumber,omitempty"` +} + +// ExtensionReferenceApplyConfiguration constructs a declarative configuration of the ExtensionReference type for use with +// apply. +func ExtensionReference() *ExtensionReferenceApplyConfiguration { + return &ExtensionReferenceApplyConfiguration{} +} + +// WithGroup sets the Group field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. 
+// If called multiple times, the Group field is set to the value of the last call. +func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *ExtensionReferenceApplyConfiguration { + b.Group = &value + return b +} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. +func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *ExtensionReferenceApplyConfiguration { + b.Kind = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *ExtensionReferenceApplyConfiguration) WithName(value string) *ExtensionReferenceApplyConfiguration { + b.Name = &value + return b +} + +// WithPortNumber sets the PortNumber field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PortNumber field is set to the value of the last call. +func (b *ExtensionReferenceApplyConfiguration) WithPortNumber(value int32) *ExtensionReferenceApplyConfiguration { + b.PortNumber = &value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go new file mode 100644 index 00000000..1fbfe106 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodel.go @@ -0,0 +1,224 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + v1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// InferenceModelApplyConfiguration represents a declarative configuration of the InferenceModel type for use +// with apply. +type InferenceModelApplyConfiguration struct { + v1.TypeMetaApplyConfiguration `json:",inline"` + *v1.ObjectMetaApplyConfiguration `json:"metadata,omitempty"` + Spec *InferenceModelSpecApplyConfiguration `json:"spec,omitempty"` + Status *InferenceModelStatusApplyConfiguration `json:"status,omitempty"` +} + +// InferenceModel constructs a declarative configuration of the InferenceModel type for use with +// apply. +func InferenceModel(name, namespace string) *InferenceModelApplyConfiguration { + b := &InferenceModelApplyConfiguration{} + b.WithName(name) + b.WithNamespace(namespace) + b.WithKind("InferenceModel") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha2") + return b +} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. 
+func (b *InferenceModelApplyConfiguration) WithKind(value string) *InferenceModelApplyConfiguration { + b.TypeMetaApplyConfiguration.Kind = &value + return b +} + +// WithAPIVersion sets the APIVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the APIVersion field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithAPIVersion(value string) *InferenceModelApplyConfiguration { + b.TypeMetaApplyConfiguration.APIVersion = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithName(value string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Name = &value + return b +} + +// WithGenerateName sets the GenerateName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GenerateName field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithGenerateName(value string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.GenerateName = &value + return b +} + +// WithNamespace sets the Namespace field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Namespace field is set to the value of the last call. 
+func (b *InferenceModelApplyConfiguration) WithNamespace(value string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Namespace = &value + return b +} + +// WithUID sets the UID field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the UID field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithUID(value types.UID) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.UID = &value + return b +} + +// WithResourceVersion sets the ResourceVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ResourceVersion field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithResourceVersion(value string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.ResourceVersion = &value + return b +} + +// WithGeneration sets the Generation field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Generation field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithGeneration(value int64) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Generation = &value + return b +} + +// WithCreationTimestamp sets the CreationTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. 
+// If called multiple times, the CreationTimestamp field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithCreationTimestamp(value metav1.Time) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.CreationTimestamp = &value + return b +} + +// WithDeletionTimestamp sets the DeletionTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionTimestamp field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithDeletionTimestamp(value metav1.Time) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionTimestamp = &value + return b +} + +// WithDeletionGracePeriodSeconds sets the DeletionGracePeriodSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionGracePeriodSeconds field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithDeletionGracePeriodSeconds(value int64) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionGracePeriodSeconds = &value + return b +} + +// WithLabels puts the entries into the Labels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Labels field, +// overwriting an existing map entries in Labels field with the same key. 
+func (b *InferenceModelApplyConfiguration) WithLabels(entries map[string]string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Labels == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Labels = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Labels[k] = v + } + return b +} + +// WithAnnotations puts the entries into the Annotations field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Annotations field, +// overwriting an existing map entries in Annotations field with the same key. +func (b *InferenceModelApplyConfiguration) WithAnnotations(entries map[string]string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Annotations == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Annotations = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Annotations[k] = v + } + return b +} + +// WithOwnerReferences adds the given value to the OwnerReferences field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the OwnerReferences field. 
+func (b *InferenceModelApplyConfiguration) WithOwnerReferences(values ...*v1.OwnerReferenceApplyConfiguration) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + if values[i] == nil { + panic("nil value passed to WithOwnerReferences") + } + b.ObjectMetaApplyConfiguration.OwnerReferences = append(b.ObjectMetaApplyConfiguration.OwnerReferences, *values[i]) + } + return b +} + +// WithFinalizers adds the given value to the Finalizers field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Finalizers field. +func (b *InferenceModelApplyConfiguration) WithFinalizers(values ...string) *InferenceModelApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + b.ObjectMetaApplyConfiguration.Finalizers = append(b.ObjectMetaApplyConfiguration.Finalizers, values[i]) + } + return b +} + +func (b *InferenceModelApplyConfiguration) ensureObjectMetaApplyConfigurationExists() { + if b.ObjectMetaApplyConfiguration == nil { + b.ObjectMetaApplyConfiguration = &v1.ObjectMetaApplyConfiguration{} + } +} + +// WithSpec sets the Spec field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Spec field is set to the value of the last call. +func (b *InferenceModelApplyConfiguration) WithSpec(value *InferenceModelSpecApplyConfiguration) *InferenceModelApplyConfiguration { + b.Spec = value + return b +} + +// WithStatus sets the Status field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Status field is set to the value of the last call. 
+func (b *InferenceModelApplyConfiguration) WithStatus(value *InferenceModelStatusApplyConfiguration) *InferenceModelApplyConfiguration { + b.Status = value + return b +} + +// GetName retrieves the value of the Name field in the declarative configuration. +func (b *InferenceModelApplyConfiguration) GetName() *string { + b.ensureObjectMetaApplyConfigurationExists() + return b.ObjectMetaApplyConfiguration.Name +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go new file mode 100644 index 00000000..438ccd48 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelspec.go @@ -0,0 +1,74 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// InferenceModelSpecApplyConfiguration represents a declarative configuration of the InferenceModelSpec type for use +// with apply. 
+type InferenceModelSpecApplyConfiguration struct { + ModelName *string `json:"modelName,omitempty"` + Criticality *apiv1alpha2.Criticality `json:"criticality,omitempty"` + TargetModels []TargetModelApplyConfiguration `json:"targetModels,omitempty"` + PoolRef *PoolObjectReferenceApplyConfiguration `json:"poolRef,omitempty"` +} + +// InferenceModelSpecApplyConfiguration constructs a declarative configuration of the InferenceModelSpec type for use with +// apply. +func InferenceModelSpec() *InferenceModelSpecApplyConfiguration { + return &InferenceModelSpecApplyConfiguration{} +} + +// WithModelName sets the ModelName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ModelName field is set to the value of the last call. +func (b *InferenceModelSpecApplyConfiguration) WithModelName(value string) *InferenceModelSpecApplyConfiguration { + b.ModelName = &value + return b +} + +// WithCriticality sets the Criticality field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Criticality field is set to the value of the last call. +func (b *InferenceModelSpecApplyConfiguration) WithCriticality(value apiv1alpha2.Criticality) *InferenceModelSpecApplyConfiguration { + b.Criticality = &value + return b +} + +// WithTargetModels adds the given value to the TargetModels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the TargetModels field. 
+func (b *InferenceModelSpecApplyConfiguration) WithTargetModels(values ...*TargetModelApplyConfiguration) *InferenceModelSpecApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithTargetModels") + } + b.TargetModels = append(b.TargetModels, *values[i]) + } + return b +} + +// WithPoolRef sets the PoolRef field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PoolRef field is set to the value of the last call. +func (b *InferenceModelSpecApplyConfiguration) WithPoolRef(value *PoolObjectReferenceApplyConfiguration) *InferenceModelSpecApplyConfiguration { + b.PoolRef = value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go new file mode 100644 index 00000000..e8142efe --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencemodelstatus.go @@ -0,0 +1,47 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + v1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// InferenceModelStatusApplyConfiguration represents a declarative configuration of the InferenceModelStatus type for use +// with apply. 
+type InferenceModelStatusApplyConfiguration struct { + Conditions []v1.ConditionApplyConfiguration `json:"conditions,omitempty"` +} + +// InferenceModelStatusApplyConfiguration constructs a declarative configuration of the InferenceModelStatus type for use with +// apply. +func InferenceModelStatus() *InferenceModelStatusApplyConfiguration { + return &InferenceModelStatusApplyConfiguration{} +} + +// WithConditions adds the given value to the Conditions field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Conditions field. +func (b *InferenceModelStatusApplyConfiguration) WithConditions(values ...*v1.ConditionApplyConfiguration) *InferenceModelStatusApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithConditions") + } + b.Conditions = append(b.Conditions, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepool.go b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go new file mode 100644 index 00000000..cd725cb6 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepool.go @@ -0,0 +1,224 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. 
+ +package v1alpha2 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + v1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// InferencePoolApplyConfiguration represents a declarative configuration of the InferencePool type for use +// with apply. +type InferencePoolApplyConfiguration struct { + v1.TypeMetaApplyConfiguration `json:",inline"` + *v1.ObjectMetaApplyConfiguration `json:"metadata,omitempty"` + Spec *InferencePoolSpecApplyConfiguration `json:"spec,omitempty"` + Status *InferencePoolStatusApplyConfiguration `json:"status,omitempty"` +} + +// InferencePool constructs a declarative configuration of the InferencePool type for use with +// apply. +func InferencePool(name, namespace string) *InferencePoolApplyConfiguration { + b := &InferencePoolApplyConfiguration{} + b.WithName(name) + b.WithNamespace(namespace) + b.WithKind("InferencePool") + b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha2") + return b +} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithKind(value string) *InferencePoolApplyConfiguration { + b.TypeMetaApplyConfiguration.Kind = &value + return b +} + +// WithAPIVersion sets the APIVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the APIVersion field is set to the value of the last call. 
+func (b *InferencePoolApplyConfiguration) WithAPIVersion(value string) *InferencePoolApplyConfiguration { + b.TypeMetaApplyConfiguration.APIVersion = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithName(value string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Name = &value + return b +} + +// WithGenerateName sets the GenerateName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GenerateName field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithGenerateName(value string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.GenerateName = &value + return b +} + +// WithNamespace sets the Namespace field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Namespace field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithNamespace(value string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Namespace = &value + return b +} + +// WithUID sets the UID field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the UID field is set to the value of the last call. 
+func (b *InferencePoolApplyConfiguration) WithUID(value types.UID) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.UID = &value + return b +} + +// WithResourceVersion sets the ResourceVersion field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ResourceVersion field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithResourceVersion(value string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.ResourceVersion = &value + return b +} + +// WithGeneration sets the Generation field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Generation field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithGeneration(value int64) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.Generation = &value + return b +} + +// WithCreationTimestamp sets the CreationTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the CreationTimestamp field is set to the value of the last call. 
+func (b *InferencePoolApplyConfiguration) WithCreationTimestamp(value metav1.Time) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.CreationTimestamp = &value + return b +} + +// WithDeletionTimestamp sets the DeletionTimestamp field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionTimestamp field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithDeletionTimestamp(value metav1.Time) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionTimestamp = &value + return b +} + +// WithDeletionGracePeriodSeconds sets the DeletionGracePeriodSeconds field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DeletionGracePeriodSeconds field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithDeletionGracePeriodSeconds(value int64) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + b.ObjectMetaApplyConfiguration.DeletionGracePeriodSeconds = &value + return b +} + +// WithLabels puts the entries into the Labels field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Labels field, +// overwriting an existing map entries in Labels field with the same key. 
+func (b *InferencePoolApplyConfiguration) WithLabels(entries map[string]string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Labels == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Labels = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Labels[k] = v + } + return b +} + +// WithAnnotations puts the entries into the Annotations field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Annotations field, +// overwriting an existing map entries in Annotations field with the same key. +func (b *InferencePoolApplyConfiguration) WithAnnotations(entries map[string]string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + if b.ObjectMetaApplyConfiguration.Annotations == nil && len(entries) > 0 { + b.ObjectMetaApplyConfiguration.Annotations = make(map[string]string, len(entries)) + } + for k, v := range entries { + b.ObjectMetaApplyConfiguration.Annotations[k] = v + } + return b +} + +// WithOwnerReferences adds the given value to the OwnerReferences field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the OwnerReferences field. 
+func (b *InferencePoolApplyConfiguration) WithOwnerReferences(values ...*v1.OwnerReferenceApplyConfiguration) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + if values[i] == nil { + panic("nil value passed to WithOwnerReferences") + } + b.ObjectMetaApplyConfiguration.OwnerReferences = append(b.ObjectMetaApplyConfiguration.OwnerReferences, *values[i]) + } + return b +} + +// WithFinalizers adds the given value to the Finalizers field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Finalizers field. +func (b *InferencePoolApplyConfiguration) WithFinalizers(values ...string) *InferencePoolApplyConfiguration { + b.ensureObjectMetaApplyConfigurationExists() + for i := range values { + b.ObjectMetaApplyConfiguration.Finalizers = append(b.ObjectMetaApplyConfiguration.Finalizers, values[i]) + } + return b +} + +func (b *InferencePoolApplyConfiguration) ensureObjectMetaApplyConfigurationExists() { + if b.ObjectMetaApplyConfiguration == nil { + b.ObjectMetaApplyConfiguration = &v1.ObjectMetaApplyConfiguration{} + } +} + +// WithSpec sets the Spec field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Spec field is set to the value of the last call. +func (b *InferencePoolApplyConfiguration) WithSpec(value *InferencePoolSpecApplyConfiguration) *InferencePoolApplyConfiguration { + b.Spec = value + return b +} + +// WithStatus sets the Status field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Status field is set to the value of the last call. 
+func (b *InferencePoolApplyConfiguration) WithStatus(value *InferencePoolStatusApplyConfiguration) *InferencePoolApplyConfiguration { + b.Status = value + return b +} + +// GetName retrieves the value of the Name field in the declarative configuration. +func (b *InferencePoolApplyConfiguration) GetName() *string { + b.ensureObjectMetaApplyConfigurationExists() + return b.ObjectMetaApplyConfiguration.Name +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go new file mode 100644 index 00000000..e4d5a97d --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolspec.go @@ -0,0 +1,66 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use +// with apply. +type InferencePoolSpecApplyConfiguration struct { + Selector map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue `json:"selector,omitempty"` + TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` + EndpointPickerConfigApplyConfiguration `json:",inline"` +} + +// InferencePoolSpecApplyConfiguration constructs a declarative configuration of the InferencePoolSpec type for use with +// apply. 
+func InferencePoolSpec() *InferencePoolSpecApplyConfiguration { + return &InferencePoolSpecApplyConfiguration{} +} + +// WithSelector puts the entries into the Selector field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, the entries provided by each call will be put on the Selector field, +// overwriting an existing map entries in Selector field with the same key. +func (b *InferencePoolSpecApplyConfiguration) WithSelector(entries map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue) *InferencePoolSpecApplyConfiguration { + if b.Selector == nil && len(entries) > 0 { + b.Selector = make(map[apiv1alpha2.LabelKey]apiv1alpha2.LabelValue, len(entries)) + } + for k, v := range entries { + b.Selector[k] = v + } + return b +} + +// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the TargetPortNumber field is set to the value of the last call. +func (b *InferencePoolSpecApplyConfiguration) WithTargetPortNumber(value int32) *InferencePoolSpecApplyConfiguration { + b.TargetPortNumber = &value + return b +} + +// WithExtensionRef sets the ExtensionRef field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the ExtensionRef field is set to the value of the last call. 
+func (b *InferencePoolSpecApplyConfiguration) WithExtensionRef(value *ExtensionApplyConfiguration) *InferencePoolSpecApplyConfiguration { + b.EndpointPickerConfigApplyConfiguration.ExtensionRef = value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go new file mode 100644 index 00000000..9587dabe --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/inferencepoolstatus.go @@ -0,0 +1,43 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +// InferencePoolStatusApplyConfiguration represents a declarative configuration of the InferencePoolStatus type for use +// with apply. +type InferencePoolStatusApplyConfiguration struct { + Parents []PoolStatusApplyConfiguration `json:"parent,omitempty"` +} + +// InferencePoolStatusApplyConfiguration constructs a declarative configuration of the InferencePoolStatus type for use with +// apply. +func InferencePoolStatus() *InferencePoolStatusApplyConfiguration { + return &InferencePoolStatusApplyConfiguration{} +} + +// WithParents adds the given value to the Parents field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Parents field. 
+func (b *InferencePoolStatusApplyConfiguration) WithParents(values ...*PoolStatusApplyConfiguration) *InferencePoolStatusApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithParents") + } + b.Parents = append(b.Parents, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go new file mode 100644 index 00000000..cc88c950 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go @@ -0,0 +1,56 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +// PoolObjectReferenceApplyConfiguration represents a declarative configuration of the PoolObjectReference type for use +// with apply. +type PoolObjectReferenceApplyConfiguration struct { + Group *string `json:"group,omitempty"` + Kind *string `json:"kind,omitempty"` + Name *string `json:"name,omitempty"` +} + +// PoolObjectReferenceApplyConfiguration constructs a declarative configuration of the PoolObjectReference type for use with +// apply. 
+func PoolObjectReference() *PoolObjectReferenceApplyConfiguration { + return &PoolObjectReferenceApplyConfiguration{} +} + +// WithGroup sets the Group field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Group field is set to the value of the last call. +func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObjectReferenceApplyConfiguration { + b.Group = &value + return b +} + +// WithKind sets the Kind field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Kind field is set to the value of the last call. +func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObjectReferenceApplyConfiguration { + b.Kind = &value + return b +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. +func (b *PoolObjectReferenceApplyConfiguration) WithName(value string) *PoolObjectReferenceApplyConfiguration { + b.Name = &value + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/poolstatus.go b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go new file mode 100644 index 00000000..bff29935 --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/poolstatus.go @@ -0,0 +1,57 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/client-go/applyconfigurations/meta/v1" +) + +// PoolStatusApplyConfiguration represents a declarative configuration of the PoolStatus type for use +// with apply. +type PoolStatusApplyConfiguration struct { + GatewayRef *v1.ObjectReference `json:"parentRef,omitempty"` + Conditions []metav1.ConditionApplyConfiguration `json:"conditions,omitempty"` +} + +// PoolStatusApplyConfiguration constructs a declarative configuration of the PoolStatus type for use with +// apply. +func PoolStatus() *PoolStatusApplyConfiguration { + return &PoolStatusApplyConfiguration{} +} + +// WithGatewayRef sets the GatewayRef field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the GatewayRef field is set to the value of the last call. +func (b *PoolStatusApplyConfiguration) WithGatewayRef(value v1.ObjectReference) *PoolStatusApplyConfiguration { + b.GatewayRef = &value + return b +} + +// WithConditions adds the given value to the Conditions field in the declarative configuration +// and returns the receiver, so that objects can be build by chaining "With" function invocations. +// If called multiple times, values provided by each call will be appended to the Conditions field. 
+func (b *PoolStatusApplyConfiguration) WithConditions(values ...*metav1.ConditionApplyConfiguration) *PoolStatusApplyConfiguration { + for i := range values { + if values[i] == nil { + panic("nil value passed to WithConditions") + } + b.Conditions = append(b.Conditions, *values[i]) + } + return b +} diff --git a/client-go/applyconfiguration/api/v1alpha2/targetmodel.go b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go new file mode 100644 index 00000000..4ed9b4bc --- /dev/null +++ b/client-go/applyconfiguration/api/v1alpha2/targetmodel.go @@ -0,0 +1,47 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by applyconfiguration-gen. DO NOT EDIT. + +package v1alpha2 + +// TargetModelApplyConfiguration represents a declarative configuration of the TargetModel type for use +// with apply. +type TargetModelApplyConfiguration struct { + Name *string `json:"name,omitempty"` + Weight *int32 `json:"weight,omitempty"` +} + +// TargetModelApplyConfiguration constructs a declarative configuration of the TargetModel type for use with +// apply. +func TargetModel() *TargetModelApplyConfiguration { + return &TargetModelApplyConfiguration{} +} + +// WithName sets the Name field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Name field is set to the value of the last call. 
+func (b *TargetModelApplyConfiguration) WithName(value string) *TargetModelApplyConfiguration { + b.Name = &value + return b +} + +// WithWeight sets the Weight field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Weight field is set to the value of the last call. +func (b *TargetModelApplyConfiguration) WithWeight(value int32) *TargetModelApplyConfiguration { + b.Weight = &value + return b +} diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index 677fa6e3..eacc9c43 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -22,7 +22,9 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" testing "k8s.io/client-go/testing" v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" internal "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" ) @@ -56,6 +58,34 @@ func ForKind(kind schema.GroupVersionKind) interface{} { case v1alpha1.SchemeGroupVersion.WithKind("TargetModel"): return &apiv1alpha1.TargetModelApplyConfiguration{} + // Group=inference.networking.x-k8s.io, Version=v1alpha2 + case v1alpha2.SchemeGroupVersion.WithKind("EndpointPickerConfig"): + return &apiv1alpha2.EndpointPickerConfigApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("Extension"): + return &apiv1alpha2.ExtensionApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("ExtensionConnection"): + return &apiv1alpha2.ExtensionConnectionApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("ExtensionReference"): + return 
&apiv1alpha2.ExtensionReferenceApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModel"): + return &apiv1alpha2.InferenceModelApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModelSpec"): + return &apiv1alpha2.InferenceModelSpecApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferenceModelStatus"): + return &apiv1alpha2.InferenceModelStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePool"): + return &apiv1alpha2.InferencePoolApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePoolSpec"): + return &apiv1alpha2.InferencePoolSpecApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("InferencePoolStatus"): + return &apiv1alpha2.InferencePoolStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("PoolObjectReference"): + return &apiv1alpha2.PoolObjectReferenceApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("PoolStatus"): + return &apiv1alpha2.PoolStatusApplyConfiguration{} + case v1alpha2.SchemeGroupVersion.WithKind("TargetModel"): + return &apiv1alpha2.TargetModelApplyConfiguration{} + } return nil } diff --git a/client-go/clientset/versioned/clientset.go b/client-go/clientset/versioned/clientset.go index b7ebc1d8..4266285a 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -25,17 +25,20 @@ import ( rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" ) type Interface interface { Discovery() discovery.DiscoveryInterface InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface + InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface } // Clientset contains the clients for groups. 
type Clientset struct { *discovery.DiscoveryClient inferenceV1alpha1 *inferencev1alpha1.InferenceV1alpha1Client + inferenceV1alpha2 *inferencev1alpha2.InferenceV1alpha2Client } // InferenceV1alpha1 retrieves the InferenceV1alpha1Client @@ -43,6 +46,11 @@ func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Inter return c.inferenceV1alpha1 } +// InferenceV1alpha2 retrieves the InferenceV1alpha2Client +func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { + return c.inferenceV1alpha2 +} + // Discovery retrieves the DiscoveryClient func (c *Clientset) Discovery() discovery.DiscoveryInterface { if c == nil { @@ -91,6 +99,10 @@ func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, if err != nil { return nil, err } + cs.inferenceV1alpha2, err = inferencev1alpha2.NewForConfigAndClient(&configShallowCopy, httpClient) + if err != nil { + return nil, err + } cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfigAndClient(&configShallowCopy, httpClient) if err != nil { @@ -113,6 +125,7 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { func New(c rest.Interface) *Clientset { var cs Clientset cs.inferenceV1alpha1 = inferencev1alpha1.New(c) + cs.inferenceV1alpha2 = inferencev1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index 1e54db31..f4f33032 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -27,6 +27,8 @@ import ( clientset "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" fakeinferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" + 
inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" + fakeinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2/fake" ) // NewSimpleClientset returns a clientset that will respond with the provided objects. @@ -119,3 +121,8 @@ var ( func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { return &fakeinferencev1alpha1.FakeInferenceV1alpha1{Fake: &c.Fake} } + +// InferenceV1alpha2 retrieves the InferenceV1alpha2Client +func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { + return &fakeinferencev1alpha2.FakeInferenceV1alpha2{Fake: &c.Fake} +} diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index b72a8ce3..bc8e6903 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -24,6 +24,7 @@ import ( serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) var scheme = runtime.NewScheme() @@ -31,6 +32,7 @@ var codecs = serializer.NewCodecFactory(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ inferencev1alpha1.AddToScheme, + inferencev1alpha2.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. 
This allows composition diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index c4c06158..5727d404 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -24,6 +24,7 @@ import ( serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) var Scheme = runtime.NewScheme() @@ -31,6 +32,7 @@ var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ inferencev1alpha1.AddToScheme, + inferencev1alpha2.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go new file mode 100644 index 00000000..b011ca92 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/api_client.go @@ -0,0 +1,111 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. 
+ +package v1alpha2 + +import ( + http "net/http" + + rest "k8s.io/client-go/rest" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" +) + +type InferenceV1alpha2Interface interface { + RESTClient() rest.Interface + InferenceModelsGetter + InferencePoolsGetter +} + +// InferenceV1alpha2Client is used to interact with features provided by the inference.networking.x-k8s.io group. +type InferenceV1alpha2Client struct { + restClient rest.Interface +} + +func (c *InferenceV1alpha2Client) InferenceModels(namespace string) InferenceModelInterface { + return newInferenceModels(c, namespace) +} + +func (c *InferenceV1alpha2Client) InferencePools(namespace string) InferencePoolInterface { + return newInferencePools(c, namespace) +} + +// NewForConfig creates a new InferenceV1alpha2Client for the given config. +// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), +// where httpClient was generated with rest.HTTPClientFor(c). +func NewForConfig(c *rest.Config) (*InferenceV1alpha2Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + httpClient, err := rest.HTTPClientFor(&config) + if err != nil { + return nil, err + } + return NewForConfigAndClient(&config, httpClient) +} + +// NewForConfigAndClient creates a new InferenceV1alpha2Client for the given config and http client. +// Note the http client provided takes precedence over the configured transport values. 
+func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha2Client, error) { + config := *c + if err := setConfigDefaults(&config); err != nil { + return nil, err + } + client, err := rest.RESTClientForConfigAndClient(&config, h) + if err != nil { + return nil, err + } + return &InferenceV1alpha2Client{client}, nil +} + +// NewForConfigOrDie creates a new InferenceV1alpha2Client for the given config and +// panics if there is an error in the config. +func NewForConfigOrDie(c *rest.Config) *InferenceV1alpha2Client { + client, err := NewForConfig(c) + if err != nil { + panic(err) + } + return client +} + +// New creates a new InferenceV1alpha2Client for the given RESTClient. +func New(c rest.Interface) *InferenceV1alpha2Client { + return &InferenceV1alpha2Client{c} +} + +func setConfigDefaults(config *rest.Config) error { + gv := apiv1alpha2.SchemeGroupVersion + config.GroupVersion = &gv + config.APIPath = "/apis" + config.NegotiatedSerializer = rest.CodecFactoryForGeneratedClient(scheme.Scheme, scheme.Codecs).WithoutConversion() + + if config.UserAgent == "" { + config.UserAgent = rest.DefaultKubernetesUserAgent() + } + + return nil +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *InferenceV1alpha2Client) RESTClient() rest.Interface { + if c == nil { + return nil + } + return c.restClient +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go new file mode 100644 index 00000000..2bcba220 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +// This package has the automatically generated typed clients. +package v1alpha2 diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go new file mode 100644 index 00000000..fbfccbb9 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/doc.go @@ -0,0 +1,19 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +// Package fake has the automatically generated clients. +package fake diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go new file mode 100644 index 00000000..0296608c --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_api_client.go @@ -0,0 +1,43 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + rest "k8s.io/client-go/rest" + testing "k8s.io/client-go/testing" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" +) + +type FakeInferenceV1alpha2 struct { + *testing.Fake +} + +func (c *FakeInferenceV1alpha2) InferenceModels(namespace string) v1alpha2.InferenceModelInterface { + return newFakeInferenceModels(c, namespace) +} + +func (c *FakeInferenceV1alpha2) InferencePools(namespace string) v1alpha2.InferencePoolInterface { + return newFakeInferencePools(c, namespace) +} + +// RESTClient returns a RESTClient that is used to communicate +// with API server by this client implementation. +func (c *FakeInferenceV1alpha2) RESTClient() rest.Interface { + var ret *rest.RESTClient + return ret +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go new file mode 100644 index 00000000..2492a557 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencemodel.go @@ -0,0 +1,52 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + gentype "k8s.io/client-go/gentype" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + typedapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" +) + +// fakeInferenceModels implements InferenceModelInterface +type fakeInferenceModels struct { + *gentype.FakeClientWithListAndApply[*v1alpha2.InferenceModel, *v1alpha2.InferenceModelList, *apiv1alpha2.InferenceModelApplyConfiguration] + Fake *FakeInferenceV1alpha2 +} + +func newFakeInferenceModels(fake *FakeInferenceV1alpha2, namespace string) typedapiv1alpha2.InferenceModelInterface { + return &fakeInferenceModels{ + gentype.NewFakeClientWithListAndApply[*v1alpha2.InferenceModel, *v1alpha2.InferenceModelList, *apiv1alpha2.InferenceModelApplyConfiguration]( + fake.Fake, + namespace, + v1alpha2.SchemeGroupVersion.WithResource("inferencemodels"), + v1alpha2.SchemeGroupVersion.WithKind("InferenceModel"), + func() *v1alpha2.InferenceModel { return &v1alpha2.InferenceModel{} }, + func() *v1alpha2.InferenceModelList { return &v1alpha2.InferenceModelList{} }, + func(dst, src *v1alpha2.InferenceModelList) { dst.ListMeta = src.ListMeta }, + func(list *v1alpha2.InferenceModelList) []*v1alpha2.InferenceModel { + return gentype.ToPointerSlice(list.Items) + }, + func(list *v1alpha2.InferenceModelList, items []*v1alpha2.InferenceModel) { + list.Items = gentype.FromPointerSlice(items) + 
}, + ), + fake, + } +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go new file mode 100644 index 00000000..64b087dd --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/fake/fake_inferencepool.go @@ -0,0 +1,52 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package fake + +import ( + gentype "k8s.io/client-go/gentype" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + typedapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" +) + +// fakeInferencePools implements InferencePoolInterface +type fakeInferencePools struct { + *gentype.FakeClientWithListAndApply[*v1alpha2.InferencePool, *v1alpha2.InferencePoolList, *apiv1alpha2.InferencePoolApplyConfiguration] + Fake *FakeInferenceV1alpha2 +} + +func newFakeInferencePools(fake *FakeInferenceV1alpha2, namespace string) typedapiv1alpha2.InferencePoolInterface { + return &fakeInferencePools{ + gentype.NewFakeClientWithListAndApply[*v1alpha2.InferencePool, *v1alpha2.InferencePoolList, *apiv1alpha2.InferencePoolApplyConfiguration]( + fake.Fake, + namespace, + v1alpha2.SchemeGroupVersion.WithResource("inferencepools"), + 
v1alpha2.SchemeGroupVersion.WithKind("InferencePool"), + func() *v1alpha2.InferencePool { return &v1alpha2.InferencePool{} }, + func() *v1alpha2.InferencePoolList { return &v1alpha2.InferencePoolList{} }, + func(dst, src *v1alpha2.InferencePoolList) { dst.ListMeta = src.ListMeta }, + func(list *v1alpha2.InferencePoolList) []*v1alpha2.InferencePool { + return gentype.ToPointerSlice(list.Items) + }, + func(list *v1alpha2.InferencePoolList, items []*v1alpha2.InferencePool) { + list.Items = gentype.FromPointerSlice(items) + }, + ), + fake, + } +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go new file mode 100644 index 00000000..399789d8 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/generated_expansion.go @@ -0,0 +1,22 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha2 + +type InferenceModelExpansion interface{} + +type InferencePoolExpansion interface{} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go new file mode 100644 index 00000000..ee0d92c1 --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencemodel.go @@ -0,0 +1,73 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + context "context" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + gentype "k8s.io/client-go/gentype" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + applyconfigurationapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" +) + +// InferenceModelsGetter has a method to return a InferenceModelInterface. +// A group's client should implement this interface. +type InferenceModelsGetter interface { + InferenceModels(namespace string) InferenceModelInterface +} + +// InferenceModelInterface has methods to work with InferenceModel resources. +type InferenceModelInterface interface { + Create(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.CreateOptions) (*apiv1alpha2.InferenceModel, error) + Update(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha2.InferenceModel, error) + // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 
+ UpdateStatus(ctx context.Context, inferenceModel *apiv1alpha2.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha2.InferenceModel, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha2.InferenceModel, error) + List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha2.InferenceModelList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha2.InferenceModel, err error) + Apply(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferenceModel, err error) + // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). 
+ ApplyStatus(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferenceModel, err error) + InferenceModelExpansion +} + +// inferenceModels implements InferenceModelInterface +type inferenceModels struct { + *gentype.ClientWithListAndApply[*apiv1alpha2.InferenceModel, *apiv1alpha2.InferenceModelList, *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration] +} + +// newInferenceModels returns a InferenceModels +func newInferenceModels(c *InferenceV1alpha2Client, namespace string) *inferenceModels { + return &inferenceModels{ + gentype.NewClientWithListAndApply[*apiv1alpha2.InferenceModel, *apiv1alpha2.InferenceModelList, *applyconfigurationapiv1alpha2.InferenceModelApplyConfiguration]( + "inferencemodels", + c.RESTClient(), + scheme.ParameterCodec, + namespace, + func() *apiv1alpha2.InferenceModel { return &apiv1alpha2.InferenceModel{} }, + func() *apiv1alpha2.InferenceModelList { return &apiv1alpha2.InferenceModelList{} }, + ), + } +} diff --git a/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go new file mode 100644 index 00000000..8482451e --- /dev/null +++ b/client-go/clientset/versioned/typed/api/v1alpha2/inferencepool.go @@ -0,0 +1,73 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by client-gen. DO NOT EDIT. 
+ +package v1alpha2 + +import ( + context "context" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + types "k8s.io/apimachinery/pkg/types" + watch "k8s.io/apimachinery/pkg/watch" + gentype "k8s.io/client-go/gentype" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + applyconfigurationapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" + scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" +) + +// InferencePoolsGetter has a method to return a InferencePoolInterface. +// A group's client should implement this interface. +type InferencePoolsGetter interface { + InferencePools(namespace string) InferencePoolInterface +} + +// InferencePoolInterface has methods to work with InferencePool resources. +type InferencePoolInterface interface { + Create(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.CreateOptions) (*apiv1alpha2.InferencePool, error) + Update(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.UpdateOptions) (*apiv1alpha2.InferencePool, error) + // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 
+ UpdateStatus(ctx context.Context, inferencePool *apiv1alpha2.InferencePool, opts v1.UpdateOptions) (*apiv1alpha2.InferencePool, error) + Delete(ctx context.Context, name string, opts v1.DeleteOptions) error + DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error + Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha2.InferencePool, error) + List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha2.InferencePoolList, error) + Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) + Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha2.InferencePool, err error) + Apply(ctx context.Context, inferencePool *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferencePool, err error) + // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). + ApplyStatus(ctx context.Context, inferencePool *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha2.InferencePool, err error) + InferencePoolExpansion +} + +// inferencePools implements InferencePoolInterface +type inferencePools struct { + *gentype.ClientWithListAndApply[*apiv1alpha2.InferencePool, *apiv1alpha2.InferencePoolList, *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration] +} + +// newInferencePools returns a InferencePools +func newInferencePools(c *InferenceV1alpha2Client, namespace string) *inferencePools { + return &inferencePools{ + gentype.NewClientWithListAndApply[*apiv1alpha2.InferencePool, *apiv1alpha2.InferencePoolList, *applyconfigurationapiv1alpha2.InferencePoolApplyConfiguration]( + "inferencepools", + c.RESTClient(), + scheme.ParameterCodec, + namespace, + func() *apiv1alpha2.InferencePool { return &apiv1alpha2.InferencePool{} }, + func() *apiv1alpha2.InferencePoolList { return 
&apiv1alpha2.InferencePoolList{} }, + ), + } +} diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index fbf5ba09..210b89f8 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -19,6 +19,7 @@ package api import ( v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha2" internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) @@ -26,6 +27,8 @@ import ( type Interface interface { // V1alpha1 provides access to shared informers for resources in V1alpha1. V1alpha1() v1alpha1.Interface + // V1alpha2 provides access to shared informers for resources in V1alpha2. + V1alpha2() v1alpha2.Interface } type group struct { @@ -43,3 +46,8 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList func (g *group) V1alpha1() v1alpha1.Interface { return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) } + +// V1alpha2 returns a new v1alpha2.Interface. +func (g *group) V1alpha2() v1alpha2.Interface { + return v1alpha2.New(g.factory, g.namespace, g.tweakListOptions) +} diff --git a/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go new file mode 100644 index 00000000..74f640d1 --- /dev/null +++ b/client-go/informers/externalversions/api/v1alpha2/inferencemodel.go @@ -0,0 +1,89 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + context "context" + time "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha2" +) + +// InferenceModelInformer provides access to a shared informer and lister for +// InferenceModels. +type InferenceModelInformer interface { + Informer() cache.SharedIndexInformer + Lister() apiv1alpha2.InferenceModelLister +} + +type inferenceModelInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string +} + +// NewInferenceModelInformer constructs a new informer for InferenceModel type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewInferenceModelInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredInferenceModelInformer(client, namespace, resyncPeriod, indexers, nil) +} + +// NewFilteredInferenceModelInformer constructs a new informer for InferenceModel type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewFilteredInferenceModelInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferenceModels(namespace).List(context.TODO(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferenceModels(namespace).Watch(context.TODO(), options) + }, + }, + &gatewayapiinferenceextensionapiv1alpha2.InferenceModel{}, + resyncPeriod, + indexers, + ) +} + +func (f *inferenceModelInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredInferenceModelInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *inferenceModelInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha2.InferenceModel{}, f.defaultInformer) +} + +func (f *inferenceModelInformer) Lister() apiv1alpha2.InferenceModelLister { + return 
apiv1alpha2.NewInferenceModelLister(f.Informer().GetIndexer()) +} diff --git a/client-go/informers/externalversions/api/v1alpha2/inferencepool.go b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go new file mode 100644 index 00000000..d04591dd --- /dev/null +++ b/client-go/informers/externalversions/api/v1alpha2/inferencepool.go @@ -0,0 +1,89 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + context "context" + time "time" + + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + runtime "k8s.io/apimachinery/pkg/runtime" + watch "k8s.io/apimachinery/pkg/watch" + cache "k8s.io/client-go/tools/cache" + gatewayapiinferenceextensionapiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha2" +) + +// InferencePoolInformer provides access to a shared informer and lister for +// InferencePools. 
+type InferencePoolInformer interface { + Informer() cache.SharedIndexInformer + Lister() apiv1alpha2.InferencePoolLister +} + +type inferencePoolInformer struct { + factory internalinterfaces.SharedInformerFactory + tweakListOptions internalinterfaces.TweakListOptionsFunc + namespace string +} + +// NewInferencePoolInformer constructs a new informer for InferencePool type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. +func NewInferencePoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { + return NewFilteredInferencePoolInformer(client, namespace, resyncPeriod, indexers, nil) +} + +// NewFilteredInferencePoolInformer constructs a new informer for InferencePool type. +// Always prefer using an informer factory to get a shared informer instead of getting an independent +// one. This reduces memory footprint and number of connections to the server. 
+func NewFilteredInferencePoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { + return cache.NewSharedIndexInformer( + &cache.ListWatch{ + ListFunc: func(options v1.ListOptions) (runtime.Object, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferencePools(namespace).List(context.TODO(), options) + }, + WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { + if tweakListOptions != nil { + tweakListOptions(&options) + } + return client.InferenceV1alpha2().InferencePools(namespace).Watch(context.TODO(), options) + }, + }, + &gatewayapiinferenceextensionapiv1alpha2.InferencePool{}, + resyncPeriod, + indexers, + ) +} + +func (f *inferencePoolInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { + return NewFilteredInferencePoolInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) +} + +func (f *inferencePoolInformer) Informer() cache.SharedIndexInformer { + return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha2.InferencePool{}, f.defaultInformer) +} + +func (f *inferencePoolInformer) Lister() apiv1alpha2.InferencePoolLister { + return apiv1alpha2.NewInferencePoolLister(f.Informer().GetIndexer()) +} diff --git a/client-go/informers/externalversions/api/v1alpha2/interface.go b/client-go/informers/externalversions/api/v1alpha2/interface.go new file mode 100644 index 00000000..9e5c4d9c --- /dev/null +++ b/client-go/informers/externalversions/api/v1alpha2/interface.go @@ -0,0 +1,51 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by informer-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" +) + +// Interface provides access to all the informers in this group version. +type Interface interface { + // InferenceModels returns a InferenceModelInformer. + InferenceModels() InferenceModelInformer + // InferencePools returns a InferencePoolInformer. + InferencePools() InferencePoolInformer +} + +type version struct { + factory internalinterfaces.SharedInformerFactory + namespace string + tweakListOptions internalinterfaces.TweakListOptionsFunc +} + +// New returns a new Interface. +func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { + return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} +} + +// InferenceModels returns a InferenceModelInformer. +func (v *version) InferenceModels() InferenceModelInformer { + return &inferenceModelInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} +} + +// InferencePools returns a InferencePoolInformer. 
+func (v *version) InferencePools() InferencePoolInformer { + return &inferencePoolInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} +} diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index 672998f5..9f363d88 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -23,6 +23,7 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // GenericInformer is type of SharedIndexInformer which will locate and delegate to other @@ -57,6 +58,12 @@ func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource case v1alpha1.SchemeGroupVersion.WithResource("inferencepools"): return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha1().InferencePools().Informer()}, nil + // Group=inference.networking.x-k8s.io, Version=v1alpha2 + case v1alpha2.SchemeGroupVersion.WithResource("inferencemodels"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha2().InferenceModels().Informer()}, nil + case v1alpha2.SchemeGroupVersion.WithResource("inferencepools"): + return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha2().InferencePools().Informer()}, nil + } return nil, fmt.Errorf("no informer found for %v", resource) diff --git a/client-go/listers/api/v1alpha2/expansion_generated.go b/client-go/listers/api/v1alpha2/expansion_generated.go new file mode 100644 index 00000000..204c375b --- /dev/null +++ b/client-go/listers/api/v1alpha2/expansion_generated.go @@ -0,0 +1,34 @@ +/* +Copyright 2024 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha2 + +// InferenceModelListerExpansion allows custom methods to be added to +// InferenceModelLister. +type InferenceModelListerExpansion interface{} + +// InferenceModelNamespaceListerExpansion allows custom methods to be added to +// InferenceModelNamespaceLister. +type InferenceModelNamespaceListerExpansion interface{} + +// InferencePoolListerExpansion allows custom methods to be added to +// InferencePoolLister. +type InferencePoolListerExpansion interface{} + +// InferencePoolNamespaceListerExpansion allows custom methods to be added to +// InferencePoolNamespaceLister. +type InferencePoolNamespaceListerExpansion interface{} diff --git a/client-go/listers/api/v1alpha2/inferencemodel.go b/client-go/listers/api/v1alpha2/inferencemodel.go new file mode 100644 index 00000000..ce83b85f --- /dev/null +++ b/client-go/listers/api/v1alpha2/inferencemodel.go @@ -0,0 +1,69 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + labels "k8s.io/apimachinery/pkg/labels" + listers "k8s.io/client-go/listers" + cache "k8s.io/client-go/tools/cache" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// InferenceModelLister helps list InferenceModels. +// All objects returned here must be treated as read-only. +type InferenceModelLister interface { + // List lists all InferenceModels in the indexer. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*apiv1alpha2.InferenceModel, err error) + // InferenceModels returns an object that can list and get InferenceModels. + InferenceModels(namespace string) InferenceModelNamespaceLister + InferenceModelListerExpansion +} + +// inferenceModelLister implements the InferenceModelLister interface. +type inferenceModelLister struct { + listers.ResourceIndexer[*apiv1alpha2.InferenceModel] +} + +// NewInferenceModelLister returns a new InferenceModelLister. +func NewInferenceModelLister(indexer cache.Indexer) InferenceModelLister { + return &inferenceModelLister{listers.New[*apiv1alpha2.InferenceModel](indexer, apiv1alpha2.Resource("inferencemodel"))} +} + +// InferenceModels returns an object that can list and get InferenceModels. +func (s *inferenceModelLister) InferenceModels(namespace string) InferenceModelNamespaceLister { + return inferenceModelNamespaceLister{listers.NewNamespaced[*apiv1alpha2.InferenceModel](s.ResourceIndexer, namespace)} +} + +// InferenceModelNamespaceLister helps list and get InferenceModels. +// All objects returned here must be treated as read-only. +type InferenceModelNamespaceLister interface { + // List lists all InferenceModels in the indexer for a given namespace. + // Objects returned here must be treated as read-only. 
+ List(selector labels.Selector) (ret []*apiv1alpha2.InferenceModel, err error) + // Get retrieves the InferenceModel from the indexer for a given namespace and name. + // Objects returned here must be treated as read-only. + Get(name string) (*apiv1alpha2.InferenceModel, error) + InferenceModelNamespaceListerExpansion +} + +// inferenceModelNamespaceLister implements the InferenceModelNamespaceLister +// interface. +type inferenceModelNamespaceLister struct { + listers.ResourceIndexer[*apiv1alpha2.InferenceModel] +} diff --git a/client-go/listers/api/v1alpha2/inferencepool.go b/client-go/listers/api/v1alpha2/inferencepool.go new file mode 100644 index 00000000..c7e49a1e --- /dev/null +++ b/client-go/listers/api/v1alpha2/inferencepool.go @@ -0,0 +1,69 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +// Code generated by lister-gen. DO NOT EDIT. + +package v1alpha2 + +import ( + labels "k8s.io/apimachinery/pkg/labels" + listers "k8s.io/client-go/listers" + cache "k8s.io/client-go/tools/cache" + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// InferencePoolLister helps list InferencePools. +// All objects returned here must be treated as read-only. +type InferencePoolLister interface { + // List lists all InferencePools in the indexer. + // Objects returned here must be treated as read-only. 
+ List(selector labels.Selector) (ret []*apiv1alpha2.InferencePool, err error) + // InferencePools returns an object that can list and get InferencePools. + InferencePools(namespace string) InferencePoolNamespaceLister + InferencePoolListerExpansion +} + +// inferencePoolLister implements the InferencePoolLister interface. +type inferencePoolLister struct { + listers.ResourceIndexer[*apiv1alpha2.InferencePool] +} + +// NewInferencePoolLister returns a new InferencePoolLister. +func NewInferencePoolLister(indexer cache.Indexer) InferencePoolLister { + return &inferencePoolLister{listers.New[*apiv1alpha2.InferencePool](indexer, apiv1alpha2.Resource("inferencepool"))} +} + +// InferencePools returns an object that can list and get InferencePools. +func (s *inferencePoolLister) InferencePools(namespace string) InferencePoolNamespaceLister { + return inferencePoolNamespaceLister{listers.NewNamespaced[*apiv1alpha2.InferencePool](s.ResourceIndexer, namespace)} +} + +// InferencePoolNamespaceLister helps list and get InferencePools. +// All objects returned here must be treated as read-only. +type InferencePoolNamespaceLister interface { + // List lists all InferencePools in the indexer for a given namespace. + // Objects returned here must be treated as read-only. + List(selector labels.Selector) (ret []*apiv1alpha2.InferencePool, err error) + // Get retrieves the InferencePool from the indexer for a given namespace and name. + // Objects returned here must be treated as read-only. + Get(name string) (*apiv1alpha2.InferencePool, error) + InferencePoolNamespaceListerExpansion +} + +// inferencePoolNamespaceLister implements the InferencePoolNamespaceLister +// interface. 
+type inferencePoolNamespaceLister struct { + listers.ResourceIndexer[*apiv1alpha2.InferencePool] +} diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 1f76cfab..dd47fa27 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" @@ -104,6 +105,8 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(v1alpha1.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) + } func main() { diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index bca19605..09258c20 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -235,6 +235,230 @@ spec: type: object type: object served: true + storage: false + subresources: + status: {} + - name: v1alpha2 + schema: + openAPIV3Schema: + description: InferenceModel is the Schema for the InferenceModels API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. 
+ In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + InferenceModelSpec represents the desired state of a specific model use case. This resource is + managed by the "Inference Workload Owner" persona. + + The Inference Workload Owner persona is someone that trains, verifies, and + leverages a large language model from a model frontend, drives the lifecycle + and rollout of new versions of those models, and defines the specific + performance and latency goals for the model. These workloads are + expected to operate within an InferencePool sharing compute capacity with other + InferenceModels, defined by the Inference Platform Admin. + + InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, + if the name is reused, an error will be shown on the status of a + InferenceModel that attempted to reuse. The oldest InferenceModel, based on + creation timestamp, will be selected to remain valid. In the event of a race + condition, one will be selected at random. + properties: + criticality: + description: |- + Criticality defines how important it is to serve the model compared to other models referencing the same pool. + Criticality impacts how traffic is handled in resource constrained situations. It handles this by + queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will + fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, + and the proportionality of fairness will be configurable. + + Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. + Any implementations that may consume this field may treat an unset value as the 'Standard' range. 
+ enum: + - Critical + - Standard + - Sheddable + type: string + modelName: + description: |- + ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. + ModelNames must be unique for a referencing InferencePool + (names can be reused for a different pool in the same cluster). + The modelName with the oldest creation timestamp is retained, and the incoming + InferenceModel is sets the Ready status to false with a corresponding reason. + In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. + Names can be reserved without an underlying model configured in the pool. + This can be done by specifying a target model and setting the weight to zero, + an error will be returned specifying that no valid target model is found. + maxLength: 256 + type: string + poolRef: + description: PoolRef is a reference to the inference pool, the pool + must exist in the same namespace. + properties: + group: + default: inference.networking.x-k8s.io + description: Group is the group of the referent. + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ + type: string + kind: + default: InferencePool + description: Kind is kind of the referent. For example "InferencePool". + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ + type: string + name: + description: Name is the name of the referent. + maxLength: 253 + minLength: 1 + type: string + required: + - name + type: object + targetModels: + description: |- + TargetModels allow multiple versions of a model for traffic splitting. + If not specified, the target model name is defaulted to the modelName parameter. + modelName is often in reference to a LoRA adapter. + items: + description: |- + TargetModel represents a deployed model or a LoRA adapter. 
The + Name field is expected to match the name of the LoRA adapter + (or base model) as it is registered within the model server. Inference + Gateway assumes that the model exists on the model server and it's the + responsibility of the user to validate a correct match. Should a model fail + to exist at request time, the error is processed by the Inference Gateway + and emitted on the appropriate InferenceModel object. + properties: + name: + description: Name is the name of the adapter or base model, + as expected by the ModelServer. + maxLength: 253 + type: string + weight: + description: |- + Weight is used to determine the proportion of traffic that should be + sent to this model when multiple target models are specified. + + Weight defines the proportion of requests forwarded to the specified + model. This is computed as weight/(sum of all weights in this + TargetModels list). For non-zero values, there may be some epsilon from + the exact proportion defined here depending on the precision an + implementation supports. Weight is not a percentage and the sum of + weights does not need to equal 100. + + If a weight is set for any targetModel, it must be set for all targetModels. + Conversely weights are optional, so long as ALL targetModels do not specify a weight. + format: int32 + maximum: 1000000 + minimum: 0 + type: integer + required: + - name + type: object + maxItems: 10 + type: array + x-kubernetes-validations: + - message: Weights should be set for all models, or none of the models. + rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) + required: + - modelName + - poolRef + type: object + status: + description: InferenceModelStatus defines the observed state of InferenceModel + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferenceModel. 
+ + Known condition types are: + + * "Accepted" + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + type: object + type: object + served: true storage: true subresources: status: {} diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 9e6473b9..918e95cb 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -201,6 +201,258 @@ spec: type: object type: object served: true + storage: false + subresources: + status: {} + - name: v1alpha2 + schema: + openAPIV3Schema: + description: InferencePool is the Schema for the InferencePools API. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: InferencePoolSpec defines the desired state of InferencePool + properties: + extensionRef: + description: Extension configures an endpoint picker as an extension + service. 
+ properties: + failureMode: + default: FailClose + description: |- + Configures how the gateway handles the case when the extension is not responsive. + Defaults to failClose. + enum: + - FailOpen + - FailClose + type: string + group: + default: "" + description: |- + Group is the group of the referent. + When unspecified or empty string, core API group is inferred. + type: string + kind: + default: Service + description: |- + Kind is the Kubernetes resource kind of the referent. For example + "Service". + + Defaults to "Service" when not specified. + + ExternalName services can refer to CNAME DNS records that may live + outside of the cluster and as such are difficult to reason about in + terms of conformance. They also may not be safe to forward to (see + CVE-2021-25740 for more information). Implementations MUST NOT + support ExternalName Services. + type: string + name: + description: Name is the name of the referent. + type: string + targetPortNumber: + description: |- + The port number on the service running the extension. When unspecified, implementations SHOULD infer a + default value of 9002 when the Kind is Service. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - name + type: object + selector: + additionalProperties: + description: |- + LabelValue is the value of a label. This is used for validation + of maps. This matches the Kubernetes label validation rules: + * must be 63 characters or less (can be empty), + * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), + * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. + + Valid values include: + + * MyValue + * my.name + * 123-my-value + maxLength: 63 + minLength: 0 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + type: string + description: |- + Selector defines a map of labels to watch model server pods + that should be included in the InferencePool. 
+ In some cases, implementations may translate this field to a Service selector, so this matches the simple + map used for Service selectors instead of the full Kubernetes LabelSelector type. + type: object + targetPortNumber: + description: |- + TargetPortNumber defines the port number to access the selected model servers. + The number must be in the range 1 to 65535. + format: int32 + maximum: 65535 + minimum: 1 + type: integer + required: + - extensionRef + - selector + - targetPortNumber + type: object + status: + description: InferencePoolStatus defines the observed state of InferencePool + properties: + parent: + description: |- + Parents is a list of parent resources (usually Gateways) that are + associated with the route, and the status of the InferencePool with respect to + each parent. + + A maximum of 32 Gateways will be represented in this list. An empty list + means the route has not been attached to any Gateway. + items: + description: PoolStatus defines the observed state of InferencePool + from a gateway. + properties: + conditions: + default: + - lastTransitionTime: "1970-01-01T00:00:00Z" + message: Waiting for controller + reason: Pending + status: Unknown + type: Ready + description: |- + Conditions track the state of the InferencePool. + + Known condition types are: + + * "Ready" + items: + description: Condition contains details for one aspect of + the current state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, + Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + maxItems: 8 + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + parentRef: + description: GatewayRef indicates the gateway that observed + state of InferencePool. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. 
+ For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + required: + - parentRef + type: object + maxItems: 32 + type: array + type: object + type: object + served: true storage: true subresources: status: {} diff --git a/pkg/epp/backend/fake.go b/pkg/epp/backend/fake.go index e81b3817..06f14f69 100644 --- a/pkg/epp/backend/fake.go +++ b/pkg/epp/backend/fake.go @@ -21,7 +21,7 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -40,9 +40,9 @@ func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *datas } type FakeDataStore struct { - Res map[string]*v1alpha1.InferenceModel + Res map[string]*v1alpha2.InferenceModel } -func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha1.InferenceModel) { +func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) { return fds.Res[modelName] } diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 99a1eb26..00358740 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -27,7 +27,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -49,7 +49,7 @@ func (c *InferenceModelReconciler) 
Reconcile(ctx context.Context, req ctrl.Reque loggerDefault := logger.V(logutil.DEFAULT) loggerDefault.Info("Reconciling InferenceModel", "name", req.NamespacedName) - infModel := &v1alpha1.InferenceModel{} + infModel := &v1alpha2.InferenceModel{} if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { if errors.IsNotFound(err) { loggerDefault.Info("InferenceModel not found. Removing from datastore since object must be deleted", "name", req.NamespacedName) @@ -68,7 +68,7 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, nil } -func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel *v1alpha1.InferenceModel) { +func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel *v1alpha2.InferenceModel) { loggerDefault := logger.V(logutil.DEFAULT) if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { @@ -84,6 +84,6 @@ func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&v1alpha1.InferenceModel{}). + For(&v1alpha2.InferenceModel{}). 
Complete(c) } diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index cf94b168..cea7bf42 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -28,34 +28,34 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) var ( - infModel1 = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ + infModel1 = &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake model1", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-service", }, } - infModel1Modified = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ + infModel1Modified = &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake model1", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-poolio"}, + PoolRef: v1alpha2.PoolObjectReference{Name: "test-poolio"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-service", }, } - infModel2 = &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ + infModel2 = &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-service-2", @@ -69,14 +69,14 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { tests := []struct { name string datastore datastore.Datastore - incomingService *v1alpha1.InferenceModel + incomingService *v1alpha2.InferenceModel 
wantInferenceModels *sync.Map }{ { name: "No Services registered; valid, new service incoming.", - datastore: datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + datastore: datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-pool", @@ -89,9 +89,9 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, { name: "Removing existing service.", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-pool", @@ -103,19 +103,19 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, { name: "Unrelated service, do nothing.", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-pool", ResourceVersion: "Old and boring", }, }), - incomingService: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ + incomingService: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-poolio"}, + 
PoolRef: v1alpha2.PoolObjectReference{Name: "test-poolio"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "unrelated-service", @@ -125,9 +125,9 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { }, { name: "Add to existing", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm"}, + datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, }, ObjectMeta: metav1.ObjectMeta{ Name: "test-pool", @@ -164,13 +164,13 @@ func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { func TestReconcile_ResourceNotFound(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() - _ = v1alpha1.AddToScheme(scheme) + _ = v1alpha2.AddToScheme(scheme) // Create a fake client with no InferenceModel objects. fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() // Create a minimal datastore. - datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }) @@ -201,20 +201,20 @@ func TestReconcile_ResourceNotFound(t *testing.T) { func TestReconcile_ModelMarkedForDeletion(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() - _ = v1alpha1.AddToScheme(scheme) + _ = v1alpha2.AddToScheme(scheme) // Create an InferenceModel object. 
now := metav1.Now() - existingModel := &v1alpha1.InferenceModel{ + existingModel := &v1alpha2.InferenceModel{ ObjectMeta: metav1.ObjectMeta{ Name: "existing-model", Namespace: "default", DeletionTimestamp: &now, Finalizers: []string{"finalizer"}, }, - Spec: v1alpha1.InferenceModelSpec{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake-model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, }, } @@ -222,7 +222,7 @@ func TestReconcile_ModelMarkedForDeletion(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. - datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }) @@ -258,17 +258,17 @@ func TestReconcile_ModelMarkedForDeletion(t *testing.T) { func TestReconcile_ResourceExists(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() - _ = v1alpha1.AddToScheme(scheme) + _ = v1alpha2.AddToScheme(scheme) // Create an InferenceModel object. - existingModel := &v1alpha1.InferenceModel{ + existingModel := &v1alpha2.InferenceModel{ ObjectMeta: metav1.ObjectMeta{ Name: "existing-model", Namespace: "default", }, - Spec: v1alpha1.InferenceModelSpec{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "fake-model", - PoolRef: v1alpha1.PoolObjectReference{Name: "test-pool"}, + PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, }, } @@ -276,7 +276,7 @@ func TestReconcile_ResourceExists(t *testing.T) { fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() // Create a minimal datastore. 
- datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha1.InferencePool{ + datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, }) @@ -309,7 +309,7 @@ func TestReconcile_ResourceExists(t *testing.T) { } } -func populateServiceMap(services ...*v1alpha1.InferenceModel) *sync.Map { +func populateServiceMap(services ...*v1alpha2.InferenceModel) *sync.Map { returnVal := &sync.Map{} for _, service := range services { diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index f2c56991..baf3332b 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -27,7 +27,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -52,7 +52,7 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques loggerDefault := logger.V(logutil.DEFAULT) loggerDefault.Info("Reconciling InferencePool", "name", req.NamespacedName) - serverPool := &v1alpha1.InferencePool{} + serverPool := &v1alpha2.InferencePool{} if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { if errors.IsNotFound(err) { @@ -73,7 +73,7 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques return ctrl.Result{}, nil } -func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha1.InferencePool) { +func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha2.InferencePool) { logger := log.FromContext(ctx) oldPool, err := c.Datastore.PoolGet() c.Datastore.PoolSet(newPool) @@ -91,6 
+91,6 @@ func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool * func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&v1alpha1.InferencePool{}). + For(&v1alpha2.InferencePool{}). Complete(c) } diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 6263fa16..a96406f0 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -30,7 +30,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) @@ -38,17 +38,17 @@ import ( var ( selector_v1 = map[string]string{"app": "vllm_v1"} selector_v2 = map[string]string{"app": "vllm_v2"} - pool1 = &v1alpha1.InferencePool{ + pool1 = &v1alpha2.InferencePool{ ObjectMeta: metav1.ObjectMeta{ Name: "pool1", Namespace: "pool1-ns", }, - Spec: v1alpha1.InferencePoolSpec{ - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v1"}, + Spec: v1alpha2.InferencePoolSpec{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm_v1"}, TargetPortNumber: 8080, }, } - pool2 = &v1alpha1.InferencePool{ + pool2 = &v1alpha2.InferencePool{ ObjectMeta: metav1.ObjectMeta{ Name: "pool2", Namespace: "pool2-ns", @@ -74,7 +74,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() _ = clientgoscheme.AddToScheme(scheme) - _ = v1alpha1.AddToScheme(scheme) + _ = v1alpha2.AddToScheme(scheme) // Create a fake client with the pool and the pods. 
initialObjects := []client.Object{pool1, pool2} @@ -111,11 +111,11 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { } // Step 3: update the pool selector to include more pods - newPool1 := &v1alpha1.InferencePool{} + newPool1 := &v1alpha2.InferencePool{} if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) } - newPool1.Spec.Selector = map[v1alpha1.LabelKey]v1alpha1.LabelValue{"app": "vllm_v2"} + newPool1.Spec.Selector = map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm_v2"} if err := fakeClient.Update(ctx, newPool1, &client.UpdateOptions{}); err != nil { t.Errorf("Unexpected pool update error: %v", err) } @@ -157,7 +157,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { } } -func diffPool(datastore datastore.Datastore, wantPool *v1alpha1.InferencePool, wantPods []string) string { +func diffPool(datastore datastore.Datastore, wantPool *v1alpha2.InferencePool, wantPods []string) string { gotPool, _ := datastore.PoolGet() if diff := cmp.Diff(wantPool, gotPool); diff != "" { return diff diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index b3869113..8a39dbab 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -31,7 +31,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" ) @@ -53,10 +53,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }{ { name: "Add new pod", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, 
&v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -82,10 +82,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "Update pod1 address", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -111,10 +111,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "Delete pod with DeletionTimestamp", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -141,10 +141,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "Delete notfound pod", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -154,10 +154,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) 
{ }, { name: "New pod, not ready, valid selector", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -182,10 +182,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "Remove pod that does not match selector", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, @@ -210,10 +210,10 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "Remove pod that is not ready", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha1.InferencePool{ - Spec: v1alpha1.InferencePoolSpec{ + datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), - Selector: map[v1alpha1.LabelKey]v1alpha1.LabelValue{ + Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index eecea59c..c5bbddcf 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -28,21 +28,21 @@ import ( "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { // InferencePool operations - PoolSet(pool *v1alpha1.InferencePool) - PoolGet() (*v1alpha1.InferencePool, error) + PoolSet(pool *v1alpha2.InferencePool) + PoolGet() (*v1alpha2.InferencePool, error) PoolHasSynced() bool PoolLabelsMatch(podLabels map[string]string) bool // InferenceModel operations - ModelSet(infModel *v1alpha1.InferenceModel) - ModelGet(modelName string) (*v1alpha1.InferenceModel, bool) + ModelSet(infModel *v1alpha2.InferenceModel) + ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) ModelDelete(modelName string) // PodMetrics operations @@ -69,7 +69,7 @@ func NewDatastore() Datastore { } // Used for test only -func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha1.InferencePool) Datastore { +func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha2.InferencePool) Datastore { store := NewDatastore() if pods != nil { store.(*datastore).pods = pods @@ -86,7 +86,7 @@ func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha1.InferencePool) Data type datastore struct { // poolMu is used to synchronize access to the inferencePool. 
poolMu sync.RWMutex - pool *v1alpha1.InferencePool + pool *v1alpha2.InferencePool models *sync.Map // key: types.NamespacedName, value: *PodMetrics pods *sync.Map @@ -101,13 +101,13 @@ func (ds *datastore) Clear() { } // /// InferencePool APIs /// -func (ds *datastore) PoolSet(pool *v1alpha1.InferencePool) { +func (ds *datastore) PoolSet(pool *v1alpha2.InferencePool) { ds.poolMu.Lock() defer ds.poolMu.Unlock() ds.pool = pool } -func (ds *datastore) PoolGet() (*v1alpha1.InferencePool, error) { +func (ds *datastore) PoolGet() (*v1alpha2.InferencePool, error) { ds.poolMu.RLock() defer ds.poolMu.RUnlock() if !ds.PoolHasSynced() { @@ -129,14 +129,14 @@ func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool { } // /// InferenceModel APIs /// -func (ds *datastore) ModelSet(infModel *v1alpha1.InferenceModel) { +func (ds *datastore) ModelSet(infModel *v1alpha2.InferenceModel) { ds.models.Store(infModel.Spec.ModelName, infModel) } -func (ds *datastore) ModelGet(modelName string) (*v1alpha1.InferenceModel, bool) { +func (ds *datastore) ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) { infModel, ok := ds.models.Load(modelName) if ok { - return infModel.(*v1alpha1.InferenceModel), true + return infModel.(*v1alpha2.InferenceModel), true } return nil, false } @@ -243,11 +243,11 @@ func (ds *datastore) PodDeleteAll() { ds.pods.Clear() } -func selectorFromInferencePoolSelector(selector map[v1alpha1.LabelKey]v1alpha1.LabelValue) labels.Selector { +func selectorFromInferencePoolSelector(selector map[v1alpha2.LabelKey]v1alpha2.LabelValue) labels.Selector { return labels.SelectorFromSet(stripLabelKeyAliasFromLabelMap(selector)) } -func stripLabelKeyAliasFromLabelMap(labels map[v1alpha1.LabelKey]v1alpha1.LabelValue) map[string]string { +func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelValue) map[string]string { outMap := make(map[string]string) for k, v := range labels { outMap[string(k)] = string(v) @@ -255,7 +255,7 @@ func 
stripLabelKeyAliasFromLabelMap(labels map[v1alpha1.LabelKey]v1alpha1.LabelV return outMap } -func RandomWeightedDraw(logger logr.Logger, model *v1alpha1.InferenceModel, seed int64) string { +func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string { var weights int32 source := rand.NewSource(rand.Int63()) @@ -277,8 +277,8 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha1.InferenceModel, seed return "" } -func IsCritical(model *v1alpha1.InferenceModel) bool { - if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha1.Critical { +func IsCritical(model *v1alpha2.InferenceModel) bool { + if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical { return true } return false diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index bd5c5020..2af36541 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -20,19 +20,19 @@ import ( "testing" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) func TestHasSynced(t *testing.T) { tests := []struct { name string - inferencePool *v1alpha1.InferencePool + inferencePool *v1alpha2.InferencePool hasSynced bool }{ { name: "Ready when InferencePool exists in data store", - inferencePool: &v1alpha1.InferencePool{ + inferencePool: &v1alpha2.InferencePool{ ObjectMeta: v1.ObjectMeta{ Name: "test-pool", Namespace: "default", @@ -66,14 +66,14 @@ func TestRandomWeightedDraw(t *testing.T) { logger := logutil.NewTestLogger() tests := []struct { name string - model *v1alpha1.InferenceModel + model *v1alpha2.InferenceModel want string }{ { name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ + model: 
&v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ { Name: "canary", Weight: pointer(50), @@ -89,9 +89,9 @@ func TestRandomWeightedDraw(t *testing.T) { }, { name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ { Name: "canary", Weight: pointer(25), @@ -111,9 +111,9 @@ func TestRandomWeightedDraw(t *testing.T) { }, { name: "'random' distribution", - model: &v1alpha1.InferenceModel{ - Spec: v1alpha1.InferenceModelSpec{ - TargetModels: []v1alpha1.TargetModel{ + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ { Name: "canary", Weight: pointer(20), diff --git a/pkg/epp/test/benchmark/benchmark.go b/pkg/epp/test/benchmark/benchmark.go index 10987b47..67783480 100644 --- a/pkg/epp/test/benchmark/benchmark.go +++ b/pkg/epp/test/benchmark/benchmark.go @@ -31,7 +31,7 @@ import ( "google.golang.org/protobuf/proto" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" @@ -108,12 +108,12 @@ func generateRequestFunc(logger logr.Logger) func(mtd *desc.MethodDescriptor, ca } } -func fakeModels() map[string]*v1alpha1.InferenceModel { - models := map[string]*v1alpha1.InferenceModel{} +func fakeModels() map[string]*v1alpha2.InferenceModel { + models := map[string]*v1alpha2.InferenceModel{} for i := range *numFakePods { for j := range *numModelsPerPod { m := modelName(i*(*numModelsPerPod) + j) - models[m] = 
&v1alpha1.InferenceModel{Spec: v1alpha1.InferenceModelSpec{ModelName: m}} + models[m] = &v1alpha2.InferenceModel{Spec: v1alpha2.InferenceModelSpec{ModelName: m}} } } diff --git a/pkg/epp/test/utils.go b/pkg/epp/test/utils.go index c44d7147..6a75ed2f 100644 --- a/pkg/epp/test/utils.go +++ b/pkg/epp/test/utils.go @@ -29,7 +29,7 @@ import ( "google.golang.org/grpc/reflection" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" @@ -43,7 +43,7 @@ func StartExtProc( port int, refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, pods []*datastore.PodMetrics, - models map[string]*v1alpha1.InferenceModel, + models map[string]*v1alpha2.InferenceModel, ) *grpc.Server { logger := log.FromContext(ctx) pms := make(map[types.NamespacedName]*datastore.PodMetrics) diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index c4342775..14ee738f 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -38,7 +38,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" - infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + infextv1a2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) @@ -136,7 +136,7 @@ func setupSuite() { err = apiextv1.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1a1.AddToScheme(scheme) + err = infextv1a2.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) cli, err = client.New(cfg, 
client.Options{Scheme: scheme}) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index 087097a7..8cd73d32 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -26,7 +26,7 @@ import ( "github.com/onsi/gomega" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" - infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" testutils "sigs.k8s.io/gateway-api-inference-extension/test/utils" ) @@ -95,8 +95,8 @@ var _ = ginkgo.Describe("InferencePool", func() { }) // newInferenceModel creates an InferenceModel in the given namespace for testutils. -func newInferenceModel(ns string) *infextv1a1.InferenceModel { - targets := []infextv1a1.TargetModel{ +func newInferenceModel(ns string) *v1alpha2.InferenceModel { + targets := []v1alpha2.TargetModel{ { Name: modelName + "-0", Weight: ptr.To(int32(50)), @@ -107,7 +107,7 @@ func newInferenceModel(ns string) *infextv1a1.InferenceModel { }, } return testutils.MakeModelWrapper("inferencemodel-sample", ns). - SetCriticality(infextv1a1.Critical). + SetCriticality(v1alpha2.Critical). SetModelName(modelName). SetPoolRef(modelServerName). SetTargetModels(targets). 
diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 91bc71c6..85c49913 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -46,7 +46,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" @@ -407,7 +407,7 @@ func BeforeSuit(t *testing.T) func() { } utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha1.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme}) if err != nil { @@ -450,7 +450,7 @@ func BeforeSuit(t *testing.T) func() { } for _, doc := range docs { - inferenceModel := &v1alpha1.InferenceModel{} + inferenceModel := &v1alpha2.InferenceModel{} if err = yaml.Unmarshal(doc, inferenceModel); err != nil { logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } @@ -462,7 +462,7 @@ func BeforeSuit(t *testing.T) func() { } } for _, doc := range docs { - inferencePool := &v1alpha1.InferencePool{} + inferencePool := &v1alpha2.InferencePool{} if err = yaml.Unmarshal(doc, inferencePool); err != nil { logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } diff --git a/test/utils/utils.go b/test/utils/utils.go index 777eadd8..1ec0fbaa 100644 --- a/test/utils/utils.go +++ b/test/utils/utils.go @@ -36,7 +36,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/remotecommand" "sigs.k8s.io/controller-runtime/pkg/client" - infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + 
"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // DeleteClusterResources deletes all cluster-scoped objects the tests typically create. @@ -106,11 +106,11 @@ func DeleteNamespacedResources(ctx context.Context, cli client.Client, ns string if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &infextv1a1.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = cli.DeleteAllOf(ctx, &v1alpha2.InferencePool{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } - err = cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err = cli.DeleteAllOf(ctx, &v1alpha2.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } @@ -132,7 +132,7 @@ func DeleteInferenceModelResources(ctx context.Context, cli client.Client, ns st if ns == "" { return nil } - err := cli.DeleteAllOf(ctx, &infextv1a1.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) + err := cli.DeleteAllOf(ctx, &v1alpha2.InferenceModel{}, client.InNamespace(ns), client.PropagationPolicy(metav1.DeletePropagationForeground)) if err != nil && !apierrors.IsNotFound(err) { return err } diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go index 668a5adc..3280cb11 100644 --- a/test/utils/wrappers.go +++ b/test/utils/wrappers.go @@ -18,25 +18,25 @@ package utils import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - infextv1a1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // InferenceModelWrapper wraps an InferenceModel. 
type InferenceModelWrapper struct { - infextv1a1.InferenceModel + v1alpha2.InferenceModel } // MakeModelWrapper creates a wrapper for an MakeModelWrapper. func MakeModelWrapper(name, ns string) *InferenceModelWrapper { return &InferenceModelWrapper{ - infextv1a1.InferenceModel{ + v1alpha2.InferenceModel{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: ns, }, - Spec: infextv1a1.InferenceModelSpec{ + Spec: v1alpha2.InferenceModelSpec{ ModelName: "", - PoolRef: infextv1a1.PoolObjectReference{}, + PoolRef: v1alpha2.PoolObjectReference{}, }, }, } @@ -49,7 +49,7 @@ func (m *InferenceModelWrapper) SetModelName(name string) *InferenceModelWrapper } // SetCriticality sets the value of the inferenceModel.spec.criticality. -func (m *InferenceModelWrapper) SetCriticality(level infextv1a1.Criticality) *InferenceModelWrapper { +func (m *InferenceModelWrapper) SetCriticality(level v1alpha2.Criticality) *InferenceModelWrapper { m.Spec.Criticality = &level return m } @@ -57,8 +57,8 @@ func (m *InferenceModelWrapper) SetCriticality(level infextv1a1.Criticality) *In // SetPoolRef sets the value of the inferenceModel.spec.poolRef using defaults // for group/kind and name as the PoolObjectReference name. func (m *InferenceModelWrapper) SetPoolRef(name string) *InferenceModelWrapper { - ref := infextv1a1.PoolObjectReference{ - Group: infextv1a1.GroupVersion.Group, + ref := v1alpha2.PoolObjectReference{ + Group: v1alpha2.GroupVersion.Group, Kind: "inferencepools", Name: name, } @@ -67,12 +67,12 @@ func (m *InferenceModelWrapper) SetPoolRef(name string) *InferenceModelWrapper { } // SetTargetModels sets the value of the inferenceModel.spec.targetModels. -func (m *InferenceModelWrapper) SetTargetModels(models []infextv1a1.TargetModel) *InferenceModelWrapper { +func (m *InferenceModelWrapper) SetTargetModels(models []v1alpha2.TargetModel) *InferenceModelWrapper { m.Spec.TargetModels = models return m } // Obj returns the inner InferenceModel. 
-func (m *InferenceModelWrapper) Obj() *infextv1a1.InferenceModel { +func (m *InferenceModelWrapper) Obj() *v1alpha2.InferenceModel { return &m.InferenceModel } From c25f0c98609362e73ae0f0f6bfdfbb58a5390468 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:00:28 -0700 Subject: [PATCH 49/96] Adding a slim roadmap to README (#400) --- README.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e6730ae4..c500602c 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,16 @@ See our website at https://gateway-api-inference-extension.sigs.k8s.io/ for deta ## Roadmap -Coming soon! +As Inference Gateway builds towards a GA release. We will continue to expand our capabilities, namely: +1. Prefix-cache aware load balancing with interfaces for remote caches +1. Recommended LoRA adapter pipeline for automated rollout +1. Fairness and priority between workloads within the same criticality band +1. HPA support for autoscaling on aggregate metrics derived from the load balancer +1. Support for large multi-modal inputs and outputs +1. Support for other GenAI model types (diffusion and other non-completion protocols) +1. Heterogeneous accelerators - serve workloads on multiple types of accelerator using latency and request cost-aware load balancing +1. Disaggregated serving support with independently scaling pools + ## End-to-End Tests From 45f9898ea02a26a4f36c5e411f04e9fc1bcf8e5b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:40:28 -0800 Subject: [PATCH 50/96] Bump github.com/prometheus/client_golang from 1.20.5 to 1.21.0 (#402) Bumps [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) from 1.20.5 to 1.21.0. 
- [Release notes](https://github.com/prometheus/client_golang/releases) - [Changelog](https://github.com/prometheus/client_golang/blob/main/CHANGELOG.md) - [Commits](https://github.com/prometheus/client_golang/compare/v1.20.5...v1.21.0) --- updated-dependencies: - dependency-name: github.com/prometheus/client_golang dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index ca4a1633..09af73d8 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.2 github.com/onsi/gomega v1.36.2 - github.com/prometheus/client_golang v1.20.5 + github.com/prometheus/client_golang v1.21.0 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.62.0 github.com/stretchr/testify v1.10.0 @@ -85,7 +85,7 @@ require ( github.com/jinzhu/configor v1.2.1 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.9 // indirect + github.com/klauspost/compress v1.17.11 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect diff --git a/go.sum b/go.sum index 2d54aba2..8bb93777 100644 --- a/go.sum +++ b/go.sum @@ -138,8 +138,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA= 
-github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= +github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -190,8 +190,8 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= +github.com/prometheus/client_golang v1.21.0 h1:DIsaGmiaBkSangBgMtWdNfxbMNdku5IK6iNhrEqWvdA= +github.com/prometheus/client_golang v1.21.0/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= From 40d024baabf3e4e174dc2c9884511bd27f50606e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 14:52:29 -0800 Subject: [PATCH 51/96] Bump github.com/google/go-cmp from 0.6.0 to 0.7.0 (#403) Bumps [github.com/google/go-cmp](https://github.com/google/go-cmp) from 0.6.0 to 0.7.0. 
- [Release notes](https://github.com/google/go-cmp/releases) - [Commits](https://github.com/google/go-cmp/compare/v0.6.0...v0.7.0) --- updated-dependencies: - dependency-name: github.com/google/go-cmp dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 09af73d8..91173449 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/elastic/crd-ref-docs v0.1.0 github.com/envoyproxy/go-control-plane/envoy v1.32.4 github.com/go-logr/logr v1.4.2 - github.com/google/go-cmp v0.6.0 + github.com/google/go-cmp v0.7.0 github.com/jhump/protoreflect v1.17.0 github.com/onsi/ginkgo/v2 v2.22.2 github.com/onsi/gomega v1.36.2 diff --git a/go.sum b/go.sum index 8bb93777..f55f404b 100644 --- a/go.sum +++ b/go.sum @@ -108,8 +108,8 @@ github.com/google/cel-go v0.22.0/go.mod h1:BuznPXXfQDpXKWQ9sPW3TzlAJN5zzFe+i9tIs github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= From 2a88b3bbb1344bf0558ea6915f9441becd1c12e1 Mon Sep 17 
00:00:00 2001 From: Nir Rozenbaum Date: Tue, 25 Feb 2025 18:44:31 +0200 Subject: [PATCH 52/96] updated logging in inferencepool reconciler (#399) * updated logging + predicate in inferencepool reconciler Signed-off-by: Nir Rozenbaum * removed irrelevant unit test. after adding predicate to the controller registration, the reconcile function no longer contains the filtering logic. therefore, it's not relevant to test the reconcile function with pool2. at runtime, this event will be filtered out much earlier Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- .../controller/inferencepool_reconciler.go | 27 ++++++++++--------- .../inferencepool_reconciler_test.go | 14 +++------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index baf3332b..2ad7d2bb 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -27,6 +27,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -44,31 +45,28 @@ type InferencePoolReconciler struct { } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.NamespacedName.Name != c.PoolNamespacedName.Name || req.NamespacedName.Namespace != c.PoolNamespacedName.Namespace { - return ctrl.Result{}, nil - } + logger := log.FromContext(ctx).WithValues("inferencePool", req.NamespacedName).V(logutil.DEFAULT) + ctx = ctrl.LoggerInto(ctx, logger) - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) - loggerDefault.Info("Reconciling InferencePool", "name", 
req.NamespacedName) + logger.Info("Reconciling InferencePool") - serverPool := &v1alpha2.InferencePool{} + infPool := &v1alpha2.InferencePool{} - if err := c.Get(ctx, req.NamespacedName, serverPool); err != nil { + if err := c.Get(ctx, req.NamespacedName, infPool); err != nil { if errors.IsNotFound(err) { - loggerDefault.Info("InferencePool not found. Clearing the datastore", "name", req.NamespacedName) + logger.Info("InferencePool not found. Clearing the datastore") c.Datastore.Clear() return ctrl.Result{}, nil } - loggerDefault.Error(err, "Unable to get InferencePool", "name", req.NamespacedName) + logger.Error(err, "Unable to get InferencePool") return ctrl.Result{}, err - } else if !serverPool.DeletionTimestamp.IsZero() { - loggerDefault.Info("InferencePool is marked for deletion. Clearing the datastore", "name", req.NamespacedName) + } else if !infPool.DeletionTimestamp.IsZero() { + logger.Info("InferencePool is marked for deletion. Clearing the datastore") c.Datastore.Clear() return ctrl.Result{}, nil } - c.updateDatastore(ctx, serverPool) + c.updateDatastore(ctx, infPool) return ctrl.Result{}, nil } @@ -92,5 +90,8 @@ func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool * func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferencePool{}). + WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool { + return (object.GetNamespace() == c.PoolNamespacedName.Namespace) && (object.GetName() == c.PoolNamespacedName.Name) + })). 
Complete(c) } diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index a96406f0..26b81d9a 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -102,15 +102,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { t.Errorf("Unexpected diff (+got/-want): %s", diff) } - // Step 2: A reconcile on pool2 should not change anything. - if _, err := inferencePoolReconciler.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Name: pool2.Name, Namespace: pool2.Namespace}}); err != nil { - t.Errorf("Unexpected InferencePool reconcile error: %v", err) - } - if diff := diffPool(datastore, pool1, []string{"pod1", "pod2"}); diff != "" { - t.Errorf("Unexpected diff (+got/-want): %s", diff) - } - - // Step 3: update the pool selector to include more pods + // Step 2: update the pool selector to include more pods newPool1 := &v1alpha2.InferencePool{} if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) @@ -127,7 +119,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { t.Errorf("Unexpected diff (+got/-want): %s", diff) } - // Step 4: update the pool port + // Step 3: update the pool port if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) } @@ -142,7 +134,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { t.Errorf("Unexpected diff (+got/-want): %s", diff) } - // Step 5: delete the pool to trigger a datastore clear + // Step 4: delete the pool to trigger a datastore clear if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) } From 2ad70e34a9e973dd011466f2d5b82d80465825b9 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 25 Feb 2025 18:58:31 +0200 Subject: [PATCH 53/96] updated inferencemodel 
predicate (#397) Signed-off-by: Nir Rozenbaum --- cmd/epp/main.go | 4 ++-- .../controller/inferencemodel_reconciler.go | 18 ++++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index dd47fa27..5d399a42 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -166,7 +166,7 @@ func run() error { Provider: provider, } if err := serverRunner.SetupWithManager(mgr); err != nil { - setupLog.Error(err, "Failed to setup ext-proc server") + setupLog.Error(err, "Failed to setup ext-proc controllers") return err } @@ -177,7 +177,7 @@ func run() error { // Register ext-proc server. if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { - setupLog.Error(err, "Failed to register ext-proc server") + setupLog.Error(err, "Failed to register ext-proc gRPC server") return err } diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 00358740..9de77989 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -26,7 +26,9 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -41,10 +43,6 @@ type InferenceModelReconciler struct { } func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.Namespace != c.PoolNamespacedName.Namespace { - return ctrl.Result{}, nil - } - logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) loggerDefault.Info("Reconciling InferenceModel", "name", req.NamespacedName) @@ 
-85,5 +83,17 @@ func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferenceModel{}). + WithEventFilter(predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + UpdateFunc: func(e event.UpdateEvent) bool { + return c.eventPredicate(e.ObjectOld.(*v1alpha2.InferenceModel)) || c.eventPredicate(e.ObjectNew.(*v1alpha2.InferenceModel)) + }, + DeleteFunc: func(e event.DeleteEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + GenericFunc: func(e event.GenericEvent) bool { return c.eventPredicate(e.Object.(*v1alpha2.InferenceModel)) }, + }). Complete(c) } + +func (c *InferenceModelReconciler) eventPredicate(infModel *v1alpha2.InferenceModel) bool { + return (infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name) && (infModel.GetNamespace() == c.PoolNamespacedName.Namespace) +} From 9bee374d76f7c126aad5cda89349815e4d1764f8 Mon Sep 17 00:00:00 2001 From: kfswain <137822113+kfswain@users.noreply.github.com> Date: Wed, 26 Feb 2025 10:36:30 -0700 Subject: [PATCH 54/96] Syncing readme all to main (#410) --- site-src/guides/index.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 4478128f..e0593f3b 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -23,7 +23,8 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Install the Inference Extension CRDs ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v0.1.0/manifests.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml + kubectl apply -f 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml ``` ### Deploy InferenceModel From 7f804ae0bcac8ab8ec580836e29327a9a23ade08 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 26 Feb 2025 20:38:30 +0200 Subject: [PATCH 55/96] fixed the filepath (#412) Signed-off-by: Nir Rozenbaum --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index e0593f3b..2949d387 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -7,7 +7,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv - A cluster with: - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - - 3 GPUs to run the sample model server. Adjust the number of replicas in `./manifests/vllm/deployment.yaml` as needed. + - 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed. ## **Steps** From 7ed54a4c2b9db9d0c7c95bfaa99c466080e5bb24 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Wed, 26 Feb 2025 19:08:30 +0000 Subject: [PATCH 56/96] Fix InferenceModel deletion logic (#393) * Currently the logic tracks the models by Spec.ModelName, since this is not guaranteed to be unique within the cluster, we could run into two issues: 1) If the model name changes on the same InferenceModel object, we don't delete the original model entry in the datastore. 2) We don't enforce the semantics that the modelName with the oldest creation timestamp is retained. 
While the API is assuming that this is enforced by another controller via the Ready condition, we don't have this controller yet, and so currently the behavior is unpredictable depending on the order of InferenceModel events. To address the above, the PR makes changes to both the InferenceModel reconciler and the Model APIs in the datastore to ensure thread-safe updates of the entries. In the store, the sync.Map was replaced with two maps to track the InferenceModel entries by both ModelName and InferenceModel object NamespacedName. This is needed to properly handle deletions when the object doesn't exist anymore (could be handled in other ways, but this seemed like a reasonable approach). The PR increases the datastore pkg unit test coverage of the Pool and Model APIs. We still need to follow up with adding unit test coverage to the pods APIs, which is currently non-existent. * Convert unit test to a table * remove the dual map for the models store, and rely on linear search when looking up the model by object name * Added ModelResync to handle a race condition * Update pkg/epp/controller/inferencemodel_reconciler.go --- cmd/epp/main.go | 6 +- pkg/epp/backend/provider_test.go | 50 +- .../controller/inferencemodel_reconciler.go | 86 +++- .../inferencemodel_reconciler_test.go | 439 +++++++----------- .../inferencepool_reconciler_test.go | 93 ++-- pkg/epp/controller/pod_reconciler.go | 2 +- pkg/epp/controller/pod_reconciler_test.go | 148 ++---- pkg/epp/datastore/datastore.go | 145 ++++-- pkg/epp/datastore/datastore_test.go | 195 +++++++- pkg/epp/server/runserver.go | 4 +- pkg/epp/test/utils.go | 9 +- pkg/epp/util/testing/diff.go | 27 ++ pkg/epp/util/testing/wrappers.go | 117 ++++- test/e2e/e2e_suite_test.go | 5 - test/integration/hermetic_test.go | 9 +- 15 files changed, 789 insertions(+), 546 deletions(-) create mode 100644 pkg/epp/util/testing/diff.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 5d399a42..b66024ec 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ 
-149,6 +149,8 @@ func run() error { return err } + ctx := ctrl.SetupSignalHandler() + // Setup runner. datastore := datastore.NewDatastore() provider := backend.NewProvider(&vllm.PodMetricsClientImpl{}, datastore) @@ -165,7 +167,7 @@ func run() error { CertPath: *certPath, Provider: provider, } - if err := serverRunner.SetupWithManager(mgr); err != nil { + if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc controllers") return err } @@ -188,7 +190,7 @@ func run() error { // Start the manager. This blocks until a signal is received. setupLog.Info("Controller manager starting") - if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + if err := mgr.Start(ctx); err != nil { setupLog.Error(err, "Error starting controller manager") return err } diff --git a/pkg/epp/backend/provider_test.go b/pkg/epp/backend/provider_test.go index 1e11afe2..f2db09fe 100644 --- a/pkg/epp/backend/provider_test.go +++ b/pkg/epp/backend/provider_test.go @@ -19,7 +19,6 @@ package backend import ( "context" "errors" - "sync" "testing" "time" @@ -37,6 +36,9 @@ var ( Name: "pod1", }, }, + } + pod1WithMetrics = &datastore.PodMetrics{ + Pod: pod1.Pod, Metrics: datastore.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -53,6 +55,9 @@ var ( Name: "pod2", }, }, + } + pod2WithMetrics = &datastore.PodMetrics{ + Pod: pod2.Pod, Metrics: datastore.Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.2, @@ -69,35 +74,30 @@ func TestProvider(t *testing.T) { tests := []struct { name string pmc PodMetricsClient - datastore datastore.Datastore + storePods []*datastore.PodMetrics want []*datastore.PodMetrics }{ { name: "Probing metrics success", pmc: &FakePodMetricsClient{ Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1, - pod2.NamespacedName: pod2, + pod1.NamespacedName: pod1WithMetrics, + pod2.NamespacedName: pod2WithMetrics, }, }, - datastore: datastore.NewFakeDatastore(populateMap(pod1, pod2), nil, 
nil), - want: []*datastore.PodMetrics{ - pod1, - pod2, - }, + storePods: []*datastore.PodMetrics{pod1, pod2}, + want: []*datastore.PodMetrics{pod1WithMetrics, pod2WithMetrics}, }, { name: "Only pods in the datastore are probed", pmc: &FakePodMetricsClient{ Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1, - pod2.NamespacedName: pod2, + pod1.NamespacedName: pod1WithMetrics, + pod2.NamespacedName: pod2WithMetrics, }, }, - datastore: datastore.NewFakeDatastore(populateMap(pod1), nil, nil), - want: []*datastore.PodMetrics{ - pod1, - }, + storePods: []*datastore.PodMetrics{pod1}, + want: []*datastore.PodMetrics{pod1WithMetrics}, }, { name: "Probing metrics error", @@ -106,13 +106,12 @@ func TestProvider(t *testing.T) { pod2.NamespacedName: errors.New("injected error"), }, Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1, + pod1.NamespacedName: pod1WithMetrics, }, }, - datastore: datastore.NewFakeDatastore(populateMap(pod1, pod2), nil, nil), - + storePods: []*datastore.PodMetrics{pod1, pod2}, want: []*datastore.PodMetrics{ - pod1, + pod1WithMetrics, // Failed to fetch pod2 metrics so it remains the default values. 
{ Pod: datastore.Pod{NamespacedName: pod2.NamespacedName}, @@ -128,12 +127,13 @@ func TestProvider(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - p := NewProvider(test.pmc, test.datastore) + ds := datastore.NewFakeDatastore(test.storePods, nil, nil) + p := NewProvider(test.pmc, ds) ctx, cancel := context.WithCancel(context.Background()) defer cancel() _ = p.Init(ctx, time.Millisecond, time.Millisecond) assert.EventuallyWithT(t, func(t *assert.CollectT) { - metrics := test.datastore.PodGetAll() + metrics := ds.PodGetAll() diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(func(a, b *datastore.PodMetrics) bool { return a.String() < b.String() })) @@ -142,11 +142,3 @@ func TestProvider(t *testing.T) { }) } } - -func populateMap(pods ...*datastore.PodMetrics) *sync.Map { - newMap := &sync.Map{} - for _, pod := range pods { - newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address}}) - } - return newMap -} diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 9de77989..7cf18808 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -18,8 +18,8 @@ package controller import ( "context" + "fmt" - "github.com/go-logr/logr" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -43,44 +43,80 @@ type InferenceModelReconciler struct { } func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) - loggerDefault.Info("Reconciling InferenceModel", "name", req.NamespacedName) + if req.Namespace != c.PoolNamespacedName.Namespace { + return ctrl.Result{}, nil + } + logger := log.FromContext(ctx).V(logutil.DEFAULT).WithValues("inferenceModel", req.Name) + ctx = 
ctrl.LoggerInto(ctx, logger) + + logger.Info("Reconciling InferenceModel") infModel := &v1alpha2.InferenceModel{} + notFound := false if err := c.Get(ctx, req.NamespacedName, infModel); err != nil { - if errors.IsNotFound(err) { - loggerDefault.Info("InferenceModel not found. Removing from datastore since object must be deleted", "name", req.NamespacedName) - c.Datastore.ModelDelete(infModel.Spec.ModelName) - return ctrl.Result{}, nil + if !errors.IsNotFound(err) { + logger.Error(err, "Unable to get InferenceModel") + return ctrl.Result{}, err } - loggerDefault.Error(err, "Unable to get InferenceModel", "name", req.NamespacedName) + notFound = true + } + + if notFound || !infModel.DeletionTimestamp.IsZero() || infModel.Spec.PoolRef.Name != c.PoolNamespacedName.Name { + // InferenceModel object got deleted or changed the referenced pool. + err := c.handleModelDeleted(ctx, req.NamespacedName) return ctrl.Result{}, err - } else if !infModel.DeletionTimestamp.IsZero() { - loggerDefault.Info("InferenceModel is marked for deletion. Removing from datastore", "name", req.NamespacedName) - c.Datastore.ModelDelete(infModel.Spec.ModelName) - return ctrl.Result{}, nil } - c.updateDatastore(logger, infModel) + // Add or update if the InferenceModel instance has a creation timestamp older than the existing entry of the model. 
+ logger = logger.WithValues("poolRef", infModel.Spec.PoolRef).WithValues("modelName", infModel.Spec.ModelName) + if !c.Datastore.ModelSetIfOlder(infModel) { + logger.Info("Skipping InferenceModel, existing instance has older creation timestamp") + + } + logger.Info("Added/Updated InferenceModel") + return ctrl.Result{}, nil } -func (c *InferenceModelReconciler) updateDatastore(logger logr.Logger, infModel *v1alpha2.InferenceModel) { - loggerDefault := logger.V(logutil.DEFAULT) +func (c *InferenceModelReconciler) handleModelDeleted(ctx context.Context, req types.NamespacedName) error { + logger := log.FromContext(ctx) - if infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name { - loggerDefault.Info("Updating datastore", "poolRef", infModel.Spec.PoolRef, "serverPoolName", c.PoolNamespacedName) - loggerDefault.Info("Adding/Updating InferenceModel", "modelName", infModel.Spec.ModelName) - c.Datastore.ModelSet(infModel) - return + // We will lookup and delete the modelName associated with this object, and search for + // other instances referencing the same modelName if exist, and store the oldest in + // its place. This ensures that the InferenceModel with the oldest creation + // timestamp is active. + existing, exists := c.Datastore.ModelDelete(req) + if !exists { + // No entry exists in the first place, nothing to do. + return nil + } + logger.Info("InferenceModel removed from datastore", "poolRef", existing.Spec.PoolRef, "modelName", existing.Spec.ModelName) + + // TODO(#409): replace this backfill logic with one that is based on InferenceModel Ready conditions once those are set by an external controller. + updated, err := c.Datastore.ModelResync(ctx, c.Client, existing.Spec.ModelName) + if err != nil { + return err + } + if updated { + logger.Info("Model replaced.", "modelName", existing.Spec.ModelName) } - loggerDefault.Info("Removing/Not adding InferenceModel", "modelName", infModel.Spec.ModelName) - // If we get here. 
The model is not relevant to this pool, remove. - c.Datastore.ModelDelete(infModel.Spec.ModelName) + return nil } -func (c *InferenceModelReconciler) SetupWithManager(mgr ctrl.Manager) error { +func indexInferenceModelsByModelName(obj client.Object) []string { + m, ok := obj.(*v1alpha2.InferenceModel) + if !ok { + return nil + } + return []string{m.Spec.ModelName} +} + +func (c *InferenceModelReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { + // Create an index on ModelName for InferenceModel objects. + indexer := mgr.GetFieldIndexer() + if err := indexer.IndexField(ctx, &v1alpha2.InferenceModel{}, datastore.ModelNameIndexKey, indexInferenceModelsByModelName); err != nil { + return fmt.Errorf("setting index on ModelName for InferenceModel: %w", err) + } return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferenceModel{}). WithEventFilter(predicate.Funcs{ diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index cea7bf42..87323e80 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -18,302 +18,219 @@ package controller import ( "context" - "sync" "testing" + "github.com/google/go-cmp/cmp" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) var ( - infModel1 = &v1alpha2.InferenceModel{ - Spec: 
v1alpha2.InferenceModelSpec{ - ModelName: "fake model1", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, + pool = utiltest.MakeInferencePool("test-pool1").Namespace("ns1").ObjRef() + infModel1 = utiltest.MakeInferenceModel("model1"). + Namespace(pool.Namespace). + ModelName("fake model1"). + Criticality(v1alpha2.Standard). + CreationTimestamp(metav1.Unix(1000, 0)). + PoolName(pool.Name).ObjRef() + infModel1Pool2 = utiltest.MakeInferenceModel(infModel1.Name). + Namespace(infModel1.Namespace). + ModelName(infModel1.Spec.ModelName). + Criticality(*infModel1.Spec.Criticality). + CreationTimestamp(metav1.Unix(1001, 0)). + PoolName("test-pool2").ObjRef() + infModel1NS2 = utiltest.MakeInferenceModel(infModel1.Name). + Namespace("ns2"). + ModelName(infModel1.Spec.ModelName). + Criticality(*infModel1.Spec.Criticality). + CreationTimestamp(metav1.Unix(1002, 0)). + PoolName(pool.Name).ObjRef() + infModel1Critical = utiltest.MakeInferenceModel(infModel1.Name). + Namespace(infModel1.Namespace). + ModelName(infModel1.Spec.ModelName). + Criticality(v1alpha2.Critical). + CreationTimestamp(metav1.Unix(1003, 0)). + PoolName(pool.Name).ObjRef() + infModel1Deleted = utiltest.MakeInferenceModel(infModel1.Name). + Namespace(infModel1.Namespace). + ModelName(infModel1.Spec.ModelName). + CreationTimestamp(metav1.Unix(1004, 0)). + DeletionTimestamp(). + PoolName(pool.Name).ObjRef() + // Same ModelName, different object with newer creation timestamp + infModel1Newer = utiltest.MakeInferenceModel("model1-newer"). + Namespace(pool.Namespace). + ModelName("fake model1"). + Criticality(v1alpha2.Standard). + CreationTimestamp(metav1.Unix(1005, 0)). + PoolName(pool.Name).ObjRef() + // Same ModelName, different object with older creation timestamp + infModel1Older = utiltest.MakeInferenceModel("model1-older"). + Namespace(pool.Namespace). + ModelName("fake model1"). + Criticality(v1alpha2.Standard). + CreationTimestamp(metav1.Unix(999, 0)). 
+ PoolName(pool.Name).ObjRef() + + infModel2 = utiltest.MakeInferenceModel("model2"). + Namespace(pool.Namespace). + ModelName("fake model2"). + CreationTimestamp(metav1.Unix(1000, 0)). + PoolName(pool.Name).ObjRef() + infModel2NS2 = utiltest.MakeInferenceModel(infModel2.Name). + Namespace("ns2"). + ModelName(infModel2.Spec.ModelName). + CreationTimestamp(metav1.Unix(1000, 0)). + PoolName(pool.Name).ObjRef() +) + +func TestInferenceModelReconciler(t *testing.T) { + tests := []struct { + name string + modelsInStore []*v1alpha2.InferenceModel + modelsInAPIServer []*v1alpha2.InferenceModel + model *v1alpha2.InferenceModel + incomingReq *types.NamespacedName + wantModels []*v1alpha2.InferenceModel + wantResult ctrl.Result + }{ + { + name: "Empty store, add new model", + model: infModel1, + wantModels: []*v1alpha2.InferenceModel{infModel1}, }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service", + { + name: "Existing model changed pools", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Pool2, + wantModels: []*v1alpha2.InferenceModel{}, }, - } - infModel1Modified = &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - ModelName: "fake model1", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-poolio"}, + { + name: "Not found, delete existing model", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + incomingReq: &types.NamespacedName{Name: infModel1.Name, Namespace: infModel1.Namespace}, + wantModels: []*v1alpha2.InferenceModel{}, }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service", + { + name: "Deletion timestamp set, delete existing model", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Deleted, + wantModels: []*v1alpha2.InferenceModel{}, }, - } - infModel2 = &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - ModelName: "fake model", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, + { + name: "Model referencing a different pool, different pool name but same 
namespace", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1NS2, + wantModels: []*v1alpha2.InferenceModel{infModel1}, }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-service-2", + { + name: "Model referencing a different pool, same pool name but different namespace", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel2NS2, + wantModels: []*v1alpha2.InferenceModel{infModel1}, }, - } -) - -func TestUpdateDatastore_InferenceModelReconciler(t *testing.T) { - logger := logutil.NewTestLogger() - - tests := []struct { - name string - datastore datastore.Datastore - incomingService *v1alpha2.InferenceModel - wantInferenceModels *sync.Map - }{ { - name: "No Services registered; valid, new service incoming.", - datastore: datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ - Spec: v1alpha2.InferencePoolSpec{ - Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }), - - incomingService: infModel1, - wantInferenceModels: populateServiceMap(infModel1), + name: "Existing model changed pools, replaced with another", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Pool2, + modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer}, + wantModels: []*v1alpha2.InferenceModel{infModel1Newer}, + }, + { + name: "Not found, delete existing model, replaced with another", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + incomingReq: &types.NamespacedName{Name: infModel1.Name, Namespace: infModel1.Namespace}, + modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer}, + wantModels: []*v1alpha2.InferenceModel{infModel1Newer}, + }, + { + name: "Deletion timestamp set, delete existing model, replaced with another", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Deleted, + modelsInAPIServer: []*v1alpha2.InferenceModel{infModel1Newer}, + wantModels: 
[]*v1alpha2.InferenceModel{infModel1Newer}, }, { - name: "Removing existing service.", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ - Spec: v1alpha2.InferencePoolSpec{ - Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }), - incomingService: infModel1Modified, - wantInferenceModels: populateServiceMap(), + name: "Older instance of the model observed", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Older, + wantModels: []*v1alpha2.InferenceModel{infModel1Older}, }, { - name: "Unrelated service, do nothing.", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ - Spec: v1alpha2.InferencePoolSpec{ - Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }), - incomingService: &v1alpha2.InferenceModel{ - Spec: v1alpha2.InferenceModelSpec{ - ModelName: "fake model", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-poolio"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "unrelated-service", - }, - }, - wantInferenceModels: populateServiceMap(infModel1), + name: "Model changed criticality", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel1Critical, + wantModels: []*v1alpha2.InferenceModel{infModel1Critical}, }, { - name: "Add to existing", - datastore: datastore.NewFakeDatastore(nil, populateServiceMap(infModel1), &v1alpha2.InferencePool{ - Spec: v1alpha2.InferencePoolSpec{ - Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm"}, - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "test-pool", - ResourceVersion: "Old and boring", - }, - }), - incomingService: infModel2, - wantInferenceModels: populateServiceMap(infModel1, infModel2), + name: "Model not found, no matching 
existing model to delete", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + incomingReq: &types.NamespacedName{Name: "non-existent-model", Namespace: pool.Namespace}, + wantModels: []*v1alpha2.InferenceModel{infModel1}, + }, + { + name: "Add to existing", + modelsInStore: []*v1alpha2.InferenceModel{infModel1}, + model: infModel2, + wantModels: []*v1alpha2.InferenceModel{infModel1, infModel2}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - pool, err := test.datastore.PoolGet() - if err != nil { - t.Fatalf("failed to get pool: %v", err) + // Create a fake client with no InferenceModel objects. + scheme := runtime.NewScheme() + _ = v1alpha2.AddToScheme(scheme) + initObjs := []client.Object{} + if test.model != nil { + initObjs = append(initObjs, test.model) } - reconciler := &InferenceModelReconciler{ - Datastore: test.datastore, - PoolNamespacedName: types.NamespacedName{Name: pool.Name}, + for _, m := range test.modelsInAPIServer { + initObjs = append(initObjs, m) } - reconciler.updateDatastore(logger, test.incomingService) - - test.wantInferenceModels.Range(func(k, v any) bool { - _, exist := test.datastore.ModelGet(k.(string)) - if !exist { - t.Fatalf("failed to get model %s", k) - } - return true - }) - }) - } -} - -func TestReconcile_ResourceNotFound(t *testing.T) { - // Set up the scheme. - scheme := runtime.NewScheme() - _ = v1alpha2.AddToScheme(scheme) - - // Create a fake client with no InferenceModel objects. - fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() - - // Create a minimal datastore. - datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }) - - // Create the reconciler. 
- reconciler := &InferenceModelReconciler{ - Client: fakeClient, - Scheme: scheme, - Record: record.NewFakeRecorder(10), - Datastore: datastore, - PoolNamespacedName: types.NamespacedName{Name: "test-pool"}, - } - - // Create a request for a non-existent resource. - req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "non-existent-model", Namespace: "default"}} - - // Call Reconcile. - result, err := reconciler.Reconcile(context.Background(), req) - if err != nil { - t.Fatalf("expected no error when resource is not found, got %v", err) - } - - // Check that no requeue is requested. - if result.Requeue || result.RequeueAfter != 0 { - t.Errorf("expected no requeue, got %+v", result) - } -} - -func TestReconcile_ModelMarkedForDeletion(t *testing.T) { - // Set up the scheme. - scheme := runtime.NewScheme() - _ = v1alpha2.AddToScheme(scheme) - - // Create an InferenceModel object. - now := metav1.Now() - existingModel := &v1alpha2.InferenceModel{ - ObjectMeta: metav1.ObjectMeta{ - Name: "existing-model", - Namespace: "default", - DeletionTimestamp: &now, - Finalizers: []string{"finalizer"}, - }, - Spec: v1alpha2.InferenceModelSpec{ - ModelName: "fake-model", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, - }, - } - - // Create a fake client with the existing model. - fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() - - // Create a minimal datastore. - datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }) - - // Create the reconciler. - reconciler := &InferenceModelReconciler{ - Client: fakeClient, - Scheme: scheme, - Record: record.NewFakeRecorder(10), - Datastore: datastore, - PoolNamespacedName: types.NamespacedName{Name: "test-pool", Namespace: "default"}, - } - - // Create a request for the existing resource. 
- req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "existing-model", Namespace: "default"}} - - // Call Reconcile. - result, err := reconciler.Reconcile(context.Background(), req) - if err != nil { - t.Fatalf("expected no error when resource exists, got %v", err) - } - - // Check that no requeue is requested. - if result.Requeue || result.RequeueAfter != 0 { - t.Errorf("expected no requeue, got %+v", result) - } - - // Verify that the datastore was not updated. - if _, exist := datastore.ModelGet(existingModel.Spec.ModelName); exist { - t.Errorf("expected datastore to not contain model %q", existingModel.Spec.ModelName) - } -} - -func TestReconcile_ResourceExists(t *testing.T) { - // Set up the scheme. - scheme := runtime.NewScheme() - _ = v1alpha2.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(initObjs...). + WithIndex(&v1alpha2.InferenceModel{}, datastore.ModelNameIndexKey, indexInferenceModelsByModelName). + Build() - // Create an InferenceModel object. - existingModel := &v1alpha2.InferenceModel{ - ObjectMeta: metav1.ObjectMeta{ - Name: "existing-model", - Namespace: "default", - }, - Spec: v1alpha2.InferenceModelSpec{ - ModelName: "fake-model", - PoolRef: v1alpha2.PoolObjectReference{Name: "test-pool"}, - }, - } - - // Create a fake client with the existing model. - fakeClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingModel).Build() - - // Create a minimal datastore. - datastore := datastore.NewFakeDatastore(nil, nil, &v1alpha2.InferencePool{ - ObjectMeta: metav1.ObjectMeta{Name: "test-pool"}, - }) - - // Create the reconciler. - reconciler := &InferenceModelReconciler{ - Client: fakeClient, - Scheme: scheme, - Record: record.NewFakeRecorder(10), - Datastore: datastore, - PoolNamespacedName: types.NamespacedName{Name: "test-pool", Namespace: "default"}, - } - - // Create a request for the existing resource. 
- req := ctrl.Request{NamespacedName: types.NamespacedName{Name: "existing-model", Namespace: "default"}} + datastore := datastore.NewFakeDatastore(nil, test.modelsInStore, pool) + reconciler := &InferenceModelReconciler{ + Client: fakeClient, + Scheme: scheme, + Record: record.NewFakeRecorder(10), + Datastore: datastore, + PoolNamespacedName: types.NamespacedName{Name: pool.Name, Namespace: pool.Namespace}, + } + if test.incomingReq == nil { + test.incomingReq = &types.NamespacedName{Name: test.model.Name, Namespace: test.model.Namespace} + } - // Call Reconcile. - result, err := reconciler.Reconcile(context.Background(), req) - if err != nil { - t.Fatalf("expected no error when resource exists, got %v", err) - } + // Call Reconcile. + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{NamespacedName: *test.incomingReq}) + if err != nil { + t.Fatalf("expected no error when resource is not found, got %v", err) + } - // Check that no requeue is requested. - if result.Requeue || result.RequeueAfter != 0 { - t.Errorf("expected no requeue, got %+v", result) - } + if diff := cmp.Diff(result, test.wantResult); diff != "" { + t.Errorf("Unexpected result diff (+got/-want): %s", diff) + } - // Verify that the datastore was updated. 
- if _, exist := datastore.ModelGet(existingModel.Spec.ModelName); !exist { - t.Errorf("expected datastore to contain model %q", existingModel.Spec.ModelName) - } -} + if len(test.wantModels) != len(datastore.ModelGetAll()) { + t.Errorf("Unexpected; want: %d, got:%d", len(test.wantModels), len(datastore.ModelGetAll())) + } -func populateServiceMap(services ...*v1alpha2.InferenceModel) *sync.Map { - returnVal := &sync.Map{} + if diff := diffStore(datastore, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" { + t.Errorf("Unexpected diff (+got/-want): %s", diff) + } - for _, service := range services { - returnVal.Store(service.Spec.ModelName, service) + }) } - return returnVal } diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 26b81d9a..f35b8dc0 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -23,7 +23,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -32,42 +31,44 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) var ( selector_v1 = map[string]string{"app": "vllm_v1"} selector_v2 = map[string]string{"app": "vllm_v2"} - pool1 = &v1alpha2.InferencePool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pool1", - Namespace: "pool1-ns", - }, - Spec: v1alpha2.InferencePoolSpec{ - Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{"app": "vllm_v1"}, - TargetPortNumber: 8080, - }, - } - 
pool2 = &v1alpha2.InferencePool{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pool2", - Namespace: "pool2-ns", - }, - } - pods = []corev1.Pod{ + pool1 = utiltest.MakeInferencePool("pool1"). + Namespace("pool1-ns"). + Selector(selector_v1). + TargetPortNumber(8080).ObjRef() + pool2 = utiltest.MakeInferencePool("pool2").Namespace("pool2-ns").ObjRef() + pods = []*corev1.Pod{ // Two ready pods matching pool1 - utiltesting.MakePod("pod1", "pool1-ns").Labels(selector_v1).ReadyCondition().Obj(), - utiltesting.MakePod("pod2", "pool1-ns").Labels(selector_v1).ReadyCondition().Obj(), + utiltest.MakePod("pod1"). + Namespace("pool1-ns"). + Labels(selector_v1).ReadyCondition().ObjRef(), + utiltest.MakePod("pod2"). + Namespace("pool1-ns"). + Labels(selector_v1). + ReadyCondition().ObjRef(), // A not ready pod matching pool1 - utiltesting.MakePod("pod3", "pool1-ns").Labels(selector_v1).Obj(), + utiltest.MakePod("pod3"). + Namespace("pool1-ns"). + Labels(selector_v1).ObjRef(), // A pod not matching pool1 namespace - utiltesting.MakePod("pod4", "pool2-ns").Labels(selector_v1).ReadyCondition().Obj(), + utiltest.MakePod("pod4"). + Namespace("pool2-ns"). + Labels(selector_v1). + ReadyCondition().ObjRef(), // A ready pod matching pool1 with a new selector - utiltesting.MakePod("pod5", "pool1-ns").Labels(selector_v2).ReadyCondition().Obj(), + utiltest.MakePod("pod5"). + Namespace("pool1-ns"). + Labels(selector_v2). + ReadyCondition().ObjRef(), } ) -func TestReconcile_InferencePoolReconciler(t *testing.T) { +func TestInferencePoolReconciler(t *testing.T) { // The best practice is to use table-driven tests, however in this scaenario it seems // more logical to do a single test with steps that depend on each other. @@ -79,7 +80,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { // Create a fake client with the pool and the pods. 
initialObjects := []client.Object{pool1, pool2} for i := range pods { - initialObjects = append(initialObjects, &pods[i]) + initialObjects = append(initialObjects, pods[i]) } fakeClient := fake.NewClientBuilder(). WithScheme(scheme). @@ -98,11 +99,10 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffPool(datastore, pool1, []string{"pod1", "pod2"}); diff != "" { + if diff := diffStore(datastore, diffStoreParams{wantPool: pool1, wantPods: []string{"pod1", "pod2"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } - // Step 2: update the pool selector to include more pods newPool1 := &v1alpha2.InferencePool{} if err := fakeClient.Get(ctx, req.NamespacedName, newPool1); err != nil { t.Errorf("Unexpected pool get error: %v", err) @@ -115,7 +115,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffPool(datastore, newPool1, []string{"pod5"}); diff != "" { + if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -130,7 +130,7 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - if diff := diffPool(datastore, newPool1, []string{"pod5"}); diff != "" { + if diff := diffStore(datastore, diffStoreParams{wantPool: newPool1, wantPods: []string{"pod5"}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } @@ -144,19 +144,42 @@ func TestReconcile_InferencePoolReconciler(t *testing.T) { if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { t.Errorf("Unexpected 
InferencePool reconcile error: %v", err) } - if diff := diffPool(datastore, nil, []string{}); diff != "" { + if diff := diffStore(datastore, diffStoreParams{wantPods: []string{}}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } } -func diffPool(datastore datastore.Datastore, wantPool *v1alpha2.InferencePool, wantPods []string) string { +type diffStoreParams struct { + wantPool *v1alpha2.InferencePool + wantPods []string + wantModels []*v1alpha2.InferenceModel +} + +func diffStore(datastore datastore.Datastore, params diffStoreParams) string { gotPool, _ := datastore.PoolGet() - if diff := cmp.Diff(wantPool, gotPool); diff != "" { - return diff + if diff := cmp.Diff(params.wantPool, gotPool); diff != "" { + return "pool:" + diff + } + + // Default wantPods if not set because PodGetAll returns an empty slice when empty. + if params.wantPods == nil { + params.wantPods = []string{} } gotPods := []string{} for _, pm := range datastore.PodGetAll() { gotPods = append(gotPods, pm.NamespacedName.Name) } - return cmp.Diff(wantPods, gotPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })) + if diff := cmp.Diff(params.wantPods, gotPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })); diff != "" { + return "pods:" + diff + } + + // Default wantModels if not set because ModelGetAll returns an empty slice when empty. 
+ if params.wantModels == nil { + params.wantModels = []*v1alpha2.InferenceModel{} + } + gotModels := datastore.ModelGetAll() + if diff := utiltest.DiffModelLists(params.wantModels, gotModels); diff != "" { + return "models:" + diff + } + return "" } diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 5b0c25c9..717d9f60 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -75,7 +75,7 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { - logger.V(logutil.DEFAULT).Info("Pod removed or not added", "name", namespacedName) + logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) c.Datastore.PodDelete(namespacedName) } else { if c.Datastore.PodUpdateOrAddIfNotExist(pod) { diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index 8a39dbab..57576213 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -18,13 +18,11 @@ package controller import ( "context" - "sync" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -33,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) var ( @@ -42,8 +41,7 @@ var ( basePod11 = &datastore.PodMetrics{Pod: 
datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11", ScrapePath: "/metrics", ScrapePort: 8000}} ) -func TestUpdateDatastore_PodReconciler(t *testing.T) { - now := metav1.Now() +func TestPodReconciler(t *testing.T) { tests := []struct { name string datastore datastore.Datastore @@ -53,7 +51,7 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }{ { name: "Add new pod", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -61,28 +59,15 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: basePod3.NamespacedName.Name, - Labels: map[string]string{ - "some-key": "some-val", - }, - }, - Status: corev1.PodStatus{ - PodIP: basePod3.Address, - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, - }, - }, - }, + incomingPod: utiltest.MakePod(basePod3.NamespacedName.Name). + Labels(map[string]string{"some-key": "some-val"}). + IP(basePod3.Address). 
+ ReadyCondition().ObjRef(), wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod, basePod3.Pod}, }, { name: "Update pod1 address", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -90,28 +75,15 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: basePod11.NamespacedName.Name, - Labels: map[string]string{ - "some-key": "some-val", - }, - }, - Status: corev1.PodStatus{ - PodIP: basePod11.Address, - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, - }, - }, - }, + incomingPod: utiltest.MakePod(basePod11.NamespacedName.Name). + Labels(map[string]string{"some-key": "some-val"}). + IP(basePod11.Address). + ReadyCondition().ObjRef(), wantPods: []datastore.Pod{basePod11.Pod, basePod2.Pod}, }, { name: "Delete pod with DeletionTimestamp", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -119,29 +91,15 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Labels: map[string]string{ - "some-key": "some-val", - }, - DeletionTimestamp: &now, - Finalizers: []string{"finalizer"}, - }, - Status: corev1.PodStatus{ - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, - }, - }, - }, + incomingPod: utiltest.MakePod("pod1"). 
+ Labels(map[string]string{"some-key": "some-val"}). + DeletionTimestamp(). + ReadyCondition().ObjRef(), wantPods: []datastore.Pod{basePod2.Pod}, }, { name: "Delete notfound pod", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -154,7 +112,7 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, { name: "New pod, not ready, valid selector", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -162,27 +120,13 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod3", - Labels: map[string]string{ - "some-key": "some-val", - }, - }, - Status: corev1.PodStatus{ - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionFalse, - }, - }, - }, - }, + incomingPod: utiltest.MakePod("pod3"). 
+ Labels(map[string]string{"some-key": "some-val"}).ObjRef(), wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod}, }, { name: "Remove pod that does not match selector", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -190,27 +134,14 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Labels: map[string]string{ - "some-wrong-key": "some-val", - }, - }, - Status: corev1.PodStatus{ - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionTrue, - }, - }, - }, - }, + incomingPod: utiltest.MakePod("pod1"). + Labels(map[string]string{"some-wrong-key": "some-val"}). + ReadyCondition().ObjRef(), wantPods: []datastore.Pod{basePod2.Pod}, }, { name: "Remove pod that is not ready", - datastore: datastore.NewFakeDatastore(populateMap(basePod1, basePod2), nil, &v1alpha2.InferencePool{ + datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ @@ -218,22 +149,9 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }, }, }), - incomingPod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "pod1", - Labels: map[string]string{ - "some-wrong-key": "some-val", - }, - }, - Status: corev1.PodStatus{ - Conditions: []corev1.PodCondition{ - { - Type: corev1.PodReady, - Status: corev1.ConditionFalse, - }, - }, - }, - }, + incomingPod: utiltest.MakePod("pod1"). + Labels(map[string]string{"some-wrong-key": "some-val"}). 
+ ReadyCondition().ObjRef(), wantPods: []datastore.Pod{basePod2.Pod}, }, } @@ -274,11 +192,3 @@ func TestUpdateDatastore_PodReconciler(t *testing.T) { }) } } - -func populateMap(pods ...*datastore.PodMetrics) *sync.Map { - newMap := &sync.Map{} - for _, pod := range pods { - newMap.Store(pod.NamespacedName, &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: pod.NamespacedName, Address: pod.Address, ScrapePort: pod.ScrapePort, ScrapePath: pod.ScrapePath}}) - } - return newMap -} diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index c5bbddcf..cd5d290f 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -19,6 +19,7 @@ package datastore import ( "context" "errors" + "fmt" "math/rand" "sync" @@ -32,6 +33,14 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +const ( + ModelNameIndexKey = "spec.modelName" +) + +var ( + errPoolNotSynced = errors.New("InferencePool is not initialized in data store") +) + // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { // InferencePool operations @@ -41,9 +50,11 @@ type Datastore interface { PoolLabelsMatch(podLabels map[string]string) bool // InferenceModel operations - ModelSet(infModel *v1alpha2.InferenceModel) + ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) - ModelDelete(modelName string) + ModelDelete(namespacedName types.NamespacedName) (*v1alpha2.InferenceModel, bool) + ModelResync(ctx context.Context, ctrlClient client.Client, modelName string) (bool, error) + ModelGetAll() []*v1alpha2.InferenceModel // PodMetrics operations PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool @@ -61,22 +72,27 @@ type Datastore interface { func NewDatastore() Datastore { store := &datastore{ - poolMu: sync.RWMutex{}, - models: &sync.Map{}, - pods: &sync.Map{}, + poolAndModelsMu: 
sync.RWMutex{}, + models: make(map[string]*v1alpha2.InferenceModel), + pods: &sync.Map{}, } return store } // Used for test only -func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha2.InferencePool) Datastore { +func NewFakeDatastore(pods []*PodMetrics, models []*v1alpha2.InferenceModel, pool *v1alpha2.InferencePool) Datastore { store := NewDatastore() - if pods != nil { - store.(*datastore).pods = pods + + for _, pod := range pods { + // Making a copy since in tests we may use the same global PodMetric across tests. + p := *pod + store.(*datastore).pods.Store(pod.NamespacedName, &p) } - if models != nil { - store.(*datastore).models = models + + for _, m := range models { + store.ModelSetIfOlder(m) } + if pool != nil { store.(*datastore).pool = pool } @@ -84,65 +100,132 @@ func NewFakeDatastore(pods, models *sync.Map, pool *v1alpha2.InferencePool) Data } type datastore struct { - // poolMu is used to synchronize access to the inferencePool. - poolMu sync.RWMutex - pool *v1alpha2.InferencePool - models *sync.Map + // poolAndModelsMu is used to synchronize access to pool and the models map. 
+ poolAndModelsMu sync.RWMutex + pool *v1alpha2.InferencePool + // key: InferenceModel.Spec.ModelName, value: *InferenceModel + models map[string]*v1alpha2.InferenceModel // key: types.NamespacedName, value: *PodMetrics pods *sync.Map } func (ds *datastore) Clear() { - ds.poolMu.Lock() - defer ds.poolMu.Unlock() + ds.poolAndModelsMu.Lock() + defer ds.poolAndModelsMu.Unlock() ds.pool = nil - ds.models.Clear() + ds.models = make(map[string]*v1alpha2.InferenceModel) ds.pods.Clear() } // /// InferencePool APIs /// func (ds *datastore) PoolSet(pool *v1alpha2.InferencePool) { - ds.poolMu.Lock() - defer ds.poolMu.Unlock() + ds.poolAndModelsMu.Lock() + defer ds.poolAndModelsMu.Unlock() ds.pool = pool } func (ds *datastore) PoolGet() (*v1alpha2.InferencePool, error) { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() + ds.poolAndModelsMu.RLock() + defer ds.poolAndModelsMu.RUnlock() if !ds.PoolHasSynced() { - return nil, errors.New("InferencePool is not initialized in data store") + return nil, errPoolNotSynced } return ds.pool, nil } func (ds *datastore) PoolHasSynced() bool { - ds.poolMu.RLock() - defer ds.poolMu.RUnlock() + ds.poolAndModelsMu.RLock() + defer ds.poolAndModelsMu.RUnlock() return ds.pool != nil } func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool { + ds.poolAndModelsMu.RLock() + defer ds.poolAndModelsMu.RUnlock() poolSelector := selectorFromInferencePoolSelector(ds.pool.Spec.Selector) podSet := labels.Set(podLabels) return poolSelector.Matches(podSet) } // /// InferenceModel APIs /// -func (ds *datastore) ModelSet(infModel *v1alpha2.InferenceModel) { - ds.models.Store(infModel.Spec.ModelName, infModel) +func (ds *datastore) ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool { + ds.poolAndModelsMu.Lock() + defer ds.poolAndModelsMu.Unlock() + + // Check first if the existing model is older. 
+ // One exception is if the incoming model object is the same, in which case, we should not + // check for creation timestamp since that means the object was re-created, and so we should override. + existing, exists := ds.models[infModel.Spec.ModelName] + if exists { + diffObj := infModel.Name != existing.Name || infModel.Namespace != existing.Namespace + if diffObj && existing.ObjectMeta.CreationTimestamp.Before(&infModel.ObjectMeta.CreationTimestamp) { + return false + } + } + // Set the model. + ds.models[infModel.Spec.ModelName] = infModel + return true +} + +func (ds *datastore) ModelResync(ctx context.Context, c client.Client, modelName string) (bool, error) { + ds.poolAndModelsMu.Lock() + defer ds.poolAndModelsMu.Unlock() + + var models v1alpha2.InferenceModelList + if err := c.List(ctx, &models, client.MatchingFields{ModelNameIndexKey: modelName}, client.InNamespace(ds.pool.Namespace)); err != nil { + return false, fmt.Errorf("listing models that match the modelName %s: %w", modelName, err) + } + if len(models.Items) == 0 { + // No other instances of InferenceModels with this ModelName exists. + return false, nil + } + + var oldest *v1alpha2.InferenceModel + for i := range models.Items { + m := &models.Items[i] + if m.Spec.ModelName != modelName || // The index should filter those out, but just in case! + m.Spec.PoolRef.Name != ds.pool.Name || // We don't care about other pools, we could setup an index on this too! 
+ !m.DeletionTimestamp.IsZero() { // ignore objects marked for deletion + continue + } + if oldest == nil || m.ObjectMeta.CreationTimestamp.Before(&oldest.ObjectMeta.CreationTimestamp) { + oldest = m + } + } + if oldest == nil { + return false, nil + } + ds.models[modelName] = oldest + return true, nil } func (ds *datastore) ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) { - infModel, ok := ds.models.Load(modelName) - if ok { - return infModel.(*v1alpha2.InferenceModel), true + ds.poolAndModelsMu.RLock() + defer ds.poolAndModelsMu.RUnlock() + m, exists := ds.models[modelName] + return m, exists +} + +func (ds *datastore) ModelDelete(namespacedName types.NamespacedName) (*v1alpha2.InferenceModel, bool) { + ds.poolAndModelsMu.Lock() + defer ds.poolAndModelsMu.Unlock() + for _, m := range ds.models { + if m.Name == namespacedName.Name && m.Namespace == namespacedName.Namespace { + delete(ds.models, m.Spec.ModelName) + return m, true + } } return nil, false } -func (ds *datastore) ModelDelete(modelName string) { - ds.models.Delete(modelName) +func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel { + ds.poolAndModelsMu.RLock() + defer ds.poolAndModelsMu.RUnlock() + res := []*v1alpha2.InferenceModel{} + for _, v := range ds.models { + res = append(res, v) + } + return res } // /// Pods/endpoints APIs /// diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 2af36541..edc96626 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -19,45 +19,194 @@ package datastore import ( "testing" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + testutil 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) -func TestHasSynced(t *testing.T) { +func TestPool(t *testing.T) { + pool1Selector := map[string]string{"app": "vllm_v1"} + pool1 := testutil.MakeInferencePool("pool1"). + Namespace("default"). + Selector(pool1Selector).ObjRef() tests := []struct { - name string - inferencePool *v1alpha2.InferencePool - hasSynced bool + name string + inferencePool *v1alpha2.InferencePool + labels map[string]string + wantSynced bool + wantPool *v1alpha2.InferencePool + wantErr error + wantLabelsMatch bool }{ { - name: "Ready when InferencePool exists in data store", - inferencePool: &v1alpha2.InferencePool{ - ObjectMeta: v1.ObjectMeta{ - Name: "test-pool", - Namespace: "default", - }, - }, - hasSynced: true, + name: "Ready when InferencePool exists in data store", + inferencePool: pool1, + labels: pool1Selector, + wantSynced: true, + wantPool: pool1, + wantLabelsMatch: true, + }, + { + name: "Labels not matched", + inferencePool: pool1, + labels: map[string]string{"app": "vllm_v2"}, + wantSynced: true, + wantPool: pool1, + wantLabelsMatch: false, }, { - name: "Not ready when InferencePool is nil in data store", - inferencePool: nil, - hasSynced: false, + name: "Not ready when InferencePool is nil in data store", + wantErr: errPoolNotSynced, + wantSynced: false, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { datastore := NewDatastore() - // Set the inference pool - if tt.inferencePool != nil { - datastore.PoolSet(tt.inferencePool) + datastore.PoolSet(tt.inferencePool) + gotPool, gotErr := datastore.PoolGet() + if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { + t.Errorf("Unexpected error diff (+got/-want): %s", diff) + } + if diff := cmp.Diff(tt.wantPool, gotPool); diff != "" { + t.Errorf("Unexpected pool diff (+got/-want): %s", diff) + } + gotSynced := datastore.PoolHasSynced() + if diff := cmp.Diff(tt.wantSynced, gotSynced); diff != "" { + t.Errorf("Unexpected 
synced diff (+got/-want): %s", diff) + } + if tt.labels != nil { + gotLabelsMatch := datastore.PoolLabelsMatch(tt.labels) + if diff := cmp.Diff(tt.wantLabelsMatch, gotLabelsMatch); diff != "" { + t.Errorf("Unexpected labels match diff (+got/-want): %s", diff) + } + } + }) + } +} + +func TestModel(t *testing.T) { + chatModel := "chat" + tsModel := "tweet-summary" + model1ts := testutil.MakeInferenceModel("model1"). + CreationTimestamp(metav1.Unix(1000, 0)). + ModelName(tsModel).ObjRef() + // Same model name as model1ts, different object name. + model2ts := testutil.MakeInferenceModel("model2"). + CreationTimestamp(metav1.Unix(1001, 0)). + ModelName(tsModel).ObjRef() + // Same model name as model1ts, newer timestamp + model1tsNewer := testutil.MakeInferenceModel("model1"). + CreationTimestamp(metav1.Unix(1002, 0)). + Criticality(v1alpha2.Critical). + ModelName(tsModel).ObjRef() + model2tsNewer := testutil.MakeInferenceModel("model2"). + CreationTimestamp(metav1.Unix(1003, 0)). + ModelName(tsModel).ObjRef() + // Same object name as model2ts, different model name. + model2chat := testutil.MakeInferenceModel(model2ts.Name). + CreationTimestamp(metav1.Unix(1005, 0)). 
+ ModelName(chatModel).ObjRef() + + tests := []struct { + name string + existingModels []*v1alpha2.InferenceModel + op func(ds Datastore) bool + wantOpResult bool + wantModels []*v1alpha2.InferenceModel + }{ + { + name: "Add model1 with tweet-summary as modelName", + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantModels: []*v1alpha2.InferenceModel{model1ts}, + wantOpResult: true, + }, + { + name: "Set model1 with the same modelName, but with diff criticality and newer creation timestamp, should update.", + existingModels: []*v1alpha2.InferenceModel{model1ts}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1tsNewer) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model1tsNewer}, + }, + { + name: "set model2 with the same modelName, but newer creation timestamp, should not update.", + existingModels: []*v1alpha2.InferenceModel{model1tsNewer}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model2tsNewer) + }, + wantOpResult: false, + wantModels: []*v1alpha2.InferenceModel{model1tsNewer}, + }, + { + name: "Set model2 with the same modelName, but older creation timestamp, should update", + existingModels: []*v1alpha2.InferenceModel{model1tsNewer}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model2ts) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2ts}, + }, + { + name: "Set model1 with the tweet-summary modelName, both models should exist", + existingModels: []*v1alpha2.InferenceModel{model2chat}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Set model1 with the tweet-summary modelName, both models should exist", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + return ds.ModelSetIfOlder(model1ts) + }, + wantOpResult: true, + wantModels: 
[]*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Getting by model name, chat -> model2", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + gotChat, exists := ds.ModelGet(chatModel) + return exists && cmp.Diff(model2chat, gotChat) == "" + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + }, + { + name: "Delete the model", + existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, + op: func(ds Datastore) bool { + _, existed := ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) + _, exists := ds.ModelGet(tsModel) + return existed && !exists + + }, + wantOpResult: true, + wantModels: []*v1alpha2.InferenceModel{model2chat}, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ds := NewFakeDatastore(nil, test.existingModels, nil) + gotOpResult := test.op(ds) + if gotOpResult != test.wantOpResult { + t.Errorf("Unexpected operation result, want: %v, got: %v", test.wantOpResult, gotOpResult) } - // Check if the data store has been initialized - hasSynced := datastore.PoolHasSynced() - if hasSynced != tt.hasSynced { - t.Errorf("IsInitialized() = %v, want %v", hasSynced, tt.hasSynced) + + if diff := testutil.DiffModelLists(test.wantModels, ds.ModelGetAll()); diff != "" { + t.Errorf("Unexpected models diff: %s", diff) } + }) } } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 6e6b68b1..f3d9b6ac 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -85,7 +85,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { } // SetupWithManager sets up the runner with the given manager. 
-func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { +func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { // Create the controllers and register them with the manager if err := (&controller.InferencePoolReconciler{ Datastore: r.Datastore, @@ -109,7 +109,7 @@ func (r *ExtProcServerRunner) SetupWithManager(mgr ctrl.Manager) error { Namespace: r.PoolNamespace, }, Record: mgr.GetEventRecorderFor("InferenceModel"), - }).SetupWithManager(mgr); err != nil { + }).SetupWithManager(ctx, mgr); err != nil { return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } diff --git a/pkg/epp/test/utils.go b/pkg/epp/test/utils.go index 6a75ed2f..a916bda2 100644 --- a/pkg/epp/test/utils.go +++ b/pkg/epp/test/utils.go @@ -53,14 +53,15 @@ func StartExtProc( pmc := &backend.FakePodMetricsClient{Res: pms} datastore := datastore.NewDatastore() for _, m := range models { - datastore.ModelSet(m) + datastore.ModelSetIfOlder(m) } for _, pm := range pods { - pod := utiltesting.MakePod(pm.NamespacedName.Name, pm.NamespacedName.Namespace). + pod := utiltesting.MakePod(pm.NamespacedName.Name). + Namespace(pm.NamespacedName.Namespace). ReadyCondition(). IP(pm.Address). - Obj() - datastore.PodUpdateOrAddIfNotExist(&pod) + ObjRef() + datastore.PodUpdateOrAddIfNotExist(pod) datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) } pp := backend.NewProvider(pmc, datastore) diff --git a/pkg/epp/util/testing/diff.go b/pkg/epp/util/testing/diff.go new file mode 100644 index 00000000..34b0b8ca --- /dev/null +++ b/pkg/epp/util/testing/diff.go @@ -0,0 +1,27 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testing + +import ( + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +func DiffModelLists(want, got []*v1alpha2.InferenceModel) string { + return cmp.Diff(want, got, cmpopts.SortSlices(func(a, b *v1alpha2.InferenceModel) bool { return a.Name < b.Name })) +} diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go index 7c9a2939..bfcf2690 100644 --- a/pkg/epp/util/testing/wrappers.go +++ b/pkg/epp/util/testing/wrappers.go @@ -19,6 +19,7 @@ package testing import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) // PodWrapper wraps a Pod. @@ -27,12 +28,11 @@ type PodWrapper struct { } // MakePod creates a wrapper for a Pod. -func MakePod(podName, ns string) *PodWrapper { +func MakePod(podName string) *PodWrapper { return &PodWrapper{ corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: podName, - Namespace: ns, + Name: podName, }, Spec: corev1.PodSpec{}, Status: corev1.PodStatus{}, @@ -40,6 +40,11 @@ func MakePod(podName, ns string) *PodWrapper { } } +func (p *PodWrapper) Namespace(ns string) *PodWrapper { + p.ObjectMeta.Namespace = ns + return p +} + // Labels sets the pod labels. 
func (p *PodWrapper) Labels(labels map[string]string) *PodWrapper { p.ObjectMeta.Labels = labels @@ -60,7 +65,109 @@ func (p *PodWrapper) IP(ip string) *PodWrapper { return p } +func (p *PodWrapper) DeletionTimestamp() *PodWrapper { + now := metav1.Now() + p.ObjectMeta.DeletionTimestamp = &now + p.ObjectMeta.Finalizers = []string{"finalizer"} + return p +} + // Obj returns the wrapped Pod. -func (p *PodWrapper) Obj() corev1.Pod { - return p.Pod +func (p *PodWrapper) ObjRef() *corev1.Pod { + return &p.Pod +} + +// InferenceModelWrapper wraps an InferenceModel. +type InferenceModelWrapper struct { + v1alpha2.InferenceModel +} + +// MakeInferenceModel creates a wrapper for a InferenceModel. +func MakeInferenceModel(name string) *InferenceModelWrapper { + return &InferenceModelWrapper{ + v1alpha2.InferenceModel{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: v1alpha2.InferenceModelSpec{}, + }, + } +} + +func (m *InferenceModelWrapper) Namespace(ns string) *InferenceModelWrapper { + m.ObjectMeta.Namespace = ns + return m +} + +// Obj returns the wrapped InferenceModel. 
+func (m *InferenceModelWrapper) ObjRef() *v1alpha2.InferenceModel { + return &m.InferenceModel +} + +func (m *InferenceModelWrapper) ModelName(modelName string) *InferenceModelWrapper { + m.Spec.ModelName = modelName + return m +} + +func (m *InferenceModelWrapper) PoolName(poolName string) *InferenceModelWrapper { + m.Spec.PoolRef = v1alpha2.PoolObjectReference{Name: poolName} + return m +} + +func (m *InferenceModelWrapper) Criticality(criticality v1alpha2.Criticality) *InferenceModelWrapper { + m.Spec.Criticality = &criticality + return m +} + +func (m *InferenceModelWrapper) DeletionTimestamp() *InferenceModelWrapper { + now := metav1.Now() + m.ObjectMeta.DeletionTimestamp = &now + m.ObjectMeta.Finalizers = []string{"finalizer"} + return m +} + +func (m *InferenceModelWrapper) CreationTimestamp(t metav1.Time) *InferenceModelWrapper { + m.ObjectMeta.CreationTimestamp = t + return m +} + +// InferencePoolWrapper wraps an InferencePool. +type InferencePoolWrapper struct { + v1alpha2.InferencePool +} + +// MakeInferencePool creates a wrapper for a InferencePool. +func MakeInferencePool(name string) *InferencePoolWrapper { + return &InferencePoolWrapper{ + v1alpha2.InferencePool{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Spec: v1alpha2.InferencePoolSpec{}, + }, + } +} + +func (m *InferencePoolWrapper) Namespace(ns string) *InferencePoolWrapper { + m.ObjectMeta.Namespace = ns + return m +} + +func (m *InferencePoolWrapper) Selector(selector map[string]string) *InferencePoolWrapper { + s := make(map[v1alpha2.LabelKey]v1alpha2.LabelValue) + for k, v := range selector { + s[v1alpha2.LabelKey(k)] = v1alpha2.LabelValue(v) + } + m.Spec.Selector = s + return m +} + +func (m *InferencePoolWrapper) TargetPortNumber(p int32) *InferencePoolWrapper { + m.Spec.TargetPortNumber = p + return m +} + +// Obj returns the wrapped InferencePool. 
+func (m *InferencePoolWrapper) ObjRef() *v1alpha2.InferencePool { + return &m.InferencePool } diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 14ee738f..3d068c9f 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -245,11 +245,6 @@ func createModelServer(k8sClient client.Client, secretPath, deployPath string) { // Wait for the deployment to be available. testutils.DeploymentAvailable(ctx, k8sClient, deploy, modelReadyTimeout, interval) - - // Wait for the service to exist. - testutils.EventuallyExists(ctx, func() error { - return k8sClient.Get(ctx, types.NamespacedName{Namespace: nsName, Name: modelServerName}, &corev1.Service{}) - }, existsTimeout, interval) } // createEnvoy creates the envoy proxy resources used for testing from the given filePath. diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 85c49913..2ea66dba 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -360,11 +360,12 @@ func setUpHermeticServer(podMetrics []*datastore.PodMetrics) (client extProcPb.E go func() { serverRunner.Datastore.PodDeleteAll() for _, pm := range podMetrics { - pod := utiltesting.MakePod(pm.NamespacedName.Name, pm.NamespacedName.Namespace). + pod := utiltesting.MakePod(pm.NamespacedName.Name). + Namespace(pm.NamespacedName.Namespace). ReadyCondition(). IP(pm.Address). 
- Obj() - serverRunner.Datastore.PodUpdateOrAddIfNotExist(&pod) + ObjRef() + serverRunner.Datastore.PodUpdateOrAddIfNotExist(pod) serverRunner.Datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) } serverRunner.Provider = backend.NewProvider(pmc, serverRunner.Datastore) @@ -429,7 +430,7 @@ func BeforeSuit(t *testing.T) func() { serverRunner.Datastore = datastore.NewDatastore() serverRunner.SecureServing = false - if err := serverRunner.SetupWithManager(mgr); err != nil { + if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil { logutil.Fatal(logger, err, "Failed to setup server runner") } From b9bbc2e1864521cfe62fe09c924638e60b79a6ab Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Thu, 27 Feb 2025 20:02:31 +0000 Subject: [PATCH 57/96] Updated yamls to use v1alpha2 (#420) --- config/manifests/ext_proc.yaml | 2 +- config/manifests/inferencemodel.yaml | 2 +- test/testdata/inferencepool-with-model-hermetic.yaml | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index 49145d24..f96113e1 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -40,7 +40,7 @@ roleRef: kind: ClusterRole name: pod-read --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: labels: diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 2a292c16..57240298 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: name: inferencemodel-sample diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml index 372a8512..c9ca763e 100644 
--- a/test/testdata/inferencepool-with-model-hermetic.yaml +++ b/test/testdata/inferencepool-with-model-hermetic.yaml @@ -1,4 +1,4 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: name: vllm-llama2-7b-pool @@ -10,7 +10,7 @@ spec: extensionRef: name: epp --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: name: inferencemodel-sample @@ -24,7 +24,7 @@ spec: - name: sql-lora-1fdg2 weight: 100 --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: name: inferencemodel-sheddable @@ -37,7 +37,7 @@ spec: - name: sql-lora-1fdg3 weight: 100 --- -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: name: inferencemodel-generic From 10133bafe4e28368b2df52ff2d59ee151b38ca68 Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Fri, 28 Feb 2025 04:14:31 +0800 Subject: [PATCH 58/96] Rm v1alpha1 api (#405) * remove v1alpha1 * auto gen * Add document to disallow cross namespace match explicitly --- api/v1alpha1/doc.go | 23 -- api/v1alpha1/groupversion_info.go | 45 --- api/v1alpha1/inferencemodel_types.go | 234 ------------ api/v1alpha1/inferencepool_types.go | 238 ------------ api/v1alpha1/zz_generated.deepcopy.go | 361 ------------------ api/v1alpha2/inferencepool_types.go | 2 + .../api/v1alpha1/endpointpickerconfig.go | 38 -- .../api/v1alpha1/extension.go | 75 ---- .../api/v1alpha1/extensionconnection.go | 42 -- .../api/v1alpha1/extensionreference.go | 65 ---- .../api/v1alpha1/inferencemodel.go | 224 ----------- .../api/v1alpha1/inferencemodelspec.go | 74 ---- .../api/v1alpha1/inferencemodelstatus.go | 47 --- .../api/v1alpha1/inferencepool.go | 224 ----------- .../api/v1alpha1/inferencepoolspec.go | 66 ---- 
.../api/v1alpha1/inferencepoolstatus.go | 47 --- .../api/v1alpha1/poolobjectreference.go | 56 --- .../api/v1alpha1/targetmodel.go | 47 --- client-go/applyconfiguration/utils.go | 30 +- client-go/clientset/versioned/clientset.go | 13 - .../versioned/fake/clientset_generated.go | 7 - .../clientset/versioned/fake/register.go | 2 - .../clientset/versioned/scheme/register.go | 2 - .../typed/api/v1alpha1/api_client.go | 111 ------ .../versioned/typed/api/v1alpha1/doc.go | 19 - .../versioned/typed/api/v1alpha1/fake/doc.go | 19 - .../api/v1alpha1/fake/fake_api_client.go | 43 --- .../api/v1alpha1/fake/fake_inferencemodel.go | 52 --- .../api/v1alpha1/fake/fake_inferencepool.go | 52 --- .../typed/api/v1alpha1/generated_expansion.go | 22 -- .../typed/api/v1alpha1/inferencemodel.go | 73 ---- .../typed/api/v1alpha1/inferencepool.go | 73 ---- .../externalversions/api/interface.go | 8 - .../api/v1alpha1/inferencemodel.go | 89 ----- .../api/v1alpha1/inferencepool.go | 89 ----- .../api/v1alpha1/interface.go | 51 --- .../informers/externalversions/generic.go | 9 +- .../api/v1alpha1/expansion_generated.go | 34 -- .../listers/api/v1alpha1/inferencemodel.go | 69 ---- .../listers/api/v1alpha1/inferencepool.go | 69 ---- cmd/epp/main.go | 3 - ...e.networking.x-k8s.io_inferencemodels.yaml | 224 ----------- ...ce.networking.x-k8s.io_inferencepools.yaml | 192 +--------- 43 files changed, 6 insertions(+), 3257 deletions(-) delete mode 100644 api/v1alpha1/doc.go delete mode 100644 api/v1alpha1/groupversion_info.go delete mode 100644 api/v1alpha1/inferencemodel_types.go delete mode 100644 api/v1alpha1/inferencepool_types.go delete mode 100644 api/v1alpha1/zz_generated.deepcopy.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/extension.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/extensionconnection.go delete mode 100644 
client-go/applyconfiguration/api/v1alpha1/extensionreference.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencemodel.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencepool.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go delete mode 100644 client-go/applyconfiguration/api/v1alpha1/targetmodel.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/api_client.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/doc.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go delete mode 100644 client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go delete mode 100644 client-go/informers/externalversions/api/v1alpha1/inferencemodel.go delete mode 100644 client-go/informers/externalversions/api/v1alpha1/inferencepool.go delete mode 100644 client-go/informers/externalversions/api/v1alpha1/interface.go delete mode 100644 client-go/listers/api/v1alpha1/expansion_generated.go delete mode 100644 client-go/listers/api/v1alpha1/inferencemodel.go delete mode 100644 client-go/listers/api/v1alpha1/inferencepool.go diff --git a/api/v1alpha1/doc.go 
b/api/v1alpha1/doc.go deleted file mode 100644 index 8e970ced..00000000 --- a/api/v1alpha1/doc.go +++ /dev/null @@ -1,23 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package v1alpha1 contains API Schema definitions for the -// inference.networking.x-k8s.io API group. -// -// +k8s:openapi-gen=true -// +kubebuilder:object:generate=true -// +groupName=inference.networking.x-k8s.io -package v1alpha1 diff --git a/api/v1alpha1/groupversion_info.go b/api/v1alpha1/groupversion_info.go deleted file mode 100644 index 8c0a449f..00000000 --- a/api/v1alpha1/groupversion_info.go +++ /dev/null @@ -1,45 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -// Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API group -// +kubebuilder:object:generate=true -// +groupName=inference.networking.x-k8s.io -package v1alpha1 - -import ( - "k8s.io/apimachinery/pkg/runtime/schema" - "sigs.k8s.io/controller-runtime/pkg/scheme" -) - -var ( - // GroupVersion is group version used to register these objects - GroupVersion = schema.GroupVersion{Group: "inference.networking.x-k8s.io", Version: "v1alpha1"} - - // SchemeGroupVersion is alias to GroupVersion for client-go libraries. - // It is required by pkg/client/informers/externalversions/... - SchemeGroupVersion = GroupVersion - - // SchemeBuilder is used to add go types to the GroupVersionKind scheme - SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} - - // AddToScheme adds the types in this group-version to the given scheme. - AddToScheme = SchemeBuilder.AddToScheme -) - -// Resource is required by pkg/client/listers/... -func Resource(resource string) schema.GroupResource { - return GroupVersion.WithResource(resource).GroupResource() -} diff --git a/api/v1alpha1/inferencemodel_types.go b/api/v1alpha1/inferencemodel_types.go deleted file mode 100644 index f171c10e..00000000 --- a/api/v1alpha1/inferencemodel_types.go +++ /dev/null @@ -1,234 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// InferenceModel is the Schema for the InferenceModels API. -// -// +kubebuilder:object:root=true -// +kubebuilder:subresource:status -// +genclient -type InferenceModel struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` - - Spec InferenceModelSpec `json:"spec,omitempty"` - Status InferenceModelStatus `json:"status,omitempty"` -} - -// InferenceModelList contains a list of InferenceModel. -// -// +kubebuilder:object:root=true -type InferenceModelList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []InferenceModel `json:"items"` -} - -// InferenceModelSpec represents the desired state of a specific model use case. This resource is -// managed by the "Inference Workload Owner" persona. -// -// The Inference Workload Owner persona is someone that trains, verifies, and -// leverages a large language model from a model frontend, drives the lifecycle -// and rollout of new versions of those models, and defines the specific -// performance and latency goals for the model. These workloads are -// expected to operate within an InferencePool sharing compute capacity with other -// InferenceModels, defined by the Inference Platform Admin. -// -// InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, -// if the name is reused, an error will be shown on the status of a -// InferenceModel that attempted to reuse. The oldest InferenceModel, based on -// creation timestamp, will be selected to remain valid. In the event of a race -// condition, one will be selected at random. -type InferenceModelSpec struct { - // ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. - // ModelNames must be unique for a referencing InferencePool - // (names can be reused for a different pool in the same cluster). 
- // The modelName with the oldest creation timestamp is retained, and the incoming - // InferenceModel is sets the Ready status to false with a corresponding reason. - // In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - // Names can be reserved without an underlying model configured in the pool. - // This can be done by specifying a target model and setting the weight to zero, - // an error will be returned specifying that no valid target model is found. - // - // +kubebuilder:validation:MaxLength=256 - // +kubebuilder:validation:Required - ModelName string `json:"modelName"` - - // Criticality defines how important it is to serve the model compared to other models referencing the same pool. - // Criticality impacts how traffic is handled in resource constrained situations. It handles this by - // queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - // fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - // and the proportionality of fairness will be configurable. - // - // Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. - // Any implementations that may consume this field may treat an unset value as the 'Standard' range. - // +optional - Criticality *Criticality `json:"criticality,omitempty"` - - // TargetModels allow multiple versions of a model for traffic splitting. - // If not specified, the target model name is defaulted to the modelName parameter. - // modelName is often in reference to a LoRA adapter. 
- // - // +optional - // +kubebuilder:validation:MaxItems=10 - // +kubebuilder:validation:XValidation:message="Weights should be set for all models, or none of the models.",rule="self.all(model, has(model.weight)) || self.all(model, !has(model.weight))" - TargetModels []TargetModel `json:"targetModels,omitempty"` - - // PoolRef is a reference to the inference pool, the pool must exist in the same namespace. - // - // +kubebuilder:validation:Required - PoolRef PoolObjectReference `json:"poolRef"` -} - -// PoolObjectReference identifies an API object within the namespace of the -// referrer. -type PoolObjectReference struct { - // Group is the group of the referent. - // - // +optional - // +kubebuilder:default="inference.networking.x-k8s.io" - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` - Group string `json:"group,omitempty"` - - // Kind is kind of the referent. For example "InferencePool". - // - // +optional - // +kubebuilder:default="InferencePool" - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=63 - // +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` - Kind string `json:"kind,omitempty"` - - // Name is the name of the referent. - // - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Required - Name string `json:"name"` -} - -// Criticality defines how important it is to serve the model compared to other models. -// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default. -// This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. 
-// +kubebuilder:validation:Enum=Critical;Standard;Sheddable -type Criticality string - -const ( - // Critical defines the highest level of criticality. Requests to this band will be shed last. - Critical Criticality = "Critical" - - // Standard defines the base criticality level and is more important than Sheddable but less - // important than Critical. Requests in this band will be shed before critical traffic. - // Most models are expected to fall within this band. - Standard Criticality = "Standard" - - // Sheddable defines the lowest level of criticality. Requests to this band will be shed before - // all other bands. - Sheddable Criticality = "Sheddable" -) - -// TargetModel represents a deployed model or a LoRA adapter. The -// Name field is expected to match the name of the LoRA adapter -// (or base model) as it is registered within the model server. Inference -// Gateway assumes that the model exists on the model server and it's the -// responsibility of the user to validate a correct match. Should a model fail -// to exist at request time, the error is processed by the Inference Gateway -// and emitted on the appropriate InferenceModel object. -type TargetModel struct { - // Name is the name of the adapter or base model, as expected by the ModelServer. - // - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Required - Name string `json:"name"` - - // Weight is used to determine the proportion of traffic that should be - // sent to this model when multiple target models are specified. - // - // Weight defines the proportion of requests forwarded to the specified - // model. This is computed as weight/(sum of all weights in this - // TargetModels list). For non-zero values, there may be some epsilon from - // the exact proportion defined here depending on the precision an - // implementation supports. Weight is not a percentage and the sum of - // weights does not need to equal 100. 
- // - // If a weight is set for any targetModel, it must be set for all targetModels. - // Conversely weights are optional, so long as ALL targetModels do not specify a weight. - // - // +optional - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:validation:Maximum=1000000 - Weight *int32 `json:"weight,omitempty"` -} - -// InferenceModelStatus defines the observed state of InferenceModel -type InferenceModelStatus struct { - // Conditions track the state of the InferenceModel. - // - // Known condition types are: - // - // * "Accepted" - // - // +optional - // +listType=map - // +listMapKey=type - // +kubebuilder:validation:MaxItems=8 - // +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} - Conditions []metav1.Condition `json:"conditions,omitempty"` -} - -// InferenceModelConditionType is a type of condition for the InferenceModel. -type InferenceModelConditionType string - -// InferenceModelConditionReason is the reason for a given InferenceModelConditionType. -type InferenceModelConditionReason string - -const ( - // ModelConditionAccepted indicates if the model config is accepted, and if not, why. - // - // Possible reasons for this condition to be True are: - // - // * "Accepted" - // - // Possible reasons for this condition to be False are: - // - // * "ModelNameInUse" - // - // Possible reasons for this condition to be Unknown are: - // - // * "Pending" - // - ModelConditionAccepted InferenceModelConditionType = "Accepted" - - // ModelReasonAccepted is the desired state. Model conforms to the state of the pool. - ModelReasonAccepted InferenceModelConditionReason = "Accepted" - - // ModelReasonNameInUse is used when a given ModelName already exists within the pool. - // Details about naming conflict resolution are on the ModelName field itself. 
- ModelReasonNameInUse InferenceModelConditionReason = "ModelNameInUse" - - // ModelReasonPending is the initial state, and indicates that the controller has not yet reconciled the InferenceModel. - ModelReasonPending InferenceModelConditionReason = "Pending" -) - -func init() { - SchemeBuilder.Register(&InferenceModel{}, &InferenceModelList{}) -} diff --git a/api/v1alpha1/inferencepool_types.go b/api/v1alpha1/inferencepool_types.go deleted file mode 100644 index b4c95d40..00000000 --- a/api/v1alpha1/inferencepool_types.go +++ /dev/null @@ -1,238 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// InferencePool is the Schema for the InferencePools API. -// -// +kubebuilder:object:root=true -// +kubebuilder:subresource:status -// +genclient -type InferencePool struct { - metav1.TypeMeta `json:",inline"` - metav1.ObjectMeta `json:"metadata,omitempty"` - - Spec InferencePoolSpec `json:"spec,omitempty"` - Status InferencePoolStatus `json:"status,omitempty"` -} - -// InferencePoolList contains a list of InferencePool. 
-// -// +kubebuilder:object:root=true -type InferencePoolList struct { - metav1.TypeMeta `json:",inline"` - metav1.ListMeta `json:"metadata,omitempty"` - Items []InferencePool `json:"items"` -} - -// InferencePoolSpec defines the desired state of InferencePool -type InferencePoolSpec struct { - // Selector defines a map of labels to watch model server pods - // that should be included in the InferencePool. - // In some cases, implementations may translate this field to a Service selector, so this matches the simple - // map used for Service selectors instead of the full Kubernetes LabelSelector type. - // - // +kubebuilder:validation:Required - Selector map[LabelKey]LabelValue `json:"selector"` - - // TargetPortNumber defines the port number to access the selected model servers. - // The number must be in the range 1 to 65535. - // - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=65535 - // +kubebuilder:validation:Required - TargetPortNumber int32 `json:"targetPortNumber"` - - // EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint - // picker service that picks endpoints for the requests routed to this pool. - EndpointPickerConfig `json:",inline"` -} - -// EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension. -// This type is intended to be a union of mutually exclusive configuration options that we may add in the future. -type EndpointPickerConfig struct { - // Extension configures an endpoint picker as an extension service. - // - // +kubebuilder:validation:Required - ExtensionRef *Extension `json:"extensionRef,omitempty"` -} - -// Extension specifies how to configure an extension that runs the endpoint picker. -type Extension struct { - // Reference is a reference to a service extension. 
- ExtensionReference `json:",inline"` - - // ExtensionConnection configures the connection between the gateway and the extension. - ExtensionConnection `json:",inline"` -} - -// ExtensionReference is a reference to the extension deployment. -type ExtensionReference struct { - // Group is the group of the referent. - // When unspecified or empty string, core API group is inferred. - // - // +optional - // +kubebuilder:default="" - Group *string `json:"group,omitempty"` - - // Kind is the Kubernetes resource kind of the referent. For example - // "Service". - // - // Defaults to "Service" when not specified. - // - // ExternalName services can refer to CNAME DNS records that may live - // outside of the cluster and as such are difficult to reason about in - // terms of conformance. They also may not be safe to forward to (see - // CVE-2021-25740 for more information). Implementations MUST NOT - // support ExternalName Services. - // - // +optional - // +kubebuilder:default=Service - Kind *string `json:"kind,omitempty"` - - // Name is the name of the referent. - // - // +kubebuilder:validation:Required - Name string `json:"name"` - - // The port number on the pods running the extension. When unspecified, implementations SHOULD infer a - // default value of 9002 when the Kind is Service. - // - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=65535 - // +optional - TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` -} - -// ExtensionConnection encapsulates options that configures the connection to the extension. -type ExtensionConnection struct { - // Configures how the gateway handles the case when the extension is not responsive. - // Defaults to failClose. - // - // +optional - // +kubebuilder:default="FailClose" - FailureMode *ExtensionFailureMode `json:"failureMode"` -} - -// ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not -// responsive. 
-// +kubebuilder:validation:Enum=FailOpen;FailClose -type ExtensionFailureMode string - -const ( - // FailOpen specifies that the proxy should not drop the request and forward the request to and endpoint of its picking. - FailOpen ExtensionFailureMode = "FailOpen" - // FailClose specifies that the proxy should drop the request. - FailClose ExtensionFailureMode = "FailClose" -) - -// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 -// Duplicated as to not take an unexpected dependency on gw's API. -// -// LabelKey is the key of a label. This is used for validation -// of maps. This matches the Kubernetes "qualified name" validation that is used for labels. -// Labels are case sensitive, so: my-label and My-Label are considered distinct. -// -// Valid values include: -// -// * example -// * example.com -// * example.com/path -// * example.com/path.html -// -// Invalid values include: -// -// * example~ - "~" is an invalid character -// * example.com. - can not start or end with "." -// -// +kubebuilder:validation:MinLength=1 -// +kubebuilder:validation:MaxLength=253 -// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$` -type LabelKey string - -// LabelValue is the value of a label. This is used for validation -// of maps. This matches the Kubernetes label validation rules: -// * must be 63 characters or less (can be empty), -// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), -// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. 
-// -// Valid values include: -// -// * MyValue -// * my.name -// * 123-my-value -// -// +kubebuilder:validation:MinLength=0 -// +kubebuilder:validation:MaxLength=63 -// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$` -type LabelValue string - -// InferencePoolStatus defines the observed state of InferencePool -type InferencePoolStatus struct { - // Conditions track the state of the InferencePool. - // - // Known condition types are: - // - // * "Ready" - // - // +optional - // +listType=map - // +listMapKey=type - // +kubebuilder:validation:MaxItems=8 - // +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} - Conditions []metav1.Condition `json:"conditions,omitempty"` -} - -// InferencePoolConditionType is a type of condition for the InferencePool -type InferencePoolConditionType string - -// InferencePoolConditionReason is the reason for a given InferencePoolConditionType -type InferencePoolConditionReason string - -const ( - // PoolConditionReady indicates if the pool is ready to accept traffic, and if not, why. - // - // Possible reasons for this condition to be True are: - // - // * "Ready" - // - // Possible reasons for this condition to be False are: - // - // * "EndpointPickerNotHealthy" - // - // Possible reasons for this condition to be Unknown are: - // - // * "Pending" - // - PoolConditionReady InferencePoolConditionType = "Ready" - - // PoolReasonReady is the desired state. The pool and its components are initialized and ready for traffic. - PoolReasonReady InferencePoolConditionReason = "Ready" - - // PoolReasonEPPNotHealthy is used when the EPP has not yet passed health checks, or has started failing them. - PoolReasonEPPNotHealthy InferencePoolConditionReason = "EndpointPickerNotHealthy" - - // PoolReasonPending is the initial state, and indicates that the controller has not yet reconciled this pool. 
- PoolReasonPending InferencePoolConditionReason = "Pending" -) - -func init() { - SchemeBuilder.Register(&InferencePool{}, &InferencePoolList{}) -} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go deleted file mode 100644 index fd55379e..00000000 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ /dev/null @@ -1,361 +0,0 @@ -//go:build !ignore_autogenerated - -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by controller-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" -) - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EndpointPickerConfig) DeepCopyInto(out *EndpointPickerConfig) { - *out = *in - if in.ExtensionRef != nil { - in, out := &in.ExtensionRef, &out.ExtensionRef - *out = new(Extension) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EndpointPickerConfig. -func (in *EndpointPickerConfig) DeepCopy() *EndpointPickerConfig { - if in == nil { - return nil - } - out := new(EndpointPickerConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *Extension) DeepCopyInto(out *Extension) { - *out = *in - in.ExtensionReference.DeepCopyInto(&out.ExtensionReference) - in.ExtensionConnection.DeepCopyInto(&out.ExtensionConnection) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Extension. -func (in *Extension) DeepCopy() *Extension { - if in == nil { - return nil - } - out := new(Extension) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ExtensionConnection) DeepCopyInto(out *ExtensionConnection) { - *out = *in - if in.FailureMode != nil { - in, out := &in.FailureMode, &out.FailureMode - *out = new(ExtensionFailureMode) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtensionConnection. -func (in *ExtensionConnection) DeepCopy() *ExtensionConnection { - if in == nil { - return nil - } - out := new(ExtensionConnection) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ExtensionReference) DeepCopyInto(out *ExtensionReference) { - *out = *in - if in.Group != nil { - in, out := &in.Group, &out.Group - *out = new(string) - **out = **in - } - if in.Kind != nil { - in, out := &in.Kind, &out.Kind - *out = new(string) - **out = **in - } - if in.TargetPortNumber != nil { - in, out := &in.TargetPortNumber, &out.TargetPortNumber - *out = new(int32) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtensionReference. -func (in *ExtensionReference) DeepCopy() *ExtensionReference { - if in == nil { - return nil - } - out := new(ExtensionReference) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *InferenceModel) DeepCopyInto(out *InferenceModel) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModel. -func (in *InferenceModel) DeepCopy() *InferenceModel { - if in == nil { - return nil - } - out := new(InferenceModel) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *InferenceModel) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferenceModelList) DeepCopyInto(out *InferenceModelList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]InferenceModel, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelList. -func (in *InferenceModelList) DeepCopy() *InferenceModelList { - if in == nil { - return nil - } - out := new(InferenceModelList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *InferenceModelList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *InferenceModelSpec) DeepCopyInto(out *InferenceModelSpec) { - *out = *in - if in.Criticality != nil { - in, out := &in.Criticality, &out.Criticality - *out = new(Criticality) - **out = **in - } - if in.TargetModels != nil { - in, out := &in.TargetModels, &out.TargetModels - *out = make([]TargetModel, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - out.PoolRef = in.PoolRef -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelSpec. -func (in *InferenceModelSpec) DeepCopy() *InferenceModelSpec { - if in == nil { - return nil - } - out := new(InferenceModelSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferenceModelStatus) DeepCopyInto(out *InferenceModelStatus) { - *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceModelStatus. -func (in *InferenceModelStatus) DeepCopy() *InferenceModelStatus { - if in == nil { - return nil - } - out := new(InferenceModelStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferencePool) DeepCopyInto(out *InferencePool) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - in.Spec.DeepCopyInto(&out.Spec) - in.Status.DeepCopyInto(&out.Status) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePool. 
-func (in *InferencePool) DeepCopy() *InferencePool { - if in == nil { - return nil - } - out := new(InferencePool) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *InferencePool) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferencePoolList) DeepCopyInto(out *InferencePoolList) { - *out = *in - out.TypeMeta = in.TypeMeta - in.ListMeta.DeepCopyInto(&out.ListMeta) - if in.Items != nil { - in, out := &in.Items, &out.Items - *out = make([]InferencePool, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolList. -func (in *InferencePoolList) DeepCopy() *InferencePoolList { - if in == nil { - return nil - } - out := new(InferencePoolList) - in.DeepCopyInto(out) - return out -} - -// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. -func (in *InferencePoolList) DeepCopyObject() runtime.Object { - if c := in.DeepCopy(); c != nil { - return c - } - return nil -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferencePoolSpec) DeepCopyInto(out *InferencePoolSpec) { - *out = *in - if in.Selector != nil { - in, out := &in.Selector, &out.Selector - *out = make(map[LabelKey]LabelValue, len(*in)) - for key, val := range *in { - (*out)[key] = val - } - } - in.EndpointPickerConfig.DeepCopyInto(&out.EndpointPickerConfig) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolSpec. 
-func (in *InferencePoolSpec) DeepCopy() *InferencePoolSpec { - if in == nil { - return nil - } - out := new(InferencePoolSpec) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *InferencePoolStatus) DeepCopyInto(out *InferencePoolStatus) { - *out = *in - if in.Conditions != nil { - in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferencePoolStatus. -func (in *InferencePoolStatus) DeepCopy() *InferencePoolStatus { - if in == nil { - return nil - } - out := new(InferencePoolStatus) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PoolObjectReference) DeepCopyInto(out *PoolObjectReference) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PoolObjectReference. -func (in *PoolObjectReference) DeepCopy() *PoolObjectReference { - if in == nil { - return nil - } - out := new(PoolObjectReference) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TargetModel) DeepCopyInto(out *TargetModel) { - *out = *in - if in.Weight != nil { - in, out := &in.Weight, &out.Weight - *out = new(int32) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TargetModel. 
-func (in *TargetModel) DeepCopy() *TargetModel { - if in == nil { - return nil - } - out := new(TargetModel) - in.DeepCopyInto(out) - return out -} diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index 716bfb11..0781f044 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -50,6 +50,8 @@ type InferencePoolSpec struct { // that should be included in the InferencePool. // In some cases, implementations may translate this field to a Service selector, so this matches the simple // map used for Service selectors instead of the full Kubernetes LabelSelector type. + // If sepecified, it will be applied to match the model server pods in the same namespace as the InferencePool. + // Cross namesoace selector is not supported. // // +kubebuilder:validation:Required Selector map[LabelKey]LabelValue `json:"selector"` diff --git a/client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go b/client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go deleted file mode 100644 index 91895ddc..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/endpointpickerconfig.go +++ /dev/null @@ -1,38 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// EndpointPickerConfigApplyConfiguration represents a declarative configuration of the EndpointPickerConfig type for use -// with apply. 
-type EndpointPickerConfigApplyConfiguration struct { - ExtensionRef *ExtensionApplyConfiguration `json:"extensionRef,omitempty"` -} - -// EndpointPickerConfigApplyConfiguration constructs a declarative configuration of the EndpointPickerConfig type for use with -// apply. -func EndpointPickerConfig() *EndpointPickerConfigApplyConfiguration { - return &EndpointPickerConfigApplyConfiguration{} -} - -// WithExtensionRef sets the ExtensionRef field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ExtensionRef field is set to the value of the last call. -func (b *EndpointPickerConfigApplyConfiguration) WithExtensionRef(value *ExtensionApplyConfiguration) *EndpointPickerConfigApplyConfiguration { - b.ExtensionRef = value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/extension.go b/client-go/applyconfiguration/api/v1alpha1/extension.go deleted file mode 100644 index 4213af88..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/extension.go +++ /dev/null @@ -1,75 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. 
- -package v1alpha1 - -import ( - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// ExtensionApplyConfiguration represents a declarative configuration of the Extension type for use -// with apply. -type ExtensionApplyConfiguration struct { - ExtensionReferenceApplyConfiguration `json:",inline"` - ExtensionConnectionApplyConfiguration `json:",inline"` -} - -// ExtensionApplyConfiguration constructs a declarative configuration of the Extension type for use with -// apply. -func Extension() *ExtensionApplyConfiguration { - return &ExtensionApplyConfiguration{} -} - -// WithGroup sets the Group field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyConfiguration { - b.ExtensionReferenceApplyConfiguration.Group = &value - return b -} - -// WithKind sets the Kind field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Kind field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConfiguration { - b.ExtensionReferenceApplyConfiguration.Kind = &value - return b -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. 
-func (b *ExtensionApplyConfiguration) WithName(value string) *ExtensionApplyConfiguration { - b.ExtensionReferenceApplyConfiguration.Name = &value - return b -} - -// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TargetPortNumber field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithTargetPortNumber(value int32) *ExtensionApplyConfiguration { - b.ExtensionReferenceApplyConfiguration.TargetPortNumber = &value - return b -} - -// WithFailureMode sets the FailureMode field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the FailureMode field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithFailureMode(value apiv1alpha1.ExtensionFailureMode) *ExtensionApplyConfiguration { - b.ExtensionConnectionApplyConfiguration.FailureMode = &value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go b/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go deleted file mode 100644 index ff8752a9..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/extensionconnection.go +++ /dev/null @@ -1,42 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// ExtensionConnectionApplyConfiguration represents a declarative configuration of the ExtensionConnection type for use -// with apply. -type ExtensionConnectionApplyConfiguration struct { - FailureMode *apiv1alpha1.ExtensionFailureMode `json:"failureMode,omitempty"` -} - -// ExtensionConnectionApplyConfiguration constructs a declarative configuration of the ExtensionConnection type for use with -// apply. -func ExtensionConnection() *ExtensionConnectionApplyConfiguration { - return &ExtensionConnectionApplyConfiguration{} -} - -// WithFailureMode sets the FailureMode field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the FailureMode field is set to the value of the last call. -func (b *ExtensionConnectionApplyConfiguration) WithFailureMode(value apiv1alpha1.ExtensionFailureMode) *ExtensionConnectionApplyConfiguration { - b.FailureMode = &value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/extensionreference.go b/client-go/applyconfiguration/api/v1alpha1/extensionreference.go deleted file mode 100644 index c72c0306..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/extensionreference.go +++ /dev/null @@ -1,65 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// ExtensionReferenceApplyConfiguration represents a declarative configuration of the ExtensionReference type for use -// with apply. -type ExtensionReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` - TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` -} - -// ExtensionReferenceApplyConfiguration constructs a declarative configuration of the ExtensionReference type for use with -// apply. -func ExtensionReference() *ExtensionReferenceApplyConfiguration { - return &ExtensionReferenceApplyConfiguration{} -} - -// WithGroup sets the Group field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *ExtensionReferenceApplyConfiguration { - b.Group = &value - return b -} - -// WithKind sets the Kind field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Kind field is set to the value of the last call. 
-func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *ExtensionReferenceApplyConfiguration { - b.Kind = &value - return b -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithName(value string) *ExtensionReferenceApplyConfiguration { - b.Name = &value - return b -} - -// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TargetPortNumber field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithTargetPortNumber(value int32) *ExtensionReferenceApplyConfiguration { - b.TargetPortNumber = &value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go deleted file mode 100644 index d2a5b2b4..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodel.go +++ /dev/null @@ -1,224 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. 
- -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - v1 "k8s.io/client-go/applyconfigurations/meta/v1" -) - -// InferenceModelApplyConfiguration represents a declarative configuration of the InferenceModel type for use -// with apply. -type InferenceModelApplyConfiguration struct { - v1.TypeMetaApplyConfiguration `json:",inline"` - *v1.ObjectMetaApplyConfiguration `json:"metadata,omitempty"` - Spec *InferenceModelSpecApplyConfiguration `json:"spec,omitempty"` - Status *InferenceModelStatusApplyConfiguration `json:"status,omitempty"` -} - -// InferenceModel constructs a declarative configuration of the InferenceModel type for use with -// apply. -func InferenceModel(name, namespace string) *InferenceModelApplyConfiguration { - b := &InferenceModelApplyConfiguration{} - b.WithName(name) - b.WithNamespace(namespace) - b.WithKind("InferenceModel") - b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha1") - return b -} - -// WithKind sets the Kind field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Kind field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithKind(value string) *InferenceModelApplyConfiguration { - b.TypeMetaApplyConfiguration.Kind = &value - return b -} - -// WithAPIVersion sets the APIVersion field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the APIVersion field is set to the value of the last call. 
-func (b *InferenceModelApplyConfiguration) WithAPIVersion(value string) *InferenceModelApplyConfiguration { - b.TypeMetaApplyConfiguration.APIVersion = &value - return b -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithName(value string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Name = &value - return b -} - -// WithGenerateName sets the GenerateName field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the GenerateName field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithGenerateName(value string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.GenerateName = &value - return b -} - -// WithNamespace sets the Namespace field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Namespace field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithNamespace(value string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Namespace = &value - return b -} - -// WithUID sets the UID field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the UID field is set to the value of the last call. 
-func (b *InferenceModelApplyConfiguration) WithUID(value types.UID) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.UID = &value - return b -} - -// WithResourceVersion sets the ResourceVersion field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ResourceVersion field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithResourceVersion(value string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.ResourceVersion = &value - return b -} - -// WithGeneration sets the Generation field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Generation field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithGeneration(value int64) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Generation = &value - return b -} - -// WithCreationTimestamp sets the CreationTimestamp field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the CreationTimestamp field is set to the value of the last call. 
-func (b *InferenceModelApplyConfiguration) WithCreationTimestamp(value metav1.Time) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.CreationTimestamp = &value - return b -} - -// WithDeletionTimestamp sets the DeletionTimestamp field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the DeletionTimestamp field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithDeletionTimestamp(value metav1.Time) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.DeletionTimestamp = &value - return b -} - -// WithDeletionGracePeriodSeconds sets the DeletionGracePeriodSeconds field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the DeletionGracePeriodSeconds field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithDeletionGracePeriodSeconds(value int64) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.DeletionGracePeriodSeconds = &value - return b -} - -// WithLabels puts the entries into the Labels field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, the entries provided by each call will be put on the Labels field, -// overwriting an existing map entries in Labels field with the same key. 
-func (b *InferenceModelApplyConfiguration) WithLabels(entries map[string]string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - if b.ObjectMetaApplyConfiguration.Labels == nil && len(entries) > 0 { - b.ObjectMetaApplyConfiguration.Labels = make(map[string]string, len(entries)) - } - for k, v := range entries { - b.ObjectMetaApplyConfiguration.Labels[k] = v - } - return b -} - -// WithAnnotations puts the entries into the Annotations field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, the entries provided by each call will be put on the Annotations field, -// overwriting an existing map entries in Annotations field with the same key. -func (b *InferenceModelApplyConfiguration) WithAnnotations(entries map[string]string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - if b.ObjectMetaApplyConfiguration.Annotations == nil && len(entries) > 0 { - b.ObjectMetaApplyConfiguration.Annotations = make(map[string]string, len(entries)) - } - for k, v := range entries { - b.ObjectMetaApplyConfiguration.Annotations[k] = v - } - return b -} - -// WithOwnerReferences adds the given value to the OwnerReferences field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the OwnerReferences field. 
-func (b *InferenceModelApplyConfiguration) WithOwnerReferences(values ...*v1.OwnerReferenceApplyConfiguration) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - for i := range values { - if values[i] == nil { - panic("nil value passed to WithOwnerReferences") - } - b.ObjectMetaApplyConfiguration.OwnerReferences = append(b.ObjectMetaApplyConfiguration.OwnerReferences, *values[i]) - } - return b -} - -// WithFinalizers adds the given value to the Finalizers field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Finalizers field. -func (b *InferenceModelApplyConfiguration) WithFinalizers(values ...string) *InferenceModelApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - for i := range values { - b.ObjectMetaApplyConfiguration.Finalizers = append(b.ObjectMetaApplyConfiguration.Finalizers, values[i]) - } - return b -} - -func (b *InferenceModelApplyConfiguration) ensureObjectMetaApplyConfigurationExists() { - if b.ObjectMetaApplyConfiguration == nil { - b.ObjectMetaApplyConfiguration = &v1.ObjectMetaApplyConfiguration{} - } -} - -// WithSpec sets the Spec field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Spec field is set to the value of the last call. -func (b *InferenceModelApplyConfiguration) WithSpec(value *InferenceModelSpecApplyConfiguration) *InferenceModelApplyConfiguration { - b.Spec = value - return b -} - -// WithStatus sets the Status field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Status field is set to the value of the last call. 
-func (b *InferenceModelApplyConfiguration) WithStatus(value *InferenceModelStatusApplyConfiguration) *InferenceModelApplyConfiguration { - b.Status = value - return b -} - -// GetName retrieves the value of the Name field in the declarative configuration. -func (b *InferenceModelApplyConfiguration) GetName() *string { - b.ensureObjectMetaApplyConfigurationExists() - return b.ObjectMetaApplyConfiguration.Name -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go deleted file mode 100644 index 2b1a4cbf..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelspec.go +++ /dev/null @@ -1,74 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// InferenceModelSpecApplyConfiguration represents a declarative configuration of the InferenceModelSpec type for use -// with apply. 
-type InferenceModelSpecApplyConfiguration struct { - ModelName *string `json:"modelName,omitempty"` - Criticality *apiv1alpha1.Criticality `json:"criticality,omitempty"` - TargetModels []TargetModelApplyConfiguration `json:"targetModels,omitempty"` - PoolRef *PoolObjectReferenceApplyConfiguration `json:"poolRef,omitempty"` -} - -// InferenceModelSpecApplyConfiguration constructs a declarative configuration of the InferenceModelSpec type for use with -// apply. -func InferenceModelSpec() *InferenceModelSpecApplyConfiguration { - return &InferenceModelSpecApplyConfiguration{} -} - -// WithModelName sets the ModelName field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ModelName field is set to the value of the last call. -func (b *InferenceModelSpecApplyConfiguration) WithModelName(value string) *InferenceModelSpecApplyConfiguration { - b.ModelName = &value - return b -} - -// WithCriticality sets the Criticality field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Criticality field is set to the value of the last call. -func (b *InferenceModelSpecApplyConfiguration) WithCriticality(value apiv1alpha1.Criticality) *InferenceModelSpecApplyConfiguration { - b.Criticality = &value - return b -} - -// WithTargetModels adds the given value to the TargetModels field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the TargetModels field. 
-func (b *InferenceModelSpecApplyConfiguration) WithTargetModels(values ...*TargetModelApplyConfiguration) *InferenceModelSpecApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithTargetModels") - } - b.TargetModels = append(b.TargetModels, *values[i]) - } - return b -} - -// WithPoolRef sets the PoolRef field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the PoolRef field is set to the value of the last call. -func (b *InferenceModelSpecApplyConfiguration) WithPoolRef(value *PoolObjectReferenceApplyConfiguration) *InferenceModelSpecApplyConfiguration { - b.PoolRef = value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go b/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go deleted file mode 100644 index b0b003bb..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencemodelstatus.go +++ /dev/null @@ -1,47 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - v1 "k8s.io/client-go/applyconfigurations/meta/v1" -) - -// InferenceModelStatusApplyConfiguration represents a declarative configuration of the InferenceModelStatus type for use -// with apply. 
-type InferenceModelStatusApplyConfiguration struct { - Conditions []v1.ConditionApplyConfiguration `json:"conditions,omitempty"` -} - -// InferenceModelStatusApplyConfiguration constructs a declarative configuration of the InferenceModelStatus type for use with -// apply. -func InferenceModelStatus() *InferenceModelStatusApplyConfiguration { - return &InferenceModelStatusApplyConfiguration{} -} - -// WithConditions adds the given value to the Conditions field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Conditions field. -func (b *InferenceModelStatusApplyConfiguration) WithConditions(values ...*v1.ConditionApplyConfiguration) *InferenceModelStatusApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithConditions") - } - b.Conditions = append(b.Conditions, *values[i]) - } - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go b/client-go/applyconfiguration/api/v1alpha1/inferencepool.go deleted file mode 100644 index 2940143e..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepool.go +++ /dev/null @@ -1,224 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. 
- -package v1alpha1 - -import ( - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - v1 "k8s.io/client-go/applyconfigurations/meta/v1" -) - -// InferencePoolApplyConfiguration represents a declarative configuration of the InferencePool type for use -// with apply. -type InferencePoolApplyConfiguration struct { - v1.TypeMetaApplyConfiguration `json:",inline"` - *v1.ObjectMetaApplyConfiguration `json:"metadata,omitempty"` - Spec *InferencePoolSpecApplyConfiguration `json:"spec,omitempty"` - Status *InferencePoolStatusApplyConfiguration `json:"status,omitempty"` -} - -// InferencePool constructs a declarative configuration of the InferencePool type for use with -// apply. -func InferencePool(name, namespace string) *InferencePoolApplyConfiguration { - b := &InferencePoolApplyConfiguration{} - b.WithName(name) - b.WithNamespace(namespace) - b.WithKind("InferencePool") - b.WithAPIVersion("inference.networking.x-k8s.io/v1alpha1") - return b -} - -// WithKind sets the Kind field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Kind field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithKind(value string) *InferencePoolApplyConfiguration { - b.TypeMetaApplyConfiguration.Kind = &value - return b -} - -// WithAPIVersion sets the APIVersion field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the APIVersion field is set to the value of the last call. 
-func (b *InferencePoolApplyConfiguration) WithAPIVersion(value string) *InferencePoolApplyConfiguration { - b.TypeMetaApplyConfiguration.APIVersion = &value - return b -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithName(value string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Name = &value - return b -} - -// WithGenerateName sets the GenerateName field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the GenerateName field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithGenerateName(value string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.GenerateName = &value - return b -} - -// WithNamespace sets the Namespace field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Namespace field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithNamespace(value string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Namespace = &value - return b -} - -// WithUID sets the UID field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the UID field is set to the value of the last call. 
-func (b *InferencePoolApplyConfiguration) WithUID(value types.UID) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.UID = &value - return b -} - -// WithResourceVersion sets the ResourceVersion field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ResourceVersion field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithResourceVersion(value string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.ResourceVersion = &value - return b -} - -// WithGeneration sets the Generation field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Generation field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithGeneration(value int64) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.Generation = &value - return b -} - -// WithCreationTimestamp sets the CreationTimestamp field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the CreationTimestamp field is set to the value of the last call. 
-func (b *InferencePoolApplyConfiguration) WithCreationTimestamp(value metav1.Time) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.CreationTimestamp = &value - return b -} - -// WithDeletionTimestamp sets the DeletionTimestamp field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the DeletionTimestamp field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithDeletionTimestamp(value metav1.Time) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.DeletionTimestamp = &value - return b -} - -// WithDeletionGracePeriodSeconds sets the DeletionGracePeriodSeconds field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the DeletionGracePeriodSeconds field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithDeletionGracePeriodSeconds(value int64) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - b.ObjectMetaApplyConfiguration.DeletionGracePeriodSeconds = &value - return b -} - -// WithLabels puts the entries into the Labels field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, the entries provided by each call will be put on the Labels field, -// overwriting an existing map entries in Labels field with the same key. 
-func (b *InferencePoolApplyConfiguration) WithLabels(entries map[string]string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - if b.ObjectMetaApplyConfiguration.Labels == nil && len(entries) > 0 { - b.ObjectMetaApplyConfiguration.Labels = make(map[string]string, len(entries)) - } - for k, v := range entries { - b.ObjectMetaApplyConfiguration.Labels[k] = v - } - return b -} - -// WithAnnotations puts the entries into the Annotations field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, the entries provided by each call will be put on the Annotations field, -// overwriting an existing map entries in Annotations field with the same key. -func (b *InferencePoolApplyConfiguration) WithAnnotations(entries map[string]string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - if b.ObjectMetaApplyConfiguration.Annotations == nil && len(entries) > 0 { - b.ObjectMetaApplyConfiguration.Annotations = make(map[string]string, len(entries)) - } - for k, v := range entries { - b.ObjectMetaApplyConfiguration.Annotations[k] = v - } - return b -} - -// WithOwnerReferences adds the given value to the OwnerReferences field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the OwnerReferences field. 
-func (b *InferencePoolApplyConfiguration) WithOwnerReferences(values ...*v1.OwnerReferenceApplyConfiguration) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - for i := range values { - if values[i] == nil { - panic("nil value passed to WithOwnerReferences") - } - b.ObjectMetaApplyConfiguration.OwnerReferences = append(b.ObjectMetaApplyConfiguration.OwnerReferences, *values[i]) - } - return b -} - -// WithFinalizers adds the given value to the Finalizers field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, values provided by each call will be appended to the Finalizers field. -func (b *InferencePoolApplyConfiguration) WithFinalizers(values ...string) *InferencePoolApplyConfiguration { - b.ensureObjectMetaApplyConfigurationExists() - for i := range values { - b.ObjectMetaApplyConfiguration.Finalizers = append(b.ObjectMetaApplyConfiguration.Finalizers, values[i]) - } - return b -} - -func (b *InferencePoolApplyConfiguration) ensureObjectMetaApplyConfigurationExists() { - if b.ObjectMetaApplyConfiguration == nil { - b.ObjectMetaApplyConfiguration = &v1.ObjectMetaApplyConfiguration{} - } -} - -// WithSpec sets the Spec field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Spec field is set to the value of the last call. -func (b *InferencePoolApplyConfiguration) WithSpec(value *InferencePoolSpecApplyConfiguration) *InferencePoolApplyConfiguration { - b.Spec = value - return b -} - -// WithStatus sets the Status field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Status field is set to the value of the last call. 
-func (b *InferencePoolApplyConfiguration) WithStatus(value *InferencePoolStatusApplyConfiguration) *InferencePoolApplyConfiguration { - b.Status = value - return b -} - -// GetName retrieves the value of the Name field in the declarative configuration. -func (b *InferencePoolApplyConfiguration) GetName() *string { - b.ensureObjectMetaApplyConfigurationExists() - return b.ObjectMetaApplyConfiguration.Name -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go deleted file mode 100644 index 5f69a154..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolspec.go +++ /dev/null @@ -1,66 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// InferencePoolSpecApplyConfiguration represents a declarative configuration of the InferencePoolSpec type for use -// with apply. -type InferencePoolSpecApplyConfiguration struct { - Selector map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue `json:"selector,omitempty"` - TargetPortNumber *int32 `json:"targetPortNumber,omitempty"` - EndpointPickerConfigApplyConfiguration `json:",inline"` -} - -// InferencePoolSpecApplyConfiguration constructs a declarative configuration of the InferencePoolSpec type for use with -// apply. 
-func InferencePoolSpec() *InferencePoolSpecApplyConfiguration { - return &InferencePoolSpecApplyConfiguration{} -} - -// WithSelector puts the entries into the Selector field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. -// If called multiple times, the entries provided by each call will be put on the Selector field, -// overwriting an existing map entries in Selector field with the same key. -func (b *InferencePoolSpecApplyConfiguration) WithSelector(entries map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue) *InferencePoolSpecApplyConfiguration { - if b.Selector == nil && len(entries) > 0 { - b.Selector = make(map[apiv1alpha1.LabelKey]apiv1alpha1.LabelValue, len(entries)) - } - for k, v := range entries { - b.Selector[k] = v - } - return b -} - -// WithTargetPortNumber sets the TargetPortNumber field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the TargetPortNumber field is set to the value of the last call. -func (b *InferencePoolSpecApplyConfiguration) WithTargetPortNumber(value int32) *InferencePoolSpecApplyConfiguration { - b.TargetPortNumber = &value - return b -} - -// WithExtensionRef sets the ExtensionRef field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the ExtensionRef field is set to the value of the last call. 
-func (b *InferencePoolSpecApplyConfiguration) WithExtensionRef(value *ExtensionApplyConfiguration) *InferencePoolSpecApplyConfiguration { - b.EndpointPickerConfigApplyConfiguration.ExtensionRef = value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go b/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go deleted file mode 100644 index f61a81b3..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/inferencepoolstatus.go +++ /dev/null @@ -1,47 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - v1 "k8s.io/client-go/applyconfigurations/meta/v1" -) - -// InferencePoolStatusApplyConfiguration represents a declarative configuration of the InferencePoolStatus type for use -// with apply. -type InferencePoolStatusApplyConfiguration struct { - Conditions []v1.ConditionApplyConfiguration `json:"conditions,omitempty"` -} - -// InferencePoolStatusApplyConfiguration constructs a declarative configuration of the InferencePoolStatus type for use with -// apply. -func InferencePoolStatus() *InferencePoolStatusApplyConfiguration { - return &InferencePoolStatusApplyConfiguration{} -} - -// WithConditions adds the given value to the Conditions field in the declarative configuration -// and returns the receiver, so that objects can be build by chaining "With" function invocations. 
-// If called multiple times, values provided by each call will be appended to the Conditions field. -func (b *InferencePoolStatusApplyConfiguration) WithConditions(values ...*v1.ConditionApplyConfiguration) *InferencePoolStatusApplyConfiguration { - for i := range values { - if values[i] == nil { - panic("nil value passed to WithConditions") - } - b.Conditions = append(b.Conditions, *values[i]) - } - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go deleted file mode 100644 index 692a185e..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/poolobjectreference.go +++ /dev/null @@ -1,56 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// PoolObjectReferenceApplyConfiguration represents a declarative configuration of the PoolObjectReference type for use -// with apply. -type PoolObjectReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` -} - -// PoolObjectReferenceApplyConfiguration constructs a declarative configuration of the PoolObjectReference type for use with -// apply. 
-func PoolObjectReference() *PoolObjectReferenceApplyConfiguration { - return &PoolObjectReferenceApplyConfiguration{} -} - -// WithGroup sets the Group field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Group field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObjectReferenceApplyConfiguration { - b.Group = &value - return b -} - -// WithKind sets the Kind field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Kind field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObjectReferenceApplyConfiguration { - b.Kind = &value - return b -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithName(value string) *PoolObjectReferenceApplyConfiguration { - b.Name = &value - return b -} diff --git a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go b/client-go/applyconfiguration/api/v1alpha1/targetmodel.go deleted file mode 100644 index f6ac83f8..00000000 --- a/client-go/applyconfiguration/api/v1alpha1/targetmodel.go +++ /dev/null @@ -1,47 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by applyconfiguration-gen. DO NOT EDIT. - -package v1alpha1 - -// TargetModelApplyConfiguration represents a declarative configuration of the TargetModel type for use -// with apply. -type TargetModelApplyConfiguration struct { - Name *string `json:"name,omitempty"` - Weight *int32 `json:"weight,omitempty"` -} - -// TargetModelApplyConfiguration constructs a declarative configuration of the TargetModel type for use with -// apply. -func TargetModel() *TargetModelApplyConfiguration { - return &TargetModelApplyConfiguration{} -} - -// WithName sets the Name field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Name field is set to the value of the last call. -func (b *TargetModelApplyConfiguration) WithName(value string) *TargetModelApplyConfiguration { - b.Name = &value - return b -} - -// WithWeight sets the Weight field in the declarative configuration to the given value -// and returns the receiver, so that objects can be built by chaining "With" function invocations. -// If called multiple times, the Weight field is set to the value of the last call. 
-func (b *TargetModelApplyConfiguration) WithWeight(value int32) *TargetModelApplyConfiguration { - b.Weight = &value - return b -} diff --git a/client-go/applyconfiguration/utils.go b/client-go/applyconfiguration/utils.go index eacc9c43..e1ad5ea4 100644 --- a/client-go/applyconfiguration/utils.go +++ b/client-go/applyconfiguration/utils.go @@ -21,9 +21,7 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" testing "k8s.io/client-go/testing" - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha2" internal "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/internal" ) @@ -32,33 +30,7 @@ import ( // apply configuration type exists for the given GroupVersionKind. 
func ForKind(kind schema.GroupVersionKind) interface{} { switch kind { - // Group=inference.networking.x-k8s.io, Version=v1alpha1 - case v1alpha1.SchemeGroupVersion.WithKind("EndpointPickerConfig"): - return &apiv1alpha1.EndpointPickerConfigApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("Extension"): - return &apiv1alpha1.ExtensionApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ExtensionConnection"): - return &apiv1alpha1.ExtensionConnectionApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("ExtensionReference"): - return &apiv1alpha1.ExtensionReferenceApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModel"): - return &apiv1alpha1.InferenceModelApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModelSpec"): - return &apiv1alpha1.InferenceModelSpecApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferenceModelStatus"): - return &apiv1alpha1.InferenceModelStatusApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePool"): - return &apiv1alpha1.InferencePoolApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolSpec"): - return &apiv1alpha1.InferencePoolSpecApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("InferencePoolStatus"): - return &apiv1alpha1.InferencePoolStatusApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("PoolObjectReference"): - return &apiv1alpha1.PoolObjectReferenceApplyConfiguration{} - case v1alpha1.SchemeGroupVersion.WithKind("TargetModel"): - return &apiv1alpha1.TargetModelApplyConfiguration{} - - // Group=inference.networking.x-k8s.io, Version=v1alpha2 + // Group=inference.networking.x-k8s.io, Version=v1alpha2 case v1alpha2.SchemeGroupVersion.WithKind("EndpointPickerConfig"): return &apiv1alpha2.EndpointPickerConfigApplyConfiguration{} case v1alpha2.SchemeGroupVersion.WithKind("Extension"): diff --git a/client-go/clientset/versioned/clientset.go 
b/client-go/clientset/versioned/clientset.go index 4266285a..c56d11c7 100644 --- a/client-go/clientset/versioned/clientset.go +++ b/client-go/clientset/versioned/clientset.go @@ -24,28 +24,20 @@ import ( discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" - inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" ) type Interface interface { Discovery() discovery.DiscoveryInterface - InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface } // Clientset contains the clients for groups. type Clientset struct { *discovery.DiscoveryClient - inferenceV1alpha1 *inferencev1alpha1.InferenceV1alpha1Client inferenceV1alpha2 *inferencev1alpha2.InferenceV1alpha2Client } -// InferenceV1alpha1 retrieves the InferenceV1alpha1Client -func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { - return c.inferenceV1alpha1 -} - // InferenceV1alpha2 retrieves the InferenceV1alpha2Client func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { return c.inferenceV1alpha2 @@ -95,10 +87,6 @@ func NewForConfigAndClient(c *rest.Config, httpClient *http.Client) (*Clientset, var cs Clientset var err error - cs.inferenceV1alpha1, err = inferencev1alpha1.NewForConfigAndClient(&configShallowCopy, httpClient) - if err != nil { - return nil, err - } cs.inferenceV1alpha2, err = inferencev1alpha2.NewForConfigAndClient(&configShallowCopy, httpClient) if err != nil { return nil, err @@ -124,7 +112,6 @@ func NewForConfigOrDie(c *rest.Config) *Clientset { // New creates a new Clientset for the given RESTClient. 
func New(c rest.Interface) *Clientset { var cs Clientset - cs.inferenceV1alpha1 = inferencev1alpha1.New(c) cs.inferenceV1alpha2 = inferencev1alpha2.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) diff --git a/client-go/clientset/versioned/fake/clientset_generated.go b/client-go/clientset/versioned/fake/clientset_generated.go index f4f33032..b0ecd50b 100644 --- a/client-go/clientset/versioned/fake/clientset_generated.go +++ b/client-go/clientset/versioned/fake/clientset_generated.go @@ -25,8 +25,6 @@ import ( "k8s.io/client-go/testing" applyconfiguration "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration" clientset "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" - fakeinferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1/fake" inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2" fakeinferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha2/fake" ) @@ -117,11 +115,6 @@ var ( _ testing.FakeClient = &Clientset{} ) -// InferenceV1alpha1 retrieves the InferenceV1alpha1Client -func (c *Clientset) InferenceV1alpha1() inferencev1alpha1.InferenceV1alpha1Interface { - return &fakeinferencev1alpha1.FakeInferenceV1alpha1{Fake: &c.Fake} -} - // InferenceV1alpha2 retrieves the InferenceV1alpha2Client func (c *Clientset) InferenceV1alpha2() inferencev1alpha2.InferenceV1alpha2Interface { return &fakeinferencev1alpha2.FakeInferenceV1alpha2{Fake: &c.Fake} diff --git a/client-go/clientset/versioned/fake/register.go b/client-go/clientset/versioned/fake/register.go index bc8e6903..365ccb75 100644 --- a/client-go/clientset/versioned/fake/register.go +++ b/client-go/clientset/versioned/fake/register.go @@ -23,7 +23,6 @@ import ( schema 
"k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" - inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) @@ -31,7 +30,6 @@ var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - inferencev1alpha1.AddToScheme, inferencev1alpha2.AddToScheme, } diff --git a/client-go/clientset/versioned/scheme/register.go b/client-go/clientset/versioned/scheme/register.go index 5727d404..b656f121 100644 --- a/client-go/clientset/versioned/scheme/register.go +++ b/client-go/clientset/versioned/scheme/register.go @@ -23,7 +23,6 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" - inferencev1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) @@ -31,7 +30,6 @@ var Scheme = runtime.NewScheme() var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ - inferencev1alpha1.AddToScheme, inferencev1alpha2.AddToScheme, } diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go deleted file mode 100644 index 8cc8a643..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/api_client.go +++ /dev/null @@ -1,111 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - http "net/http" - - rest "k8s.io/client-go/rest" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" -) - -type InferenceV1alpha1Interface interface { - RESTClient() rest.Interface - InferenceModelsGetter - InferencePoolsGetter -} - -// InferenceV1alpha1Client is used to interact with features provided by the inference.networking.x-k8s.io group. -type InferenceV1alpha1Client struct { - restClient rest.Interface -} - -func (c *InferenceV1alpha1Client) InferenceModels(namespace string) InferenceModelInterface { - return newInferenceModels(c, namespace) -} - -func (c *InferenceV1alpha1Client) InferencePools(namespace string) InferencePoolInterface { - return newInferencePools(c, namespace) -} - -// NewForConfig creates a new InferenceV1alpha1Client for the given config. -// NewForConfig is equivalent to NewForConfigAndClient(c, httpClient), -// where httpClient was generated with rest.HTTPClientFor(c). -func NewForConfig(c *rest.Config) (*InferenceV1alpha1Client, error) { - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - httpClient, err := rest.HTTPClientFor(&config) - if err != nil { - return nil, err - } - return NewForConfigAndClient(&config, httpClient) -} - -// NewForConfigAndClient creates a new InferenceV1alpha1Client for the given config and http client. 
-// Note the http client provided takes precedence over the configured transport values. -func NewForConfigAndClient(c *rest.Config, h *http.Client) (*InferenceV1alpha1Client, error) { - config := *c - if err := setConfigDefaults(&config); err != nil { - return nil, err - } - client, err := rest.RESTClientForConfigAndClient(&config, h) - if err != nil { - return nil, err - } - return &InferenceV1alpha1Client{client}, nil -} - -// NewForConfigOrDie creates a new InferenceV1alpha1Client for the given config and -// panics if there is an error in the config. -func NewForConfigOrDie(c *rest.Config) *InferenceV1alpha1Client { - client, err := NewForConfig(c) - if err != nil { - panic(err) - } - return client -} - -// New creates a new InferenceV1alpha1Client for the given RESTClient. -func New(c rest.Interface) *InferenceV1alpha1Client { - return &InferenceV1alpha1Client{c} -} - -func setConfigDefaults(config *rest.Config) error { - gv := apiv1alpha1.SchemeGroupVersion - config.GroupVersion = &gv - config.APIPath = "/apis" - config.NegotiatedSerializer = rest.CodecFactoryForGeneratedClient(scheme.Scheme, scheme.Codecs).WithoutConversion() - - if config.UserAgent == "" { - config.UserAgent = rest.DefaultKubernetesUserAgent() - } - - return nil -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *InferenceV1alpha1Client) RESTClient() rest.Interface { - if c == nil { - return nil - } - return c.restClient -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go b/client-go/clientset/versioned/typed/api/v1alpha1/doc.go deleted file mode 100644 index 28991e22..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/doc.go +++ /dev/null @@ -1,19 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -// This package has the automatically generated typed clients. -package v1alpha1 diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go deleted file mode 100644 index fbfccbb9..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/doc.go +++ /dev/null @@ -1,19 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -// Package fake has the automatically generated clients. -package fake diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go deleted file mode 100644 index 1dee0f20..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_api_client.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package fake - -import ( - rest "k8s.io/client-go/rest" - testing "k8s.io/client-go/testing" - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" -) - -type FakeInferenceV1alpha1 struct { - *testing.Fake -} - -func (c *FakeInferenceV1alpha1) InferenceModels(namespace string) v1alpha1.InferenceModelInterface { - return newFakeInferenceModels(c, namespace) -} - -func (c *FakeInferenceV1alpha1) InferencePools(namespace string) v1alpha1.InferencePoolInterface { - return newFakeInferencePools(c, namespace) -} - -// RESTClient returns a RESTClient that is used to communicate -// with API server by this client implementation. -func (c *FakeInferenceV1alpha1) RESTClient() rest.Interface { - var ret *rest.RESTClient - return ret -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go deleted file mode 100644 index 44007ae7..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencemodel.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package fake - -import ( - gentype "k8s.io/client-go/gentype" - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" -) - -// fakeInferenceModels implements InferenceModelInterface -type fakeInferenceModels struct { - *gentype.FakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration] - Fake *FakeInferenceV1alpha1 -} - -func newFakeInferenceModels(fake *FakeInferenceV1alpha1, namespace string) typedapiv1alpha1.InferenceModelInterface { - return &fakeInferenceModels{ - gentype.NewFakeClientWithListAndApply[*v1alpha1.InferenceModel, *v1alpha1.InferenceModelList, *apiv1alpha1.InferenceModelApplyConfiguration]( - fake.Fake, - namespace, - v1alpha1.SchemeGroupVersion.WithResource("inferencemodels"), - v1alpha1.SchemeGroupVersion.WithKind("InferenceModel"), - func() *v1alpha1.InferenceModel { return &v1alpha1.InferenceModel{} }, - func() *v1alpha1.InferenceModelList { return &v1alpha1.InferenceModelList{} }, - func(dst, src *v1alpha1.InferenceModelList) { dst.ListMeta = src.ListMeta }, - func(list *v1alpha1.InferenceModelList) []*v1alpha1.InferenceModel { - return gentype.ToPointerSlice(list.Items) - }, - func(list *v1alpha1.InferenceModelList, items []*v1alpha1.InferenceModel) { - list.Items = gentype.FromPointerSlice(items) - 
}, - ), - fake, - } -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go deleted file mode 100644 index cd0764aa..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/fake/fake_inferencepool.go +++ /dev/null @@ -1,52 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package fake - -import ( - gentype "k8s.io/client-go/gentype" - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - typedapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/typed/api/v1alpha1" -) - -// fakeInferencePools implements InferencePoolInterface -type fakeInferencePools struct { - *gentype.FakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration] - Fake *FakeInferenceV1alpha1 -} - -func newFakeInferencePools(fake *FakeInferenceV1alpha1, namespace string) typedapiv1alpha1.InferencePoolInterface { - return &fakeInferencePools{ - gentype.NewFakeClientWithListAndApply[*v1alpha1.InferencePool, *v1alpha1.InferencePoolList, *apiv1alpha1.InferencePoolApplyConfiguration]( - fake.Fake, - namespace, - v1alpha1.SchemeGroupVersion.WithResource("inferencepools"), - 
v1alpha1.SchemeGroupVersion.WithKind("InferencePool"), - func() *v1alpha1.InferencePool { return &v1alpha1.InferencePool{} }, - func() *v1alpha1.InferencePoolList { return &v1alpha1.InferencePoolList{} }, - func(dst, src *v1alpha1.InferencePoolList) { dst.ListMeta = src.ListMeta }, - func(list *v1alpha1.InferencePoolList) []*v1alpha1.InferencePool { - return gentype.ToPointerSlice(list.Items) - }, - func(list *v1alpha1.InferencePoolList, items []*v1alpha1.InferencePool) { - list.Items = gentype.FromPointerSlice(items) - }, - ), - fake, - } -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go b/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go deleted file mode 100644 index 65c88eb1..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/generated_expansion.go +++ /dev/null @@ -1,22 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -type InferenceModelExpansion interface{} - -type InferencePoolExpansion interface{} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go deleted file mode 100644 index 4c7c5941..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencemodel.go +++ /dev/null @@ -1,73 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - context "context" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - gentype "k8s.io/client-go/gentype" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" -) - -// InferenceModelsGetter has a method to return a InferenceModelInterface. -// A group's client should implement this interface. -type InferenceModelsGetter interface { - InferenceModels(namespace string) InferenceModelInterface -} - -// InferenceModelInterface has methods to work with InferenceModel resources. -type InferenceModelInterface interface { - Create(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.CreateOptions) (*apiv1alpha1.InferenceModel, error) - Update(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha1.InferenceModel, error) - // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 
- UpdateStatus(ctx context.Context, inferenceModel *apiv1alpha1.InferenceModel, opts v1.UpdateOptions) (*apiv1alpha1.InferenceModel, error) - Delete(ctx context.Context, name string, opts v1.DeleteOptions) error - DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error - Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha1.InferenceModel, error) - List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha1.InferenceModelList, error) - Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha1.InferenceModel, err error) - Apply(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferenceModel, err error) - // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). 
- ApplyStatus(ctx context.Context, inferenceModel *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferenceModel, err error) - InferenceModelExpansion -} - -// inferenceModels implements InferenceModelInterface -type inferenceModels struct { - *gentype.ClientWithListAndApply[*apiv1alpha1.InferenceModel, *apiv1alpha1.InferenceModelList, *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration] -} - -// newInferenceModels returns a InferenceModels -func newInferenceModels(c *InferenceV1alpha1Client, namespace string) *inferenceModels { - return &inferenceModels{ - gentype.NewClientWithListAndApply[*apiv1alpha1.InferenceModel, *apiv1alpha1.InferenceModelList, *applyconfigurationapiv1alpha1.InferenceModelApplyConfiguration]( - "inferencemodels", - c.RESTClient(), - scheme.ParameterCodec, - namespace, - func() *apiv1alpha1.InferenceModel { return &apiv1alpha1.InferenceModel{} }, - func() *apiv1alpha1.InferenceModelList { return &apiv1alpha1.InferenceModelList{} }, - ), - } -} diff --git a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go b/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go deleted file mode 100644 index 9af91801..00000000 --- a/client-go/clientset/versioned/typed/api/v1alpha1/inferencepool.go +++ /dev/null @@ -1,73 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by client-gen. DO NOT EDIT. 
- -package v1alpha1 - -import ( - context "context" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - types "k8s.io/apimachinery/pkg/types" - watch "k8s.io/apimachinery/pkg/watch" - gentype "k8s.io/client-go/gentype" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - applyconfigurationapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/applyconfiguration/api/v1alpha1" - scheme "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned/scheme" -) - -// InferencePoolsGetter has a method to return a InferencePoolInterface. -// A group's client should implement this interface. -type InferencePoolsGetter interface { - InferencePools(namespace string) InferencePoolInterface -} - -// InferencePoolInterface has methods to work with InferencePool resources. -type InferencePoolInterface interface { - Create(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.CreateOptions) (*apiv1alpha1.InferencePool, error) - Update(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.UpdateOptions) (*apiv1alpha1.InferencePool, error) - // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). 
- UpdateStatus(ctx context.Context, inferencePool *apiv1alpha1.InferencePool, opts v1.UpdateOptions) (*apiv1alpha1.InferencePool, error) - Delete(ctx context.Context, name string, opts v1.DeleteOptions) error - DeleteCollection(ctx context.Context, opts v1.DeleteOptions, listOpts v1.ListOptions) error - Get(ctx context.Context, name string, opts v1.GetOptions) (*apiv1alpha1.InferencePool, error) - List(ctx context.Context, opts v1.ListOptions) (*apiv1alpha1.InferencePoolList, error) - Watch(ctx context.Context, opts v1.ListOptions) (watch.Interface, error) - Patch(ctx context.Context, name string, pt types.PatchType, data []byte, opts v1.PatchOptions, subresources ...string) (result *apiv1alpha1.InferencePool, err error) - Apply(ctx context.Context, inferencePool *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferencePool, err error) - // Add a +genclient:noStatus comment above the type to avoid generating ApplyStatus(). - ApplyStatus(ctx context.Context, inferencePool *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration, opts v1.ApplyOptions) (result *apiv1alpha1.InferencePool, err error) - InferencePoolExpansion -} - -// inferencePools implements InferencePoolInterface -type inferencePools struct { - *gentype.ClientWithListAndApply[*apiv1alpha1.InferencePool, *apiv1alpha1.InferencePoolList, *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration] -} - -// newInferencePools returns a InferencePools -func newInferencePools(c *InferenceV1alpha1Client, namespace string) *inferencePools { - return &inferencePools{ - gentype.NewClientWithListAndApply[*apiv1alpha1.InferencePool, *apiv1alpha1.InferencePoolList, *applyconfigurationapiv1alpha1.InferencePoolApplyConfiguration]( - "inferencepools", - c.RESTClient(), - scheme.ParameterCodec, - namespace, - func() *apiv1alpha1.InferencePool { return &apiv1alpha1.InferencePool{} }, - func() *apiv1alpha1.InferencePoolList { return 
&apiv1alpha1.InferencePoolList{} }, - ), - } -} diff --git a/client-go/informers/externalversions/api/interface.go b/client-go/informers/externalversions/api/interface.go index 210b89f8..10eef397 100644 --- a/client-go/informers/externalversions/api/interface.go +++ b/client-go/informers/externalversions/api/interface.go @@ -18,15 +18,12 @@ limitations under the License. package api import ( - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/api/v1alpha2" internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" ) // Interface provides access to each of this group's versions. type Interface interface { - // V1alpha1 provides access to shared informers for resources in V1alpha1. - V1alpha1() v1alpha1.Interface // V1alpha2 provides access to shared informers for resources in V1alpha2. V1alpha2() v1alpha2.Interface } @@ -42,11 +39,6 @@ func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakList return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } -// V1alpha1 returns a new v1alpha1.Interface. -func (g *group) V1alpha1() v1alpha1.Interface { - return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) -} - // V1alpha2 returns a new v1alpha2.Interface. func (g *group) V1alpha2() v1alpha2.Interface { return v1alpha2.New(g.factory, g.namespace, g.tweakListOptions) diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go b/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go deleted file mode 100644 index a1522e48..00000000 --- a/client-go/informers/externalversions/api/v1alpha1/inferencemodel.go +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by informer-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - context "context" - time "time" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - watch "k8s.io/apimachinery/pkg/watch" - cache "k8s.io/client-go/tools/cache" - gatewayapiinferenceextensionapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" -) - -// InferenceModelInformer provides access to a shared informer and lister for -// InferenceModels. -type InferenceModelInformer interface { - Informer() cache.SharedIndexInformer - Lister() apiv1alpha1.InferenceModelLister -} - -type inferenceModelInformer struct { - factory internalinterfaces.SharedInformerFactory - tweakListOptions internalinterfaces.TweakListOptionsFunc - namespace string -} - -// NewInferenceModelInformer constructs a new informer for InferenceModel type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. 
-func NewInferenceModelInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return NewFilteredInferenceModelInformer(client, namespace, resyncPeriod, indexers, nil) -} - -// NewFilteredInferenceModelInformer constructs a new informer for InferenceModel type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewFilteredInferenceModelInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.InferenceV1alpha1().InferenceModels(namespace).List(context.TODO(), options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.InferenceV1alpha1().InferenceModels(namespace).Watch(context.TODO(), options) - }, - }, - &gatewayapiinferenceextensionapiv1alpha1.InferenceModel{}, - resyncPeriod, - indexers, - ) -} - -func (f *inferenceModelInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewFilteredInferenceModelInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) -} - -func (f *inferenceModelInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha1.InferenceModel{}, f.defaultInformer) -} - -func (f *inferenceModelInformer) Lister() apiv1alpha1.InferenceModelLister { - return 
apiv1alpha1.NewInferenceModelLister(f.Informer().GetIndexer()) -} diff --git a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go b/client-go/informers/externalversions/api/v1alpha1/inferencepool.go deleted file mode 100644 index 27f2d29e..00000000 --- a/client-go/informers/externalversions/api/v1alpha1/inferencepool.go +++ /dev/null @@ -1,89 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by informer-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - context "context" - time "time" - - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - runtime "k8s.io/apimachinery/pkg/runtime" - watch "k8s.io/apimachinery/pkg/watch" - cache "k8s.io/client-go/tools/cache" - gatewayapiinferenceextensionapiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" - versioned "sigs.k8s.io/gateway-api-inference-extension/client-go/clientset/versioned" - internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/client-go/listers/api/v1alpha1" -) - -// InferencePoolInformer provides access to a shared informer and lister for -// InferencePools. 
-type InferencePoolInformer interface { - Informer() cache.SharedIndexInformer - Lister() apiv1alpha1.InferencePoolLister -} - -type inferencePoolInformer struct { - factory internalinterfaces.SharedInformerFactory - tweakListOptions internalinterfaces.TweakListOptionsFunc - namespace string -} - -// NewInferencePoolInformer constructs a new informer for InferencePool type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. -func NewInferencePoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { - return NewFilteredInferencePoolInformer(client, namespace, resyncPeriod, indexers, nil) -} - -// NewFilteredInferencePoolInformer constructs a new informer for InferencePool type. -// Always prefer using an informer factory to get a shared informer instead of getting an independent -// one. This reduces memory footprint and number of connections to the server. 
-func NewFilteredInferencePoolInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { - return cache.NewSharedIndexInformer( - &cache.ListWatch{ - ListFunc: func(options v1.ListOptions) (runtime.Object, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.InferenceV1alpha1().InferencePools(namespace).List(context.TODO(), options) - }, - WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { - if tweakListOptions != nil { - tweakListOptions(&options) - } - return client.InferenceV1alpha1().InferencePools(namespace).Watch(context.TODO(), options) - }, - }, - &gatewayapiinferenceextensionapiv1alpha1.InferencePool{}, - resyncPeriod, - indexers, - ) -} - -func (f *inferencePoolInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { - return NewFilteredInferencePoolInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) -} - -func (f *inferencePoolInformer) Informer() cache.SharedIndexInformer { - return f.factory.InformerFor(&gatewayapiinferenceextensionapiv1alpha1.InferencePool{}, f.defaultInformer) -} - -func (f *inferencePoolInformer) Lister() apiv1alpha1.InferencePoolLister { - return apiv1alpha1.NewInferencePoolLister(f.Informer().GetIndexer()) -} diff --git a/client-go/informers/externalversions/api/v1alpha1/interface.go b/client-go/informers/externalversions/api/v1alpha1/interface.go deleted file mode 100644 index 3ea6d988..00000000 --- a/client-go/informers/externalversions/api/v1alpha1/interface.go +++ /dev/null @@ -1,51 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by informer-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - internalinterfaces "sigs.k8s.io/gateway-api-inference-extension/client-go/informers/externalversions/internalinterfaces" -) - -// Interface provides access to all the informers in this group version. -type Interface interface { - // InferenceModels returns a InferenceModelInformer. - InferenceModels() InferenceModelInformer - // InferencePools returns a InferencePoolInformer. - InferencePools() InferencePoolInformer -} - -type version struct { - factory internalinterfaces.SharedInformerFactory - namespace string - tweakListOptions internalinterfaces.TweakListOptionsFunc -} - -// New returns a new Interface. -func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { - return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} -} - -// InferenceModels returns a InferenceModelInformer. -func (v *version) InferenceModels() InferenceModelInformer { - return &inferenceModelInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} -} - -// InferencePools returns a InferencePoolInformer. 
-func (v *version) InferencePools() InferencePoolInformer { - return &inferencePoolInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} -} diff --git a/client-go/informers/externalversions/generic.go b/client-go/informers/externalversions/generic.go index 9f363d88..4186b2f6 100644 --- a/client-go/informers/externalversions/generic.go +++ b/client-go/informers/externalversions/generic.go @@ -22,7 +22,6 @@ import ( schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" - v1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" v1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) @@ -52,13 +51,7 @@ func (f *genericInformer) Lister() cache.GenericLister { // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { - // Group=inference.networking.x-k8s.io, Version=v1alpha1 - case v1alpha1.SchemeGroupVersion.WithResource("inferencemodels"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha1().InferenceModels().Informer()}, nil - case v1alpha1.SchemeGroupVersion.WithResource("inferencepools"): - return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha1().InferencePools().Informer()}, nil - - // Group=inference.networking.x-k8s.io, Version=v1alpha2 + // Group=inference.networking.x-k8s.io, Version=v1alpha2 case v1alpha2.SchemeGroupVersion.WithResource("inferencemodels"): return &genericInformer{resource: resource.GroupResource(), informer: f.Inference().V1alpha2().InferenceModels().Informer()}, nil case v1alpha2.SchemeGroupVersion.WithResource("inferencepools"): diff --git a/client-go/listers/api/v1alpha1/expansion_generated.go b/client-go/listers/api/v1alpha1/expansion_generated.go deleted file mode 100644 index ffbe67cf..00000000 --- 
a/client-go/listers/api/v1alpha1/expansion_generated.go +++ /dev/null @@ -1,34 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by lister-gen. DO NOT EDIT. - -package v1alpha1 - -// InferenceModelListerExpansion allows custom methods to be added to -// InferenceModelLister. -type InferenceModelListerExpansion interface{} - -// InferenceModelNamespaceListerExpansion allows custom methods to be added to -// InferenceModelNamespaceLister. -type InferenceModelNamespaceListerExpansion interface{} - -// InferencePoolListerExpansion allows custom methods to be added to -// InferencePoolLister. -type InferencePoolListerExpansion interface{} - -// InferencePoolNamespaceListerExpansion allows custom methods to be added to -// InferencePoolNamespaceLister. -type InferencePoolNamespaceListerExpansion interface{} diff --git a/client-go/listers/api/v1alpha1/inferencemodel.go b/client-go/listers/api/v1alpha1/inferencemodel.go deleted file mode 100644 index b4342842..00000000 --- a/client-go/listers/api/v1alpha1/inferencemodel.go +++ /dev/null @@ -1,69 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by lister-gen. DO NOT EDIT. - -package v1alpha1 - -import ( - labels "k8s.io/apimachinery/pkg/labels" - listers "k8s.io/client-go/listers" - cache "k8s.io/client-go/tools/cache" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// InferenceModelLister helps list InferenceModels. -// All objects returned here must be treated as read-only. -type InferenceModelLister interface { - // List lists all InferenceModels in the indexer. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferenceModel, err error) - // InferenceModels returns an object that can list and get InferenceModels. - InferenceModels(namespace string) InferenceModelNamespaceLister - InferenceModelListerExpansion -} - -// inferenceModelLister implements the InferenceModelLister interface. -type inferenceModelLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferenceModel] -} - -// NewInferenceModelLister returns a new InferenceModelLister. -func NewInferenceModelLister(indexer cache.Indexer) InferenceModelLister { - return &inferenceModelLister{listers.New[*apiv1alpha1.InferenceModel](indexer, apiv1alpha1.Resource("inferencemodel"))} -} - -// InferenceModels returns an object that can list and get InferenceModels. 
-func (s *inferenceModelLister) InferenceModels(namespace string) InferenceModelNamespaceLister { - return inferenceModelNamespaceLister{listers.NewNamespaced[*apiv1alpha1.InferenceModel](s.ResourceIndexer, namespace)} -} - -// InferenceModelNamespaceLister helps list and get InferenceModels. -// All objects returned here must be treated as read-only. -type InferenceModelNamespaceLister interface { - // List lists all InferenceModels in the indexer for a given namespace. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferenceModel, err error) - // Get retrieves the InferenceModel from the indexer for a given namespace and name. - // Objects returned here must be treated as read-only. - Get(name string) (*apiv1alpha1.InferenceModel, error) - InferenceModelNamespaceListerExpansion -} - -// inferenceModelNamespaceLister implements the InferenceModelNamespaceLister -// interface. -type inferenceModelNamespaceLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferenceModel] -} diff --git a/client-go/listers/api/v1alpha1/inferencepool.go b/client-go/listers/api/v1alpha1/inferencepool.go deleted file mode 100644 index 387daf39..00000000 --- a/client-go/listers/api/v1alpha1/inferencepool.go +++ /dev/null @@ -1,69 +0,0 @@ -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -// Code generated by lister-gen. DO NOT EDIT. 
- -package v1alpha1 - -import ( - labels "k8s.io/apimachinery/pkg/labels" - listers "k8s.io/client-go/listers" - cache "k8s.io/client-go/tools/cache" - apiv1alpha1 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" -) - -// InferencePoolLister helps list InferencePools. -// All objects returned here must be treated as read-only. -type InferencePoolLister interface { - // List lists all InferencePools in the indexer. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferencePool, err error) - // InferencePools returns an object that can list and get InferencePools. - InferencePools(namespace string) InferencePoolNamespaceLister - InferencePoolListerExpansion -} - -// inferencePoolLister implements the InferencePoolLister interface. -type inferencePoolLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferencePool] -} - -// NewInferencePoolLister returns a new InferencePoolLister. -func NewInferencePoolLister(indexer cache.Indexer) InferencePoolLister { - return &inferencePoolLister{listers.New[*apiv1alpha1.InferencePool](indexer, apiv1alpha1.Resource("inferencepool"))} -} - -// InferencePools returns an object that can list and get InferencePools. -func (s *inferencePoolLister) InferencePools(namespace string) InferencePoolNamespaceLister { - return inferencePoolNamespaceLister{listers.NewNamespaced[*apiv1alpha1.InferencePool](s.ResourceIndexer, namespace)} -} - -// InferencePoolNamespaceLister helps list and get InferencePools. -// All objects returned here must be treated as read-only. -type InferencePoolNamespaceLister interface { - // List lists all InferencePools in the indexer for a given namespace. - // Objects returned here must be treated as read-only. - List(selector labels.Selector) (ret []*apiv1alpha1.InferencePool, err error) - // Get retrieves the InferencePool from the indexer for a given namespace and name. - // Objects returned here must be treated as read-only. 
- Get(name string) (*apiv1alpha1.InferencePool, error) - InferencePoolNamespaceListerExpansion -} - -// inferencePoolNamespaceLister implements the InferencePoolNamespaceLister -// interface. -type inferencePoolNamespaceLister struct { - listers.ResourceIndexer[*apiv1alpha1.InferencePool] -} diff --git a/cmd/epp/main.go b/cmd/epp/main.go index b66024ec..ab270c49 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -39,7 +39,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha1" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" @@ -104,9 +103,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha1.AddToScheme(scheme)) utilruntime.Must(v1alpha2.AddToScheme(scheme)) - } func main() { diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index 09258c20..2995e863 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -14,230 +14,6 @@ spec: singular: inferencemodel scope: Namespaced versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: InferenceModel is the Schema for the InferenceModels API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - InferenceModelSpec represents the desired state of a specific model use case. This resource is - managed by the "Inference Workload Owner" persona. - - The Inference Workload Owner persona is someone that trains, verifies, and - leverages a large language model from a model frontend, drives the lifecycle - and rollout of new versions of those models, and defines the specific - performance and latency goals for the model. These workloads are - expected to operate within an InferencePool sharing compute capacity with other - InferenceModels, defined by the Inference Platform Admin. - - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, - if the name is reused, an error will be shown on the status of a - InferenceModel that attempted to reuse. The oldest InferenceModel, based on - creation timestamp, will be selected to remain valid. In the event of a race - condition, one will be selected at random. - properties: - criticality: - description: |- - Criticality defines how important it is to serve the model compared to other models referencing the same pool. - Criticality impacts how traffic is handled in resource constrained situations. It handles this by - queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will - fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness, - and the proportionality of fairness will be configurable. 
- - Default values for this field will not be set, to allow for future additions of new field that may 'one of' with this field. - Any implementations that may consume this field may treat an unset value as the 'Standard' range. - enum: - - Critical - - Standard - - Sheddable - type: string - modelName: - description: |- - ModelName is the name of the model as it will be set in the "model" parameter for an incoming request. - ModelNames must be unique for a referencing InferencePool - (names can be reused for a different pool in the same cluster). - The modelName with the oldest creation timestamp is retained, and the incoming - InferenceModel is sets the Ready status to false with a corresponding reason. - In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected. - Names can be reserved without an underlying model configured in the pool. - This can be done by specifying a target model and setting the weight to zero, - an error will be returned specifying that no valid target model is found. - maxLength: 256 - type: string - poolRef: - description: PoolRef is a reference to the inference pool, the pool - must exist in the same namespace. - properties: - group: - default: inference.networking.x-k8s.io - description: Group is the group of the referent. - maxLength: 253 - pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ - type: string - kind: - default: InferencePool - description: Kind is kind of the referent. For example "InferencePool". - maxLength: 63 - minLength: 1 - pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ - type: string - name: - description: Name is the name of the referent. - maxLength: 253 - minLength: 1 - type: string - required: - - name - type: object - targetModels: - description: |- - TargetModels allow multiple versions of a model for traffic splitting. - If not specified, the target model name is defaulted to the modelName parameter. 
- modelName is often in reference to a LoRA adapter. - items: - description: |- - TargetModel represents a deployed model or a LoRA adapter. The - Name field is expected to match the name of the LoRA adapter - (or base model) as it is registered within the model server. Inference - Gateway assumes that the model exists on the model server and it's the - responsibility of the user to validate a correct match. Should a model fail - to exist at request time, the error is processed by the Inference Gateway - and emitted on the appropriate InferenceModel object. - properties: - name: - description: Name is the name of the adapter or base model, - as expected by the ModelServer. - maxLength: 253 - type: string - weight: - description: |- - Weight is used to determine the proportion of traffic that should be - sent to this model when multiple target models are specified. - - Weight defines the proportion of requests forwarded to the specified - model. This is computed as weight/(sum of all weights in this - TargetModels list). For non-zero values, there may be some epsilon from - the exact proportion defined here depending on the precision an - implementation supports. Weight is not a percentage and the sum of - weights does not need to equal 100. - - If a weight is set for any targetModel, it must be set for all targetModels. - Conversely weights are optional, so long as ALL targetModels do not specify a weight. - format: int32 - maximum: 1000000 - minimum: 0 - type: integer - required: - - name - type: object - maxItems: 10 - type: array - x-kubernetes-validations: - - message: Weights should be set for all models, or none of the models. 
- rule: self.all(model, has(model.weight)) || self.all(model, !has(model.weight)) - required: - - modelName - - poolRef - type: object - status: - description: InferenceModelStatus defines the observed state of InferenceModel - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferenceModel. - - Known condition types are: - - * "Accepted" - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. 
- maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object - type: object - served: true - storage: false - subresources: - status: {} - name: v1alpha2 schema: openAPIV3Schema: diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 918e95cb..8a7ad938 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -14,196 +14,6 @@ spec: singular: inferencepool scope: Namespaced versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: InferencePool is the Schema for the InferencePools API. - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. 
- More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: InferencePoolSpec defines the desired state of InferencePool - properties: - extensionRef: - description: Extension configures an endpoint picker as an extension - service. - properties: - failureMode: - default: FailClose - description: |- - Configures how the gateway handles the case when the extension is not responsive. - Defaults to failClose. - enum: - - FailOpen - - FailClose - type: string - group: - default: "" - description: |- - Group is the group of the referent. - When unspecified or empty string, core API group is inferred. - type: string - kind: - default: Service - description: |- - Kind is the Kubernetes resource kind of the referent. For example - "Service". - - Defaults to "Service" when not specified. - - ExternalName services can refer to CNAME DNS records that may live - outside of the cluster and as such are difficult to reason about in - terms of conformance. They also may not be safe to forward to (see - CVE-2021-25740 for more information). Implementations MUST NOT - support ExternalName Services. - type: string - name: - description: Name is the name of the referent. - type: string - targetPortNumber: - description: |- - The port number on the pods running the extension. When unspecified, implementations SHOULD infer a - default value of 9002 when the Kind is Service. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - name - type: object - selector: - additionalProperties: - description: |- - LabelValue is the value of a label. This is used for validation - of maps. This matches the Kubernetes label validation rules: - * must be 63 characters or less (can be empty), - * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), - * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. 
- - Valid values include: - - * MyValue - * my.name - * 123-my-value - maxLength: 63 - minLength: 0 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - type: string - description: |- - Selector defines a map of labels to watch model server pods - that should be included in the InferencePool. - In some cases, implementations may translate this field to a Service selector, so this matches the simple - map used for Service selectors instead of the full Kubernetes LabelSelector type. - type: object - targetPortNumber: - description: |- - TargetPortNumber defines the port number to access the selected model servers. - The number must be in the range 1 to 65535. - format: int32 - maximum: 65535 - minimum: 1 - type: integer - required: - - extensionRef - - selector - - targetPortNumber - type: object - status: - description: InferencePoolStatus defines the observed state of InferencePool - properties: - conditions: - default: - - lastTransitionTime: "1970-01-01T00:00:00Z" - message: Waiting for controller - reason: Pending - status: Unknown - type: Ready - description: |- - Conditions track the state of the InferencePool. - - Known condition types are: - - * "Ready" - items: - description: Condition contains details for one aspect of the current - state of this API Resource. - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. 
- For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - message - - reason - - status - - type - type: object - maxItems: 8 - type: array - x-kubernetes-list-map-keys: - - type - x-kubernetes-list-type: map - type: object - type: object - served: true - storage: false - subresources: - status: {} - name: v1alpha2 schema: openAPIV3Schema: @@ -299,6 +109,8 @@ spec: that should be included in the InferencePool. In some cases, implementations may translate this field to a Service selector, so this matches the simple map used for Service selectors instead of the full Kubernetes LabelSelector type. + If sepecified, it will be applied to match the model server pods in the same namespace as the InferencePool. + Cross namesoace selector is not supported. 
type: object targetPortNumber: description: |- From 7e08e07614ed458d4cdceefe259eb799651794fe Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Thu, 27 Feb 2025 21:06:32 +0000 Subject: [PATCH 59/96] removed the EndpointPickerNotHealthy condition form pool status (#421) --- api/v1alpha2/inferencepool_types.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index 0781f044..2300f52a 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -232,10 +232,6 @@ const ( // // * "Ready" // - // Possible reasons for this condition to be False are: - // - // * "EndpointPickerNotHealthy" - // // Possible reasons for this condition to be Unknown are: // // * "Pending" @@ -245,9 +241,6 @@ const ( // PoolReasonReady is the desired state. The pool and its components are initialized and ready for traffic. PoolReasonReady InferencePoolConditionReason = "Ready" - // PoolReasonEPPNotHealthy is used when the EPP has not yet passed health checks, or has started failing them. - PoolReasonEPPNotHealthy InferencePoolConditionReason = "EndpointPickerNotHealthy" - // PoolReasonPending is the initial state, and indicates that the controller has not yet reconciled this pool. PoolReasonPending InferencePoolConditionReason = "Pending" ) From 29bf32dfee0ae6558c0da532555448da4db93414 Mon Sep 17 00:00:00 2001 From: Jeff Luo Date: Thu, 27 Feb 2025 16:34:30 -0500 Subject: [PATCH 60/96] Add metrics validation in integration test (#413) Start by adding request total metrics, more validation will be added in follow up. 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/326 --- test/integration/hermetic_test.go | 82 +++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 2ea66dba..b4355539 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -24,8 +24,12 @@ import ( "errors" "fmt" "io" + "net" + "net/http" "os" "path/filepath" + "strconv" + "strings" "testing" "time" @@ -33,6 +37,7 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3" "github.com/google/go-cmp/cmp" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/stretchr/testify/assert" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -43,12 +48,16 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" k8syaml "k8s.io/apimachinery/pkg/util/yaml" clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/component-base/metrics/legacyregistry" + metricsutils "k8s.io/component-base/metrics/testutil" ctrl "sigs.k8s.io/controller-runtime" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -57,7 +66,8 @@ import ( ) const ( - port = runserver.DefaultGrpcPort + port = runserver.DefaultGrpcPort + metricsPort = 8888 ) var ( @@ -76,6 +86,7 @@ func 
TestKubeInferenceModelRequest(t *testing.T) { wantHeaders []*configPb.HeaderValueOption wantMetadata *structpb.Struct wantBody []byte + wantMetrics string wantErr bool immediateResponse *extProcPb.ImmediateResponse }{ @@ -113,7 +124,12 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, wantMetadata: makeMetadata("address-1:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), - wantErr: false, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 + `, + wantErr: false, }, { name: "select active lora, low queue", @@ -161,7 +177,12 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, wantMetadata: makeMetadata("address-1:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), - wantErr: false, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, }, { name: "select no lora despite active model, avoid excessive queue size", @@ -210,7 +231,12 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, wantMetadata: makeMetadata("address-2:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - wantErr: false, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, }, { name: "noncritical and all models past threshold, shed request", @@ -253,6 +279,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { Code: envoyTypePb.StatusCode_TooManyRequests, }, }, + wantMetrics: "", }, { name: "noncritical, but one server has capacity, do not shed", @@ -301,7 +328,12 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, wantMetadata: makeMetadata("address-0:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), - wantErr: false, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, }, } @@ -345,6 +377,14 @@ func TestKubeInferenceModelRequest(t *testing.T) { if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { t.Errorf("Unexpected response, (-want +got): %v", diff) } + + if test.wantMetrics != "" { + if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { + t.Error(err) + } + } + + legacyregistry.Reset() }) } } @@ -424,6 +464,10 @@ func BeforeSuit(t *testing.T) func() { logutil.Fatal(logger, err, "Failed to create controller manager") } + if err := registerMetricsHandler(mgr, metricsPort); err != nil { + logutil.Fatal(logger, err, "Failed to register metrics handler") + } + serverRunner = runserver.NewDefaultExtProcServerRunner() // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" @@ -544,3 +588,31 @@ func makeMetadata(endpoint string) *structpb.Struct { }, } } + +// registerMetricsHandler is a simplified version of metrics 
endpoint handler +// without Authentication for integration tests. +func registerMetricsHandler(mgr manager.Manager, port int) error { + metrics.Register() + + // Init HTTP server. + h := promhttp.HandlerFor( + legacyregistry.DefaultGatherer, + promhttp.HandlerOpts{}, + ) + + mux := http.NewServeMux() + mux.Handle("/metrics", h) + + srv := &http.Server{ + Addr: net.JoinHostPort("", strconv.Itoa(port)), + Handler: mux, + } + + if err := mgr.Add(&manager.Server{ + Name: "metrics", + Server: srv, + }); err != nil { + return err + } + return nil +} From d2c6e7a728c5d3bfe12dbb26e43bc27995962651 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Fri, 28 Feb 2025 00:18:29 +0200 Subject: [PATCH 61/96] predicate follow up PR to remove the check from Reconcile func (#418) * predicate follow up PR to remove the check from Reconcile func Signed-off-by: Nir Rozenbaum * removed irrelevant test after introducing predicate Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- pkg/epp/controller/inferencemodel_reconciler.go | 5 +---- pkg/epp/controller/inferencemodel_reconciler_test.go | 11 ----------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 7cf18808..ebdb1cdd 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -43,10 +43,7 @@ type InferenceModelReconciler struct { } func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - if req.Namespace != c.PoolNamespacedName.Namespace { - return ctrl.Result{}, nil - } - logger := log.FromContext(ctx).V(logutil.DEFAULT).WithValues("inferenceModel", req.Name) + logger := log.FromContext(ctx).V(logutil.DEFAULT).WithValues("inferenceModel", req.NamespacedName) ctx = ctrl.LoggerInto(ctx, logger) logger.Info("Reconciling InferenceModel") diff --git 
a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index 87323e80..d5277919 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -85,11 +85,6 @@ var ( ModelName("fake model2"). CreationTimestamp(metav1.Unix(1000, 0)). PoolName(pool.Name).ObjRef() - infModel2NS2 = utiltest.MakeInferenceModel(infModel2.Name). - Namespace("ns2"). - ModelName(infModel2.Spec.ModelName). - CreationTimestamp(metav1.Unix(1000, 0)). - PoolName(pool.Name).ObjRef() ) func TestInferenceModelReconciler(t *testing.T) { @@ -131,12 +126,6 @@ func TestInferenceModelReconciler(t *testing.T) { model: infModel1NS2, wantModels: []*v1alpha2.InferenceModel{infModel1}, }, - { - name: "Model referencing a different pool, same pool name but different namespace", - modelsInStore: []*v1alpha2.InferenceModel{infModel1}, - model: infModel2NS2, - wantModels: []*v1alpha2.InferenceModel{infModel1}, - }, { name: "Existing model changed pools, replaced with another", modelsInStore: []*v1alpha2.InferenceModel{infModel1}, From 5137c591daf5dd553882c28052270edfe8c203cd Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Fri, 28 Feb 2025 23:06:57 +0800 Subject: [PATCH 62/96] Mis cleanup (#428) --- .../controller/inferencemodel_reconciler.go | 8 +++--- pkg/epp/datastore/datastore.go | 26 +++++++++---------- pkg/epp/datastore/datastore_test.go | 10 +++---- pkg/epp/handlers/request.go | 4 +-- test/integration/hermetic_test.go | 4 +-- 5 files changed, 25 insertions(+), 27 deletions(-) diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index ebdb1cdd..2b50537a 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -68,9 +68,9 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque logger = logger.WithValues("poolRef", 
infModel.Spec.PoolRef).WithValues("modelName", infModel.Spec.ModelName) if !c.Datastore.ModelSetIfOlder(infModel) { logger.Info("Skipping InferenceModel, existing instance has older creation timestamp") - + } else { + logger.Info("Added/Updated InferenceModel") } - logger.Info("Added/Updated InferenceModel") return ctrl.Result{}, nil } @@ -82,8 +82,8 @@ func (c *InferenceModelReconciler) handleModelDeleted(ctx context.Context, req t // other instances referencing the same modelName if exist, and store the oldest in // its place. This ensures that the InferenceModel with the oldest creation // timestamp is active. - existing, exists := c.Datastore.ModelDelete(req) - if !exists { + existing := c.Datastore.ModelDelete(req) + if existing == nil { // No entry exists in the first place, nothing to do. return nil } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index cd5d290f..eee17ed4 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -51,15 +51,15 @@ type Datastore interface { // InferenceModel operations ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool - ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) - ModelDelete(namespacedName types.NamespacedName) (*v1alpha2.InferenceModel, bool) + ModelGet(modelName string) *v1alpha2.InferenceModel + ModelDelete(namespacedName types.NamespacedName) *v1alpha2.InferenceModel ModelResync(ctx context.Context, ctrlClient client.Client, modelName string) (bool, error) ModelGetAll() []*v1alpha2.InferenceModel // PodMetrics operations PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool PodUpdateMetricsIfExist(namespacedName types.NamespacedName, m *Metrics) bool - PodGet(namespacedName types.NamespacedName) (*PodMetrics, bool) + PodGet(namespacedName types.NamespacedName) *PodMetrics PodDelete(namespacedName types.NamespacedName) PodResyncAll(ctx context.Context, ctrlClient client.Client) PodGetAll() []*PodMetrics @@ -147,7 +147,6 @@ func (ds *datastore) 
PoolLabelsMatch(podLabels map[string]string) bool { return poolSelector.Matches(podSet) } -// /// InferenceModel APIs /// func (ds *datastore) ModelSetIfOlder(infModel *v1alpha2.InferenceModel) bool { ds.poolAndModelsMu.Lock() defer ds.poolAndModelsMu.Unlock() @@ -199,23 +198,22 @@ func (ds *datastore) ModelResync(ctx context.Context, c client.Client, modelName return true, nil } -func (ds *datastore) ModelGet(modelName string) (*v1alpha2.InferenceModel, bool) { +func (ds *datastore) ModelGet(modelName string) *v1alpha2.InferenceModel { ds.poolAndModelsMu.RLock() defer ds.poolAndModelsMu.RUnlock() - m, exists := ds.models[modelName] - return m, exists + return ds.models[modelName] } -func (ds *datastore) ModelDelete(namespacedName types.NamespacedName) (*v1alpha2.InferenceModel, bool) { +func (ds *datastore) ModelDelete(namespacedName types.NamespacedName) *v1alpha2.InferenceModel { ds.poolAndModelsMu.Lock() defer ds.poolAndModelsMu.Unlock() for _, m := range ds.models { if m.Name == namespacedName.Name && m.Namespace == namespacedName.Namespace { delete(ds.models, m.Spec.ModelName) - return m, true + return m } } - return nil, false + return nil } func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel { @@ -238,12 +236,12 @@ func (ds *datastore) PodUpdateMetricsIfExist(namespacedName types.NamespacedName return false } -func (ds *datastore) PodGet(namespacedName types.NamespacedName) (*PodMetrics, bool) { +func (ds *datastore) PodGet(namespacedName types.NamespacedName) *PodMetrics { val, ok := ds.pods.Load(namespacedName) if ok { - return val.(*PodMetrics), true + return val.(*PodMetrics) } - return nil, false + return nil } func (ds *datastore) PodGetAll() []*PodMetrics { @@ -311,7 +309,7 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client) } } - // Remove pods that don't exist or not ready any more. + // Remove pods that don't belong to the pool or not ready any more. 
deleteFn := func(k, v any) bool { pm := v.(*PodMetrics) if exist := activePods[pm.NamespacedName.Name]; !exist { diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index edc96626..95ac642c 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -176,8 +176,8 @@ func TestModel(t *testing.T) { name: "Getting by model name, chat -> model2", existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, op: func(ds Datastore) bool { - gotChat, exists := ds.ModelGet(chatModel) - return exists && cmp.Diff(model2chat, gotChat) == "" + gotChat := ds.ModelGet(chatModel) + return gotChat != nil && cmp.Diff(model2chat, gotChat) == "" }, wantOpResult: true, wantModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, @@ -186,9 +186,9 @@ func TestModel(t *testing.T) { name: "Delete the model", existingModels: []*v1alpha2.InferenceModel{model2chat, model1ts}, op: func(ds Datastore) bool { - _, existed := ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) - _, exists := ds.ModelGet(tsModel) - return existed && !exists + existing := ds.ModelDelete(types.NamespacedName{Name: model1ts.Name, Namespace: model1ts.Namespace}) + got := ds.ModelGet(tsModel) + return existing != nil && got == nil }, wantOpResult: true, diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index c6cfdda2..20271913 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -64,8 +64,8 @@ func (s *Server) HandleRequestBody( // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. // This might be a security risk in the future where adapters not registered in the InferenceModel // are able to be requested by using their distinct name. 
- modelObj, exist := s.datastore.ModelGet(model) - if !exist { + modelObj := s.datastore.ModelGet(model) + if modelObj == nil { return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} } if len(modelObj.Spec.TargetModels) > 0 { diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index b4355539..de32dce0 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -520,8 +520,8 @@ func BeforeSuit(t *testing.T) func() { } assert.EventuallyWithT(t, func(t *assert.CollectT) { - _, modelExist := serverRunner.Datastore.ModelGet("my-model") - synced := serverRunner.Datastore.PoolHasSynced() && modelExist + modelExist := serverRunner.Datastore.ModelGet("my-model") + synced := serverRunner.Datastore.PoolHasSynced() && modelExist != nil assert.True(t, synced, "Timeout waiting for the pool and models to sync") }, 10*time.Second, 10*time.Millisecond) From 0d08a07b8e9cc9da6f6e197f4223872f332db7f1 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Fri, 28 Feb 2025 23:20:56 +0800 Subject: [PATCH 63/96] fix metric scrape port not updated when inference pool target port updated (#417) * fix metric scrape port not updated when inference pool target port updated Signed-off-by: Kuromesi * bug fix Signed-off-by: Kuromesi * fix ut Signed-off-by: Kuromesi * add log Signed-off-by: Kuromesi --------- Signed-off-by: Kuromesi --- pkg/epp/backend/fake.go | 2 +- pkg/epp/backend/provider.go | 13 +++++++++---- pkg/epp/backend/provider_test.go | 9 ++++++++- pkg/epp/backend/vllm/metrics.go | 4 +++- pkg/epp/controller/pod_reconciler_test.go | 8 ++++---- pkg/epp/datastore/datastore.go | 5 +---- pkg/epp/datastore/types.go | 11 +---------- 7 files changed, 27 insertions(+), 25 deletions(-) diff --git a/pkg/epp/backend/fake.go b/pkg/epp/backend/fake.go index 06f14f69..584486c2 100644 --- a/pkg/epp/backend/fake.go +++ b/pkg/epp/backend/fake.go @@ -31,7 +31,7 @@ 
type FakePodMetricsClient struct { Res map[types.NamespacedName]*datastore.PodMetrics } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *datastore.PodMetrics) (*datastore.PodMetrics, error) { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error) { if err, ok := f.Err[existing.NamespacedName]; ok { return nil, err } diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go index a12f84d5..959f3e0c 100644 --- a/pkg/epp/backend/provider.go +++ b/pkg/epp/backend/provider.go @@ -49,7 +49,7 @@ type Provider struct { } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, existing *datastore.PodMetrics) (*datastore.PodMetrics, error) + FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error) } func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { @@ -105,6 +105,11 @@ func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshProm func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { loggerTrace := logger.V(logutil.TRACE) + pool, _ := p.datastore.PoolGet() + if pool == nil { + loggerTrace.Info("No inference pool or not initialized") + return nil + } ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) defer cancel() start := time.Now() @@ -113,6 +118,7 @@ func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { // TODO: add a metric instead of logging loggerTrace.Info("Metrics refreshed", "duration", d) }() + var wg sync.WaitGroup errCh := make(chan error) processOnePod := func(key, value any) bool { @@ -121,7 +127,7 @@ func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { wg.Add(1) go func() { defer wg.Done() - updated, err := p.pmc.FetchMetrics(ctx, existing) + updated, err := p.pmc.FetchMetrics(ctx, existing, 
pool.Spec.TargetPortNumber) if err != nil { errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err) return @@ -151,8 +157,6 @@ func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { } func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { - logger.V(logutil.DEBUG).Info("Flushing Prometheus Metrics") - pool, _ := p.datastore.PoolGet() if pool == nil { // No inference pool or not initialize. @@ -163,6 +167,7 @@ func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { var queueTotal int podMetrics := p.datastore.PodGetAll() + logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) if len(podMetrics) == 0 { return } diff --git a/pkg/epp/backend/provider_test.go b/pkg/epp/backend/provider_test.go index f2db09fe..12994723 100644 --- a/pkg/epp/backend/provider_test.go +++ b/pkg/epp/backend/provider_test.go @@ -26,6 +26,7 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" ) @@ -68,6 +69,12 @@ var ( }, }, } + + inferencePool = &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: 8000, + }, + } ) func TestProvider(t *testing.T) { @@ -127,7 +134,7 @@ func TestProvider(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ds := datastore.NewFakeDatastore(test.storePods, nil, nil) + ds := datastore.NewFakeDatastore(test.storePods, nil, inferencePool) p := NewProvider(test.pmc, ds) ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 8648e24c..4973c93e 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -55,13 +55,15 @@ type PodMetricsClientImpl struct{} func (p *PodMetricsClientImpl) 
FetchMetrics( ctx context.Context, existing *datastore.PodMetrics, + port int32, ) (*datastore.PodMetrics, error) { logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. - url := existing.BuildScrapeEndpoint() + url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index 57576213..7534ac0f 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -35,10 +35,10 @@ import ( ) var ( - basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1", ScrapePath: "/metrics", ScrapePort: 8000}} - basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2", ScrapePath: "/metrics", ScrapePort: 8000}} - basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3", ScrapePath: "/metrics", ScrapePort: 8000}} - basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11", ScrapePath: "/metrics", ScrapePort: 8000}} + basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1"}} + basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} + basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: 
"pod3"}, Address: "address-3"}} + basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11"}} ) func TestPodReconciler(t *testing.T) { diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index eee17ed4..2994d6e1 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -263,16 +263,13 @@ func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { } func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { - pool, _ := ds.PoolGet() new := &PodMetrics{ Pod: Pod{ NamespacedName: types.NamespacedName{ Name: pod.Name, Namespace: pod.Namespace, }, - Address: pod.Status.PodIP, - ScrapePath: "/metrics", - ScrapePort: pool.Spec.TargetPortNumber, + Address: pod.Status.PodIP, }, Metrics: Metrics{ ActiveModels: make(map[string]int), diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go index 237e98ca..8cfcf1d1 100644 --- a/pkg/epp/datastore/types.go +++ b/pkg/epp/datastore/types.go @@ -26,10 +26,6 @@ import ( type Pod struct { NamespacedName types.NamespacedName Address string - - // metrics scrape options - ScrapePort int32 - ScrapePath string } type Metrics struct { @@ -61,11 +57,10 @@ func (pm *PodMetrics) Clone() *PodMetrics { Pod: Pod{ NamespacedName: pm.NamespacedName, Address: pm.Address, - ScrapePort: pm.ScrapePort, - ScrapePath: pm.ScrapePath, }, Metrics: Metrics{ ActiveModels: cm, + MaxActiveModels: pm.MaxActiveModels, RunningQueueSize: pm.RunningQueueSize, WaitingQueueSize: pm.WaitingQueueSize, KVCacheUsagePercent: pm.KVCacheUsagePercent, @@ -74,7 +69,3 @@ func (pm *PodMetrics) Clone() *PodMetrics { } return clone } - -func (pm *PodMetrics) BuildScrapeEndpoint() string { - return fmt.Sprintf("http://%s:%d%s", pm.Address, pm.ScrapePort, pm.ScrapePath) -} From b1fed6c98b09cea9754a739af7d91909645df753 Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Fri, 28 Feb 2025 23:21:03 +0800 Subject: [PATCH 64/96] 
make ModelName immutable and fix model weight (#427) * make ModelName immutable and fix model weight * Fix ut --- api/v1alpha2/inferencemodel_types.go | 3 ++- ...e.networking.x-k8s.io_inferencemodels.yaml | 5 ++++- pkg/epp/datastore/datastore.go | 11 ++++++++-- pkg/epp/datastore/datastore_test.go | 21 ++++++++++++++++++- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/api/v1alpha2/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go index 9ab1fd86..a75fd699 100644 --- a/api/v1alpha2/inferencemodel_types.go +++ b/api/v1alpha2/inferencemodel_types.go @@ -71,6 +71,7 @@ type InferenceModelSpec struct { // // +kubebuilder:validation:MaxLength=256 // +kubebuilder:validation:Required + // +kubebuilder:validation:XValidation:rule="self == oldSelf",message="modelName is immutable" ModelName string `json:"modelName"` // Criticality defines how important it is to serve the model compared to other models referencing the same pool. @@ -175,7 +176,7 @@ type TargetModel struct { // Conversely weights are optional, so long as ALL targetModels do not specify a weight. // // +optional - // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Minimum=1 // +kubebuilder:validation:Maximum=1000000 Weight *int32 `json:"weight,omitempty"` } diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml index 2995e863..63c7fb51 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml @@ -82,6 +82,9 @@ spec: an error will be returned specifying that no valid target model is found. maxLength: 256 type: string + x-kubernetes-validations: + - message: modelName is immutable + rule: self == oldSelf poolRef: description: PoolRef is a reference to the inference pool, the pool must exist in the same namespace. 
@@ -143,7 +146,7 @@ spec: Conversely weights are optional, so long as ALL targetModels do not specify a weight. format: int32 maximum: 1000000 - minimum: 0 + minimum: 1 type: integer required: - name diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 2994d6e1..f8d4722a 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -334,18 +334,25 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelV } func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string { - var weights int32 - source := rand.NewSource(rand.Int63()) if seed > 0 { source = rand.NewSource(seed) } r := rand.New(source) + + // all the weight values are nil, then we should return random model name + if model.Spec.TargetModels[0].Weight == nil { + index := r.Int31n(int32(len(model.Spec.TargetModels))) + return model.Spec.TargetModels[index].Name + } + + var weights int32 for _, model := range model.Spec.TargetModels { weights += *model.Weight } logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights) randomVal := r.Int31n(weights) + // TODO: optimize this without using loop for _, model := range model.Spec.TargetModels { if randomVal < *model.Weight { return model.Name diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 95ac642c..8fb269bc 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -280,6 +280,25 @@ func TestRandomWeightedDraw(t *testing.T) { }, want: "v1.1", }, + { + name: "weighted distribution with weight unset", + model: &v1alpha2.InferenceModel{ + Spec: v1alpha2.InferenceModelSpec{ + TargetModels: []v1alpha2.TargetModel{ + { + Name: "canary", + }, + { + Name: "v1.1", + }, + { + Name: "v1", + }, + }, + }, + }, + want: "canary", + }, } var seedVal int64 = 420 for _, test := range tests { @@ -287,7 +306,7 @@ func TestRandomWeightedDraw(t *testing.T) 
{ for range 10000 { model := RandomWeightedDraw(logger, test.model, seedVal) if model != test.want { - t.Errorf("Model returned!: %v", model) + t.Errorf("Model returned: %v != %v", model, test.want) break } } From 14afcd9461a57116b75df19e671f0a7033ff9c65 Mon Sep 17 00:00:00 2001 From: Rob Scott Date: Fri, 28 Feb 2025 18:26:55 -0800 Subject: [PATCH 65/96] Consistent validation for reference types (#430) * Consistent validation for reference types * Code updates after API type changes * Moving Label types to shared_types.go --- api/v1alpha2/inferencemodel_types.go | 13 +-- api/v1alpha2/inferencepool_types.go | 58 ++-------- api/v1alpha2/shared_types.go | 108 ++++++++++++++++++ api/v1alpha2/zz_generated.deepcopy.go | 6 +- .../api/v1alpha2/extension.go | 8 +- .../api/v1alpha2/extensionreference.go | 20 ++-- .../api/v1alpha2/poolobjectreference.go | 16 ++- ...ce.networking.x-k8s.io_inferencepools.yaml | 16 ++- .../controller/inferencemodel_reconciler.go | 4 +- pkg/epp/datastore/datastore.go | 2 +- pkg/epp/util/testing/wrappers.go | 2 +- test/utils/wrappers.go | 4 +- 12 files changed, 166 insertions(+), 91 deletions(-) create mode 100644 api/v1alpha2/shared_types.go diff --git a/api/v1alpha2/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go index a75fd699..c011031e 100644 --- a/api/v1alpha2/inferencemodel_types.go +++ b/api/v1alpha2/inferencemodel_types.go @@ -107,25 +107,18 @@ type PoolObjectReference struct { // // +optional // +kubebuilder:default="inference.networking.x-k8s.io" - // +kubebuilder:validation:MaxLength=253 - // +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` - Group string `json:"group,omitempty"` + Group Group `json:"group,omitempty"` // Kind is kind of the referent. For example "InferencePool". 
// // +optional // +kubebuilder:default="InferencePool" - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=63 - // +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` - Kind string `json:"kind,omitempty"` + Kind Kind `json:"kind,omitempty"` // Name is the name of the referent. // - // +kubebuilder:validation:MinLength=1 - // +kubebuilder:validation:MaxLength=253 // +kubebuilder:validation:Required - Name string `json:"name"` + Name ObjectName `json:"name"` } // Criticality defines how important it is to serve the model compared to other models. diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index 2300f52a..ca76f347 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -90,11 +90,11 @@ type Extension struct { // ExtensionReference is a reference to the extension deployment. type ExtensionReference struct { // Group is the group of the referent. - // When unspecified or empty string, core API group is inferred. + // The default value is "", representing the Core API group. // // +optional // +kubebuilder:default="" - Group *string `json:"group,omitempty"` + Group *Group `json:"group,omitempty"` // Kind is the Kubernetes resource kind of the referent. For example // "Service". @@ -109,20 +109,19 @@ type ExtensionReference struct { // // +optional // +kubebuilder:default=Service - Kind *string `json:"kind,omitempty"` + Kind *Kind `json:"kind,omitempty"` // Name is the name of the referent. // // +kubebuilder:validation:Required - Name string `json:"name"` + Name ObjectName `json:"name"` - // The port number on the service running the extension. When unspecified, implementations SHOULD infer a - // default value of 9002 when the Kind is Service. + // The port number on the service running the extension. When unspecified, + // implementations SHOULD infer a default value of 9002 when the Kind is + // Service. 
// - // +kubebuilder:validation:Minimum=1 - // +kubebuilder:validation:Maximum=65535 // +optional - PortNumber *int32 `json:"targetPortNumber,omitempty"` + PortNumber *PortNumber `json:"portNumber,omitempty"` } // ExtensionConnection encapsulates options that configures the connection to the extension. @@ -147,47 +146,6 @@ const ( FailClose ExtensionFailureMode = "FailClose" ) -// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 -// Duplicated as to not take an unexpected dependency on gw's API. -// -// LabelKey is the key of a label. This is used for validation -// of maps. This matches the Kubernetes "qualified name" validation that is used for labels. -// Labels are case sensitive, so: my-label and My-Label are considered distinct. -// -// Valid values include: -// -// * example -// * example.com -// * example.com/path -// * example.com/path.html -// -// Invalid values include: -// -// * example~ - "~" is an invalid character -// * example.com. - can not start or end with "." -// -// +kubebuilder:validation:MinLength=1 -// +kubebuilder:validation:MaxLength=253 -// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$` -type LabelKey string - -// LabelValue is the value of a label. This is used for validation -// of maps. This matches the Kubernetes label validation rules: -// * must be 63 characters or less (can be empty), -// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), -// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. 
-// -// Valid values include: -// -// * MyValue -// * my.name -// * 123-my-value -// -// +kubebuilder:validation:MinLength=0 -// +kubebuilder:validation:MaxLength=63 -// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$` -type LabelValue string - // InferencePoolStatus defines the observed state of InferencePool type InferencePoolStatus struct { // Parents is a list of parent resources (usually Gateways) that are diff --git a/api/v1alpha2/shared_types.go b/api/v1alpha2/shared_types.go new file mode 100644 index 00000000..ea5ef299 --- /dev/null +++ b/api/v1alpha2/shared_types.go @@ -0,0 +1,108 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha2 + +// Group refers to a Kubernetes Group. It must either be an empty string or a +// RFC 1123 subdomain. 
+// +// This validation is based off of the corresponding Kubernetes validation: +// https://github.com/kubernetes/apimachinery/blob/02cfb53916346d085a6c6c7c66f882e3c6b0eca6/pkg/util/validation/validation.go#L208 +// +// Valid values include: +// +// * "" - empty string implies core Kubernetes API group +// * "gateway.networking.k8s.io" +// * "foo.example.com" +// +// Invalid values include: +// +// * "example.com/bar" - "/" is an invalid character +// +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` +type Group string + +// Kind refers to a Kubernetes Kind. +// +// Valid values include: +// +// * "Service" +// * "HTTPRoute" +// +// Invalid values include: +// +// * "invalid/kind" - "/" is an invalid character +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` +type Kind string + +// ObjectName refers to the name of a Kubernetes object. +// Object names can have a variety of forms, including RFC 1123 subdomains, +// RFC 1123 labels, or RFC 1035 labels. +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +type ObjectName string + +// PortNumber defines a network port. +// +// +kubebuilder:validation:Minimum=1 +// +kubebuilder:validation:Maximum=65535 +type PortNumber int32 + +// LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +// Duplicated as to not take an unexpected dependency on gw's API. +// +// LabelKey is the key of a label. This is used for validation +// of maps. This matches the Kubernetes "qualified name" validation that is used for labels. +// Labels are case sensitive, so: my-label and My-Label are considered distinct. 
+// +// Valid values include: +// +// * example +// * example.com +// * example.com/path +// * example.com/path.html +// +// Invalid values include: +// +// * example~ - "~" is an invalid character +// * example.com. - can not start or end with "." +// +// +kubebuilder:validation:MinLength=1 +// +kubebuilder:validation:MaxLength=253 +// +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9]$` +type LabelKey string + +// LabelValue is the value of a label. This is used for validation +// of maps. This matches the Kubernetes label validation rules: +// * must be 63 characters or less (can be empty), +// * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), +// * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. +// +// Valid values include: +// +// * MyValue +// * my.name +// * 123-my-value +// +// +kubebuilder:validation:MinLength=0 +// +kubebuilder:validation:MaxLength=63 +// +kubebuilder:validation:Pattern=`^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$` +type LabelValue string diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index 9b685969..4dad0eff 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -87,17 +87,17 @@ func (in *ExtensionReference) DeepCopyInto(out *ExtensionReference) { *out = *in if in.Group != nil { in, out := &in.Group, &out.Group - *out = new(string) + *out = new(Group) **out = **in } if in.Kind != nil { in, out := &in.Kind, &out.Kind - *out = new(string) + *out = new(Kind) **out = **in } if in.PortNumber != nil { in, out := &in.PortNumber, &out.PortNumber - *out = new(int32) + *out = new(PortNumber) **out = **in } } diff --git a/client-go/applyconfiguration/api/v1alpha2/extension.go b/client-go/applyconfiguration/api/v1alpha2/extension.go index b3802613..5e17e030 100644 --- 
a/client-go/applyconfiguration/api/v1alpha2/extension.go +++ b/client-go/applyconfiguration/api/v1alpha2/extension.go @@ -37,7 +37,7 @@ func Extension() *ExtensionApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithGroup(value apiv1alpha2.Group) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Group = &value return b } @@ -45,7 +45,7 @@ func (b *ExtensionApplyConfiguration) WithGroup(value string) *ExtensionApplyCon // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Kind field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithKind(value apiv1alpha2.Kind) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Kind = &value return b } @@ -53,7 +53,7 @@ func (b *ExtensionApplyConfiguration) WithKind(value string) *ExtensionApplyConf // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. 
-func (b *ExtensionApplyConfiguration) WithName(value string) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.Name = &value return b } @@ -61,7 +61,7 @@ func (b *ExtensionApplyConfiguration) WithName(value string) *ExtensionApplyConf // WithPortNumber sets the PortNumber field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the PortNumber field is set to the value of the last call. -func (b *ExtensionApplyConfiguration) WithPortNumber(value int32) *ExtensionApplyConfiguration { +func (b *ExtensionApplyConfiguration) WithPortNumber(value apiv1alpha2.PortNumber) *ExtensionApplyConfiguration { b.ExtensionReferenceApplyConfiguration.PortNumber = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha2/extensionreference.go b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go index 71034710..937e5795 100644 --- a/client-go/applyconfiguration/api/v1alpha2/extensionreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/extensionreference.go @@ -17,13 +17,17 @@ limitations under the License. package v1alpha2 +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + // ExtensionReferenceApplyConfiguration represents a declarative configuration of the ExtensionReference type for use // with apply. 
type ExtensionReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` - PortNumber *int32 `json:"targetPortNumber,omitempty"` + Group *apiv1alpha2.Group `json:"group,omitempty"` + Kind *apiv1alpha2.Kind `json:"kind,omitempty"` + Name *apiv1alpha2.ObjectName `json:"name,omitempty"` + PortNumber *apiv1alpha2.PortNumber `json:"portNumber,omitempty"` } // ExtensionReferenceApplyConfiguration constructs a declarative configuration of the ExtensionReference type for use with @@ -35,7 +39,7 @@ func ExtensionReference() *ExtensionReferenceApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithGroup(value apiv1alpha2.Group) *ExtensionReferenceApplyConfiguration { b.Group = &value return b } @@ -43,7 +47,7 @@ func (b *ExtensionReferenceApplyConfiguration) WithGroup(value string) *Extensio // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Kind field is set to the value of the last call. 
-func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithKind(value apiv1alpha2.Kind) *ExtensionReferenceApplyConfiguration { b.Kind = &value return b } @@ -51,7 +55,7 @@ func (b *ExtensionReferenceApplyConfiguration) WithKind(value string) *Extension // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithName(value string) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *ExtensionReferenceApplyConfiguration { b.Name = &value return b } @@ -59,7 +63,7 @@ func (b *ExtensionReferenceApplyConfiguration) WithName(value string) *Extension // WithPortNumber sets the PortNumber field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the PortNumber field is set to the value of the last call. -func (b *ExtensionReferenceApplyConfiguration) WithPortNumber(value int32) *ExtensionReferenceApplyConfiguration { +func (b *ExtensionReferenceApplyConfiguration) WithPortNumber(value apiv1alpha2.PortNumber) *ExtensionReferenceApplyConfiguration { b.PortNumber = &value return b } diff --git a/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go index cc88c950..20abf6b2 100644 --- a/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go +++ b/client-go/applyconfiguration/api/v1alpha2/poolobjectreference.go @@ -17,12 +17,16 @@ limitations under the License. 
package v1alpha2 +import ( + apiv1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + // PoolObjectReferenceApplyConfiguration represents a declarative configuration of the PoolObjectReference type for use // with apply. type PoolObjectReferenceApplyConfiguration struct { - Group *string `json:"group,omitempty"` - Kind *string `json:"kind,omitempty"` - Name *string `json:"name,omitempty"` + Group *apiv1alpha2.Group `json:"group,omitempty"` + Kind *apiv1alpha2.Kind `json:"kind,omitempty"` + Name *apiv1alpha2.ObjectName `json:"name,omitempty"` } // PoolObjectReferenceApplyConfiguration constructs a declarative configuration of the PoolObjectReference type for use with @@ -34,7 +38,7 @@ func PoolObjectReference() *PoolObjectReferenceApplyConfiguration { // WithGroup sets the Group field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Group field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value apiv1alpha2.Group) *PoolObjectReferenceApplyConfiguration { b.Group = &value return b } @@ -42,7 +46,7 @@ func (b *PoolObjectReferenceApplyConfiguration) WithGroup(value string) *PoolObj // WithKind sets the Kind field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Kind field is set to the value of the last call. 
-func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithKind(value apiv1alpha2.Kind) *PoolObjectReferenceApplyConfiguration { b.Kind = &value return b } @@ -50,7 +54,7 @@ func (b *PoolObjectReferenceApplyConfiguration) WithKind(value string) *PoolObje // WithName sets the Name field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the Name field is set to the value of the last call. -func (b *PoolObjectReferenceApplyConfiguration) WithName(value string) *PoolObjectReferenceApplyConfiguration { +func (b *PoolObjectReferenceApplyConfiguration) WithName(value apiv1alpha2.ObjectName) *PoolObjectReferenceApplyConfiguration { b.Name = &value return b } diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 8a7ad938..15b79b69 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -56,7 +56,9 @@ spec: default: "" description: |- Group is the group of the referent. - When unspecified or empty string, core API group is inferred. + The default value is "", representing the Core API group. + maxLength: 253 + pattern: ^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$ type: string kind: default: Service @@ -71,14 +73,20 @@ spec: terms of conformance. They also may not be safe to forward to (see CVE-2021-25740 for more information). Implementations MUST NOT support ExternalName Services. + maxLength: 63 + minLength: 1 + pattern: ^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$ type: string name: description: Name is the name of the referent. 
+ maxLength: 253 + minLength: 1 type: string - targetPortNumber: + portNumber: description: |- - The port number on the service running the extension. When unspecified, implementations SHOULD infer a - default value of 9002 when the Kind is Service. + The port number on the service running the extension. When unspecified, + implementations SHOULD infer a default value of 9002 when the Kind is + Service. format: int32 maximum: 65535 minimum: 1 diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 2b50537a..8318324f 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -58,7 +58,7 @@ func (c *InferenceModelReconciler) Reconcile(ctx context.Context, req ctrl.Reque notFound = true } - if notFound || !infModel.DeletionTimestamp.IsZero() || infModel.Spec.PoolRef.Name != c.PoolNamespacedName.Name { + if notFound || !infModel.DeletionTimestamp.IsZero() || infModel.Spec.PoolRef.Name != v1alpha2.ObjectName(c.PoolNamespacedName.Name) { // InferenceModel object got deleted or changed the referenced pool. 
err := c.handleModelDeleted(ctx, req.NamespacedName) return ctrl.Result{}, err @@ -128,5 +128,5 @@ func (c *InferenceModelReconciler) SetupWithManager(ctx context.Context, mgr ctr } func (c *InferenceModelReconciler) eventPredicate(infModel *v1alpha2.InferenceModel) bool { - return (infModel.Spec.PoolRef.Name == c.PoolNamespacedName.Name) && (infModel.GetNamespace() == c.PoolNamespacedName.Namespace) + return (infModel.Spec.PoolRef.Name == v1alpha2.ObjectName(c.PoolNamespacedName.Name)) && (infModel.GetNamespace() == c.PoolNamespacedName.Namespace) } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index f8d4722a..c7050437 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -183,7 +183,7 @@ func (ds *datastore) ModelResync(ctx context.Context, c client.Client, modelName for i := range models.Items { m := &models.Items[i] if m.Spec.ModelName != modelName || // The index should filter those out, but just in case! - m.Spec.PoolRef.Name != ds.pool.Name || // We don't care about other pools, we could setup an index on this too! + m.Spec.PoolRef.Name != v1alpha2.ObjectName(ds.pool.Name) || // We don't care about other pools, we could setup an index on this too! 
!m.DeletionTimestamp.IsZero() { // ignore objects marked for deletion continue } diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go index bfcf2690..2b8a4fd1 100644 --- a/pkg/epp/util/testing/wrappers.go +++ b/pkg/epp/util/testing/wrappers.go @@ -110,7 +110,7 @@ func (m *InferenceModelWrapper) ModelName(modelName string) *InferenceModelWrapp } func (m *InferenceModelWrapper) PoolName(poolName string) *InferenceModelWrapper { - m.Spec.PoolRef = v1alpha2.PoolObjectReference{Name: poolName} + m.Spec.PoolRef = v1alpha2.PoolObjectReference{Name: v1alpha2.ObjectName(poolName)} return m } diff --git a/test/utils/wrappers.go b/test/utils/wrappers.go index 3280cb11..867118c1 100644 --- a/test/utils/wrappers.go +++ b/test/utils/wrappers.go @@ -58,9 +58,9 @@ func (m *InferenceModelWrapper) SetCriticality(level v1alpha2.Criticality) *Infe // for group/kind and name as the PoolObjectReference name. func (m *InferenceModelWrapper) SetPoolRef(name string) *InferenceModelWrapper { ref := v1alpha2.PoolObjectReference{ - Group: v1alpha2.GroupVersion.Group, + Group: v1alpha2.Group(v1alpha2.GroupVersion.Group), Kind: "inferencepools", - Name: name, + Name: v1alpha2.ObjectName(name), } m.Spec.PoolRef = ref return m From 4c5aa2a7f8872fa3cdd91547d8f598937f0eac81 Mon Sep 17 00:00:00 2001 From: Kuromesi Date: Sat, 1 Mar 2025 10:40:56 +0800 Subject: [PATCH 66/96] create pods during integration tests (#431) * create pods during integration tests Signed-off-by: Kuromesi * fix Signed-off-by: Kuromesi --------- Signed-off-by: Kuromesi --- pkg/epp/test/utils.go | 4 +- pkg/epp/util/testing/wrappers.go | 14 ++++++ test/integration/hermetic_test.go | 73 ++++++++++++++++++++++--------- 3 files changed, 68 insertions(+), 23 deletions(-) diff --git a/pkg/epp/test/utils.go b/pkg/epp/test/utils.go index a916bda2..b18b0919 100644 --- a/pkg/epp/test/utils.go +++ b/pkg/epp/test/utils.go @@ -114,10 +114,10 @@ func GenerateRequest(logger logr.Logger, prompt, model string) 
*extProcPb.Proces } func FakePodMetrics(index int, metrics datastore.Metrics) *datastore.PodMetrics { - address := fmt.Sprintf("address-%v", index) + address := fmt.Sprintf("192.168.1.%d", index+1) pod := datastore.PodMetrics{ Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index)}, + NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"}, Address: address, }, Metrics: metrics, diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go index 2b8a4fd1..2693734f 100644 --- a/pkg/epp/util/testing/wrappers.go +++ b/pkg/epp/util/testing/wrappers.go @@ -40,6 +40,20 @@ func MakePod(podName string) *PodWrapper { } } +// Complete sets necessary fields for a Pod to make it not denied by the apiserver +func (p *PodWrapper) Complete() *PodWrapper { + if p.Pod.Namespace == "" { + p.Namespace("default") + } + p.Spec.Containers = []corev1.Container{ + { + Name: "mock-vllm", + Image: "mock-vllm:latest", + }, + } + return p +} + func (p *PodWrapper) Namespace(ns string) *PodWrapper { p.ObjectMeta.Namespace = ns return p diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index de32dce0..7755795b 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go @@ -112,7 +112,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("address-1:8000"), + RawValue: []byte("192.168.1.2:8000"), }, }, { @@ -122,7 +122,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: makeMetadata("address-1:8000"), + wantMetadata: makeMetadata("192.168.1.2:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), wantMetrics: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
@@ -165,7 +165,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("address-1:8000"), + RawValue: []byte("192.168.1.2:8000"), }, }, { @@ -175,7 +175,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: makeMetadata("address-1:8000"), + wantMetadata: makeMetadata("192.168.1.2:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), wantMetrics: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. @@ -219,7 +219,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("address-2:8000"), + RawValue: []byte("192.168.1.3:8000"), }, }, { @@ -229,7 +229,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: makeMetadata("address-2:8000"), + wantMetadata: makeMetadata("192.168.1.3:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), wantMetrics: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
@@ -316,7 +316,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { { Header: &configPb.HeaderValue{ Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("address-0:8000"), + RawValue: []byte("192.168.1.1:8000"), }, }, { @@ -326,7 +326,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantMetadata: makeMetadata("address-0:8000"), + wantMetadata: makeMetadata("192.168.1.1:8000"), wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), wantMetrics: ` # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. @@ -343,7 +343,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(test.pods) + client, cleanup := setUpHermeticServer(t, test.pods) t.Cleanup(cleanup) want := &extProcPb.ProcessingResponse{ Response: &extProcPb.ProcessingResponse_RequestBody{ @@ -389,7 +389,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(podMetrics []*datastore.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { +func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { pms := make(map[types.NamespacedName]*datastore.PodMetrics) for _, pm := range podMetrics { pms[pm.NamespacedName] = pm @@ -397,23 +397,44 @@ func setUpHermeticServer(podMetrics []*datastore.PodMetrics) (client extProcPb.E pmc := &backend.FakePodMetricsClient{Res: pms} serverCtx, stopServer := context.WithCancel(context.Background()) - go func() { - serverRunner.Datastore.PodDeleteAll() - for _, pm := range podMetrics { - pod := utiltesting.MakePod(pm.NamespacedName.Name). - Namespace(pm.NamespacedName.Namespace). - ReadyCondition(). - IP(pm.Address). 
- ObjRef() - serverRunner.Datastore.PodUpdateOrAddIfNotExist(pod) - serverRunner.Datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) + + // TODO: this should be consistent with the inference pool + podLabels := map[string]string{ + "app": "vllm-llama2-7b-pool", + } + + for _, pm := range podMetrics { + pod := utiltesting.MakePod(pm.NamespacedName.Name). + Namespace(pm.NamespacedName.Namespace). + ReadyCondition(). + Labels(podLabels). + IP(pm.Address). + Complete(). + ObjRef() + + copy := pod.DeepCopy() + if err := k8sClient.Create(context.Background(), copy); err != nil { + logutil.Fatal(logger, err, "Failed to create pod", "pod", pm.NamespacedName) } - serverRunner.Provider = backend.NewProvider(pmc, serverRunner.Datastore) + + // since no pod controllers deployed in fake environment, we manually update pod status + copy.Status = pod.Status + if err := k8sClient.Status().Update(context.Background(), copy); err != nil { + logutil.Fatal(logger, err, "Failed to update pod status", "pod", pm.NamespacedName) + } + } + serverRunner.Provider = backend.NewProvider(pmc, serverRunner.Datastore) + go func() { if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { logutil.Fatal(logger, err, "Failed to start ext-proc server") } }() + // check if all pods are synced to datastore + assert.EventuallyWithT(t, func(t *assert.CollectT) { + assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podMetrics), "Datastore not synced") + }, 10*time.Second, time.Second) + address := fmt.Sprintf("localhost:%v", port) // Create a grpc connection conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) @@ -430,6 +451,16 @@ func setUpHermeticServer(podMetrics []*datastore.PodMetrics) (client extProcPb.E cancel() conn.Close() stopServer() + + // clear created pods + for _, pm := range podMetrics { + pod := utiltesting.MakePod(pm.NamespacedName.Name). 
+ Namespace(pm.NamespacedName.Namespace).Complete().ObjRef() + + if err := k8sClient.Delete(context.Background(), pod); err != nil { + logutil.Fatal(logger, err, "Failed to delete pod", "pod", pm.NamespacedName) + } + } // wait a little until the goroutines actually exit time.Sleep(5 * time.Second) } From 406ffee096926c3106228126307ab2335abd2a95 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Sun, 2 Mar 2025 21:46:56 +0200 Subject: [PATCH 67/96] fixed typos (#433) Signed-off-by: Nir Rozenbaum --- docs/proposals/003-model-server-protocol/README.md | 2 +- docs/proposals/004-endpoint-picker-protocol/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/003-model-server-protocol/README.md b/docs/proposals/003-model-server-protocol/README.md index 44ecf4e1..2ab557f7 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -43,7 +43,7 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro * Metric value: The last updated timestamp (so the EPP can find the latest). * Metric labels: * `max_lora`: The maximum number of adapters that can be loaded to GPU memory to serve a batch. - Requests will be queued if the model server has reached MaxActiveAdapter and canno load the + Requests will be queued if the model server has reached MaxActiveAdapter and cannot load the requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` diff --git a/docs/proposals/004-endpoint-picker-protocol/README.md b/docs/proposals/004-endpoint-picker-protocol/README.md index 1e27ff0f..3657a10e 100644 --- a/docs/proposals/004-endpoint-picker-protocol/README.md +++ b/docs/proposals/004-endpoint-picker-protocol/README.md @@ -7,7 +7,7 @@ found [here](../../../pkg/epp/). 
This doc defines the protocol between the EPP and the proxy (e.g, Envoy). The EPP MUST implement the Envoy -[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor)protocol. +[external processing service](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor) protocol. For each HTTP request, the EPP MUST communicate to the proxy the picked model server endpoint via: From ddd066ae09550acb13ec0d81ca0dfc1ef8d6b0ef Mon Sep 17 00:00:00 2001 From: Rob Scott Date: Mon, 3 Mar 2025 19:57:42 -0800 Subject: [PATCH 68/96] Adding Accepted and ResolvedRefs conditions to InferencePool (#446) --- api/v1alpha2/inferencepool_types.go | 61 ++++++++++++++++--- ...ce.networking.x-k8s.io_inferencepools.yaml | 2 +- 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index ca76f347..19ec799f 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -159,10 +159,11 @@ type InferencePoolStatus struct { Parents []PoolStatus `json:"parent,omitempty"` } -// PoolStatus defines the observed state of InferencePool from a gateway. +// PoolStatus defines the observed state of InferencePool from a Gateway. type PoolStatus struct { // GatewayRef indicates the gateway that observed state of InferencePool. GatewayRef corev1.ObjectReference `json:"parentRef"` + // Conditions track the state of the InferencePool. 
// // Known condition types are: @@ -180,27 +181,67 @@ type PoolStatus struct { // InferencePoolConditionType is a type of condition for the InferencePool type InferencePoolConditionType string -// InferencePoolConditionReason is the reason for a given InferencePoolConditionType -type InferencePoolConditionReason string +// InferencePoolReason is the reason for a given InferencePoolConditionType +type InferencePoolReason string const ( - // PoolConditionReady indicates if the pool is ready to accept traffic, and if not, why. + // This condition indicates whether the route has been accepted or rejected + // by a Gateway, and why. // // Possible reasons for this condition to be True are: // - // * "Ready" + // * "Accepted" + // + // Possible reasons for this condition to be False are: + // + // * "NotSupportedByGateway" // // Possible reasons for this condition to be Unknown are: // // * "Pending" // - PoolConditionReady InferencePoolConditionType = "Ready" + // Controllers MAY raise this condition with other reasons, but should + // prefer to use the reasons listed above to improve interoperability. + InferencePoolConditionAccepted InferencePoolConditionType = "Accepted" + + // This reason is used with the "Accepted" condition when the Route has been + // accepted by the Gateway. + InferencePoolReasonAccepted InferencePoolReason = "Accepted" + + // This reason is used with the "Accepted" condition when the InferencePool + // has not been accepted by a Gateway because the Gateway does not support + // InferencePool as a backend. + InferencePoolReasonNotSupportedByGateway InferencePoolReason = "NotSupportedByGateway" + + // This reason is used with the "Accepted" condition when a controller has not yet + // reconciled the route. + InferencePoolReasonPending InferencePoolReason = "Pending" +) + +const ( + // This condition indicates whether the controller was able to resolve all + // the object references for the InferencePool. 
+ // + // Possible reasons for this condition to be true are: + // + // * "ResolvedRefs" + // + // Possible reasons for this condition to be False are: + // + // * "InvalidExtensionRef" + // + // Controllers MAY raise this condition with other reasons, but should + // prefer to use the reasons listed above to improve interoperability. + ModelConditionResolvedRefs InferencePoolConditionType = "ResolvedRefs" - // PoolReasonReady is the desired state. The pool and its components are initialized and ready for traffic. - PoolReasonReady InferencePoolConditionReason = "Ready" + // This reason is used with the "ResolvedRefs" condition when the condition + // is true. + ModelReasonResolvedRefs InferencePoolReason = "ResolvedRefs" - // PoolReasonPending is the initial state, and indicates that the controller has not yet reconciled this pool. - PoolReasonPending InferencePoolConditionReason = "Pending" + // This reason is used with the "ResolvedRefs" condition when the + // ExtensionRef is invalid in some way. This can include an unsupported kind + // or API group, or a reference to a resource that can not be found. + ModelReasonInvalidExtensionRef InferencePoolReason = "InvalidExtensionRef" ) func init() { diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 15b79b69..5767508b 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -146,7 +146,7 @@ spec: means the route has not been attached to any Gateway. items: description: PoolStatus defines the observed state of InferencePool - from a gateway. + from a Gateway. 
properties: conditions: default: From 83442b0f548ba60b5c1e5c4c0d6fb6c74319fc7e Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 4 Mar 2025 10:47:43 -0500 Subject: [PATCH 69/96] Add code for Envoy extension that support body-to-header translation (#355) --- body-based-routing.Dockerfile | 30 ++++ cmd/body-based-routing/health.go | 40 +++++ cmd/body-based-routing/main.go | 137 ++++++++++++++++++ pkg/body-based-routing/README.md | 14 ++ pkg/body-based-routing/handlers/request.go | 97 +++++++++++++ .../handlers/request_test.go | 128 ++++++++++++++++ pkg/body-based-routing/handlers/response.go | 48 ++++++ pkg/body-based-routing/handlers/server.go | 90 ++++++++++++ pkg/body-based-routing/server/runserver.go | 120 +++++++++++++++ 9 files changed, 704 insertions(+) create mode 100644 body-based-routing.Dockerfile create mode 100644 cmd/body-based-routing/health.go create mode 100644 cmd/body-based-routing/main.go create mode 100644 pkg/body-based-routing/README.md create mode 100644 pkg/body-based-routing/handlers/request.go create mode 100644 pkg/body-based-routing/handlers/request_test.go create mode 100644 pkg/body-based-routing/handlers/response.go create mode 100644 pkg/body-based-routing/handlers/server.go create mode 100644 pkg/body-based-routing/server/runserver.go diff --git a/body-based-routing.Dockerfile b/body-based-routing.Dockerfile new file mode 100644 index 00000000..e0afcf20 --- /dev/null +++ b/body-based-routing.Dockerfile @@ -0,0 +1,30 @@ +# Dockerfile has specific requirement to put this ARG at the beginning: +# https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact +ARG BUILDER_IMAGE=golang:1.23 +ARG BASE_IMAGE=gcr.io/distroless/static:nonroot + +## Multistage build +FROM ${BUILDER_IMAGE} AS builder +ENV CGO_ENABLED=0 +ENV GOOS=linux +ENV GOARCH=amd64 + +# Dependencies +WORKDIR /src +COPY go.mod go.sum ./ +RUN go mod download + +# Sources +COPY cmd ./cmd +COPY pkg ./pkg +COPY internal ./internal +WORKDIR 
/src/cmd/body-based-routing +RUN go build -o /body-based-routing + +## Multistage deploy +FROM ${BASE_IMAGE} + +WORKDIR / +COPY --from=builder /body-based-routing /body-based-routing + +ENTRYPOINT ["/body-based-routing"] diff --git a/cmd/body-based-routing/health.go b/cmd/body-based-routing/health.go new file mode 100644 index 00000000..7d1b5fd5 --- /dev/null +++ b/cmd/body-based-routing/health.go @@ -0,0 +1,40 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + + "github.com/go-logr/logr" + "google.golang.org/grpc/codes" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + "google.golang.org/grpc/status" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +type healthServer struct { + logger logr.Logger +} + +func (s *healthServer) Check(ctx context.Context, in *healthPb.HealthCheckRequest) (*healthPb.HealthCheckResponse, error) { + s.logger.V(logutil.VERBOSE).Info("gRPC health check serving", "service", in.Service) + return &healthPb.HealthCheckResponse{Status: healthPb.HealthCheckResponse_SERVING}, nil +} + +func (s *healthServer) Watch(in *healthPb.HealthCheckRequest, srv healthPb.Health_WatchServer) error { + return status.Error(codes.Unimplemented, "Watch is not implemented") +} diff --git a/cmd/body-based-routing/main.go b/cmd/body-based-routing/main.go new file mode 100644 index 00000000..3f586788 --- /dev/null +++ b/cmd/body-based-routing/main.go @@ -0,0 
+1,137 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "flag" + "os" + + "github.com/go-logr/logr" + uberzap "go.uber.org/zap" + "go.uber.org/zap/zapcore" + "google.golang.org/grpc" + healthPb "google.golang.org/grpc/health/grpc_health_v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +var ( + grpcPort = flag.Int( + "grpcPort", + runserver.DefaultGrpcPort, + "The gRPC port used for communicating with Envoy proxy") + grpcHealthPort = flag.Int( + "grpcHealthPort", + 9003, + "The port used for gRPC liveness and readiness probes") + logVerbosity = flag.Int("v", logging.DEFAULT, "number for the log level verbosity") + + setupLog = ctrl.Log.WithName("setup") +) + +func main() { + if err := run(); err != nil { + os.Exit(1) + } +} + +func run() error { + opts := zap.Options{Development: true} + opts.BindFlags(flag.CommandLine) + flag.Parse() + initLogging(&opts) + + // Print all flag values + flags := make(map[string]any) + flag.VisitAll(func(f *flag.Flag) { + flags[f.Name] = f.Value + }) + setupLog.Info("Flags processed", "flags", flags) + + // Init runtime. 
+ cfg, err := ctrl.GetConfig() + if err != nil { + setupLog.Error(err, "Failed to get rest config") + return err + } + + mgr, err := ctrl.NewManager(cfg, ctrl.Options{}) + if err != nil { + setupLog.Error(err, "Failed to create manager", "config", cfg) + return err + } + + ctx := ctrl.SetupSignalHandler() + + // Setup runner. + serverRunner := &runserver.ExtProcServerRunner{GrpcPort: *grpcPort} + + // Register health server. + if err := registerHealthServer(mgr, ctrl.Log.WithName("health"), *grpcHealthPort); err != nil { + return err + } + + // Register ext-proc server. + if err := mgr.Add(serverRunner.AsRunnable(ctrl.Log.WithName("ext-proc"))); err != nil { + setupLog.Error(err, "Failed to register ext-proc gRPC server") + return err + } + + // Start the manager. This blocks until a signal is received. + setupLog.Info("Manager starting") + if err := mgr.Start(ctx); err != nil { + setupLog.Error(err, "Error starting manager") + return err + } + setupLog.Info("Manager terminated") + return nil +} + +// registerHealthServer adds the Health gRPC server as a Runnable to the given manager. 
+func registerHealthServer(mgr manager.Manager, logger logr.Logger, port int) error { + srv := grpc.NewServer() + healthPb.RegisterHealthServer(srv, &healthServer{ + logger: logger, + }) + if err := mgr.Add( + runnable.NoLeaderElection(runnable.GRPCServer("health", srv, port))); err != nil { + setupLog.Error(err, "Failed to register health server") + return err + } + return nil +} + +func initLogging(opts *zap.Options) { + useV := true + flag.Visit(func(f *flag.Flag) { + if f.Name == "zap-log-level" { + useV = false + } + }) + if useV { + // See https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/log/zap#Options.Level + lvl := -1 * (*logVerbosity) + opts.Level = uberzap.NewAtomicLevelAt(zapcore.Level(int8(lvl))) + } + + logger := zap.New(zap.UseFlagOptions(opts), zap.RawZapOpts(uberzap.AddCaller())) + ctrl.SetLogger(logger) +} diff --git a/pkg/body-based-routing/README.md b/pkg/body-based-routing/README.md new file mode 100644 index 00000000..b5b6f770 --- /dev/null +++ b/pkg/body-based-routing/README.md @@ -0,0 +1,14 @@ +# Body-Based Routing +This package provides an extension that can be deployed to write the `model` +HTTP body parameter as a header (X-Gateway-Model-Name) so as to enable routing capabilities on the +model name. + +As per OpenAI spec, it is standard for the model name to be included in the +body of the HTTP request. However, most implementations do not support routing +based on the request body. This extension helps bridge that gap for clients. +This extension works by parsing the request body. If it finds a `model` parameter in the +request body, it will copy the value of that parameter into a request header. + +This extension is intended to be paired with an `ext_proc` capable Gateway. There is not +a standard way to represent this kind of extension in Gateway API yet, so we recommend +referring to implementation-specific documentation for how to deploy this extension. 
diff --git a/pkg/body-based-routing/handlers/request.go b/pkg/body-based-routing/handlers/request.go new file mode 100644 index 00000000..3c5037a9 --- /dev/null +++ b/pkg/body-based-routing/handlers/request.go @@ -0,0 +1,97 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package handlers + +import ( + "context" + "encoding/json" + "fmt" + + basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "sigs.k8s.io/controller-runtime/pkg/log" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// HandleRequestBody handles request bodies. 
+func (s *Server) HandleRequestBody(ctx context.Context, body *eppb.HttpBody) (*eppb.ProcessingResponse, error) { + logger := log.FromContext(ctx) + + var data map[string]any + if err := json.Unmarshal(body.GetBody(), &data); err != nil { + return nil, err + } + + modelVal, ok := data["model"] + if !ok { + logger.V(logutil.DEFAULT).Info("Request body does not contain model parameter") + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{}, + }, + }, nil + } + + modelStr, ok := modelVal.(string) + if !ok { + logger.V(logutil.DEFAULT).Info("Model parameter value is not a string") + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{}, + }, + }, fmt.Errorf("the model parameter value %v is not a string", modelVal) + } + + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestBody{ + RequestBody: &eppb.BodyResponse{ + Response: &eppb.CommonResponse{ + // Necessary so that the new headers are used in the routing decision. + ClearRouteCache: true, + HeaderMutation: &eppb.HeaderMutation{ + SetHeaders: []*basepb.HeaderValueOption{ + { + Header: &basepb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte(modelStr), + }, + }, + }, + }, + }, + }, + }, + }, nil +} + +// HandleRequestHeaders handles request headers. +func (s *Server) HandleRequestHeaders(headers *eppb.HttpHeaders) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &eppb.HeadersResponse{}, + }, + }, nil +} + +// HandleRequestTrailers handles request trailers. 
+func (s *Server) HandleRequestTrailers(trailers *eppb.HttpTrailers) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_RequestTrailers{ + RequestTrailers: &eppb.TrailersResponse{}, + }, + }, nil +} diff --git a/pkg/body-based-routing/handlers/request_test.go b/pkg/body-based-routing/handlers/request_test.go new file mode 100644 index 00000000..9bdac521 --- /dev/null +++ b/pkg/body-based-routing/handlers/request_test.go @@ -0,0 +1,128 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package handlers + +import ( + "context" + "testing" + + basepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/testing/protocmp" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + bodyWithModel = ` + { + "model": "foo", + "prompt": "Tell me a joke" + } + ` + bodyWithModelNoStr = ` + { + "model": 1, + "prompt": "Tell me a joke" + } + ` + bodyWithoutModel = ` + { + "prompt": "Tell me a joke" + } + ` +) + +func TestHandleRequestBody(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + + tests := []struct { + name string + body *extProcPb.HttpBody + want *extProcPb.ProcessingResponse + wantErr bool + }{ + { + name: "malformed body", + body: &extProcPb.HttpBody{ + Body: []byte("malformed json"), + }, + wantErr: true, + }, + { + name: "model not found", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithoutModel), + }, + want: &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{}, + }, + }, + }, + { + name: "model is not string", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithModelNoStr), + }, + wantErr: true, + }, + { + name: "success", + body: &extProcPb.HttpBody{ + Body: []byte(bodyWithModel), + }, + want: &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + // Necessary so that the new headers are used in the routing decision. 
+ ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*basepb.HeaderValueOption{ + { + Header: &basepb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("foo"), + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + server := &Server{} + resp, err := server.HandleRequestBody(ctx, test.body) + if err != nil { + if !test.wantErr { + t.Fatalf("HandleRequestBody returned unexpected error: %v, want %v", err, test.wantErr) + } + return + } + + if diff := cmp.Diff(test.want, resp, protocmp.Transform()); diff != "" { + t.Errorf("HandleRequestBody returned unexpected response, diff(-want, +got): %v", diff) + } + }) + } +} diff --git a/pkg/body-based-routing/handlers/response.go b/pkg/body-based-routing/handlers/response.go new file mode 100644 index 00000000..a62aa076 --- /dev/null +++ b/pkg/body-based-routing/handlers/response.go @@ -0,0 +1,48 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package handlers + +import ( + eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" +) + +// HandleResponseHeaders handles response headers. 
+func (s *Server) HandleResponseHeaders(headers *eppb.HttpHeaders) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &eppb.HeadersResponse{}, + }, + }, nil +} + +// HandleResponseBody handles response bodies. +func (s *Server) HandleResponseBody(body *eppb.HttpBody) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseBody{ + ResponseBody: &eppb.BodyResponse{}, + }, + }, nil +} + +// HandleResponseTrailers handles response trailers. +func (s *Server) HandleResponseTrailers(trailers *eppb.HttpTrailers) (*eppb.ProcessingResponse, error) { + return &eppb.ProcessingResponse{ + Response: &eppb.ProcessingResponse_ResponseTrailers{ + ResponseTrailers: &eppb.TrailersResponse{}, + }, + }, nil +} diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/body-based-routing/handlers/server.go new file mode 100644 index 00000000..434dd530 --- /dev/null +++ b/pkg/body-based-routing/handlers/server.go @@ -0,0 +1,90 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package handlers + +import ( + "context" + "errors" + "io" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "sigs.k8s.io/controller-runtime/pkg/log" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func NewServer() *Server { + return &Server{} +} + +// Server implements the Envoy external processing server. +// https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto +type Server struct{} + +func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { + ctx := srv.Context() + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing") + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + req, recvErr := srv.Recv() + if recvErr == io.EOF || errors.Is(recvErr, context.Canceled) { + return nil + } + if recvErr != nil { + // This error occurs very frequently, though it doesn't seem to have any impact. + // TODO Figure out if we can remove this noise. 
+ loggerVerbose.Error(recvErr, "Cannot receive stream request") + return status.Errorf(codes.Unknown, "cannot receive stream request: %v", recvErr) + } + + var resp *extProcPb.ProcessingResponse + var err error + switch v := req.Request.(type) { + case *extProcPb.ProcessingRequest_RequestHeaders: + resp, err = s.HandleRequestHeaders(req.GetRequestHeaders()) + case *extProcPb.ProcessingRequest_RequestBody: + resp, err = s.HandleRequestBody(ctx, req.GetRequestBody()) + case *extProcPb.ProcessingRequest_ResponseHeaders: + resp, err = s.HandleResponseHeaders(req.GetResponseHeaders()) + case *extProcPb.ProcessingRequest_ResponseBody: + resp, err = s.HandleResponseBody(req.GetResponseBody()) + default: + logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v) + return status.Error(codes.Unknown, "unknown request type") + } + + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) + return status.Errorf(status.Code(err), "failed to handle request: %v", err) + } + + loggerVerbose.Info("Response generated", "response", resp) + if err := srv.Send(resp); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Send failed") + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } +} diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/body-based-routing/server/runserver.go new file mode 100644 index 00000000..b04602bb --- /dev/null +++ b/pkg/body-based-routing/server/runserver.go @@ -0,0 +1,120 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "context" + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "math/big" + "time" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/handlers" +) + +// ExtProcServerRunner provides methods to manage an external process server. +type ExtProcServerRunner struct { + GrpcPort int +} + +// Default values for CLI flags in main +const ( + DefaultGrpcPort = 9002 // default for --grpcPort +) + +func NewDefaultExtProcServerRunner() *ExtProcServerRunner { + return &ExtProcServerRunner{ + GrpcPort: DefaultGrpcPort, + } +} + +// AsRunnable returns a Runnable that can be used to start the ext-proc gRPC server. +// The runnable implements LeaderElectionRunnable with leader election disabled. +func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { + return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { + cert, err := createSelfSignedTLSCertificate(logger) + if err != nil { + logger.Error(err, "Failed to create self signed certificate") + return err + } + creds := credentials.NewTLS(&tls.Config{Certificates: []tls.Certificate{cert}}) + + srv := grpc.NewServer(grpc.Creds(creds)) + extProcPb.RegisterExternalProcessorServer( + srv, + handlers.NewServer(), + ) + + // Forward to the gRPC runnable. 
+ return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) + })) +} + +func createSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + if err != nil { + logger.Error(err, "Failed to create serial number for self-signed cert") + return tls.Certificate{}, err + } + now := time.Now() + notBefore := now.UTC() + template := x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + Organization: []string{"Inference Ext"}, + }, + NotBefore: notBefore, + NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + } + + priv, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + logger.Error(err, "Failed to generate key for self-signed cert") + return tls.Certificate{}, err + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + if err != nil { + logger.Error(err, "Failed to create self-signed certificate") + return tls.Certificate{}, err + } + + certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) + + privBytes, err := x509.MarshalPKCS8PrivateKey(priv) + if err != nil { + logger.Error(err, "Failed to marshal private key for self-signed certificate") + return tls.Certificate{}, err + } + keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) + + return tls.X509KeyPair(certBytes, keyBytes) +} From 45e95330ec5a3fc017231f9c1d1f9606529e3015 Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 4 Mar 2025 11:27:43 -0500 Subject: [PATCH 70/96] Add Makefile + cloudbuild configs for body-based routing extension (#442) --- Makefile | 47 +++++++++++++++++++++++++++++++++++++++++++++++ cloudbuild.yaml | 8 ++++++++ 2 files changed, 
55 insertions(+) diff --git a/Makefile b/Makefile index 8d02a5e8..61b17f5b 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,7 @@ DOCKER_BUILDX_CMD ?= docker buildx IMAGE_BUILD_CMD ?= $(DOCKER_BUILDX_CMD) build IMAGE_BUILD_EXTRA_OPTS ?= SYNCER_IMAGE_BUILD_EXTRA_OPTS ?= +BBR_IMAGE_BUILD_EXTRA_OPTS ?= IMAGE_REGISTRY ?= us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension IMAGE_NAME := epp IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(IMAGE_NAME) @@ -36,6 +37,10 @@ SYNCER_IMAGE_NAME := lora-syncer SYNCER_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(SYNCER_IMAGE_NAME) SYNCER_IMAGE_TAG ?= $(SYNCER_IMAGE_REPO):$(GIT_TAG) +BBR_IMAGE_NAME := bbr +BBR_IMAGE_REPO ?= $(IMAGE_REGISTRY)/$(BBR_IMAGE_NAME) +BBR_IMAGE_TAG ?= $(BBR_IMAGE_REPO):$(GIT_TAG) + BASE_IMAGE ?= gcr.io/distroless/static:nonroot BUILDER_IMAGE ?= golang:1.23 ifdef GO_VERSION @@ -45,10 +50,12 @@ endif ifdef EXTRA_TAG IMAGE_EXTRA_TAG ?= $(IMAGE_REPO):$(EXTRA_TAG) SYNCER_IMAGE_EXTRA_TAG ?= $(SYNCER_IMAGE_REPO):$(EXTRA_TAG) +BBR_IMAGE_EXTRA_TAG ?= $(BBR_IMAGE_REPO):$(EXTRA_TAG) endif ifdef IMAGE_EXTRA_TAG IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG) SYNCER_IMAGE_BUILD_EXTRA_OPTS += -t $(SYNCER_IMAGE_EXTRA_TAG) +BBR_IMAGE_BUILD_EXTRA_OPTS += -t $(BBR_IMAGE_EXTRA_TAG) endif # The name of the kind cluster to use for the "kind-load" target. @@ -203,6 +210,46 @@ syncer-image-build: syncer-image-push: PUSH=--push syncer-image-push: syncer-image-build +##@ Body-based Routing extension + +# Build the container image +.PHONY: bbr-image-local-build +bbr-image-local-build: ## Build the image using Docker Buildx for local development. + BUILDER=$(shell $(DOCKER_BUILDX_CMD) create --use) + $(MAKE) bbr-image-build PUSH=$(PUSH) + $(MAKE) bbr-image-build LOAD=$(LOAD) + $(DOCKER_BUILDX_CMD) rm $$BUILDER + +.PHONY: bbr-image-local-push +bbr-image-local-push: PUSH=--push ## Build the image for local development and push it to $IMAGE_REPO. 
+bbr-image-local-push: bbr-image-local-build + +.PHONY: bbr-image-local-load +bbr-image-local-load: LOAD=--load ## Build the image for local development and load it in the local Docker registry. +bbr-image-local-load: bbr-image-local-build + +.PHONY: bbr-image-build +bbr-image-build: ## Build the image using Docker Buildx. + $(IMAGE_BUILD_CMD) -f body-based-routing.Dockerfile -t $(BBR_IMAGE_TAG) \ + --platform=$(PLATFORMS) \ + --build-arg BASE_IMAGE=$(BASE_IMAGE) \ + --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ + $(PUSH) \ + $(LOAD) \ + . + +.PHONY: bbr-image-push +bbr-image-push: PUSH=--push ## Build the image and push it to $IMAGE_REPO. +bbr-image-push: bbr-image-build + +.PHONY: bbr-image-load +bbr-image-load: LOAD=--load ## Build the image and load it in the local Docker registry. +bbr-image-load: bbr-image-build + +.PHONY: bbr-image-kind +bbr-image-kind: bbr-image-build ## Build the image and load it to kind cluster $KIND_CLUSTER ("kind" by default). + kind load docker-image $(BBR_IMAGE_TAG) --name $(KIND_CLUSTER) + ##@ Docs .PHONY: build-docs diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 9b345c18..3a8e008f 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -20,6 +20,14 @@ steps: - GIT_TAG=$_GIT_TAG - EXTRA_TAG=$_PULL_BASE_REF - DOCKER_BUILDX_CMD=/buildx-entrypoint + - name: gcr.io/k8s-testimages/gcb-docker-gcloud:v20220830-45cbff55bc + entrypoint: make + args: + - bbr-image-push + env: + - GIT_TAG=$_GIT_TAG + - EXTRA_TAG=$_PULL_BASE_REF + - DOCKER_BUILDX_CMD=/buildx-entrypoint substitutions: # _GIT_TAG will be filled with a git-based tag for the image, of the form vYYYYMMDD-hash, and # can be used as a substitution From 7208cff5cca57989b1c346191e373eb143e16ed6 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 4 Mar 2025 19:57:49 +0200 Subject: [PATCH 71/96] added cpu based example (#436) * added cpu based example to quickstart Signed-off-by: Nir Rozenbaum * removed quickstart cleanup instructions Signed-off-by: Nir Rozenbaum --------- 
Signed-off-by: Nir Rozenbaum --- config/manifests/ext_proc.yaml | 6 +- config/manifests/inferencemodel.yaml | 2 +- config/manifests/vllm/cpu-deployment.yaml | 101 ++++++++++++++++++ .../{deployment.yaml => gpu-deployment.yaml} | 6 +- site-src/guides/index.md | 31 +++++- 5 files changed, 134 insertions(+), 12 deletions(-) create mode 100644 config/manifests/vllm/cpu-deployment.yaml rename config/manifests/vllm/{deployment.yaml => gpu-deployment.yaml} (97%) diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index f96113e1..60a0fc3e 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -44,11 +44,11 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: labels: - name: vllm-llama2-7b-pool + name: my-pool spec: targetPortNumber: 8000 selector: - app: vllm-llama2-7b-pool + app: my-pool extensionRef: name: inference-gateway-ext-proc --- @@ -75,7 +75,7 @@ spec: imagePullPolicy: Always args: - -poolName - - "vllm-llama2-7b-pool" + - "my-pool" - -v - "3" - -grpcPort diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 57240298..94c36d84 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -6,7 +6,7 @@ spec: modelName: tweet-summary criticality: Critical poolRef: - name: vllm-llama2-7b-pool + name: my-pool targetModels: - name: tweet-summary-1 weight: 100 diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml new file mode 100644 index 00000000..a0925c83 --- /dev/null +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-pool +spec: + replicas: 3 + selector: + matchLabels: + app: my-pool + template: + metadata: + labels: + app: my-pool + spec: + containers: + - name: lora + image: "seedjeffwan/vllm-cpu-env:bb392af4-20250203" + imagePullPolicy: Always + command: ["python3", "-m", 
"vllm.entrypoints.openai.api_server"] + args: + - "--model" + - "Qwen/Qwen2.5-1.5B-Instruct" + - "--port" + - "8000" + - "--enable-lora" + - "--lora-modules" + - '{"name": "tweet-summary-0", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' + - '{"name": "tweet-summary-1", "path": "/adapters/hub/models--ai-blond--Qwen-Qwen2.5-Coder-1.5B-Instruct-lora/snapshots/9cde18d8ed964b0519fb481cca6acd936b2ca811"}' + env: + - name: PORT + value: "8000" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "true" + ports: + - containerPort: 8000 + name: http + protocol: TCP + livenessProbe: + failureThreshold: 240 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 600 + httpGet: + path: /health + port: http + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + volumeMounts: + - mountPath: /data + name: data + - mountPath: /dev/shm + name: shm + - name: adapters + mountPath: "/adapters" + initContainers: + - name: adapter-loader + image: ghcr.io/tomatillo-and-multiverse/adapter-puller:demo + command: ["python"] + args: + - ./pull_adapters.py + - --adapter + - ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora + - --duplicate-count + - "4" + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: HF_HOME + value: /adapters + volumeMounts: + - name: adapters + mountPath: "/adapters" + restartPolicy: Always + schedulerName: default-scheduler + terminationGracePeriodSeconds: 30 + volumes: + - name: data + emptyDir: {} + - name: shm + emptyDir: + medium: Memory + - name: adapters + emptyDir: {} diff --git a/config/manifests/vllm/deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml similarity index 97% 
rename from config/manifests/vllm/deployment.yaml rename to config/manifests/vllm/gpu-deployment.yaml index 51689c9f..d16a46a4 100644 --- a/config/manifests/vllm/deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -1,16 +1,16 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: vllm-llama2-7b-pool + name: my-pool spec: replicas: 3 selector: matchLabels: - app: vllm-llama2-7b-pool + app: my-pool template: metadata: labels: - app: vllm-llama2-7b-pool + app: my-pool spec: containers: - name: lora diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 2949d387..976368ac 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -5,19 +5,40 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ## **Prerequisites** - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - A cluster with: - - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, - you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). - - 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed. + - Support for Services of type `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). + For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). ## **Steps** ### Deploy Sample Model Server + This quickstart guide contains two options for setting up model server: + + 1. GPU-based model server. + Requirements: a Hugging Face access token that grants access to the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). + + 1. CPU-based model server (not using GPUs). 
+ Requirements: a Hugging Face access token that grants access to the model [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct). + + Choose one of these options and follow the steps below. Please do not deploy both, as the deployments have the same name and will override each other. + +#### GPU-Based Model Server + + For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 
+ ```bash + kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Qwen + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml ``` ### Install the Inference Extension CRDs @@ -49,7 +70,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml ``` - > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./manifests/gateway/ext-proc.yaml` file, and an additional `./manifests/gateway/patch_policy.yaml` file. ***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** + > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. 
***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** Confirm that the Gateway was assigned an IP address and reports a `Programmed=True` status: ```bash From 61185343a1a63edc96209425309a87a7051c804a Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 4 Mar 2025 20:11:44 +0200 Subject: [PATCH 72/96] updated cleanup section in quickstart (#448) Signed-off-by: Nir Rozenbaum --- site-src/guides/index.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 976368ac..98ae94a3 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -114,3 +114,22 @@ This quickstart guide is intended for engineers familiar with k8s and model serv "temperature": 0 }' ``` + +### Cleanup + + The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. + please be careful not to delete resources you'd like to keep. + ```bash + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found + kubectl delete -f 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete secret hf-token --ignore-not-found + ``` \ No newline at end of file From dfe8d9c62c0e06b522010d56fcfcb100014d8e85 Mon Sep 17 00:00:00 2001 From: kaushik mitra Date: Tue, 4 Mar 2025 10:37:43 -0800 Subject: [PATCH 73/96] scheduling changes for lora affinity load balancing (#423) * scheduling changes for lora affinity load balancing * refactor unit tests, address comments * restore vllm deployment manifest * update README for model server protocol to add waiting lora adapters * remove unused variables * removed unused func * fix model protocol readme * fix hermetic test for select active lora, low queue * update comment in metrics.go in vllm backend * add filter test TestLoRASoftAffinityDistribution * restore vllm manifest * update unit test --- .../003-model-server-protocol/README.md | 1 + pkg/epp/backend/vllm/metrics.go | 45 +++++++++- pkg/epp/scheduling/filter.go | 61 +++++++++++-- pkg/epp/scheduling/filter_test.go | 90 +++++++++++++++++++ pkg/epp/scheduling/scheduler.go | 20 ++--- test/integration/hermetic_test.go | 3 +- 6 files changed, 196 insertions(+), 24 deletions(-) diff --git a/docs/proposals/003-model-server-protocol/README.md 
b/docs/proposals/003-model-server-protocol/README.md index 2ab557f7..02efbe5c 100644 --- a/docs/proposals/003-model-server-protocol/README.md +++ b/docs/proposals/003-model-server-protocol/README.md @@ -47,3 +47,4 @@ The model server MUST expose the following LoRA adapter metrics via the same Pro requested adapter. Example: `"max_lora": "8"`. * `running_lora_adapters`: A comma separated list of adapters that are currently loaded in GPU memory and ready to serve requests. Example: `"running_lora_adapters": "adapter1, adapter2"` + * `waiting_lora_adapters`: A comma separated list of adapters that are waiting to be served. Example: `"waiting_lora_adapters": "adapter1, adapter2"` diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 4973c93e..5b36b930 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -34,9 +34,13 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +// Metric names used in the vLLM metrics implementation. +// Refer to the protocol doc for more details: +// https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol const ( LoraRequestInfoMetricName = "vllm:lora_requests_info" LoraRequestInfoRunningAdaptersMetricName = "running_lora_adapters" + LoraRequestInfoWaitingAdaptersMetricName = "waiting_lora_adapters" LoraRequestInfoMaxAdaptersMetricName = "max_lora" // TODO: Replace these with the num_tokens_running/waiting below once we add those to the fork. 
RunningQueueSizeMetricName = "vllm:num_requests_running" @@ -45,8 +49,7 @@ const ( RunningQueueSizeMetricName = "vllm:num_tokens_running" WaitingQueueSizeMetricName = "vllm:num_tokens_waiting" */ - KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" - KvCacheMaxTokenCapacityMetricName = "vllm:gpu_cache_max_token_capacity" + KVCacheUsagePercentMetricName = "vllm:gpu_cache_usage_perc" ) type PodMetricsClientImpl struct{} @@ -138,6 +141,14 @@ func promToPodMetrics( } } } + if label.GetName() == LoraRequestInfoWaitingAdaptersMetricName { + if label.GetValue() != "" { + adapterList := strings.Split(label.GetValue(), ",") + for _, adapter := range adapterList { + updated.ActiveModels[adapter] = 0 + } + } + } if label.GetName() == LoraRequestInfoMaxAdaptersMetricName { if label.GetValue() != "" { updated.MaxActiveModels, err = strconv.Atoi(label.GetValue()) @@ -163,14 +174,42 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } - var latestTs float64 + var latest *dto.Metric + var latestTs float64 + + // Iterate over all metrics in the family. for _, m := range loraRequests.GetMetric() { + var running, waiting string + // Read the label values for running and waiting adapters. + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case LoraRequestInfoRunningAdaptersMetricName: + running = lp.GetValue() + case LoraRequestInfoWaitingAdaptersMetricName: + waiting = lp.GetValue() + } + } + + // Ignore metrics with both labels empty. This happens when there are no running or waiting requests on + // the server, in this case it is best to use the last set of active adapters. + if running == "" && waiting == "" { + continue + } + + // Select the metric with the latest creation timestamp. 
if m.GetGauge().GetValue() > latestTs { latestTs = m.GetGauge().GetValue() latest = m } } + + if latest == nil { + logger.V(logutil.TRACE).Info("Metric value Empty", "value", latest, "metric", LoraRequestInfoMetricName) + return nil, time.Time{}, nil + } + + // Convert the gauge value (creation timestamp) to time.Time. return latest, time.Unix(0, int64(latestTs*1000)), nil } diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index b7881468..d3c22673 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -19,6 +19,8 @@ package scheduling import ( "errors" "math" + "math/rand" + "time" "github.com/go-logr/logr" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" @@ -183,18 +185,59 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { return ok || len(pod.ActiveModels) < pod.MaxActiveModels } -// loRAAffinityPredicate is a filter function to check whether a pod has affinity to the lora requested. -func loRAAffinityPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok -} +// loRASoftAffinityFilter implements a pod selection strategy that prioritizes pods +// with existing LoRA model affinity while allowing for load balancing through randomization. +// +// The function works by: +// 1. Separating pods into two groups: those with target model affinity and those with available capacity +// 2. Using a probability threshold to sometimes select from non-affinity pods to enable load balancing +// 3. 
Falling back to whatever group has pods if one group is empty +// +// Parameters: +// - logger: Logger interface for diagnostic output +// - req: LLM request containing the resolved target model +// - pods: Slice of pod metrics to filter +// +// Returns: +// - Filtered slice of pod metrics based on affinity and availability +// - Error if any issues occur during filtering +func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + + // Pre-allocate slices with estimated capacity + filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) + filtered_available := make([]*datastore.PodMetrics, 0, len(pods)) + + // Categorize pods based on affinity and availability + for _, pod := range pods { + + if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { + filtered_affinity = append(filtered_affinity, pod) + } else if len(pod.ActiveModels) < pod.MaxActiveModels { + filtered_available = append(filtered_available, pod) + } + } + + // Seed a math/rand generator from the current time; switch to crypto/rand if stronger randomness is ever required + randSource := rand.NewSource(time.Now().UnixNano()) + randGen := rand.New(randSource) + + // If both groups have pods, use probability to select which group to return + if len(filtered_affinity) > 0 && len(filtered_available) > 0 { + if randGen.Float64() < loraAffinityThreshold { + return filtered_affinity, nil + } + return filtered_available, nil + } + + // Return whichever group has pods + if len(filtered_affinity) > 0 { + return filtered_affinity, nil + } -// canAcceptNewLoraPredicate is a filter function to check whether a pod has room to load the adapter. 
-func canAcceptNewLoraPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { - return len(pod.ActiveModels) < pod.MaxActiveModels + return filtered_available, nil } -func criticalRequestPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { +func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { return req.Critical } diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index ac765b78..f76cece9 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -429,3 +429,93 @@ func TestFilterFunc(t *testing.T) { }) } } + +// TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function +// properly distributes requests according to the loraAffinityThreshold +func TestLoRASoftAffinityDistribution(t *testing.T) { + logger := logutil.NewTestLogger() + + const ( + testModelName = "test-model" + testAffinityModel = "test-affinity-model" + numIterations = 10000 + tolerancePercent = 5.0 // Allow 5% tolerance from expected distribution + ) + + // Create a test request and pods + req := &LLMRequest{ + Model: testAffinityModel, + ResolvedTargetModel: testAffinityModel, + } + + // Test setup: One affinity pod and one available pod + pods := []*datastore.PodMetrics{ + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{ + testAffinityModel: 1, + }, + }, + }, + { + Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, + Metrics: datastore.Metrics{ + MaxActiveModels: 2, + ActiveModels: map[string]int{}, + }, + }, + } + + // Run the filter function multiple times and count the results + affinityCount := 0 + availableCount := 0 + + // Use the actual loraAffinityThreshold as defined in the original code + // This test should work with whatever value is set there + expectedAffinityPercent := loraAffinityThreshold * 100 + for i := 0; i < 
numIterations; i++ { + result, err := loRASoftAffinityFilter(logger, req, pods) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + + // Check which type of pod was returned + if len(result) != 1 { + t.Fatalf("Expected exactly one pod in result, got %d", len(result)) + } + + // Identify if the returned pod is the affinity pod or available pod + if _, exists := result[0].ActiveModels[testAffinityModel]; exists { + affinityCount++ + } else { + availableCount++ + } + } + + // Calculate the actual percentages + actualAffinityPercent := float64(affinityCount) / float64(numIterations) * 100 + actualAvailablePercent := float64(availableCount) / float64(numIterations) * 100 + + // Check if the distribution matches expected threshold within tolerance + affinityLowerBound := expectedAffinityPercent - tolerancePercent + affinityUpperBound := expectedAffinityPercent + tolerancePercent + + availableLowerBound := actualAvailablePercent - tolerancePercent + availableUpperBound := actualAvailablePercent + tolerancePercent + + t.Logf("Distribution results over %d iterations:", numIterations) + t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, loraAffinityThreshold) + t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) + t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations) + + if actualAffinityPercent < affinityLowerBound || actualAffinityPercent > affinityUpperBound { + t.Errorf("Affinity selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAffinityPercent, affinityLowerBound, affinityUpperBound) + } + if actualAvailablePercent < availableLowerBound || actualAvailablePercent > availableUpperBound { + t.Errorf("Availability selection percent %.2f%% outside expected range %.2f%% to %.2f%%", + actualAvailablePercent, availableLowerBound, availableUpperBound) + } +} diff --git 
a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index a969948e..bdddd972 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -36,8 +36,11 @@ const ( queueThresholdCritical = 5 // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. // the threshold for queued requests to be considered low below which we can prioritize LoRA affinity. - // The value of 50 is arrived heuristicically based on experiments. - queueingThresholdLoRA = 50 + // The value of 128 is arrived at heuristically based on experiments. + queueingThresholdLoRA = 128 + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16) Make this configurable. + // loraAffinityThreshold indicates the probability with which we prefer a pod with LoRA affinity over a pod without but having room to fit more LoRA adapters. + loraAffinityThreshold = 0.999 ) var ( @@ -54,7 +57,7 @@ var ( filter: leastQueuingFilterFunc, nextOnSuccessOrFailure: &filter{ name: "low cost LoRA", - filter: toFilterFunc(lowLoRACostPredicate), + filter: loRASoftAffinityFilter, nextOnSuccessOrFailure: &filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, @@ -76,14 +79,9 @@ var ( name: "low queueing filter", filter: toFilterFunc((lowQueueingPodPredicate)), nextOnSuccess: &filter{ - name: "affinity LoRA", - filter: toFilterFunc(loRAAffinityPredicate), - nextOnSuccess: queueAndKVCacheFilter, - nextOnFailure: &filter{ - name: "can accept LoRA Adapter", - filter: toFilterFunc(canAcceptNewLoraPredicate), - nextOnSuccessOrFailure: queueAndKVCacheFilter, - }, + name: "affinity LoRA", + filter: loRASoftAffinityFilter, + nextOnSuccessOrFailure: queueAndKVCacheFilter, }, nextOnFailure: queueLoRAAndKVCacheFilter, } diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index 7755795b..cc836504 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/hermetic_test.go 
@@ -158,6 +158,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, + "bar": 1, }, }), }, @@ -200,7 +201,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }), extprocutils.FakePodMetrics(1, datastore.Metrics{ - WaitingQueueSize: 50, + WaitingQueueSize: 200, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ "foo": 1, From 48978f4e9ccb524a0c4942e7480a197dbeeba2c3 Mon Sep 17 00:00:00 2001 From: Rob Scott Date: Tue, 4 Mar 2025 12:59:45 -0800 Subject: [PATCH 74/96] Fixing default status on InferencePool (#449) --- api/v1alpha2/inferencepool_types.go | 5 +++-- .../bases/inference.networking.x-k8s.io_inferencepools.yaml | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index 19ec799f..e4350417 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -168,13 +168,14 @@ type PoolStatus struct { // // Known condition types are: // - // * "Ready" + // * "Accepted" + // * "ResolvedRefs" // // +optional // +listType=map // +listMapKey=type // +kubebuilder:validation:MaxItems=8 - // +kubebuilder:default={{type: "Ready", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} + // +kubebuilder:default={{type: "Accepted", status: "Unknown", reason:"Pending", message:"Waiting for controller", lastTransitionTime: "1970-01-01T00:00:00Z"}} Conditions []metav1.Condition `json:"conditions,omitempty"` } diff --git a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml index 5767508b..8386db82 100644 --- a/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml +++ b/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml @@ -154,13 +154,14 @@ spec: message: Waiting for controller reason: Pending status: Unknown - 
type: Ready + type: Accepted description: |- Conditions track the state of the InferencePool. Known condition types are: - * "Ready" + * "Accepted" + * "ResolvedRefs" items: description: Condition contains details for one aspect of the current state of this API Resource. From 9bd981b09bd144d3f97830e9c0aa763c29516419 Mon Sep 17 00:00:00 2001 From: Tiger Xu / Zhonghu Xu Date: Thu, 6 Mar 2025 01:35:45 +0800 Subject: [PATCH 75/96] Use server side namespace filter (#429) * Add filter to pod reconciler * use poolHasSynced * filter using server-side namespace filter * update object filter * update * remove unused scheme and namespace * Move controller manager build function to pkg/epp/server so we can better test it * Update integration test --- cmd/epp/main.go | 14 +--- .../controller/inferencemodel_reconciler.go | 4 +- .../inferencemodel_reconciler_test.go | 1 - .../controller/inferencepool_reconciler.go | 6 -- pkg/epp/controller/pod_reconciler.go | 9 +-- pkg/epp/server/controller_manager.go | 73 +++++++++++++++++++ pkg/epp/server/runserver.go | 3 - test/integration/hermetic_test.go | 4 +- 8 files changed, 81 insertions(+), 33 deletions(-) create mode 100644 pkg/epp/server/controller_manager.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index ab270c49..5b350bb2 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -30,16 +30,12 @@ import ( "go.uber.org/zap/zapcore" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" - "k8s.io/apimachinery/pkg/runtime" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/component-base/metrics/legacyregistry" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" @@ -97,15 +93,9 @@ var ( "are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+ "then a self-signed certificate is used.") - scheme = runtime.NewScheme() setupLog = ctrl.Log.WithName("setup") ) -func init() { - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(v1alpha2.AddToScheme(scheme)) -} - func main() { if err := run(); err != nil { os.Exit(1) @@ -140,9 +130,9 @@ func run() error { return err } - mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) + mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg) if err != nil { - setupLog.Error(err, "Failed to create controller manager", "config", cfg) + setupLog.Error(err, "Failed to create controller manager") return err } diff --git a/pkg/epp/controller/inferencemodel_reconciler.go b/pkg/epp/controller/inferencemodel_reconciler.go index 8318324f..a7f365b7 100644 --- a/pkg/epp/controller/inferencemodel_reconciler.go +++ b/pkg/epp/controller/inferencemodel_reconciler.go @@ -21,7 +21,6 @@ import ( "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" @@ -36,7 +35,6 @@ import ( type InferenceModelReconciler struct { client.Client - Scheme *runtime.Scheme Record record.EventRecorder Datastore datastore.Datastore PoolNamespacedName types.NamespacedName @@ -128,5 +126,5 @@ func (c *InferenceModelReconciler) SetupWithManager(ctx context.Context, mgr ctr } func (c *InferenceModelReconciler) eventPredicate(infModel *v1alpha2.InferenceModel) bool { - return (infModel.Spec.PoolRef.Name == v1alpha2.ObjectName(c.PoolNamespacedName.Name)) && (infModel.GetNamespace() == c.PoolNamespacedName.Namespace) + return string(infModel.Spec.PoolRef.Name) == c.PoolNamespacedName.Name } diff 
--git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index d5277919..2ac5bb1e 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -193,7 +193,6 @@ func TestInferenceModelReconciler(t *testing.T) { datastore := datastore.NewFakeDatastore(nil, test.modelsInStore, pool) reconciler := &InferenceModelReconciler{ Client: fakeClient, - Scheme: scheme, Record: record.NewFakeRecorder(10), Datastore: datastore, PoolNamespacedName: types.NamespacedName{Name: pool.Name, Namespace: pool.Namespace}, diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index 2ad7d2bb..880aec8c 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -21,13 +21,11 @@ import ( "reflect" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -38,7 +36,6 @@ import ( // will have the proper controller that will create/manage objects on behalf of the server pool. type InferencePoolReconciler struct { client.Client - Scheme *runtime.Scheme Record record.EventRecorder PoolNamespacedName types.NamespacedName Datastore datastore.Datastore @@ -90,8 +87,5 @@ func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool * func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferencePool{}). 
- WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool { - return (object.GetNamespace() == c.PoolNamespacedName.Namespace) && (object.GetName() == c.PoolNamespacedName.Name) - })). Complete(c) } diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 717d9f60..a6c897c2 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -22,7 +22,6 @@ import ( "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" @@ -35,19 +34,15 @@ import ( type PodReconciler struct { client.Client Datastore datastore.Datastore - Scheme *runtime.Scheme Record record.EventRecorder } func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - inferencePool, err := c.Datastore.PoolGet() - if err != nil { - logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet", "error", err) + if !c.Datastore.PoolHasSynced() { + logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet") // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. return ctrl.Result{}, nil - } else if inferencePool.Namespace != req.Namespace { - return ctrl.Result{}, nil } logger.V(logutil.VERBOSE).Info("Pod being reconciled", "name", req.NamespacedName) diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go new file mode 100644 index 00000000..fd505d00 --- /dev/null +++ b/pkg/epp/server/controller_manager.go @@ -0,0 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package server + +import ( + "fmt" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +var scheme = runtime.NewScheme() + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(v1alpha2.AddToScheme(scheme)) +} + +// NewDefaultManager creates a new controller manager with default configuration. 
+func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { + manager, err := ctrl.NewManager(restConfig, ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + ByObject: map[client.Object]cache.ByObject{ + &corev1.Pod{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + &v1alpha2.InferencePool{}: { + Namespaces: map[string]cache.Config{ + namespace: { + FieldSelector: fields.SelectorFromSet(fields.Set{ + "metadata.name": name, + }), + }, + }, + }, + &v1alpha2.InferenceModel{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + }, + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to create controller manager: %v", err) + } + return manager, nil +} diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index f3d9b6ac..4c0a7e53 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -89,7 +89,6 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man // Create the controllers and register them with the manager if err := (&controller.InferencePoolReconciler{ Datastore: r.Datastore, - Scheme: mgr.GetScheme(), Client: mgr.GetClient(), PoolNamespacedName: types.NamespacedName{ Name: r.PoolName, @@ -102,7 +101,6 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man if err := (&controller.InferenceModelReconciler{ Datastore: r.Datastore, - Scheme: mgr.GetScheme(), Client: mgr.GetClient(), PoolNamespacedName: types.NamespacedName{ Name: r.PoolName, @@ -115,7 +113,6 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man if err := (&controller.PodReconciler{ Datastore: r.Datastore, - Scheme: mgr.GetScheme(), Client: mgr.GetClient(), Record: mgr.GetEventRecorderFor("pod"), }).SetupWithManager(mgr); err != nil { diff --git a/test/integration/hermetic_test.go b/test/integration/hermetic_test.go index cc836504..4fba7832 100644 --- a/test/integration/hermetic_test.go +++ 
b/test/integration/hermetic_test.go @@ -58,6 +58,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -491,7 +492,8 @@ func BeforeSuit(t *testing.T) func() { // Init runtime. ctrl.SetLogger(logger) - mgr, err := ctrl.NewManager(cfg, ctrl.Options{Scheme: scheme}) + + mgr, err := server.NewDefaultManager("default", "vllm-llama2-7b-pool", cfg) if err != nil { logutil.Fatal(logger, err, "Failed to create controller manager") } From 5b823746a009da23cd6eacca68c1fb0a35be7bac Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 5 Mar 2025 19:53:47 +0200 Subject: [PATCH 76/96] fixed filepath that points to gpu based model server deployment in few places (#451) Signed-off-by: Nir Rozenbaum --- hack/release-quickstart.sh | 4 ++-- site-src/guides/index.md | 2 +- test/e2e/e2e_suite_test.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hack/release-quickstart.sh b/hack/release-quickstart.sh index a21047c3..832bd872 100755 --- a/hack/release-quickstart.sh +++ b/hack/release-quickstart.sh @@ -51,9 +51,9 @@ sed -i.bak '/us-central1-docker.pkg.dev\/k8s-staging-images\/gateway-api-inferen sed -i.bak -E "s|us-central1-docker\.pkg\.dev/k8s-staging-images|registry.k8s.io|g" "$EXT_PROC" # ----------------------------------------------------------------------------- -# Update config/manifests/vllm/deployment.yaml +# Update config/manifests/vllm/gpu-deployment.yaml # ----------------------------------------------------------------------------- -VLLM_DEPLOY="config/manifests/vllm/deployment.yaml" 
+VLLM_DEPLOY="config/manifests/vllm/gpu-deployment.yaml" echo "Updating ${VLLM_DEPLOY} ..." # Update the vLLM image version diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 98ae94a3..b7b31000 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -24,7 +24,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv #### GPU-Based Model Server - For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/deployment.yaml` as needed. + For this setup, you will need 3 GPUs to run the sample model server. Adjust the number of replicas in `./config/manifests/vllm/gpu-deployment.yaml` as needed. Create a Hugging Face secret to download the model [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf). Ensure that the token grants access to this model. Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. ```bash diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 3d068c9f..24a488db 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -69,7 +69,7 @@ const ( // clientManifest is the manifest for the client test resources. clientManifest = "../testdata/client.yaml" // modelServerManifest is the manifest for the model server test resources. - modelServerManifest = "../../config/manifests/vllm/deployment.yaml" + modelServerManifest = "../../config/manifests/vllm/gpu-deployment.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. modelServerSecretManifest = "../testdata/model-secret.yaml" // inferPoolManifest is the manifest for the inference pool CRD. 
From 0aa142d79479f675d8d339f05a3275ce08d2308e Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Wed, 5 Mar 2025 17:55:44 -0500 Subject: [PATCH 77/96] Add library for generating self-signed cert (#453) --- internal/tls/tls.go | 73 ++++++++++++++++++++++ pkg/body-based-routing/server/runserver.go | 55 +--------------- pkg/epp/server/runserver.go | 54 +--------------- 3 files changed, 77 insertions(+), 105 deletions(-) create mode 100644 internal/tls/tls.go diff --git a/internal/tls/tls.go b/internal/tls/tls.go new file mode 100644 index 00000000..fb8092c6 --- /dev/null +++ b/internal/tls/tls.go @@ -0,0 +1,73 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tls + +import ( + "crypto/rand" + "crypto/rsa" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/pem" + "fmt" + "math/big" + "time" + + "github.com/go-logr/logr" +) + +// CreateSelfSignedTLSCertificate creates a self-signed cert the server can use to serve TLS. 
+func CreateSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { + serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) + serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error creating serial number: %v", err) + } + now := time.Now() + notBefore := now.UTC() + template := x509.Certificate{ + SerialNumber: serialNumber, + Subject: pkix.Name{ + Organization: []string{"Inference Ext"}, + }, + NotBefore: notBefore, + NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years + KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + BasicConstraintsValid: true, + } + + priv, err := rsa.GenerateKey(rand.Reader, 4096) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error generating key: %v", err) + } + + derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error creating certificate: %v", err) + } + + certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) + + privBytes, err := x509.MarshalPKCS8PrivateKey(priv) + if err != nil { + return tls.Certificate{}, fmt.Errorf("error marshalling private key: %v", err) + } + keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) + + return tls.X509KeyPair(certBytes, keyBytes) +} diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/body-based-routing/server/runserver.go index b04602bb..3674c6cf 100644 --- a/pkg/body-based-routing/server/runserver.go +++ b/pkg/body-based-routing/server/runserver.go @@ -18,14 +18,7 @@ package server import ( "context" - "crypto/rand" - "crypto/rsa" "crypto/tls" - "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" - "math/big" - "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" @@ -33,6 +26,7 
@@ import ( "google.golang.org/grpc/credentials" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/handlers" ) @@ -56,7 +50,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { // The runnable implements LeaderElectionRunnable with leader election disabled. func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { - cert, err := createSelfSignedTLSCertificate(logger) + cert, err := tlsutil.CreateSelfSignedTLSCertificate(logger) if err != nil { logger.Error(err, "Failed to create self signed certificate") return err @@ -73,48 +67,3 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) })) } - -func createSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { - serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) - serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) - if err != nil { - logger.Error(err, "Failed to create serial number for self-signed cert") - return tls.Certificate{}, err - } - now := time.Now() - notBefore := now.UTC() - template := x509.Certificate{ - SerialNumber: serialNumber, - Subject: pkix.Name{ - Organization: []string{"Inference Ext"}, - }, - NotBefore: notBefore, - NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - } - - priv, err := rsa.GenerateKey(rand.Reader, 4096) - if err != nil { - logger.Error(err, "Failed to generate key for self-signed cert") - return tls.Certificate{}, err - } - - derBytes, err := 
x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) - if err != nil { - logger.Error(err, "Failed to create self-signed certificate") - return tls.Certificate{}, err - } - - certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) - - privBytes, err := x509.MarshalPKCS8PrivateKey(priv) - if err != nil { - logger.Error(err, "Failed to marshal private key for self-signed certificate") - return tls.Certificate{}, err - } - keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) - - return tls.X509KeyPair(certBytes, keyBytes) -} diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 4c0a7e53..8c553cd5 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -18,14 +18,8 @@ package server import ( "context" - "crypto/rand" - "crypto/rsa" "crypto/tls" - "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" "fmt" - "math/big" "time" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" @@ -36,6 +30,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" + tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" @@ -139,7 +134,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { cert, err = tls.LoadX509KeyPair(r.CertPath+"/tls.crt", r.CertPath+"/tls.key") } else { // Create tls based credential. 
- cert, err = createSelfSignedTLSCertificate(logger) + cert, err = tlsutil.CreateSelfSignedTLSCertificate(logger) } if err != nil { logger.Error(err, "Failed to create self signed certificate") @@ -163,48 +158,3 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) })) } - -func createSelfSignedTLSCertificate(logger logr.Logger) (tls.Certificate, error) { - serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) - serialNumber, err := rand.Int(rand.Reader, serialNumberLimit) - if err != nil { - logger.Error(err, "Failed to create serial number for self-signed cert") - return tls.Certificate{}, err - } - now := time.Now() - notBefore := now.UTC() - template := x509.Certificate{ - SerialNumber: serialNumber, - Subject: pkix.Name{ - Organization: []string{"Inference Ext"}, - }, - NotBefore: notBefore, - NotAfter: now.Add(time.Hour * 24 * 365 * 10).UTC(), // 10 years - KeyUsage: x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - } - - priv, err := rsa.GenerateKey(rand.Reader, 4096) - if err != nil { - logger.Error(err, "Failed to generate key for self-signed cert") - return tls.Certificate{}, err - } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) - if err != nil { - logger.Error(err, "Failed to create self-signed certificate") - return tls.Certificate{}, err - } - - certBytes := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) - - privBytes, err := x509.MarshalPKCS8PrivateKey(priv) - if err != nil { - logger.Error(err, "Failed to marshal private key for self-signed certificate") - return tls.Certificate{}, err - } - keyBytes := pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privBytes}) - - return tls.X509KeyPair(certBytes, keyBytes) -} From 70965a060bc7541f506aebd2065fb631b5045bfe 
Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 5 Mar 2025 17:09:37 -0700 Subject: [PATCH 78/96] Support full duplex streaming (#450) This PR supports the FULL_DUPLEX_STREAMED mode for ext-proc. --- cmd/epp/main.go | 6 + config/manifests/ext_proc.yaml | 5 +- .../manifests/gateway/extension_policy.yaml | 1 + config/manifests/gateway/patch_policy.yaml | 33 +- pkg/epp/handlers/server.go | 104 ++-- pkg/epp/handlers/streamingserver.go | 503 ++++++++++++++++++ pkg/epp/server/runserver.go | 11 +- 7 files changed, 613 insertions(+), 50 deletions(-) create mode 100644 pkg/epp/handlers/streamingserver.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 5b350bb2..1f62d94a 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -110,6 +110,11 @@ func run() error { flag.Parse() initLogging(&opts) + useStreamingServer, err := strconv.ParseBool(os.Getenv("USE_STREAMING")) + if err != nil { + setupLog.Error(err, "Failed to parse env var USE_STREAMING, defaulting to false") + } + // Validate flags if err := validateFlags(); err != nil { setupLog.Error(err, "Failed to validate flags") @@ -153,6 +158,7 @@ func run() error { SecureServing: *secureServing, CertPath: *certPath, Provider: provider, + UseStreaming: useStreamingServer, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc controllers") diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index 60a0fc3e..d70467ee 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -77,11 +77,14 @@ spec: - -poolName - "my-pool" - -v - - "3" + - "4" - -grpcPort - "9002" - -grpcHealthPort - "9003" + env: + - name: USE_STREAMING + value: "false" ports: - containerPort: 9002 - containerPort: 9003 diff --git a/config/manifests/gateway/extension_policy.yaml b/config/manifests/gateway/extension_policy.yaml index a8105d6d..14b7b123 100644 --- a/config/manifests/gateway/extension_policy.yaml +++ 
b/config/manifests/gateway/extension_policy.yaml @@ -11,6 +11,7 @@ spec: name: inference-gateway-ext-proc port: 9002 processingMode: + allowModeOverride: true request: body: Buffered response: diff --git a/config/manifests/gateway/patch_policy.yaml b/config/manifests/gateway/patch_policy.yaml index ae4fb6d8..3c36ed7a 100644 --- a/config/manifests/gateway/patch_policy.yaml +++ b/config/manifests/gateway/patch_policy.yaml @@ -48,10 +48,41 @@ spec: typed_config: "@type": "type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext" common_tls_context: {} - - type: "type.googleapis.com/envoy.config.route.v3.RouteConfiguration" name: default/inference-gateway/llm-gw operation: op: replace path: "/virtual_hosts/0/routes/0/route/cluster" value: original_destination_cluster +# Uncomment the below to enable full duplex streaming + # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" + # name: "default/inference-gateway/llm-gw" + # operation: + # op: add + # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_body_mode" + # value: FULL_DUPLEX_STREAMED + # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" + # name: "default/inference-gateway/llm-gw" + # operation: + # op: add + # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/request_trailer_mode" + # value: SEND + # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" + # name: "default/inference-gateway/llm-gw" + # operation: + # op: add + # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_body_mode" + # value: FULL_DUPLEX_STREAMED + # - type: "type.googleapis.com/envoy.config.listener.v3.Listener" + # name: "default/inference-gateway/llm-gw" + # operation: + # op: replace + # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_trailer_mode" + # value: SEND + 
# - type: "type.googleapis.com/envoy.config.listener.v3.Listener" + # name: "default/inference-gateway/llm-gw" + # operation: + # op: replace + # path: "/default_filter_chain/filters/0/typed_config/http_filters/0/typed_config/processing_mode/response_header_mode" + # value: SEND + diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 3270134b..bbdbe83e 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -132,53 +132,9 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { if err != nil { logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) - switch errutil.CanonicalCode(err) { - // This code can be returned by scheduler when there is no capacity for sheddable - // requests. - case errutil.InferencePoolResourceExhausted: - resp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_TooManyRequests, - }, - }, - }, - } - // This code can be returned by when EPP processes the request and run into server-side errors. - case errutil.Internal: - resp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_InternalServerError, - }, - }, - }, - } - // This code can be returned when users provide invalid json request. 
- case errutil.BadRequest: - resp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_BadRequest, - }, - }, - }, - } - case errutil.BadConfiguration: - resp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_NotFound, - }, - }, - }, - } - default: - return status.Errorf(status.Code(err), "failed to handle request: %v", err) + resp, err = BuildErrResponse(err) + if err != nil { + return err } } @@ -190,6 +146,60 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { } } +func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) { + var resp *extProcPb.ProcessingResponse + + switch errutil.CanonicalCode(err) { + // This code can be returned by scheduler when there is no capacity for sheddable + // requests. + case errutil.InferencePoolResourceExhausted: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, + }, + }, + }, + } + // This code can be returned by when EPP processes the request and run into server-side errors. + case errutil.Internal: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_InternalServerError, + }, + }, + }, + } + // This code can be returned when users provide invalid json request. 
+ case errutil.BadRequest: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_BadRequest, + }, + }, + }, + } + case errutil.BadConfiguration: + resp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_NotFound, + }, + }, + }, + } + default: + return nil, status.Errorf(status.Code(err), "failed to handle request: %v", err) + } + return resp, nil +} + // RequestContext stores context information during the life time of an HTTP request. type RequestContext struct { TargetPod string diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go new file mode 100644 index 00000000..821dd989 --- /dev/null +++ b/pkg/epp/handlers/streamingserver.go @@ -0,0 +1,503 @@ +package handlers + +import ( + "context" + "encoding/json" + "fmt" + "io" + "strconv" + "time" + + configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" + "google.golang.org/protobuf/types/known/structpb" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer { + return 
&StreamingServer{ + scheduler: scheduler, + destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace, + destinationEndpointHintKey: destinationEndpointHintKey, + datastore: datastore, + } +} + +type StreamingServer struct { + scheduler Scheduler + // The key of the header to specify the target pod address. This value needs to match Envoy + // configuration. + destinationEndpointHintKey string + // The key acting as the outer namespace struct in the metadata extproc response to communicate + // back the picked endpoints. + destinationEndpointHintMetadataNamespace string + datastore datastore.Datastore +} + +func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { + ctx := srv.Context() + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing") + + // Create request context to share states during life time of an HTTP request. + // See https://github.com/envoyproxy/envoy/issues/17540. + reqCtx := &StreamingRequestContext{ + RequestState: RequestReceived, + } + + reader, writer := io.Pipe() + decoder := json.NewDecoder(reader) + + var requestBody, responseBody map[string]interface{} + // Create error handling var as each request should only report once for + // error metrics. This doesn't cover the error "Cannot receive stream request" because + // such errors might happen even though response is processed. 
+ var err error + defer func(error) { + if reqCtx.ResponseStatusCode != "" { + metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode) + } else if err != nil { + metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err)) + } + }(err) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + req, recvErr := srv.Recv() + if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled { + return nil + } + if recvErr != nil { + // This error occurs very frequently, though it doesn't seem to have any impact. + // TODO Figure out if we can remove this noise. + loggerVerbose.Error(err, "Cannot receive stream request") + return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) + } + + switch v := req.Request.(type) { + case *extProcPb.ProcessingRequest_RequestHeaders: + // Do nothing. Header info is handled in the HandleRequestBody func + case *extProcPb.ProcessingRequest_RequestBody: + loggerVerbose.Info("Incoming body chunk", "body", string(v.RequestBody.Body), "EoS", v.RequestBody.EndOfStream) + // In the stream case, we can receive multiple request bodies. + // To buffer the full message, we create a goroutine with a writer.Write() + // call, which will block until the corresponding reader reads from it. + // We do not read until we receive the EndofStream signal, and then + // decode the entire JSON body. + go func() { + _, err := writer.Write(v.RequestBody.Body) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error populating writer") + } + }() + + // Message is buffered, we can read and decode. + if v.RequestBody.EndOfStream { + loggerVerbose.Info("decoding") + err = decoder.Decode(&requestBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + } + // Body stream complete. Close the reader pipe, and start anew for response. 
+ reader.Close() + reader, writer = io.Pipe() + decoder = json.NewDecoder(reader) + + reqCtx, err = s.HandleRequestBody(ctx, reqCtx, req, requestBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error handling body") + } else { + metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) + metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) + } + loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx) + } + case *extProcPb.ProcessingRequest_RequestTrailers: + // This is currently unused. + case *extProcPb.ProcessingRequest_ResponseHeaders: + loggerVerbose.Info("got response headers", "headers", v.ResponseHeaders.Headers.GetHeaders()) + for _, header := range v.ResponseHeaders.Headers.GetHeaders() { + code := header.RawValue[0] + if header.Key == "status" && string(code) != "200" { + reqCtx.ResponseStatusCode = errutil.ModelServerError + } + } + reqCtx.RequestState = ResponseRecieved + reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + // This is for debugging purpose only. + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, + }, + } + + case *extProcPb.ProcessingRequest_ResponseBody: + go func() { + _, err := writer.Write(v.ResponseBody.Body) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error populating writer") + } + }() + + // Message is buffered, we can read and decode. + if v.ResponseBody.EndOfStream { + err = decoder.Decode(&responseBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + } + // Body stream complete. Close the reader pipe. 
+ reader.Close() + + reqCtx, err = s.HandleResponseBody(ctx, reqCtx, responseBody) + if err == nil && reqCtx.ResponseComplete { + reqCtx.ResponseCompleteTimestamp = time.Now() + metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) + metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) + metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) + metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + } + loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx) + } + case *extProcPb.ProcessingRequest_ResponseTrailers: + // This is currently unused. + } + + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) + resp, err := BuildErrResponse(err) + if err != nil { + return err + } + if err := srv.Send(resp); err != nil { + logger.V(logutil.DEFAULT).Error(err, "Send failed") + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + return nil + } + loggerVerbose.Info("checking", "request state", reqCtx.RequestState) + if err := reqCtx.updateStateAndSendIfNeeded(srv, loggerVerbose); err != nil { + return err + } + } +} + +// updateStateAndSendIfNeeded checks state and can send mutiple responses in a single pass, but only if ordered properly. +// Order of requests matter in FULL_DUPLEX_STREAMING. For both request and response, the order of response sent back MUST be: Header->Body->Trailer, with trailer being optional. +func (r *StreamingRequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, loggerVerbose logr.Logger) error { + // No switch statement as we could send multiple responses in one pass. 
+ if r.RequestState == RequestReceived && r.reqHeaderResp != nil { + loggerVerbose.Info("Request header response", "obj", r.reqHeaderResp) + if err := srv.Send(r.reqHeaderResp); err != nil { + loggerVerbose.Error(err, "error sending response") + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + r.RequestState = HeaderRequestResponseComplete + } + if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil { + loggerVerbose.Info("Request body response", "obj", r.reqBodyResp) + if err := srv.Send(r.reqBodyResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + r.RequestState = BodyRequestResponsesComplete + // Dump the response so a new stream message can begin + r.reqBodyResp = nil + } + if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil { + // Trailers in requests are not guaranteed + if err := srv.Send(r.reqHeaderResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } + if r.RequestState == ResponseRecieved && r.respHeaderResp != nil { + loggerVerbose.Info("Response header response", "obj", r.respHeaderResp) + if err := srv.Send(r.respHeaderResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + r.RequestState = HeaderResponseResponseComplete + } + if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil { + loggerVerbose.Info("Response body response", "obj", r.respBodyResp) + if err := srv.Send(r.respBodyResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + r.RequestState = BodyResponseResponsesComplete + // Dump the response so a new stream message can begin + r.reqBodyResp = nil + } + if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil { + // Trailers in requests are not guaranteed + if err := 
srv.Send(r.reqHeaderResp); err != nil { + return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) + } + } + return nil +} + +type StreamingRequestContext struct { + TargetPod string + TargetEndpoint string + Model string + ResolvedTargetModel string + RequestState StreamRequestState + RequestReceivedTimestamp time.Time + ResponseCompleteTimestamp time.Time + RequestSize int + Usage Usage + ResponseSize int + ResponseComplete bool + ResponseStatusCode string + + reqHeaderResp *extProcPb.ProcessingResponse + reqBodyResp *extProcPb.ProcessingResponse + reqTrailerResp *extProcPb.ProcessingResponse + + respHeaderResp *extProcPb.ProcessingResponse + respBodyResp *extProcPb.ProcessingResponse + respTrailerResp *extProcPb.ProcessingResponse +} + +type StreamRequestState int + +const ( + RequestReceived StreamRequestState = 0 + HeaderRequestResponseComplete StreamRequestState = 1 + BodyRequestResponsesComplete StreamRequestState = 2 + TrailerRequestResponsesComplete StreamRequestState = 3 + ResponseRecieved StreamRequestState = 4 + HeaderResponseResponseComplete StreamRequestState = 5 + BodyResponseResponsesComplete StreamRequestState = 6 + TrailerResponseResponsesComplete StreamRequestState = 7 +) + +// HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling. +func (s *StreamingServer) HandleRequestBody( + ctx context.Context, + reqCtx *StreamingRequestContext, + req *extProcPb.ProcessingRequest, + requestBodyMap map[string]interface{}, +) (*StreamingRequestContext, error) { + var requestBodyBytes []byte + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Handling request body") + + // Resolve target models. 
+ model, ok := requestBodyMap["model"].(string) + if !ok { + return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} + } + loggerVerbose.Info("Model requested", "model", model) + modelName := model + + // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. + // This might be a security risk in the future where adapters not registered in the InferenceModel + // are able to be requested by using their distinct name. + modelObj := s.datastore.ModelGet(model) + if modelObj == nil { + return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} + } + if len(modelObj.Spec.TargetModels) > 0 { + modelName = datastore.RandomWeightedDraw(logger, modelObj, 0) + if modelName == "" { + return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} + } + } + llmReq := &scheduling.LLMRequest{ + Model: model, + ResolvedTargetModel: modelName, + Critical: datastore.IsCritical(modelObj), + } + loggerVerbose.Info("LLM request assembled", "request", llmReq) + + var err error + // Update target models in the body. 
+ if llmReq.Model != llmReq.ResolvedTargetModel { + requestBodyMap["model"] = llmReq.ResolvedTargetModel + requestBodyBytes, err = json.Marshal(requestBodyMap) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") + return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} + } + loggerVerbose.Info("Updated request body marshalled", "body", string(requestBodyBytes)) + } + + targetPod, err := s.scheduler.Schedule(ctx, llmReq) + if err != nil { + return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} + } + + // Insert target endpoint to instruct Envoy to route requests to the specified target pod. + // Attach the port number + pool, err := s.datastore.PoolGet() + if err != nil { + return reqCtx, err + } + endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + + logger.V(logutil.DEFAULT).Info("Request handled", + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) + + reqCtx.Model = llmReq.Model + reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel + reqCtx.RequestSize = len(requestBodyBytes) + reqCtx.TargetPod = targetPod.NamespacedName.String() + reqCtx.TargetEndpoint = endpoint + + headers := []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: s.destinationEndpointHintKey, + RawValue: []byte(endpoint), + }, + }, + // We need to update the content length header if the body is mutated, see Envoy doc: + // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(len(requestBodyBytes))), + }, + }, + } + // Print headers for debugging + for _, header := range headers { + logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", 
header.Header.RawValue) + } + + targetEndpointValue := &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintKey: { + Kind: &structpb.Value_StringValue{ + StringValue: endpoint, + }, + }, + }, + } + dynamicMetadata := targetEndpointValue + if s.destinationEndpointHintMetadataNamespace != "" { + // If a namespace is defined, wrap the selected endpoint with that. + dynamicMetadata = &structpb.Struct{ + Fields: map[string]*structpb.Value{ + s.destinationEndpointHintMetadataNamespace: { + Kind: &structpb.Value_StructValue{ + StructValue: targetEndpointValue, + }, + }, + }, + } + } + + reqCtx.reqHeaderResp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: headers, + }, + }, + }, + }, + DynamicMetadata: dynamicMetadata, + } + reqCtx.reqBodyResp = &extProcPb.ProcessingResponse{ + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: requestBodyBytes, + EndOfStream: true, + }, + }, + }, + }, + }, + }, + } + return reqCtx, nil +} + +// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling. 
+func (s *StreamingServer) HandleResponseBody( + ctx context.Context, + reqCtx *StreamingRequestContext, + response map[string]interface{}, +) (*StreamingRequestContext, error) { + logger := log.FromContext(ctx) + loggerVerbose := logger.V(logutil.VERBOSE) + loggerVerbose.Info("Processing HandleResponseBody") + responseBytes, err := json.Marshal(response) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") + return reqCtx, err + } + if response["usage"] != nil { + usg := response["usage"].(map[string]interface{}) + usage := Usage{ + PromptTokens: int(usg["prompt_tokens"].(float64)), + CompletionTokens: int(usg["completion_tokens"].(float64)), + TotalTokens: int(usg["total_tokens"].(float64)), + } + reqCtx.Usage = usage + loggerVerbose.Info("Response generated", "usage", reqCtx.Usage) + } + reqCtx.ResponseSize = len(responseBytes) + // ResponseComplete is to indicate the response is complete. In non-streaming + // case, it will be set to be true once the response is processed; in + // streaming case, it will be set to be true once the last chunk is processed. + // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) + // will add the processing for streaming case. + reqCtx.ResponseComplete = true + + reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ + // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header + // and as an unstructure ext-proc response metadata key/value pair. This enables different integration + // options for gateway providers. 
+ Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: responseBytes, + EndOfStream: true, + }, + }, + }, + }, + }, + }, + } + return reqCtx, nil +} diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 8c553cd5..5b8269c1 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -51,6 +51,7 @@ type ExtProcServerRunner struct { Provider *backend.Provider SecureServing bool CertPath string + UseStreaming bool } // Default values for CLI flags in main @@ -149,9 +150,17 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } else { srv = grpc.NewServer() } + var extProcServer extProcPb.ExternalProcessorServer + if r.UseStreaming { + logger.Info("Using streaming extproc server") + extProcServer = handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) + } else { + logger.Info("Using standard extproc server") + extProcServer = handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) + } extProcPb.RegisterExternalProcessorServer( srv, - handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore), + extProcServer, ) // Forward to the gRPC runnable. 
From b40de0474014ba9e17a5c63813b2583f0046725d Mon Sep 17 00:00:00 2001 From: Rob Scott Date: Wed, 5 Mar 2025 16:35:45 -0800 Subject: [PATCH 79/96] Renaming conditions and reasons used in InferencePool status (#454) --- api/v1alpha2/inferencepool_types.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/api/v1alpha2/inferencepool_types.go b/api/v1alpha2/inferencepool_types.go index e4350417..b411dbe3 100644 --- a/api/v1alpha2/inferencepool_types.go +++ b/api/v1alpha2/inferencepool_types.go @@ -233,16 +233,16 @@ const ( // // Controllers MAY raise this condition with other reasons, but should // prefer to use the reasons listed above to improve interoperability. - ModelConditionResolvedRefs InferencePoolConditionType = "ResolvedRefs" + InferencePoolConditionResolvedRefs InferencePoolConditionType = "ResolvedRefs" // This reason is used with the "ResolvedRefs" condition when the condition // is true. - ModelReasonResolvedRefs InferencePoolReason = "ResolvedRefs" + InferencePoolReasonResolvedRefs InferencePoolReason = "ResolvedRefs" // This reason is used with the "ResolvedRefs" condition when the // ExtensionRef is invalid in some way. This can include an unsupported kind // or API group, or a reference to a resource that can not be found. 
- ModelReasonInvalidExtensionRef InferencePoolReason = "InvalidExtensionRef" + InferencePoolReasonInvalidExtensionRef InferencePoolReason = "InvalidExtensionRef" ) func init() { From 9079982a8c381984a0c357f674afc30836a9773c Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Thu, 6 Mar 2025 15:19:45 -0500 Subject: [PATCH 80/96] Move integration and e2e tests for epp into epp-specific directories (#457) --- test/e2e/{ => epp}/README.md | 0 test/e2e/{ => epp}/e2e_suite_test.go | 16 ++++++++-------- test/e2e/{ => epp}/e2e_test.go | 2 +- test/integration/{ => epp}/hermetic_test.go | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) rename test/e2e/{ => epp}/README.md (100%) rename test/e2e/{ => epp}/e2e_suite_test.go (96%) rename test/e2e/{ => epp}/e2e_test.go (99%) rename test/integration/{ => epp}/hermetic_test.go (98%) diff --git a/test/e2e/README.md b/test/e2e/epp/README.md similarity index 100% rename from test/e2e/README.md rename to test/e2e/epp/README.md diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go similarity index 96% rename from test/e2e/e2e_suite_test.go rename to test/e2e/epp/e2e_suite_test.go index 24a488db..e7685c48 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package e2e +package epp import ( "context" @@ -67,19 +67,19 @@ const ( // inferExtName is the name of the inference extension test resources. inferExtName = "inference-gateway-ext-proc" // clientManifest is the manifest for the client test resources. - clientManifest = "../testdata/client.yaml" + clientManifest = "../../testdata/client.yaml" // modelServerManifest is the manifest for the model server test resources. 
- modelServerManifest = "../../config/manifests/vllm/gpu-deployment.yaml" + modelServerManifest = "../../../config/manifests/vllm/gpu-deployment.yaml" // modelServerSecretManifest is the manifest for the model server secret resource. - modelServerSecretManifest = "../testdata/model-secret.yaml" + modelServerSecretManifest = "../../testdata/model-secret.yaml" // inferPoolManifest is the manifest for the inference pool CRD. - inferPoolManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml" + inferPoolManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml" // inferModelManifest is the manifest for the inference model CRD. - inferModelManifest = "../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" + inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../config/manifests/ext_proc.yaml" + inferExtManifest = "../../../config/manifests/ext_proc.yaml" // envoyManifest is the manifest for the envoy proxy test resources. - envoyManifest = "../testdata/envoy.yaml" + envoyManifest = "../../testdata/envoy.yaml" ) var ( diff --git a/test/e2e/e2e_test.go b/test/e2e/epp/e2e_test.go similarity index 99% rename from test/e2e/e2e_test.go rename to test/e2e/epp/e2e_test.go index 8cd73d32..f5cfaf24 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package e2e +package epp import ( "fmt" diff --git a/test/integration/hermetic_test.go b/test/integration/epp/hermetic_test.go similarity index 98% rename from test/integration/hermetic_test.go rename to test/integration/epp/hermetic_test.go index 4fba7832..765449f3 100644 --- a/test/integration/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -14,8 +14,8 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Package test contains e2e tests for the ext proc while faking the backend pods. -package integration +// Package epp contains integration tests for the ext proc while faking the backend pods. +package epp import ( "bufio" @@ -472,7 +472,7 @@ func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (clie func BeforeSuit(t *testing.T) func() { // Set up mock k8s API Client testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, ErrorIfCRDPathMissing: true, } cfg, err := testEnv.Start() @@ -522,7 +522,7 @@ func BeforeSuit(t *testing.T) func() { logger.Info("Setting up hermetic ExtProc server") // Unmarshal CRDs from file into structs - manifestsPath := filepath.Join("..", "testdata", "inferencepool-with-model-hermetic.yaml") + manifestsPath := filepath.Join("..", "..", "testdata", "inferencepool-with-model-hermetic.yaml") docs, err := readDocuments(manifestsPath) if err != nil { logutil.Fatal(logger, err, "Can't read object manifests", "path", manifestsPath) From 23bab8c4e0a991cbca3fe4dae24951a4a2b372fb Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Fri, 7 Mar 2025 15:01:45 -0500 Subject: [PATCH 81/96] add initial integration test for body-based routing extension (#458) --- pkg/body-based-routing/server/runserver.go | 28 ++-- test/integration/bbr/hermetic_test.go | 173 +++++++++++++++++++++ 2 files changed, 189 
insertions(+), 12 deletions(-) create mode 100644 test/integration/bbr/hermetic_test.go diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/body-based-routing/server/runserver.go index 3674c6cf..55e79422 100644 --- a/pkg/body-based-routing/server/runserver.go +++ b/pkg/body-based-routing/server/runserver.go @@ -32,7 +32,8 @@ import ( // ExtProcServerRunner provides methods to manage an external process server. type ExtProcServerRunner struct { - GrpcPort int + GrpcPort int + SecureServing bool } // Default values for CLI flags in main @@ -42,7 +43,8 @@ const ( func NewDefaultExtProcServerRunner() *ExtProcServerRunner { return &ExtProcServerRunner{ - GrpcPort: DefaultGrpcPort, + GrpcPort: DefaultGrpcPort, + SecureServing: true, } } @@ -50,18 +52,20 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { // The runnable implements LeaderElectionRunnable with leader election disabled. func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { - cert, err := tlsutil.CreateSelfSignedTLSCertificate(logger) - if err != nil { - logger.Error(err, "Failed to create self signed certificate") - return err + var srv *grpc.Server + if r.SecureServing { + cert, err := tlsutil.CreateSelfSignedTLSCertificate(logger) + if err != nil { + logger.Error(err, "Failed to create self signed certificate") + return err + } + creds := credentials.NewTLS(&tls.Config{Certificates: []tls.Certificate{cert}}) + srv = grpc.NewServer(grpc.Creds(creds)) + } else { + srv = grpc.NewServer() } - creds := credentials.NewTLS(&tls.Config{Certificates: []tls.Certificate{cert}}) - srv := grpc.NewServer(grpc.Creds(creds)) - extProcPb.RegisterExternalProcessorServer( - srv, - handlers.NewServer(), - ) + extProcPb.RegisterExternalProcessorServer(srv, handlers.NewServer()) // Forward to the gRPC runnable. 
return runnable.GRPCServer("ext-proc", srv, r.GrpcPort).Start(ctx) diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go new file mode 100644 index 00000000..be8b2721 --- /dev/null +++ b/test/integration/bbr/hermetic_test.go @@ -0,0 +1,173 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package bbr contains integration tests for the body-based routing extension. +package bbr + +import ( + "context" + "encoding/json" + "fmt" + "testing" + "time" + + configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + "github.com/google/go-cmp/cmp" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/protobuf/testing/protocmp" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const port = runserver.DefaultGrpcPort + +var logger = logutil.NewTestLogger().V(logutil.VERBOSE) + +func TestBodyBasedRouting(t *testing.T) { + tests := []struct { + name string + req *extProcPb.ProcessingRequest + wantHeaders []*configPb.HeaderValueOption + wantErr bool + }{ + { + name: "success adding model parameter to header", + req: generateRequest(logger, "llama"), + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + 
Key: "X-Gateway-Model-Name", + RawValue: []byte("llama"), + }, + }, + }, + wantErr: false, + }, + { + name: "no model parameter", + req: generateRequest(logger, ""), + wantHeaders: []*configPb.HeaderValueOption{}, + wantErr: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer() + t.Cleanup(cleanup) + + want := &extProcPb.ProcessingResponse{} + if len(test.wantHeaders) > 0 { + want.Response = &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: test.wantHeaders, + }, + ClearRouteCache: true, + }, + }, + } + } else { + want.Response = &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{}, + } + } + + res, err := sendRequest(t, client, test.req) + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + }) + } +} + +func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + serverCtx, stopServer := context.WithCancel(context.Background()) + serverRunner := runserver.NewDefaultExtProcServerRunner() + serverRunner.SecureServing = false + + go func() { + if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { + logutil.Fatal(logger, err, "Failed to start ext-proc server") + } + }() + + address := fmt.Sprintf("localhost:%v", port) + // Create a grpc connection + conn, err := grpc.NewClient(address, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + logutil.Fatal(logger, err, "Failed to connect", "address", address) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + client, err = 
extProcPb.NewExternalProcessorClient(conn).Process(ctx) + if err != nil { + logutil.Fatal(logger, err, "Failed to create client") + } + return client, func() { + cancel() + conn.Close() + stopServer() + + // wait a little until the goroutines actually exit + time.Sleep(5 * time.Second) + } +} + +func generateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequest { + j := map[string]interface{}{ + "prompt": "test1", + "max_tokens": 100, + "temperature": 0, + } + if model != "" { + j["model"] = model + } + + llmReq, err := json.Marshal(j) + if err != nil { + logutil.Fatal(logger, err, "Failed to unmarshal LLM request") + } + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: llmReq}, + }, + } + return req +} + +func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { + t.Logf("Sending request: %v", req) + if err := client.Send(req); err != nil { + t.Logf("Failed to send request %+v: %v", req, err) + return nil, err + } + + res, err := client.Recv() + if err != nil { + t.Logf("Failed to receive: %v", err) + return nil, err + } + t.Logf("Received request %+v", res) + return res, err +} From a70d66e5ac560246c42e503a285c804e67da40be Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Mon, 10 Mar 2025 10:57:46 -0700 Subject: [PATCH 82/96] Each pod has independent loops to refresh metrics (#460) * Each pod has independent loops to refresh metrics * Major refactoring, move metrics logic from datastore to backend/metrics package * Address comments * Fix test and fmt * The podMetrics updates the targetPort by reading the pool from the datastore --- Makefile | 2 +- cmd/epp/main.go | 10 +- pkg/epp/backend/fake.go | 48 ----- pkg/epp/backend/metrics/fake.go | 90 +++++++++ pkg/epp/backend/metrics/logger.go | 111 +++++++++++ pkg/epp/backend/metrics/pod_metrics.go | 129 ++++++++++++ 
pkg/epp/backend/metrics/pod_metrics_test.go | 96 +++++++++ pkg/epp/backend/metrics/types.go | 114 +++++++++++ pkg/epp/backend/provider.go | 183 ------------------ pkg/epp/backend/provider_test.go | 151 --------------- pkg/epp/backend/vllm/metrics.go | 21 +- pkg/epp/backend/vllm/metrics_test.go | 28 +-- .../inferencemodel_reconciler_test.go | 18 +- .../controller/inferencepool_reconciler.go | 2 +- .../inferencepool_reconciler_test.go | 7 +- pkg/epp/controller/pod_reconciler.go | 10 +- pkg/epp/controller/pod_reconciler_test.go | 127 ++++++------ pkg/epp/datastore/datastore.go | 133 +++++-------- pkg/epp/datastore/datastore_test.go | 132 ++++++++++++- pkg/epp/datastore/types.go | 71 ------- pkg/epp/handlers/request.go | 3 +- pkg/epp/handlers/server.go | 3 +- pkg/epp/handlers/streamingserver.go | 3 +- pkg/epp/scheduling/filter.go | 68 +++---- pkg/epp/scheduling/filter_test.go | 171 ++++++++-------- pkg/epp/scheduling/scheduler.go | 11 +- pkg/epp/server/runserver.go | 19 +- pkg/epp/test/benchmark/benchmark.go | 145 -------------- pkg/epp/test/utils.go | 126 ------------ pkg/epp/util/testing/request.go | 45 +++++ pkg/epp/util/testing/wrappers.go | 6 + test/integration/epp/hermetic_test.go | 130 +++++++------ 32 files changed, 1115 insertions(+), 1098 deletions(-) delete mode 100644 pkg/epp/backend/fake.go create mode 100644 pkg/epp/backend/metrics/fake.go create mode 100644 pkg/epp/backend/metrics/logger.go create mode 100644 pkg/epp/backend/metrics/pod_metrics.go create mode 100644 pkg/epp/backend/metrics/pod_metrics_test.go create mode 100644 pkg/epp/backend/metrics/types.go delete mode 100644 pkg/epp/backend/provider.go delete mode 100644 pkg/epp/backend/provider_test.go delete mode 100644 pkg/epp/datastore/types.go delete mode 100644 pkg/epp/test/benchmark/benchmark.go delete mode 100644 pkg/epp/test/utils.go create mode 100644 pkg/epp/util/testing/request.go diff --git a/Makefile b/Makefile index 61b17f5b..257d2cbb 100644 --- a/Makefile +++ b/Makefile @@ -119,7 
+119,7 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out .PHONY: test-integration test-integration: manifests generate fmt vet envtest ## Run tests. diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 1f62d94a..e1cd5015 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -37,7 +37,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/vllm" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" @@ -143,22 +143,20 @@ func run() error { ctx := ctrl.SetupSignalHandler() + pmf := backendmetrics.NewPodMetricsFactory(&vllm.PodMetricsClientImpl{}, *refreshMetricsInterval) // Setup runner. 
- datastore := datastore.NewDatastore() - provider := backend.NewProvider(&vllm.PodMetricsClientImpl{}, datastore) + datastore := datastore.NewDatastore(ctx, pmf) serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, DestinationEndpointHintKey: *destinationEndpointHintKey, PoolName: *poolName, PoolNamespace: *poolNamespace, - RefreshMetricsInterval: *refreshMetricsInterval, - RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, Datastore: datastore, SecureServing: *secureServing, CertPath: *certPath, - Provider: provider, UseStreaming: useStreamingServer, + RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc controllers") diff --git a/pkg/epp/backend/fake.go b/pkg/epp/backend/fake.go deleted file mode 100644 index 584486c2..00000000 --- a/pkg/epp/backend/fake.go +++ /dev/null @@ -1,48 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package backend - -import ( - "context" - - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -type FakePodMetricsClient struct { - Err map[types.NamespacedName]error - Res map[types.NamespacedName]*datastore.PodMetrics -} - -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error) { - if err, ok := f.Err[existing.NamespacedName]; ok { - return nil, err - } - log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "existing", existing, "new", f.Res[existing.NamespacedName]) - return f.Res[existing.NamespacedName], nil -} - -type FakeDataStore struct { - Res map[string]*v1alpha2.InferenceModel -} - -func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) { - return fds.Res[modelName] -} diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go new file mode 100644 index 00000000..fae7149d --- /dev/null +++ b/pkg/epp/backend/metrics/fake.go @@ -0,0 +1,90 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "context" + "fmt" + "sync" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// FakePodMetrics is an implementation of PodMetrics that doesn't run the async refresh loop. +type FakePodMetrics struct { + Pod *Pod + Metrics *Metrics +} + +func (fpm *FakePodMetrics) GetPod() *Pod { + return fpm.Pod +} +func (fpm *FakePodMetrics) GetMetrics() *Metrics { + return fpm.Metrics +} +func (fpm *FakePodMetrics) UpdatePod(pod *corev1.Pod) { + fpm.Pod = toInternalPod(pod) +} +func (fpm *FakePodMetrics) StopRefreshLoop() {} // noop + +type FakePodMetricsClient struct { + errMu sync.RWMutex + Err map[types.NamespacedName]error + resMu sync.RWMutex + Res map[types.NamespacedName]*Metrics +} + +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) { + f.errMu.RLock() + err, ok := f.Err[pod.NamespacedName] + f.errMu.RUnlock() + if ok { + return nil, err + } + f.resMu.RLock() + res, ok := f.Res[pod.NamespacedName] + f.resMu.RUnlock() + if !ok { + return nil, fmt.Errorf("no pod found: %v", pod.NamespacedName) + } + log.FromContext(ctx).V(logutil.VERBOSE).Info("Fetching metrics for pod", "existing", existing, "new", res) + return res.Clone(), nil +} + +func (f *FakePodMetricsClient) SetRes(new map[types.NamespacedName]*Metrics) { + f.resMu.Lock() + defer f.resMu.Unlock() + f.Res = new +} + +func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) { + f.errMu.Lock() + defer f.errMu.Unlock() + f.Err = new +} + +type FakeDataStore struct { + Res map[string]*v1alpha2.InferenceModel +} + +func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) { + return fds.Res[modelName] +} diff --git a/pkg/epp/backend/metrics/logger.go 
b/pkg/epp/backend/metrics/logger.go new file mode 100644 index 00000000..664115eb --- /dev/null +++ b/pkg/epp/backend/metrics/logger.go @@ -0,0 +1,111 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "context" + "time" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + // Note currently the EPP treats stale metrics same as fresh. + // TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336 + metricsValidityPeriod = 5 * time.Second +) + +type Datastore interface { + PoolGet() (*v1alpha2.InferencePool, error) + // PodMetrics operations + // PodGetAll returns all pods and metrics, including fresh and stale. + PodGetAll() []PodMetrics + PodList(func(PodMetrics) bool) []PodMetrics +} + +// StartMetricsLogger starts goroutines to 1) Print metrics debug logs if the DEBUG log level is +// enabled; 2) flushes Prometheus metrics about the backend servers. 
+func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) { + logger := log.FromContext(ctx) + + // Periodically flush prometheus metrics for inference pool + go func() { + for { + select { + case <-ctx.Done(): + logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") + return + default: + time.Sleep(refreshPrometheusMetricsInterval) + flushPrometheusMetricsOnce(logger, datastore) + } + } + }() + + // Periodically print out the pods and metrics for DEBUGGING. + if logger := logger.V(logutil.DEBUG); logger.Enabled() { + go func() { + for { + select { + case <-ctx.Done(): + logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") + return + default: + time.Sleep(5 * time.Second) + podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool { + return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod + }) + podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool { + return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod + }) + logger.Info("Current Pods and metrics gathered", "fresh metrics", podsWithFreshMetrics, "stale metrics", podsWithStaleMetrics) + } + } + }() + } +} + +func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) { + pool, err := datastore.PoolGet() + if err != nil { + // No inference pool or not initialize. 
+ logger.V(logutil.VERBOSE).Info("pool is not initialized, skipping flushing metrics") + return + } + + var kvCacheTotal float64 + var queueTotal int + + podMetrics := datastore.PodGetAll() + logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) + if len(podMetrics) == 0 { + return + } + + for _, pod := range podMetrics { + kvCacheTotal += pod.GetMetrics().KVCacheUsagePercent + queueTotal += pod.GetMetrics().WaitingQueueSize + } + + podTotalCount := len(podMetrics) + metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount)) + metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal/podTotalCount)) +} diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go new file mode 100644 index 00000000..f76c2e8c --- /dev/null +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -0,0 +1,129 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "context" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +const ( + fetchMetricsTimeout = 5 * time.Second +) + +type podMetrics struct { + pod unsafe.Pointer // stores a *Pod + metrics unsafe.Pointer // stores a *Metrics + pmc PodMetricsClient + ds Datastore + interval time.Duration + + parentCtx context.Context + once sync.Once // ensure the StartRefreshLoop is only called once. + done chan struct{} + + logger logr.Logger +} + +type PodMetricsClient interface { + FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) +} + +func (pm *podMetrics) GetPod() *Pod { + return (*Pod)(atomic.LoadPointer(&pm.pod)) +} + +func (pm *podMetrics) GetMetrics() *Metrics { + return (*Metrics)(atomic.LoadPointer(&pm.metrics)) +} + +func (pm *podMetrics) UpdatePod(in *corev1.Pod) { + atomic.StorePointer(&pm.pod, unsafe.Pointer(toInternalPod(in))) +} + +func toInternalPod(in *corev1.Pod) *Pod { + return &Pod{ + NamespacedName: types.NamespacedName{ + Name: in.Name, + Namespace: in.Namespace, + }, + Address: in.Status.PodIP, + } +} + +// start starts a goroutine exactly once to periodically update metrics. The goroutine will be +// stopped either when stop() is called, or the parentCtx is cancelled. 
+func (pm *podMetrics) startRefreshLoop() { + pm.once.Do(func() { + go func() { + pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod()) + for { + select { + case <-pm.done: + return + case <-pm.parentCtx.Done(): + return + default: + } + + err := pm.refreshMetrics() + if err != nil { + pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod()) + } + + time.Sleep(pm.interval) + } + }() + }) +} + +func (pm *podMetrics) refreshMetrics() error { + pool, err := pm.ds.PoolGet() + if err != nil { + // No inference pool or not initialize. + return err + } + ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) + defer cancel() + updated, err := pm.pmc.FetchMetrics(ctx, pm.GetPod(), pm.GetMetrics(), pool.Spec.TargetPortNumber) + if err != nil { + // As refresher is running in the background, it's possible that the pod is deleted but + // the refresh goroutine doesn't read the done channel yet. In this case, we just return nil. + // The refresher will be stopped after this interval. + return nil + } + updated.UpdateTime = time.Now() + + pm.logger.V(logutil.TRACE).Info("Refreshed metrics", "updated", updated) + + atomic.StorePointer(&pm.metrics, unsafe.Pointer(updated)) + return nil +} + +func (pm *podMetrics) StopRefreshLoop() { + pm.logger.V(logutil.DEFAULT).Info("Stopping refresher", "pod", pm.GetPod()) + close(pm.done) +} diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go new file mode 100644 index 00000000..cf6698ca --- /dev/null +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -0,0 +1,96 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package metrics + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +var ( + pod1 = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + }, + } + initial = &Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + } + updated = &Metrics{ + WaitingQueueSize: 9999, + KVCacheUsagePercent: 0.99, + MaxActiveModels: 99, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + } +) + +func TestMetricsRefresh(t *testing.T) { + ctx := context.Background() + pmc := &FakePodMetricsClient{} + pmf := NewPodMetricsFactory(pmc, time.Millisecond) + + // The refresher is initialized with empty metrics. + pm := pmf.NewPodMetrics(ctx, pod1, &fakeDataStore{}) + + namespacedName := types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} + // Use SetRes to simulate an update of metrics from the pod. + // Verify that the metrics are updated. 
+ pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: initial}) + condition := func(collect *assert.CollectT) { + assert.True(collect, cmp.Equal(pm.GetMetrics(), initial, cmpopts.IgnoreFields(Metrics{}, "UpdateTime"))) + } + assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) + + // Stop the loop, and simulate metric update again, this time the PodMetrics won't get the + // new update. + pm.StopRefreshLoop() + pmc.SetRes(map[types.NamespacedName]*Metrics{namespacedName: updated}) + // Still expect the same condition (no metrics update). + assert.EventuallyWithT(t, condition, time.Second, time.Millisecond) +} + +type fakeDataStore struct{} + +func (f *fakeDataStore) PoolGet() (*v1alpha2.InferencePool, error) { + return &v1alpha2.InferencePool{Spec: v1alpha2.InferencePoolSpec{TargetPortNumber: 8000}}, nil +} +func (f *fakeDataStore) PodGetAll() []PodMetrics { + // Not implemented. + return nil +} +func (f *fakeDataStore) PodList(func(PodMetrics) bool) []PodMetrics { + // Not implemented. + return nil +} diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go new file mode 100644 index 00000000..cdbdb2ce --- /dev/null +++ b/pkg/epp/backend/metrics/types.go @@ -0,0 +1,114 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package metrics is a library to interact with backend metrics. 
+package metrics + +import ( + "context" + "fmt" + "sync" + "time" + "unsafe" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/log" +) + +func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval time.Duration) *PodMetricsFactory { + return &PodMetricsFactory{ + pmc: pmc, + refreshMetricsInterval: refreshMetricsInterval, + } +} + +type PodMetricsFactory struct { + pmc PodMetricsClient + refreshMetricsInterval time.Duration +} + +func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics { + pm := &podMetrics{ + pod: unsafe.Pointer(toInternalPod(in)), + metrics: unsafe.Pointer(newMetrics()), + pmc: f.pmc, + ds: ds, + interval: f.refreshMetricsInterval, + parentCtx: parentCtx, + once: sync.Once{}, + done: make(chan struct{}), + logger: log.FromContext(parentCtx), + } + pm.startRefreshLoop() + return pm +} + +type PodMetrics interface { + GetPod() *Pod + GetMetrics() *Metrics + UpdatePod(*corev1.Pod) + StopRefreshLoop() +} + +type Pod struct { + NamespacedName types.NamespacedName + Address string +} + +type Metrics struct { + // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. + ActiveModels map[string]int + // MaxActiveModels is the maximum number of models that can be loaded to GPU. + MaxActiveModels int + RunningQueueSize int + WaitingQueueSize int + KVCacheUsagePercent float64 + KvCacheMaxTokenCapacity int + + // UpdateTime record the last time when the metrics were updated. 
+ UpdateTime time.Time +} + +func newMetrics() *Metrics { + return &Metrics{ + ActiveModels: make(map[string]int), + } +} + +func (m *Metrics) String() string { + if m == nil { + return "" + } + return fmt.Sprintf("%+v", *m) +} + +func (m *Metrics) Clone() *Metrics { + cm := make(map[string]int, len(m.ActiveModels)) + for k, v := range m.ActiveModels { + cm[k] = v + } + clone := &Metrics{ + ActiveModels: cm, + MaxActiveModels: m.MaxActiveModels, + RunningQueueSize: m.RunningQueueSize, + WaitingQueueSize: m.WaitingQueueSize, + KVCacheUsagePercent: m.KVCacheUsagePercent, + KvCacheMaxTokenCapacity: m.KvCacheMaxTokenCapacity, + UpdateTime: m.UpdateTime, + } + return clone +} diff --git a/pkg/epp/backend/provider.go b/pkg/epp/backend/provider.go deleted file mode 100644 index 959f3e0c..00000000 --- a/pkg/epp/backend/provider.go +++ /dev/null @@ -1,183 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package backend - -import ( - "context" - "fmt" - "sync" - "time" - - "github.com/go-logr/logr" - "go.uber.org/multierr" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -const ( - fetchMetricsTimeout = 5 * time.Second -) - -func NewProvider(pmc PodMetricsClient, datastore datastore.Datastore) *Provider { - p := &Provider{ - pmc: pmc, - datastore: datastore, - } - return p -} - -// Provider provides backend pods and information such as metrics. -type Provider struct { - pmc PodMetricsClient - datastore datastore.Datastore -} - -type PodMetricsClient interface { - FetchMetrics(ctx context.Context, existing *datastore.PodMetrics, port int32) (*datastore.PodMetrics, error) -} - -func (p *Provider) Init(ctx context.Context, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration) error { - // periodically refresh metrics - logger := log.FromContext(ctx) - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down metrics prober") - return - default: - time.Sleep(refreshMetricsInterval) - if err := p.refreshMetricsOnce(logger); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Failed to refresh metrics") - } - } - } - }() - - // Periodically flush prometheus metrics for inference pool - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") - return - default: - time.Sleep(refreshPrometheusMetricsInterval) - p.flushPrometheusMetricsOnce(logger) - } - } - }() - - // Periodically print out the pods and metrics for DEBUGGING. 
- if logger := logger.V(logutil.DEBUG); logger.Enabled() { - go func() { - for { - select { - case <-ctx.Done(): - logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") - return - default: - time.Sleep(5 * time.Second) - logger.Info("Current Pods and metrics gathered", "metrics", p.datastore.PodGetAll()) - } - } - }() - } - - return nil -} - -func (p *Provider) refreshMetricsOnce(logger logr.Logger) error { - loggerTrace := logger.V(logutil.TRACE) - pool, _ := p.datastore.PoolGet() - if pool == nil { - loggerTrace.Info("No inference pool or not initialized") - return nil - } - ctx, cancel := context.WithTimeout(context.Background(), fetchMetricsTimeout) - defer cancel() - start := time.Now() - defer func() { - d := time.Since(start) - // TODO: add a metric instead of logging - loggerTrace.Info("Metrics refreshed", "duration", d) - }() - - var wg sync.WaitGroup - errCh := make(chan error) - processOnePod := func(key, value any) bool { - loggerTrace.Info("Pod and metric being processed", "pod", key, "metric", value) - existing := value.(*datastore.PodMetrics) - wg.Add(1) - go func() { - defer wg.Done() - updated, err := p.pmc.FetchMetrics(ctx, existing, pool.Spec.TargetPortNumber) - if err != nil { - errCh <- fmt.Errorf("failed to parse metrics from %s: %v", existing.NamespacedName, err) - return - } - p.datastore.PodUpdateMetricsIfExist(updated.NamespacedName, &updated.Metrics) - loggerTrace.Info("Updated metrics for pod", "pod", updated.NamespacedName, "metrics", updated.Metrics) - }() - return true - } - p.datastore.PodRange(processOnePod) - - // Wait for metric collection for all pods to complete and close the error channel in a - // goroutine so this is unblocking, allowing the code to proceed to the error collection code - // below. - // Note we couldn't use a buffered error channel with a size because the size of the podMetrics - // sync.Map is unknown beforehand. 
- go func() { - wg.Wait() - close(errCh) - }() - - var errs error - for err := range errCh { - errs = multierr.Append(errs, err) - } - return errs -} - -func (p *Provider) flushPrometheusMetricsOnce(logger logr.Logger) { - pool, _ := p.datastore.PoolGet() - if pool == nil { - // No inference pool or not initialize. - return - } - - var kvCacheTotal float64 - var queueTotal int - - podMetrics := p.datastore.PodGetAll() - logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) - if len(podMetrics) == 0 { - return - } - - for _, pod := range podMetrics { - kvCacheTotal += pod.KVCacheUsagePercent - queueTotal += pod.WaitingQueueSize - } - - podTotalCount := len(podMetrics) - metrics.RecordInferencePoolAvgKVCache(pool.Name, kvCacheTotal/float64(podTotalCount)) - metrics.RecordInferencePoolAvgQueueSize(pool.Name, float64(queueTotal/podTotalCount)) -} diff --git a/pkg/epp/backend/provider_test.go b/pkg/epp/backend/provider_test.go deleted file mode 100644 index 12994723..00000000 --- a/pkg/epp/backend/provider_test.go +++ /dev/null @@ -1,151 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package backend - -import ( - "context" - "errors" - "testing" - "time" - - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/stretchr/testify/assert" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" -) - -var ( - pod1 = &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{ - Name: "pod1", - }, - }, - } - pod1WithMetrics = &datastore.PodMetrics{ - Pod: pod1.Pod, - Metrics: datastore.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - } - pod2 = &datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{ - Name: "pod2", - }, - }, - } - pod2WithMetrics = &datastore.PodMetrics{ - Pod: pod2.Pod, - Metrics: datastore.Metrics{ - WaitingQueueSize: 1, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo1": 1, - "bar1": 1, - }, - }, - } - - inferencePool = &v1alpha2.InferencePool{ - Spec: v1alpha2.InferencePoolSpec{ - TargetPortNumber: 8000, - }, - } -) - -func TestProvider(t *testing.T) { - tests := []struct { - name string - pmc PodMetricsClient - storePods []*datastore.PodMetrics - want []*datastore.PodMetrics - }{ - { - name: "Probing metrics success", - pmc: &FakePodMetricsClient{ - Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1WithMetrics, - pod2.NamespacedName: pod2WithMetrics, - }, - }, - storePods: []*datastore.PodMetrics{pod1, pod2}, - want: []*datastore.PodMetrics{pod1WithMetrics, pod2WithMetrics}, - }, - { - name: "Only pods in the datastore are probed", - pmc: &FakePodMetricsClient{ - Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1WithMetrics, - pod2.NamespacedName: pod2WithMetrics, - }, - }, - storePods: []*datastore.PodMetrics{pod1}, - want: 
[]*datastore.PodMetrics{pod1WithMetrics}, - }, - { - name: "Probing metrics error", - pmc: &FakePodMetricsClient{ - Err: map[types.NamespacedName]error{ - pod2.NamespacedName: errors.New("injected error"), - }, - Res: map[types.NamespacedName]*datastore.PodMetrics{ - pod1.NamespacedName: pod1WithMetrics, - }, - }, - storePods: []*datastore.PodMetrics{pod1, pod2}, - want: []*datastore.PodMetrics{ - pod1WithMetrics, - // Failed to fetch pod2 metrics so it remains the default values. - { - Pod: datastore.Pod{NamespacedName: pod2.NamespacedName}, - Metrics: datastore.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0, - MaxActiveModels: 0, - }, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - ds := datastore.NewFakeDatastore(test.storePods, nil, inferencePool) - p := NewProvider(test.pmc, ds) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - _ = p.Init(ctx, time.Millisecond, time.Millisecond) - assert.EventuallyWithT(t, func(t *assert.CollectT) { - metrics := ds.PodGetAll() - diff := cmp.Diff(test.want, metrics, cmpopts.SortSlices(func(a, b *datastore.PodMetrics) bool { - return a.String() < b.String() - })) - assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") - }, 5*time.Second, time.Millisecond) - }) - } -} diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index 5b36b930..f83326eb 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -30,7 +30,7 @@ import ( "github.com/prometheus/common/expfmt" "go.uber.org/multierr" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -57,15 +57,16 @@ type PodMetricsClientImpl struct{} // FetchMetrics fetches metrics from a given pod. 
func (p *PodMetricsClientImpl) FetchMetrics( ctx context.Context, - existing *datastore.PodMetrics, + pod *metrics.Pod, + existing *metrics.Metrics, port int32, -) (*datastore.PodMetrics, error) { +) (*metrics.Metrics, error) { logger := log.FromContext(ctx) loggerDefault := logger.V(logutil.DEFAULT) // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. - url := "http://" + existing.Address + ":" + strconv.Itoa(int(port)) + "/metrics" + url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { @@ -74,16 +75,16 @@ func (p *PodMetricsClientImpl) FetchMetrics( } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", existing.NamespacedName) - return nil, fmt.Errorf("failed to fetch metrics from %s: %w", existing.NamespacedName, err) + loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) + return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) } defer func() { _ = resp.Body.Close() }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", existing.NamespacedName, "statusCode", resp.StatusCode) - return nil, fmt.Errorf("unexpected status code from %s: %v", existing.NamespacedName, resp.StatusCode) + loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) + return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } parser := expfmt.TextParser{} @@ -100,8 +101,8 @@ func (p *PodMetricsClientImpl) FetchMetrics( func promToPodMetrics( logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, - existing *datastore.PodMetrics, -) (*datastore.PodMetrics, 
error) { + existing *metrics.Metrics, +) (*metrics.Metrics, error) { var errs error updated := existing.Clone() runningQueueSize, err := getLatestMetric(logger, metricFamilies, RunningQueueSizeMetricName) diff --git a/pkg/epp/backend/vllm/metrics_test.go b/pkg/epp/backend/vllm/metrics_test.go index 12aac1a1..5555bd26 100644 --- a/pkg/epp/backend/vllm/metrics_test.go +++ b/pkg/epp/backend/vllm/metrics_test.go @@ -23,7 +23,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "google.golang.org/protobuf/proto" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -31,11 +31,11 @@ func TestPromToPodMetrics(t *testing.T) { logger := logutil.NewTestLogger() testCases := []struct { - name string - metricFamilies map[string]*dto.MetricFamily - expectedMetrics *datastore.Metrics - expectedErr error - initialPodMetrics *datastore.PodMetrics + name string + metricFamilies map[string]*dto.MetricFamily + initialMetrics *metrics.Metrics + expectedMetrics *metrics.Metrics + expectedErr error }{ { name: "all metrics available", @@ -123,7 +123,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &datastore.Metrics{ + expectedMetrics: &metrics.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -133,8 +133,8 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 2, }, - initialPodMetrics: &datastore.PodMetrics{}, - expectedErr: nil, + initialMetrics: &metrics.Metrics{}, + expectedErr: nil, }, { name: "invalid max lora", @@ -222,7 +222,7 @@ func TestPromToPodMetrics(t *testing.T) { }, }, }, - expectedMetrics: &datastore.Metrics{ + expectedMetrics: &metrics.Metrics{ RunningQueueSize: 15, WaitingQueueSize: 25, KVCacheUsagePercent: 0.9, @@ -232,18 +232,18 @@ func TestPromToPodMetrics(t *testing.T) { }, MaxActiveModels: 0, 
}, - initialPodMetrics: &datastore.PodMetrics{}, - expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), + initialMetrics: &metrics.Metrics{}, + expectedErr: errors.New("strconv.Atoi: parsing '2a': invalid syntax"), }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialPodMetrics) + updated, err := promToPodMetrics(logger, tc.metricFamilies, tc.initialMetrics) if tc.expectedErr != nil { assert.Error(t, err) } else { assert.NoError(t, err) - assert.Equal(t, tc.expectedMetrics, &updated.Metrics) + assert.Equal(t, tc.expectedMetrics, updated) } }) } diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index 2ac5bb1e..cd1ff1fb 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -19,6 +19,7 @@ package controller import ( "context" "testing" + "time" "github.com/google/go-cmp/cmp" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -29,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) @@ -189,12 +191,16 @@ func TestInferenceModelReconciler(t *testing.T) { WithObjects(initObjs...). WithIndex(&v1alpha2.InferenceModel{}, datastore.ModelNameIndexKey, indexInferenceModelsByModelName). 
Build() - - datastore := datastore.NewFakeDatastore(nil, test.modelsInStore, pool) + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := datastore.NewDatastore(t.Context(), pmf) + for _, m := range test.modelsInStore { + ds.ModelSetIfOlder(m) + } + ds.PoolSet(pool) reconciler := &InferenceModelReconciler{ Client: fakeClient, Record: record.NewFakeRecorder(10), - Datastore: datastore, + Datastore: ds, PoolNamespacedName: types.NamespacedName{Name: pool.Name, Namespace: pool.Namespace}, } if test.incomingReq == nil { @@ -211,11 +217,11 @@ func TestInferenceModelReconciler(t *testing.T) { t.Errorf("Unexpected result diff (+got/-want): %s", diff) } - if len(test.wantModels) != len(datastore.ModelGetAll()) { - t.Errorf("Unexpected; want: %d, got:%d", len(test.wantModels), len(datastore.ModelGetAll())) + if len(test.wantModels) != len(ds.ModelGetAll()) { + t.Errorf("Unexpected; want: %d, got:%d", len(test.wantModels), len(ds.ModelGetAll())) } - if diff := diffStore(datastore, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" { + if diff := diffStore(ds, diffStoreParams{wantPool: pool, wantModels: test.wantModels}); diff != "" { t.Errorf("Unexpected diff (+got/-want): %s", diff) } diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index 880aec8c..c92d4ecc 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -80,7 +80,7 @@ func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool * // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need // to resync the whole pool: remove pods in the store that don't match the new selector and add // the ones that may have existed already to the store. 
- c.Datastore.PodResyncAll(ctx, c.Client) + c.Datastore.PodResyncAll(ctx, c.Client, newPool) } } diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index f35b8dc0..27c4238e 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -19,6 +19,7 @@ package controller import ( "context" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -30,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) @@ -92,7 +94,8 @@ func TestInferencePoolReconciler(t *testing.T) { req := ctrl.Request{NamespacedName: namespacedName} ctx := context.Background() - datastore := datastore.NewDatastore() + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + datastore := datastore.NewDatastore(ctx, pmf) inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore} // Step 1: Inception, only ready pods matching pool1 are added to the store. 
@@ -167,7 +170,7 @@ func diffStore(datastore datastore.Datastore, params diffStoreParams) string { } gotPods := []string{} for _, pm := range datastore.PodGetAll() { - gotPods = append(gotPods, pm.NamespacedName.Name) + gotPods = append(gotPods, pm.GetPod().NamespacedName.Name) } if diff := cmp.Diff(params.wantPods, gotPods, cmpopts.SortSlices(func(a, b string) bool { return a < b })); diff != "" { return "pods:" + diff diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index a6c897c2..046561e4 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -27,6 +27,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -39,7 +40,8 @@ type PodReconciler struct { func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - if !c.Datastore.PoolHasSynced() { + pool, err := c.Datastore.PoolGet() + if err != nil { logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet") // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. 
return ctrl.Result{}, nil @@ -57,7 +59,7 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R return ctrl.Result{}, err } - c.updateDatastore(logger, pod) + c.updateDatastore(logger, pod, pool) return ctrl.Result{}, nil } @@ -67,13 +69,13 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(c) } -func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { +func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, pool *v1alpha2.InferencePool) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) c.Datastore.PodDelete(namespacedName) } else { - if c.Datastore.PodUpdateOrAddIfNotExist(pod) { + if c.Datastore.PodUpdateOrAddIfNotExist(pod, pool) { logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) } else { logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index 7534ac0f..e4cb0b62 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -19,10 +19,12 @@ package controller import ( "context" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" clientgoscheme "k8s.io/client-go/kubernetes/scheme" @@ -30,129 +32,138 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" utiltest "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) var ( - basePod1 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-1"}} - basePod2 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}, Address: "address-2"}} - basePod3 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}, Address: "address-3"}} - basePod11 = &datastore.PodMetrics{Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}, Address: "address-11"}} + basePod1 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{PodIP: "address-1"}} + basePod2 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod2"}, Status: corev1.PodStatus{PodIP: "address-2"}} + basePod3 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod3"}, Status: corev1.PodStatus{PodIP: "address-3"}} + basePod11 = &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "pod1"}, Status: corev1.PodStatus{PodIP: "address-11"}} + pmc = &backendmetrics.FakePodMetricsClient{} + pmf = backendmetrics.NewPodMetricsFactory(pmc, time.Second) ) func TestPodReconciler(t *testing.T) { tests := []struct { - name string - datastore datastore.Datastore - incomingPod *corev1.Pod - wantPods []datastore.Pod - req *ctrl.Request + name string + pool *v1alpha2.InferencePool + existingPods []*corev1.Pod + incomingPod *corev1.Pod + wantPods []*corev1.Pod + req *ctrl.Request }{ { - name: "Add new pod", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Add new pod", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: 
utiltest.MakePod(basePod3.NamespacedName.Name). + }, + incomingPod: utiltest.FromBase(basePod3). Labels(map[string]string{"some-key": "some-val"}). - IP(basePod3.Address). ReadyCondition().ObjRef(), - wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod, basePod3.Pod}, + wantPods: []*corev1.Pod{basePod1, basePod2, basePod3}, }, { - name: "Update pod1 address", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Update pod1 address", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: utiltest.MakePod(basePod11.NamespacedName.Name). + }, + incomingPod: utiltest.FromBase(basePod11). Labels(map[string]string{"some-key": "some-val"}). - IP(basePod11.Address). ReadyCondition().ObjRef(), - wantPods: []datastore.Pod{basePod11.Pod, basePod2.Pod}, + wantPods: []*corev1.Pod{basePod11, basePod2}, }, { - name: "Delete pod with DeletionTimestamp", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Delete pod with DeletionTimestamp", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: utiltest.MakePod("pod1"). + }, + incomingPod: utiltest.FromBase(basePod1). Labels(map[string]string{"some-key": "some-val"}). DeletionTimestamp(). 
ReadyCondition().ObjRef(), - wantPods: []datastore.Pod{basePod2.Pod}, + wantPods: []*corev1.Pod{basePod2}, }, { - name: "Delete notfound pod", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Delete notfound pod", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), + }, req: &ctrl.Request{NamespacedName: types.NamespacedName{Name: "pod1"}}, - wantPods: []datastore.Pod{basePod2.Pod}, + wantPods: []*corev1.Pod{basePod2}, }, { - name: "New pod, not ready, valid selector", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "New pod, not ready, valid selector", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: utiltest.MakePod("pod3"). + }, + incomingPod: utiltest.FromBase(basePod3). Labels(map[string]string{"some-key": "some-val"}).ObjRef(), - wantPods: []datastore.Pod{basePod1.Pod, basePod2.Pod}, + wantPods: []*corev1.Pod{basePod1, basePod2}, }, { - name: "Remove pod that does not match selector", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Remove pod that does not match selector", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: utiltest.MakePod("pod1"). + }, + incomingPod: utiltest.FromBase(basePod1). Labels(map[string]string{"some-wrong-key": "some-val"}). 
ReadyCondition().ObjRef(), - wantPods: []datastore.Pod{basePod2.Pod}, + wantPods: []*corev1.Pod{basePod2}, }, { - name: "Remove pod that is not ready", - datastore: datastore.NewFakeDatastore([]*datastore.PodMetrics{basePod1, basePod2}, nil, &v1alpha2.InferencePool{ + name: "Remove pod that is not ready", + existingPods: []*corev1.Pod{basePod1, basePod2}, + pool: &v1alpha2.InferencePool{ Spec: v1alpha2.InferencePoolSpec{ TargetPortNumber: int32(8000), Selector: map[v1alpha2.LabelKey]v1alpha2.LabelValue{ "some-key": "some-val", }, }, - }), - incomingPod: utiltest.MakePod("pod1"). + }, + incomingPod: utiltest.FromBase(basePod1). Labels(map[string]string{"some-wrong-key": "some-val"}). ReadyCondition().ObjRef(), - wantPods: []datastore.Pod{basePod2.Pod}, + wantPods: []*corev1.Pod{basePod2}, }, } for _, test := range tests { @@ -169,24 +180,28 @@ func TestPodReconciler(t *testing.T) { WithObjects(initialObjects...). Build() - podReconciler := &PodReconciler{Client: fakeClient, Datastore: test.datastore} - namespacedName := types.NamespacedName{Name: test.incomingPod.Name, Namespace: test.incomingPod.Namespace} + // Configure the initial state of the datastore. 
+ store := datastore.NewDatastore(t.Context(), pmf) + store.PoolSet(test.pool) + for _, pod := range test.existingPods { + store.PodUpdateOrAddIfNotExist(pod, pool) + } + + podReconciler := &PodReconciler{Client: fakeClient, Datastore: store} if test.req == nil { + namespacedName := types.NamespacedName{Name: test.incomingPod.Name, Namespace: test.incomingPod.Namespace} test.req = &ctrl.Request{NamespacedName: namespacedName} } if _, err := podReconciler.Reconcile(context.Background(), *test.req); err != nil { t.Errorf("Unexpected InferencePool reconcile error: %v", err) } - var gotPods []datastore.Pod - test.datastore.PodRange(func(k, v any) bool { - pod := v.(*datastore.PodMetrics) - if v != nil { - gotPods = append(gotPods, pod.Pod) - } - return true - }) - if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b datastore.Pod) bool { return a.NamespacedName.String() < b.NamespacedName.String() })) { + var gotPods []*corev1.Pod + for _, pm := range store.PodGetAll() { + pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}} + gotPods = append(gotPods, pod) + } + if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) { t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods) } }) diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index c7050437..af31da42 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -57,56 +58,40 @@ type Datastore interface { ModelGetAll() 
[]*v1alpha2.InferenceModel // PodMetrics operations - PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool - PodUpdateMetricsIfExist(namespacedName types.NamespacedName, m *Metrics) bool - PodGet(namespacedName types.NamespacedName) *PodMetrics + // PodGetAll returns all pods and metrics, including fresh and stale. + PodGetAll() []backendmetrics.PodMetrics + // PodList lists pods matching the given predicate. + PodList(func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics + PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool PodDelete(namespacedName types.NamespacedName) - PodResyncAll(ctx context.Context, ctrlClient client.Client) - PodGetAll() []*PodMetrics - PodDeleteAll() // This is only for testing. - PodRange(f func(key, value any) bool) + PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) // Clears the store state, happens when the pool gets deleted. Clear() } -func NewDatastore() Datastore { +func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) *datastore { store := &datastore{ + parentCtx: parentCtx, poolAndModelsMu: sync.RWMutex{}, models: make(map[string]*v1alpha2.InferenceModel), pods: &sync.Map{}, - } - return store -} - -// Used for test only -func NewFakeDatastore(pods []*PodMetrics, models []*v1alpha2.InferenceModel, pool *v1alpha2.InferencePool) Datastore { - store := NewDatastore() - - for _, pod := range pods { - // Making a copy since in tests we may use the same global PodMetric across tests. - p := *pod - store.(*datastore).pods.Store(pod.NamespacedName, &p) - } - - for _, m := range models { - store.ModelSetIfOlder(m) - } - - if pool != nil { - store.(*datastore).pool = pool + pmf: pmf, } return store } type datastore struct { + // parentCtx controls the lifecycle of the background metrics goroutines that spawn up by the datastore. + parentCtx context.Context // poolAndModelsMu is used to synchronize access to pool and the models map. 
poolAndModelsMu sync.RWMutex pool *v1alpha2.InferencePool // key: InferenceModel.Spec.ModelName, value: *InferenceModel models map[string]*v1alpha2.InferenceModel - // key: types.NamespacedName, value: *PodMetrics + // key: types.NamespacedName, value: backendmetrics.PodMetrics pods *sync.Map + pmf *backendmetrics.PodMetricsFactory } func (ds *datastore) Clear() { @@ -227,68 +212,44 @@ func (ds *datastore) ModelGetAll() []*v1alpha2.InferenceModel { } // /// Pods/endpoints APIs /// -func (ds *datastore) PodUpdateMetricsIfExist(namespacedName types.NamespacedName, m *Metrics) bool { - if val, ok := ds.pods.Load(namespacedName); ok { - existing := val.(*PodMetrics) - existing.Metrics = *m - return true - } - return false -} -func (ds *datastore) PodGet(namespacedName types.NamespacedName) *PodMetrics { - val, ok := ds.pods.Load(namespacedName) - if ok { - return val.(*PodMetrics) - } - return nil +func (ds *datastore) PodGetAll() []backendmetrics.PodMetrics { + return ds.PodList(func(backendmetrics.PodMetrics) bool { return true }) } -func (ds *datastore) PodGetAll() []*PodMetrics { - res := []*PodMetrics{} +func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics { + res := []backendmetrics.PodMetrics{} fn := func(k, v any) bool { - res = append(res, v.(*PodMetrics)) + pm := v.(backendmetrics.PodMetrics) + if predicate(pm) { + res = append(res, pm) + } return true } ds.pods.Range(fn) return res } -func (ds *datastore) PodRange(f func(key, value any) bool) { - ds.pods.Range(f) -} - -func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { - ds.pods.Delete(namespacedName) -} - -func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { - new := &PodMetrics{ - Pod: Pod{ - NamespacedName: types.NamespacedName{ - Name: pod.Name, - Namespace: pod.Namespace, - }, - Address: pod.Status.PodIP, - }, - Metrics: Metrics{ - ActiveModels: make(map[string]int), - }, +func (ds *datastore) 
PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool { + namespacedName := types.NamespacedName{ + Name: pod.Name, + Namespace: pod.Namespace, } - existing, ok := ds.pods.Load(new.NamespacedName) + var pm backendmetrics.PodMetrics + existing, ok := ds.pods.Load(namespacedName) if !ok { - ds.pods.Store(new.NamespacedName, new) - return true + pm = ds.pmf.NewPodMetrics(ds.parentCtx, pod, ds) + ds.pods.Store(namespacedName, pm) + } else { + pm = existing.(backendmetrics.PodMetrics) } - // Update pod properties if anything changed. - existing.(*PodMetrics).Pod = new.Pod - return false + pm.UpdatePod(pod) + return ok } -func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client) { - // Pool must exist to invoke this function. - pool, _ := ds.PoolGet() +func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) { + logger := log.FromContext(ctx) podList := &corev1.PodList{} if err := ctrlClient.List(ctx, podList, &client.ListOptions{ LabelSelector: selectorFromInferencePoolSelector(pool.Spec.Selector), @@ -301,24 +262,34 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client) activePods := make(map[string]bool) for _, pod := range podList.Items { if podIsReady(&pod) { + namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} activePods[pod.Name] = true - ds.PodUpdateOrAddIfNotExist(&pod) + if ds.PodUpdateOrAddIfNotExist(&pod, pool) { + logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) + } else { + logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) + } } } // Remove pods that don't belong to the pool or not ready any more. 
deleteFn := func(k, v any) bool { - pm := v.(*PodMetrics) - if exist := activePods[pm.NamespacedName.Name]; !exist { - ds.pods.Delete(pm.NamespacedName) + pm := v.(backendmetrics.PodMetrics) + if exist := activePods[pm.GetPod().NamespacedName.Name]; !exist { + logger.V(logutil.VERBOSE).Info("Removing pod", "pod", pm.GetPod()) + ds.PodDelete(pm.GetPod().NamespacedName) } return true } ds.pods.Range(deleteFn) } -func (ds *datastore) PodDeleteAll() { - ds.pods.Clear() +func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { + v, ok := ds.pods.LoadAndDelete(namespacedName) + if ok { + pmr := v.(backendmetrics.PodMetrics) + pmr.StopRefreshLoop() + } } func selectorFromInferencePoolSelector(selector map[v1alpha2.LabelKey]v1alpha2.LabelValue) labels.Selector { diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 8fb269bc..f60a4cc9 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -17,13 +17,19 @@ limitations under the License. 
package datastore import ( + "context" + "errors" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" testutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" ) @@ -66,7 +72,8 @@ func TestPool(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - datastore := NewDatastore() + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + datastore := NewDatastore(context.Background(), pmf) datastore.PoolSet(tt.inferencePool) gotPool, gotErr := datastore.PoolGet() if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { @@ -197,7 +204,12 @@ func TestModel(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ds := NewFakeDatastore(nil, test.existingModels, nil) + pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) + ds := NewDatastore(t.Context(), pmf) + for _, m := range test.existingModels { + ds.ModelSetIfOlder(m) + } + gotOpResult := test.op(ds) if gotOpResult != test.wantOpResult { t.Errorf("Unexpected operation result, want: %v, got: %v", test.wantOpResult, gotOpResult) @@ -317,3 +329,119 @@ func TestRandomWeightedDraw(t *testing.T) { func pointer(v int32) *int32 { return &v } + +var ( + pod1 = &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + } + pod1Metrics = &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + } + pod2 = &corev1.Pod{ + ObjectMeta: 
metav1.ObjectMeta{ + Name: "pod2", + }, + } + pod2Metrics = &backendmetrics.Metrics{ + WaitingQueueSize: 1, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo1": 1, + "bar1": 1, + }, + } + pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} + pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace} + inferencePool = &v1alpha2.InferencePool{ + Spec: v1alpha2.InferencePoolSpec{ + TargetPortNumber: 8000, + }, + } +) + +func TestMetrics(t *testing.T) { + tests := []struct { + name string + pmc backendmetrics.PodMetricsClient + storePods []*corev1.Pod + want []*backendmetrics.Metrics + }{ + { + name: "Probing metrics success", + pmc: &backendmetrics.FakePodMetricsClient{ + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + pod2NamespacedName: pod2Metrics, + }, + }, + storePods: []*corev1.Pod{pod1, pod2}, + want: []*backendmetrics.Metrics{pod1Metrics, pod2Metrics}, + }, + { + name: "Only pods in are probed", + pmc: &backendmetrics.FakePodMetricsClient{ + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + pod2NamespacedName: pod2Metrics, + }, + }, + storePods: []*corev1.Pod{pod1}, + want: []*backendmetrics.Metrics{pod1Metrics}, + }, + { + name: "Probing metrics error", + pmc: &backendmetrics.FakePodMetricsClient{ + Err: map[types.NamespacedName]error{ + pod2NamespacedName: errors.New("injected error"), + }, + Res: map[types.NamespacedName]*backendmetrics.Metrics{ + pod1NamespacedName: pod1Metrics, + }, + }, + storePods: []*corev1.Pod{pod1, pod2}, + want: []*backendmetrics.Metrics{ + pod1Metrics, + // Failed to fetch pod2 metrics so it remains the default values. 
+ { + ActiveModels: map[string]int{}, + WaitingQueueSize: 0, + KVCacheUsagePercent: 0, + MaxActiveModels: 0, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + pmf := backendmetrics.NewPodMetricsFactory(test.pmc, time.Millisecond) + ds := NewDatastore(ctx, pmf) + ds.PoolSet(inferencePool) + for _, pod := range test.storePods { + ds.PodUpdateOrAddIfNotExist(pod, inferencePool) + } + assert.EventuallyWithT(t, func(t *assert.CollectT) { + got := ds.PodGetAll() + metrics := []*backendmetrics.Metrics{} + for _, one := range got { + metrics = append(metrics, one.GetMetrics()) + } + diff := cmp.Diff(test.want, metrics, cmpopts.IgnoreFields(backendmetrics.Metrics{}, "UpdateTime"), cmpopts.SortSlices(func(a, b *backendmetrics.Metrics) bool { + return a.String() < b.String() + })) + assert.Equal(t, "", diff, "Unexpected diff (+got/-want)") + }, 5*time.Second, time.Millisecond) + }) + } +} diff --git a/pkg/epp/datastore/types.go b/pkg/epp/datastore/types.go deleted file mode 100644 index 8cfcf1d1..00000000 --- a/pkg/epp/datastore/types.go +++ /dev/null @@ -1,71 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Package datastore is a library to interact with backend model servers such as probing metrics. 
-package datastore - -import ( - "fmt" - - "k8s.io/apimachinery/pkg/types" -) - -type Pod struct { - NamespacedName types.NamespacedName - Address string -} - -type Metrics struct { - // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. - ActiveModels map[string]int - // MaxActiveModels is the maximum number of models that can be loaded to GPU. - MaxActiveModels int - RunningQueueSize int - WaitingQueueSize int - KVCacheUsagePercent float64 - KvCacheMaxTokenCapacity int -} - -type PodMetrics struct { - Pod - Metrics -} - -func (pm *PodMetrics) String() string { - return fmt.Sprintf("Pod: %+v; Address: %+v; Metrics: %+v", pm.NamespacedName, pm.Address, pm.Metrics) -} - -func (pm *PodMetrics) Clone() *PodMetrics { - cm := make(map[string]int, len(pm.ActiveModels)) - for k, v := range pm.ActiveModels { - cm[k] = v - } - clone := &PodMetrics{ - Pod: Pod{ - NamespacedName: pm.NamespacedName, - Address: pm.Address, - }, - Metrics: Metrics{ - ActiveModels: cm, - MaxActiveModels: pm.MaxActiveModels, - RunningQueueSize: pm.RunningQueueSize, - WaitingQueueSize: pm.WaitingQueueSize, - KVCacheUsagePercent: pm.KVCacheUsagePercent, - KvCacheMaxTokenCapacity: pm.KvCacheMaxTokenCapacity, - }, - } - return clone -} diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 20271913..12afe4d7 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -94,10 +94,11 @@ func (s *Server) HandleRequestBody( loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) } - targetPod, err := s.scheduler.Schedule(ctx, llmReq) + target, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} } + targetPod := target.GetPod() logger.V(logutil.DEFAULT).Info("Request handled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) 
diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index bbdbe83e..be882fc7 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -26,6 +26,7 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" @@ -56,7 +57,7 @@ type Server struct { } type Scheduler interface { - Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod datastore.PodMetrics, err error) + Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backendmetrics.PodMetrics, err error) } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go index 821dd989..c8de7bb7 100644 --- a/pkg/epp/handlers/streamingserver.go +++ b/pkg/epp/handlers/streamingserver.go @@ -347,10 +347,11 @@ func (s *StreamingServer) HandleRequestBody( loggerVerbose.Info("Updated request body marshalled", "body", string(requestBodyBytes)) } - targetPod, err := s.scheduler.Schedule(ctx, llmReq) + target, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} } + targetPod := target.GetPod() // Insert target endpoint to instruct Envoy to route requests to the specified target pod. 
// Attach the port number diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index d3c22673..cee683c5 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -23,13 +23,13 @@ import ( "time" "github.com/go-logr/logr" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type Filter interface { Name() string - Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) + Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) } // filter applies current filterFunc, and then recursively applies next filters depending success or @@ -59,7 +59,7 @@ func (f *filter) Name() string { return f.name } -func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { loggerTrace := logger.V(logutil.TRACE) loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods)) @@ -92,12 +92,12 @@ func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []*datastore.P } // filterFunc filters a set of input pods to a subset. -type filterFunc func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) +type filterFunc func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. 
func toFilterFunc(pp podPredicate) filterFunc { - return func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { - filtered := []*datastore.PodMetrics{} + return func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { + filtered := []backendmetrics.PodMetrics{} for _, pod := range pods { pass := pp(req, pod) if pass { @@ -118,30 +118,30 @@ func toFilterFunc(pp podPredicate) filterFunc { // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. -func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { min := math.MaxInt max := 0 - filtered := []*datastore.PodMetrics{} + filtered := []backendmetrics.PodMetrics{} for _, pod := range pods { - if pod.WaitingQueueSize <= min { - min = pod.WaitingQueueSize + if pod.GetMetrics().WaitingQueueSize <= min { + min = pod.GetMetrics().WaitingQueueSize } - if pod.WaitingQueueSize >= max { - max = pod.WaitingQueueSize + if pod.GetMetrics().WaitingQueueSize >= max { + max = pod.GetMetrics().WaitingQueueSize } } for _, pod := range pods { - if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) { + if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) { filtered = append(filtered, pod) } } return filtered, nil } -func lowQueueingPodPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { - return pod.WaitingQueueSize < queueingThresholdLoRA +func lowQueueingPodPredicate(_ *LLMRequest, pod backendmetrics.PodMetrics) bool { + return pod.GetMetrics().WaitingQueueSize < queueingThresholdLoRA } // leastKVCacheFilterFunc finds 
the max and min KV cache of all pods, divides the whole range @@ -150,22 +150,22 @@ func lowQueueingPodPredicate(_ *LLMRequest, pod *datastore.PodMetrics) bool { // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. // TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { min := math.MaxFloat64 var max float64 = 0 - filtered := []*datastore.PodMetrics{} + filtered := []backendmetrics.PodMetrics{} for _, pod := range pods { - if pod.KVCacheUsagePercent <= min { - min = pod.KVCacheUsagePercent + if pod.GetMetrics().KVCacheUsagePercent <= min { + min = pod.GetMetrics().KVCacheUsagePercent } - if pod.KVCacheUsagePercent >= max { - max = pod.KVCacheUsagePercent + if pod.GetMetrics().KVCacheUsagePercent >= max { + max = pod.GetMetrics().KVCacheUsagePercent } } for _, pod := range pods { - if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { + if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { filtered = append(filtered, pod) } } @@ -173,16 +173,16 @@ func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []*datasto } // podPredicate is a filter function to check whether a pod is desired. -type podPredicate func(req *LLMRequest, pod *datastore.PodMetrics) bool +type podPredicate func(req *LLMRequest, pod backendmetrics.PodMetrics) bool // We consider serving an adapter low cost it the adapter is active in the model server, or the // model server has room to load the adapter. 
The lowLoRACostPredicate ensures weak affinity by // spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to // a single pod. This gave good performance in our initial benchmarking results in the scenario // where # of lora slots > # of lora adapters. -func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { - _, ok := pod.ActiveModels[req.ResolvedTargetModel] - return ok || len(pod.ActiveModels) < pod.MaxActiveModels +func lowLoRACostPredicate(req *LLMRequest, pod backendmetrics.PodMetrics) bool { + _, ok := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel] + return ok || len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels } // loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods @@ -201,18 +201,18 @@ func lowLoRACostPredicate(req *LLMRequest, pod *datastore.PodMetrics) bool { // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { +func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { // Pre-allocate slices with estimated capacity - filtered_affinity := make([]*datastore.PodMetrics, 0, len(pods)) - filtered_available := make([]*datastore.PodMetrics, 0, len(pods)) + filtered_affinity := make([]backendmetrics.PodMetrics, 0, len(pods)) + filtered_available := make([]backendmetrics.PodMetrics, 0, len(pods)) // Categorize pods based on affinity and availability for _, pod := range pods { - if _, exists := pod.ActiveModels[req.ResolvedTargetModel]; exists { + if _, exists := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel]; exists { filtered_affinity = append(filtered_affinity, pod) - } else if len(pod.ActiveModels) < pod.MaxActiveModels { + } else if 
len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels { filtered_available = append(filtered_available, pod) } } @@ -237,12 +237,12 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []*datasto return filtered_available, nil } -func criticalRequestPredicate(req *LLMRequest, _ *datastore.PodMetrics) bool { +func criticalRequestPredicate(req *LLMRequest, _ backendmetrics.PodMetrics) bool { return req.Critical } func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate { - return func(req *LLMRequest, pod *datastore.PodMetrics) bool { - return pod.WaitingQueueSize <= queueThreshold && pod.KVCacheUsagePercent <= kvCacheThreshold + return func(req *LLMRequest, pod backendmetrics.PodMetrics) bool { + return pod.GetMetrics().WaitingQueueSize <= queueThreshold && pod.GetMetrics().KVCacheUsagePercent <= kvCacheThreshold } } diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index f76cece9..62ffe7f2 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -23,7 +23,7 @@ import ( "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -33,14 +33,14 @@ func TestFilter(t *testing.T) { tests := []struct { name string req *LLMRequest - input []*datastore.PodMetrics - output []*datastore.PodMetrics + input []*backendmetrics.FakePodMetrics + output []*backendmetrics.FakePodMetrics err bool filter *filter }{ { name: "simple filter without successor, failure", - filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods 
[]backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { return nil, errors.New("filter error") }}, err: true, @@ -55,10 +55,10 @@ func TestFilter(t *testing.T) { }, // pod2 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. - input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -69,8 +69,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -81,8 +81,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -92,10 +92,10 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -116,10 +116,10 @@ func TestFilter(t *testing.T) { Critical: false, }, // pod1 will be picked because it has capacity for the sheddable request. 
- input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -130,8 +130,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, MaxActiveModels: 2, @@ -142,8 +142,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -153,10 +153,10 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, MaxActiveModels: 2, @@ -178,10 +178,10 @@ func TestFilter(t *testing.T) { }, // All pods have higher KV cache thant the threshold, so the sheddable request will be // dropped. 
- input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, MaxActiveModels: 2, @@ -192,8 +192,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.85, MaxActiveModels: 2, @@ -204,8 +204,8 @@ func TestFilter(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.85, MaxActiveModels: 2, @@ -215,19 +215,19 @@ func TestFilter(t *testing.T) { }, }, }, - output: []*datastore.PodMetrics{}, + output: []*backendmetrics.FakePodMetrics{}, err: true, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := test.filter.Filter(logger, test.req, test.input) + got, err := test.filter.Filter(logger, test.req, toInterface(test.input)) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got); diff != "" { + if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -241,44 +241,44 @@ func TestFilterFunc(t *testing.T) { name string f filterFunc req *LLMRequest - input []*datastore.PodMetrics - output []*datastore.PodMetrics + input []*backendmetrics.FakePodMetrics + output []*backendmetrics.FakePodMetrics err bool }{ { name: "least queuing empty input", f: 
leastQueuingFilterFunc, - input: []*datastore.PodMetrics{}, - output: []*datastore.PodMetrics{}, + input: []*backendmetrics.FakePodMetrics{}, + output: []*backendmetrics.FakePodMetrics{}, }, { name: "least queuing", f: leastQueuingFilterFunc, - input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, }, }, @@ -287,37 +287,37 @@ func TestFilterFunc(t *testing.T) { { name: "least kv cache empty input", f: leastKVCacheFilterFunc, - input: []*datastore.PodMetrics{}, - output: []*datastore.PodMetrics{}, + input: []*backendmetrics.FakePodMetrics{}, + output: []*backendmetrics.FakePodMetrics{}, }, { name: "least kv cache", f: leastKVCacheFilterFunc, - input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0.3, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 1.0, }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0.3, }, }, @@ -326,32 +326,32 @@ func TestFilterFunc(t *testing.T) { { name: "noQueueAndLessThanKVCacheThresholdPredicate", f: 
toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), - input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ { // This pod should be returned. - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, }, }, { // Queue is non zero, despite low kv cache, should not return. - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.3, }, }, { // High kv cache despite zero queue, should not return - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 1.0, }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, }, @@ -365,10 +365,10 @@ func TestFilterFunc(t *testing.T) { Model: "model", ResolvedTargetModel: "model", }, - input: []*datastore.PodMetrics{ + input: []*backendmetrics.FakePodMetrics{ // ActiveModels include input model, should be returned. { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "model": 1, @@ -377,7 +377,7 @@ func TestFilterFunc(t *testing.T) { }, // Input model is not active, however the server has room to load another adapter. { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "another-model": 1, @@ -386,7 +386,7 @@ func TestFilterFunc(t *testing.T) { }, // Input is not active, and the server has reached max active models. 
{ - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "foo": 1, @@ -395,9 +395,9 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*datastore.PodMetrics{ + output: []*backendmetrics.FakePodMetrics{ { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "model": 1, @@ -405,7 +405,7 @@ func TestFilterFunc(t *testing.T) { }, }, { - Metrics: datastore.Metrics{ + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ "another-model": 1, @@ -418,12 +418,12 @@ func TestFilterFunc(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := test.f(logger, test.req, test.input) + got, err := test.f(logger, test.req, toInterface(test.input)) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got); diff != "" { + if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -449,10 +449,10 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { } // Test setup: One affinity pod and one available pod - pods := []*datastore.PodMetrics{ + pods := []*backendmetrics.FakePodMetrics{ { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ testAffinityModel: 1, @@ -460,8 +460,8 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }, }, { - Pod: datastore.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, - Metrics: datastore.Metrics{ + Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, + Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: 
map[string]int{}, }, @@ -476,7 +476,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { // This test should work with whatever value is set there expectedAffinityPercent := loraAffinityThreshold * 100 for i := 0; i < numIterations; i++ { - result, err := loRASoftAffinityFilter(logger, req, pods) + result, err := loRASoftAffinityFilter(logger, req, toInterface(pods)) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -487,7 +487,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { } // Identify if the returned pod is the affinity pod or available pod - if _, exists := result[0].ActiveModels[testAffinityModel]; exists { + if _, exists := result[0].GetMetrics().ActiveModels[testAffinityModel]; exists { affinityCount++ } else { availableCount++ @@ -519,3 +519,22 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { actualAvailablePercent, availableLowerBound, availableUpperBound) } } + +func toInterface(input []*backendmetrics.FakePodMetrics) []backendmetrics.PodMetrics { + output := []backendmetrics.PodMetrics{} + for _, i := range input { + output = append(output, i) + } + return output +} + +func toStruct(input []backendmetrics.PodMetrics) []*backendmetrics.FakePodMetrics { + if input == nil { + return nil + } + output := []*backendmetrics.FakePodMetrics{} + for _, i := range input { + output = append(output, i.(*backendmetrics.FakePodMetrics)) + } + return output +} diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index bdddd972..82410787 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -24,6 +24,7 @@ import ( "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ 
-97,9 +98,9 @@ var ( // request to make room for critical requests. nextOnFailure: &filter{ name: "drop request", - filter: func(logger logr.Logger, req *LLMRequest, pods []*datastore.PodMetrics) ([]*datastore.PodMetrics, error) { + filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) - return []*datastore.PodMetrics{}, errutil.Error{ + return []backendmetrics.PodMetrics{}, errutil.Error{ Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", } }, @@ -120,16 +121,16 @@ type Scheduler struct { } // Schedule finds the target pod based on metrics and the requested lora adapter. -func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod datastore.PodMetrics, err error) { +func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backendmetrics.PodMetrics, err error) { logger := log.FromContext(ctx).WithValues("request", req) podMetrics := s.datastore.PodGetAll() logger.V(logutil.VERBOSE).Info("Scheduling a request", "metrics", podMetrics) pods, err := s.filter.Filter(logger, req, podMetrics) if err != nil || len(pods) == 0 { - return datastore.PodMetrics{}, fmt.Errorf( + return nil, fmt.Errorf( "failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } logger.V(logutil.VERBOSE).Info("Selecting a random pod from the candidates", "candidatePods", pods) i := rand.Intn(len(pods)) - return *pods[i], nil + return pods[i], nil } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 5b8269c1..a6c9f1d3 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -31,7 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" - 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" @@ -45,13 +45,15 @@ type ExtProcServerRunner struct { DestinationEndpointHintKey string PoolName string PoolNamespace string - RefreshMetricsInterval time.Duration - RefreshPrometheusMetricsInterval time.Duration Datastore datastore.Datastore - Provider *backend.Provider SecureServing bool CertPath string UseStreaming bool + RefreshPrometheusMetricsInterval time.Duration + + // This should only be used in tests. We won't need this once we don't inject metrics in the tests. + // TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup + TestPodMetricsClient *backendmetrics.FakePodMetricsClient } // Default values for CLI flags in main @@ -73,8 +75,6 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace, PoolName: DefaultPoolName, PoolNamespace: DefaultPoolNamespace, - RefreshMetricsInterval: DefaultRefreshMetricsInterval, - RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, SecureServing: DefaultSecureServing, // Datastore can be assigned later. } @@ -121,12 +121,7 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man // The runnable implements LeaderElectionRunnable with leader election disabled. 
func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { return runnable.NoLeaderElection(manager.RunnableFunc(func(ctx context.Context) error { - // Initialize backend provider - if err := r.Provider.Init(ctx, r.RefreshMetricsInterval, r.RefreshPrometheusMetricsInterval); err != nil { - logger.Error(err, "Failed to initialize backend provider") - return err - } - + backendmetrics.StartMetricsLogger(ctx, r.Datastore, r.RefreshPrometheusMetricsInterval) var srv *grpc.Server if r.SecureServing { var cert tls.Certificate diff --git a/pkg/epp/test/benchmark/benchmark.go b/pkg/epp/test/benchmark/benchmark.go deleted file mode 100644 index 67783480..00000000 --- a/pkg/epp/test/benchmark/benchmark.go +++ /dev/null @@ -1,145 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package main - -import ( - "context" - "flag" - "fmt" - "os" - "time" - - "github.com/bojand/ghz/printer" - "github.com/bojand/ghz/runner" - "github.com/go-logr/logr" - "github.com/jhump/protoreflect/desc" - uberzap "go.uber.org/zap" - "google.golang.org/protobuf/proto" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/log/zap" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -var ( - svrAddr = flag.String("server_address", fmt.Sprintf("localhost:%d", runserver.DefaultGrpcPort), "Address of the ext proc server") - totalRequests = flag.Int("total_requests", 100000, "number of requests to be sent for load test") - // Flags when running a local ext proc server. - numFakePods = flag.Int("num_fake_pods", 200, "number of fake pods when running a local ext proc server") - numModelsPerPod = flag.Int("num_models_per_pod", 5, "number of fake models per pod when running a local ext proc server") - localServer = flag.Bool("local_server", true, "whether to start a local ext proc server") - refreshPodsInterval = flag.Duration("refreshPodsInterval", 10*time.Second, "interval to refresh pods") - refreshMetricsInterval = flag.Duration("refreshMetricsInterval", 50*time.Millisecond, "interval to refresh metrics via polling pods") - refreshPrometheusMetricsInterval = flag.Duration("refreshPrometheusMetricsInterval", 5*time.Second, "interval to flush prometheus metrics") -) - -const ( - port = runserver.DefaultGrpcPort -) - -func main() { - if err := run(); err != nil { - os.Exit(1) - } -} - -func run() error { - opts := zap.Options{ - Development: true, - } - opts.BindFlags(flag.CommandLine) - flag.Parse() - logger := zap.New(zap.UseFlagOptions(&opts), 
zap.RawZapOpts(uberzap.AddCaller())) - ctx := log.IntoContext(context.Background(), logger) - - if *localServer { - test.StartExtProc(ctx, port, *refreshPodsInterval, *refreshMetricsInterval, *refreshPrometheusMetricsInterval, fakePods(), fakeModels()) - time.Sleep(time.Second) // wait until server is up - logger.Info("Server started") - } - - report, err := runner.Run( - "envoy.service.ext_proc.v3.ExternalProcessor.Process", - *svrAddr, - runner.WithInsecure(true), - runner.WithBinaryDataFunc(generateRequestFunc(logger)), - runner.WithTotalRequests(uint(*totalRequests)), - ) - if err != nil { - logger.Error(err, "Runner failed") - return err - } - - printer := printer.ReportPrinter{ - Out: os.Stdout, - Report: report, - } - - printer.Print("summary") - return nil -} - -func generateRequestFunc(logger logr.Logger) func(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { - return func(mtd *desc.MethodDescriptor, callData *runner.CallData) []byte { - numModels := *numFakePods * (*numModelsPerPod) - req := test.GenerateRequest(logger, "hello", modelName(int(callData.RequestNumber)%numModels)) - data, err := proto.Marshal(req) - if err != nil { - logutil.Fatal(logger, err, "Failed to marshal request", "request", req) - } - return data - } -} - -func fakeModels() map[string]*v1alpha2.InferenceModel { - models := map[string]*v1alpha2.InferenceModel{} - for i := range *numFakePods { - for j := range *numModelsPerPod { - m := modelName(i*(*numModelsPerPod) + j) - models[m] = &v1alpha2.InferenceModel{Spec: v1alpha2.InferenceModelSpec{ModelName: m}} - } - } - - return models -} - -func fakePods() []*datastore.PodMetrics { - pms := make([]*datastore.PodMetrics, 0, *numFakePods) - for i := 0; i < *numFakePods; i++ { - pms = append(pms, test.FakePodMetrics(i, fakeMetrics(i))) - } - - return pms -} - -// fakeMetrics adds numModelsPerPod number of adapters to the pod metrics. 
-func fakeMetrics(podNumber int) datastore.Metrics { - metrics := datastore.Metrics{ - ActiveModels: make(map[string]int), - } - for i := 0; i < *numModelsPerPod; i++ { - metrics.ActiveModels[modelName(podNumber*(*numModelsPerPod)+i)] = 0 - } - return metrics -} - -func modelName(i int) string { - return fmt.Sprintf("adapter-%v", i) -} diff --git a/pkg/epp/test/utils.go b/pkg/epp/test/utils.go deleted file mode 100644 index b18b0919..00000000 --- a/pkg/epp/test/utils.go +++ /dev/null @@ -1,126 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package test - -import ( - "context" - "encoding/json" - "fmt" - "net" - "time" - - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/go-logr/logr" - "google.golang.org/grpc" - "google.golang.org/grpc/reflection" - "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" -) - -func StartExtProc( - ctx context.Context, - port int, - refreshPodsInterval, refreshMetricsInterval, refreshPrometheusMetricsInterval time.Duration, - pods []*datastore.PodMetrics, - models map[string]*v1alpha2.InferenceModel, -) *grpc.Server { - logger := log.FromContext(ctx) - pms := make(map[types.NamespacedName]*datastore.PodMetrics) - for _, pod := range pods { - pms[pod.NamespacedName] = pod - } - pmc := &backend.FakePodMetricsClient{Res: pms} - datastore := datastore.NewDatastore() - for _, m := range models { - datastore.ModelSetIfOlder(m) - } - for _, pm := range pods { - pod := utiltesting.MakePod(pm.NamespacedName.Name). - Namespace(pm.NamespacedName.Namespace). - ReadyCondition(). - IP(pm.Address). - ObjRef() - datastore.PodUpdateOrAddIfNotExist(pod) - datastore.PodUpdateMetricsIfExist(pm.NamespacedName, &pm.Metrics) - } - pp := backend.NewProvider(pmc, datastore) - if err := pp.Init(ctx, refreshMetricsInterval, refreshPrometheusMetricsInterval); err != nil { - logutil.Fatal(logger, err, "Failed to initialize") - } - return startExtProc(logger, port, datastore) -} - -// startExtProc starts an extProc server with fake pods. 
-func startExtProc(logger logr.Logger, port int, datastore datastore.Datastore) *grpc.Server { - lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port)) - if err != nil { - logutil.Fatal(logger, err, "Failed to listen", "port", port) - } - - s := grpc.NewServer() - - extProcPb.RegisterExternalProcessorServer(s, handlers.NewServer(scheduling.NewScheduler(datastore), "", "target-pod", datastore)) - - logger.Info("gRPC server starting", "port", port) - reflection.Register(s) - go func() { - err := s.Serve(lis) - if err != nil { - logutil.Fatal(logger, err, "Ext-proc failed with the err") - } - }() - return s -} - -func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest { - j := map[string]interface{}{ - "model": model, - "prompt": prompt, - "max_tokens": 100, - "temperature": 0, - } - - llmReq, err := json.Marshal(j) - if err != nil { - logutil.Fatal(logger, err, "Failed to unmarshal LLM request") - } - req := &extProcPb.ProcessingRequest{ - Request: &extProcPb.ProcessingRequest_RequestBody{ - RequestBody: &extProcPb.HttpBody{Body: llmReq}, - }, - } - return req -} - -func FakePodMetrics(index int, metrics datastore.Metrics) *datastore.PodMetrics { - address := fmt.Sprintf("192.168.1.%d", index+1) - pod := datastore.PodMetrics{ - Pod: datastore.Pod{ - NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"}, - Address: address, - }, - Metrics: metrics, - } - return &pod -} diff --git a/pkg/epp/util/testing/request.go b/pkg/epp/util/testing/request.go new file mode 100644 index 00000000..fe9a0d08 --- /dev/null +++ b/pkg/epp/util/testing/request.go @@ -0,0 +1,45 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package testing + +import ( + "encoding/json" + + extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" + "github.com/go-logr/logr" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest { + j := map[string]interface{}{ + "model": model, + "prompt": prompt, + "max_tokens": 100, + "temperature": 0, + } + + llmReq, err := json.Marshal(j) + if err != nil { + logutil.Fatal(logger, err, "Failed to unmarshal LLM request") + } + req := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: llmReq}, + }, + } + return req +} diff --git a/pkg/epp/util/testing/wrappers.go b/pkg/epp/util/testing/wrappers.go index 2693734f..c4018631 100644 --- a/pkg/epp/util/testing/wrappers.go +++ b/pkg/epp/util/testing/wrappers.go @@ -27,6 +27,12 @@ type PodWrapper struct { corev1.Pod } +func FromBase(pod *corev1.Pod) *PodWrapper { + return &PodWrapper{ + Pod: *pod, + } +} + // MakePod creates a wrapper for a Pod. 
func MakePod(podName string) *PodWrapper { return &PodWrapper{ diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 765449f3..c5e7c10a 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -55,12 +55,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" - extprocutils "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/test" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" "sigs.k8s.io/yaml" @@ -83,7 +82,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { tests := []struct { name string req *extProcPb.ProcessingRequest - pods []*datastore.PodMetrics + pods map[backendmetrics.Pod]*backendmetrics.Metrics wantHeaders []*configPb.HeaderValueOption wantMetadata *structpb.Struct wantBody []byte @@ -93,21 +92,21 @@ func TestKubeInferenceModelRequest(t *testing.T) { }{ { name: "select lower queue and kv cache, no active lora", - req: extprocutils.GenerateRequest(logger, "test1", "my-model"), + req: utiltesting.GenerateRequest(logger, "test1", "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. 
- pods: []*datastore.PodMetrics{ - extprocutils.FakePodMetrics(0, datastore.Metrics{ + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { WaitingQueueSize: 3, KVCacheUsagePercent: 0.2, - }), - extprocutils.FakePodMetrics(1, datastore.Metrics{ + }, + fakePod(1): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.1, - }), - extprocutils.FakePodMetrics(2, datastore.Metrics{ + }, + fakePod(2): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, - }), + }, }, wantHeaders: []*configPb.HeaderValueOption{ { @@ -134,34 +133,34 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select active lora, low queue", - req: extprocutils.GenerateRequest(logger, "test2", "sql-lora"), + req: utiltesting.GenerateRequest(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. - pods: []*datastore.PodMetrics{ - extprocutils.FakePodMetrics(0, datastore.Metrics{ + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, "bar": 1, }, - }), - extprocutils.FakePodMetrics(1, datastore.Metrics{ + }, + fakePod(1): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg2": 1, }, - }), - extprocutils.FakePodMetrics(2, datastore.Metrics{ + }, + fakePod(2): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, "bar": 1, }, - }), + }, }, wantHeaders: []*configPb.HeaderValueOption{ { @@ -188,34 +187,34 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select no lora despite active model, avoid excessive queue size", - req: extprocutils.GenerateRequest(logger, "test3", "sql-lora"), + req: utiltesting.GenerateRequest(logger, "test3", "sql-lora"), // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. 
Also is critical, so we should // still honor request despite all queues > 5 - pods: []*datastore.PodMetrics{ - extprocutils.FakePodMetrics(0, datastore.Metrics{ + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, "bar": 1, }, - }), - extprocutils.FakePodMetrics(1, datastore.Metrics{ + }, + fakePod(1): { WaitingQueueSize: 200, KVCacheUsagePercent: 0.1, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg2": 1, }, - }), - extprocutils.FakePodMetrics(2, datastore.Metrics{ + }, + fakePod(2): { WaitingQueueSize: 6, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, }, - }), + }, }, wantHeaders: []*configPb.HeaderValueOption{ { @@ -242,11 +241,11 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical and all models past threshold, shed request", - req: extprocutils.GenerateRequest(logger, "test4", "sql-lora-sheddable"), + req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. 
- pods: []*datastore.PodMetrics{ - extprocutils.FakePodMetrics(0, datastore.Metrics{ + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { WaitingQueueSize: 6, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -254,23 +253,23 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, - }), - extprocutils.FakePodMetrics(1, datastore.Metrics{ + }, + fakePod(1): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.85, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg3": 1, }, - }), - extprocutils.FakePodMetrics(2, datastore.Metrics{ + }, + fakePod(2): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg3": 1, }, - }), + }, }, wantHeaders: []*configPb.HeaderValueOption{}, wantMetadata: &structpb.Struct{}, @@ -285,10 +284,10 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical, but one server has capacity, do not shed", - req: extprocutils.GenerateRequest(logger, "test5", "sql-lora-sheddable"), + req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold - pods: []*datastore.PodMetrics{ - extprocutils.FakePodMetrics(0, datastore.Metrics{ + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ @@ -296,23 +295,23 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, - }), - extprocutils.FakePodMetrics(1, datastore.Metrics{ + }, + fakePod(1): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.85, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg3": 1, }, - }), - extprocutils.FakePodMetrics(2, datastore.Metrics{ + }, + fakePod(2): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, ActiveModels: map[string]int{ "foo": 1, "sql-lora-1fdg3": 1, }, - }), + }, }, wantHeaders: []*configPb.HeaderValueOption{ { @@ -391,12 +390,13 @@ func 
TestKubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { - pms := make(map[types.NamespacedName]*datastore.PodMetrics) - for _, pm := range podMetrics { - pms[pm.NamespacedName] = pm +func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { + // Reconfigure the TestPodMetricsClient. + res := map[types.NamespacedName]*backendmetrics.Metrics{} + for pod, metrics := range podAndMetrics { + res[pod.NamespacedName] = metrics } - pmc := &backend.FakePodMetricsClient{Res: pms} + serverRunner.TestPodMetricsClient.SetRes(res) serverCtx, stopServer := context.WithCancel(context.Background()) @@ -405,27 +405,26 @@ func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (clie "app": "vllm-llama2-7b-pool", } - for _, pm := range podMetrics { - pod := utiltesting.MakePod(pm.NamespacedName.Name). - Namespace(pm.NamespacedName.Namespace). + for pod := range podAndMetrics { + pod := utiltesting.MakePod(pod.NamespacedName.Name). + Namespace(pod.NamespacedName.Namespace). ReadyCondition(). Labels(podLabels). - IP(pm.Address). + IP(pod.Address). Complete(). 
ObjRef() copy := pod.DeepCopy() if err := k8sClient.Create(context.Background(), copy); err != nil { - logutil.Fatal(logger, err, "Failed to create pod", "pod", pm.NamespacedName) + logutil.Fatal(logger, err, "Failed to create pod", "pod", pod) } // since no pod controllers deployed in fake environment, we manually update pod status copy.Status = pod.Status if err := k8sClient.Status().Update(context.Background(), copy); err != nil { - logutil.Fatal(logger, err, "Failed to update pod status", "pod", pm.NamespacedName) + logutil.Fatal(logger, err, "Failed to update pod status", "pod", pod) } } - serverRunner.Provider = backend.NewProvider(pmc, serverRunner.Datastore) go func() { if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { logutil.Fatal(logger, err, "Failed to start ext-proc server") @@ -434,7 +433,7 @@ func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (clie // check if all pods are synced to datastore assert.EventuallyWithT(t, func(t *assert.CollectT) { - assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podMetrics), "Datastore not synced") + assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podAndMetrics), "Datastore not synced") }, 10*time.Second, time.Second) address := fmt.Sprintf("localhost:%v", port) @@ -455,12 +454,12 @@ func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (clie stopServer() // clear created pods - for _, pm := range podMetrics { - pod := utiltesting.MakePod(pm.NamespacedName.Name). - Namespace(pm.NamespacedName.Namespace).Complete().ObjRef() + for pod := range podAndMetrics { + pod := utiltesting.MakePod(pod.NamespacedName.Name). 
+ Namespace(pod.NamespacedName.Namespace).Complete().ObjRef() if err := k8sClient.Delete(context.Background(), pod); err != nil { - logutil.Fatal(logger, err, "Failed to delete pod", "pod", pm.NamespacedName) + logutil.Fatal(logger, err, "Failed to delete pod", "pod", fakePod) } } // wait a little until the goroutines actually exit @@ -468,6 +467,13 @@ func setUpHermeticServer(t *testing.T, podMetrics []*datastore.PodMetrics) (clie } } +func fakePod(index int) backendmetrics.Pod { + return backendmetrics.Pod{ + NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"}, + Address: fmt.Sprintf("192.168.1.%d", index+1), + } +} + // Sets up a test environment and returns the runner struct func BeforeSuit(t *testing.T) func() { // Set up mock k8s API Client @@ -503,9 +509,11 @@ func BeforeSuit(t *testing.T) func() { } serverRunner = runserver.NewDefaultExtProcServerRunner() + serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{} + pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond) // Adjust from defaults serverRunner.PoolName = "vllm-llama2-7b-pool" - serverRunner.Datastore = datastore.NewDatastore() + serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf) serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil { From 1dc768f171add007baba46873eeaad1aa40f1152 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Mon, 10 Mar 2025 23:09:47 +0200 Subject: [PATCH 83/96] fixed broken link (#467) Signed-off-by: Nir Rozenbaum --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c500602c..6ad19cdb 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ As Inference Gateway builds towards a GA release. 
We will continue to expand our ## End-to-End Tests -Follow this [README](./test/e2e/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster. +Follow this [README](./test/e2e/epp/README.md) to learn more about running the inference-extension end-to-end test suite on your cluster. ## Contributing From 910407e758915a8025878038d36c695b035cc532 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Mon, 10 Mar 2025 23:23:46 +0200 Subject: [PATCH 84/96] fixed minimal requirement for envoy version (#466) Signed-off-by: Nir Rozenbaum --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index b7b31000..8bcee6e2 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -3,7 +3,7 @@ This quickstart guide is intended for engineers familiar with k8s and model servers (vLLM in this instance). The goal of this guide is to get a first, single InferencePool up and running! ## **Prerequisites** - - Envoy Gateway [v1.2.1](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher + - Envoy Gateway [v1.3.0](https://gateway.envoyproxy.io/docs/install/install-yaml/#install-with-yaml) or higher - A cluster with: - Support for services of typs `LoadBalancer`. (This can be validated by ensuring your Envoy Gateway is up and running). For example, with Kind, you can follow [these steps](https://kind.sigs.k8s.io/docs/user/loadbalancer). From 59a772d6f404257caf63f87212700ca44ec90361 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 10 Mar 2025 16:49:47 -0700 Subject: [PATCH 85/96] Bump github.com/onsi/ginkgo/v2 from 2.22.2 to 2.23.0 (#473) Bumps [github.com/onsi/ginkgo/v2](https://github.com/onsi/ginkgo) from 2.22.2 to 2.23.0. 
- [Release notes](https://github.com/onsi/ginkgo/releases) - [Changelog](https://github.com/onsi/ginkgo/blob/master/CHANGELOG.md) - [Commits](https://github.com/onsi/ginkgo/compare/v2.22.2...v2.23.0) --- updated-dependencies: - dependency-name: github.com/onsi/ginkgo/v2 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 34 ++++++++------------- go.sum | 95 +++++++++++++--------------------------------------------- 2 files changed, 33 insertions(+), 96 deletions(-) diff --git a/go.mod b/go.mod index 91173449..3342d001 100644 --- a/go.mod +++ b/go.mod @@ -5,13 +5,11 @@ go 1.23.0 toolchain go1.23.2 require ( - github.com/bojand/ghz v0.120.0 github.com/elastic/crd-ref-docs v0.1.0 github.com/envoyproxy/go-control-plane/envoy v1.32.4 github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.7.0 - github.com/jhump/protoreflect v1.17.0 - github.com/onsi/ginkgo/v2 v2.22.2 + github.com/onsi/ginkgo/v2 v2.23.0 github.com/onsi/gomega v1.36.2 github.com/prometheus/client_golang v1.21.0 github.com/prometheus/client_model v0.6.1 @@ -35,26 +33,18 @@ require ( require ( cel.dev/expr v0.19.0 // indirect - cloud.google.com/go/compute/metadata v0.5.2 // indirect - github.com/BurntSushi/toml v1.1.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver v1.5.0 // indirect - github.com/Masterminds/semver/v3 v3.2.0 // indirect github.com/Masterminds/sprig v2.22.0+incompatible // indirect - github.com/Masterminds/sprig/v3 v3.2.3 // indirect - github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/antlr4-go/antlr/v4 v4.13.0 // indirect github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect - github.com/bufbuild/protocompile v0.14.1 // indirect 
github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/dustin/go-humanize v1.0.1 // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect - github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fatih/color v1.16.0 // indirect @@ -66,6 +56,8 @@ require ( github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect + github.com/go-playground/locales v0.14.1 // indirect + github.com/go-playground/universal-translator v0.18.0 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gobuffalo/flect v1.0.2 // indirect github.com/goccy/go-yaml v1.11.3 // indirect @@ -82,11 +74,11 @@ require ( github.com/huandu/xstrings v1.3.3 // indirect github.com/imdario/mergo v0.3.11 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/jinzhu/configor v1.2.1 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/klauspost/compress v1.17.11 // indirect github.com/kylelemons/godebug v1.1.0 // indirect + github.com/leodido/go-urn v1.2.1 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.13 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -101,8 +93,6 @@ require ( github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/procfs v0.15.1 // indirect - github.com/shopspring/decimal v1.2.0 // indirect - github.com/spf13/cast v1.4.1 // indirect github.com/spf13/cobra 
v1.8.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect @@ -115,17 +105,17 @@ require ( go.opentelemetry.io/otel/sdk v1.32.0 // indirect go.opentelemetry.io/otel/trace v1.32.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect - golang.org/x/crypto v0.32.0 // indirect + golang.org/x/crypto v0.33.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.22.0 // indirect - golang.org/x/net v0.34.0 // indirect + golang.org/x/mod v0.23.0 // indirect + golang.org/x/net v0.35.0 // indirect golang.org/x/oauth2 v0.24.0 // indirect - golang.org/x/sync v0.10.0 // indirect - golang.org/x/sys v0.29.0 // indirect - golang.org/x/term v0.28.0 // indirect - golang.org/x/text v0.21.0 // indirect + golang.org/x/sync v0.11.0 // indirect + golang.org/x/sys v0.30.0 // indirect + golang.org/x/term v0.29.0 // indirect + golang.org/x/text v0.22.0 // indirect golang.org/x/time v0.7.0 // indirect - golang.org/x/tools v0.28.0 // indirect + golang.org/x/tools v0.30.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a // indirect diff --git a/go.sum b/go.sum index f55f404b..2e18e4ad 100644 --- a/go.sum +++ b/go.sum @@ -1,22 +1,11 @@ cel.dev/expr v0.19.0 h1:lXuo+nDhpyJSpWxpPVi5cPUwzKb+dsdOiw6IreM5yt0= cel.dev/expr v0.19.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= -cloud.google.com/go/compute/metadata v0.5.2 h1:UxK4uu/Tn+I3p2dYWTfiX4wva7aYlKixAHn3fyqngqo= -cloud.google.com/go/compute/metadata v0.5.2/go.mod h1:C66sj2AluDcIqakBq/M8lw8/ybHgOZqin2obFxa/E5k= -github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/BurntSushi/toml v1.1.0 h1:ksErzDEI1khOiGPgpwuI7x2ebx/uXQNw7xJpn9Eq1+I= -github.com/BurntSushi/toml v1.1.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= 
github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= -github.com/Masterminds/semver/v3 v3.2.0 h1:3MEsd0SM6jqZojhjLWWeBY+Kcjy9i6MQAeY7YgDP83g= -github.com/Masterminds/semver/v3 v3.2.0/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= github.com/Masterminds/sprig v2.22.0+incompatible h1:z4yfnGrZ7netVz+0EDJ0Wi+5VZCSYp4Z0m2dk6cEM60= github.com/Masterminds/sprig v2.22.0+incompatible/go.mod h1:y6hNFY5UBTIWBxnzTeuNhlNS5hqE0NB0E6fgfo2Br3o= -github.com/Masterminds/sprig/v3 v3.2.3 h1:eL2fZNezLomi0uOLqjQoN6BfsDD+fyLtgbJMAj9n6YA= -github.com/Masterminds/sprig/v3 v3.2.3/go.mod h1:rXcFaZ2zZbLRJv/xSysmlgIM1u11eBaRMhvYXJNkGuM= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8TVTI= github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= @@ -27,10 +16,6 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= -github.com/bojand/ghz v0.120.0 h1:6F4wsmZVwFg5UnD+/R+IABWk6sKE/0OKIBdUQUZnOdo= -github.com/bojand/ghz v0.120.0/go.mod h1:HfECuBZj1v02XObGnRuoZgyB1PR24/25dIYiJIMjJnE= 
-github.com/bufbuild/protocompile v0.14.1 h1:iA73zAf/fyljNjQKwYzUHD6AD4R8KMasmwa/FBatYVw= -github.com/bufbuild/protocompile v0.14.1/go.mod h1:ppVdAIhbr2H8asPk6k4pY7t9zB1OU5DoEw9xY/FUi1c= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -43,18 +28,12 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= -github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elastic/crd-ref-docs v0.1.0 h1:Cr5kz89QB3Iuuj7dhAfLMApCrChEGAaIBTxGk/xuRKw= github.com/elastic/crd-ref-docs v0.1.0/go.mod h1:X83mMBdJt05heJUYiS3T0yJ/JkCuliuhSUNav5Gjo/U= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.4 h1:zEqyPVyku6IvWCFwux4x9RxkLOMUL+1vC9xUFv5l2/M= -github.com/envoyproxy/go-control-plane v0.13.4/go.mod h1:kDfuBlDVsSj2MjrLEtRWtHlsWIFcGyB2RMO44Dc5GZA= github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= -github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= -github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod 
h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= @@ -84,11 +63,11 @@ github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs= github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA= github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= github.com/go-playground/universal-translator v0.18.0 h1:82dyy6p4OuJq4/CByFNOn/jYrnRPArHwAcmLoJZxyho= github.com/go-playground/universal-translator v0.18.0/go.mod h1:UvRDBj+xPUEGrFYl+lu/H90nyDXpg0fqeB/AQUGNTVA= -github.com/go-playground/validator v9.31.0+incompatible h1:UA72EPEogEnq76ehGdEDp4Mit+3FDh548oRqwVgNsHA= github.com/go-playground/validator/v10 v10.4.1 h1:pH2c5ADXtd66mxoE0Zm9SUhxE20r7aM3F26W0hOn+GE= github.com/go-playground/validator/v10 v10.4.1/go.mod h1:nlOn6nFhuKACm19sB/8EGNn9GlaMV7XkbRSipzJ0Ii4= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= @@ -115,7 +94,6 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= 
-github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= @@ -128,10 +106,6 @@ github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= -github.com/jhump/protoreflect v1.17.0 h1:qOEr613fac2lOuTgWN4tPAtLL7fUSbuJL5X5XumQh94= -github.com/jhump/protoreflect v1.17.0/go.mod h1:h9+vUUL38jiBzck8ck+6G/aeMX8Z4QUY/NiJPwPNi+8= -github.com/jinzhu/configor v1.2.1 h1:OKk9dsR8i6HPOCZR8BcMtcEImAFjIhbJFZNyn5GCZko= -github.com/jinzhu/configor v1.2.1/go.mod h1:nX89/MOmDba7ZX7GCyU/VIaQ2Ar2aizBl2d3JLF/rDc= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -158,10 +132,8 @@ github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovk github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mitchellh/copystructure v1.0.0/go.mod h1:SNtv71yrdKgLRyLFxmLdkAbkKEFWgYaq1OVrnRcwhnw= github.com/mitchellh/copystructure v1.2.0 h1:vpKXTN4ewci03Vljg/q9QvCGUDttBOGBIa15WveJJGw= github.com/mitchellh/copystructure v1.2.0/go.mod h1:qLl+cE2AmVv+CoeAwDPye/v+N2HKCj9FbZEVFJRxO9s= 
-github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/mitchellh/reflectwalk v1.0.2 h1:G2LzWKi524PWgd3mLHV8Y5k7s6XUvT0Gef6zxSIeXaQ= github.com/mitchellh/reflectwalk v1.0.2/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= @@ -179,8 +151,8 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU= -github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk= +github.com/onsi/ginkgo/v2 v2.23.0 h1:FA1xjp8ieYDzlgS5ABTpdUDB7wtngggONc8a7ku2NqQ= +github.com/onsi/ginkgo/v2 v2.23.0/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -201,11 +173,6 @@ github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoG github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= -github.com/shopspring/decimal v1.2.0 h1:abSATXmQEYyShuxI4/vyW3tV1MrKAJzCZ/0zLUXYbsQ= -github.com/shopspring/decimal v1.2.0/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= -github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cast v1.4.1 
h1:s0hze+J0196ZfEMTs80N7UlFt0BDuQ7Q+JDnHiMWKdA= -github.com/spf13/cast v1.4.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= @@ -215,9 +182,8 @@ github.com/stoewer/go-strcase v1.3.0/go.mod h1:fAH5hQ5pehh+j3nZfvwdk2RgEgQjAoM8w github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= @@ -227,7 +193,6 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp 
v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= @@ -255,66 +220,49 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= -golang.org/x/crypto v0.32.0 h1:euUpcYgM8WcP71gNpTqQCn6rC2t6ULUPiOzfWaXVVfc= -golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= +golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus= +golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0LeHDbnYEryqj5Q1ug8= golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= -golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= +golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= -golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= +golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= +golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.11.0 h1:GGz8+XQP4FvTTrjZPzNKTMFtSXH80RAzG+5ghFPgK9w= +golang.org/x/sync v0.11.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= -golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= -golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= -golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg= -golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= +golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= +golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= +golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= golang.org/x/time v0.7.0 h1:ntUhktv3OPE6TgYxXWv9vKvUSJyIFJlyohwbkEwPrKQ= golang.org/x/time v0.7.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.28.0 h1:WuB6qZ4RPCQo5aP3WdKZS7i595EdWqWR8vqJTlwTVK8= -golang.org/x/tools v0.28.0/go.mod h1:dcIOrVd3mfQKTgrDVQHqCPMWy6lnhfhtX3hLXYVLfRw= +golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= +golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -340,7 +288,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod 
h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= From af7fc38c6ecf404a096703ff3e835ca02dd5d677 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 09:43:46 -0700 Subject: [PATCH 86/96] Bump sigs.k8s.io/controller-runtime from 0.20.2 to 0.20.3 (#470) Bumps [sigs.k8s.io/controller-runtime](https://github.com/kubernetes-sigs/controller-runtime) from 0.20.2 to 0.20.3. - [Release notes](https://github.com/kubernetes-sigs/controller-runtime/releases) - [Changelog](https://github.com/kubernetes-sigs/controller-runtime/blob/main/RELEASE.md) - [Commits](https://github.com/kubernetes-sigs/controller-runtime/compare/v0.20.2...v0.20.3) --- updated-dependencies: - dependency-name: sigs.k8s.io/controller-runtime dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 3342d001..a2ba0a3d 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,7 @@ require ( k8s.io/code-generator v0.32.2 k8s.io/component-base v0.32.2 k8s.io/utils v0.0.0-20241210054802-24370beab758 - sigs.k8s.io/controller-runtime v0.20.2 + sigs.k8s.io/controller-runtime v0.20.3 sigs.k8s.io/structured-merge-diff/v4 v4.5.0 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index 2e18e4ad..90018f15 100644 --- a/go.sum +++ b/go.sum @@ -318,8 +318,8 @@ k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJ k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcpeN4baWEV2ko2Z/AsiZgEdwgcfwLgMo= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/controller-runtime v0.20.2 h1:/439OZVxoEc02psi1h4QO3bHzTgu49bb347Xp4gW1pc= -sigs.k8s.io/controller-runtime v0.20.2/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= +sigs.k8s.io/controller-runtime v0.20.3 h1:I6Ln8JfQjHH7JbtCD2HCYHoIzajoRxPNuvhvcDbZgkI= +sigs.k8s.io/controller-runtime v0.20.3/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= From b343b2f909358de140e02952e384feab2111bde3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 09:57:46 -0700 Subject: [PATCH 87/96] Bump google.golang.org/grpc from 1.70.0 to 1.71.0 
(#471) Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.70.0 to 1.71.0. - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.70.0...v1.71.0) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 21 +++++++++++---------- go.sum | 50 ++++++++++++++++++++++++++------------------------ 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/go.mod b/go.mod index a2ba0a3d..ef3d4b8d 100644 --- a/go.mod +++ b/go.mod @@ -17,7 +17,7 @@ require ( github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 - google.golang.org/grpc v1.70.0 + google.golang.org/grpc v1.71.0 google.golang.org/protobuf v1.36.5 k8s.io/api v0.32.2 k8s.io/apiextensions-apiserver v0.32.2 @@ -32,7 +32,7 @@ require ( ) require ( - cel.dev/expr v0.19.0 // indirect + cel.dev/expr v0.19.1 // indirect github.com/Masterminds/goutils v1.1.1 // indirect github.com/Masterminds/semver v1.5.0 // indirect github.com/Masterminds/sprig v2.22.0+incompatible // indirect @@ -42,7 +42,7 @@ require ( github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 // indirect + github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.11.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect @@ -97,19 +97,20 @@ require ( github.com/spf13/pflag v1.0.5 // indirect github.com/stoewer/go-strcase v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect 
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect - go.opentelemetry.io/otel v1.32.0 // indirect + go.opentelemetry.io/otel v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 // indirect - go.opentelemetry.io/otel/metric v1.32.0 // indirect - go.opentelemetry.io/otel/sdk v1.32.0 // indirect - go.opentelemetry.io/otel/trace v1.32.0 // indirect + go.opentelemetry.io/otel/metric v1.34.0 // indirect + go.opentelemetry.io/otel/sdk v1.34.0 // indirect + go.opentelemetry.io/otel/trace v1.34.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect golang.org/x/crypto v0.33.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect golang.org/x/mod v0.23.0 // indirect golang.org/x/net v0.35.0 // indirect - golang.org/x/oauth2 v0.24.0 // indirect + golang.org/x/oauth2 v0.25.0 // indirect golang.org/x/sync v0.11.0 // indirect golang.org/x/sys v0.30.0 // indirect golang.org/x/term v0.29.0 // indirect @@ -118,8 +119,8 @@ require ( golang.org/x/tools v0.30.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index 90018f15..f4869047 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.19.0 h1:lXuo+nDhpyJSpWxpPVi5cPUwzKb+dsdOiw6IreM5yt0= -cel.dev/expr v0.19.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= 
+cel.dev/expr v0.19.1 h1:NciYrtDRIR0lNCnH1LFJegdjspNx9fI59O7TWcua/W4= +cel.dev/expr v0.19.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= @@ -20,8 +20,8 @@ github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK3 github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78 h1:QVw89YDxXxEe+l8gU8ETbOasdwEV+avkR75ZzsVV9WI= -github.com/cncf/xds/go v0.0.0-20240905190251-b4127c9b8d78/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySeApCX4GeOjPl9qhRF3QuIZq+Q= +github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -170,8 +170,8 @@ github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod 
h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM= github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y= @@ -193,22 +193,24 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= -go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= -go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg= +go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= +go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0 h1:3Q/xZUyC1BBkualc9ROb4G8qkH90LXEIICcs5zv1OYY= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.28.0/go.mod h1:s75jGIWA9OfCMzF0xr+ZgfrB5FEbbV7UuYo32ahUiFI= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0 h1:qFffATk0X+HD+f1Z8lswGiOQYKHRlzfmdJm0wEaVrFA= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.27.0/go.mod h1:MOiCmryaYtc+V0Ei+Tx9o5S1ZjA7kzLucuVuyzBZloQ= -go.opentelemetry.io/otel/metric v1.32.0 h1:xV2umtmNcThh2/a/aCP+h64Xx5wsj8qqnkYZktzNa0M= -go.opentelemetry.io/otel/metric v1.32.0/go.mod h1:jH7CIbbK6SH2V2wE16W05BHCtIDzauciCRLoc/SyMv8= -go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4= -go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU= -go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiyYCU9snn1CU= -go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ= -go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= -go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= +go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= +go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= +go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= +go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= +go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= +go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= +go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= +go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -234,8 +236,8 @@ golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net 
v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8= golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk= -golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= -golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= +golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -271,12 +273,12 @@ golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSm golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a h1:OAiGFfOiA0v9MRYsSidp3ubZaBnteRUyn3xB2ZQ5G/E= -google.golang.org/genproto/googleapis/api v0.0.0-20241202173237-19429a94021a/go.mod h1:jehYqy3+AhJU9ve55aNOaSml7wUXjF9x6z2LcCfpAhY= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a h1:hgh8P4EuoxpsuKMXX/To36nOFD7vixReXgn8lPGnt+o= -google.golang.org/genproto/googleapis/rpc v0.0.0-20241202173237-19429a94021a/go.mod h1:5uTbfoYQed2U9p3KIj2/Zzm02PYhndfdmML0qC3q3FU= -google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= -google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= 
+google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1:GVIKPyP/kLIyVOgOnTwFOrvQaQUzOzGMCxgFUOEmm24= +google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1:OxYkA3wjPsZyBylwymxSHa7ViiW1Sml4ToBrncvFehI= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= +google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= +google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/protobuf v1.36.5 h1:tPhr+woSbjfYvY6/GPufUoYizxw1cF/yFoxJ2fmpwlM= google.golang.org/protobuf v1.36.5/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 0bef35b31c683834a37bcc5428629d76b9ba39b9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 09:57:53 -0700 Subject: [PATCH 88/96] Bump github.com/prometheus/client_golang from 1.21.0 to 1.21.1 (#474) Bumps [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) from 1.21.0 to 1.21.1. - [Release notes](https://github.com/prometheus/client_golang/releases) - [Changelog](https://github.com/prometheus/client_golang/blob/main/CHANGELOG.md) - [Commits](https://github.com/prometheus/client_golang/compare/v1.21.0...v1.21.1) --- updated-dependencies: - dependency-name: github.com/prometheus/client_golang dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ef3d4b8d..a9b34d86 100644 --- a/go.mod +++ b/go.mod @@ -11,7 +11,7 @@ require ( github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.23.0 github.com/onsi/gomega v1.36.2 - github.com/prometheus/client_golang v1.21.0 + github.com/prometheus/client_golang v1.21.1 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.62.0 github.com/stretchr/testify v1.10.0 diff --git a/go.sum b/go.sum index f4869047..46a731b6 100644 --- a/go.sum +++ b/go.sum @@ -162,8 +162,8 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.21.0 h1:DIsaGmiaBkSangBgMtWdNfxbMNdku5IK6iNhrEqWvdA= -github.com/prometheus/client_golang v1.21.0/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= +github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= +github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= From 9e7d983ee29de55f2be9581826b3a17be6e4940f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 11:45:46 -0700 Subject: [PATCH 89/96] 
Bump sigs.k8s.io/structured-merge-diff/v4 from 4.5.0 to 4.6.0 (#472) Bumps [sigs.k8s.io/structured-merge-diff/v4](https://github.com/kubernetes-sigs/structured-merge-diff) from 4.5.0 to 4.6.0. - [Release notes](https://github.com/kubernetes-sigs/structured-merge-diff/releases) - [Changelog](https://github.com/kubernetes-sigs/structured-merge-diff/blob/master/RELEASE.md) - [Commits](https://github.com/kubernetes-sigs/structured-merge-diff/compare/v4.5.0...v4.6.0) --- updated-dependencies: - dependency-name: sigs.k8s.io/structured-merge-diff/v4 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index a9b34d86..13ad16c4 100644 --- a/go.mod +++ b/go.mod @@ -27,7 +27,7 @@ require ( k8s.io/component-base v0.32.2 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.3 - sigs.k8s.io/structured-merge-diff/v4 v4.5.0 + sigs.k8s.io/structured-merge-diff/v4 v4.6.0 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index 46a731b6..463e55ff 100644 --- a/go.sum +++ b/go.sum @@ -326,7 +326,9 @@ sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= +sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ= +sigs.k8s.io/randfill 
v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= From b01bfda3b365afaafdd0d5d5ea8f2b8629f8c2ae Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 11 Mar 2025 18:21:46 -0400 Subject: [PATCH 90/96] Handle request trailers (#477) --- pkg/body-based-routing/handlers/server.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/body-based-routing/handlers/server.go index 434dd530..813c55c8 100644 --- a/pkg/body-based-routing/handlers/server.go +++ b/pkg/body-based-routing/handlers/server.go @@ -67,6 +67,8 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { resp, err = s.HandleRequestHeaders(req.GetRequestHeaders()) case *extProcPb.ProcessingRequest_RequestBody: resp, err = s.HandleRequestBody(ctx, req.GetRequestBody()) + case *extProcPb.ProcessingRequest_RequestTrailers: + resp, err = s.HandleRequestTrailers(req.GetRequestTrailers()) case *extProcPb.ProcessingRequest_ResponseHeaders: resp, err = s.HandleResponseHeaders(req.GetResponseHeaders()) case *extProcPb.ProcessingRequest_ResponseBody: From 6b117dfe23e605f355d75620c17479cfff4f97f7 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Tue, 11 Mar 2025 17:33:46 -0700 Subject: [PATCH 91/96] Add the base model to InferenceModel sample manifest (#479) --- config/manifests/inferencemodel.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 94c36d84..12fb00b7 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -10,3 +10,14 @@ 
spec: targetModels: - name: tweet-summary-1 weight: 100 + +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-base-model +spec: + modelName: meta-llama/Llama-2-7b-hf + criticality: Critical + poolRef: + name: my-pool From 07df6313148a9164b864c9981880bc46b6923dc8 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Tue, 11 Mar 2025 17:53:46 -0700 Subject: [PATCH 92/96] Fix metrics debug log; change metrics client log level to reduce spam (#478) --- pkg/epp/backend/metrics/fake.go | 4 ++++ pkg/epp/backend/metrics/logger.go | 4 +++- pkg/epp/backend/metrics/pod_metrics.go | 5 +++++ pkg/epp/backend/metrics/types.go | 8 ++++++++ pkg/epp/backend/vllm/metrics.go | 13 ++++++------- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index fae7149d..7fd4970d 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -34,6 +34,10 @@ type FakePodMetrics struct { Metrics *Metrics } +func (fpm *FakePodMetrics) String() string { + return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics()) +} + func (fpm *FakePodMetrics) GetPod() *Pod { return fpm.Pod } diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go index 664115eb..74735755 100644 --- a/pkg/epp/backend/metrics/logger.go +++ b/pkg/epp/backend/metrics/logger.go @@ -18,6 +18,7 @@ package metrics import ( "context" + "fmt" "time" "github.com/go-logr/logr" @@ -76,7 +77,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh podsWithStaleMetrics := datastore.PodList(func(pm PodMetrics) bool { return time.Since(pm.GetMetrics().UpdateTime) > metricsValidityPeriod }) - logger.Info("Current Pods and metrics gathered", "fresh metrics", podsWithFreshMetrics, "stale metrics", podsWithStaleMetrics) + s := fmt.Sprintf("Current Pods and metrics gathered. 
Fresh metrics: %+v, Stale metrics: %+v", podsWithFreshMetrics, podsWithStaleMetrics) + logger.Info(s) } } }() diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index f76c2e8c..b954a98c 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -18,6 +18,7 @@ package metrics import ( "context" + "fmt" "sync" "sync/atomic" "time" @@ -52,6 +53,10 @@ type PodMetricsClient interface { FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) } +func (pm *podMetrics) String() string { + return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics()) +} + func (pm *podMetrics) GetPod() *Pod { return (*Pod)(atomic.LoadPointer(&pm.pod)) } diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index cdbdb2ce..fd600163 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -62,6 +62,7 @@ type PodMetrics interface { GetMetrics() *Metrics UpdatePod(*corev1.Pod) StopRefreshLoop() + String() string } type Pod struct { @@ -69,6 +70,13 @@ type Pod struct { Address string } +func (p *Pod) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("%+v", *p) +} + type Metrics struct { // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. ActiveModels map[string]int diff --git a/pkg/epp/backend/vllm/metrics.go b/pkg/epp/backend/vllm/metrics.go index f83326eb..8d2dd715 100644 --- a/pkg/epp/backend/vllm/metrics.go +++ b/pkg/epp/backend/vllm/metrics.go @@ -61,8 +61,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( existing *metrics.Metrics, port int32, ) (*metrics.Metrics, error) { - logger := log.FromContext(ctx) - loggerDefault := logger.V(logutil.DEFAULT) + logger := log.FromContext(ctx).V(logutil.TRACE) // Currently the metrics endpoint is hard-coded, which works with vLLM. 
// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. @@ -70,12 +69,12 @@ func (p *PodMetricsClientImpl) FetchMetrics( req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) if err != nil { - loggerDefault.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) + logger.Error(err, "Failed create HTTP request", "method", http.MethodGet, "url", url) return nil, fmt.Errorf("failed to create request: %v", err) } resp, err := http.DefaultClient.Do(req) if err != nil { - loggerDefault.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) + logger.Error(err, "Failed to fetch metrics", "pod", pod.NamespacedName) return nil, fmt.Errorf("failed to fetch metrics from %s: %w", pod.NamespacedName, err) } defer func() { @@ -83,7 +82,7 @@ func (p *PodMetricsClientImpl) FetchMetrics( }() if resp.StatusCode != http.StatusOK { - loggerDefault.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) + logger.Error(nil, "Unexpected status code returned", "pod", pod.NamespacedName, "statusCode", resp.StatusCode) return nil, fmt.Errorf("unexpected status code from %s: %v", pod.NamespacedName, resp.StatusCode) } @@ -172,7 +171,7 @@ func promToPodMetrics( func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily) (*dto.Metric, time.Time, error) { loraRequests, ok := metricFamilies[LoraRequestInfoMetricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", LoraRequestInfoMetricName) return nil, time.Time{}, fmt.Errorf("metric family %q not found", LoraRequestInfoMetricName) } @@ -219,7 +218,7 @@ func getLatestLoraMetric(logger logr.Logger, metricFamilies map[string]*dto.Metr func getLatestMetric(logger logr.Logger, metricFamilies map[string]*dto.MetricFamily, 
metricName string) (*dto.Metric, error) { mf, ok := metricFamilies[metricName] if !ok { - logger.V(logutil.DEFAULT).Error(nil, "Metric family not found", "name", metricName) + logger.V(logutil.TRACE).Error(nil, "Metric family not found", "name", metricName) return nil, fmt.Errorf("metric family %q not found", metricName) } if len(mf.GetMetric()) == 0 { From 32e03eca0285cd837f73322290ef940218131f21 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 12 Mar 2025 13:47:47 -0700 Subject: [PATCH 93/96] Add support for OpenAI API streaming protocol (#469) * Add support for OpenAI API streaming protocol * Add streaming integration tests * reverting go mod changes * Uncommenting previous tests * fix errant typo * Updating test infra to work for multiple tests * Always marshal responseBody, add test case to check for this --- .golangci.yml | 1 - Makefile | 2 +- pkg/epp/handlers/streamingserver.go | 98 +- pkg/epp/server/controller_manager.go | 11 +- pkg/epp/util/testing/request.go | 24 +- test/integration/epp/hermetic_test.go | 1272 +++++++++++++++-- .../inferencepool-with-model-hermetic.yaml | 11 + 7 files changed, 1293 insertions(+), 126 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 2ad3b93d..d1b1e112 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -25,7 +25,6 @@ linters: - makezero - errcheck - goconst - - gocyclo - gofmt - goimports - gosimple diff --git a/Makefile b/Makefile index 257d2cbb..40cb0b75 100644 --- a/Makefile +++ b/Makefile @@ -123,7 +123,7 @@ test: manifests generate fmt vet envtest ## Run tests. .PHONY: test-integration test-integration: manifests generate fmt vet envtest ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/... 
-race -coverprofile cover.out .PHONY: test-e2e test-e2e: ## Run end-to-end tests against an existing Kubernetes cluster with at least 3 available GPUs. diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go index c8de7bb7..2aaca7f3 100644 --- a/pkg/epp/handlers/streamingserver.go +++ b/pkg/epp/handlers/streamingserver.go @@ -6,6 +6,7 @@ import ( "fmt" "io" "strconv" + "strings" "time" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -131,9 +132,14 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) case *extProcPb.ProcessingRequest_ResponseHeaders: loggerVerbose.Info("got response headers", "headers", v.ResponseHeaders.Headers.GetHeaders()) for _, header := range v.ResponseHeaders.Headers.GetHeaders() { - code := header.RawValue[0] - if header.Key == "status" && string(code) != "200" { + value := string(header.RawValue) + logger.Error(nil, "header", "key", header.Key, "value", value) + if header.Key == "status" && value != "200" { reqCtx.ResponseStatusCode = errutil.ModelServerError + } else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") { + reqCtx.modelServerStreaming = true + loggerVerbose.Info("model server is streaming response") + logger.Error(nil, "made it here") } } reqCtx.RequestState = ResponseRecieved @@ -158,36 +164,57 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) } case *extProcPb.ProcessingRequest_ResponseBody: - go func() { - _, err := writer.Write(v.ResponseBody.Body) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error populating writer") - } - }() - - // Message is buffered, we can read and decode. 
- if v.ResponseBody.EndOfStream { - err = decoder.Decode(&responseBody) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + if reqCtx.modelServerStreaming { + // Currently we punt on response parsing if the modelServer is streaming, and we just passthrough. + reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: v.ResponseBody.Body, + EndOfStream: v.ResponseBody.EndOfStream, + }, + }, + }, + }, + }, + }, } - // Body stream complete. Close the reader pipe. - reader.Close() - - reqCtx, err = s.HandleResponseBody(ctx, reqCtx, responseBody) - if err == nil && reqCtx.ResponseComplete { - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + } else { + go func() { + _, err := writer.Write(v.ResponseBody.Body) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error populating writer") + } + }() + + // Message is buffered, we can read and decode. + if v.ResponseBody.EndOfStream { + err = decoder.Decode(&responseBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + } + // Body stream complete. Close the reader pipe. 
+ reader.Close() + + reqCtx, err = s.HandleResponseBody(ctx, reqCtx, responseBody) + if err == nil && reqCtx.ResponseComplete { + reqCtx.ResponseCompleteTimestamp = time.Now() + metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) + metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) + metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) + metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + } + loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx) } - loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx) } case *extProcPb.ProcessingRequest_ResponseTrailers: // This is currently unused. } + // Handle the err and fire an immediate response. if err != nil { logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) resp, err := BuildErrResponse(err) @@ -246,7 +273,11 @@ func (r *StreamingRequestContext) updateStateAndSendIfNeeded(srv extProcPb.Exter if err := srv.Send(r.respBodyResp); err != nil { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } - r.RequestState = BodyResponseResponsesComplete + + body := r.respBodyResp.Response.(*extProcPb.ProcessingResponse_ResponseBody) + if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() { + r.RequestState = BodyResponseResponsesComplete + } // Dump the response so a new stream message can begin r.reqBodyResp = nil } @@ -273,6 +304,8 @@ type StreamingRequestContext struct { ResponseComplete bool ResponseStatusCode string + modelServerStreaming bool + reqHeaderResp *extProcPb.ProcessingResponse reqBodyResp *extProcPb.ProcessingResponse reqTrailerResp *extProcPb.ProcessingResponse @@ -339,14 +372,15 @@ func (s *StreamingServer) HandleRequestBody( // Update target 
models in the body. if llmReq.Model != llmReq.ResolvedTargetModel { requestBodyMap["model"] = llmReq.ResolvedTargetModel - requestBodyBytes, err = json.Marshal(requestBodyMap) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} - } - loggerVerbose.Info("Updated request body marshalled", "body", string(requestBodyBytes)) } + requestBodyBytes, err = json.Marshal(requestBodyMap) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") + return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} + } + loggerVerbose.Info("Updated request body marshalled", "body", string(requestBodyBytes)) + target, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go index fd505d00..46694f7b 100644 --- a/pkg/epp/server/controller_manager.go +++ b/pkg/epp/server/controller_manager.go @@ -28,6 +28,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" ) @@ -40,7 +41,7 @@ func init() { // NewDefaultManager creates a new controller manager with default configuration. 
func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { - manager, err := ctrl.NewManager(restConfig, ctrl.Options{ + defaultOpts := ctrl.Options{ Scheme: scheme, Cache: cache.Options{ ByObject: map[client.Object]cache.ByObject{ @@ -65,7 +66,13 @@ func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Ma }, }, }, - }) + } + return NewManagerWithOptions(restConfig, defaultOpts) +} + +// NewManagerWithOptions creates a new controller manager with injectable options. +func NewManagerWithOptions(restConfig *rest.Config, opts manager.Options) (ctrl.Manager, error) { + manager, err := ctrl.NewManager(restConfig, opts) if err != nil { return nil, fmt.Errorf("failed to create controller manager: %v", err) } diff --git a/pkg/epp/util/testing/request.go b/pkg/epp/util/testing/request.go index fe9a0d08..30772ad5 100644 --- a/pkg/epp/util/testing/request.go +++ b/pkg/epp/util/testing/request.go @@ -19,6 +19,7 @@ package testing import ( "encoding/json" + envoyCorev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "github.com/go-logr/logr" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -38,8 +39,29 @@ func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.Proces } req := &extProcPb.ProcessingRequest{ Request: &extProcPb.ProcessingRequest_RequestBody{ - RequestBody: &extProcPb.HttpBody{Body: llmReq}, + RequestBody: &extProcPb.HttpBody{Body: llmReq, EndOfStream: true}, }, } return req } + +func GenerateStreamedRequestSet(logger logr.Logger, prompt, model string) []*extProcPb.ProcessingRequest { + requests := []*extProcPb.ProcessingRequest{} + headerReq := &extProcPb.ProcessingRequest{ + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &envoyCorev3.HeaderMap{ + Headers: []*envoyCorev3.HeaderValue{ + { + Key: "hi", + Value: 
"mom", + }, + }, + }, + }, + }, + } + requests = append(requests, headerReq) + requests = append(requests, GenerateRequest(logger, prompt, model)) + return requests +} diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index c5e7c10a..7dc9bdb8 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -43,6 +43,8 @@ import ( "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/structpb" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" @@ -51,7 +53,10 @@ import ( "k8s.io/component-base/metrics/legacyregistry" metricsutils "k8s.io/component-base/metrics/testutil" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" k8sclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/config" "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" @@ -78,6 +83,13 @@ var ( logger = logutil.NewTestLogger().V(logutil.VERBOSE) ) +func TestMain(m *testing.M) { + cleanup := BeforeSuite() + code := m.Run() + cleanup() + os.Exit(code) +} + func TestKubeInferenceModelRequest(t *testing.T) { tests := []struct { name string @@ -196,57 +208,814 @@ func TestKubeInferenceModelRequest(t *testing.T) { WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 200, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + 
wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.3:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.3:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + }, + { + name: "noncritical and all models past threshold, shed request", + req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), + // no pods will be picked as all models are either above kv threshold, + // queue threshold, or both. 
+ pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{}, + wantMetadata: &structpb.Struct{}, + wantBody: []byte(""), + wantErr: false, + immediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, + }, + }, + wantMetrics: "", + }, + { + name: "noncritical, but one server has capacity, do not shed", + req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: runserver.DefaultDestinationEndpointHintKey, + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte("76"), + }, + }, + }, + wantMetadata: makeMetadata("192.168.1.1:8000"), + wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), + wantMetrics: ` + # HELP 
inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(t, test.pods, false) + t.Cleanup(cleanup) + want := &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: test.wantHeaders, + }, + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_Body{ + Body: test.wantBody, + }, + }, + }, + }, + }, + DynamicMetadata: test.wantMetadata, + } + res, err := sendRequest(t, client, test.req) + + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + if test.immediateResponse != nil { + want = &extProcPb.ProcessingResponse{ + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: test.immediateResponse, + }, + } + } + if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + + if test.wantMetrics != "" { + if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { + t.Error(err) + } + } + + legacyregistry.Reset() + }) + } +} + +func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { + tests := []struct { + name string + requests []*extProcPb.ProcessingRequest + pods map[backendmetrics.Pod]*backendmetrics.Metrics + wantResponses []*extProcPb.ProcessingResponse + wantMetrics string + wantErr bool + immediateResponse *extProcPb.ImmediateResponse + }{ + // Request flow tests + { + 
name: "select lower queue and kv cache, no active lora", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test1", "my-model"), + // pod-1 will be picked because it has relatively low queue size and low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.2, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "select active lora, low queue", + requests: utiltesting.GenerateStreamedRequestSet(logger, 
"test2", "sql-lora"), + // pod-1 will be picked because it has relatively low queue size, with the requested + // model being active, and has low KV cache. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + 
}, + }, + }, + }, + }, + { + name: "select no lora despite active model, avoid excessive queue size", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test3", "sql-lora"), + // pod-2 will be picked despite it NOT having the requested model being active + // as it's above the affinity for queue size. Also is critical, so we should + // still honor request despite all queues > 5 + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 200, + KVCacheUsagePercent: 0.1, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg2": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.3:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.3:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: 
&extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "noncritical and all models past threshold, shed request", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), + // no pods will be picked as all models are either above kv threshold, + // queue threshold, or both. + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 6, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantErr: false, + wantMetrics: "", + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ImmediateResponse{ + ImmediateResponse: &extProcPb.ImmediateResponse{ + Status: &envoyTypePb.HttpStatus{ + Code: envoyTypePb.StatusCode_TooManyRequests, + }, + }, + }, + }, + }, + }, + { + name: "noncritical, but one server has capacity, do not shed", + requests: utiltesting.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"), + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + 
KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.1:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "body sent over multiple requests, noncritical, but one server has capacity, do not shed", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: 
[]byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(1): { + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + fakePod(2): { + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + ActiveModels: map[string]int{ + "foo": 1, + "sql-lora-1fdg3": 1, + }, + }, + }, + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. 
+ # TYPE inference_model_request_total counter + inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.1:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(76)), + }, + }, + }}, + }, + }, + }, + DynamicMetadata: makeMetadata("192.168.1.1:8000"), + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "inferencemodel's modelName is not translated, passthrough", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"direct-"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("model\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 
0 will be picked as all other models are above threshold + pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + fakePod(0): { + WaitingQueueSize: 4, + KVCacheUsagePercent: 0.2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + "sql-lora-1fdg3": 1, }, }, fakePod(1): { - WaitingQueueSize: 200, - KVCacheUsagePercent: 0.1, + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.85, ActiveModels: map[string]int{ "foo": 1, - "sql-lora-1fdg2": 1, + "sql-lora-1fdg3": 1, }, }, fakePod(2): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, ActiveModels: map[string]int{ - "foo": 1, + "foo": 1, + "sql-lora-1fdg3": 1, }, }, }, - wantHeaders: []*configPb.HeaderValueOption{ + wantMetrics: ` + # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. + # TYPE inference_model_request_total counter + inference_model_request_total{model_name="direct-model",target_model_name="direct-model"} 1 + `, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.3:8000"), + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-gateway-destination-endpoint", + RawValue: []byte("192.168.1.2:8000"), + }, + }, + { + Header: &configPb.HeaderValue{ + Key: "Content-Length", + RawValue: []byte(strconv.Itoa(74)), + }, + }, + }}, + }, + }, }, + DynamicMetadata: makeMetadata("192.168.1.2:8000"), }, { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: 
&extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"direct-model\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, }, }, }, - wantMetadata: makeMetadata("192.168.1.3:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `, - wantErr: false, }, + // Response flow tests { - name: "noncritical and all models past threshold, shed request", - req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), - // no pods will be picked as all models are either above kv threshold, - // queue threshold, or both. 
+ name: "responsebody sent over multiple requests, content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + Value: "application/json", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + + // + // pod 0 will be picked as all other models are above threshold pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { - WaitingQueueSize: 6, + WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, ActiveModels: map[string]int{ "foo": 1, @@ -271,20 +1040,74 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantHeaders: []*configPb.HeaderValueOption{}, - wantMetadata: &structpb.Struct{}, - wantBody: []byte(""), - wantErr: false, - immediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_TooManyRequests, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + 
Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, }, }, - wantMetrics: "", }, { - name: "noncritical, but one server has capacity, do not shed", - req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), + name: "responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + Value: "application/json", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{Body: []byte(""), EndOfStream: true}, + }, + }, + }, + + // // pod 0 will be picked as all other models are above threshold pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { @@ -313,69 +1136,261 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, }, }, - wantHeaders: []*configPb.HeaderValueOption{ + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.1:8000"), + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: 
&configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, + }, + }, + }, }, }, { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test6\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, }, }, }, - wantMetadata: makeMetadata("192.168.1.1:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 - `, - wantErr: false, }, - } - - // Set up global k8sclient and extproc server runner with test environment config - cleanup := BeforeSuit(t) - defer cleanup() - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, test.pods) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, + { + name: "responsebody sent over a single request, but empty body with EndOfStream in the second request(this is how envoy operates); content-type is json, buffer", + requests: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_ResponseHeaders{ + ResponseHeaders: &extProcPb.HttpHeaders{ + 
Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "content-type", + RawValue: []byte("text/event-stream"), + }, + { + Key: "status", + RawValue: []byte("200"), + }, + }, }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: 
{"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`), + EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_ResponseBody{ + ResponseBody: &extProcPb.HttpBody{ + Body: []byte("data: [DONE]"), + EndOfStream: true}, + }, + }, + }, + wantErr: false, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_ResponseHeaders{ + ResponseHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "x-went-into-resp-headers", + RawValue: []byte("true"), + }, + }, + }, }, }, }, }, }, - DynamicMetadata: test.wantMetadata, - } - res, err := sendRequest(t, client, test.req) + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"NEVER","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: 
&extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GONNA","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"GIVE","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"YOU","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: 
&extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[{"index":0,"text":"UP","logprobs":null,"finish_reason":null,"stop_reason":null}],"usage":null}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte(`data: {"id":"cmpl-0fee233f-7d56-404a-acd3-4dad775d03d9","object":"text_completion","created":1741379018,"model":"tweet-summary-1","choices":[],"usage":{"prompt_tokens":7,"total_tokens":17,"completion_tokens":10}}`), + EndOfStream: false, + }, + }, + }, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_ResponseBody{ + ResponseBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("data: [DONE]"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(t, test.pods, true) + t.Cleanup(cleanup) + responses, err := streamedRequest(t, client, test.requests, len(test.wantResponses)) if err != nil && !test.wantErr { t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) } - if test.immediateResponse != nil { - want = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: test.immediateResponse, - }, - } - } - if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { + if diff := cmp.Diff(test.wantResponses, responses, 
protocmp.Transform()); diff != "" { t.Errorf("Unexpected response, (-want +got): %v", diff) } @@ -390,13 +1405,14 @@ func TestKubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { +func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { // Reconfigure the TestPodMetricsClient. res := map[types.NamespacedName]*backendmetrics.Metrics{} for pod, metrics := range podAndMetrics { res[pod.NamespacedName] = metrics } serverRunner.TestPodMetricsClient.SetRes(res) + serverRunner.UseStreaming = streamed serverCtx, stopServer := context.WithCancel(context.Background()) @@ -475,7 +1491,7 @@ func fakePod(index int) backendmetrics.Pod { } // Sets up a test environment and returns the runner struct -func BeforeSuit(t *testing.T) func() { +func BeforeSuite() func() { // Set up mock k8s API Client testEnv = &envtest.Environment{ CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, @@ -499,7 +1515,7 @@ func BeforeSuit(t *testing.T) func() { // Init runtime. 
ctrl.SetLogger(logger) - mgr, err := server.NewDefaultManager("default", "vllm-llama2-7b-pool", cfg) + mgr, err := server.NewManagerWithOptions(cfg, managerTestOptions("default", "vllm-llama2-7b-pool")) if err != nil { logutil.Fatal(logger, err, "Failed to create controller manager") } @@ -520,7 +1536,7 @@ func BeforeSuit(t *testing.T) func() { logutil.Fatal(logger, err, "Failed to setup server runner") } - // Start the controller manager in go routine, not blocking + // Start the controller manager in a go routine, not blocking go func() { if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { logutil.Fatal(logger, err, "Failed to start manager") @@ -561,14 +1577,16 @@ func BeforeSuit(t *testing.T) func() { } } - assert.EventuallyWithT(t, func(t *assert.CollectT) { + assert.Eventually(nil, func() bool { modelExist := serverRunner.Datastore.ModelGet("my-model") synced := serverRunner.Datastore.PoolHasSynced() && modelExist != nil - assert.True(t, synced, "Timeout waiting for the pool and models to sync") + return synced }, 10*time.Second, 10*time.Millisecond) return func() { _ = testEnv.Stop() + _ = k8sClient.DeleteAllOf(context.Background(), &v1alpha2.InferencePool{}) + _ = k8sClient.DeleteAllOf(context.Background(), &v1alpha2.InferenceModel{}) } } @@ -588,6 +1606,44 @@ func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, return res, err } +func streamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) { + for _, req := range requests { + t.Logf("Sending request: %v", req) + if err := client.Send(req); err != nil { + t.Logf("Failed to send request %+v: %v", req, err) + return nil, err + } + // Brief pause for the goroutines to execute sequentially and populate the internal pipe channels sequentially + // without the pause there can be a race condition where a goroutine from a subsequent request is able to 
populate + // the pipe writer channel before a previous chunk. This is simply due to everything running in memory, this would + // not happen in a real world environment with non-zero latency. + time.Sleep(1 * time.Millisecond) + } + responses := []*extProcPb.ProcessingResponse{} + + // Make an incredible simple timeout func in the case where + // there is less than the expected amount of responses; bail and fail. + var simpleTimeout bool + go func() { + time.Sleep(10 * time.Second) + simpleTimeout = true + }() + + for range expectedResponses { + if simpleTimeout { + break + } + res, err := client.Recv() + if err != nil && err != io.EOF { + t.Logf("Failed to receive: %v", err) + return nil, err + } + t.Logf("Received request %+v", res) + responses = append(responses, res) + } + return responses, nil +} + // readDocuments reads documents from file. func readDocuments(fp string) ([][]byte, error) { b, err := os.ReadFile(fp) @@ -658,3 +1714,41 @@ func registerMetricsHandler(mgr manager.Manager, port int) error { } return nil } + +// inject options that allow multiple test runs to run +// https://github.com/kubernetes-sigs/controller-runtime/issues/2937 +func managerTestOptions(namespace, name string) ctrl.Options { + return ctrl.Options{ + Scheme: scheme, + Cache: cache.Options{ + ByObject: map[client.Object]cache.ByObject{ + &corev1.Pod{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + &v1alpha2.InferencePool{}: { + Namespaces: map[string]cache.Config{ + namespace: { + FieldSelector: fields.SelectorFromSet(fields.Set{ + "metadata.name": name, + }), + }, + }, + }, + &v1alpha2.InferenceModel{}: { + Namespaces: map[string]cache.Config{ + namespace: {}, + }, + }, + }, + }, + Controller: config.Controller{ + SkipNameValidation: boolPointer(true), + }, + } +} + +func boolPointer(b bool) *bool { + return &b +} diff --git a/test/testdata/inferencepool-with-model-hermetic.yaml b/test/testdata/inferencepool-with-model-hermetic.yaml index 
c9ca763e..36b6e539 100644 --- a/test/testdata/inferencepool-with-model-hermetic.yaml +++ b/test/testdata/inferencepool-with-model-hermetic.yaml @@ -50,3 +50,14 @@ spec: targetModels: - name: my-model-12345 weight: 100 +--- +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: inferencemodel-direct-model-name + namespace: default +spec: + modelName: direct-model + criticality: Critical + poolRef: + name: vllm-llama2-7b-pool \ No newline at end of file From bd43ea4813c5cc5fc49d8bab5239cb954564a0a6 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 6 Mar 2025 01:33:19 +0000 Subject: [PATCH 94/96] Updates artifacts for v0.2.0-rc.1 release --- config/manifests/ext_proc.yaml | 4 ++-- config/manifests/vllm/gpu-deployment.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index d70467ee..a8af1691 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -71,8 +71,8 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main - imagePullPolicy: Always + image: registry.k8s.io/gateway-api-inference-extension/epp:v0.2.0-rc.1 + imagePullPolicy: IfNotPresent args: - -poolName - "my-pool" diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index d16a46a4..a4ccfc0b 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -14,8 +14,8 @@ spec: spec: containers: - name: lora - image: "vllm/vllm-openai:latest" - imagePullPolicy: Always + image: "vllm/vllm-openai:v0.7.3" + imagePullPolicy: IfNotPresent command: ["python3", "-m", "vllm.entrypoints.openai.api_server"] args: - "--model" From 0068bea23e255773c973340dd114c0367e889ac8 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 6 Mar 2025 02:14:15 +0000 Subject: [PATCH 95/96] Replacing 
main for getting started guide Signed-off-by: Kellen Swain --- site-src/guides/index.md | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 8bcee6e2..d175a62d 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -29,7 +29,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. ```bash kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Llama2 - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/gpu-deployment.yaml ``` #### CPU-Based Model Server @@ -38,14 +38,14 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Deploy a sample vLLM deployment with the proper protocol to work with the LLM Instance Gateway. 
```bash kubectl create secret generic hf-token --from-literal=token=$HF_TOKEN # Your Hugging Face Token with access to Qwen - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/cpu-deployment.yaml ``` ### Install the Inference Extension CRDs ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml ``` ### Deploy InferenceModel @@ -53,14 +53,14 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Deploy the sample InferenceModel which is configured to load balance traffic between the `tweet-summary-0` and `tweet-summary-1` [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/inferencemodel.yaml ``` ### Update Envoy Gateway Config to enable Patch Policy** Our custom LLM Gateway ext-proc is patched into the existing envoy gateway via `EnvoyPatchPolicy`. To enable this feature, we must extend the Envoy Gateway config map. 
To do this, simply run: ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/enable_patch_policy.yaml kubectl rollout restart deployment envoy-gateway -n envoy-gateway-system ``` Additionally, if you would like to enable the admin interface, you can uncomment the admin lines and run this again. @@ -68,7 +68,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy Gateway ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/gateway.yaml ``` > **_NOTE:_** This file couples together the gateway infra and the HTTPRoute infra for a convenient, quick startup. Creating additional/different InferencePools on the same gateway will require an additional set of: `Backend`, `HTTPRoute`, the resources included in the `./config/manifests/gateway/ext-proc.yaml` file, and an additional `./config/manifests/gateway/patch_policy.yaml` file. 
***Should you choose to experiment, familiarity with xDS and Envoy are very useful.*** @@ -81,13 +81,13 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy the Inference Extension and InferencePool ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/ext_proc.yaml ``` ### Deploy Envoy Gateway Custom Policies ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/extension_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/patch_policy.yaml ``` > **_NOTE:_** This is also per InferencePool, and will need to be configured to support the new pool should you wish to experiment further. @@ -96,7 +96,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv For high-traffic benchmarking you can apply this manifest to avoid any defaults that can cause timeouts/errors. 
```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/traffic_policy.yaml ``` ### Try it out @@ -120,16 +120,16 @@ This quickstart guide is intended for engineers familiar with k8s and model serv The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. please be careful not to delete resources you'd like to keep. ```bash - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/traffic_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/extension_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/patch_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/ext_proc.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gateway.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found - kubectl delete -f 
https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found - kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/traffic_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/extension_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/patch_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/ext_proc.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/gateway.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/gateway/enable_patch_policy.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/inferencemodel.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencepools.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml 
--ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/cpu-deployment.yaml --ignore-not-found + kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/release-0.2/config/manifests/vllm/gpu-deployment.yaml --ignore-not-found kubectl delete secret hf-token --ignore-not-found ``` \ No newline at end of file From 5362a9eb14c1d0c9f22518a285fb6eaeb2d9a4bb Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 13 Mar 2025 20:44:35 +0000 Subject: [PATCH 96/96] Updates artifacts for v0.2.0 release Signed-off-by: Kellen Swain --- config/manifests/ext_proc.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/manifests/ext_proc.yaml b/config/manifests/ext_proc.yaml index a8af1691..15bebb6a 100644 --- a/config/manifests/ext_proc.yaml +++ b/config/manifests/ext_proc.yaml @@ -71,7 +71,7 @@ spec: spec: containers: - name: inference-gateway-ext-proc - image: registry.k8s.io/gateway-api-inference-extension/epp:v0.2.0-rc.1 + image: registry.k8s.io/gateway-api-inference-extension/epp:v0.2.0 imagePullPolicy: IfNotPresent args: - -poolName