From 2f18756707a7e611260194a52c3d577ccbf91b5e Mon Sep 17 00:00:00 2001
From: Rob Scott
Date: Tue, 1 Apr 2025 16:30:38 +0100
Subject: [PATCH 01/74] Adding larger logo (#630)

---
 site-src/images/logo/logo-text-xl-dark.png | Bin 0 -> 88763 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 site-src/images/logo/logo-text-xl-dark.png

diff --git a/site-src/images/logo/logo-text-xl-dark.png b/site-src/images/logo/logo-text-xl-dark.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d878e5c807084c5f39dcd57e8d72c787ceb3d24
GIT binary patch
literal 88763
[base85-encoded binary payload omitted: logo-text-xl-dark.png, 88763 bytes]
z3>WcMhmiiR9&GsuH)4sIW6QaIOeNw9yz=8^!>5_1?|tl3N}`T78|*By#@aKOtAzky zQz6h-k*CEyq|Bh(dH}o7L)8bSMcJlJa&up3M*&)JWx^f<2M^(Nx*m# z#__*E!UuY7_hkE9;litY&I{Ko!mDMGKY&W`YTEVO2|LiLLMQVuN1Y9l_OK$vwe7Gw z`m_o6QL&ok>v;fPcvdC+W2bfqwh(WLvQ}}7R^BC)Pp>BWe*`ZC$=#c*33c(T(toY| zA$srG!26Q}uQ__4-1QS69W$ByBOUvV1pC={KPZ_@vjRA5M=70YHGCcZz0XU^v^}od zIKHT|m9PC5d%JyJ_AwBdzLkR%@oWWp`4;=&9(qYL+}>!&%>^TStDDvEf{r&F$>=RV z_6XR6BXCI_GV=e2Dge>~RpMNf675UOZJd;$8mxaL@g)RxSvn(OONhP>#-vkD`xW>6Y2 zpUzq)5LN$Q`U-U4eZn793U?zZ+`QOynZX1AD>$S(MI@Ayc`&RDb znP~#a_{XW3fS3W+f2s%1p2471LU4O**IwZ!R;*;|!~D9sDp0!gnj$7Db6TSuxccs|`;U2iWrqugusA|LdPP&7+>Rn11wHd94^WdzD+zw$@2L zV83CyFqKoHAy$5i&*$U(-}d0Yp2F=d{$PDI4jRYj?)uNDbFr^a_gmClEqrUZ3m9`A zR;4{5RO&Hfp6mlOQH8B^`?4L1uf?E`nN>cblNXAAa$#zn#7y;=q~E4}K8`&6g2NVI zBfOGuAxD)>eX!a|dGJb{{TZM=Hs;P>%Z-y&*Day8dd+u6R{oJT4a}j|dd*zdyfBh& znu1{8_A!?8IB_6KJ$;^PB5$t`+2fD)tXH``^mG+>(H?U7+O$cjc5nY&l!uF7V`I*Z z8R=a^ApSpfMwsJ`2@&>hAoxtjn0`e3LG6F~(P;MEBFJa-+AQkVYJ8o$KS|mLd+!JY z;SwK(1LSG@?LpZN64$^=wY}^;O@$Mqs`g}{JM^#EgKw|(0ATsA7{`P`PJXGkp#>EKJM~E7cgl)SBNSiGy6Vg9~?Ns$nev!+T_f`}?2|M3UsCR_}}d zbx<+gI4Pmob$g&o+%?Z?eO2|h%)xb^GALDy>9;qDOi zta)7{p0|osSBPu}S+{*3S?GzK@~VucC(-+y{0L z9~z1jC-JI}zUU#!<@R}@Tt^i}n$a+o;k2r~h-Z&X&h!J!5(0om9l#h!%f0j4x~BT! z1vX6;qT@Q?Z0|8<4Cg{c*Fvv7@Zmyn^y=9BJvOf?9^uhhMXuF|_#x|s&kK{(-G zXJ4+Lwtv{va1*qoS*x9x$|(bQz5!NDZG_dQD{&$v9oLOqq>UrP-&kDy6&vy%EO(z3 zeKx7Nw&b@=wE_Aij5IDiyFs05TYWYHK1)YNWc~lm26pS0dpt$y<~q;}rHI?^IaWw+s(% z4w>)=%fJ>IM@Ik8k=_B`P7^yO9Al{XRjgOx|7+vQqms(PxZo0M9aEEJD7D2bF_$(q zaBG_|71v{(DK{`GL#gKE!lcltJZj0AgtBrHmo&9N(vp<)m1zz}U^WrBkvX5-uHg@yWjinec$){-7}HC1+gz?XEaqiWTe$m?-CzR=14W!)0w2dc8FR> zCeK=w?=z|Gu2@=|A0CBC1~N5-U4+?m-+VtWyT>0(Nz22Hr4-%kD@iy)J3sZy)q8!z zt$Ybmlhk>NT|umX z-1BAj(MM8=%3m*b<`eLBH3O4D2VB9*1X$Xvy0-(u{Hv(#Tx+vT#I)o!3i!%3|Kfy= zR^~idS}-uQ4)ZQoTy{vQDyF0qMGkQ&DHVGCrRMe6v8x6K(r%hTrvQur@`iFMtlE20#|l};p|tfYvRuON8= zg{{W?)uW^J9*6C;S!B>93qNF|L?^IO96QUkyMi409;mwWL4*Txfn?@34!!n)pSkXh(OK zYqbCG$1oO-*#jZ|p!tAUZiGEVC4&`wHhCC3Y!1ep0@eCW0pbM(!p zY>Pp;J76w)>e1=%M?_fB3j@7+v6UjP$P>(y%{l`8Tr4)P{(kjLAt)U8uzfMT`-o-V zdS%np@R)5}=|@ETL}u{f_k+UO3QJzKjLl@U=yy+rO6tIYs%Jk4hLHwN{?^ok*{-*| zk>1G7^RFbZsV{z+gwW}l%@kIC$sXc?{RP}KZ~t1iE8iwl^jm@CA3TWo8Off*<6wBn zTiB-Z-a2z15@67Ua^sPH?Ov77-6MRL50F zu~FrUhb2HKqYV&U1Oj}3PKN>QX08k8%x7Wpp2+wOXI8$Q>4!nqC87goWZR*En$%UG z%HGxwJ(0Z;3f(_{Xbf@3b=d;$?aS&gS8CwiMts0rn)(Ct{;($u#JOvx@_(nW>M!Fz z0^Q080WEn8jQgx6|6^;w4$QIF*V=yT`cgJ&(`e&yXB}?rEcMbIC_3TD9F@9w*2p2# zK;+zJ!J@L-858Y*Wdr_?#chO6K=$@m*;n%;YSMyjNR{d3$|}$Jja^m@^waJFRC32N zX8y)?W*>!$@$Wz&Cm%RDxtfM3G#RZ-y9R^WNpZ2{=>;|!)<7(|=dzM$Nw-M-Jr@>& zUgzSPQv7bE_907_im`_Kf{f{CG{GBY?I_lgw}k9WxA*iL$+S1FNBB5oY|-SwTx?%N zbYnV{NTE|u6U%?_Y5#>!Ll8yHt#?U+kH1Q2lvO_rNx?ZrCE?Vhxtq%%z@b`&+jYAD z4iQ6)-mdZ0#aOG8)L1BPjjUsYFNFjw$BZ2KhNDw+-c;W10trD$#XK-s)a&KzWcFi; z$T7p{7%F`yY*3w)C6+wuHG(=^?UX?9{OK;jRjkD{RBYhAWw;>*O^+ITWXRwj9J%@Z Z0^(PiHN6m7b-)N*#JypA_`4F${|DjoQuqJ> literal 0 HcmV?d00001 From 40595e011caccd7da491fa264be84ed694c36a3b Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 1 Apr 2025 09:32:45 -0700 Subject: [PATCH 02/74] Minor fixes to the user guide (#633) * Update InferencePool name in healthcheck.yaml * Update curl command for CPU based model --- config/manifests/gateway/gke/healthcheck.yaml | 2 +- site-src/guides/index.md | 38 +++++++++++++------ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/config/manifests/gateway/gke/healthcheck.yaml b/config/manifests/gateway/gke/healthcheck.yaml index 95f4f2d2..93b6cd7f 100644 --- a/config/manifests/gateway/gke/healthcheck.yaml +++ b/config/manifests/gateway/gke/healthcheck.yaml @@ -7,7 +7,7 
@@ spec: targetRef: group: "inference.networking.x-k8s.io" kind: InferencePool - name: vllm-llama2-7b + name: vllm-llama3-8b-instruct default: config: type: HTTP diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 7fdb211c..f1545438 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -240,17 +240,33 @@ This quickstart guide is intended for engineers familiar with k8s and model serv Wait until the gateway is ready. - ```bash - IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') - PORT=80 - - curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ - "model": "food-review", - "prompt": "Write as if you were a critic: San Francisco", - "max_tokens": 100, - "temperature": 0 - }' - ``` +=== "GPU-Based Model Server" + + ```bash + IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') + PORT=80 + + curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "food-review", + "prompt": "Write as if you were a critic: San Francisco", + "max_tokens": 100, + "temperature": 0 + }' + ``` + +=== "CPU-Based Model Server" + + ```bash + IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}') + PORT=80 + + curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "Write as if you were a critic: San Francisco", + "max_tokens": 100, + "temperature": 0 + }' + ``` ### Cleanup From 419aba9605753e7250bfac2d339b3da4868c29a8 Mon Sep 17 00:00:00 2001 From: Lior Lieberman Date: Tue, 1 Apr 2025 12:02:37 -0700 Subject: [PATCH 03/74] Add istio to implementations.md (#631) * Add istio to implementations.md * fixes --- site-src/implementations.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/site-src/implementations.md b/site-src/implementations.md index 89acb436..8a95119d 100644 --- a/site-src/implementations.md +++ b/site-src/implementations.md @@ -54,3 +54,12 @@ Issue](https://github.com/GoogleCloudPlatform/gke-gateway-api/issues/20). [gke-gateway]:https://cloud.google.com/kubernetes-engine/docs/concepts/gateway-api [gke-gateway-deploy]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-gateways [gke-multi-cluster-gateway]:https://cloud.google.com/kubernetes-engine/docs/how-to/deploying-multi-cluster-gateways + +## Istio + +[Istio](https://istio.io/) is an open source service mesh and gateway implementation. +It provides a fully compliant implementation of the Kubernetes Gateway API for cluster ingress traffic control. +For service mesh users, Istio also fully supports east-west (including [GAMMA](https://gateway-api.sigs.k8s.io/mesh/)) traffic management within the mesh. + +Gateway API Inference Extension support is being tracked by this [GitHub +Issue](https://github.com/istio/istio/issues/55768). 
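(A small companion to the readiness step in the quickstart change above: before running either curl variant, you can block until the Gateway is actually programmed. This is a hedged sketch — it assumes the Gateway is named `inference-gateway` in the current namespace, as in the guide.)

```bash
# Wait for the Gateway to report the standard Programmed condition.
kubectl wait gateway/inference-gateway --for=condition=Programmed --timeout=180s

# Then resolve its address exactly as the guide does.
IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}')
PORT=80
```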
From 110f490bdf2dd55230e5c387d8de091c406e5d61 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 1 Apr 2025 12:50:46 -0700 Subject: [PATCH 04/74] Update e2e test config (#636) --- test/e2e/epp/e2e_test.go | 4 ++-- test/testdata/envoy.yaml | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go index 09c8835a..e86b2d49 100644 --- a/test/e2e/epp/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -94,11 +94,11 @@ var _ = ginkgo.Describe("InferencePool", func() { func newInferenceModel(ns string) *v1alpha2.InferenceModel { targets := []v1alpha2.TargetModel{ { - Name: modelName + "-0", + Name: modelName, Weight: ptr.To(int32(50)), }, { - Name: modelName + "-1", + Name: "cad-fabricator", Weight: ptr.To(int32(50)), }, } diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml index fc32b5aa..62e6b4c5 100644 --- a/test/testdata/envoy.yaml +++ b/test/testdata/envoy.yaml @@ -104,10 +104,11 @@ data: timeout: 10s processing_mode: request_header_mode: SEND - response_header_mode: SKIP - request_body_mode: BUFFERED - request_trailer_mode: SKIP - response_trailer_mode: SKIP + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND message_timeout: 1000s # Mark it as disabled if needed for troubleshooting: # disabled: true @@ -221,7 +222,7 @@ spec: spec: containers: - name: envoy - image: docker.io/envoyproxy/envoy:distroless-v1.32.2 + image: docker.io/envoyproxy/envoy:distroless-v1.33.2 args: - "--service-cluster" - "default/inference-gateway" From ae858abe1b5a4954443a6b6220cd0177b089d9b2 Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 1 Apr 2025 18:22:42 -0400 Subject: [PATCH 05/74] Fix parsing issue in BBR helm (#638) --- config/charts/body-based-routing/README.md | 2 +- config/charts/body-based-routing/templates/gke.yaml | 2 +- config/charts/body-based-routing/values.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/charts/body-based-routing/README.md b/config/charts/body-based-routing/README.md index 062f2b5c..a6b8d3cd 100644 --- a/config/charts/body-based-routing/README.md +++ b/config/charts/body-based-routing/README.md @@ -47,7 +47,7 @@ The following table list the configurable parameters of the chart. | `bbr.image.tag` | Image tag. | | `bbr.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. | | `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `istio`, `gke`. Defaults to `none`. | -| `inference-gateway.name` | The name of the Gateway. Defaults to `inference-gateway`. | +| `inferenceGateway.name` | The name of the Gateway. Defaults to `inference-gateway`. 
| ## Notes diff --git a/config/charts/body-based-routing/templates/gke.yaml b/config/charts/body-based-routing/templates/gke.yaml index 937bfa0b..77b776a4 100644 --- a/config/charts/body-based-routing/templates/gke.yaml +++ b/config/charts/body-based-routing/templates/gke.yaml @@ -9,7 +9,7 @@ spec: targetRefs: - group: "gateway.networking.k8s.io" kind: Gateway - name: {{ .Values.inference-gateway.name }} + name: {{ .Values.inferenceGateway.name }} extensionChains: - name: chain1 extensions: diff --git a/config/charts/body-based-routing/values.yaml b/config/charts/body-based-routing/values.yaml index b77d7542..0b88dc43 100644 --- a/config/charts/body-based-routing/values.yaml +++ b/config/charts/body-based-routing/values.yaml @@ -12,5 +12,5 @@ bbr: provider: name: none -inference-gateway: +inferenceGateway: name: inference-gateway From 740be256da42f717a7effbd11aced9ce1532f19c Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 2 Apr 2025 05:12:39 +0300 Subject: [PATCH 06/74] fixed bug - sleep is expecting to get a string (#618) * fixed bug - sleep is expecting to get a string Signed-off-by: Nir Rozenbaum * remove space Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- config/manifests/vllm/gpu-deployment.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index 4f13736d..e7cb193e 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -77,7 +77,7 @@ spec: #exec: # command: # - /usr/bin/sleep - # - 30 + # - "30" livenessProbe: httpGet: path: /health @@ -133,7 +133,6 @@ spec: path: /health port: http scheme: HTTP - resources: limits: nvidia.com/gpu: 1 From 8e793c21047d996db038aa2c92830d047f52f28d Mon Sep 17 00:00:00 2001 From: Conor O'Callaghan <4090256+Conor0Callaghan@users.noreply.github.com> Date: Wed, 2 Apr 2025 04:32:36 +0100 Subject: [PATCH 07/74] #632 Add favicon for doc site (#634) --- mkdocs.yml | 2 +- site-src/images/favicon-64.png | Bin 0 -> 5013 bytes 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 site-src/images/favicon-64.png diff --git a/mkdocs.yml b/mkdocs.yml index 2dc4d2a1..b67cf8b4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,7 +10,7 @@ theme: icon: repo: fontawesome/brands/git-alt logo: images/logo/logo-text-large-horizontal-white.png - favicon: images/k8s-favicon.png + favicon: images/favicon-64.png features: - search.highlight - navigation.tabs diff --git a/site-src/images/favicon-64.png b/site-src/images/favicon-64.png new file mode 100644 index 0000000000000000000000000000000000000000..f2bd3d64a81c830be8b6349c3e78a963774fd2dd GIT binary patch literal 5013 zcmV;G6Kd>gw)da;@yOs#n*od(Sz0pS{2R z?Q_oc;(HXOl=EvDfc5|)0CqcnaJgLH=dez_P;tknhf?YZrPMa1R75FtSSj^^QfjJF zsxH7>i9Qom73wg0y@M3&P03{*3~}5oWGT*RyHqQFscF|z5QXP0 z3FGr`;z%SF3LJ`OOUeS`Nrkp$9$ax=5Mzgh(6Lp9A!-2p4Zth_YiuP>z!1m$WR~Kb z;W1kAk>+AD1uR<^#mh?~_-bohX16KDv&9&K;V@=k5aWi2aC+N}j9m-hH2_ZlSec>F zF&ZQ9yR#Jc>)2+DpKmU5&>wys#k{2vtlkj^9}!nSrXkgfr43P5GzK>7 z8N4r^P$({Ry3AkP98P7-P=7oNjghwI;dA2z*h6n{*7}+;~pN@x~7h790cZHUEu z1kD(s(OC?FM|F{2iSbae6y1{&R8*9H~y> z&SxtTiK@JekRBTlODY5lTo~FbfQh3*7|^XC`?jSzYb-z$M#y6H4X##6KLynjP`Hy{CShX^ddF4e9B}Pe+9? 
zq=!E{q+S3ph27>dLSw}c1MGRD{LzRdZvMZY)?)ABlqel2IDypT0uEKB^C=R~rosplAO&o!&&@+n=exx?KtB>_ikd z;&B7}j-(`}8BRyTmW0vqifsr0D>laP`KB1I7+i>7Utfx#U&cF4d%~LRXB+GGCiBmq z|5=T-I}-v}o)Z{k=73$Osk6j==>gG%Iw_o-Z4~%i;-RlDuf@{~Ys^msh%ve9iCij` z8KmbR|FtcCQf8ff6Y;5RBx<2kD=!K>E|i7bC@pj&5>sLj%R1q%#3Klhi@vH89kiU6 z>)u=#&<#L4{5?LJDEH*(Cb!UFm!B8JPX+}A)fvYOLmJM^G{KXhGIdNb zM)wcme?E)|Cuv|PS6YYhBjg*caBj8%vK^Q=_2WNu+i&(J;LW~6&wk%71!z<1!Clvt zh@!N%rDwfEOD~?dwH)_fU)n(VL`q@Lp`@@#BMdW>7VWXSfT6woc=ooIm@=kVjLvW} zAvOK-V%&OJF}k$%VsH;Xa;=we>}06K7A=FbfiT+xI%+q;zKRsSJ(7Ya3jjDU4&dSW zRk&|liMaK>V@qYm|F$fG-lzHSho84Vr`DNGZ`zZ<+h5jU#ip3F`}Fj3^gTVz-v04f zE#|%(M(-{@jOrV}rTv2T!U)5U%2LsX=O2}6opDt$CS6=4zjKnFSya7p zx7lBj!c%Y8;N7q4WY+0DbmySI{vM{-%CIHIq1!+IUKlrCSR`#5cF2$*eJ&2T(7BBl z;W~x6i)--rk7^N)S~;mWCyP4J3$y{c18B=(nS1O}XxFe)#hRT7ap}f{@kV!ldSMtd zW>-i)XD3E^HuurGC?-9A7_WU&EBv3SiN^DLUGl$?qJRq?Pr6NBKU8Jmu~)0`%cm=` z=Wx=#uNN~&8?S`SEwheD-vq#>>JW}Yuq-W1@x7eCXGi!T^U#J}iDpwAy^c)lvyCy~ z@=oEbaDY3$ONvrUA52K_EvS%;OIzJYjeV%hh0a_HfoAl6n1Gup3KnjPeZII0%L{0;oX$a8v{RuHb zKKz`Jk*LD-D~fH`?%KhNzf3R3n=9(jqmwU#xe+{> zI_dVz=}uvr_dQ>U4_8O!d;hNK)Y-G6Pe73*skwCo>aupHZ9GGW*1{12<|6813S0Ii za^}@+@p>Q&5Jp&&Q+dYs>_s(r=;fp0_C|Pp?65*ZO>JRU7o8Jmp#1JbNlbg{2tM5y zLrKUjV9T@$?b<_n`fY4!dYmq7-IqbWY(iuVV7mo7Jy4lyw)ADZ(sh5>zTSzL!)a}P zNMPs~J?~+60Il z*Tz-zE43CR=M#-7EOWQ*&-5IozxP%}@vCPm#j~73Pg(e7ow#=PEN=cbfoV@xVAsK9 zMz-Ljt=nbCvTT~LAZsog*oDz<``I%!A%a`G!XWdDkcK>fh0m2Zz>Kq8>Bt;Zitf+B z?Kzy1MNA{rwHEGv?kH}bQ;BI$RS1=JDsiyN!ojMP&HD||QIWk4VpLk_7Tm~d-&6`C z+k5PkZ~S+l88kg0pOqh_W6O97Ei9`$!pwlyr5@o1jMJfK`DNk52+{nD%`sts?DN%+ zYcXMD$iC!bVKJstD+$pDG(fqr z2IGX()04+auS*javmn7d-mQaAiXoQCeYzCLVn;4v>STB@kH=F$yK)aog7x<@iu!de zkPkhBq43p~xKL~7+64Q~2Lz!B5sNT_Xy8n`e6jEN<8?*kBb^5T{H#_#?N&_3)5TbTG%&Jaa`{< zyEOnewn9`>XW^T@Nn?QCEJM6H|H~ETe=6zAdYG1`(wtGoU1`WImD!|=L$(Xj7>%V) zORony@4a$xAubyb%xN%k2*!3Uae4s9@!-TVsUE+pObKDCjakAbbA<+hn>cXExkB7J z0q91>heO9S$wKx_7<>Qfni5<#Fo@!yt4R+b(>r+7l2F?IT@u^(C#A07pdP-dOvsCo zTn!hZ9*FURk zfMwK;X_XV=cF6|s1vJ1p{4p9+5+m*`|4|l%gj_i!X!Ct4#+*;SFs!#9=XFmL7%Dw= zR)rKER9IGcT>v2^mQ2TTug=;2l_{*<_Z&(|+XO&pkD;+hrT;vo+|KqOIXN3*DyQ&Vqf3*rrR!0SN*icc>EzEK0+B8q+;Anl;2F|?Ba+q=FOogkq z#&P|lhh*2uNtbD)t{r@0T<7z>yZA6}m|et)2X6l^Azn#ivoKq}A&Mv7sKN5}(VU=4 z-B?RF$ki&|_J;ZPi_t|AbGa%(f(n%`@MWI#;J+`v9~LG^Is1$P{Opn<#57tHKOSvSn`W4ftHL?pPE|cg*p<@P0Uh0C7xajJdFd z7}2@4N5-b58o+$F-UOACn(bP6WD%k;;Lbb0va_|j5}3aw_efNV^d!q~wbSC`&woC{tUR8(3Uab~xPdD%0(T8>|+_IX(@i6`mRi&g-TeU4NRq3sl z70db|Dei*hkt}}NxG?S~jtV(R)0Y6I(lK(KLu%@j808rN{~tQGXUwjYh0o?dk%M3c zc;c3FRMlFz?stb|2@Kjj-FoLIO>139rIfvSVi+9h@q%BsM(Yv}?wVUE=MnSi_xv8| zdO_d2UmxM!+}I(koKfw#`E~#wLFf9J6H8?gG~UFX0Tpah>XGV^$}C?Wlk~;sZ8qsm zFJqA-kkwgwyUhH?4>Ja7H-~b^H6=n|nj&Xx=d{*dAv;?6m=5L5O&P6GSkp#`=mS$b z`Gtx-a#NY`@5Tm0<6PP=V0L4L_2sg?%a!?QFfbRebdA7}aDveTf{hxUV+s1*Qf{}e zKaX&aJ~i6p5xgdGT|dZT>1J&VsR-{KSE}FQX!r~>1@VF|&l&u5V=QmV>J)keD^KDF z@nQ4cWX>!ki#3*#?Rm3lqC@vjIMK7h$i#!m0Opu4fBZ)^=DeF56geYhMQ&LfN#=PD zIF~RqylzO_EmEoe>VS zDs0J{AjhjbbQsW>M;lR|eUoO-xT?fl+Mz=>FXfSs0g!2g`v5G0cF~zPl%ZRP+@MGR zJQ=U{q*t69UMJ`gXC9nbCY2t5uvUKeu^hY2`|PJ~l&JLoO!3sJ)Qv}OZXpFsGL?6q zCoFQ*K9Jsecf4D~igx+2wslO0U$qffK2IcJ&%P7Jmi>v7a@;kSaIh_k-MI0>5Uw9?GrffXu4V>4 zt`q0SGZ2Rn9suxQ{q=MODNAZ6A^tNvP47Oja&%;#^}4)0@V&op?0BQ|p6s_mzb9Ft zl Date: Tue, 1 Apr 2025 23:50:36 -0400 Subject: [PATCH 08/74] Move integration test utils to central package (#626) * Move integration test utils to central package * Move integration test utils to central package --- test/integration/epp/hermetic_test.go | 80 ++++--------------- .../request.go => test/integration/util.go | 54 ++++++++++++- 2 files changed, 69 insertions(+), 65 deletions(-) 
rename pkg/epp/util/testing/request.go => test/integration/util.go (57%) diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 2acdacf8..0ba0e14a 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -66,7 +66,8 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" - utiltesting "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + epptestutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" + integrationutils "sigs.k8s.io/gateway-api-inference-extension/test/integration" "sigs.k8s.io/yaml" ) @@ -104,7 +105,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }{ { name: "select lower queue and kv cache, no active lora", - req: utiltesting.GenerateRequest(logger, "test1", "my-model"), + req: integrationutils.GenerateRequest(logger, "test1", "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { @@ -145,7 +146,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select active lora, low queue", - req: utiltesting.GenerateRequest(logger, "test2", "sql-lora"), + req: integrationutils.GenerateRequest(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ @@ -199,7 +200,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "select no lora despite active model, avoid excessive queue size", - req: utiltesting.GenerateRequest(logger, "test3", "sql-lora"), + req: integrationutils.GenerateRequest(logger, "test3", "sql-lora"), // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. Also is critical, so we should // still honor request despite all queues > 5 @@ -253,7 +254,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical and all models past threshold, shed request", - req: utiltesting.GenerateRequest(logger, "test4", "sql-lora-sheddable"), + req: integrationutils.GenerateRequest(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. 
pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ @@ -296,7 +297,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical, but one server has capacity, do not shed", - req: utiltesting.GenerateRequest(logger, "test5", "sql-lora-sheddable"), + req: integrationutils.GenerateRequest(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { @@ -370,7 +371,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { }, DynamicMetadata: test.wantMetadata, } - res, err := sendRequest(t, client, test.req) + res, err := integrationutils.SendRequest(t, client, test.req) if err != nil && !test.wantErr { t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) @@ -410,7 +411,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // Request flow tests { name: "select lower queue and kv cache, no active lora", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test1", "my-model"), + requests: integrationutils.GenerateStreamedRequestSet(logger, "test1", "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { @@ -484,7 +485,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, { name: "select active lora, low queue", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test2", "sql-lora"), + requests: integrationutils.GenerateStreamedRequestSet(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ @@ -565,7 +566,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, { name: "select no lora despite active model, avoid excessive queue size", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test3", "sql-lora"), + requests: integrationutils.GenerateStreamedRequestSet(logger, "test3", "sql-lora"), // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. Also is critical, so we should // still honor request despite all queues > 5 @@ -646,7 +647,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical and all models past threshold, shed request", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), + requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. 
pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ @@ -692,7 +693,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { }, { name: "noncritical, but one server has capacity, do not shed", - requests: utiltesting.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"), + requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ fakePod(0): { @@ -1483,7 +1484,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { t.Run(test.name, func(t *testing.T) { client, cleanup := setUpHermeticServer(t, test.pods, true) t.Cleanup(cleanup) - responses, err := streamedRequest(t, client, test.requests, len(test.wantResponses)) + responses, err := integrationutils.StreamedRequest(t, client, test.requests, len(test.wantResponses)) if err != nil && !test.wantErr { t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) @@ -1522,7 +1523,7 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac } for pod := range podAndMetrics { - pod := utiltesting.MakePod(pod.NamespacedName.Name). + pod := epptestutil.MakePod(pod.NamespacedName.Name). Namespace(pod.NamespacedName.Namespace). ReadyCondition(). Labels(podLabels). @@ -1571,7 +1572,7 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac // clear created pods for pod := range podAndMetrics { - pod := utiltesting.MakePod(pod.NamespacedName.Name). + pod := epptestutil.MakePod(pod.NamespacedName.Name). Namespace(pod.NamespacedName.Namespace).Complete().ObjRef() if err := k8sClient.Delete(context.Background(), pod); err != nil { @@ -1688,55 +1689,6 @@ func BeforeSuite() func() { } } -func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - - res, err := client.Recv() - if err != nil { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - return res, err -} - -func streamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) { - for _, req := range requests { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - } - responses := []*extProcPb.ProcessingResponse{} - - // Make an incredible simple timeout func in the case where - // there is less than the expected amount of responses; bail and fail. - var simpleTimeout bool - go func() { - time.Sleep(10 * time.Second) - simpleTimeout = true - }() - - for range expectedResponses { - if simpleTimeout { - break - } - res, err := client.Recv() - if err != nil && err != io.EOF { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - responses = append(responses, res) - } - return responses, nil -} - // readDocuments reads documents from file. 
func readDocuments(fp string) ([][]byte, error) { b, err := os.ReadFile(fp) diff --git a/pkg/epp/util/testing/request.go b/test/integration/util.go similarity index 57% rename from pkg/epp/util/testing/request.go rename to test/integration/util.go index 30772ad5..294317c3 100644 --- a/pkg/epp/util/testing/request.go +++ b/test/integration/util.go @@ -14,10 +14,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package testing +package integration import ( "encoding/json" + "io" + "testing" + "time" envoyCorev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" @@ -25,6 +28,55 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +func SendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { + t.Logf("Sending request: %v", req) + if err := client.Send(req); err != nil { + t.Logf("Failed to send request %+v: %v", req, err) + return nil, err + } + + res, err := client.Recv() + if err != nil { + t.Logf("Failed to receive: %v", err) + return nil, err + } + t.Logf("Received request %+v", res) + return res, err +} + +func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, requests []*extProcPb.ProcessingRequest, expectedResponses int) ([]*extProcPb.ProcessingResponse, error) { + for _, req := range requests { + t.Logf("Sending request: %v", req) + if err := client.Send(req); err != nil { + t.Logf("Failed to send request %+v: %v", req, err) + return nil, err + } + } + responses := []*extProcPb.ProcessingResponse{} + + // Make an incredible simple timeout func in the case where + // there is less than the expected amount of responses; bail and fail. + var simpleTimeout bool + go func() { + time.Sleep(10 * time.Second) + simpleTimeout = true + }() + + for range expectedResponses { + if simpleTimeout { + break + } + res, err := client.Recv() + if err != nil && err != io.EOF { + t.Logf("Failed to receive: %v", err) + return nil, err + } + t.Logf("Received request %+v", res) + responses = append(responses, res) + } + return responses, nil +} + func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest { j := map[string]interface{}{ "model": model, From a13a1239330ffaaafa4d0f948c00cafd106086aa Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Wed, 2 Apr 2025 00:34:41 -0400 Subject: [PATCH 09/74] BBR readme fixes (#640) --- config/charts/body-based-routing/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/charts/body-based-routing/README.md b/config/charts/body-based-routing/README.md index a6b8d3cd..d311b8c3 100644 --- a/config/charts/body-based-routing/README.md +++ b/config/charts/body-based-routing/README.md @@ -10,7 +10,7 @@ To install a body-based router named `body-based-router`, you can run the follow ```txt $ helm install body-based-router ./config/charts/body-based-routing \ --set provider.name=[gke|istio] \ - --set inference-gateway.name=inference-gateway + --set inferenceGateway.name=inference-gateway ``` Note that the provider name is needed to ensure provider-specific manifests are also applied. If no provider is specified, then only @@ -19,7 +19,7 @@ the deployment and service are deployed. 
To install via the latest published chart in staging (--version v0 indicates latest dev version), you can run the following command: ```txt -$ helm install body-based-router oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-router \ +$ helm install body-based-router oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/body-based-routing \ --version v0 --set provider.name=[gke|istio] ``` @@ -51,4 +51,4 @@ The following table list the configurable parameters of the chart. ## Notes -This chart should only be deployed once per Gateway. \ No newline at end of file +This chart should only be deployed once per Gateway. From 0a0d609c003cb504fe93a03db6dc36bd8c91ba37 Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Wed, 2 Apr 2025 11:50:47 -0400 Subject: [PATCH 10/74] Add streaming integration tests for BBR (#627) --- pkg/body-based-routing/handlers/server.go | 8 +- test/integration/bbr/hermetic_test.go | 210 +++++++++++++++++----- test/integration/util.go | 8 +- 3 files changed, 174 insertions(+), 52 deletions(-) diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/body-based-routing/handlers/server.go index 24664f98..484b3318 100644 --- a/pkg/body-based-routing/handlers/server.go +++ b/pkg/body-based-routing/handlers/server.go @@ -114,16 +114,16 @@ func (s *Server) processRequestBody(ctx context.Context, body *extProcPb.HttpBod var requestBody map[string]interface{} if s.streaming { + streamedBody.body = append(streamedBody.body, body.Body...) // In the stream case, we can receive multiple request bodies. - if !body.EndOfStream { - streamedBody.body = append(streamedBody.body, body.Body...) - return nil, nil - } else { + if body.EndOfStream { loggerVerbose.Info("Flushing stream buffer") err := json.Unmarshal(streamedBody.body, &requestBody) if err != nil { logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") } + } else { + return nil, nil } } else { if err := json.Unmarshal(body.GetBody(), &requestBody); err != nil { diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go index 718bfedf..02d412ab 100644 --- a/test/integration/bbr/hermetic_test.go +++ b/test/integration/bbr/hermetic_test.go @@ -19,20 +19,19 @@ package bbr import ( "context" - "encoding/json" "fmt" "testing" "time" configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + integrationutils "sigs.k8s.io/gateway-api-inference-extension/test/integration" ) var logger = logutil.NewTestLogger().V(logutil.VERBOSE) @@ -46,7 +45,7 @@ func TestBodyBasedRouting(t *testing.T) { }{ { name: "success adding model parameter to header", - req: generateRequest(logger, "llama"), + req: integrationutils.GenerateRequest(logger, "test", "llama"), wantHeaders: []*configPb.HeaderValueOption{ { Header: &configPb.HeaderValue{ @@ -59,7 +58,7 @@ func TestBodyBasedRouting(t *testing.T) { }, { name: "no model parameter", - req: generateRequest(logger, ""), + req: integrationutils.GenerateRequest(logger, "test1", ""), wantHeaders: []*configPb.HeaderValueOption{}, wantErr: false, }, @@ -67,7 
+66,7 @@ func TestBodyBasedRouting(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer() + client, cleanup := setUpHermeticServer(false) t.Cleanup(cleanup) want := &extProcPb.ProcessingResponse{} @@ -88,7 +87,7 @@ func TestBodyBasedRouting(t *testing.T) { } } - res, err := sendRequest(t, client, test.req) + res, err := integrationutils.SendRequest(t, client, test.req) if err != nil && !test.wantErr { t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) } @@ -99,12 +98,171 @@ func TestBodyBasedRouting(t *testing.T) { } } -func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { +func TestFullDuplexStreamed_BodyBasedRouting(t *testing.T) { + tests := []struct { + name string + reqs []*extProcPb.ProcessingRequest + wantResponses []*extProcPb.ProcessingResponse + wantErr bool + }{ + { + name: "success adding model parameter to header", + reqs: integrationutils.GenerateStreamedRequestSet(logger, "test", "foo"), + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("foo"), + }, + }, + }}, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"foo\",\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + { + name: "success adding model parameter to header with multiple body chunks", + reqs: []*extProcPb.ProcessingRequest{ + { + Request: &extProcPb.ProcessingRequest_RequestHeaders{ + RequestHeaders: &extProcPb.HttpHeaders{ + Headers: &configPb.HeaderMap{ + Headers: []*configPb.HeaderValue{ + { + Key: "hi", + Value: "mom", + }, + }, + }, + }, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lo"), EndOfStream: false}, + }, + }, + { + Request: &extProcPb.ProcessingRequest_RequestBody{ + RequestBody: &extProcPb.HttpBody{Body: []byte("ra-sheddable\",\"prompt\":\"test\",\"temperature\":0}"), EndOfStream: true}, + }, + }, + }, + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{ + Response: &extProcPb.CommonResponse{ + ClearRouteCache: true, + HeaderMutation: &extProcPb.HeaderMutation{ + SetHeaders: []*configPb.HeaderValueOption{ + { + Header: &configPb.HeaderValue{ + Key: "X-Gateway-Model-Name", + RawValue: []byte("sql-lora-sheddable"), + }, + }, + }}, + }, + }, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-sheddable\",\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, 
+ }, + }, + }, + }, + }, + }, + }, + }, + { + name: "no model parameter", + reqs: integrationutils.GenerateStreamedRequestSet(logger, "test", ""), + wantResponses: []*extProcPb.ProcessingResponse{ + { + Response: &extProcPb.ProcessingResponse_RequestHeaders{ + RequestHeaders: &extProcPb.HeadersResponse{}, + }, + }, + { + Response: &extProcPb.ProcessingResponse_RequestBody{ + RequestBody: &extProcPb.BodyResponse{ + Response: &extProcPb.CommonResponse{ + BodyMutation: &extProcPb.BodyMutation{ + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: []byte("{\"max_tokens\":100,\"prompt\":\"test\",\"temperature\":0}"), + EndOfStream: true, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + client, cleanup := setUpHermeticServer(true) + t.Cleanup(cleanup) + + responses, err := integrationutils.StreamedRequest(t, client, test.reqs, len(test.wantResponses)) + if err != nil && !test.wantErr { + t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) + } + + if diff := cmp.Diff(test.wantResponses, responses, protocmp.Transform()); diff != "" { + t.Errorf("Unexpected response, (-want +got): %v", diff) + } + }) + } +} + +func setUpHermeticServer(streaming bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { port := 9004 serverCtx, stopServer := context.WithCancel(context.Background()) serverRunner := runserver.NewDefaultExtProcServerRunner(port, false) serverRunner.SecureServing = false + serverRunner.Streaming = streaming go func() { if err := serverRunner.AsRunnable(logger.WithName("ext-proc")).Start(serverCtx); err != nil { @@ -133,41 +291,3 @@ func setUpHermeticServer() (client extProcPb.ExternalProcessor_ProcessClient, cl time.Sleep(5 * time.Second) } } - -func generateRequest(logger logr.Logger, model string) *extProcPb.ProcessingRequest { - j := map[string]interface{}{ - "prompt": "test1", - "max_tokens": 100, - "temperature": 0, - } - if model != "" { - j["model"] = model - } - - llmReq, err := json.Marshal(j) - if err != nil { - logutil.Fatal(logger, err, "Failed to unmarshal LLM request") - } - req := &extProcPb.ProcessingRequest{ - Request: &extProcPb.ProcessingRequest_RequestBody{ - RequestBody: &extProcPb.HttpBody{Body: llmReq}, - }, - } - return req -} - -func sendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, req *extProcPb.ProcessingRequest) (*extProcPb.ProcessingResponse, error) { - t.Logf("Sending request: %v", req) - if err := client.Send(req); err != nil { - t.Logf("Failed to send request %+v: %v", req, err) - return nil, err - } - - res, err := client.Recv() - if err != nil { - t.Logf("Failed to receive: %v", err) - return nil, err - } - t.Logf("Received request %+v", res) - return res, err -} diff --git a/test/integration/util.go b/test/integration/util.go index 294317c3..5fcc9d18 100644 --- a/test/integration/util.go +++ b/test/integration/util.go @@ -40,7 +40,7 @@ func SendRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessClient, t.Logf("Failed to receive: %v", err) return nil, err } - t.Logf("Received request %+v", res) + t.Logf("Received response %+v", res) return res, err } @@ -71,7 +71,7 @@ func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessCli t.Logf("Failed to receive: %v", err) return nil, err } - t.Logf("Received request %+v", res) + t.Logf("Received response %+v", res) responses = append(responses, res) } return responses, nil @@ -79,11 
+79,13 @@ func StreamedRequest(t *testing.T, client extProcPb.ExternalProcessor_ProcessCli func GenerateRequest(logger logr.Logger, prompt, model string) *extProcPb.ProcessingRequest { j := map[string]interface{}{ - "model": model, "prompt": prompt, "max_tokens": 100, "temperature": 0, } + if model != "" { + j["model"] = model + } llmReq, err := json.Marshal(j) if err != nil { From 3b562f32c066c11c539b45af31ff75d7030290c3 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 2 Apr 2025 16:00:38 -0700 Subject: [PATCH 11/74] Adding 2 new reviewers to the reviewers alias (#644) --- OWNERS_ALIASES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/OWNERS_ALIASES b/OWNERS_ALIASES index 6e8e0c5d..933fbe9c 100644 --- a/OWNERS_ALIASES +++ b/OWNERS_ALIASES @@ -11,6 +11,9 @@ aliases: gateway-api-inference-extension-reviewers: - liu-cong - robscott + - shaneutt + - nirrozenbaum + wg-serving-leads: - ArangoGutierrez From 2a48131d9aacafe84e7a1b9a2744e9b5d30e2886 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Wed, 2 Apr 2025 20:50:39 -0700 Subject: [PATCH 12/74] Add initial implementer's guide (#635) * Add initial implementer's guide * Add line break to fix the list formatting * Add line break to fix the list formatting * Address code review comments * Fix formatting for conformance tests --- site-src/guides/implementers.md | 112 +++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/site-src/guides/implementers.md b/site-src/guides/implementers.md index 5d1c6267..7bfd536a 100644 --- a/site-src/guides/implementers.md +++ b/site-src/guides/implementers.md @@ -1,3 +1,113 @@ # Implementer's Guide -TODO \ No newline at end of file +This guide is intended for developers looking to implement support for the InferencePool custom resources within their Gateway API controller. It outlines how InferencePool fits into the existing resource model, discusses implementation options, explains how to interact with extensions, and provides guidance on testing. + +## InferencePool as a Gateway Backend +Before we dive into the implementation, let’s recap how an InferencePool works. + +Overview of API integration + +**InferencePool** represents a set of Inference-focused Pods and an extension that will be used to route to them. The InferencePool introduces a new type of backend within the Gateway API resource model. Instead of targeting Services, a Gateway can route traffic to an InferencePool. This InferencePool then becomes responsible for intelligent routing to the underlying model server pods based on the associated InferenceModel configurations. + +Here is an example of how to route traffic to an InferencePool using an HTTPRoute: +``` +apiVersion: gateway.networking.k8s.io/v1 +kind: HTTPRoute +metadata: + name: llm-route +spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: base-model + matches: + - path: + type: PathPrefix + value: / +``` + +Note that the `rules.backendRefs` describes which InferencePool should receive the forwarded traffic when the path matches the corresponding path prefix. This is very similar to how we configure a Gateway with an HTTPRoute that directs traffic to a Service (a way to select Pods and specify a port). 
By using the InferencePool, it provides an abstraction over a set of compute resources (model server pods), and allows the controller to implement specialized routing strategies for these inference workloads. + +## Building the Gateway controller +The general idea of implementing a Gateway controller supporting the InferencePool involves two major steps: + +1. Tracking the endpoints for InferencePool backends +2. Callout to an extension to make intelligent routing decisions + +### Endpoint Tracking +Consider a simple inference pool like this: +``` +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: vllm-llama3-8b-instruct +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama3-8b-instruct + extensionRef: + name: vllm-llama3-8b-instruct-epp +``` + +There are mainly two options for how to treat the Inference Pool in your controller. + +**Option 1: Shadow Service Creation** + +If your Gateway controller already handles Service as a backend, you can choose to create a headless Service that mirrors the endpoints defined by the InferencePool, like this: + +``` +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct-shadow-service +spec: + ports: + - port: 54321 + protocol: TCP + targetPort: 8000 + selector: + app: vllm-llama3-8b-instruct + type: ClusterIP + clusterIP: None +``` + +The gateway controller would then treat this shadow service just like any other backend service it routes traffic to. + +This approach likely allows you to leverage existing service discovery, healthcheck infrastructure, and load balancing mechanisms that your controller already supports. However, it does come with the overhead of managing additional Service objects, and hence may affect the overall latency of the reconciliation of the Gateways. + +**Option 2: Tracking InferencePool Endpoints Separately** + +You can also choose to directly select and monitor the endpoints belonging to the InferencePool. For the simple inference pool example we have above, the controller would use the label `app: vllm-llama3-8b-instruct` to discover the pods matching the criteria, and get their endpoints (i.e. IP and port number). It would then need to monitor these pods for health and availability. + +With this approach, you can tailor the endpoint tracking and routing logic specifically to the characteristics and requirements of your InferencePool. + +### Callout Extension + +The [Endpoint Picker](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp), or EPP, is a core component of the inference extension. The primary interaction for routing requests is defined between the proxy (e.g., Envoy) and the EPP using the Envoy [external processing service protocol](https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto). See the [Endpoint Picker Protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/004-endpoint-picker-protocol) for more information. + +#### How to Callout to EPP + +For each HTTP request, the proxy CAN communicate the subset of endpoints the EPP MUST pick from by setting `x-gateway-destination-endpoint-subset` key in the filter metadata field of the ext-proc request. If this key is set, the EPP must select from this endpoint list. If the list is empty or no endpoints are eligible, it should return a 503 error. If the key isn't set, the EPP selects from the endpoints defined by the InferencePool selector. 
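(As a companion to the "Endpoint Tracking" section above — Option 2 in particular — the discovery step can be sketched in a few lines of controller-runtime code. This is a hedged illustration, not part of this repository: `resolveEndpoints` and `podReady` are hypothetical helper names, and a real controller would watch the matched Pods and react to readiness changes rather than listing on demand.)

```go
package example

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// resolveEndpoints (hypothetical) lists the Pods matched by an InferencePool's
// selector and returns "ip:port" endpoints on the pool's targetPortNumber.
func resolveEndpoints(ctx context.Context, c client.Client, namespace string,
	selector map[string]string, targetPort int32) ([]string, error) {
	var pods corev1.PodList
	if err := c.List(ctx, &pods,
		client.InNamespace(namespace),
		client.MatchingLabels(selector)); err != nil {
		return nil, err
	}
	endpoints := []string{}
	for i := range pods.Items {
		pod := &pods.Items[i]
		// Skip Pods that cannot serve traffic yet.
		if pod.Status.PodIP == "" || !podReady(pod) {
			continue
		}
		endpoints = append(endpoints, fmt.Sprintf("%s:%d", pod.Status.PodIP, targetPort))
	}
	return endpoints, nil
}

// podReady reports whether the Pod's Ready condition is True.
func podReady(pod *corev1.Pod) bool {
	for _, cond := range pod.Status.Conditions {
		if cond.Type == corev1.PodReady {
			return cond.Status == corev1.ConditionTrue
		}
	}
	return false
}
```

For the `vllm-llama3-8b-instruct` pool shown earlier, this would be invoked with the selector `app: vllm-llama3-8b-instruct` and target port `8000`.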
+ +#### Response from the extension + +The EPP communicates the chosen endpoint to the proxy via the `x-gateway-destination-endpoint` HTTP header and the `dynamic_metadata` field of the ext-proc response. Failure to communicate the endpoint using both methods results in a 503 error if no endpoints are ready, or a 429 error if the request should be dropped. The header and metadata values must match. In addition to the chosen endpoint, a single fallback endpoint CAN be set using the key `x-gateway-destination-endpoint-fallback` in the same metadata namespace as one used for `x-gateway-destination-endpoint`. + +## Testing Tips + +Here are some tips for testing your controller end-to-end: + +- **Focus on Key Scenarios**: Add common scenarios like creating, updating, and deleting InferencePool resources, as well as different routing rules that target InferencePool backends. +- **Verify Routing Behaviors**: Design more complex routing scenarios and verify that requests are correctly routed to the appropriate model server pods within the InferencePool based on the InferenceModel configuration. +- **Test Error Handling**: Verify that the controller correctly handles scenarios like unsupported model names or resource constraints (if criticality-based shedding is implemented). Test with state transitions (such as constant requests while Pods behind EPP are being replaced and Pods behind InferencePool are being replaced) to ensure that the system is resilient to failures and can automatically recover by redirecting traffic to healthy Pods. +- **Using Reference EPP Implementation + Echoserver**: You can use the [reference EPP implementation](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) for testing your controller end-to-end. Instead of a full-fledged model server, a simple mock server (like the [echoserver](https://github.com/kubernetes-sigs/ingress-controller-conformance/tree/master/images/echoserver)) can be very useful for verifying routing to ensure the correct pod received the request. +- **Performance Test**: Run end-to-end [benchmarks](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/) to make sure that your inference gateway can achieve the latency target that is desired. + +### Conformance Tests + +A set of conformance tests will be developed soon to help verify that a controller is working as expected. This guide will be updated once we have more information. Stay tuned! 
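(Returning to the "Response from the extension" section of the implementer's guide above: the metadata half of that contract can be illustrated with a short Go sketch. This is hedged, not part of the repository: `pickedEndpoint` is a hypothetical helper, and the `envoy.lb` metadata namespace is an assumption — the namespace is configurable and must match whatever your proxy consults for endpoint selection; only the `x-gateway-destination-endpoint` key is fixed by the protocol.)

```go
package example

import (
	"fmt"

	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
)

// pickedEndpoint (hypothetical) extracts the EPP's chosen endpoint from the
// dynamic metadata of an ext-proc response. The generated getters are
// nil-safe, so a missing namespace or key simply falls through to "".
func pickedEndpoint(resp *extProcPb.ProcessingResponse) (string, error) {
	ns := resp.GetDynamicMetadata().GetFields()["envoy.lb"] // assumed namespace
	ep := ns.GetStructValue().GetFields()["x-gateway-destination-endpoint"].GetStringValue()
	if ep == "" {
		// Per the protocol, the proxy should fail the request in this case:
		// 503 if no endpoints are ready, 429 if the request was shed.
		return "", fmt.Errorf("EPP did not pick an endpoint")
	}
	return ep, nil // e.g. "10.0.0.12:8000"
}
```

The same value is expected in the `x-gateway-destination-endpoint` response header, so an integration can cross-check the two before routing.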
From 206ef937d2e693a7c05b2892f69dbb9a5e3dbf79 Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Thu, 3 Apr 2025 00:30:36 -0400 Subject: [PATCH 13/74] Update BBR istio.yaml to use FULL_DUPLEX_STREAM (#629) --- config/charts/body-based-routing/templates/istio.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/charts/body-based-routing/templates/istio.yaml b/config/charts/body-based-routing/templates/istio.yaml index c4c1444f..6d4535cc 100644 --- a/config/charts/body-based-routing/templates/istio.yaml +++ b/config/charts/body-based-routing/templates/istio.yaml @@ -25,9 +25,9 @@ spec: processing_mode: request_header_mode: "SEND" response_header_mode: "SKIP" - request_body_mode: "BUFFERED" + request_body_mode: "FULL_DUPLEX_STREAMED" response_body_mode: "NONE" - request_trailer_mode: "SKIP" + request_trailer_mode: "SEND" response_trailer_mode: "SKIP" grpc_service: envoy_grpc: From 2e4642563a907d5ab241866c2f56c37b161b79d7 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Thu, 3 Apr 2025 09:30:38 -0700 Subject: [PATCH 14/74] Bumps Kgateway to v2.0.0 (#646) Signed-off-by: Daneyon Hansen --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index f1545438..367ca902 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -201,7 +201,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 2. Set the Kgateway version and install the Kgateway CRDs. ```bash - KGTW_VERSION=v2.0.0-rc.2 + KGTW_VERSION=v2.0.0 helm upgrade -i --create-namespace --namespace kgateway-system --version $KGTW_VERSION kgateway-crds oci://cr.kgateway.dev/kgateway-dev/charts/kgateway-crds ``` From 81100ffe0e9180d608fd4ba91b514e7d40c290cd Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Thu, 3 Apr 2025 22:02:37 +0300 Subject: [PATCH 15/74] remove deprecated v1alpha2.AddToScheme and use v1alpha2.Install instead (#649) Signed-off-by: Nir Rozenbaum --- pkg/epp/controller/inferencemodel_reconciler_test.go | 2 +- pkg/epp/controller/inferencepool_reconciler_test.go | 2 +- pkg/epp/server/controller_manager.go | 2 +- test/integration/epp/hermetic_test.go | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index cd1ff1fb..57dc2469 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -178,7 +178,7 @@ func TestInferenceModelReconciler(t *testing.T) { t.Run(test.name, func(t *testing.T) { // Create a fake client with no InferenceModel objects. scheme := runtime.NewScheme() - _ = v1alpha2.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) initObjs := []client.Object{} if test.model != nil { initObjs = append(initObjs, test.model) diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 27c4238e..7e5d4801 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -77,7 +77,7 @@ func TestInferencePoolReconciler(t *testing.T) { // Set up the scheme. scheme := runtime.NewScheme() _ = clientgoscheme.AddToScheme(scheme) - _ = v1alpha2.AddToScheme(scheme) + _ = v1alpha2.Install(scheme) // Create a fake client with the pool and the pods. 
initialObjects := []client.Object{pool1, pool2}

diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go
index 41fe86a9..aaad8976 100644
--- a/pkg/epp/server/controller_manager.go
+++ b/pkg/epp/server/controller_manager.go
@@ -36,7 +36,7 @@ var scheme = runtime.NewScheme()

 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
-	utilruntime.Must(v1alpha2.AddToScheme(scheme))
+	utilruntime.Must(v1alpha2.Install(scheme))
 }

 // DefaultManagerOptions returns the default options used to create the manager.
diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go
index 0ba0e14a..cf00a049 100644
--- a/test/integration/epp/hermetic_test.go
+++ b/test/integration/epp/hermetic_test.go
@@ -1602,7 +1602,7 @@ func BeforeSuite() func() {
 	}

 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
-	utilruntime.Must(v1alpha2.AddToScheme(scheme))
+	utilruntime.Must(v1alpha2.Install(scheme))

 	k8sClient, err = k8sclient.New(cfg, k8sclient.Options{Scheme: scheme})
 	if err != nil {

From 2759e3f06fc65235edc131e4d9d191dd71d69b2a Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Sat, 5 Apr 2025 02:34:37 +0300
Subject: [PATCH 16/74] removed time.sleep and using ticker instead (#648)

* removed time.sleep and using ticker instead

Signed-off-by: Nir Rozenbaum

* move ticker creation outside of the go routine. make sure the refresh interval is valid at ticker creation time

Signed-off-by: Nir Rozenbaum

* add DefaultRefreshPrometheusMetricsInterval for test purposes. once a ticker was introduced instead of sleep, having 0 as the refresh interval is not valid.

Signed-off-by: Nir Rozenbaum

* wait in test until metrics are available before running tests that rely on the values. up until now, the metrics go routine ran in tests with time.Sleep(0), which means metrics were available immediately. while in tests it might be acceptable to wait a few seconds using sleep, in the actual code (not tests) it is bad practice to use sleep, so it was replaced with a ticker (to perform the periodic task in an endless loop).

Signed-off-by: Nir Rozenbaum

---------

Signed-off-by: Nir Rozenbaum
---
 pkg/epp/backend/metrics/logger.go      | 13 +++++++------
 pkg/epp/backend/metrics/pod_metrics.go | 14 ++++++--------
 pkg/epp/server/runserver.go            |  1 +
 test/integration/epp/hermetic_test.go  |  2 ++
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go
index d71dc3fa..d9a93027 100644
--- a/pkg/epp/backend/metrics/logger.go
+++ b/pkg/epp/backend/metrics/logger.go
@@ -32,6 +32,7 @@ const (
 	// Note currently the EPP treats stale metrics same as fresh.
 	// TODO: https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/336
 	metricsValidityPeriod = 5 * time.Second
+	debugPrintInterval    = 5 * time.Second
 )

 type Datastore interface {
@@ -46,16 +47,15 @@ type Datastore interface {
 // enabled; 2) flushes Prometheus metrics about the backend servers.
func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometheusMetricsInterval time.Duration) { logger := log.FromContext(ctx) - - // Periodically flush prometheus metrics for inference pool + ticker := time.NewTicker(refreshPrometheusMetricsInterval) go func() { + defer ticker.Stop() for { select { case <-ctx.Done(): logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") return - default: - time.Sleep(refreshPrometheusMetricsInterval) + case <-ticker.C: // Periodically flush prometheus metrics for inference pool flushPrometheusMetricsOnce(logger, datastore) } } @@ -64,13 +64,14 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh // Periodically print out the pods and metrics for DEBUGGING. if logger := logger.V(logutil.DEBUG); logger.Enabled() { go func() { + ticker := time.NewTicker(debugPrintInterval) + defer ticker.Stop() for { select { case <-ctx.Done(): logger.V(logutil.DEFAULT).Info("Shutting down metrics logger thread") return - default: - time.Sleep(5 * time.Second) + case <-ticker.C: podsWithFreshMetrics := datastore.PodList(func(pm PodMetrics) bool { return time.Since(pm.GetMetrics().UpdateTime) <= metricsValidityPeriod }) diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index cfb6b138..c85d4d79 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -84,21 +84,19 @@ func (pm *podMetrics) startRefreshLoop() { pm.once.Do(func() { go func() { pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod()) + ticker := time.NewTicker(pm.interval) + defer ticker.Stop() for { select { case <-pm.done: return case <-pm.parentCtx.Done(): return - default: + case <-ticker.C: // refresh metrics periodically + if err := pm.refreshMetrics(); err != nil { + pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod()) + } } - - err := pm.refreshMetrics() - if err != nil { - pm.logger.V(logutil.TRACE).Error(err, "Failed to refresh metrics", "pod", pm.GetPod()) - } - - time.Sleep(pm.interval) } }() }) diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index a6c9f1d3..7ed183be 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -76,6 +76,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { PoolName: DefaultPoolName, PoolNamespace: DefaultPoolNamespace, SecureServing: DefaultSecureServing, + RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, // Datastore can be assigned later. 
} } diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index cf00a049..1c5eca18 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -1548,6 +1548,8 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac } }() + time.Sleep(serverRunner.RefreshPrometheusMetricsInterval) // wait for metrics to get available before running tests that rely on these metrics + // check if all pods are synced to datastore assert.EventuallyWithT(t, func(t *assert.CollectT) { assert.Len(t, serverRunner.Datastore.PodGetAll(), len(podAndMetrics), "Datastore not synced") From 6d7655b17755f6ea191ca53d542f699d6fb79ebb Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Mon, 7 Apr 2025 01:26:38 +0300 Subject: [PATCH 17/74] update release version in README (#653) Signed-off-by: Nir Rozenbaum --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ff00581..b74a13e9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ It currently requires a version of vLLM that supports the necessary metrics to p ## Status -This project is [alpha (0.2 release)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.2.0). It should not be used in production yet. +This project is [alpha (0.3 release)](https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/tag/v0.3.0). It should not be used in production yet. ## Getting Started From 2c0a637826218ab0bda3a1f2e4d43e897344315e Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Mon, 7 Apr 2025 02:44:38 +0300 Subject: [PATCH 18/74] fix some issues in e2e tests (#621) * added timeout to curl command which may otherwise hang Signed-off-by: Nir Rozenbaum * check HF_TOKEN set at the beginning of the test Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- test/e2e/epp/e2e_suite_test.go | 30 ++++++++++++++++++------------ test/e2e/epp/e2e_test.go | 8 ++++++-- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index 643bbf75..61ee2540 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -49,6 +49,8 @@ const ( defaultReadyTimeout = 3 * time.Minute // defaultModelReadyTimeout is the default timeout for the model server deployment to report a ready state. defaultModelReadyTimeout = 10 * time.Minute + // defaultCurlTimeout is the default timeout for the curl command to get a response. + defaultCurlTimeout = 30 * time.Second // defaultInterval is the default interval to check if a resource exists or ready conditions. defaultInterval = time.Millisecond * 250 // defaultCurlInterval is the default interval to run the test curl command. 
@@ -107,7 +109,11 @@ var _ = ginkgo.BeforeSuite(func() { }) func setupInfra() { - modelServerManifest := readModelServerManifestPath() + modelServerManifestPath := readModelServerManifestPath() + modelServerManifestArray := getYamlsFromModelServerManifest(modelServerManifestPath) + if strings.Contains(modelServerManifestArray[0], "hf-token") { + createHfSecret(cli, modelServerSecretManifest) + } crds := map[string]string{ "inferencepools.inference.networking.x-k8s.io": inferPoolManifest, "inferencemodels.inference.networking.x-k8s.io": inferModelManifest, @@ -117,7 +123,7 @@ func setupInfra() { createClient(cli, clientManifest) createEnvoy(cli, envoyManifest) // Run this step last, as it requires additional time for the model server to become ready. - createModelServer(cli, modelServerSecretManifest, modelServerManifest) + createModelServer(cli, modelServerManifestArray, modelServerManifestPath) } var _ = ginkgo.AfterSuite(func() { @@ -137,7 +143,7 @@ func setupSuite() { err = apiextv1.AddToScheme(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = infextv1a2.AddToScheme(scheme) + err = infextv1a2.Install(scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) cli, err = client.New(cfg, client.Options{Scheme: scheme}) @@ -171,6 +177,7 @@ var ( existsTimeout = getTimeout("EXISTS_TIMEOUT", defaultExistsTimeout) readyTimeout = getTimeout("READY_TIMEOUT", defaultReadyTimeout) modelReadyTimeout = getTimeout("MODEL_READY_TIMEOUT", defaultModelReadyTimeout) + curlTimeout = getTimeout("CURL_TIMEOUT", defaultCurlTimeout) interval = defaultInterval curlInterval = defaultCurlInterval ) @@ -191,6 +198,13 @@ func readModelServerManifestPath() string { return modelServerManifestFilepath } +func getYamlsFromModelServerManifest(modelServerManifestPath string) []string { + ginkgo.By("Ensuring the model server manifest points to an existing file") + modelServerManifestArray := readYaml(modelServerManifestPath) + gomega.Expect(modelServerManifestArray).NotTo(gomega.BeEmpty()) + return modelServerManifestArray +} + // createCRDs creates the Inference Extension CRDs used for testing. func createCRDs(k8sClient client.Client, crds map[string]string) { for name, path := range crds { @@ -224,15 +238,7 @@ func createClient(k8sClient client.Client, filePath string) { } // createModelServer creates the model server resources used for testing from the given filePaths. 
-func createModelServer(k8sClient client.Client, secretPath, deployPath string) { - ginkgo.By("Ensuring the model server manifest points to an existing file") - modelServerManifestArray := readYaml(deployPath) - gomega.Expect(modelServerManifestArray).NotTo(gomega.BeEmpty()) - modelServerManifestYaml := modelServerManifestArray[0] - if strings.Contains(modelServerManifestYaml, "hf-token") { - createHfSecret(k8sClient, secretPath) - } - +func createModelServer(k8sClient client.Client, modelServerManifestArray []string, deployPath string) { ginkgo.By("Creating model server resources from manifest: " + deployPath) createObjsFromYaml(k8sClient, modelServerManifestArray) diff --git a/test/e2e/epp/e2e_test.go b/test/e2e/epp/e2e_test.go index e86b2d49..7240cebc 100644 --- a/test/e2e/epp/e2e_test.go +++ b/test/e2e/epp/e2e_test.go @@ -18,7 +18,9 @@ package epp import ( "fmt" + "strconv" "strings" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -53,7 +55,7 @@ var _ = ginkgo.Describe("InferencePool", func() { }, existsTimeout, interval).Should(gomega.Succeed()) ginkgo.By("Verifying connectivity through the inference extension") - curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName) + curlCmd := getCurlCommand(envoyName, nsName, envoyPort, modelName, curlTimeout) // Ensure the expected responses include the inferencemodel target model names. var expected []string @@ -112,10 +114,12 @@ func newInferenceModel(ns string) *v1alpha2.InferenceModel { // getCurlCommand returns the command, as a slice of strings, for curl'ing // the test model server at the given name, namespace, port, and model name. -func getCurlCommand(name, ns, port, model string) []string { +func getCurlCommand(name, ns, port, model string, timeout time.Duration) []string { return []string{ "curl", "-i", + "--max-time", + strconv.Itoa((int)(timeout.Seconds())), fmt.Sprintf("%s.%s.svc:%s/v1/completions", name, ns, port), "-H", "Content-Type: application/json", From 264ee45a447949e4db0178ade98479b060dbc2b5 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Mon, 7 Apr 2025 04:48:41 -0700 Subject: [PATCH 19/74] Refactor scheduler (#645) --- pkg/epp/backend/metrics/metrics.go | 3 +- pkg/epp/backend/metrics/metrics_test.go | 11 +- pkg/epp/backend/metrics/pod_metrics_test.go | 2 + pkg/epp/backend/metrics/types.go | 26 +- pkg/epp/datastore/datastore_test.go | 3 + pkg/epp/handlers/request.go | 4 +- pkg/epp/handlers/server.go | 5 +- pkg/epp/handlers/streamingserver.go | 4 +- pkg/epp/scheduling/filter.go | 151 +++++---- pkg/epp/scheduling/filter_test.go | 326 +++----------------- pkg/epp/scheduling/scheduler.go | 121 ++++---- pkg/epp/scheduling/scheduler_test.go | 232 ++++++++++++++ pkg/epp/scheduling/types.go | 27 -- pkg/epp/scheduling/types/types.go | 88 ++++++ test/integration/epp/hermetic_test.go | 39 ++- 15 files changed, 592 insertions(+), 450 deletions(-) create mode 100644 pkg/epp/scheduling/scheduler_test.go delete mode 100644 pkg/epp/scheduling/types.go create mode 100644 pkg/epp/scheduling/types/types.go diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index d48b1dc5..96814b4b 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -109,6 +109,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( if loraMetrics != nil { updated.ActiveModels = make(map[string]int) + updated.WaitingModels = make(map[string]int) for _, label := range loraMetrics.GetLabel() { if label.GetName() == LoraInfoRunningAdaptersMetricName { if 
label.GetValue() != "" { @@ -122,7 +123,7 @@ func (p *PodMetricsClientImpl) promToPodMetrics( if label.GetValue() != "" { adapterList := strings.Split(label.GetValue(), ",") for _, adapter := range adapterList { - updated.ActiveModels[adapter] = 0 + updated.WaitingModels[adapter] = 0 } } } diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index d0396bf7..e3b45b94 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -404,7 +404,8 @@ func TestPromToPodMetrics(t *testing.T) { expectedMetrics: &Metrics{ WaitingQueueSize: 7, KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0}, + WaitingModels: map[string]int{"lora3": 0}, MaxActiveModels: 3, }, }, @@ -416,8 +417,8 @@ func TestPromToPodMetrics(t *testing.T) { KVCacheUtilization: &MetricSpec{MetricName: "vllm_usage"}, LoraRequestInfo: &MetricSpec{MetricName: "vllm:lora_requests_info"}, }, - existingMetrics: &Metrics{ActiveModels: map[string]int{}}, - expectedMetrics: &Metrics{ActiveModels: map[string]int{}}, + existingMetrics: &Metrics{ActiveModels: map[string]int{}, WaitingModels: map[string]int{}}, + expectedMetrics: &Metrics{ActiveModels: map[string]int{}, WaitingModels: map[string]int{}}, expectedErr: multierr.Combine(errors.New("metric family \"vllm_waiting\" not found"), errors.New("metric family \"vllm_usage\" not found"), errors.New("metric family \"vllm:lora_requests_info\" not found")), }, { @@ -439,7 +440,8 @@ func TestPromToPodMetrics(t *testing.T) { expectedMetrics: &Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.8, - ActiveModels: map[string]int{"lora1": 0, "lora2": 0, "lora3": 0}, + ActiveModels: map[string]int{"lora1": 0, "lora2": 0}, + WaitingModels: map[string]int{"lora3": 0}, MaxActiveModels: 3, }, expectedErr: errors.New("metric family \"vllm_waiting\" not found"), @@ -457,6 +459,7 @@ func TestPromToPodMetrics(t *testing.T) { existingMetrics: &Metrics{}, expectedMetrics: &Metrics{ ActiveModels: map[string]int{"lora1": 0}, + WaitingModels: map[string]int{}, MaxActiveModels: 0, // Should still default to 0. }, diff --git a/pkg/epp/backend/metrics/pod_metrics_test.go b/pkg/epp/backend/metrics/pod_metrics_test.go index cf6698ca..e79c1bf0 100644 --- a/pkg/epp/backend/metrics/pod_metrics_test.go +++ b/pkg/epp/backend/metrics/pod_metrics_test.go @@ -44,6 +44,7 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } updated = &Metrics{ WaitingQueueSize: 9999, @@ -53,6 +54,7 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } ) diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index 17db23b4..925a0cc5 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -41,6 +41,7 @@ type PodMetricsFactory struct { } func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics { + pod := toInternalPod(in) pm := &podMetrics{ pmc: f.pmc, ds: ds, @@ -48,9 +49,9 @@ func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1. 
parentCtx: parentCtx, once: sync.Once{}, done: make(chan struct{}), - logger: log.FromContext(parentCtx), + logger: log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName), } - pm.pod.Store(toInternalPod(in)) + pm.pod.Store(pod) pm.metrics.Store(newMetrics()) pm.startRefreshLoop() @@ -77,9 +78,20 @@ func (p *Pod) String() string { return fmt.Sprintf("%+v", *p) } +func (p *Pod) Clone() *Pod { + return &Pod{ + NamespacedName: types.NamespacedName{ + Name: p.NamespacedName.Name, + Namespace: p.NamespacedName.Namespace, + }, + Address: p.Address, + } +} + type Metrics struct { // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. - ActiveModels map[string]int + ActiveModels map[string]int + WaitingModels map[string]int // MaxActiveModels is the maximum number of models that can be loaded to GPU. MaxActiveModels int RunningQueueSize int @@ -93,7 +105,8 @@ type Metrics struct { func newMetrics() *Metrics { return &Metrics{ - ActiveModels: make(map[string]int), + ActiveModels: make(map[string]int), + WaitingModels: make(map[string]int), } } @@ -109,8 +122,13 @@ func (m *Metrics) Clone() *Metrics { for k, v := range m.ActiveModels { cm[k] = v } + wm := make(map[string]int, len(m.WaitingModels)) + for k, v := range m.WaitingModels { + wm[k] = v + } clone := &Metrics{ ActiveModels: cm, + WaitingModels: wm, MaxActiveModels: m.MaxActiveModels, RunningQueueSize: m.RunningQueueSize, WaitingQueueSize: m.WaitingQueueSize, diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index 22bb0365..abbff429 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -236,6 +236,7 @@ var ( "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, } pod2 = &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ @@ -250,6 +251,7 @@ var ( "foo1": 1, "bar1": 1, }, + WaitingModels: map[string]int{}, } pod1NamespacedName = types.NamespacedName{Name: pod1.Name, Namespace: pod1.Namespace} pod2NamespacedName = types.NamespacedName{Name: pod2.Name, Namespace: pod2.Namespace} @@ -305,6 +307,7 @@ func TestMetrics(t *testing.T) { // Failed to fetch pod2 metrics so it remains the default values. 
{ ActiveModels: map[string]int{}, + WaitingModels: map[string]int{}, WaitingQueueSize: 0, KVCacheUsagePercent: 0, MaxActiveModels: 0, diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index d7678fad..b786a15d 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -27,7 +27,7 @@ import ( "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -74,7 +74,7 @@ func (s *Server) HandleRequestBody( return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} } } - llmReq := &scheduling.LLMRequest{ + llmReq := &schedulingtypes.LLMRequest{ Model: model, ResolvedTargetModel: modelName, Critical: datastore.IsCritical(modelObj), diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index a92f091c..f6f375dd 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -26,10 +26,9 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "sigs.k8s.io/controller-runtime/pkg/log" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -57,7 +56,7 @@ type Server struct { } type Scheduler interface { - Schedule(ctx context.Context, b *scheduling.LLMRequest) (targetPod backendmetrics.PodMetrics, err error) + Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error) } func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go index 874dd734..0e9020d8 100644 --- a/pkg/epp/handlers/streamingserver.go +++ b/pkg/epp/handlers/streamingserver.go @@ -37,7 +37,7 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" + schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -343,7 +343,7 @@ func (s *StreamingServer) HandleRequestBody( return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} } } - llmReq := &scheduling.LLMRequest{ + llmReq := &schedulingtypes.LLMRequest{ Model: model, ResolvedTargetModel: modelName, Critical: datastore.IsCritical(modelObj), diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/filter.go index 
f4848089..99044e97 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/filter.go @@ -22,48 +22,63 @@ import ( "math/rand" "time" - "github.com/go-logr/logr" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) type Filter interface { Name() string - Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) + Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) } -// filter applies current filterFunc, and then recursively applies next filters depending success or -// failure of the current filterFunc. -// It can be used to construct a flow chart algorithm. -type filter struct { +type basicFilter struct { name string filter filterFunc +} + +func (bf *basicFilter) Name() string { + if bf == nil { + return "nil" + } + return bf.name +} + +func (bf *basicFilter) Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + loggerTrace := ctx.Logger.V(logutil.TRACE) + loggerTrace.Info("Running a filter", "name", bf.Name(), "podCount", len(pods)) + + return bf.filter(ctx, pods) +} + +// decisionTreeFilter applies current filterFunc, and then recursively applies next filters +// depending success or failure of the current filter. +// It can be used to construct a flow chart algorithm. +type decisionTreeFilter struct { + current Filter // nextOnSuccess filter will be applied after successfully applying the current filter. // The filtered results will be passed to the next filter. - nextOnSuccess *filter + nextOnSuccess Filter // nextOnFailure filter will be applied if current filter fails. // The original input will be passed to the next filter. - nextOnFailure *filter + nextOnFailure Filter // nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the // success or failure of the current filter. // NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of // nextOnSuccessOrFailure, in the success and failure scenarios, respectively. - nextOnSuccessOrFailure *filter + nextOnSuccessOrFailure Filter } -func (f *filter) Name() string { +func (f *decisionTreeFilter) Name() string { if f == nil { return "nil" } - return f.name + return f.current.Name() } -func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - loggerTrace := logger.V(logutil.TRACE) - loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods)) - - filtered, err := f.filter(logger, req, pods) +func (f *decisionTreeFilter) Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + loggerTrace := ctx.Logger.V(logutil.TRACE) + filtered, err := f.current.Filter(ctx, pods) next := f.nextOnSuccessOrFailure if err == nil && len(filtered) > 0 { @@ -76,7 +91,7 @@ func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetri } loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) // On success, pass the filtered result to the next filter. 
- return next.Filter(logger, req, filtered) + return next.Filter(ctx, filtered) } else { if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil { // No succeeding filters to run, return. @@ -87,19 +102,19 @@ func (f *filter) Filter(logger logr.Logger, req *LLMRequest, pods []backendmetri } loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name()) // On failure, pass the initial set of pods to the next filter. - return next.Filter(logger, req, pods) + return next.Filter(ctx, pods) } } // filterFunc filters a set of input pods to a subset. -type filterFunc func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) +type filterFunc func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. func toFilterFunc(pp podPredicate) filterFunc { - return func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - filtered := []backendmetrics.PodMetrics{} + return func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + filtered := []*types.PodMetrics{} for _, pod := range pods { - pass := pp(req, pod) + pass := pp(ctx.Req, pod) if pass { filtered = append(filtered, pod) } @@ -111,6 +126,11 @@ func toFilterFunc(pp podPredicate) filterFunc { } } +var leastQueueFilter = &basicFilter{ + name: "least queuing", + filter: leastQueuingFilterFunc, +} + // leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range // (max-min) by the number of pods, and finds the pods that fall into the first range. // The intuition is that if there are multiple pods that share similar queue size in the low range, @@ -118,30 +138,36 @@ func toFilterFunc(pp podPredicate) filterFunc { // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. 
-func leastQueuingFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { +func leastQueuingFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { min := math.MaxInt max := 0 - filtered := []backendmetrics.PodMetrics{} + filtered := []*types.PodMetrics{} for _, pod := range pods { - if pod.GetMetrics().WaitingQueueSize <= min { - min = pod.GetMetrics().WaitingQueueSize + if pod.WaitingQueueSize <= min { + min = pod.WaitingQueueSize } - if pod.GetMetrics().WaitingQueueSize >= max { - max = pod.GetMetrics().WaitingQueueSize + if pod.WaitingQueueSize >= max { + max = pod.WaitingQueueSize } } for _, pod := range pods { - if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) { + if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) { filtered = append(filtered, pod) } } return filtered, nil } -func lowQueueingPodPredicate(_ *LLMRequest, pod backendmetrics.PodMetrics) bool { - return pod.GetMetrics().WaitingQueueSize < config.QueueingThresholdLoRA +var lowQueueFilter = &basicFilter{ + name: "low queueing filter", + filter: toFilterFunc((queueThresholdPredicate(config.QueueingThresholdLoRA))), +} + +var leastKVCacheFilter = &basicFilter{ + name: "least KV cache percent", + filter: leastKVCacheFilterFunc, } // leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range @@ -150,39 +176,31 @@ func lowQueueingPodPredicate(_ *LLMRequest, pod backendmetrics.PodMetrics) bool // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. // TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { +func leastKVCacheFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { min := math.MaxFloat64 var max float64 = 0 - filtered := []backendmetrics.PodMetrics{} + filtered := []*types.PodMetrics{} for _, pod := range pods { - if pod.GetMetrics().KVCacheUsagePercent <= min { - min = pod.GetMetrics().KVCacheUsagePercent + if pod.KVCacheUsagePercent <= min { + min = pod.KVCacheUsagePercent } - if pod.GetMetrics().KVCacheUsagePercent >= max { - max = pod.GetMetrics().KVCacheUsagePercent + if pod.KVCacheUsagePercent >= max { + max = pod.KVCacheUsagePercent } } for _, pod := range pods { - if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { + if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { filtered = append(filtered, pod) } } return filtered, nil } -// podPredicate is a filter function to check whether a pod is desired. -type podPredicate func(req *LLMRequest, pod backendmetrics.PodMetrics) bool - -// We consider serving an adapter low cost it the adapter is active in the model server, or the -// model server has room to load the adapter. The lowLoRACostPredicate ensures weak affinity by -// spreading the load of a LoRA adapter across multiple pods, avoiding "pinning" all requests to -// a single pod. This gave good performance in our initial benchmarking results in the scenario -// where # of lora slots > # of lora adapters. 
-func lowLoRACostPredicate(req *LLMRequest, pod backendmetrics.PodMetrics) bool { - _, ok := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel] - return ok || len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels +var loRAAffinityFilter = &basicFilter{ + name: "affinity LoRA", + filter: loRASoftAffinityFilterFunc, } // loRASoftAffinityPredicate implements a pod selection strategy that prioritizes pods @@ -201,18 +219,20 @@ func lowLoRACostPredicate(req *LLMRequest, pod backendmetrics.PodMetrics) bool { // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { +func loRASoftAffinityFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { // Pre-allocate slices with estimated capacity - filtered_affinity := make([]backendmetrics.PodMetrics, 0, len(pods)) - filtered_available := make([]backendmetrics.PodMetrics, 0, len(pods)) + filtered_affinity := make([]*types.PodMetrics, 0, len(pods)) + filtered_available := make([]*types.PodMetrics, 0, len(pods)) // Categorize pods based on affinity and availability for _, pod := range pods { + _, active := pod.ActiveModels[ctx.Req.ResolvedTargetModel] + _, waiting := pod.WaitingModels[ctx.Req.ResolvedTargetModel] - if _, exists := pod.GetMetrics().ActiveModels[req.ResolvedTargetModel]; exists { + if active || waiting { filtered_affinity = append(filtered_affinity, pod) - } else if len(pod.GetMetrics().ActiveModels) < pod.GetMetrics().MaxActiveModels { + } else if len(pod.ActiveModels)+len(pod.WaitingModels) < pod.MaxActiveModels { filtered_available = append(filtered_available, pod) } } @@ -237,12 +257,23 @@ func loRASoftAffinityFilter(logger logr.Logger, req *LLMRequest, pods []backendm return filtered_available, nil } -func criticalRequestPredicate(req *LLMRequest, _ backendmetrics.PodMetrics) bool { - return req.Critical +// podPredicate is a filter function to check whether a pod is desired. +type podPredicate func(req *types.LLMRequest, pod *types.PodMetrics) bool + +func queueThresholdPredicate(queueThreshold int) podPredicate { + return func(req *types.LLMRequest, pod *types.PodMetrics) bool { + return pod.WaitingQueueSize <= queueThreshold + } +} + +func kvCacheThresholdPredicate(kvCacheThreshold float64) podPredicate { + return func(req *types.LLMRequest, pod *types.PodMetrics) bool { + return pod.KVCacheUsagePercent <= kvCacheThreshold + } } -func noQueueAndLessThanKVCacheThresholdPredicate(queueThreshold int, kvCacheThreshold float64) podPredicate { - return func(req *LLMRequest, pod backendmetrics.PodMetrics) bool { - return pod.GetMetrics().WaitingQueueSize <= queueThreshold && pod.GetMetrics().KVCacheUsagePercent <= kvCacheThreshold +func (pp podPredicate) and(another podPredicate) podPredicate { + return func(req *types.LLMRequest, pod *types.PodMetrics) bool { + return pp(req, pod) && another(req, pod) } } diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/filter_test.go index 127e6c21..543826d0 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/filter_test.go @@ -17,217 +17,48 @@ limitations under the License. 
package scheduling import ( + "context" "errors" "testing" - "github.com/go-logr/logr" "github.com/google/go-cmp/cmp" - "k8s.io/apimachinery/pkg/types" + k8stypes "k8s.io/apimachinery/pkg/types" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) func TestFilter(t *testing.T) { - logger := logutil.NewTestLogger() - tests := []struct { name string - req *LLMRequest - input []*backendmetrics.FakePodMetrics - output []*backendmetrics.FakePodMetrics + req *types.LLMRequest + input []*types.PodMetrics + output []*types.PodMetrics err bool - filter *filter + filter *decisionTreeFilter }{ { name: "simple filter without successor, failure", - filter: &filter{filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - return nil, errors.New("filter error") - }}, - err: true, - }, - { - name: "default filter, critical request", - filter: defaultFilter, - req: &LLMRequest{ - Model: "critical", - ResolvedTargetModel: "critical", - Critical: true, - }, - // pod2 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - }, - }, - { - name: "default filter, sheddable request, accepted", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // pod1 will be picked because it has capacity for the sheddable request. 
- input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, + filter: &decisionTreeFilter{ + current: &basicFilter{ + name: "error", + filter: func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + return nil, errors.New("filter error") }, }, }, - }, - { - name: "default filter, sheddable request, dropped", - filter: defaultFilter, - req: &LLMRequest{ - Model: "sheddable", - ResolvedTargetModel: "sheddable", - Critical: false, - }, - // All pods have higher KV cache thant the threshold, so the sheddable request will be - // dropped. - input: []*backendmetrics.FakePodMetrics{ - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, - }, - }, - }, - { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "pod3"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.85, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{}, - err: true, + err: true, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := test.filter.Filter(logger, test.req, toInterface(test.input)) + ctx := types.NewContext(context.Background(), test.req, test.input) + got, err := test.filter.Filter(ctx, test.input) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -235,26 +66,24 @@ func TestFilter(t *testing.T) { } func TestFilterFunc(t *testing.T) { - logger := logutil.NewTestLogger() - tests := []struct { name string f filterFunc - req *LLMRequest - input []*backendmetrics.FakePodMetrics - output []*backendmetrics.FakePodMetrics + req *types.LLMRequest + input []*types.PodMetrics + output []*types.PodMetrics err bool }{ { name: "least queuing empty input", f: leastQueuingFilterFunc, - input: 
[]*backendmetrics.FakePodMetrics{}, - output: []*backendmetrics.FakePodMetrics{}, + input: []*types.PodMetrics{}, + output: []*types.PodMetrics{}, }, { name: "least queuing", f: leastQueuingFilterFunc, - input: []*backendmetrics.FakePodMetrics{ + input: []*types.PodMetrics{ { Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, @@ -271,7 +100,7 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*backendmetrics.FakePodMetrics{ + output: []*types.PodMetrics{ { Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, @@ -287,13 +116,13 @@ func TestFilterFunc(t *testing.T) { { name: "least kv cache empty input", f: leastKVCacheFilterFunc, - input: []*backendmetrics.FakePodMetrics{}, - output: []*backendmetrics.FakePodMetrics{}, + input: []*types.PodMetrics{}, + output: []*types.PodMetrics{}, }, { name: "least kv cache", f: leastKVCacheFilterFunc, - input: []*backendmetrics.FakePodMetrics{ + input: []*types.PodMetrics{ { Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, @@ -310,7 +139,7 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*backendmetrics.FakePodMetrics{ + output: []*types.PodMetrics{ { Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, @@ -324,9 +153,9 @@ func TestFilterFunc(t *testing.T) { }, }, { - name: "noQueueAndLessThanKVCacheThresholdPredicate", - f: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(0, 0.8)), - input: []*backendmetrics.FakePodMetrics{ + name: "lowQueueAndLessThanKVCacheThresholdPredicate", + f: toFilterFunc(queueThresholdPredicate(0).and(kvCacheThresholdPredicate(0.8))), + input: []*types.PodMetrics{ { // This pod should be returned. Metrics: &backendmetrics.Metrics{ @@ -349,7 +178,7 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*backendmetrics.FakePodMetrics{ + output: []*types.PodMetrics{ { Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, @@ -358,72 +187,17 @@ func TestFilterFunc(t *testing.T) { }, }, }, - { - name: "low LoRA cost", - f: toFilterFunc(lowLoRACostPredicate), - req: &LLMRequest{ - Model: "model", - ResolvedTargetModel: "model", - }, - input: []*backendmetrics.FakePodMetrics{ - // ActiveModels include input model, should be returned. - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - // Input model is not active, however the server has room to load another adapter. - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - // Input is not active, and the server has reached max active models. 
- { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - }, - }, - }, - output: []*backendmetrics.FakePodMetrics{ - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "model": 1, - }, - }, - }, - { - Metrics: &backendmetrics.Metrics{ - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "another-model": 1, - }, - }, - }, - }, - }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - got, err := test.f(logger, test.req, toInterface(test.input)) + ctx := types.NewContext(context.Background(), test.req, test.input) + got, err := test.f(ctx, test.input) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, toStruct(got)); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -433,8 +207,6 @@ func TestFilterFunc(t *testing.T) { // TestLoRASoftAffinityDistribution tests that the loRASoftAffinityFilter function // properly distributes requests according to the loraAffinityThreshold func TestLoRASoftAffinityDistribution(t *testing.T) { - logger := logutil.NewTestLogger() - const ( testModelName = "test-model" testAffinityModel = "test-affinity-model" @@ -455,15 +227,15 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }() // Create a test request and pods - req := &LLMRequest{ + req := &types.LLMRequest{ Model: testAffinityModel, ResolvedTargetModel: testAffinityModel, } // Test setup: One affinity pod and one available pod - pods := []*backendmetrics.FakePodMetrics{ + pods := []*types.PodMetrics{ { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "affinity-pod"}}, + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "affinity-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ @@ -472,13 +244,14 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: types.NamespacedName{Name: "available-pod"}}, + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "available-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{}, }, }, } + ctx := types.NewContext(context.Background(), req, pods) // Run the filter function multiple times and count the results affinityCount := 0 @@ -489,7 +262,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { expectedAvailabilityPercent := 100 - expectedAffinityPercent for i := 0; i < numIterations; i++ { - result, err := loRASoftAffinityFilter(logger, req, toInterface(pods)) + result, err := loRASoftAffinityFilterFunc(ctx, pods) if err != nil { t.Fatalf("Unexpected error: %v", err) } @@ -533,22 +306,3 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { actualAvailablePercent, availableLowerBound, availableUpperBound) } } - -func toInterface(input []*backendmetrics.FakePodMetrics) []backendmetrics.PodMetrics { - output := []backendmetrics.PodMetrics{} - for _, i := range input { - output = append(output, i) - } - return output -} - -func toStruct(input []backendmetrics.PodMetrics) []*backendmetrics.FakePodMetrics { - if input == nil { - return nil - } - output := []*backendmetrics.FakePodMetrics{} - for _, i := range input { - output = append(output, i.(*backendmetrics.FakePodMetrics)) - } - return output -} diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 
e874724d..8679ffba 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -22,10 +22,9 @@ import ( "fmt" "math/rand" - "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -67,89 +66,91 @@ func LoadConfig() Config { var config = LoadConfig() var ( - defaultFilter = &filter{ - name: "critical request", - filter: toFilterFunc(criticalRequestPredicate), - nextOnSuccess: lowLatencyFilter, - nextOnFailure: sheddableRequestFilter, - } - - // queueLoRAAndKVCacheFilter applied least queue -> low cost lora -> least KV Cache filter - queueLoRAAndKVCacheFilter = &filter{ - name: "least queuing", - filter: leastQueuingFilterFunc, - nextOnSuccessOrFailure: &filter{ - name: "low cost LoRA", - filter: loRASoftAffinityFilter, - nextOnSuccessOrFailure: &filter{ - name: "least KV cache percent", - filter: leastKVCacheFilterFunc, + lowLatencyFilter = &decisionTreeFilter{ + current: lowQueueFilter, + nextOnSuccess: &decisionTreeFilter{ + current: loRAAffinityFilter, + nextOnSuccessOrFailure: &decisionTreeFilter{ + current: leastQueueFilter, + nextOnSuccessOrFailure: &decisionTreeFilter{ + current: leastKVCacheFilter, + }, }, }, - } - - // queueAndKVCacheFilter applies least queue followed by least KV Cache filter - queueAndKVCacheFilter = &filter{ - name: "least queuing", - filter: leastQueuingFilterFunc, - nextOnSuccessOrFailure: &filter{ - name: "least KV cache percent", - filter: leastKVCacheFilterFunc, - }, - } - - lowLatencyFilter = &filter{ - name: "low queueing filter", - filter: toFilterFunc((lowQueueingPodPredicate)), - nextOnSuccess: &filter{ - name: "affinity LoRA", - filter: loRASoftAffinityFilter, - nextOnSuccessOrFailure: queueAndKVCacheFilter, + nextOnFailure: &decisionTreeFilter{ + current: leastQueueFilter, + nextOnSuccessOrFailure: &decisionTreeFilter{ + current: loRAAffinityFilter, + nextOnSuccessOrFailure: &decisionTreeFilter{ + current: leastKVCacheFilter, + }, + }, }, - nextOnFailure: queueLoRAAndKVCacheFilter, } - sheddableRequestFilter = &filter{ + sheddableRequestFilter = &decisionTreeFilter{ // When there is at least one model server that's not queuing requests, and still has KV // cache below a certain threshold, we consider this model server has capacity to handle // a sheddable request without impacting critical requests. - name: "has capacity for sheddable requests", - filter: toFilterFunc(noQueueAndLessThanKVCacheThresholdPredicate(config.QueueThresholdCritical, config.KVCacheThreshold)), - nextOnSuccess: queueLoRAAndKVCacheFilter, + current: hasCapacityFilter, + nextOnSuccess: lowLatencyFilter, // If all pods are queuing or running above the KVCache threshold, we drop the sheddable // request to make room for critical requests. 
- nextOnFailure: &filter{ - name: "drop request", - filter: func(logger logr.Logger, req *LLMRequest, pods []backendmetrics.PodMetrics) ([]backendmetrics.PodMetrics, error) { - logger.V(logutil.DEFAULT).Info("Request dropped", "request", req) - return []backendmetrics.PodMetrics{}, errutil.Error{ - Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", - } - }, + nextOnFailure: dropRequestFilter, + } + + hasCapacityFilter = &basicFilter{ + name: "has capacity for sheddable requests", + filter: toFilterFunc(queueThresholdPredicate(config.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.KVCacheThreshold))), + } + + dropRequestFilter = &basicFilter{ + name: "drop request", + filter: func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + ctx.Logger.V(logutil.DEFAULT).Info("Request dropped", "request", ctx.Req) + return []*types.PodMetrics{}, errutil.Error{ + Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", + } }, } ) -func NewScheduler(datastore datastore.Datastore) *Scheduler { +func NewScheduler(datastore Datastore) *Scheduler { return &Scheduler{ - datastore: datastore, - filter: defaultFilter, + datastore: datastore, + criticalRequestFilter: lowLatencyFilter, + sheddableRequestFilter: sheddableRequestFilter, } } type Scheduler struct { - datastore datastore.Datastore - filter Filter + datastore Datastore + criticalRequestFilter Filter + sheddableRequestFilter Filter +} + +type Datastore interface { + PodGetAll() []backendmetrics.PodMetrics } // Schedule finds the target pod based on metrics and the requested lora adapter. -func (s *Scheduler) Schedule(ctx context.Context, req *LLMRequest) (targetPod backendmetrics.PodMetrics, err error) { +func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (targetPod types.Pod, err error) { logger := log.FromContext(ctx).WithValues("request", req) - podMetrics := s.datastore.PodGetAll() - logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", podMetrics)) + // Snapshot pod metrics from the datastore to: + // 1. Reduce concurrent access to the datastore. + // 2. Ensure consistent data during the scheduling operation of a request. + sCtx := types.NewContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll())) + logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", sCtx.PodsSnapshot)) + + var filter Filter + if req.Critical { + filter = s.criticalRequestFilter + } else { + filter = s.sheddableRequestFilter + } - pods, err := s.filter.Filter(logger, req, podMetrics) + pods, err := filter.Filter(sCtx, sCtx.PodsSnapshot) if err != nil || len(pods) == 0 { return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) } diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go new file mode 100644 index 00000000..3fd3fb24 --- /dev/null +++ b/pkg/epp/scheduling/scheduler_test.go @@ -0,0 +1,232 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + k8stypes "k8s.io/apimachinery/pkg/types" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +func TestSchedule(t *testing.T) { + tests := []struct { + name string + req *types.LLMRequest + input []*backendmetrics.FakePodMetrics + output types.Pod + err bool + }{ + { + name: "critical request", + req: &types.LLMRequest{ + Model: "critical", + ResolvedTargetModel: "critical", + Critical: true, + }, + // pod2 will be picked because it has relatively low queue size, with the requested + // model being active, and has low KV cache. + input: []*backendmetrics.FakePodMetrics{ + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, + output: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + WaitingModels: map[string]int{}, + }, + }, + }, + { + name: "sheddable request, accepted", + req: &types.LLMRequest{ + Model: "sheddable", + ResolvedTargetModel: "sheddable", + Critical: false, + }, + // pod1 will be picked because it has capacity for the sheddable request. 
+ input: []*backendmetrics.FakePodMetrics{ + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, + output: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + WaitingModels: map[string]int{}, + }, + }, + }, + { + name: "sheddable request, dropped", + req: &types.LLMRequest{ + Model: "sheddable", + ResolvedTargetModel: "sheddable", + Critical: false, + }, + // All pods have higher KV cache than the threshold, so the sheddable request will be + // dropped. + input: []*backendmetrics.FakePodMetrics{ + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.9, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.85, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + }, + }, + { + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 10, + KVCacheUsagePercent: 0.85, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + }, + }, + }, + }, + output: nil, + err: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + scheduler := NewScheduler(&fakeDataStore{pods: test.input}) + got, err := scheduler.Schedule(context.Background(), test.req) + if test.err != (err != nil) { + t.Errorf("Unexpected error, got %v, want %v", err, test.err) + } + + if diff := cmp.Diff(test.output, got); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + }) + } +} + +type fakeDataStore struct { + pods []*backendmetrics.FakePodMetrics +} + +func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { + pm := make([]backendmetrics.PodMetrics, 0, len(fds.pods)) + for _, pod := range fds.pods { + pm = append(pm, pod) + } + return pm +} diff --git a/pkg/epp/scheduling/types.go b/pkg/epp/scheduling/types.go deleted file mode 100644 index 29e6648d..00000000 --- a/pkg/epp/scheduling/types.go +++ /dev/null @@ -1,27 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License.
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduling - -// LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. -type LLMRequest struct { - Model string - // Target models is a map of target model name to weight. - TargetModels map[string]int - // Resolved target model is the final target model after traffic split. - ResolvedTargetModel string - Critical bool -} diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go new file mode 100644 index 00000000..9450652e --- /dev/null +++ b/pkg/epp/scheduling/types/types.go @@ -0,0 +1,88 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + "context" + "fmt" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/log" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" +) + +// LLMRequest is a structured representation of the fields we parse out of the LLMRequest body. +type LLMRequest struct { + Model string + // Target models is a map of target model name to weight. + TargetModels map[string]int + // Resolved target model is the final target model after traffic split. + ResolvedTargetModel string + Critical bool +} + +// Context holds contextual information during a scheduling operation. 
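+// It embeds the request's context.Context and carries a request-scoped logger together with an immutable snapshot of the pod metrics, so that every filter in a single scheduling pass observes the same data.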
+type Context struct { + context.Context + Logger logr.Logger + Req *LLMRequest + PodsSnapshot []*PodMetrics +} + +type Pod interface { + GetPod() *backendmetrics.Pod + GetMetrics() *backendmetrics.Metrics + String() string +} + +func (pm *PodMetrics) String() string { + if pm == nil { + return "" + } + return fmt.Sprintf("%+v", *pm) +} + +func (pm *PodMetrics) GetPod() *backendmetrics.Pod { + return pm.Pod +} + +func (pm *PodMetrics) GetMetrics() *backendmetrics.Metrics { + return pm.Metrics +} + +type PodMetrics struct { + *backendmetrics.Pod + *backendmetrics.Metrics +} + +func NewContext(ctx context.Context, req *LLMRequest, pods []*PodMetrics) *Context { + logger := log.FromContext(ctx).WithValues("request", req) + return &Context{ + Context: ctx, + Logger: logger, + Req: req, + PodsSnapshot: pods, + } +} + +func ToSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []*PodMetrics { + pm := make([]*PodMetrics, 0, len(pods)) + for _, pod := range pods { + pm = append(pm, &PodMetrics{pod.GetPod().Clone(), pod.GetMetrics().Clone()}) + } + return pm +} diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 1c5eca18..93432637 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -73,7 +73,7 @@ import ( const ( port = runserver.DefaultGrpcPort - metricsPort = 8888 + metricsPort = 8889 ) var ( @@ -157,6 +157,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -165,6 +166,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg2": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -173,6 +175,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, }, wantHeaders: []*configPb.HeaderValueOption{ @@ -212,6 +215,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 200, @@ -220,6 +224,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg2": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 6, @@ -227,6 +232,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { ActiveModels: map[string]int{ "foo": 1, }, + WaitingModels: map[string]int{}, }, }, wantHeaders: []*configPb.HeaderValueOption{ @@ -266,6 +272,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -274,6 +281,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -282,6 +290,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantHeaders: []*configPb.HeaderValueOption{}, @@ -308,6 +317,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -316,6 +326,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -324,6 +335,7 @@ func TestKubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantHeaders: 
[]*configPb.HeaderValueOption{ @@ -496,6 +508,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -504,6 +517,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg2": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -512,6 +526,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_model_request_total`: ` @@ -578,6 +593,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "bar": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 200, @@ -586,6 +602,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg2": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 6, @@ -593,6 +610,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { ActiveModels: map[string]int{ "foo": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_model_request_total`: ` @@ -659,6 +677,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -667,6 +686,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -675,6 +695,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantErr: false, @@ -704,6 +725,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -712,6 +734,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -720,6 +743,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_model_request_total`: ` @@ -812,6 +836,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -820,6 +845,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -828,6 +854,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_model_request_total`: ` @@ -920,6 +947,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -928,6 +956,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -936,6 +965,7 @@ func 
TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_model_request_total`: ` @@ -1029,6 +1059,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -1037,6 +1068,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -1045,6 +1077,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantErr: false, @@ -1125,6 +1158,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(1): { WaitingQueueSize: 0, @@ -1133,6 +1167,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, fakePod(2): { WaitingQueueSize: 10, @@ -1141,6 +1176,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "foo": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantErr: false, @@ -1470,6 +1506,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { "bar": 1, "sql-lora-1fdg3": 1, }, + WaitingModels: map[string]int{}, }, }, wantMetrics: map[string]string{`inference_pool_ready_pods`: ` From 66808d484bbe97ec717f3bf07111d698cf2235f6 Mon Sep 17 00:00:00 2001 From: Sachin Varghese Date: Mon, 7 Apr 2025 11:20:40 -0400 Subject: [PATCH 20/74] Getting started docs version bump (#654) --- site-src/guides/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 367ca902..0f1fe036 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -58,7 +58,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv === "Latest Release" ```bash - VERSION=v0.2.0 + VERSION=v0.3.0 kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/$VERSION/manifests.yaml ``` From 6058b09f38bc3f88fc92c3839f04ccde781a4dff Mon Sep 17 00:00:00 2001 From: kaushik mitra Date: Mon, 7 Apr 2025 13:02:39 -0700 Subject: [PATCH 21/74] expose "Normalized Time Per Output Token" (NTPOT) metric (#643) * add tpot to inference gateway exposed metrics * add tpot to inference gateway exposed metrics * update logging and add ntpot logging to server.go * update logging and add ntpot logging to server.go * fix lint error * change metric name from ntpot to normalized time per output token * update metrics.md --- pkg/epp/handlers/server.go | 1 + pkg/epp/handlers/streamingserver.go | 2 + pkg/epp/metrics/metrics.go | 37 ++++++ pkg/epp/metrics/metrics_test.go | 122 ++++++++++++++++-- ...lized_time_per_output_token_seconds_metric | 50 +++++++ site-src/guides/metrics.md | 1 + 6 files changed, 203 insertions(+), 10 deletions(-) create mode 100644 pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index f6f375dd..862a73b4 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -129,6 +129,7 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { 
metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens) } if reqCtx.modelServerStreaming { logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx) diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go index 0e9020d8..88963f47 100644 --- a/pkg/epp/handlers/streamingserver.go +++ b/pkg/epp/handlers/streamingserver.go @@ -184,6 +184,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) reqCtx.ResponseCompleteTimestamp = time.Now() metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) + metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens) } reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ @@ -226,6 +227,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) + metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens) } } } diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index 434b8381..b474df36 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -131,6 +131,21 @@ var ( []string{"model_name"}, ) + // NTPOT - Normalized Time Per Output Token + NormalizedTimePerOutputToken = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: InferenceModelComponent, + Name: "normalized_time_per_output_token_seconds", + Help: "Inference model latency divided by number of output tokens in seconds for each model and target model.", + // From few milliseconds per token to multiple seconds per token + Buckets: []float64{ + 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0, + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"model_name", "target_model_name"}, + ) + // Inference Pool Metrics inferencePoolAvgKVCache = compbasemetrics.NewGaugeVec( &compbasemetrics.GaugeOpts{ @@ -176,6 +191,7 @@ func Register() { legacyregistry.MustRegister(inputTokens) legacyregistry.MustRegister(outputTokens) legacyregistry.MustRegister(runningRequests) + legacyregistry.MustRegister(NormalizedTimePerOutputToken) legacyregistry.MustRegister(inferencePoolAvgKVCache) legacyregistry.MustRegister(inferencePoolAvgQueueSize) @@ -231,6 +247,27 @@ func RecordOutputTokens(modelName, targetModelName string, size int) { } } +// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token. 
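+// It divides the request latency (complete - received) by the output token count and reports whether an observation was recorded; nothing is recorded and false is returned when the timestamps are not increasing or the token count is not positive.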
+func RecordNormalizedTimePerOutputToken(ctx context.Context, modelName, targetModelName string, received time.Time, complete time.Time, outputTokenCount int) bool { + if !complete.After(received) { + log.FromContext(ctx).Error(nil, "Request latency values are invalid for NTPOT calculation", + "modelName", modelName, "targetModelName", targetModelName, "completeTime", complete, "receivedTime", received) + return false + } + + if outputTokenCount <= 0 { + log.FromContext(ctx).Error(nil, "Output token count must be positive for NTPOT calculation", + "modelName", modelName, "targetModelName", targetModelName, "outputTokenCount", outputTokenCount) + return false + } + + elapsedSeconds := complete.Sub(received).Seconds() + secondsPerToken := elapsedSeconds / float64(outputTokenCount) + + NormalizedTimePerOutputToken.WithLabelValues(modelName, targetModelName).Observe(secondsPerToken) + return true +} + // IncRunningRequests increases the current running requests. func IncRunningRequests(modelName string) { if modelName != "" { diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go index dc4c7044..b5f19e6d 100644 --- a/pkg/epp/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -29,16 +29,17 @@ import ( ) const ( - RequestTotalMetric = InferenceModelComponent + "_request_total" - RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" - RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" - RequestSizesMetric = InferenceModelComponent + "_request_sizes" - ResponseSizesMetric = InferenceModelComponent + "_response_sizes" - InputTokensMetric = InferenceModelComponent + "_input_tokens" - OutputTokensMetric = InferenceModelComponent + "_output_tokens" - RunningRequestsMetric = InferenceModelComponent + "_running_requests" - KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" - QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" + RequestTotalMetric = InferenceModelComponent + "_request_total" + RequestErrorTotalMetric = InferenceModelComponent + "_request_error_total" + RequestLatenciesMetric = InferenceModelComponent + "_request_duration_seconds" + RequestSizesMetric = InferenceModelComponent + "_request_sizes" + ResponseSizesMetric = InferenceModelComponent + "_response_sizes" + InputTokensMetric = InferenceModelComponent + "_input_tokens" + OutputTokensMetric = InferenceModelComponent + "_output_tokens" + NormalizedTimePerOutputTokenMetric = InferenceModelComponent + "_normalized_time_per_output_token_seconds" + RunningRequestsMetric = InferenceModelComponent + "_running_requests" + KVCacheAvgUsageMetric = InferencePoolComponent + "_average_kv_cache_utilization" + QueueAvgSizeMetric = InferencePoolComponent + "_average_queue_size" ) func TestRecordRequestCounterandSizes(t *testing.T) { @@ -252,6 +253,107 @@ func TestRecordRequestLatencies(t *testing.T) { } } +func TestRecordNormalizedTimePerOutputToken(t *testing.T) { + ctx := logutil.NewTestLoggerIntoContext(context.Background()) + timeBaseline := time.Now() + type tokenRequests struct { + modelName string + targetModelName string + receivedTime time.Time + completeTime time.Time + outputTokens int + } + scenarios := []struct { + name string + reqs []tokenRequests + invalid bool + }{ + { + name: "multiple requests", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1000), + outputTokens: 100, // 10ms per token + }, 
+ { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1600), + outputTokens: 80, // 20ms per token + }, + { + modelName: "m10", + targetModelName: "t11", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 6000), + outputTokens: 300, // 20ms per token + }, + { + modelName: "m20", + targetModelName: "t20", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 2400), + outputTokens: 400, // 6ms per token + }, + }, + }, + { + name: "invalid elapsed time", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline.Add(time.Millisecond * 10), + completeTime: timeBaseline, + outputTokens: 100, + }, + }, + invalid: true, + }, + { + name: "invalid token count", + reqs: []tokenRequests{ + { + modelName: "m10", + targetModelName: "t10", + receivedTime: timeBaseline, + completeTime: timeBaseline.Add(time.Millisecond * 1000), + outputTokens: 0, // Invalid: zero tokens + }, + }, + invalid: true, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, req := range scenario.reqs { + success := RecordNormalizedTimePerOutputToken(ctx, req.modelName, req.targetModelName, req.receivedTime, req.completeTime, req.outputTokens) + if success == scenario.invalid { + t.Errorf("got record success(%v), but the request expects invalid(%v)", success, scenario.invalid) + } + } + + wantLatencyPerToken, err := os.Open("testdata/normalized_time_per_output_token_seconds_metric") + if err != nil { + t.Fatal(err) + } + defer func() { + if err := wantLatencyPerToken.Close(); err != nil { + t.Error(err) + } + }() + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantLatencyPerToken, NormalizedTimePerOutputTokenMetric); err != nil { + t.Error(err) + } + }) + } +} + func TestRecordResponseMetrics(t *testing.T) { type responses struct { modelName string diff --git a/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric new file mode 100644 index 00000000..bb6e9373 --- /dev/null +++ b/pkg/epp/metrics/testdata/normalized_time_per_output_token_seconds_metric @@ -0,0 +1,50 @@ +# HELP inference_model_normalized_time_per_output_token_seconds [ALPHA] Inference model latency divided by number of output tokens in seconds for each model and target model.
+# TYPE inference_model_normalized_time_per_output_token_seconds histogram +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.01"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.02"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.05"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.1"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.2"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="0.5"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="1.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="2.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="5.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="10.0"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t10", le="+Inf"} 2 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t10"} 0.03 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t10"} 2 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.01"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.02"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.05"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.1"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.2"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="0.5"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="1.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="2.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="5.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="10.0"} 1 
+inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m10", target_model_name="t11", le="+Inf"} 1 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m10", target_model_name="t11"} 0.02 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m10", target_model_name="t11"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.001"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.002"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.005"} 0 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.01"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.02"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.05"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.1"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.2"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="0.5"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="1.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="2.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="5.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="10.0"} 1 +inference_model_normalized_time_per_output_token_seconds_bucket{model_name="m20", target_model_name="t20", le="+Inf"} 1 +inference_model_normalized_time_per_output_token_seconds_sum{model_name="m20", target_model_name="t20"} 0.006 +inference_model_normalized_time_per_output_token_seconds_count{model_name="m20", target_model_name="t20"} 1 diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md index a781f721..d16c7d47 100644 --- a/site-src/guides/metrics.md +++ b/site-src/guides/metrics.md @@ -26,6 +26,7 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ | inference_model_request_total | Counter | The counter of requests broken out for each model. | `model_name`=<model-name>
`target_model_name`=&lt;target-model-name&gt; | ALPHA | | inference_model_request_error_total | Counter | The counter of request errors broken out for each model. | `model_name`=&lt;model-name&gt;
`target_model_name`=<target-model-name> | ALPHA | | inference_model_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | +| normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=<model-name>
`target_model_name`=&lt;target-model-name&gt; | ALPHA | +| normalized_time_per_output_token_seconds | Distribution | Distribution of NTPOT (response latency per output token). | `model_name`=&lt;model-name&gt;
`target_model_name`=<target-model-name> | ALPHA | | inference_model_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | | inference_model_input_tokens | Distribution | Distribution of input token count. | `model_name`=<model-name>
`target_model_name`=<target-model-name> | ALPHA | From 9181b471190a471cb7a7a913a7db252a93f0b67e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 19:20:39 -0700 Subject: [PATCH 22/74] Bump github.com/onsi/ginkgo/v2 from 2.23.3 to 2.23.4 (#657) Bumps [github.com/onsi/ginkgo/v2](https://github.com/onsi/ginkgo) from 2.23.3 to 2.23.4. - [Release notes](https://github.com/onsi/ginkgo/releases) - [Changelog](https://github.com/onsi/ginkgo/blob/master/CHANGELOG.md) - [Commits](https://github.com/onsi/ginkgo/compare/v2.23.3...v2.23.4) --- updated-dependencies: - dependency-name: github.com/onsi/ginkgo/v2 dependency-version: 2.23.4 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 11 ++++++----- go.sum | 24 ++++++++++++++---------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/go.mod b/go.mod index fba85f91..e3239967 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/envoyproxy/go-control-plane/envoy v1.32.4 github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.7.0 - github.com/onsi/ginkgo/v2 v2.23.3 + github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.36.3 github.com/prometheus/client_golang v1.21.1 github.com/prometheus/client_model v0.6.1 @@ -65,7 +65,7 @@ require ( github.com/google/cel-go v0.22.0 // indirect github.com/google/gnostic-models v0.6.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect + github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect @@ -104,17 +104,18 @@ require ( go.opentelemetry.io/otel/sdk v1.34.0 // indirect go.opentelemetry.io/otel/trace v1.34.0 // indirect go.opentelemetry.io/proto/otlp v1.3.1 // indirect + go.uber.org/automaxprocs v1.6.0 // indirect golang.org/x/crypto v0.36.0 // indirect golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 // indirect - golang.org/x/mod v0.23.0 // indirect + golang.org/x/mod v0.24.0 // indirect golang.org/x/net v0.37.0 // indirect golang.org/x/oauth2 v0.25.0 // indirect golang.org/x/sync v0.12.0 // indirect - golang.org/x/sys v0.31.0 // indirect + golang.org/x/sys v0.32.0 // indirect golang.org/x/term v0.30.0 // indirect golang.org/x/text v0.23.0 // indirect golang.org/x/time v0.7.0 // indirect - golang.org/x/tools v0.30.0 // indirect + golang.org/x/tools v0.31.0 // indirect golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 // indirect diff --git a/go.sum b/go.sum index 2bcff108..6ea76a79 100644 --- a/go.sum +++ b/go.sum @@ -92,8 +92,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod 
h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= @@ -151,8 +151,8 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.23.3 h1:edHxnszytJ4lD9D5Jjc4tiDkPBZ3siDeJJkUZJJVkp0= -github.com/onsi/ginkgo/v2 v2.23.3/go.mod h1:zXTP6xIp3U8aVuXN8ENK9IXRaTjFnpVB9mGmaSRvxnM= +github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= +github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= github.com/onsi/gomega v1.36.3 h1:hID7cr8t3Wp26+cYnfcjR6HpJ00fdogN6dqZ1t6IylU= github.com/onsi/gomega v1.36.3/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -162,6 +162,8 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= +github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= @@ -213,6 +215,8 @@ go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -228,8 +232,8 @@ golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56 h1:2dVuKD2vS7b0QIHQbpyTISPd0 golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56/go.mod h1:M4RDyNAINzryxdtnbRXRL/OHtkFuWGRjvuhBJpk2IlY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.23.0 h1:Zb7khfcRGKk+kqfxFaP5tZqCnDZMjC5VtUBs87Hr6QM= 
-golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= +golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= +golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -248,8 +252,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= -golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= +golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= golang.org/x/term v0.30.0/go.mod h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -263,8 +267,8 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.30.0 h1:BgcpHewrV5AUp2G9MebG4XPFI1E2W41zU1SaqVA9vJY= -golang.org/x/tools v0.30.0/go.mod h1:c347cR/OJfw5TI+GfX7RUPNMdDRRbjvYTS0jPyvsVtY= +golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= +golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From 207f00def1b721d007310b7f5d7c9ae89aa31031 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Apr 2025 19:36:39 -0700 Subject: [PATCH 23/74] Bump google.golang.org/grpc from 1.71.0 to 1.71.1 (#658) Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.71.0 to 1.71.1. - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.71.0...v1.71.1) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-version: 1.71.1 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e3239967..12d65014 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 go.uber.org/zap v1.27.0 - google.golang.org/grpc v1.71.0 + google.golang.org/grpc v1.71.1 google.golang.org/protobuf v1.36.6 k8s.io/api v0.32.3 k8s.io/apiextensions-apiserver v0.32.3 diff --git a/go.sum b/go.sum index 6ea76a79..ece2d3c3 100644 --- a/go.sum +++ b/go.sum @@ -281,8 +281,8 @@ google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422 h1: google.golang.org/genproto/googleapis/api v0.0.0-20250106144421-5f5ef82da422/go.mod h1:b6h1vNKhxaSoEI+5jc3PJUCustfli/mRab7295pY7rw= google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f h1:OxYkA3wjPsZyBylwymxSHa7ViiW1Sml4ToBrncvFehI= google.golang.org/genproto/googleapis/rpc v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:+2Yz8+CLJbIfL9z73EW45avw8Lmge3xVElCP9zEKi50= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= +google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= +google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= From 5c908e3fafc0cf754e5d7679e51b7b8f53986a49 Mon Sep 17 00:00:00 2001 From: Xiaolin Lin Date: Tue, 8 Apr 2025 11:10:40 -0400 Subject: [PATCH 24/74] Fix links and description in implementations.md (#650) * Correct Envoy AI Gateway links Signed-off-by: Xiaolin Lin * fixes Signed-off-by: Xiaolin Lin * more fix Signed-off-by: Xiaolin Lin --------- Signed-off-by: Xiaolin Lin --- site-src/implementations.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/site-src/implementations.md b/site-src/implementations.md index 8a95119d..dc15b297 100644 --- a/site-src/implementations.md +++ b/site-src/implementations.md @@ -2,7 +2,7 @@ This project has several implementations that are planned or in progress: -* [Envoy Gateway][1] +* [Envoy AI Gateway][1] * [Kgateway][2] * [Google Kubernetes Engine][3] @@ -10,20 +10,20 @@ This project has several implementations that are planned or in progress: [2]:#kgateway [3]:#google-kubernetes-engine -## Envoy Gateway +## Envoy AI Gateway -[Envoy Gateway][eg-home] is an [Envoy][envoy-org] subproject for managing -Envoy-based application gateways. The supported APIs and fields of the Gateway -API are outlined [here][eg-supported]. Use the [quickstart][eg-quickstart] to -get Envoy Gateway running with Gateway API in a few simple steps. +[Envoy AI Gateway][aigw-home] is an open source project built on top of +[Envoy][envoy-org] and [Envoy Gateway][aigw-gateway] to handle request traffic +from application clients to GenAI services. The features and capabilities are outlined [here][aigw-capabilities]. Use the [quickstart][aigw-quickstart] to get Envoy AI Gateway running with Gateway API in a few simple steps. 
Progress towards supporting this project is tracked with a [GitHub -Issue](https://github.com/envoyproxy/gateway/issues/4423). +Issue](https://github.com/envoyproxy/ai-gateway/issues/423). -[eg-home]:https://gateway.envoyproxy.io/ +[aigw-home]:https://gateway.envoyproxy.io/ [envoy-org]:https://github.com/envoyproxy -[eg-supported]:https://gateway.envoyproxy.io/docs/tasks/quickstart/ -[eg-quickstart]:https://gateway.envoyproxy.io/docs/tasks/quickstart +[aigw-gateway]: https://gateway.envoyproxy.io/ +[aigw-capabilities]:https://aigateway.envoyproxy.io/docs/capabilities/ +[aigw-quickstart]:https://aigateway.envoyproxy.io/docs/capabilities/gateway-api-inference-extension ## Kgateway From f346ffb1bbb4bbf3d5a4ff2fc31a1f8954065b1a Mon Sep 17 00:00:00 2001 From: Se7en Date: Tue, 8 Apr 2025 23:10:47 +0800 Subject: [PATCH 25/74] fix manifests and description in the user guides (#652) * fix manifests and description in the user guides * add base model back --- config/manifests/inferencemodel.yaml | 4 +- config/manifests/vllm/gpu-deployment.yaml | 6 +- pkg/epp/README.md | 2 +- site-src/guides/adapter-rollout.md | 100 ++++++++++++---------- site-src/guides/index.md | 3 +- 5 files changed, 58 insertions(+), 57 deletions(-) diff --git a/config/manifests/inferencemodel.yaml b/config/manifests/inferencemodel.yaml index 75c9bb17..67c91d0e 100644 --- a/config/manifests/inferencemodel.yaml +++ b/config/manifests/inferencemodel.yaml @@ -8,9 +8,8 @@ spec: poolRef: name: vllm-llama3-8b-instruct targetModels: - - name: food-review + - name: food-review-1 weight: 100 - --- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel @@ -21,7 +20,6 @@ spec: criticality: Critical poolRef: name: vllm-llama3-8b-instruct - --- apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index e7cb193e..d62d4b02 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -243,12 +243,10 @@ metadata: data: configmap.yaml: | vLLMLoRAConfig: - name: vllm-llama3.1-8b-instruct + name: vllm-llama3-8b-instruct-adapters port: 8000 defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct ensureExist: models: - - id: food-review + - id: food-review-1 source: Kawon/llama3.1-food-finetune_v14_r8 - - id: cad-fabricator - source: redcathode/fabricator diff --git a/pkg/epp/README.md b/pkg/epp/README.md index 1bf47993..99d1bf06 100644 --- a/pkg/epp/README.md +++ b/pkg/epp/README.md @@ -1,5 +1,5 @@ # The EndPoint Picker (EPP) -This package provides the reference implementation for the Endpoint Picker (EPP). As demonistrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and interacts with the model servers through the defined [model server protocol](../..//docs/proposals/003-model-server-protocol). +This package provides the reference implementation for the Endpoint Picker (EPP). As demonstrated in the diagram below, it implements the [extension protocol](../../docs/proposals/004-endpoint-picker-protocol), enabling a proxy or gateway to request endpoint hints from an extension, and interacts with the model servers through the defined [model server protocol](../..//docs/proposals/003-model-server-protocol). 
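To make that protocol concrete, here is a minimal sketch of the ext_proc loop an endpoint picker runs: the proxy streams request events over gRPC, and the picker answers request headers with a header mutation naming the chosen endpoint. This is an illustrative sketch only, not the EPP implementation; the `pickEndpoint` helper, the hint header name, and the endpoint value are placeholders.

```go
package sketch

import (
	"errors"
	"io"

	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
)

type sketchServer struct {
	extProcPb.UnimplementedExternalProcessorServer
}

// pickEndpoint stands in for the scheduler; a real picker would filter pods on
// queue depth, KV cache usage, and active LoRA adapters.
func pickEndpoint() string { return "10.0.0.12:8000" }

// Process handles the bidirectional ext_proc stream: receive proxy events, and
// on request headers reply with a mutation carrying the selected endpoint.
func (s *sketchServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
	for {
		req, err := srv.Recv()
		if errors.Is(err, io.EOF) {
			return nil // proxy closed the stream
		}
		if err != nil {
			return err
		}
		if _, ok := req.Request.(*extProcPb.ProcessingRequest_RequestHeaders); !ok {
			continue // this sketch only reacts to request headers
		}
		resp := &extProcPb.ProcessingResponse{
			Response: &extProcPb.ProcessingResponse_RequestHeaders{
				RequestHeaders: &extProcPb.HeadersResponse{
					Response: &extProcPb.CommonResponse{
						HeaderMutation: &extProcPb.HeaderMutation{
							SetHeaders: []*configPb.HeaderValueOption{{
								Header: &configPb.HeaderValue{
									Key:      "x-gateway-destination-endpoint", // hint header name assumed
									RawValue: []byte(pickEndpoint()),
								},
							}},
						},
					},
				},
			},
		}
		if err := srv.Send(resp); err != nil {
			return err
		}
	}
}
```

The same loop would also handle request-body messages when the picker needs the model name from the body, which is how model-aware routing works in practice.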
![Architecture Diagram](../../docs/endpoint-picker.svg) diff --git a/site-src/guides/adapter-rollout.md b/site-src/guides/adapter-rollout.md index fdf62c3a..4e7a3667 100644 --- a/site-src/guides/adapter-rollout.md +++ b/site-src/guides/adapter-rollout.md @@ -18,28 +18,28 @@ Modify the LoRA syncer ConfigMap to initiate loading of the new adapter version. ```bash - kubectl edit configmap vllm-llama3-8b-instruct-adapters +kubectl edit configmap vllm-llama3-8b-instruct-adapters ``` Change the ConfigMap to match the following (note the new entry under models): ```yaml - apiVersion: v1 - kind: ConfigMap - metadata: - name: vllm-llama3-8b-instruct-adapters - data: - configmap.yaml: | - vLLMLoRAConfig: - name: vllm-llama3-8b-instruct-adapters - port: 8000 - defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct - ensureExist: - models: - - id: food-review-1 - source: Kawon/llama3.1-food-finetune_v14_r8 - - id: food-review-2 - source: Kawon/llama3.1-food-finetune_v14_r8 +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3-8b-instruct-adapters + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review-1 + source: Kawon/llama3.1-food-finetune_v14_r8 + - id: food-review-2 + source: Kawon/llama3.1-food-finetune_v14_r8 ``` The new adapter version is applied to the model servers live, without requiring a restart. @@ -51,35 +51,34 @@ Modify the InferenceModel to configure a canary rollout with traffic splitting. ```bash - kubectl edit inferencemodel food-review +kubectl edit inferencemodel food-review ``` Change the targetModels list in InferenceModel to match the following: ```yaml -apiVersion: inference.networking.x-k8s.io/v1alpha1 +apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferenceModel metadata: - name: inferencemodel-sample + name: food-review spec: modelName: food-review - criticality: Critical + criticality: Standard poolRef: - name: vllm-llama3-8b-instruct-pool + name: vllm-llama3-8b-instruct targetModels: - name: food-review-1 weight: 90 - name: food-review-2 weight: 10 - ``` The above configuration means one in every ten requests should be sent to the new version. Try it out: 1. Get the gateway IP: ```bash -IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=8081 +IP=$(kubectl get gateway/inference-gateway -o jsonpath='{.status.addresses[0].value}'); PORT=80 ``` 2. Send a few requests as follows: @@ -98,34 +97,41 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ Modify the InferenceModel to direct 100% of the traffic to the latest version of the adapter. 
```yaml -model: - name: food-review - targetModels: - targetModelName: food-review-2 - weight: 100 +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferenceModel +metadata: + name: food-review +spec: + modelName: food-review + criticality: Standard + poolRef: + name: vllm-llama3-8b-instruct + targetModels: + - name: food-review-2 + weight: 100 ``` Unload the older versions from the servers by updating the LoRA syncer ConfigMap to list the older version under the `ensureNotExist` list: ```yaml - apiVersion: v1 - kind: ConfigMap - metadata: - name: dynamic-lora-config - data: - configmap.yaml: | - vLLMLoRAConfig: - name: sql-loras-llama - port: 8000 - defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct - ensureExist: - models: - - id: food-review-2 - source: Kawon/llama3.1-food-finetune_v14_r8 - ensureNotExist: - models: - - id: food-review-1 - source: Kawon/llama3.1-food-finetune_v14_r8 +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-llama3-8b-instruct-adapters +data: + configmap.yaml: | + vLLMLoRAConfig: + name: vllm-llama3-8b-instruct-adapters + port: 8000 + defaultBaseModel: meta-llama/Llama-3.1-8B-Instruct + ensureExist: + models: + - id: food-review-2 + source: Kawon/llama3.1-food-finetune_v14_r8 + ensureNotExist: + models: + - id: food-review-1 + source: Kawon/llama3.1-food-finetune_v14_r8 ``` With this, all requests should be served by the new adapter version. diff --git a/site-src/guides/index.md b/site-src/guides/index.md index 0f1fe036..df3d1760 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -70,8 +70,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Deploy InferenceModel - Deploy the sample InferenceModel which is configured to load balance traffic between the `food-review-0` and `food-review-1` - [LoRA adapters](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. + Deploy the sample InferenceModel which is configured to forward traffic to the `food-review-1` [LoRA adapter](https://docs.vllm.ai/en/latest/features/lora.html) of the sample model server. ```bash kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencemodel.yaml From a107a291f0bd6fa02b42df8f74b3d7336742b3df Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 08:10:55 -0700 Subject: [PATCH 26/74] Bump github.com/onsi/gomega from 1.36.3 to 1.37.0 (#659) Bumps [github.com/onsi/gomega](https://github.com/onsi/gomega) from 1.36.3 to 1.37.0. - [Release notes](https://github.com/onsi/gomega/releases) - [Changelog](https://github.com/onsi/gomega/blob/master/CHANGELOG.md) - [Commits](https://github.com/onsi/gomega/compare/v1.36.3...v1.37.0) --- updated-dependencies: - dependency-name: github.com/onsi/gomega dependency-version: 1.37.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 12d65014..20cf017a 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/go-logr/logr v1.4.2 github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.23.4 - github.com/onsi/gomega v1.36.3 + github.com/onsi/gomega v1.37.0 github.com/prometheus/client_golang v1.21.1 github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.63.0 diff --git a/go.sum b/go.sum index ece2d3c3..cd6cd380 100644 --- a/go.sum +++ b/go.sum @@ -153,8 +153,8 @@ github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= github.com/onsi/ginkgo/v2 v2.23.4/go.mod h1:Bt66ApGPBFzHyR+JO10Zbt0Gsp4uWxu5mIOTusL46e8= -github.com/onsi/gomega v1.36.3 h1:hID7cr8t3Wp26+cYnfcjR6HpJ00fdogN6dqZ1t6IylU= -github.com/onsi/gomega v1.36.3/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= +github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= +github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= From 27d3991f85c2a03e6f6012c838ad4312bcd684bc Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 8 Apr 2025 15:30:40 +0000 Subject: [PATCH 27/74] adjust the gpu deployment to increase max batch size (#642) * adjust the gpu deployment to increase max batch size * Apply suggestions from code review --- config/manifests/vllm/gpu-deployment.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml index d62d4b02..16f93882 100644 --- a/config/manifests/vllm/gpu-deployment.yaml +++ b/config/manifests/vllm/gpu-deployment.yaml @@ -24,9 +24,15 @@ spec: - "1" - "--port" - "8000" + - "--max-num-seqs" + - "1024" + - "--compilation-config" + - "3" - "--enable-lora" - "--max-loras" - "2" + - "--max-lora-rank" + - "8" - "--max-cpu-loras" - "12" env: From 807d84bc2b826617c7a5ce9025f9a4958c5b5bee Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 8 Apr 2025 15:50:40 +0000 Subject: [PATCH 28/74] Cleaning up config pkg (#663) --- config/default/kustomization.yaml | 151 ------------------ config/default/manager_metrics_patch.yaml | 4 - config/default/metrics_service.yaml | 17 -- .../network-policy/allow-metrics-traffic.yaml | 26 --- config/network-policy/kustomization.yaml | 2 - config/prometheus/kustomization.yaml | 2 - config/prometheus/monitor.yaml | 30 ---- config/rbac/inferencemodel_editor_role.yaml | 27 ---- config/rbac/inferencemodel_viewer_role.yaml | 23 --- config/rbac/inferencepool_editor_role.yaml | 27 ---- config/rbac/inferencepool_viewer_role.yaml | 23 --- config/rbac/kustomization.yaml | 29 ---- config/rbac/leader_election_role.yaml | 40 ----- config/rbac/leader_election_role_binding.yaml | 15 -- config/rbac/metrics_auth_role.yaml | 17 -- config/rbac/metrics_auth_role_binding.yaml | 12
-- config/rbac/metrics_reader_role.yaml | 9 -- config/rbac/role.yaml | 11 -- config/rbac/role_binding.yaml | 15 -- config/rbac/service_account.yaml | 8 - .../gateway_v1alpha1_inferencemodel.yaml | 17 -- .../gateway_v1alpha1_inferencepool.yaml | 11 -- config/samples/kustomization.yaml | 5 - 23 files changed, 521 deletions(-) delete mode 100644 config/default/kustomization.yaml delete mode 100644 config/default/manager_metrics_patch.yaml delete mode 100644 config/default/metrics_service.yaml delete mode 100644 config/network-policy/allow-metrics-traffic.yaml delete mode 100644 config/network-policy/kustomization.yaml delete mode 100644 config/prometheus/kustomization.yaml delete mode 100644 config/prometheus/monitor.yaml delete mode 100644 config/rbac/inferencemodel_editor_role.yaml delete mode 100644 config/rbac/inferencemodel_viewer_role.yaml delete mode 100644 config/rbac/inferencepool_editor_role.yaml delete mode 100644 config/rbac/inferencepool_viewer_role.yaml delete mode 100644 config/rbac/kustomization.yaml delete mode 100644 config/rbac/leader_election_role.yaml delete mode 100644 config/rbac/leader_election_role_binding.yaml delete mode 100644 config/rbac/metrics_auth_role.yaml delete mode 100644 config/rbac/metrics_auth_role_binding.yaml delete mode 100644 config/rbac/metrics_reader_role.yaml delete mode 100644 config/rbac/role.yaml delete mode 100644 config/rbac/role_binding.yaml delete mode 100644 config/rbac/service_account.yaml delete mode 100644 config/samples/gateway_v1alpha1_inferencemodel.yaml delete mode 100644 config/samples/gateway_v1alpha1_inferencepool.yaml delete mode 100644 config/samples/kustomization.yaml diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml deleted file mode 100644 index 1fd9939f..00000000 --- a/config/default/kustomization.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# Adds namespace to all resources. -namespace: api-system - -# Value of this field is prepended to the -# names of all resources, e.g. a deployment named -# "wordpress" becomes "alices-wordpress". -# Note that it should also match with the prefix (text before '-') of the namespace -# field above. -namePrefix: api- - -# Labels to add to all resources and selectors. -#labels: -#- includeSelectors: true -# pairs: -# someName: someValue - -resources: -- ../crd -- ../rbac -- ../manager -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- ../webhook -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -#- ../certmanager -# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. -#- ../prometheus -# [METRICS] Expose the controller manager metrics service. -- metrics_service.yaml -# [NETWORK POLICY] Protect the /metrics endpoint and Webhook Server with NetworkPolicy. -# Only Pod(s) running a namespace labeled with 'metrics: enabled' will be able to gather the metrics. -# Only CR(s) which requires webhooks and are applied on namespaces labeled with 'webhooks: enabled' will -# be able to communicate with the Webhook Server. -#- ../network-policy - -# Uncomment the patches line if you enable Metrics, and/or are using webhooks and cert-manager -patches: -# [METRICS] The following patch will enable the metrics endpoint using HTTPS and the port :8443. 
-# More info: https://book.kubebuilder.io/reference/metrics -- path: manager_metrics_patch.yaml - target: - kind: Deployment - -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- path: manager_webhook_patch.yaml - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. -# Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. -# 'CERTMANAGER' needs to be enabled to use ca injection -#- path: webhookcainjection_patch.yaml - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. -# Uncomment the following replacements to add the cert-manager CA injection annotations -#replacements: -# - source: # Add cert-manager annotation to ValidatingWebhookConfiguration, MutatingWebhookConfiguration and CRDs -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -# fieldPath: .metadata.namespace # namespace of the certificate CR -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 0 -# create: true -# - source: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -# fieldPath: .metadata.name -# targets: -# - select: -# kind: ValidatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - select: -# kind: MutatingWebhookConfiguration -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - select: -# kind: CustomResourceDefinition -# fieldPaths: -# - .metadata.annotations.[cert-manager.io/inject-ca-from] -# options: -# delimiter: '/' -# index: 1 -# create: true -# - source: # Add cert-manager annotation to the webhook Service -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.name # namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' -# index: 0 -# create: true -# - source: -# kind: Service -# version: v1 -# name: webhook-service -# fieldPath: .metadata.namespace # namespace of the service -# targets: -# - select: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# fieldPaths: -# - .spec.dnsNames.0 -# - .spec.dnsNames.1 -# options: -# delimiter: '.' 
-# index: 1 -# create: true diff --git a/config/default/manager_metrics_patch.yaml b/config/default/manager_metrics_patch.yaml deleted file mode 100644 index 2aaef653..00000000 --- a/config/default/manager_metrics_patch.yaml +++ /dev/null @@ -1,4 +0,0 @@ -# This patch adds the args to allow exposing the metrics endpoint using HTTPS -- op: add - path: /spec/template/spec/containers/0/args/0 - value: --metrics-bind-address=:8443 diff --git a/config/default/metrics_service.yaml b/config/default/metrics_service.yaml deleted file mode 100644 index 140d4943..00000000 --- a/config/default/metrics_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: 8443 - selector: - control-plane: controller-manager diff --git a/config/network-policy/allow-metrics-traffic.yaml b/config/network-policy/allow-metrics-traffic.yaml deleted file mode 100644 index aae53668..00000000 --- a/config/network-policy/allow-metrics-traffic.yaml +++ /dev/null @@ -1,26 +0,0 @@ -# This NetworkPolicy allows ingress traffic -# with Pods running on namespaces labeled with 'metrics: enabled'. Only Pods on those -# namespaces are able to gathering data from the metrics endpoint. -apiVersion: networking.k8s.io/v1 -kind: NetworkPolicy -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: allow-metrics-traffic - namespace: system -spec: - podSelector: - matchLabels: - control-plane: controller-manager - policyTypes: - - Ingress - ingress: - # This allows ingress traffic from any namespace with the label metrics: enabled - - from: - - namespaceSelector: - matchLabels: - metrics: enabled # Only from namespaces with this label - ports: - - port: 8443 - protocol: TCP diff --git a/config/network-policy/kustomization.yaml b/config/network-policy/kustomization.yaml deleted file mode 100644 index ec0fb5e5..00000000 --- a/config/network-policy/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- allow-metrics-traffic.yaml diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml deleted file mode 100644 index ed137168..00000000 --- a/config/prometheus/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- monitor.yaml diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml deleted file mode 100644 index aac24ef3..00000000 --- a/config/prometheus/monitor.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-monitor - namespace: system -spec: - endpoints: - - path: /metrics - port: https # Ensure this is the name of the port that exposes HTTPS metrics - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - # TODO(user): The option insecureSkipVerify: true is not recommended for production since it disables - # certificate verification. This poses a significant security risk by making the system vulnerable to - # man-in-the-middle attacks, where an attacker could intercept and manipulate the communication between - # Prometheus and the monitored services. 
This could lead to unauthorized access to sensitive metrics data, - # compromising the integrity and confidentiality of the information. - # Please use the following options for secure configurations: - # caFile: /etc/metrics-certs/ca.crt - # certFile: /etc/metrics-certs/tls.crt - # keyFile: /etc/metrics-certs/tls.key - insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager diff --git a/config/rbac/inferencemodel_editor_role.yaml b/config/rbac/inferencemodel_editor_role.yaml deleted file mode 100644 index b175a9a3..00000000 --- a/config/rbac/inferencemodel_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to edit inferencemodels. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencemodel-editor-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels/status - verbs: - - get diff --git a/config/rbac/inferencemodel_viewer_role.yaml b/config/rbac/inferencemodel_viewer_role.yaml deleted file mode 100644 index 3b3e67f6..00000000 --- a/config/rbac/inferencemodel_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view inferencemodels. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencemodel-viewer-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels - verbs: - - get - - list - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencemodels/status - verbs: - - get diff --git a/config/rbac/inferencepool_editor_role.yaml b/config/rbac/inferencepool_editor_role.yaml deleted file mode 100644 index cc1f7c35..00000000 --- a/config/rbac/inferencepool_editor_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to edit inferencepools. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencepool-editor-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools/status - verbs: - - get diff --git a/config/rbac/inferencepool_viewer_role.yaml b/config/rbac/inferencepool_viewer_role.yaml deleted file mode 100644 index 828e0022..00000000 --- a/config/rbac/inferencepool_viewer_role.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# permissions for end users to view inferencepools. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: inferencepool-viewer-role -rules: -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools - verbs: - - get - - list - - watch -- apiGroups: - - inference.networking.x-k8s.io - resources: - - inferencepools/status - verbs: - - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml deleted file mode 100644 index c3a52137..00000000 --- a/config/rbac/kustomization.yaml +++ /dev/null @@ -1,29 +0,0 @@ -resources: -# All RBAC will be applied under this service account in -# the deployment namespace. You may comment out this resource -# if your manager will use a service account that exists at -# runtime. Be sure to update RoleBinding and ClusterRoleBinding -# subjects if changing service account names. -- service_account.yaml -- role.yaml -- role_binding.yaml -- leader_election_role.yaml -- leader_election_role_binding.yaml -# The following RBAC configurations are used to protect -# the metrics endpoint with authn/authz. These configurations -# ensure that only authorized users and service accounts -# can access the metrics endpoint. Comment the following -# permissions if you want to disable this protection. -# More info: https://book.kubebuilder.io/reference/metrics.html -- metrics_auth_role.yaml -- metrics_auth_role_binding.yaml -- metrics_reader_role.yaml -# For each CRD, "Editor" and "Viewer" roles are scaffolded by -# default, aiding admins in cluster management. Those roles are -# not used by the Project itself. You can comment the following lines -# if you do not want those helpers be installed with your Project. -- inferencemodel_editor_role.yaml -- inferencemodel_viewer_role.yaml -- inferencepool_editor_role.yaml -- inferencepool_viewer_role.yaml - diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml deleted file mode 100644 index e2f8551b..00000000 --- a/config/rbac/leader_election_role.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# permissions to do leader election. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml deleted file mode 100644 index fb71a122..00000000 --- a/config/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_auth_role.yaml b/config/rbac/metrics_auth_role.yaml deleted file mode 100644 index 32d2e4ec..00000000 --- a/config/rbac/metrics_auth_role.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-auth-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/metrics_auth_role_binding.yaml b/config/rbac/metrics_auth_role_binding.yaml deleted file mode 100644 index e775d67f..00000000 --- a/config/rbac/metrics_auth_role_binding.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: metrics-auth-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: metrics-auth-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/metrics_reader_role.yaml b/config/rbac/metrics_reader_role.yaml deleted file mode 100644 index 51a75db4..00000000 --- a/config/rbac/metrics_reader_role.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml deleted file mode 100644 index 9d6247eb..00000000 --- a/config/rbac/role.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: manager-role -rules: -- apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml deleted file mode 100644 index c66b66bf..00000000 --- a/config/rbac/role_binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: manager-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: manager-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/service_account.yaml 
b/config/rbac/service_account.yaml deleted file mode 100644 index 9286120f..00000000 --- a/config/rbac/service_account.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: controller-manager - namespace: system diff --git a/config/samples/gateway_v1alpha1_inferencemodel.yaml b/config/samples/gateway_v1alpha1_inferencemodel.yaml deleted file mode 100644 index 34ea0680..00000000 --- a/config/samples/gateway_v1alpha1_inferencemodel.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferenceModel -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: sample-sql-assist -spec: - criticality: Critical - modelName: sql-code-assist - poolRef: - name: vllm-llama-31-8b-sample-pool - targetModels: - - name: npc-bot-v1 - weight: 50 - - name: npc-bot-v2 - weight: 50 diff --git a/config/samples/gateway_v1alpha1_inferencepool.yaml b/config/samples/gateway_v1alpha1_inferencepool.yaml deleted file mode 100644 index 4993d786..00000000 --- a/config/samples/gateway_v1alpha1_inferencepool.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: inference.networking.x-k8s.io/v1alpha1 -kind: InferencePool -metadata: - labels: - app.kubernetes.io/name: api - app.kubernetes.io/managed-by: kustomize - name: vllm-llama-31-8b-sample-pool -spec: - selector: - app: npc-bot - targetPortNumber: 8000 diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml deleted file mode 100644 index e4b9f2e8..00000000 --- a/config/samples/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -## Append samples of your project ## -resources: -- gateway_v1alpha1_inferencepool.yaml -- gateway_v1alpha1_inferencemodel.yaml -# +kubebuilder:scaffold:manifestskustomizesamples From c0b3dbdb4b892c4bafdc08fcea26ae4ab14aaf99 Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 8 Apr 2025 13:12:43 -0400 Subject: [PATCH 29/74] Rename pkg/body-based-routing to pkg/bbr (#664) --- cmd/{body-based-routing => bbr}/health.go | 0 cmd/{body-based-routing => bbr}/main.go | 2 +- pkg/{body-based-routing => bbr}/README.md | 0 pkg/{body-based-routing => bbr}/handlers/request.go | 2 +- pkg/{body-based-routing => bbr}/handlers/request_test.go | 2 +- pkg/{body-based-routing => bbr}/handlers/response.go | 0 pkg/{body-based-routing => bbr}/handlers/server.go | 0 pkg/{body-based-routing => bbr}/handlers/server_test.go | 0 pkg/{body-based-routing => bbr}/metrics/metrics.go | 0 pkg/{body-based-routing => bbr}/server/runserver.go | 2 +- test/integration/bbr/hermetic_test.go | 2 +- 11 files changed, 5 insertions(+), 5 deletions(-) rename cmd/{body-based-routing => bbr}/health.go (100%) rename cmd/{body-based-routing => bbr}/main.go (98%) rename pkg/{body-based-routing => bbr}/README.md (100%) rename pkg/{body-based-routing => bbr}/handlers/request.go (98%) rename pkg/{body-based-routing => bbr}/handlers/request_test.go (98%) rename pkg/{body-based-routing => bbr}/handlers/response.go (100%) rename pkg/{body-based-routing => bbr}/handlers/server.go (100%) rename pkg/{body-based-routing => bbr}/handlers/server_test.go (100%) rename pkg/{body-based-routing => bbr}/metrics/metrics.go (100%) rename pkg/{body-based-routing => bbr}/server/runserver.go (96%) diff --git a/cmd/body-based-routing/health.go b/cmd/bbr/health.go similarity index 100% rename from cmd/body-based-routing/health.go rename to cmd/bbr/health.go diff --git 
a/cmd/body-based-routing/main.go b/cmd/bbr/main.go similarity index 98% rename from cmd/body-based-routing/main.go rename to cmd/bbr/main.go index cfc584ce..84b1fffa 100644 --- a/cmd/body-based-routing/main.go +++ b/cmd/bbr/main.go @@ -36,7 +36,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) diff --git a/pkg/body-based-routing/README.md b/pkg/bbr/README.md similarity index 100% rename from pkg/body-based-routing/README.md rename to pkg/bbr/README.md diff --git a/pkg/body-based-routing/handlers/request.go b/pkg/bbr/handlers/request.go similarity index 98% rename from pkg/body-based-routing/handlers/request.go rename to pkg/bbr/handlers/request.go index c0be46ac..32fffc02 100644 --- a/pkg/body-based-routing/handlers/request.go +++ b/pkg/bbr/handlers/request.go @@ -25,7 +25,7 @@ import ( eppb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) diff --git a/pkg/body-based-routing/handlers/request_test.go b/pkg/bbr/handlers/request_test.go similarity index 98% rename from pkg/body-based-routing/handlers/request_test.go rename to pkg/bbr/handlers/request_test.go index 0f088702..55c42a21 100644 --- a/pkg/body-based-routing/handlers/request_test.go +++ b/pkg/bbr/handlers/request_test.go @@ -28,7 +28,7 @@ import ( "google.golang.org/protobuf/testing/protocmp" "k8s.io/component-base/metrics/legacyregistry" metricsutils "k8s.io/component-base/metrics/testutil" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) diff --git a/pkg/body-based-routing/handlers/response.go b/pkg/bbr/handlers/response.go similarity index 100% rename from pkg/body-based-routing/handlers/response.go rename to pkg/bbr/handlers/response.go diff --git a/pkg/body-based-routing/handlers/server.go b/pkg/bbr/handlers/server.go similarity index 100% rename from pkg/body-based-routing/handlers/server.go rename to pkg/bbr/handlers/server.go diff --git a/pkg/body-based-routing/handlers/server_test.go b/pkg/bbr/handlers/server_test.go similarity index 100% rename from pkg/body-based-routing/handlers/server_test.go rename to pkg/bbr/handlers/server_test.go diff --git a/pkg/body-based-routing/metrics/metrics.go b/pkg/bbr/metrics/metrics.go similarity index 100% rename from pkg/body-based-routing/metrics/metrics.go rename to pkg/bbr/metrics/metrics.go diff --git a/pkg/body-based-routing/server/runserver.go b/pkg/bbr/server/runserver.go similarity index 96% rename from pkg/body-based-routing/server/runserver.go rename to pkg/bbr/server/runserver.go index 1646aa5a..2001b7ff 100644 --- a/pkg/body-based-routing/server/runserver.go +++ b/pkg/bbr/server/runserver.go @@ -27,7 +27,7 @@ import ( 
"sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/internal/runnable" tlsutil "sigs.k8s.io/gateway-api-inference-extension/internal/tls" - "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/handlers" + "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/handlers" ) // ExtProcServerRunner provides methods to manage an external process server. diff --git a/test/integration/bbr/hermetic_test.go b/test/integration/bbr/hermetic_test.go index 02d412ab..b99186db 100644 --- a/test/integration/bbr/hermetic_test.go +++ b/test/integration/bbr/hermetic_test.go @@ -29,7 +29,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/testing/protocmp" - runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/body-based-routing/server" + runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/bbr/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" integrationutils "sigs.k8s.io/gateway-api-inference-extension/test/integration" ) From 59c5781070496646cadabdbbefef66210577b094 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Tue, 8 Apr 2025 13:48:42 -0400 Subject: [PATCH 30/74] deploy: Enable logging for GKE gateway by default (#666) Logging dramatically reduces initial friction debugging and relative to the cost to serve is fairly minor (about 2-5% overhead). Enable by default as consistent with our guides. --- config/charts/inferencepool/templates/gke.yaml | 2 ++ config/manifests/gateway/gke/gcp-backend-policy.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/config/charts/inferencepool/templates/gke.yaml b/config/charts/inferencepool/templates/gke.yaml index 220b3bea..70e05b56 100644 --- a/config/charts/inferencepool/templates/gke.yaml +++ b/config/charts/inferencepool/templates/gke.yaml @@ -33,6 +33,8 @@ spec: name: {{ .Release.Name }} default: timeoutSec: 300 # 5-minute timeout (adjust as needed) + logging: + enabled: true # log all requests by default --- apiVersion: monitoring.googleapis.com/v1 kind: ClusterPodMonitoring diff --git a/config/manifests/gateway/gke/gcp-backend-policy.yaml b/config/manifests/gateway/gke/gcp-backend-policy.yaml index 519a5a93..7b294304 100644 --- a/config/manifests/gateway/gke/gcp-backend-policy.yaml +++ b/config/manifests/gateway/gke/gcp-backend-policy.yaml @@ -9,3 +9,5 @@ spec: name: vllm-llama3-8b-instruct default: timeoutSec: 300 + logging: + enabled: true From 3690dbe97b9572c7751ff88b524290dab9f8055e Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 8 Apr 2025 21:06:45 +0300 Subject: [PATCH 31/74] moved IsPodReady func to podutils (#662) * moved IsPodReady func to pod utils to be shared between pod reconciler and datastore Signed-off-by: Nir Rozenbaum * code review changes Signed-off-by: Nir Rozenbaum * plural to singular Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- pkg/epp/controller/pod_reconciler.go | 15 ++----------- pkg/epp/datastore/datastore.go | 16 ++------------ pkg/epp/util/pod/pod.go | 33 ++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 27 deletions(-) create mode 100644 pkg/epp/util/pod/pod.go diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 046561e4..494adeb7 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" ) type PodReconciler struct { @@ -71,7 +72,7 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, pool *v1alpha2.InferencePool) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} - if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podIsReady(pod) { + if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podutil.IsPodReady(pod) { logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) c.Datastore.PodDelete(namespacedName) } else { @@ -82,15 +83,3 @@ func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, poo } } } - -func podIsReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - if condition.Status == corev1.ConditionTrue { - return true - } - break - } - } - return false -} diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 8ada3e64..dc81cb48 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" + podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" ) const ( @@ -259,7 +260,7 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, activePods := make(map[string]bool) for _, pod := range podList.Items { - if podIsReady(&pod) { + if podutil.IsPodReady(&pod) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} activePods[pod.Name] = true if ds.PodUpdateOrAddIfNotExist(&pod, pool) { @@ -308,16 +309,3 @@ func IsCritical(model *v1alpha2.InferenceModel) bool { } return false } - -// TODO: move out to share with pod_reconciler.go -func podIsReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady { - if condition.Status == corev1.ConditionTrue { - return true - } - break - } - } - return false -} diff --git a/pkg/epp/util/pod/pod.go b/pkg/epp/util/pod/pod.go new file mode 100644 index 00000000..9f564024 --- /dev/null +++ b/pkg/epp/util/pod/pod.go @@ -0,0 +1,33 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package pod + +import ( + corev1 "k8s.io/api/core/v1" +) + +func IsPodReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady { + if condition.Status == corev1.ConditionTrue { + return true + } + break + } + } + return false +} From e71fd9281b3c1958e8bccde4536851fbce0f04ab Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 8 Apr 2025 22:46:50 +0300 Subject: [PATCH 32/74] removed double loop on docs in hermetic test (#668) use unstructured instead of checking InferenceModel/InferencePool and unmarshalling to specific object Signed-off-by: Nir Rozenbaum --- test/integration/epp/hermetic_test.go | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 93432637..ae2c6170 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -44,6 +44,7 @@ import ( "google.golang.org/protobuf/testing/protocmp" "google.golang.org/protobuf/types/known/structpb" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" @@ -1691,27 +1692,13 @@ func BeforeSuite() func() { } for _, doc := range docs { - inferenceModel := &v1alpha2.InferenceModel{} - if err = yaml.Unmarshal(doc, inferenceModel); err != nil { + obj := &unstructured.Unstructured{} + if err = yaml.Unmarshal(doc, obj); err != nil { logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) } - if inferenceModel.Kind == "InferenceModel" { - logger.Info("Creating inference model", "model", inferenceModel) - if err := k8sClient.Create(context.Background(), inferenceModel); err != nil { - logutil.Fatal(logger, err, "Unable to create inferenceModel", "modelName", inferenceModel.Name) - } - } - } - for _, doc := range docs { - inferencePool := &v1alpha2.InferencePool{} - if err = yaml.Unmarshal(doc, inferencePool); err != nil { - logutil.Fatal(logger, err, "Can't unmarshal object", "document", doc) - } - if inferencePool.Kind == "InferencePool" { - logger.Info("Creating inference pool", "pool", inferencePool) - if err := k8sClient.Create(context.Background(), inferencePool); err != nil { - logutil.Fatal(logger, err, "Unable to create inferencePool", "poolName", inferencePool.Name) - } + logger.Info("Creating object", "kind", obj.GetKind(), "object", obj) + if err := k8sClient.Create(context.Background(), obj); err != nil { + logutil.Fatal(logger, err, "Unable to create object", "object", obj.GetName()) } } From 4ed93bfe1971271936de26b547f126cf9c2e329e Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 8 Apr 2025 23:36:57 +0300 Subject: [PATCH 33/74] fix bbr dockerfile that was broken in PR #664 (#669) * fixed dockerfile of bbr that was broken in PR #664 Signed-off-by: Nir Rozenbaum * code review Signed-off-by: Nir Rozenbaum * makefile Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- Makefile | 2 +- body-based-routing.Dockerfile => bbr.Dockerfile | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) rename body-based-routing.Dockerfile => bbr.Dockerfile (76%) diff --git a/Makefile b/Makefile index 66fe89d4..a1845560 100644 --- a/Makefile +++ b/Makefile @@ -232,7 +232,7 @@ bbr-image-local-load: bbr-image-local-build .PHONY: bbr-image-build bbr-image-build: ## Build the image using Docker Buildx. 
- $(IMAGE_BUILD_CMD) -f body-based-routing.Dockerfile -t $(BBR_IMAGE_TAG) \ + $(IMAGE_BUILD_CMD) -f bbr.Dockerfile -t $(BBR_IMAGE_TAG) \ --platform=$(PLATFORMS) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg BUILDER_IMAGE=$(BUILDER_IMAGE) \ diff --git a/body-based-routing.Dockerfile b/bbr.Dockerfile similarity index 76% rename from body-based-routing.Dockerfile rename to bbr.Dockerfile index e0afcf20..03024e49 100644 --- a/body-based-routing.Dockerfile +++ b/bbr.Dockerfile @@ -18,13 +18,13 @@ RUN go mod download COPY cmd ./cmd COPY pkg ./pkg COPY internal ./internal -WORKDIR /src/cmd/body-based-routing -RUN go build -o /body-based-routing +WORKDIR /src/cmd/bbr +RUN go build -o /bbr ## Multistage deploy FROM ${BASE_IMAGE} WORKDIR / -COPY --from=builder /body-based-routing /body-based-routing +COPY --from=builder /bbr /bbr -ENTRYPOINT ["/body-based-routing"] +ENTRYPOINT ["/bbr"] From ae3df874157b91c1858ff7c378896416b3412b1a Mon Sep 17 00:00:00 2001 From: Rohit Ramkumar Date: Tue, 8 Apr 2025 18:20:50 -0400 Subject: [PATCH 34/74] E2E test improvements (#661) --- config/manifests/inferencepool-resources.yaml | 3 + config/manifests/vllm/cpu-deployment.yaml | 5 +- test/e2e/epp/README.md | 7 + test/e2e/epp/e2e_suite_test.go | 46 ++++++- test/testdata/envoy.yaml | 6 +- test/testdata/inferencepool-e2e.yaml | 126 ++++++++++++++++++ 6 files changed, 183 insertions(+), 10 deletions(-) create mode 100644 test/testdata/inferencepool-e2e.yaml diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml index cef70d7f..4affa274 100644 --- a/config/manifests/inferencepool-resources.yaml +++ b/config/manifests/inferencepool-resources.yaml @@ -1,3 +1,6 @@ +# Note: If you change this file, please also change the file used for e2e tests! +# +# https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/test/testdata/inferencepool-e2e.yaml apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: diff --git a/config/manifests/vllm/cpu-deployment.yaml b/config/manifests/vllm/cpu-deployment.yaml index 6fb40950..827f2156 100644 --- a/config/manifests/vllm/cpu-deployment.yaml +++ b/config/manifests/vllm/cpu-deployment.yaml @@ -113,5 +113,8 @@ data: ensureExist: models: - base-model: Qwen/Qwen2.5-1.5B - id: food-review-1 + id: food-review + source: SriSanth2345/Qwen-1.5B-Tweet-Generations + - base-model: Qwen/Qwen2.5-1.5B + id: cad-fabricator source: SriSanth2345/Qwen-1.5B-Tweet-Generations \ No newline at end of file diff --git a/test/e2e/epp/README.md b/test/e2e/epp/README.md index 247e8b12..fcc974b8 100644 --- a/test/e2e/epp/README.md +++ b/test/e2e/epp/README.md @@ -28,6 +28,13 @@ Follow these steps to run the end-to-end tests: export HF_TOKEN= ``` +1. **(Optional): Set the test namespace**: By default, the e2e test creates resources in the `inf-ext-e2e` namespace. + If you would like to change this namespace, set the following environment variable: + + ```sh + export E2E_NS= + ``` + 1. 
**Run the Tests**: Run the `test-e2e` target: ```sh diff --git a/test/e2e/epp/e2e_suite_test.go b/test/e2e/epp/e2e_suite_test.go index 61ee2540..01ed639d 100644 --- a/test/e2e/epp/e2e_suite_test.go +++ b/test/e2e/epp/e2e_suite_test.go @@ -30,6 +30,7 @@ import ( corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" @@ -55,9 +56,8 @@ const ( defaultInterval = time.Millisecond * 250 // defaultCurlInterval is the default interval to run the test curl command. defaultCurlInterval = time.Second * 5 - // nsName is the name of the Namespace used for tests. - // TODO [danehans]: Must be "default" until https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/227 is fixed - nsName = "default" + // defaultNsName is the default name of the Namespace used for tests. Can override using the E2E_NS environment variable. + defaultNsName = "inf-ext-e2e" // modelServerName is the name of the model server test resources. modelServerName = "vllm-llama3-8b-instruct" // modelName is the test model name. @@ -77,7 +77,7 @@ const ( // inferModelManifest is the manifest for the inference model CRD. inferModelManifest = "../../../config/crd/bases/inference.networking.x-k8s.io_inferencemodels.yaml" // inferExtManifest is the manifest for the inference extension test resources. - inferExtManifest = "../../../config/manifests/inferencepool-resources.yaml" + inferExtManifest = "../../testdata/inferencepool-e2e.yaml" // envoyManifest is the manifest for the envoy proxy test resources. envoyManifest = "../../testdata/envoy.yaml" // modelServerManifestFilepathEnvVar is the env var that holds absolute path to the manifest for the model server test resource. @@ -91,6 +91,7 @@ var ( kubeCli *kubernetes.Clientset scheme = runtime.NewScheme() cfg = config.GetConfigOrDie() + nsName string ) func TestAPIs(t *testing.T) { @@ -101,6 +102,11 @@ func TestAPIs(t *testing.T) { } var _ = ginkgo.BeforeSuite(func() { + nsName = os.Getenv("E2E_NS") + if nsName == "" { + nsName = defaultNsName + } + ginkgo.By("Setting up the test suite") setupSuite() @@ -109,6 +115,8 @@ var _ = ginkgo.BeforeSuite(func() { }) func setupInfra() { + createNamespace(cli, nsName) + modelServerManifestPath := readModelServerManifestPath() modelServerManifestArray := getYamlsFromModelServerManifest(modelServerManifestPath) if strings.Contains(modelServerManifestArray[0], "hf-token") { @@ -118,6 +126,7 @@ func setupInfra() { "inferencepools.inference.networking.x-k8s.io": inferPoolManifest, "inferencemodels.inference.networking.x-k8s.io": inferModelManifest, } + createCRDs(cli, crds) createInferExt(cli, inferExtManifest) createClient(cli, clientManifest) @@ -182,6 +191,17 @@ var ( curlInterval = defaultCurlInterval ) +func createNamespace(k8sClient client.Client, ns string) { + ginkgo.By("Creating e2e namespace: " + ns) + obj := &corev1.Namespace{ + ObjectMeta: v1.ObjectMeta{ + Name: ns, + }, + } + err := k8sClient.Create(ctx, obj) + gomega.Expect(err).NotTo(gomega.HaveOccurred(), "Failed to create e2e test namespace") +} + // namespaceExists ensures that a specified namespace exists and is ready for use. 
func namespaceExists(k8sClient client.Client, ns string) { ginkgo.By("Ensuring namespace exists: " + ns) @@ -276,8 +296,15 @@ func createHfSecret(k8sClient client.Client, secretPath string) { // createEnvoy creates the envoy proxy resources used for testing from the given filePath. func createEnvoy(k8sClient client.Client, filePath string) { + inManifests := readYaml(filePath) + ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable") + outManifests := []string{} + for _, m := range inManifests { + outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName)) + } + ginkgo.By("Creating envoy proxy resources from manifest: " + filePath) - applyYAMLFile(k8sClient, filePath) + createObjsFromYaml(k8sClient, outManifests) // Wait for the configmap to exist before proceeding with test. cfgMap := &corev1.ConfigMap{} @@ -302,8 +329,15 @@ func createEnvoy(k8sClient client.Client, filePath string) { // createInferExt creates the inference extension resources used for testing from the given filePath. func createInferExt(k8sClient client.Client, filePath string) { + inManifests := readYaml(filePath) + ginkgo.By("Replacing placeholder namespace with E2E_NS environment variable") + outManifests := []string{} + for _, m := range inManifests { + outManifests = append(outManifests, strings.ReplaceAll(m, "$E2E_NS", nsName)) + } + ginkgo.By("Creating inference extension resources from manifest: " + filePath) - applyYAMLFile(k8sClient, filePath) + createObjsFromYaml(k8sClient, outManifests) // Wait for the clusterrole to exist. testutils.EventuallyExists(ctx, func() error { diff --git a/test/testdata/envoy.yaml b/test/testdata/envoy.yaml index 62e6b4c5..3fff8598 100644 --- a/test/testdata/envoy.yaml +++ b/test/testdata/envoy.yaml @@ -100,7 +100,7 @@ data: grpc_service: envoy_grpc: cluster_name: ext_proc - authority: vllm-llama3-8b-instruct-epp.default:9002 + authority: vllm-llama3-8b-instruct-epp.$E2E_NS:9002 timeout: 10s processing_mode: request_header_mode: SEND @@ -195,7 +195,7 @@ data: - endpoint: address: socket_address: - address: vllm-llama3-8b-instruct-epp.default + address: vllm-llama3-8b-instruct-epp.$E2E_NS port_value: 9002 health_status: HEALTHY load_balancing_weight: 1 @@ -225,7 +225,7 @@ spec: image: docker.io/envoyproxy/envoy:distroless-v1.33.2 args: - "--service-cluster" - - "default/inference-gateway" + - "$E2E_NS/inference-gateway" - "--service-node" - "$(ENVOY_POD_NAME)" - "--log-level" diff --git a/test/testdata/inferencepool-e2e.yaml b/test/testdata/inferencepool-e2e.yaml new file mode 100644 index 00000000..79339c5b --- /dev/null +++ b/test/testdata/inferencepool-e2e.yaml @@ -0,0 +1,126 @@ +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + labels: + name: vllm-llama3-8b-instruct +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama3-8b-instruct + extensionRef: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS +spec: + selector: + app: vllm-llama3-8b-instruct-epp + ports: + - protocol: TCP + port: 9002 + targetPort: 9002 + appProtocol: http2 + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-llama3-8b-instruct-epp + namespace: $E2E_NS + labels: + app: vllm-llama3-8b-instruct-epp +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-llama3-8b-instruct-epp + template: + metadata: + labels: + app: vllm-llama3-8b-instruct-epp + spec: + # Conservatively, this timeout 
should mirror the longest grace period of the pods within the pool + terminationGracePeriodSeconds: 130 + containers: + - name: epp + image: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/epp:main + imagePullPolicy: Always + args: + - -poolName + - "vllm-llama3-8b-instruct" + - -poolNamespace + - "$E2E_NS" + - -v + - "4" + - --zap-encoder + - "json" + - -grpcPort + - "9002" + - -grpcHealthPort + - "9003" + env: + - name: USE_STREAMING + value: "true" + ports: + - containerPort: 9002 + - containerPort: 9003 + - name: metrics + containerPort: 9090 + livenessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + grpc: + port: 9003 + service: inference-extension + initialDelaySeconds: 5 + periodSeconds: 10 +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read +rules: +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencemodels"] + verbs: ["get", "watch", "list"] +- apiGroups: [""] + resources: ["pods"] + verbs: ["get", "watch", "list"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools"] + verbs: ["get", "watch", "list"] +- apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "watch", "list"] +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: pod-read-binding +subjects: +- kind: ServiceAccount + name: default + namespace: $E2E_NS +roleRef: + kind: ClusterRole + name: pod-read From 42eb5ff1c5af1275df43ac384df0ddf20da95134 Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> Date: Tue, 8 Apr 2025 22:20:56 +0000 Subject: [PATCH 35/74] cleaning up inferencePool helm docs (#665) --- config/charts/inferencepool/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index 681fc783..e5468cd4 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -17,9 +17,12 @@ To install via the latest published chart in staging (--version v0 indicates la ```txt $ helm install vllm-llama3-8b-instruct \ --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=[none|gke] \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0 ``` +Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed. + ## Uninstall Run the following command to uninstall the chart: @@ -34,7 +37,6 @@ The following table list the configurable parameters of the chart. | **Parameter Name** | **Description** | |---------------------------------------------|------------------------------------------------------------------------------------------------------------------------| -| `inferencePool.name` | Name for the InferencePool, and endpoint picker deployment and service will be named as `{.Release.name}-epp`. | | `inferencePool.targetPortNumber` | Target port number for the vllm backends, will be used to scrape metrics by the inference extension. Defaults to 8000. 
| | `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. | | `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. | @@ -43,6 +45,7 @@ The following table list the configurable parameters of the chart. | `inferenceExtension.image.tag` | Image tag of the endpoint picker. | | `inferenceExtension.image.pullPolicy` | Image pull policy for the container. Possible values: `Always`, `IfNotPresent`, or `Never`. Defaults to `Always`. | | `inferenceExtension.extProcPort` | Port where the endpoint picker service is served for external processing. Defaults to `9002`. | +| `provider.name` | Name of the Inference Gateway implementation being used. Possible values: `gke`. Defaults to `none`. | ## Notes From c0b3dbdb4b892c4bafdc08fcea26ae4ab14aaf99 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 9 Apr 2025 19:46:40 +0300 Subject: [PATCH 36/74] move inf model IsCritical func out of datastore (#670) * move inf model IsCritical func out of datastore Signed-off-by: Nir Rozenbaum * remove IsCritical helper function Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- pkg/epp/datastore/datastore.go | 9 +-------- pkg/epp/handlers/request.go | 4 ++-- pkg/epp/handlers/streamingserver.go | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index dc81cb48..5435e3af 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -69,7 +69,7 @@ type Datastore interface { Clear() } -func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) *datastore { +func NewDatastore(parentCtx context.Context, pmf *backendmetrics.PodMetricsFactory) Datastore { store := &datastore{ parentCtx: parentCtx, poolAndModelsMu: sync.RWMutex{}, @@ -302,10 +302,3 @@ func stripLabelKeyAliasFromLabelMap(labels map[v1alpha2.LabelKey]v1alpha2.LabelV } return outMap } - -func IsCritical(model *v1alpha2.InferenceModel) bool { - if model.Spec.Criticality != nil && *model.Spec.Criticality == v1alpha2.Critical { - return true - } - return false -} diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index b786a15d..e8dcf262 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -26,7 +26,7 @@ import ( extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" + "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -77,7 +77,7 @@ func (s *Server) HandleRequestBody( llmReq := &schedulingtypes.LLMRequest{ Model: model, ResolvedTargetModel: modelName, - Critical: datastore.IsCritical(modelObj), + Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, } loggerVerbose.Info("LLM request assembled", "request", llmReq) diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go index 88963f47..ca3451cb 100644 --- a/pkg/epp/handlers/streamingserver.go +++ b/pkg/epp/handlers/streamingserver.go @@ -348,7 +348,7 @@ func (s *StreamingServer) HandleRequestBody( llmReq :=
&schedulingtypes.LLMRequest{ Model: model, ResolvedTargetModel: modelName, - Critical: datastore.IsCritical(modelObj), + Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, } logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical) From 1ba13f390d17709ed825d9c952a8117e4f0df24e Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 9 Apr 2025 16:42:41 -0700 Subject: [PATCH 37/74] Consolidating down to FULL_DUPLEX_STREAMED supported ext-proc server (#672) --- cmd/epp/main.go | 6 - .../templates/epp-deployment.yaml | 3 - config/manifests/inferencepool-resources.yaml | 3 - pkg/epp/handlers/request.go | 162 ++--- pkg/epp/handlers/response.go | 216 ++----- pkg/epp/handlers/response_test.go | 79 ++- pkg/epp/handlers/server.go | 435 ++++++++++--- pkg/epp/handlers/streamingserver.go | 594 ------------------ pkg/epp/server/runserver.go | 9 +- test/integration/epp/hermetic_test.go | 319 ---------- 10 files changed, 490 insertions(+), 1336 deletions(-) delete mode 100644 pkg/epp/handlers/streamingserver.go diff --git a/cmd/epp/main.go b/cmd/epp/main.go index 39baf18b..b9c7d6e4 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -120,11 +120,6 @@ func run() error { flag.Parse() initLogging(&opts) - useStreamingServer, err := strconv.ParseBool(os.Getenv("USE_STREAMING")) - if err != nil { - setupLog.Error(err, "Failed to parse env var USE_STREAMING, defaulting to false") - } - // Validate flags if err := validateFlags(); err != nil { setupLog.Error(err, "Failed to validate flags") @@ -178,7 +173,6 @@ func run() error { Datastore: datastore, SecureServing: *secureServing, CertPath: *certPath, - UseStreaming: useStreamingServer, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index d925a38e..0b9fa0bd 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -35,9 +35,6 @@ spec: - "9003" - -metricsPort - "9090" - env: - - name: USE_STREAMING - value: "true" ports: - name: grpc containerPort: 9002 diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml index 4affa274..993b7bf6 100644 --- a/config/manifests/inferencepool-resources.yaml +++ b/config/manifests/inferencepool-resources.yaml @@ -62,9 +62,6 @@ spec: - "9002" - -grpcHealthPort - "9003" - env: - - name: USE_STREAMING - value: "true" ports: - containerPort: 9002 - containerPort: 9003 diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index e8dcf262..44537923 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -21,10 +21,9 @@ import ( "encoding/json" "fmt" "strconv" + "time" - configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" @@ -32,33 +31,22 @@ import ( logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -// HandleRequestBody handles body of the request to the backend 
server, such as parsing the "model" -// parameter. -// Envoy sends the request body to ext proc before sending the request to the backend server. -func (s *Server) HandleRequestBody( +// HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling. +func (s *StreamingServer) HandleRequestBody( ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest, -) (*extProcPb.ProcessingResponse, error) { + requestBodyMap map[string]interface{}, +) (*RequestContext, error) { + var requestBodyBytes []byte logger := log.FromContext(ctx) - loggerVerbose := logger.V(logutil.VERBOSE) - loggerVerbose.Info("Handling request body") - - // Unmarshal request body (must be JSON). - v := req.Request.(*extProcPb.ProcessingRequest_RequestBody) - var rb map[string]interface{} - if err := json.Unmarshal(v.RequestBody.Body, &rb); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") - return nil, errutil.Error{Code: errutil.BadRequest, Msg: fmt.Sprintf("error unmarshaling request body: %v", err)} - } - loggerVerbose.Info("Request body unmarshalled", "body", rb) // Resolve target models. - model, ok := rb["model"].(string) + model, ok := requestBodyMap["model"].(string) if !ok { - return nil, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} + return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} } - loggerVerbose.Info("Model requested", "model", model) + modelName := model // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. @@ -66,12 +54,12 @@ func (s *Server) HandleRequestBody( // are able to be requested by using their distinct name. modelObj := s.datastore.ModelGet(model) if modelObj == nil { - return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} + return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} } if len(modelObj.Spec.TargetModels) > 0 { modelName = RandomWeightedDraw(logger, modelObj, 0) if modelName == "" { - return nil, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} + return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} } } llmReq := &schedulingtypes.LLMRequest{ @@ -79,132 +67,84 @@ func (s *Server) HandleRequestBody( ResolvedTargetModel: modelName, Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, } - loggerVerbose.Info("LLM request assembled", "request", llmReq) + logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical) - requestBody := v.RequestBody.Body var err error // Update target models in the body. 
if llmReq.Model != llmReq.ResolvedTargetModel { - rb["model"] = llmReq.ResolvedTargetModel - requestBody, err = json.Marshal(rb) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return nil, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} - } - loggerVerbose.Info("Updated request body marshalled", "body", string(requestBody)) + requestBodyMap["model"] = llmReq.ResolvedTargetModel + } + + requestBodyBytes, err = json.Marshal(requestBodyMap) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") + return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} } target, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { - return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} + return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} } targetPod := target.GetPod() - logger.V(logutil.DEFAULT).Info("Request handled", - "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. // Attach the port number pool, err := s.datastore.PoolGet() if err != nil { - return nil, err + return reqCtx, err } endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) + logger.V(logutil.DEFAULT).Info("Request handled", + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod, "endpoint metrics", + fmt.Sprintf("%+v", target)) + reqCtx.Model = llmReq.Model reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel - reqCtx.RequestSize = len(v.RequestBody.Body) + reqCtx.RequestSize = len(requestBodyBytes) reqCtx.TargetPod = targetPod.NamespacedName.String() reqCtx.TargetEndpoint = endpoint - headers := []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: s.destinationEndpointHintKey, - RawValue: []byte(endpoint), - }, - }, - // We need to update the content length header if the body is mutated, see Envoy doc: - // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(len(requestBody))), - }, - }, - } - // Print headers for debugging - for _, header := range headers { - logger.V(logutil.DEBUG).Info("Request body header", "key", header.Header.Key, "value", header.Header.RawValue) - } - - targetEndpointValue := &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintKey: { - Kind: &structpb.Value_StringValue{ - StringValue: endpoint, - }, - }, - }, - } - dynamicMetadata := targetEndpointValue - if s.destinationEndpointHintMetadataNamespace != "" { - // If a namespace is defined, wrap the selected endpoint with that. 
- dynamicMetadata = &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintMetadataNamespace: { - Kind: &structpb.Value_StructValue{ - StructValue: targetEndpointValue, - }, - }, - }, - } - } + s.populateRequestHeaderResponse(reqCtx, endpoint, len(requestBodyBytes)) - resp := &extProcPb.ProcessingResponse{ + reqCtx.reqBodyResp = &extProcPb.ProcessingResponse{ // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header // and as an unstructure ext-proc response metadata key/value pair. This enables different integration // options for gateway providers. Response: &extProcPb.ProcessingResponse_RequestBody{ RequestBody: &extProcPb.BodyResponse{ Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: headers, - }, BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: requestBody, + Mutation: &extProcPb.BodyMutation_StreamedResponse{ + StreamedResponse: &extProcPb.StreamedBodyResponse{ + Body: requestBodyBytes, + EndOfStream: true, + }, }, }, }, }, }, - DynamicMetadata: dynamicMetadata, } - return resp, nil + return reqCtx, nil } -func HandleRequestHeaders( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) *extProcPb.ProcessingResponse { - r := req.Request - h := r.(*extProcPb.ProcessingRequest_RequestHeaders) - log.FromContext(ctx).V(logutil.VERBOSE).Info("Handling request headers", "headers", h) - - resp := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - // Set `clear_route_cache = true` to force Envoy to recompute the target cluster - // based on the new "target-pod" header. - // See https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto#service-ext-proc-v3-commonresponse. - ClearRouteCache: true, - }, - }, - }, +func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error { + reqCtx.RequestReceivedTimestamp = time.Now() + + // an EoS in the request headers means this request has no body or trailers. + if req.RequestHeaders.EndOfStream { + // We will route this request to a random pod as this is assumed to just be a GET + // More context: https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/526 + // The above PR will address endpoint admission, but currently any request without a body will be + // routed to a random upstream pod. 
+		pod := GetRandomPod(s.datastore)
+		pool, err := s.datastore.PoolGet()
+		if err != nil {
+			return err
+		}
+		endpoint := pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
+		s.populateRequestHeaderResponse(reqCtx, endpoint, 0)
 	}
-
-	return resp
+	return nil
 }
diff --git a/pkg/epp/handlers/response.go b/pkg/epp/handlers/response.go
index 991b7d16..04c7a5e9 100644
--- a/pkg/epp/handlers/response.go
+++ b/pkg/epp/handlers/response.go
@@ -19,14 +19,11 @@ package handlers
 import (
 	"context"
 	"encoding/json"
-	"fmt"
 	"strings"
 
-	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
-	"github.com/go-logr/logr"
 	"sigs.k8s.io/controller-runtime/pkg/log"
-	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
@@ -35,78 +32,48 @@ const (
 	streamingEndMsg = "data: [DONE]"
 )
 
-// HandleResponseHeaders processes response headers from the backend model server.
-func (s *Server) HandleResponseHeaders(
+// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling.
+func (s *StreamingServer) HandleResponseBody(
 	ctx context.Context,
 	reqCtx *RequestContext,
-	req *extProcPb.ProcessingRequest,
-) (*extProcPb.ProcessingResponse, error) {
-	loggerVerbose := log.FromContext(ctx).V(logutil.VERBOSE)
-	loggerVerbose.Info("Processing ResponseHeaders")
-	h := req.Request.(*extProcPb.ProcessingRequest_ResponseHeaders)
-	loggerVerbose.Info("Headers before", "headers", h)
-
-	// Example header
-	// {
-	//	"ResponseHeaders": {
-	//	  "headers": [
-	//	    {
-	//	      "key": ":status",
-	//	      "raw_value": "200"
-	//	    },
-	//	    {
-	//	      "key": "date",
-	//	      "raw_value": "Thu, 30 Jan 2025 18:50:48 GMT"
-	//	    },
-	//	    {
-	//	      "key": "server",
-	//	      "raw_value": "uvicorn"
-	//	    },
-	//	    {
-	//	      "key": "content-type",
-	//	      "raw_value": "text/event-stream; charset=utf-8"
-	//	    },
-	//	    {
-	//	      "key": "transfer-encoding",
-	//	      "raw_value": "chunked"
-	//	    }
-	//	  ]
-	//	}
-	// }
-	for _, header := range h.ResponseHeaders.Headers.GetHeaders() {
-		var statusFound, typeFound bool
-		if header.Key == "status" {
-			code := header.RawValue[0]
-			if string(code) != "200" {
-				reqCtx.ResponseStatusCode = errutil.ModelServerError
-				statusFound = true
-			}
-		}
-		if header.Key == "content-type" {
-			contentType := header.RawValue
-			if strings.Contains(string(contentType), "text/event-stream") {
-				reqCtx.modelServerStreaming = true
-			}
-			typeFound = true
-		}
-
-		if statusFound && typeFound {
-			break
+	response map[string]interface{},
+) (*RequestContext, error) {
+	logger := log.FromContext(ctx)
+	responseBytes, err := json.Marshal(response)
+	if err != nil {
+		logger.V(logutil.DEFAULT).Error(err, "Error marshaling response body")
+		return reqCtx, err
+	}
+	if response["usage"] != nil {
+		usg := response["usage"].(map[string]interface{})
+		usage := Usage{
+			PromptTokens:     int(usg["prompt_tokens"].(float64)),
+			CompletionTokens: int(usg["completion_tokens"].(float64)),
+			TotalTokens:      int(usg["total_tokens"].(float64)),
 		}
+		reqCtx.Usage = usage
+		logger.V(logutil.VERBOSE).Info("Response generated", "usage", reqCtx.Usage)
 	}
+	reqCtx.ResponseSize = len(responseBytes)
+	// ResponseComplete indicates whether the response is complete. In the
+	// non-streaming case, it is set to true once the response is processed; in
+	// the streaming case, it is set to true once the last chunk is processed.
+	// TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178)
+	// will add the processing for streaming case.
+	reqCtx.ResponseComplete = true
 
-	resp := &extProcPb.ProcessingResponse{
-		Response: &extProcPb.ProcessingResponse_ResponseHeaders{
-			ResponseHeaders: &extProcPb.HeadersResponse{
+	reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
+		// The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header
+		// and as an unstructured ext-proc response metadata key/value pair. This enables different integration
+		// options for gateway providers.
+		Response: &extProcPb.ProcessingResponse_ResponseBody{
+			ResponseBody: &extProcPb.BodyResponse{
 				Response: &extProcPb.CommonResponse{
-					HeaderMutation: &extProcPb.HeaderMutation{
-						SetHeaders: []*configPb.HeaderValueOption{
-							{
-								Header: &configPb.HeaderValue{
-									// This is for debugging purpose only.
-									Key:      "x-went-into-resp-headers",
-									RawValue: []byte("true"),
-								},
+					BodyMutation: &extProcPb.BodyMutation{
+						Mutation: &extProcPb.BodyMutation_StreamedResponse{
+							StreamedResponse: &extProcPb.StreamedBodyResponse{
+								Body:        responseBytes,
+								EndOfStream: true,
 							},
 						},
 					},
@@ -114,106 +81,21 @@ func (s *Server) HandleResponseHeaders(
 		},
 	}
-	return resp, nil
+	return reqCtx, nil
 }
 
-// HandleResponseBody parses response body to update information such as number of completion tokens.
-// NOTE: The current implementation only supports Buffered mode, which is not enabled by default. To
-// use it, you need to configure EnvoyExtensionPolicy to have response body in Buffered mode.
-// https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto#envoy-v3-api-msg-extensions-filters-http-ext-proc-v3-processingmode
-// Example response
-/*
-{
-	"id": "cmpl-573498d260f2423f9e42817bbba3743a",
-	"object": "text_completion",
-	"created": 1732563765,
-	"model": "meta-llama/Llama-3.1-8B-Instruct",
-	"choices": [
-		{
-			"index": 0,
-			"text": " Chronicle\nThe San Francisco Chronicle has a new book review section, and it's a good one. The reviews are short, but they're well-written and well-informed. The Chronicle's book review section is a good place to start if you're looking for a good book review.\nThe Chronicle's book review section is a good place to start if you're looking for a good book review. The Chronicle's book review section",
-			"logprobs": null,
-			"finish_reason": "length",
-			"stop_reason": null,
-			"prompt_logprobs": null
-		}
-	],
-	"usage": {
-		"prompt_tokens": 11,
-		"total_tokens": 111,
-		"completion_tokens": 100
-	}
-}*/
-func (s *Server) HandleResponseBody(
+// HandleResponseBodyModelStreaming handles the response body when the model server is streaming.
+func (s *StreamingServer) HandleResponseBodyModelStreaming( ctx context.Context, reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, -) (*extProcPb.ProcessingResponse, error) { - logger := log.FromContext(ctx) - loggerVerbose := logger.V(logutil.VERBOSE) - body := req.Request.(*extProcPb.ProcessingRequest_ResponseBody) - - if reqCtx.modelServerStreaming { - logger.V(logutil.DEBUG).Info("Processing HandleResponseBody") - if err := s.HandleStreaming(ctx, reqCtx, body, loggerVerbose); err != nil { - return nil, err - } - } else { - loggerVerbose.Info("Processing HandleResponseBody") - if err := s.HandleNonStreaming(ctx, reqCtx, body, loggerVerbose); err != nil { - return nil, err - } - } - - resp := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{}, - }, - }, - } - return resp, nil -} - -func (s *Server) HandleNonStreaming( - ctx context.Context, - reqCtx *RequestContext, - body *extProcPb.ProcessingRequest_ResponseBody, - loggerVerbose logr.Logger, -) error { - loggerVerbose.Info("Processing HandleResponseBody") - - res := Response{} - if err := json.Unmarshal(body.ResponseBody.Body, &res); err != nil { - return errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("unmarshaling response body: %v", err)} - } - reqCtx.Usage = res.Usage - reqCtx.ResponseSize = len(body.ResponseBody.Body) - reqCtx.ResponseComplete = true - loggerVerbose.Info("Response generated", "response", res) - return nil -} - -func (s *Server) HandleStreaming( - ctx context.Context, - reqCtx *RequestContext, - body *extProcPb.ProcessingRequest_ResponseBody, - loggerVerbose logr.Logger, -) error { - responseText := string(body.ResponseBody.Body) + responseText string, +) { if strings.Contains(responseText, streamingEndMsg) { - parsedResp := ParseRespForUsage(ctx, responseText) - reqCtx.Usage = parsedResp.Usage + resp := parseRespForUsage(ctx, responseText) + reqCtx.Usage = resp.Usage + metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.PromptTokens) + metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.CompletionTokens) } - - if body.ResponseBody.EndOfStream { - loggerVerbose.Info("Streaming is completed") - reqCtx.ResponseComplete = true - } else { - reqCtx.ResponseSize += len(body.ResponseBody.Body) - } - - return nil } // Example message if "stream_options": {"include_usage": "true"} is included in the request: @@ -227,11 +109,12 @@ func (s *Server) HandleStreaming( // // If include_usage is not included in the request, `data: [DONE]` is returned separately, which // indicates end of streaming. 
-func ParseRespForUsage(
+func parseRespForUsage(
 	ctx context.Context,
 	responseText string,
 ) Response {
 	response := Response{}
+	logger := log.FromContext(ctx)
 
 	lines := strings.Split(responseText, "\n")
 	for _, line := range lines {
@@ -245,8 +128,7 @@
 		byteSlice := []byte(content)
 		if err := json.Unmarshal(byteSlice, &response); err != nil {
-			logger := log.FromContext(ctx)
-			logger.V(logutil.DEFAULT).Error(err, "unmarshaling response body")
+			logger.Error(err, "unmarshaling response body")
 			continue
 		}
 	}
diff --git a/pkg/epp/handlers/response_test.go b/pkg/epp/handlers/response_test.go
index 074b45c9..bfe5a629 100644
--- a/pkg/epp/handlers/response_test.go
+++ b/pkg/epp/handlers/response_test.go
@@ -18,9 +18,9 @@ package handlers
 
 import (
 	"context"
+	"encoding/json"
 	"testing"
 
-	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	"github.com/google/go-cmp/cmp"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
@@ -63,40 +63,61 @@ func TestHandleResponseBody(t *testing.T) {
 	tests := []struct {
 		name    string
-		req     *extProcPb.ProcessingRequest_ResponseBody
+		body    []byte
 		reqCtx  *RequestContext
 		want    Usage
 		wantErr bool
 	}{
 		{
 			name: "success",
-			req: &extProcPb.ProcessingRequest_ResponseBody{
-				ResponseBody: &extProcPb.HttpBody{
-					Body: []byte(body),
-				},
-			},
+			body: []byte(body),
 			want: Usage{
 				PromptTokens:     11,
 				TotalTokens:      111,
 				CompletionTokens: 100,
 			},
 		},
-		{
-			name: "malformed response",
-			req: &extProcPb.ProcessingRequest_ResponseBody{
-				ResponseBody: &extProcPb.HttpBody{
-					Body: []byte("malformed json"),
-				},
-			},
-			wantErr: true,
-		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			server := &StreamingServer{}
+			reqCtx := test.reqCtx
+			if reqCtx == nil {
+				reqCtx = &RequestContext{}
+			}
+			var responseMap map[string]interface{}
+			marshalErr := json.Unmarshal(test.body, &responseMap)
+			if marshalErr != nil {
+				t.Error(marshalErr, "Error unmarshaling response body")
+			}
+			_, err := server.HandleResponseBody(ctx, reqCtx, responseMap)
+			if err != nil {
+				if !test.wantErr {
+					t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr)
+				}
+				return
+			}
+
+			if diff := cmp.Diff(test.want, reqCtx.Usage); diff != "" {
+				t.Errorf("HandleResponseBody returned unexpected response, diff(-want, +got): %v", diff)
+			}
+		})
+	}
+}
+
+func TestHandleStreamedResponseBody(t *testing.T) {
+	ctx := logutil.NewTestLoggerIntoContext(context.Background())
+	tests := []struct {
+		name    string
+		body    string
+		reqCtx  *RequestContext
+		want    Usage
+		wantErr bool
+	}{
 		{
 			name: "streaming request without usage",
-			req: &extProcPb.ProcessingRequest_ResponseBody{
-				ResponseBody: &extProcPb.HttpBody{
-					Body: []byte(streamingBodyWithoutUsage),
-				},
-			},
+			body: streamingBodyWithoutUsage,
 			reqCtx: &RequestContext{
 				modelServerStreaming: true,
 			},
@@ -105,11 +126,7 @@ func TestHandleResponseBody(t *testing.T) {
 		},
 		{
 			name: "streaming request with usage",
-			req: &extProcPb.ProcessingRequest_ResponseBody{
-				ResponseBody: &extProcPb.HttpBody{
-					Body: []byte(streamingBodyWithUsage),
-				},
-			},
+			body: streamingBodyWithUsage,
 			reqCtx: &RequestContext{
 				modelServerStreaming: true,
 			},
@@ -124,18 +141,12 @@ func TestHandleResponseBody(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			server := &Server{}
+			server := &StreamingServer{}
 			reqCtx := test.reqCtx
 			if reqCtx == nil {
 				reqCtx = &RequestContext{}
 			}
-			_, err := server.HandleResponseBody(ctx, reqCtx, &extProcPb.ProcessingRequest{Request: test.req})
-			if err != nil {
-				if !test.wantErr {
-					t.Fatalf("HandleResponseBody returned unexpected error: %v, want %v", err, test.wantErr)
-				}
-				return
-			}
+			server.HandleResponseBodyModelStreaming(ctx, reqCtx, test.body)
 
 			if diff := cmp.Diff(test.want, reqCtx.Usage); diff != "" {
 				t.Errorf("HandleResponseBody returned unexpected response, diff(-want, +got): %v", diff)
diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go
index 862a73b4..7bb0fcb1 100644
--- a/pkg/epp/handlers/server.go
+++ b/pkg/epp/handlers/server.go
@@ -18,14 +18,23 @@ package handlers
 import (
 	"context"
+	"encoding/json"
 	"io"
+	"math/rand"
+	"strconv"
+	"strings"
 	"time"
 
+	configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
 	extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3"
 	envoyTypePb "github.com/envoyproxy/go-control-plane/envoy/type/v3"
+	"github.com/go-logr/logr"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/structpb"
 	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
+	backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics"
 	schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
@@ -33,8 +42,8 @@ import (
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *Server {
-	return &Server{
+func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer {
+	return &StreamingServer{
 		scheduler: scheduler,
 		destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace,
 		destinationEndpointHintKey: destinationEndpointHintKey,
@@ -44,7 +53,7 @@ func NewServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, de
 
 // Server implements the Envoy external processing server.
 // https://www.envoyproxy.io/docs/envoy/latest/api-v3/service/ext_proc/v3/external_processor.proto
-type Server struct {
+type StreamingServer struct {
 	scheduler Scheduler
 	// The key of the header to specify the target pod address. This value needs to match Envoy
 	// configuration.
@@ -59,27 +68,75 @@ type Scheduler interface {
 	Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error)
 }
 
-func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
+// RequestContext stores context information during the lifetime of an HTTP request.
+type RequestContext struct {
+	TargetPod                 string
+	TargetEndpoint            string
+	Model                     string
+	ResolvedTargetModel       string
+	RequestReceivedTimestamp  time.Time
+	ResponseCompleteTimestamp time.Time
+	RequestSize               int
+	Usage                     Usage
+	ResponseSize              int
+	ResponseComplete          bool
+	ResponseStatusCode        string
+	RequestRunning            bool
+
+	RequestState         StreamRequestState
+	modelServerStreaming bool
+
+	reqHeaderResp  *extProcPb.ProcessingResponse
+	reqBodyResp    *extProcPb.ProcessingResponse
+	reqTrailerResp *extProcPb.ProcessingResponse
+
+	respHeaderResp  *extProcPb.ProcessingResponse
+	respBodyResp    *extProcPb.ProcessingResponse
+	respTrailerResp *extProcPb.ProcessingResponse
+}
+
+type StreamRequestState int
+
+const (
+	RequestReceived                  StreamRequestState = 0
+	HeaderRequestResponseComplete    StreamRequestState = 1
+	BodyRequestResponsesComplete     StreamRequestState = 2
+	TrailerRequestResponsesComplete  StreamRequestState = 3
+	ResponseRecieved                 StreamRequestState = 4
+	HeaderResponseResponseComplete   StreamRequestState = 5
+	BodyResponseResponsesComplete    StreamRequestState = 6
+	TrailerResponseResponsesComplete StreamRequestState = 7
+)
+
+func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 	ctx := srv.Context()
 	logger := log.FromContext(ctx)
-	loggerVerbose := logger.V(logutil.VERBOSE)
-	loggerVerbose.Info("Processing")
+	loggerTrace := logger.V(logutil.TRACE)
+	loggerTrace.Info("Processing")
 
 	// Create request context to share states during life time of an HTTP request.
 	// See https://github.com/envoyproxy/envoy/issues/17540.
-	reqCtx := &RequestContext{}
+	reqCtx := &RequestContext{
+		RequestState: RequestReceived,
+	}
 
-	// Create variable for error handling as each request should only report once for
-	// error metric. This doesn't cover the error "Cannot receive stream request" because
-	// such error might happen even the response is processed.
+	var body []byte
+	var requestBody, responseBody map[string]interface{}
+
+	// Create an error handling var, as each request should only report once for
+	// error metrics. This doesn't cover the error "Cannot receive stream request" because
+	// such errors might happen even though the response is processed.
 	var err error
-	defer func(error) {
+	defer func(error, *RequestContext) {
 		if reqCtx.ResponseStatusCode != "" {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode)
 		} else if err != nil {
 			metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err))
 		}
-	}(err)
+		if reqCtx.RequestRunning {
+			metrics.DecRunningRequests(reqCtx.Model)
+		}
+	}(err, reqCtx)
 
 	for {
 		select {
@@ -95,70 +152,306 @@ func (s *Server) Process(srv extProcPb.ExternalProcessor_ProcessServer) error {
 		if recvErr != nil {
 			// This error occurs very frequently, though it doesn't seem to have any impact.
 			// TODO Figure out if we can remove this noise.
- loggerVerbose.Error(err, "Cannot receive stream request") + logger.V(logutil.DEFAULT).Error(err, "Cannot receive stream request") return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) } - var resp *extProcPb.ProcessingResponse switch v := req.Request.(type) { case *extProcPb.ProcessingRequest_RequestHeaders: - reqCtx.RequestReceivedTimestamp = time.Now() - resp = HandleRequestHeaders(ctx, reqCtx, req) - loggerVerbose.Info("Request context after HandleRequestHeaders", "context", reqCtx) + err = s.HandleRequestHeaders(ctx, reqCtx, v) case *extProcPb.ProcessingRequest_RequestBody: - resp, err = s.HandleRequestBody(ctx, reqCtx, req) - if err == nil { - metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) - metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) + loggerTrace.Info("Incoming body chunk", "EoS", v.RequestBody.EndOfStream) + // In the stream case, we can receive multiple request bodies. + body = append(body, v.RequestBody.Body...) + + // Message is buffered, we can read and decode. + if v.RequestBody.EndOfStream { + loggerTrace.Info("decoding") + err = json.Unmarshal(body, &requestBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") + } + + // Body stream complete. Allocate empty slice for response to use. + body = []byte{} + + reqCtx, err = s.HandleRequestBody(ctx, reqCtx, req, requestBody) + if err != nil { + logger.V(logutil.DEFAULT).Error(err, "Error handling body") + } else { + metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) + metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) + } } - loggerVerbose.Info("Request context after HandleRequestBody", "context", reqCtx) + case *extProcPb.ProcessingRequest_RequestTrailers: + // This is currently unused. case *extProcPb.ProcessingRequest_ResponseHeaders: - resp, err = s.HandleResponseHeaders(ctx, reqCtx, req) - loggerVerbose.Info("Request context after HandleResponseHeaders", "context", reqCtx) - case *extProcPb.ProcessingRequest_ResponseBody: - // Don't send a 500 on a response error. Just let the message passthrough and log our error for debugging purposes. - // We assume the body is valid JSON, err messages are not guaranteed to be json, and so capturing and sending a 500 obfuscates the response message. - // using the standard 'err' var will send an immediate error response back to the caller. 
-			var responseErr error
-			resp, responseErr = s.HandleResponseBody(ctx, reqCtx, req)
-			if responseErr != nil {
-				logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req)
-			} else if reqCtx.ResponseComplete {
-				reqCtx.ResponseCompleteTimestamp = time.Now()
-				metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
-				metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
-				metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
-				metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
-				metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
+			for _, header := range v.ResponseHeaders.Headers.GetHeaders() {
+				value := string(header.RawValue)
+
+				loggerTrace.Info("header", "key", header.Key, "value", value)
+				if header.Key == "status" && value != "200" {
+					reqCtx.ResponseStatusCode = errutil.ModelServerError
+				} else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") {
+					reqCtx.modelServerStreaming = true
+					loggerTrace.Info("model server is streaming response")
+				}
 			}
+			reqCtx.RequestState = ResponseRecieved
+			reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{
+				Response: &extProcPb.ProcessingResponse_ResponseHeaders{
+					ResponseHeaders: &extProcPb.HeadersResponse{
+						Response: &extProcPb.CommonResponse{
+							HeaderMutation: &extProcPb.HeaderMutation{
+								SetHeaders: []*configPb.HeaderValueOption{
+									{
+										Header: &configPb.HeaderValue{
+											// This is for debugging purpose only.
+											Key:      "x-went-into-resp-headers",
+											RawValue: []byte("true"),
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+
+		case *extProcPb.ProcessingRequest_ResponseBody:
 			if reqCtx.modelServerStreaming {
-				logger.V(logutil.DEBUG).Info("Request context after HandleResponseBody", "context", reqCtx)
+				// Currently we punt on response parsing if the modelServer is streaming, and we just pass it through.
+
+				responseText := string(v.ResponseBody.Body)
+				s.HandleResponseBodyModelStreaming(ctx, reqCtx, responseText)
+				if v.ResponseBody.EndOfStream {
+					loggerTrace.Info("stream completed")
+
+					reqCtx.ResponseCompleteTimestamp = time.Now()
+					metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+					metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+				}
+
+				reqCtx.respBodyResp = &extProcPb.ProcessingResponse{
+					Response: &extProcPb.ProcessingResponse_ResponseBody{
+						ResponseBody: &extProcPb.BodyResponse{
+							Response: &extProcPb.CommonResponse{
+								BodyMutation: &extProcPb.BodyMutation{
+									Mutation: &extProcPb.BodyMutation_StreamedResponse{
+										StreamedResponse: &extProcPb.StreamedBodyResponse{
+											Body:        v.ResponseBody.Body,
+											EndOfStream: v.ResponseBody.EndOfStream,
+										},
+									},
+								},
+							},
+						},
+					},
+				}
 			} else {
-				loggerVerbose.Info("Request context after HandleResponseBody", "context", reqCtx)
+				body = append(body, v.ResponseBody.Body...)
+
+				// Message is buffered, we can read and decode.
+				if v.ResponseBody.EndOfStream {
+					loggerTrace.Info("stream completed")
+					// Don't send a 500 on a response error. Just let the message pass through and log our error for debugging purposes.
+					// We assume the body is valid JSON; error messages are not guaranteed to be JSON, so capturing and sending a 500 would obfuscate the response message.
+					// Using the standard 'err' var would send an immediate error response back to the caller.
+					var responseErr error
+					responseErr = json.Unmarshal(body, &responseBody)
+					if responseErr != nil {
+						logger.V(logutil.DEFAULT).Error(responseErr, "Error unmarshaling response body")
+					}
+
+					reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody)
+					if responseErr != nil {
+						logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req)
+					} else if reqCtx.ResponseComplete {
+						reqCtx.ResponseCompleteTimestamp = time.Now()
+						metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
+						metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize)
+						metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens)
+						metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens)
+					}
+				}
 			}
-		default:
-			logger.V(logutil.DEFAULT).Error(nil, "Unknown Request type", "request", v)
-			return status.Error(codes.Unknown, "unknown request type")
+		case *extProcPb.ProcessingRequest_ResponseTrailers:
+			// This is currently unused.
 		}
 
+		// Handle the err and fire an immediate response.
 		if err != nil {
 			logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req)
-			resp, err = BuildErrResponse(err)
+			resp, err := BuildErrResponse(err)
 			if err != nil {
 				return err
 			}
+			if err := srv.Send(resp); err != nil {
+				logger.V(logutil.DEFAULT).Error(err, "Send failed")
+				return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+			}
+			return nil
+		}
+		loggerTrace.Info("checking", "request state", reqCtx.RequestState)
+		if err := reqCtx.updateStateAndSendIfNeeded(srv, logger); err != nil {
+			return err
+		}
+	}
+}
+
+// updateStateAndSendIfNeeded checks state and can send multiple responses in a single pass, but only if ordered properly.
+// The order of responses matters in FULL_DUPLEX_STREAMED mode. For both the request and the response, the responses sent back MUST be ordered Header->Body->Trailer, with the trailer being optional.
+func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, logger logr.Logger) error {
+	loggerTrace := logger.V(logutil.TRACE)
+	// No switch statement as we could send multiple responses in one pass.
+	if r.RequestState == RequestReceived && r.reqHeaderResp != nil {
+		loggerTrace.Info("Sending request header response", "obj", r.reqHeaderResp)
+		if err := srv.Send(r.reqHeaderResp); err != nil {
+			logger.V(logutil.DEFAULT).Error(err, "error sending response")
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = HeaderRequestResponseComplete
+	}
+	if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil {
+		loggerTrace.Info("Sending request body response")
+		if err := srv.Send(r.reqBodyResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = BodyRequestResponsesComplete
+		metrics.IncRunningRequests(r.Model)
+		r.RequestRunning = true
+		// Dump the response so a new stream message can begin
+		r.reqBodyResp = nil
+	}
+	if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil {
+		// Trailers in requests are not guaranteed
+		if err := srv.Send(r.reqTrailerResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+	}
+	if r.RequestState == ResponseRecieved && r.respHeaderResp != nil {
+		loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp)
+		if err := srv.Send(r.respHeaderResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
+		}
+		r.RequestState = HeaderResponseResponseComplete
+	}
+	if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil {
+		loggerTrace.Info("Sending response body response")
+		if err := srv.Send(r.respBodyResp); err != nil {
+			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
 		}
-		if !reqCtx.modelServerStreaming {
-			loggerVerbose.Info("Response generated", "response", resp)
-		} else {
-			logger.V(logutil.DEBUG).Info("Response generated", "response", resp)
 		}
-		if err := srv.Send(resp); err != nil {
-			logger.V(logutil.DEFAULT).Error(err, "Send failed")
+		body := r.respBodyResp.Response.(*extProcPb.ProcessingResponse_ResponseBody)
+		if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() {
+			r.RequestState = BodyResponseResponsesComplete
 		}
+		// Dump the response so a new stream message can begin
+		r.respBodyResp = nil
+	}
+	if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil {
+		// Trailers in responses are not guaranteed
+		if err := srv.Send(r.respTrailerResp); err != nil {
 			return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
 		}
 	}
+	return nil
+}
+
+func (s *StreamingServer) populateRequestHeaderResponse(reqCtx *RequestContext, endpoint string, requestBodyLength int) {
+	headers := []*configPb.HeaderValueOption{
+		{
+			Header: &configPb.HeaderValue{
+				Key:      s.destinationEndpointHintKey,
+				RawValue: []byte(endpoint),
+			},
+		},
+	}
+	if requestBodyLength > 0 {
+		// We need to update the content length header if the body is mutated, see Envoy doc:
+		// https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto
+		headers = append(headers, &configPb.HeaderValueOption{
+			Header: &configPb.HeaderValue{
+				Key:      "Content-Length",
+				RawValue: []byte(strconv.Itoa(requestBodyLength)),
+			},
+		})
+	}
+
+	targetEndpointValue := &structpb.Struct{
+		Fields: map[string]*structpb.Value{
+			s.destinationEndpointHintKey: {
+				Kind: &structpb.Value_StringValue{
+					StringValue: endpoint,
+				},
+			},
+		},
+	}
+	dynamicMetadata := targetEndpointValue
+	if s.destinationEndpointHintMetadataNamespace != "" {
+		// If a namespace is defined, wrap the selected endpoint with that.
+		dynamicMetadata = &structpb.Struct{
+			Fields: map[string]*structpb.Value{
+				s.destinationEndpointHintMetadataNamespace: {
+					Kind: &structpb.Value_StructValue{
+						StructValue: targetEndpointValue,
+					},
+				},
+			},
+		}
+	}
+
+	reqCtx.reqHeaderResp = &extProcPb.ProcessingResponse{
+		Response: &extProcPb.ProcessingResponse_RequestHeaders{
+			RequestHeaders: &extProcPb.HeadersResponse{
+				Response: &extProcPb.CommonResponse{
+					ClearRouteCache: true,
+					HeaderMutation: &extProcPb.HeaderMutation{
+						SetHeaders: headers,
+					},
+				},
+			},
+		},
+		DynamicMetadata: dynamicMetadata,
+	}
+}
+
+func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string {
+	// TODO: after we are down to 1 server implementation, make these methods a part of the struct
+	// and handle random seeding on the struct.
+	source := rand.NewSource(rand.Int63())
+	if seed > 0 {
+		source = rand.NewSource(seed)
+	}
+	r := rand.New(source)
+
+	// If all the weight values are nil, we should return a random model name.
+	if model.Spec.TargetModels[0].Weight == nil {
+		index := r.Int31n(int32(len(model.Spec.TargetModels)))
+		return model.Spec.TargetModels[index].Name
+	}
+
+	var weights int32
+	for _, model := range model.Spec.TargetModels {
+		weights += *model.Weight
+	}
+	logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights)
+	randomVal := r.Int31n(weights)
+	// TODO: optimize this without using a loop
+	for _, model := range model.Spec.TargetModels {
+		if randomVal < *model.Weight {
+			return model.Name
+		}
+		randomVal -= *model.Weight
+	}
+	return ""
+}
+
+func GetRandomPod(ds datastore.Datastore) *backendmetrics.Pod {
+	pods := ds.PodGetAll()
+	number := rand.Intn(len(pods))
+	pod := pods[number]
+	return pod.GetPod()
+}
 
 func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
@@ -214,43 +507,3 @@ func BuildErrResponse(err error) (*extProcPb.ProcessingResponse, error) {
 	}
 	return resp, nil
 }
-
-// RequestContext stores context information during the life time of an HTTP request.
-type RequestContext struct { - TargetPod string - TargetEndpoint string - Model string - ResolvedTargetModel string - RequestReceivedTimestamp time.Time - ResponseCompleteTimestamp time.Time - RequestSize int - Usage Usage - ResponseSize int - ResponseComplete bool - ResponseStatusCode string - RequestRunning bool - - RequestState StreamRequestState - modelServerStreaming bool - - reqHeaderResp *extProcPb.ProcessingResponse - reqBodyResp *extProcPb.ProcessingResponse - reqTrailerResp *extProcPb.ProcessingResponse - - respHeaderResp *extProcPb.ProcessingResponse - respBodyResp *extProcPb.ProcessingResponse - respTrailerResp *extProcPb.ProcessingResponse -} - -type StreamRequestState int - -const ( - RequestReceived StreamRequestState = 0 - HeaderRequestResponseComplete StreamRequestState = 1 - BodyRequestResponsesComplete StreamRequestState = 2 - TrailerRequestResponsesComplete StreamRequestState = 3 - ResponseRecieved StreamRequestState = 4 - HeaderResponseResponseComplete StreamRequestState = 5 - BodyResponseResponsesComplete StreamRequestState = 6 - TrailerResponseResponsesComplete StreamRequestState = 7 -) diff --git a/pkg/epp/handlers/streamingserver.go b/pkg/epp/handlers/streamingserver.go deleted file mode 100644 index ca3451cb..00000000 --- a/pkg/epp/handlers/streamingserver.go +++ /dev/null @@ -1,594 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package handlers - -import ( - "context" - "encoding/json" - "fmt" - "io" - "math/rand" - "strconv" - "strings" - "time" - - configPb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" - extProcPb "github.com/envoyproxy/go-control-plane/envoy/service/ext_proc/v3" - "github.com/go-logr/logr" - "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/structpb" - "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" - schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" -) - -func NewStreamingServer(scheduler Scheduler, destinationEndpointHintMetadataNamespace, destinationEndpointHintKey string, datastore datastore.Datastore) *StreamingServer { - return &StreamingServer{ - scheduler: scheduler, - destinationEndpointHintMetadataNamespace: destinationEndpointHintMetadataNamespace, - destinationEndpointHintKey: destinationEndpointHintKey, - datastore: datastore, - } -} - -type StreamingServer struct { - scheduler Scheduler - // The key of the header to specify the target pod address. This value needs to match Envoy - // configuration. 
- destinationEndpointHintKey string - // The key acting as the outer namespace struct in the metadata extproc response to communicate - // back the picked endpoints. - destinationEndpointHintMetadataNamespace string - datastore datastore.Datastore -} - -func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer) error { - ctx := srv.Context() - logger := log.FromContext(ctx) - loggerTrace := logger.V(logutil.TRACE) - loggerTrace.Info("Processing") - - // Create request context to share states during life time of an HTTP request. - // See https://github.com/envoyproxy/envoy/issues/17540. - reqCtx := &RequestContext{ - RequestState: RequestReceived, - } - - var body []byte - var requestBody, responseBody map[string]interface{} - - // Create error handling var as each request should only report once for - // error metrics. This doesn't cover the error "Cannot receive stream request" because - // such errors might happen even though response is processed. - var err error - defer func(error, *RequestContext) { - if reqCtx.ResponseStatusCode != "" { - metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseStatusCode) - } else if err != nil { - metrics.RecordRequestErrCounter(reqCtx.Model, reqCtx.ResolvedTargetModel, errutil.CanonicalCode(err)) - } - if reqCtx.RequestRunning { - metrics.DecRunningRequests(reqCtx.Model) - } - }(err, reqCtx) - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - req, recvErr := srv.Recv() - if recvErr == io.EOF || status.Code(recvErr) == codes.Canceled { - return nil - } - if recvErr != nil { - // This error occurs very frequently, though it doesn't seem to have any impact. - // TODO Figure out if we can remove this noise. - logger.V(logutil.DEFAULT).Error(err, "Cannot receive stream request") - return status.Errorf(codes.Unknown, "cannot receive stream request: %v", err) - } - - switch v := req.Request.(type) { - case *extProcPb.ProcessingRequest_RequestHeaders: - err = s.HandleRequestHeaders(ctx, reqCtx, v) - case *extProcPb.ProcessingRequest_RequestBody: - loggerTrace.Info("Incoming body chunk", "EoS", v.RequestBody.EndOfStream) - // In the stream case, we can receive multiple request bodies. - body = append(body, v.RequestBody.Body...) - - // Message is buffered, we can read and decode. - if v.RequestBody.EndOfStream { - loggerTrace.Info("decoding") - err = json.Unmarshal(body, &requestBody) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error unmarshaling request body") - } - - // Body stream complete. Allocate empty slice for response to use. - body = []byte{} - - reqCtx, err = s.HandleRequestBody(ctx, reqCtx, req, requestBody) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error handling body") - } else { - metrics.RecordRequestCounter(reqCtx.Model, reqCtx.ResolvedTargetModel) - metrics.RecordRequestSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestSize) - } - } - case *extProcPb.ProcessingRequest_RequestTrailers: - // This is currently unused. 
- case *extProcPb.ProcessingRequest_ResponseHeaders: - for _, header := range v.ResponseHeaders.Headers.GetHeaders() { - value := string(header.RawValue) - - loggerTrace.Info("header", "key", header.Key, "value", value) - if header.Key == "status" && value != "200" { - reqCtx.ResponseStatusCode = errutil.ModelServerError - } else if header.Key == "content-type" && strings.Contains(value, "text/event-stream") { - reqCtx.modelServerStreaming = true - loggerTrace.Info("model server is streaming response") - } - } - reqCtx.RequestState = ResponseRecieved - reqCtx.respHeaderResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseHeaders{ - ResponseHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - // This is for debugging purpose only. - Key: "x-went-into-resp-headers", - RawValue: []byte("true"), - }, - }, - }, - }, - }, - }, - }, - } - - case *extProcPb.ProcessingRequest_ResponseBody: - if reqCtx.modelServerStreaming { - // Currently we punt on response parsing if the modelServer is streaming, and we just passthrough. - - responseText := string(v.ResponseBody.Body) - s.HandleResponseBodyModelStreaming(ctx, reqCtx, responseText) - if v.ResponseBody.EndOfStream { - loggerTrace.Info("stream completed") - - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens) - } - - reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: v.ResponseBody.Body, - EndOfStream: v.ResponseBody.EndOfStream, - }, - }, - }, - }, - }, - }, - } - } else { - body = append(body, v.ResponseBody.Body...) - - // Message is buffered, we can read and decode. - if v.ResponseBody.EndOfStream { - loggerTrace.Info("stream completed") - // Don't send a 500 on a response error. Just let the message passthrough and log our error for debugging purposes. - // We assume the body is valid JSON, err messages are not guaranteed to be json, and so capturing and sending a 500 obfuscates the response message. - // using the standard 'err' var will send an immediate error response back to the caller. 
- var responseErr error - responseErr = json.Unmarshal(body, &responseBody) - if responseErr != nil { - logger.V(logutil.DEFAULT).Error(responseErr, "Error unmarshaling request body") - } - - reqCtx, responseErr = s.HandleResponseBody(ctx, reqCtx, responseBody) - if responseErr != nil { - logger.V(logutil.DEFAULT).Error(responseErr, "Failed to process response body", "request", req) - } else if reqCtx.ResponseComplete { - reqCtx.ResponseCompleteTimestamp = time.Now() - metrics.RecordRequestLatencies(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp) - metrics.RecordResponseSizes(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.ResponseSize) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.Usage.CompletionTokens) - metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.Model, reqCtx.ResolvedTargetModel, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens) - } - } - } - case *extProcPb.ProcessingRequest_ResponseTrailers: - // This is currently unused. - } - - // Handle the err and fire an immediate response. - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Failed to process request", "request", req) - resp, err := BuildErrResponse(err) - if err != nil { - return err - } - if err := srv.Send(resp); err != nil { - logger.V(logutil.DEFAULT).Error(err, "Send failed") - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - return nil - } - loggerTrace.Info("checking", "request state", reqCtx.RequestState) - if err := reqCtx.updateStateAndSendIfNeeded(srv, logger); err != nil { - return err - } - } -} - -// updateStateAndSendIfNeeded checks state and can send mutiple responses in a single pass, but only if ordered properly. -// Order of requests matter in FULL_DUPLEX_STREAMING. For both request and response, the order of response sent back MUST be: Header->Body->Trailer, with trailer being optional. -func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, logger logr.Logger) error { - loggerTrace := logger.V(logutil.TRACE) - // No switch statement as we could send multiple responses in one pass. 
- if r.RequestState == RequestReceived && r.reqHeaderResp != nil { - loggerTrace.Info("Sending request header response", "obj", r.reqHeaderResp) - if err := srv.Send(r.reqHeaderResp); err != nil { - logger.V(logutil.DEFAULT).Error(err, "error sending response") - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = HeaderRequestResponseComplete - } - if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil { - loggerTrace.Info("Sending request body response") - if err := srv.Send(r.reqBodyResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = BodyRequestResponsesComplete - metrics.IncRunningRequests(r.Model) - r.RequestRunning = true - // Dump the response so a new stream message can begin - r.reqBodyResp = nil - } - if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil { - // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - } - if r.RequestState == ResponseRecieved && r.respHeaderResp != nil { - loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp) - if err := srv.Send(r.respHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - r.RequestState = HeaderResponseResponseComplete - } - if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil { - loggerTrace.Info("Sending response body response") - if err := srv.Send(r.respBodyResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - - body := r.respBodyResp.Response.(*extProcPb.ProcessingResponse_ResponseBody) - if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() { - r.RequestState = BodyResponseResponsesComplete - } - // Dump the response so a new stream message can begin - r.respBodyResp = nil - } - if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil { - // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { - return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) - } - } - return nil -} - -// HandleRequestBody always returns the requestContext even in the error case, as the request context is used in error handling. -func (s *StreamingServer) HandleRequestBody( - ctx context.Context, - reqCtx *RequestContext, - req *extProcPb.ProcessingRequest, - requestBodyMap map[string]interface{}, -) (*RequestContext, error) { - var requestBodyBytes []byte - logger := log.FromContext(ctx) - - // Resolve target models. - model, ok := requestBodyMap["model"].(string) - if !ok { - return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} - } - - modelName := model - - // NOTE: The nil checking for the modelObject means that we DO allow passthrough currently. - // This might be a security risk in the future where adapters not registered in the InferenceModel - // are able to be requested by using their distinct name. 
- modelObj := s.datastore.ModelGet(model) - if modelObj == nil { - return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error finding a model object in InferenceModel for input %v", model)} - } - if len(modelObj.Spec.TargetModels) > 0 { - modelName = RandomWeightedDraw(logger, modelObj, 0) - if modelName == "" { - return reqCtx, errutil.Error{Code: errutil.BadConfiguration, Msg: fmt.Sprintf("error getting target model name for model %v", modelObj.Name)} - } - } - llmReq := &schedulingtypes.LLMRequest{ - Model: model, - ResolvedTargetModel: modelName, - Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, - } - logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical) - - var err error - // Update target models in the body. - if llmReq.Model != llmReq.ResolvedTargetModel { - requestBodyMap["model"] = llmReq.ResolvedTargetModel - } - - requestBodyBytes, err = json.Marshal(requestBodyMap) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "Error marshaling request body") - return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} - } - - target, err := s.scheduler.Schedule(ctx, llmReq) - if err != nil { - return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} - } - targetPod := target.GetPod() - - // Insert target endpoint to instruct Envoy to route requests to the specified target pod. - // Attach the port number - pool, err := s.datastore.PoolGet() - if err != nil { - return reqCtx, err - } - endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) - - logger.V(logutil.DEFAULT).Info("Request handled", - "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod, "endpoint metrics", - fmt.Sprintf("%+v", target)) - - reqCtx.Model = llmReq.Model - reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel - reqCtx.RequestSize = len(requestBodyBytes) - reqCtx.TargetPod = targetPod.NamespacedName.String() - reqCtx.TargetEndpoint = endpoint - - s.populateRequestHeaderResponse(reqCtx, endpoint, len(requestBodyBytes)) - - reqCtx.reqBodyResp = &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: requestBodyBytes, - EndOfStream: true, - }, - }, - }, - }, - }, - }, - } - return reqCtx, nil -} - -// HandleResponseBody always returns the requestContext even in the error case, as the request context is used in error handling. 
-func (s *StreamingServer) HandleResponseBody( - ctx context.Context, - reqCtx *RequestContext, - response map[string]interface{}, -) (*RequestContext, error) { - logger := log.FromContext(ctx) - responseBytes, err := json.Marshal(response) - if err != nil { - logger.V(logutil.DEFAULT).Error(err, "error marshalling responseBody") - return reqCtx, err - } - if response["usage"] != nil { - usg := response["usage"].(map[string]interface{}) - usage := Usage{ - PromptTokens: int(usg["prompt_tokens"].(float64)), - CompletionTokens: int(usg["completion_tokens"].(float64)), - TotalTokens: int(usg["total_tokens"].(float64)), - } - reqCtx.Usage = usage - logger.V(logutil.VERBOSE).Info("Response generated", "usage", reqCtx.Usage) - } - reqCtx.ResponseSize = len(responseBytes) - // ResponseComplete is to indicate the response is complete. In non-streaming - // case, it will be set to be true once the response is processed; in - // streaming case, it will be set to be true once the last chunk is processed. - // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/178) - // will add the processing for streaming case. - reqCtx.ResponseComplete = true - - reqCtx.respBodyResp = &extProcPb.ProcessingResponse{ - // The Endpoint Picker supports two approaches to communicating the target endpoint, as a request header - // and as an unstructure ext-proc response metadata key/value pair. This enables different integration - // options for gateway providers. - Response: &extProcPb.ProcessingResponse_ResponseBody{ - ResponseBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_StreamedResponse{ - StreamedResponse: &extProcPb.StreamedBodyResponse{ - Body: responseBytes, - EndOfStream: true, - }, - }, - }, - }, - }, - }, - } - return reqCtx, nil -} - -// The function is to handle streaming response if the modelServer is streaming. -func (s *StreamingServer) HandleResponseBodyModelStreaming( - ctx context.Context, - reqCtx *RequestContext, - responseText string, -) { - if strings.Contains(responseText, streamingEndMsg) { - resp := ParseRespForUsage(ctx, responseText) - metrics.RecordInputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.PromptTokens) - metrics.RecordOutputTokens(reqCtx.Model, reqCtx.ResolvedTargetModel, resp.Usage.CompletionTokens) - } -} - -func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error { - reqCtx.RequestReceivedTimestamp = time.Now() - - // an EoS in the request headers means this request has no body or trailers. - if req.RequestHeaders.EndOfStream { - // We will route this request to a random pod as this is assumed to just be a GET - // More context: https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/526 - // The above PR will address endpoint admission, but currently any request without a body will be - // routed to a random upstream pod. 
- pod := GetRandomPod(s.datastore) - pool, err := s.datastore.PoolGet() - if err != nil { - return err - } - endpoint := pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) - s.populateRequestHeaderResponse(reqCtx, endpoint, 0) - } - return nil -} - -func (s *StreamingServer) populateRequestHeaderResponse(reqCtx *RequestContext, endpoint string, requestBodyLength int) { - headers := []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: s.destinationEndpointHintKey, - RawValue: []byte(endpoint), - }, - }, - } - if requestBodyLength > 0 { - // We need to update the content length header if the body is mutated, see Envoy doc: - // https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/filters/http/ext_proc/v3/processing_mode.proto - headers = append(headers, &configPb.HeaderValueOption{ - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte(strconv.Itoa(requestBodyLength)), - }, - }) - } - - targetEndpointValue := &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintKey: { - Kind: &structpb.Value_StringValue{ - StringValue: endpoint, - }, - }, - }, - } - dynamicMetadata := targetEndpointValue - if s.destinationEndpointHintMetadataNamespace != "" { - // If a namespace is defined, wrap the selected endpoint with that. - dynamicMetadata = &structpb.Struct{ - Fields: map[string]*structpb.Value{ - s.destinationEndpointHintMetadataNamespace: { - Kind: &structpb.Value_StructValue{ - StructValue: targetEndpointValue, - }, - }, - }, - } - } - - reqCtx.reqHeaderResp = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestHeaders{ - RequestHeaders: &extProcPb.HeadersResponse{ - Response: &extProcPb.CommonResponse{ - ClearRouteCache: true, - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: headers, - }, - }, - }, - }, - DynamicMetadata: dynamicMetadata, - } -} - -func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed int64) string { - // TODO: after we are down to 1 server implementation, make these methods a part of the struct - // and handle random seeding on the struct. 
- source := rand.NewSource(rand.Int63()) - if seed > 0 { - source = rand.NewSource(seed) - } - r := rand.New(source) - - // all the weight values are nil, then we should return random model name - if model.Spec.TargetModels[0].Weight == nil { - index := r.Int31n(int32(len(model.Spec.TargetModels))) - return model.Spec.TargetModels[index].Name - } - - var weights int32 - for _, model := range model.Spec.TargetModels { - weights += *model.Weight - } - logger.V(logutil.TRACE).Info("Weights for model computed", "model", model.Name, "weights", weights) - randomVal := r.Int31n(weights) - // TODO: optimize this without using loop - for _, model := range model.Spec.TargetModels { - if randomVal < *model.Weight { - return model.Name - } - randomVal -= *model.Weight - } - return "" -} - -func GetRandomPod(ds datastore.Datastore) *backendmetrics.Pod { - pods := ds.PodGetAll() - number := rand.Intn(len(pods)) - pod := pods[number] - return pod.GetPod() -} diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 7ed183be..aa048e6e 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -146,14 +146,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } else { srv = grpc.NewServer() } - var extProcServer extProcPb.ExternalProcessorServer - if r.UseStreaming { - logger.Info("Using streaming extproc server") - extProcServer = handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) - } else { - logger.Info("Using standard extproc server") - extProcServer = handlers.NewServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) - } + extProcServer := handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) extProcPb.RegisterExternalProcessorServer( srv, extProcServer, diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index ae2c6170..372158f4 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -92,325 +92,6 @@ func TestMain(m *testing.M) { os.Exit(code) } -func TestKubeInferenceModelRequest(t *testing.T) { - tests := []struct { - name string - req *extProcPb.ProcessingRequest - pods map[backendmetrics.Pod]*backendmetrics.Metrics - wantHeaders []*configPb.HeaderValueOption - wantMetadata *structpb.Struct - wantBody []byte - wantMetrics string - wantErr bool - immediateResponse *extProcPb.ImmediateResponse - }{ - { - name: "select lower queue and kv cache, no active lora", - req: integrationutils.GenerateRequest(logger, "test1", "my-model"), - // pod-1 will be picked because it has relatively low queue size and low KV cache. 
- pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.2, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.2:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"my-model-12345\",\"prompt\":\"test1\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="my-model",target_model_name="my-model-12345"} 1 - `, - wantErr: false, - }, - { - name: "select active lora, low queue", - req: integrationutils.GenerateRequest(logger, "test2", "sql-lora"), - // pod-1 will be picked because it has relatively low queue size, with the requested - // model being active, and has low KV cache. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - WaitingModels: map[string]int{}, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.2:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.2:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test2\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `, - wantErr: false, - }, - { - name: "select no lora despite active model, avoid excessive queue size", - req: integrationutils.GenerateRequest(logger, "test3", "sql-lora"), - // pod-2 will be picked despite it NOT having the requested model being active - // as it's above the affinity for queue size. 
Also is critical, so we should - // still honor request despite all queues > 5 - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(1): { - WaitingQueueSize: 200, - KVCacheUsagePercent: 0.1, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg2": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(2): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - }, - WaitingModels: map[string]int{}, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: []byte("192.168.1.3:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.3:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg2\",\"prompt\":\"test3\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora",target_model_name="sql-lora-1fdg2"} 1 - `, - wantErr: false, - }, - { - name: "noncritical and all models past threshold, shed request", - req: integrationutils.GenerateRequest(logger, "test4", "sql-lora-sheddable"), - // no pods will be picked as all models are either above kv threshold, - // queue threshold, or both. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 6, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{}, - wantMetadata: &structpb.Struct{}, - wantBody: []byte(""), - wantErr: false, - immediateResponse: &extProcPb.ImmediateResponse{ - Status: &envoyTypePb.HttpStatus{ - Code: envoyTypePb.StatusCode_TooManyRequests, - }, - }, - wantMetrics: "", - }, - { - name: "noncritical, but one server has capacity, do not shed", - req: integrationutils.GenerateRequest(logger, "test5", "sql-lora-sheddable"), - // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ - fakePod(0): { - WaitingQueueSize: 4, - KVCacheUsagePercent: 0.2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(1): { - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.85, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - fakePod(2): { - WaitingQueueSize: 10, - KVCacheUsagePercent: 0.9, - ActiveModels: map[string]int{ - "foo": 1, - "sql-lora-1fdg3": 1, - }, - WaitingModels: map[string]int{}, - }, - }, - wantHeaders: []*configPb.HeaderValueOption{ - { - Header: &configPb.HeaderValue{ - Key: runserver.DefaultDestinationEndpointHintKey, - RawValue: 
[]byte("192.168.1.1:8000"), - }, - }, - { - Header: &configPb.HeaderValue{ - Key: "Content-Length", - RawValue: []byte("76"), - }, - }, - }, - wantMetadata: makeMetadata("192.168.1.1:8000"), - wantBody: []byte("{\"max_tokens\":100,\"model\":\"sql-lora-1fdg3\",\"prompt\":\"test5\",\"temperature\":0}"), - wantMetrics: ` - # HELP inference_model_request_total [ALPHA] Counter of inference model requests broken out for each model and target model. - # TYPE inference_model_request_total counter - inference_model_request_total{model_name="sql-lora-sheddable",target_model_name="sql-lora-1fdg3"} 1 - `, - wantErr: false, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - client, cleanup := setUpHermeticServer(t, test.pods, false) - t.Cleanup(cleanup) - want := &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_RequestBody{ - RequestBody: &extProcPb.BodyResponse{ - Response: &extProcPb.CommonResponse{ - HeaderMutation: &extProcPb.HeaderMutation{ - SetHeaders: test.wantHeaders, - }, - BodyMutation: &extProcPb.BodyMutation{ - Mutation: &extProcPb.BodyMutation_Body{ - Body: test.wantBody, - }, - }, - }, - }, - }, - DynamicMetadata: test.wantMetadata, - } - res, err := integrationutils.SendRequest(t, client, test.req) - - if err != nil && !test.wantErr { - t.Errorf("Unexpected error, got: %v, want error: %v", err, test.wantErr) - } - if test.immediateResponse != nil { - want = &extProcPb.ProcessingResponse{ - Response: &extProcPb.ProcessingResponse_ImmediateResponse{ - ImmediateResponse: test.immediateResponse, - }, - } - } - if diff := cmp.Diff(want, res, protocmp.Transform()); diff != "" { - t.Errorf("Unexpected response, (-want +got): %v", diff) - } - - if test.wantMetrics != "" { - if err := metricsutils.GatherAndCompare(legacyregistry.DefaultGatherer, strings.NewReader(test.wantMetrics), "inference_model_request_total"); err != nil { - t.Error(err) - } - } - - legacyregistry.Reset() - }) - } -} - func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { tests := []struct { name string From 92431f582ca4f8c6d75781e303acd8c84492dbea Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Mon, 14 Apr 2025 06:18:43 -0700 Subject: [PATCH 38/74] Document model server compatibility and config options (#537) * Document model server compatibility and config options * Update config/charts/inferencepool/README.md --------- Co-authored-by: Abdullah Gharaibeh <40361897+ahg-g@users.noreply.github.com> --- config/charts/inferencepool/README.md | 14 ++++++- .../templates/epp-deployment.yaml | 9 ++++- config/charts/inferencepool/values.yaml | 1 + mkdocs.yml | 4 +- .../gateways.md} | 2 +- site-src/implementations/model-servers.md | 38 +++++++++++++++++++ 6 files changed, 64 insertions(+), 4 deletions(-) rename site-src/{implementations.md => implementations/gateways.md} (99%) create mode 100644 site-src/implementations/model-servers.md diff --git a/config/charts/inferencepool/README.md b/config/charts/inferencepool/README.md index e5468cd4..301e3d9c 100644 --- a/config/charts/inferencepool/README.md +++ b/config/charts/inferencepool/README.md @@ -2,7 +2,6 @@ A chart to deploy an InferencePool and a corresponding EndpointPicker (epp) deployment. 
-
 ## Install

To install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port `8000`, you can run the following command:

@@ -23,6 +22,18 @@ $ helm install vllm-llama3-8b-instruct \

Note that the provider name is needed to deploy provider-specific resources. If no provider is specified, then only the InferencePool object and the EPP are deployed.

+### Install for Triton TensorRT-LLM
+
+Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install for Triton TensorRT-LLM, e.g.,
+
+```txt
+$ helm install triton-llama3-8b-instruct \
+  --set inferencePool.modelServers.matchLabels.app=triton-llama3-8b-instruct \
+  --set inferencePool.modelServerType=triton-tensorrt-llm \
+  --set provider.name=[none|gke] \
+  oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool --version v0
+```
+
 ## Uninstall

Run the following command to uninstall the chart:

@@ -38,6 +49,7 @@ The following table lists the configurable parameters of the chart.

| **Parameter Name** | **Description** |
|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
| `inferencePool.targetPortNumber` | Target port number for the vllm backends; used by the inference extension to scrape metrics. Defaults to 8000. |
+| `inferencePool.modelServerType` | Type of the model servers in the pool, valid options are [vllm, triton-tensorrt-llm], default is vllm. |
| `inferencePool.modelServers.matchLabels` | Label selector to match vllm backends managed by the inference pool. |
| `inferenceExtension.replicas` | Number of replicas for the endpoint picker extension service. Defaults to `1`. |
| `inferenceExtension.image.name` | Name of the container image used for the endpoint picker. |
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index 0b9fa0bd..fc490210 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -35,6 +35,14 @@ spec:
        - "9003"
        - -metricsPort
        - "9090"
+        {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }}
+        - -totalQueuedRequestsMetric
+        - "nv_trt_llm_request_metrics{request_type=waiting}"
+        - -kvCacheUsagePercentageMetric
+        - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}"
+        - -loraInfoMetric
+        - "" # Set an empty metric to disable LoRA metric scraping as it is not supported by Triton yet.
+        {{- end }}
       ports:
         - name: grpc
           containerPort: 9002
@@ -54,4 +62,3 @@ spec:
             service: inference-extension
           initialDelaySeconds: 5
           periodSeconds: 10
-
diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml
index 766ee087..bd48f37e 100644
--- a/config/charts/inferencepool/values.yaml
+++ b/config/charts/inferencepool/values.yaml
@@ -9,6 +9,7 @@ inferenceExtension:

 inferencePool:
   targetPortNumber: 8000
+  modelServerType: vllm # vllm, triton-tensorrt-llm
   # modelServers: # REQUIRED
   #   matchLabels:
   #     app: vllm-llama3-8b-instruct
diff --git a/mkdocs.yml b/mkdocs.yml
index b67cf8b4..bdfffe05 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -54,7 +54,9 @@ nav:
     API Overview: concepts/api-overview.md
     Conformance: concepts/conformance.md
     Roles and Personas: concepts/roles-and-personas.md
-  - Implementations: implementations.md
+  - Implementations:
+    - Gateways: implementations/gateways.md
+    - Model Servers: implementations/model-servers.md
   - FAQ: faq.md
   - Guides:
     - User Guides:
diff --git a/site-src/implementations.md b/site-src/implementations/gateways.md
similarity index 99%
rename from site-src/implementations.md
rename to site-src/implementations/gateways.md
index dc15b297..d4e919be 100644
--- a/site-src/implementations.md
+++ b/site-src/implementations/gateways.md
@@ -1,4 +1,4 @@
-# Implementations
+# Gateway Implementations

 This project has several implementations that are planned or in progress:

diff --git a/site-src/implementations/model-servers.md b/site-src/implementations/model-servers.md
new file mode 100644
index 00000000..3d475aaa
--- /dev/null
+++ b/site-src/implementations/model-servers.md
@@ -0,0 +1,38 @@
+
+
+# Supported Model Servers
+
+Any model server that conforms to the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol) is supported by the inference extension.
+
+## Compatible Model Server Versions
+
+| Model Server | Version | Commit | Notes |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- |
+| vLLM V0 | v0.6.4 and above | [commit 0ad216f](https://github.com/vllm-project/vllm/commit/0ad216f5750742115c686723bf38698372d483fd) | |
+| vLLM V1 | v0.8.0 and above | [commit bc32bc7](https://github.com/vllm-project/vllm/commit/bc32bc73aad076849ac88565cff745b01b17d89c) | |
+| Triton(TensorRT-LLM) | [25.03](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-25-03.html#rel-25-03) and above | [commit 15cb989](https://github.com/triton-inference-server/tensorrtllm_backend/commit/15cb989b00523d8e92dce5165b9b9846c047a70d). | LoRA affinity feature is not available as the required LoRA metrics haven't been implemented in Triton yet. |
+
+## vLLM
+
+vLLM is configured as the default in the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp). No further configuration is required.
+
+## Triton with TensorRT-LLM Backend
+
+Triton-specific metric names need to be specified when starting the EPP.
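+
+Before pointing the EPP at these metrics, it can help to confirm that your Triton pods actually expose the expected `nv_trt_llm_*` metric families. The following is only a sketch, not part of the official guide: the pod name is a placeholder, and it assumes Triton's default Prometheus metrics port of 8002:
+
+```bash
+# <your-triton-pod> is a placeholder; 8002 is Triton's default metrics port.
+kubectl port-forward pod/<your-triton-pod> 8002:8002 &
+sleep 2 # give the port-forward a moment to establish
+curl -s http://localhost:8002/metrics | grep nv_trt_llm
+```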
+ +### Option 1: Use Helm + +Use `--set inferencePool.modelServerType=triton-tensorrt-llm` to install the [`inferencepool` via helm](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool). See the [`inferencepool` helm guide](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/charts/inferencepool/README.md) for more details. + +### Option 2: Edit EPP deployment yaml + + Add the following to the `args` of the [EPP deployment](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/42eb5ff1c5af1275df43ac384df0ddf20da95134/config/manifests/inferencepool-resources.yaml#L32) + + ``` +- -totalQueuedRequestsMetric +- "nv_trt_llm_request_metrics{request_type=waiting}" +- -kvCacheUsagePercentageMetric +- "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" +- -loraInfoMetric +- "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. +``` \ No newline at end of file From bd9ee36450d68fb4d0d8ac4f9be4db7d1ec4fee3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 10:45:07 -0700 Subject: [PATCH 39/74] Bump github.com/prometheus/client_model from 0.6.1 to 0.6.2 (#687) Bumps [github.com/prometheus/client_model](https://github.com/prometheus/client_model) from 0.6.1 to 0.6.2. - [Release notes](https://github.com/prometheus/client_model/releases) - [Commits](https://github.com/prometheus/client_model/compare/v0.6.1...v0.6.2) --- updated-dependencies: - dependency-name: github.com/prometheus/client_model dependency-version: 0.6.2 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 20cf017a..c3ad8e5d 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 github.com/prometheus/client_golang v1.21.1 - github.com/prometheus/client_model v0.6.1 + github.com/prometheus/client_model v0.6.2 github.com/prometheus/common v0.63.0 github.com/stretchr/testify v1.10.0 go.uber.org/multierr v1.11.0 diff --git a/go.sum b/go.sum index cd6cd380..838eb402 100644 --- a/go.sum +++ b/go.sum @@ -166,8 +166,8 @@ github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4 github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= github.com/prometheus/common v0.63.0/go.mod h1:VVFF/fBIoToEnWRVkYoXEkq3R3paCoxG9PXP74SnV18= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= From b18abf248857b1f5599a9a29486a4f8a182a9906 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 20:47:06 -0700 Subject: [PATCH 40/74] Bump github.com/prometheus/client_golang from 1.21.1 to 1.22.0 (#688) Bumps [github.com/prometheus/client_golang](https://github.com/prometheus/client_golang) from 1.21.1 to 1.22.0. - [Release notes](https://github.com/prometheus/client_golang/releases) - [Changelog](https://github.com/prometheus/client_golang/blob/main/CHANGELOG.md) - [Commits](https://github.com/prometheus/client_golang/compare/v1.21.1...v1.22.0) --- updated-dependencies: - dependency-name: github.com/prometheus/client_golang dependency-version: 1.22.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 3 +-- go.sum | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index c3ad8e5d..4a0d5d63 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/google/go-cmp v0.7.0 github.com/onsi/ginkgo/v2 v2.23.4 github.com/onsi/gomega v1.37.0 - github.com/prometheus/client_golang v1.21.1 + github.com/prometheus/client_golang v1.22.0 github.com/prometheus/client_model v0.6.2 github.com/prometheus/common v0.63.0 github.com/stretchr/testify v1.10.0 @@ -74,7 +74,6 @@ require ( github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.17.11 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/leodido/go-urn v1.2.1 // indirect github.com/mailru/easyjson v0.7.7 // indirect diff --git a/go.sum b/go.sum index 838eb402..c551d3ed 100644 --- a/go.sum +++ b/go.sum @@ -112,8 +112,8 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= @@ -164,8 +164,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prashantv/gostub v1.1.0 h1:BTyx3RfQjRHnUWaGF9oQos79AlQ5k8WNktv7VGvVH4g= github.com/prashantv/gostub v1.1.0/go.mod h1:A5zLQHz7ieHGG7is6LLXLz7I8+3LZzsrV0P1IAHhP5U= -github.com/prometheus/client_golang v1.21.1 h1:DOvXXTqVzvkIewV/CDPFdejpMCGeMcbGCQ8YOmu+Ibk= -github.com/prometheus/client_golang v1.21.1/go.mod h1:U9NM32ykUErtVBxdvD3zfi+EuFkkaBvMb09mIfe0Zgg= +github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= +github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= From cd8e91f325221a2f0ea21a269c2a3092108e64c9 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 16 Apr 2025 07:05:05 +0300 Subject: [PATCH 41/74] added badges to README (#682) * added badges to README Signed-off-by: Nir Rozenbaum * typo Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index b74a13e9..f7943d2f 100644 --- 
a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +[![Go Report Card](https://goreportcard.com/badge/sigs.k8s.io/gateway-api-inference-extension)](https://goreportcard.com/report/sigs.k8s.io/gateway-api-inference-extension) +[![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension) +[![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE) + # Gateway API Inference Extension This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee. From f7faddc277a335c49e129b8c0a1d7fe179718f95 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 15 Apr 2025 21:05:12 -0700 Subject: [PATCH 42/74] Bump sigs.k8s.io/structured-merge-diff/v4 from 4.6.0 to 4.7.0 (#686) Bumps [sigs.k8s.io/structured-merge-diff/v4](https://github.com/kubernetes-sigs/structured-merge-diff) from 4.6.0 to 4.7.0. - [Release notes](https://github.com/kubernetes-sigs/structured-merge-diff/releases) - [Changelog](https://github.com/kubernetes-sigs/structured-merge-diff/blob/master/RELEASE.md) - [Commits](https://github.com/kubernetes-sigs/structured-merge-diff/compare/v4.6.0...v4.7.0) --- updated-dependencies: - dependency-name: sigs.k8s.io/structured-merge-diff/v4 dependency-version: 4.7.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 4a0d5d63..fcfb60af 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,7 @@ require ( k8s.io/component-base v0.32.3 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.4 - sigs.k8s.io/structured-merge-diff/v4 v4.6.0 + sigs.k8s.io/structured-merge-diff/v4 v4.7.0 sigs.k8s.io/yaml v1.4.0 ) diff --git a/go.sum b/go.sum index c551d3ed..b2c05a61 100644 --- a/go.sum +++ b/go.sum @@ -332,7 +332,7 @@ sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1 sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= -sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= +sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= +sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= From 6a18bebff710ce1596b57e7399814f64ac033084 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Wed, 16 Apr 2025 12:19:38 -0700 Subject: [PATCH 43/74] Docs: Adds Kgateway Cleanup to Quickstart Signed-off-by: Daneyon Hansen --- site-src/guides/index.md | 46 ++++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/site-src/guides/index.md b/site-src/guides/index.md index df3d1760..bcd1068d 100644 --- a/site-src/guides/index.md +++ b/site-src/guides/index.md @@ -119,9 +119,9 @@ This quickstart guide is intended for engineers familiar with k8s and model serv 5. Given that the default connection timeout may be insufficient for most inference workloads, it is recommended to configure a timeout appropriate for your intended use case. - ```bash - kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml - ``` + ```bash + kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gcp-backend-policy.yaml + ``` === "Istio" @@ -269,10 +269,10 @@ This quickstart guide is intended for engineers familiar with k8s and model serv ### Cleanup - The following cleanup assumes you would like to clean ALL resources that were created in this quickstart guide. + The following instructions assume you would like to cleanup ALL resources that were created in this quickstart guide. Please be careful not to delete resources you'd like to keep. - 1. Uninstall the Inference Pool + 1. 
Uninstall the InferencePool, InferenceModel, and model server resources ```bash kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/inferencepool-resources.yaml --ignore-not-found @@ -282,7 +282,7 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete secret hf-token --ignore-not-found ``` - 1. Uninstall the Gateway + 1. Uninstall the Gateway API resources ```bash kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/gke/gateway.yaml --ignore-not-found @@ -296,8 +296,40 @@ This quickstart guide is intended for engineers familiar with k8s and model serv kubectl delete -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/raw/main/config/manifests/gateway/kgateway/httproute.yaml --ignore-not-found ``` - 1. Uninstall the CRDs + 1. Uninstall the Gateway API Inference Extension CRDs ```bash kubectl delete -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd --ignore-not-found ``` + + 1. Choose one of the following options to cleanup the Inference Gateway. + +=== "GKE" + + **TODO** + +=== "Istio" + + **TODO** + +=== "Kgateway" + + The following instructions assume you would like to cleanup ALL Kgateway resources that were created in this quickstart guide. + + 1. Uninstall Kgateway + + ```bash + helm uninstall kgateway -n kgateway-system + ``` + + 1. Uninstall the Kgateway CRDs. + + ```bash + helm uninstall kgateway-crds -n kgateway-system + ``` + + 1. Remove the Kgateway namespace. + + ```bash + kubectl delete ns kgateway-system + ``` From 944d63cc204ea6fc54c2b2aca4cdbb7966da1fe4 Mon Sep 17 00:00:00 2001 From: Maxime Brunet Date: Thu, 17 Apr 2025 15:33:08 +0000 Subject: [PATCH 44/74] docs(gateways): fix Envoy AI Gateway link (#700) --- site-src/implementations/gateways.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/site-src/implementations/gateways.md b/site-src/implementations/gateways.md index d4e919be..c3e17acd 100644 --- a/site-src/implementations/gateways.md +++ b/site-src/implementations/gateways.md @@ -13,15 +13,15 @@ This project has several implementations that are planned or in progress: ## Envoy AI Gateway [Envoy AI Gateway][aigw-home] is an open source project built on top of -[Envoy][envoy-org] and [Envoy Gateway][aigw-gateway] to handle request traffic +[Envoy][envoy-org] and [Envoy Gateway][envoy-gateway] to handle request traffic from application clients to GenAI services. The features and capabilities are outlined [here][aigw-capabilities]. Use the [quickstart][aigw-quickstart] to get Envoy AI Gateway running with Gateway API in a few simple steps. Progress towards supporting this project is tracked with a [GitHub Issue](https://github.com/envoyproxy/ai-gateway/issues/423). 
-[aigw-home]:https://gateway.envoyproxy.io/ +[aigw-home]:https://aigateway.envoyproxy.io/ [envoy-org]:https://github.com/envoyproxy -[aigw-gateway]: https://gateway.envoyproxy.io/ +[envoy-gateway]: https://gateway.envoyproxy.io/ [aigw-capabilities]:https://aigateway.envoyproxy.io/docs/capabilities/ [aigw-quickstart]:https://aigateway.envoyproxy.io/docs/capabilities/gateway-api-inference-extension From 4d7738a37be1bcc29afaa907949f632c48496e0c Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Thu, 17 Apr 2025 18:49:07 +0300 Subject: [PATCH 45/74] minor changes in few places (#702) * minor changes in few places Signed-off-by: Nir Rozenbaum * removed empty labels field Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- config/manifests/inferencepool-resources.yaml | 3 ++- pkg/epp/controller/inferencepool_reconciler.go | 6 ++---- pkg/epp/controller/inferencepool_reconciler_test.go | 2 +- pkg/epp/server/controller_manager.go | 6 +++--- pkg/epp/server/runserver.go | 6 +----- site-src/implementations/gateways.md | 2 ++ 6 files changed, 11 insertions(+), 14 deletions(-) diff --git a/config/manifests/inferencepool-resources.yaml b/config/manifests/inferencepool-resources.yaml index 993b7bf6..3d978292 100644 --- a/config/manifests/inferencepool-resources.yaml +++ b/config/manifests/inferencepool-resources.yaml @@ -4,7 +4,6 @@ apiVersion: inference.networking.x-k8s.io/v1alpha2 kind: InferencePool metadata: - labels: name: vllm-llama3-8b-instruct spec: targetPortNumber: 8000 @@ -54,6 +53,8 @@ spec: args: - -poolName - "vllm-llama3-8b-instruct" + - "-poolNamespace" + - "default" - -v - "4" - --zap-encoder diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index c92d4ecc..0738181f 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -21,7 +21,6 @@ import ( "reflect" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -36,9 +35,8 @@ import ( // will have the proper controller that will create/manage objects on behalf of the server pool. type InferencePoolReconciler struct { client.Client - Record record.EventRecorder - PoolNamespacedName types.NamespacedName - Datastore datastore.Datastore + Record record.EventRecorder + Datastore datastore.Datastore } func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/pkg/epp/controller/inferencepool_reconciler_test.go b/pkg/epp/controller/inferencepool_reconciler_test.go index 7e5d4801..b7e28334 100644 --- a/pkg/epp/controller/inferencepool_reconciler_test.go +++ b/pkg/epp/controller/inferencepool_reconciler_test.go @@ -96,7 +96,7 @@ func TestInferencePoolReconciler(t *testing.T) { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) datastore := datastore.NewDatastore(ctx, pmf) - inferencePoolReconciler := &InferencePoolReconciler{PoolNamespacedName: namespacedName, Client: fakeClient, Datastore: datastore} + inferencePoolReconciler := &InferencePoolReconciler{Client: fakeClient, Datastore: datastore} // Step 1: Inception, only ready pods matching pool1 are added to the store. 
if _, err := inferencePoolReconciler.Reconcile(ctx, req); err != nil { diff --git a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go index aaad8976..ce5cfc89 100644 --- a/pkg/epp/server/controller_manager.go +++ b/pkg/epp/server/controller_manager.go @@ -39,8 +39,8 @@ func init() { utilruntime.Must(v1alpha2.Install(scheme)) } -// DefaultManagerOptions returns the default options used to create the manager. -func DefaultManagerOptions(namespace, name string) ctrl.Options { +// defaultManagerOptions returns the default options used to create the manager. +func defaultManagerOptions(namespace string, name string) ctrl.Options { return ctrl.Options{ Scheme: scheme, Cache: cache.Options{ @@ -71,7 +71,7 @@ func DefaultManagerOptions(namespace, name string) ctrl.Options { // NewDefaultManager creates a new controller manager with default configuration. func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { - manager, err := ctrl.NewManager(restConfig, DefaultManagerOptions(namespace, name)) + manager, err := ctrl.NewManager(restConfig, defaultManagerOptions(namespace, name)) if err != nil { return nil, fmt.Errorf("failed to create controller manager: %v", err) } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index aa048e6e..65a6e787 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -87,11 +87,7 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man if err := (&controller.InferencePoolReconciler{ Datastore: r.Datastore, Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferencePool"), + Record: mgr.GetEventRecorderFor("InferencePool"), }).SetupWithManager(mgr); err != nil { return fmt.Errorf("failed setting up InferencePoolReconciler: %w", err) } diff --git a/site-src/implementations/gateways.md b/site-src/implementations/gateways.md index c3e17acd..b44dca6f 100644 --- a/site-src/implementations/gateways.md +++ b/site-src/implementations/gateways.md @@ -5,10 +5,12 @@ This project has several implementations that are planned or in progress: * [Envoy AI Gateway][1] * [Kgateway][2] * [Google Kubernetes Engine][3] +* [Istio][4] [1]:#envoy-gateway [2]:#kgateway [3]:#google-kubernetes-engine +[4]:#istio ## Envoy AI Gateway From 8b9aef6b18d710ab6d17bc9c682e819de7156be4 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Thu, 17 Apr 2025 20:41:08 +0300 Subject: [PATCH 46/74] using namespaced name (#707) Signed-off-by: Nir Rozenbaum --- cmd/epp/main.go | 7 ++++++- pkg/epp/server/controller_manager.go | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index b9c7d6e4..b5e6fbe6 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -30,6 +30,7 @@ import ( "go.uber.org/zap/zapcore" "google.golang.org/grpc" healthPb "google.golang.org/grpc/health/grpc_health_v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/rest" "k8s.io/component-base/metrics/legacyregistry" ctrl "sigs.k8s.io/controller-runtime" @@ -140,7 +141,11 @@ func run() error { return err } - mgr, err := runserver.NewDefaultManager(*poolNamespace, *poolName, cfg) + poolNamespacedName := types.NamespacedName{ + Namespace: *poolNamespace, + Name: *poolName, + } + mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg) if err != nil { setupLog.Error(err, "Failed to create controller manager") return err diff --git 
a/pkg/epp/server/controller_manager.go b/pkg/epp/server/controller_manager.go index ce5cfc89..e5668210 100644 --- a/pkg/epp/server/controller_manager.go +++ b/pkg/epp/server/controller_manager.go @@ -22,6 +22,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -40,28 +41,28 @@ func init() { } // defaultManagerOptions returns the default options used to create the manager. -func defaultManagerOptions(namespace string, name string) ctrl.Options { +func defaultManagerOptions(namespacedName types.NamespacedName) ctrl.Options { return ctrl.Options{ Scheme: scheme, Cache: cache.Options{ ByObject: map[client.Object]cache.ByObject{ &corev1.Pod{}: { Namespaces: map[string]cache.Config{ - namespace: {}, + namespacedName.Namespace: {}, }, }, &v1alpha2.InferencePool{}: { Namespaces: map[string]cache.Config{ - namespace: { + namespacedName.Namespace: { FieldSelector: fields.SelectorFromSet(fields.Set{ - "metadata.name": name, + "metadata.name": namespacedName.Name, }), }, }, }, &v1alpha2.InferenceModel{}: { Namespaces: map[string]cache.Config{ - namespace: {}, + namespacedName.Namespace: {}, }, }, }, @@ -70,8 +71,8 @@ func defaultManagerOptions(namespace string, name string) ctrl.Options { } // NewDefaultManager creates a new controller manager with default configuration. -func NewDefaultManager(namespace, name string, restConfig *rest.Config) (ctrl.Manager, error) { - manager, err := ctrl.NewManager(restConfig, defaultManagerOptions(namespace, name)) +func NewDefaultManager(namespacedName types.NamespacedName, restConfig *rest.Config) (ctrl.Manager, error) { + manager, err := ctrl.NewManager(restConfig, defaultManagerOptions(namespacedName)) if err != nil { return nil, fmt.Errorf("failed to create controller manager: %v", err) } From c54650602b6a2599846787f8c139995dbbe62560 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 21 Apr 2025 12:01:01 -0700 Subject: [PATCH 47/74] EPP Architecture proposal (#683) * initial changes * Adding to proposal to give a quick barebones definition to refactor * feedback changes * more feedback addressing --- .../00x-epp-compliance-proposal/README.md | 99 +++++++++++++++++++ .../images/epp_arch.svg | 1 + 2 files changed, 100 insertions(+) create mode 100644 docs/proposals/00x-epp-compliance-proposal/README.md create mode 100644 docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg diff --git a/docs/proposals/00x-epp-compliance-proposal/README.md b/docs/proposals/00x-epp-compliance-proposal/README.md new file mode 100644 index 00000000..48c7720f --- /dev/null +++ b/docs/proposals/00x-epp-compliance-proposal/README.md @@ -0,0 +1,99 @@ +# Gateway API Inference Extension + +Author(s): @kfswain +## Proposal Status + ***Draft*** + +## Table of Contents + + + +- [Summary](#summary) +- [Goals](#goals) +- [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Personas](#personas) + - [Inference Platform Admin](#inference-platform-admin) + - [Inference Workload Owner](#workload-owner) + - [Axioms](#axioms) + - [InferencePool](#inferencepool) + - [InferenceModel](#inferencemodel) + - [Spec](#spec) + - [Diagrams](#diagrams) + - [Alternatives](#alternatives) +- [Open Questions](#open-questions) + + + +## Summary + +This proposal seeks to standardize the implementation of an EPP (End-point Picker) for the Inference Gateway extension (also known as Gateway 
API Inference Extension). Additionally, this proposes to restructure the current implementation of the EPP to be more modular and approachable.
+
+## Goals
+
+- Set a standard on how the EPP & APIs interact
+- Settle on common nomenclature for clearer communication
+- Allow for modularization of the EPP, to be extended to a user's specific needs
+
+## Non-Goals
+
+- Reshaping the current API
+- A change in scope of the current project
+
+## Proposal
+
+This proposal is not proposing any net new features; instead, we are refactoring our current implementation to better handle more devs, more features, etc. At the time of writing, GIE is currently at v0.3, and that stronger experimental context (along with external feedback) made clear the need for this restructure. The image below gives a high-level view of how our components work together.
+
+![Scheduling Algorithm](./images/epp_arch.svg)
+
+## Overview
+At a quick glance, the EPP is being broken into specific layers. The `Data Layer` is of note, as it is a vertical that will be accessed by all the others. The data layer manages the k8s data, metric & usage data, as well as processing of the above data to determine resource scarcity regimes.
+
+The other layers are handled in a sequential process, starting with the **Ext-Proc** call. The request is buffered and then sent to the **Routing Layer**, which first processes any user-defined per-InferenceModel routing rules & request enrichment (at the time of writing, that is just translating the InferenceModel name to a weight-split actual model). Then _all_ requests pass through the to-be-implemented [**Flow Controller**](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674) to ensure that any request entering the pool adheres to the guidelines set by the Priority, Fairness, & Queueing configuration. And finally, the **Scheduling Layer** is the load balancing algorithm that intelligently routes requests based on the current state of the InferencePool.
+
+## Components
+
+To further expand upon these component layers, we will first break them into `extensible` and `non-extensible` layers. `Non-extensible` layers are intended to be static, and handled on behalf of the user, typically implementing low-opinion infrastructure.
+
+The `Extensible` layers are:
+- Data Layer
+- Routing Layer
+- Flow Controller
+- Scheduling Layer
+
+The `Non-Extensible` layer(s) are:
+- The Ext-Proc Server
+
+### `Extensible`
+
+#### Data Layer
+
+The data layer will consume and store the InferencePool/InferenceModel config and the metrics defined by the pre-defined [Model Server Protocol](../003-model-server-protocol/README.md). Additionally, the data fed from the model servers will be processed and digested to provide resource scarcity regime hints and autoscaling recommendations.
+
+Many extensions to scheduling will require changes to ingested metrics; as such, the data layer will be built to be extended, but extenders accept that the Model Server Protocol will no longer provide guarantees on portability of a model server out of the box.
+
+#### Routing Layer
+
+The routing layer is likely to be the most opinion-heavy section, as the scope of what constitutes a 'Route Rule' is somewhat broad. The current examples we expect would be:
+
+- System Prompt injection
+- RAG callout
+- Per-InferenceModel request validation (such as safety/on-topic, etc)
+
+Due to the possibility of this becoming a bit of a dumping ground, the API will keep a _very_ tight scope on which of these route rules are included in the spec; as a reference point, a sketch of one possible hook shape follows.
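+
+To make the intended shape concrete, below is a minimal, purely illustrative sketch of what such a route-rule hook could look like. The `RouteRule` interface, the `PreRoute` method, and the package name are hypothetical names invented for this sketch (`LLMRequest` is the existing request type in the EPP's scheduling package); the point is simply that a rule sees the request before flow control and scheduling, and may enrich or reject it:
+
+```go
+package routing
+
+import (
+	"context"
+
+	types "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
+)
+
+// RouteRule is a hypothetical Routing Layer extension point.
+type RouteRule interface {
+	// Name identifies the rule for logging and metrics.
+	Name() string
+	// PreRoute may enrich the request (e.g. system prompt injection) or
+	// return an error to reject it (e.g. per-InferenceModel safety
+	// validation) before the request enters the pool.
+	PreRoute(ctx context.Context, req *types.LLMRequest) error
+}
+```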
+A standard method of extension will be provided if the need to define a custom rule arises.
+
+#### Flow Controller (WIP - implementation tracked in [#674](https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/674))
+
+The flow controller will consume resource regime data and enforce proper resource sharing between workloads. This will primarily be done through a queuing mechanism [as described here](https://docs.google.com/document/d/1VZL7opFWuwgWquvgiOzLlXAJ633qZ9U-A0ZixGjBgaI/edit?usp=sharing).
+
+#### Scheduling Layer
+
+As the Scheduling Layer is the final interface to the entirety of the pool, all configuration will be at the _pool_ level. The default scheduling layer will be an experimentally-backed LB algorithm, with exposed config values.
+
+The Scheduler will define a strong interface API, so that new scheduling algos may be plugged & dark-launched to test in production traffic without impacting said traffic. Extension is expected to adhere to the [Scheduler Subsystem definition](https://github.com/kubernetes-sigs/gateway-api-inference-extension/pull/603).
+
+### `Non-extensible`
+
+#### Ext-Proc Server
+
+The Ext-Proc Server protocol is very well defined & specific; deviation could cause the EPP to become unusable or unstable. Extension is ill-advised.
diff --git a/docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg b/docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg
new file mode 100644
index 00000000..4c585728
--- /dev/null
+++ b/docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
From c618e1f42ff73bbfaefb86ab74ea2971e21ca892 Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Tue, 22 Apr 2025 18:49:41 +0300
Subject: [PATCH 48/74] removed unused Fake struct (#723)

Signed-off-by: Nir Rozenbaum
---
 pkg/epp/backend/metrics/fake.go | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go
index 7fd4970d..ec97c6de 100644
--- a/pkg/epp/backend/metrics/fake.go
+++ b/pkg/epp/backend/metrics/fake.go
@@ -24,7 +24,6 @@ import (
 corev1 "k8s.io/api/core/v1"
 "k8s.io/apimachinery/pkg/types"
 "sigs.k8s.io/controller-runtime/pkg/log"
- "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
 logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
@@ -84,11 +83,3 @@ func (f *FakePodMetricsClient) SetErr(new map[types.NamespacedName]error) {
 defer f.errMu.Unlock()
 f.Err = new
 }
-
-type FakeDataStore struct {
- Res map[string]*v1alpha2.InferenceModel
-}
-
-func (fds *FakeDataStore) FetchModelData(modelName string) (returnModel *v1alpha2.InferenceModel) {
- return fds.Res[modelName]
-}
From 9114b35d859c44fae9d9139f03d228e2b0748413 Mon Sep 17 00:00:00 2001
From: John Howard
Date: Tue, 22 Apr 2025 14:59:40 -0700
Subject: [PATCH 49/74] epp: return correct response for trailers (#726)

This looks like a copy-paste error.
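
Concretely, both trailer branches sent `r.reqHeaderResp` instead of the
matching trailer response, so a request or response that carried trailers
would get a duplicate header response rather than `r.reqTrailerResp` /
`r.respTrailerResp` (see the diff below).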
--- pkg/epp/handlers/server.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 7bb0fcb1..f97e9ede 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -325,7 +325,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces } if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil { // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { + if err := srv.Send(r.reqTrailerResp); err != nil { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } @@ -351,7 +351,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces } if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil { // Trailers in requests are not guaranteed - if err := srv.Send(r.reqHeaderResp); err != nil { + if err := srv.Send(r.respTrailerResp); err != nil { return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err) } } From 45209f6bb93710c8a9fabc0c9f183dad0e2e94e0 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Tue, 22 Apr 2025 15:15:41 -0700 Subject: [PATCH 50/74] Refactor scheduler to run plugins (#677) * Refactor scheduler to run plugins * Add scheduler plugin latency metric * Address comments * Address comments --- pkg/epp/backend/metrics/types.go | 6 + pkg/epp/handlers/request.go | 9 +- pkg/epp/handlers/server.go | 2 +- pkg/epp/metrics/metrics.go | 22 ++ pkg/epp/metrics/metrics_test.go | 64 ++++ ...heduler_plugin_processing_latencies_metric | 67 ++++ pkg/epp/scheduling/config/config.go | 58 +++ pkg/epp/scheduling/{ => plugins}/filter.go | 144 ++++---- .../scheduling/{ => plugins}/filter_test.go | 91 ++--- pkg/epp/scheduling/plugins/noop.go | 38 ++ pkg/epp/scheduling/plugins/picker.go | 37 ++ pkg/epp/scheduling/scheduler.go | 236 ++++++++----- pkg/epp/scheduling/scheduler_test.go | 331 ++++++++++++++++-- pkg/epp/scheduling/types/interfaces.go | 75 ++++ pkg/epp/scheduling/types/types.go | 35 +- 15 files changed, 969 insertions(+), 246 deletions(-) create mode 100644 pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric create mode 100644 pkg/epp/scheduling/config/config.go rename pkg/epp/scheduling/{ => plugins}/filter.go (60%) rename pkg/epp/scheduling/{ => plugins}/filter_test.go (82%) create mode 100644 pkg/epp/scheduling/plugins/noop.go create mode 100644 pkg/epp/scheduling/plugins/picker.go create mode 100644 pkg/epp/scheduling/types/interfaces.go diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index 925a0cc5..21c0f401 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -79,6 +79,9 @@ func (p *Pod) String() string { } func (p *Pod) Clone() *Pod { + if p == nil { + return nil + } return &Pod{ NamespacedName: types.NamespacedName{ Name: p.NamespacedName.Name, @@ -118,6 +121,9 @@ func (m *Metrics) String() string { } func (m *Metrics) Clone() *Metrics { + if m == nil { + return nil + } cm := make(map[string]int, len(m.ActiveModels)) for k, v := range m.ActiveModels { cm[k] = v diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 44537923..9121b59a 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -67,7 +67,7 @@ func (s *StreamingServer) HandleRequestBody( ResolvedTargetModel: modelName, Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, } - 
logger.V(logutil.DEBUG).Info("LLM request assembled", "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "critical", llmReq.Critical) + logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq) var err error // Update target models in the body. @@ -81,11 +81,11 @@ func (s *StreamingServer) HandleRequestBody( return reqCtx, errutil.Error{Code: errutil.Internal, Msg: fmt.Sprintf("error marshaling request body: %v", err)} } - target, err := s.scheduler.Schedule(ctx, llmReq) + res, err := s.scheduler.Schedule(ctx, llmReq) if err != nil { return reqCtx, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: fmt.Errorf("failed to find target pod: %w", err).Error()} } - targetPod := target.GetPod() + targetPod := res.TargetPod.GetPod() // Insert target endpoint to instruct Envoy to route requests to the specified target pod. // Attach the port number @@ -96,8 +96,7 @@ func (s *StreamingServer) HandleRequestBody( endpoint := targetPod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber)) logger.V(logutil.DEFAULT).Info("Request handled", - "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod, "endpoint metrics", - fmt.Sprintf("%+v", target)) + "model", llmReq.Model, "targetModel", llmReq.ResolvedTargetModel, "endpoint", targetPod) reqCtx.Model = llmReq.Model reqCtx.ResolvedTargetModel = llmReq.ResolvedTargetModel diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index f97e9ede..2e3a35fe 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -65,7 +65,7 @@ type StreamingServer struct { } type Scheduler interface { - Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (targetPod schedulingtypes.Pod, err error) + Schedule(ctx context.Context, b *schedulingtypes.LLMRequest) (result *schedulingtypes.Result, err error) } // RequestContext stores context information during the life time of an HTTP request. diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index b474df36..56dcfca8 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -30,6 +30,7 @@ import ( const ( InferenceModelComponent = "inference_model" InferencePoolComponent = "inference_pool" + EPPComponent = "endpoint_picker" ) var ( @@ -176,6 +177,20 @@ var ( }, []string{"name"}, ) + + // Scheduler Plugin Metrics + SchedulerPluginProcessingLatencies = compbasemetrics.NewHistogramVec( + &compbasemetrics.HistogramOpts{ + Subsystem: EPPComponent, + Name: "scheduler_plugin_duration_seconds", + Help: "Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name.", + Buckets: []float64{ + 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, + }, + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"plugin_type", "plugin_name"}, + ) ) var registerMetrics sync.Once @@ -196,6 +211,8 @@ func Register() { legacyregistry.MustRegister(inferencePoolAvgKVCache) legacyregistry.MustRegister(inferencePoolAvgQueueSize) legacyregistry.MustRegister(inferencePoolReadyPods) + + legacyregistry.MustRegister(SchedulerPluginProcessingLatencies) }) } @@ -293,3 +310,8 @@ func RecordInferencePoolAvgQueueSize(name string, queueSize float64) { func RecordinferencePoolReadyPods(name string, runningPods float64) { inferencePoolReadyPods.WithLabelValues(name).Set(runningPods) } + +// RecordSchedulerPluginProcessingLatency records the processing latency for a scheduler plugin. 
+func RecordSchedulerPluginProcessingLatency(pluginType, pluginName string, duration time.Duration) { + SchedulerPluginProcessingLatencies.WithLabelValues(pluginType, pluginName).Observe(duration.Seconds()) +} diff --git a/pkg/epp/metrics/metrics_test.go b/pkg/epp/metrics/metrics_test.go index b5f19e6d..81797e6d 100644 --- a/pkg/epp/metrics/metrics_test.go +++ b/pkg/epp/metrics/metrics_test.go @@ -556,3 +556,67 @@ func TestInferencePoolMetrics(t *testing.T) { }) } } + +func TestSchedulerPluginProcessingLatencies(t *testing.T) { + type pluginLatency struct { + pluginType string + pluginName string + duration time.Duration + } + scenarios := []struct { + name string + latencies []pluginLatency + }{ + { + name: "multiple plugins", + latencies: []pluginLatency{ + { + pluginType: "PreSchedule", + pluginName: "PluginA", + duration: 100 * time.Millisecond, + }, + { + pluginType: "PostSchedule", + pluginName: "PluginB", + duration: 200 * time.Millisecond, + }, + { + pluginType: "Filter", + pluginName: "PluginC", + duration: 50 * time.Millisecond, + }, + { + pluginType: "Scorer", + pluginName: "PluginD", + duration: 10 * time.Millisecond, + }, + { + pluginType: "Picker", + pluginName: "PluginE", + duration: 10 * time.Microsecond, + }, + }, + }, + } + Register() + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + for _, latency := range scenario.latencies { + RecordSchedulerPluginProcessingLatency(latency.pluginType, latency.pluginName, latency.duration) + } + + wantPluginLatencies, err := os.Open("testdata/scheduler_plugin_processing_latencies_metric") + defer func() { + if err := wantPluginLatencies.Close(); err != nil { + t.Error(err) + } + }() + if err != nil { + t.Fatal(err) + } + if err := testutil.GatherAndCompare(legacyregistry.DefaultGatherer, wantPluginLatencies, "endpoint_picker_scheduler_plugin_processing_latencies"); err != nil { + t.Error(err) + } + }) + } +} diff --git a/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric b/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric new file mode 100644 index 00000000..8c11757f --- /dev/null +++ b/pkg/epp/metrics/testdata/scheduler_plugin_processing_latencies_metric @@ -0,0 +1,67 @@ +# HELP endpoint_picker_scheduler_plugin_duration_seconds [ALPHA] Scheduler plugin processing latency distribution in seconds for each plugin type and plugin name. 
+# TYPE endpoint_picker_scheduler_plugin_duration_seconds histogram +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.0001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.0002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.0005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.01"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.02"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.05"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="0.1"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginA",plugin_type="PreSchedule",le="+Inf"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_sum{plugin_name="PluginA",plugin_type="PreSchedule"} 0.1 +endpoint_picker_scheduler_plugin_duration_seconds_count{plugin_name="PluginA",plugin_type="PreSchedule"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.0005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.01"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.02"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.05"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="0.1"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginB",plugin_type="PostSchedule",le="+Inf"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_sum{plugin_name="PluginB",plugin_type="PostSchedule"} 0.2 +endpoint_picker_scheduler_plugin_duration_seconds_count{plugin_name="PluginB",plugin_type="PostSchedule"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.0005"} 0 
+endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.01"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.02"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.05"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="0.1"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginC",plugin_type="Filter",le="+Inf"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_sum{plugin_name="PluginC",plugin_type="Filter"} 0.05 +endpoint_picker_scheduler_plugin_duration_seconds_count{plugin_name="PluginC",plugin_type="Filter"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.0005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.001"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.002"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.005"} 0 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.01"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.02"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.05"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="0.1"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginD",plugin_type="Scorer",le="+Inf"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_sum{plugin_name="PluginD",plugin_type="Scorer"} 0.01 +endpoint_picker_scheduler_plugin_duration_seconds_count{plugin_name="PluginD",plugin_type="Scorer"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0001"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0002"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.0005"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.001"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.002"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.005"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.01"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.02"} 1 
+endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.05"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="0.1"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_bucket{plugin_name="PluginE",plugin_type="Picker",le="+Inf"} 1 +endpoint_picker_scheduler_plugin_duration_seconds_sum{plugin_name="PluginE",plugin_type="Picker"} 1e-05 +endpoint_picker_scheduler_plugin_duration_seconds_count{plugin_name="PluginE",plugin_type="Picker"} 1 diff --git a/pkg/epp/scheduling/config/config.go b/pkg/epp/scheduling/config/config.go new file mode 100644 index 00000000..e00b82ae --- /dev/null +++ b/pkg/epp/scheduling/config/config.go @@ -0,0 +1,58 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package config + +import ( + "sigs.k8s.io/controller-runtime/pkg/log" + envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +// Config holds all the configuration values for the scheduler +type Config struct { + KVCacheThreshold float64 + QueueThresholdCritical int + QueueingThresholdLoRA int + LoraAffinityThreshold float64 +} + +const ( + // Default values to use if environment variables are not set + defaultKVCacheThreshold = 0.8 + defaultQueueThresholdCritical = 5 + defaultQueueingThresholdLoRA = 128 + defaultLoraAffinityThreshold = 0.999 +) + +// LoadConfig loads configuration from environment variables +func LoadConfig() Config { + // Use a default logger for initial configuration loading + baseLogger := log.Log.WithName("scheduling-config") + + config := Config{ + KVCacheThreshold: envutil.GetEnvFloat("KV_CACHE_THRESHOLD", defaultKVCacheThreshold, baseLogger), + QueueThresholdCritical: envutil.GetEnvInt("QUEUE_THRESHOLD_CRITICAL", defaultQueueThresholdCritical, baseLogger), + QueueingThresholdLoRA: envutil.GetEnvInt("QUEUING_THRESHOLD_LORA", defaultQueueingThresholdLoRA, baseLogger), + LoraAffinityThreshold: envutil.GetEnvFloat("LORA_AFFINITY_THRESHOLD", defaultLoraAffinityThreshold, baseLogger), + } + + baseLogger.V(logutil.DEFAULT).Info("Scheduler configuration loaded", "config", config) + + return config +} + +var Conf = LoadConfig() diff --git a/pkg/epp/scheduling/filter.go b/pkg/epp/scheduling/plugins/filter.go similarity index 60% rename from pkg/epp/scheduling/filter.go rename to pkg/epp/scheduling/plugins/filter.go index 99044e97..efcb6be1 100644 --- a/pkg/epp/scheduling/filter.go +++ b/pkg/epp/scheduling/plugins/filter.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package scheduling +package plugins import ( "errors" @@ -22,83 +22,80 @@ import ( "math/rand" "time" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -type Filter interface { - Name() string - Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) -} - -type basicFilter struct { +type Filter struct { name string filter filterFunc } -func (bf *basicFilter) Name() string { +func (bf *Filter) Name() string { if bf == nil { return "nil" } return bf.name } -func (bf *basicFilter) Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { +func (bf *Filter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { loggerTrace := ctx.Logger.V(logutil.TRACE) loggerTrace.Info("Running a filter", "name", bf.Name(), "podCount", len(pods)) return bf.filter(ctx, pods) } -// decisionTreeFilter applies current filterFunc, and then recursively applies next filters +// DecisionTreeFilter applies current filterFunc, and then recursively applies next filters // depending success or failure of the current filter. // It can be used to construct a flow chart algorithm. -type decisionTreeFilter struct { - current Filter - // nextOnSuccess filter will be applied after successfully applying the current filter. +type DecisionTreeFilter struct { + Current types.Filter + // NextOnSuccess filter will be applied after successfully applying the current filter. // The filtered results will be passed to the next filter. - nextOnSuccess Filter - // nextOnFailure filter will be applied if current filter fails. + NextOnSuccess types.Filter + // NextOnFailure filter will be applied if current filter fails. // The original input will be passed to the next filter. - nextOnFailure Filter - // nextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the + NextOnFailure types.Filter + // NextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the // success or failure of the current filter. - // NOTE: When using nextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. + // NOTE: When using NextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of - // nextOnSuccessOrFailure, in the success and failure scenarios, respectively. - nextOnSuccessOrFailure Filter + // NextOnSuccessOrFailure, in the success and failure scenarios, respectively. 
+ NextOnSuccessOrFailure types.Filter } -func (f *decisionTreeFilter) Name() string { +func (f *DecisionTreeFilter) Name() string { if f == nil { return "nil" } - return f.current.Name() + return f.Current.Name() } -func (f *decisionTreeFilter) Filter(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { +func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { loggerTrace := ctx.Logger.V(logutil.TRACE) - filtered, err := f.current.Filter(ctx, pods) + filtered, err := f.Current.Filter(ctx, pods) - next := f.nextOnSuccessOrFailure + next := f.NextOnSuccessOrFailure if err == nil && len(filtered) > 0 { - if f.nextOnSuccess == nil && f.nextOnSuccessOrFailure == nil { + if f.NextOnSuccess == nil && f.NextOnSuccessOrFailure == nil { // No succeeding filters to run, return. return filtered, err } - if f.nextOnSuccess != nil { - next = f.nextOnSuccess + if f.NextOnSuccess != nil { + next = f.NextOnSuccess } loggerTrace.Info("Filter succeeded", "filter", f.Name(), "next", next.Name(), "filteredPodCount", len(filtered)) // On success, pass the filtered result to the next filter. return next.Filter(ctx, filtered) } else { - if f.nextOnFailure == nil && f.nextOnSuccessOrFailure == nil { + if f.NextOnFailure == nil && f.NextOnSuccessOrFailure == nil { // No succeeding filters to run, return. return filtered, err } - if f.nextOnFailure != nil { - next = f.nextOnFailure + if f.NextOnFailure != nil { + next = f.NextOnFailure } loggerTrace.Info("Filter failed", "filter", f.Name(), "next", next.Name()) // On failure, pass the initial set of pods to the next filter. @@ -107,12 +104,12 @@ func (f *decisionTreeFilter) Filter(ctx *types.Context, pods []*types.PodMetrics } // filterFunc filters a set of input pods to a subset. -type filterFunc func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) +type filterFunc func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. func toFilterFunc(pp podPredicate) filterFunc { - return func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { - filtered := []*types.PodMetrics{} + return func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + filtered := []types.Pod{} for _, pod := range pods { pass := pp(ctx.Req, pod) if pass { @@ -126,7 +123,7 @@ func toFilterFunc(pp podPredicate) filterFunc { } } -var leastQueueFilter = &basicFilter{ +var LeastQueueFilter = &Filter{ name: "least queuing", filter: leastQueuingFilterFunc, } @@ -138,34 +135,34 @@ var leastQueueFilter = &basicFilter{ // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. 
-func leastQueuingFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { +func leastQueuingFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { min := math.MaxInt max := 0 - filtered := []*types.PodMetrics{} + filtered := []types.Pod{} for _, pod := range pods { - if pod.WaitingQueueSize <= min { - min = pod.WaitingQueueSize + if pod.GetMetrics().WaitingQueueSize <= min { + min = pod.GetMetrics().WaitingQueueSize } - if pod.WaitingQueueSize >= max { - max = pod.WaitingQueueSize + if pod.GetMetrics().WaitingQueueSize >= max { + max = pod.GetMetrics().WaitingQueueSize } } for _, pod := range pods { - if pod.WaitingQueueSize >= min && pod.WaitingQueueSize <= min+(max-min)/len(pods) { + if pod.GetMetrics().WaitingQueueSize >= min && pod.GetMetrics().WaitingQueueSize <= min+(max-min)/len(pods) { filtered = append(filtered, pod) } } return filtered, nil } -var lowQueueFilter = &basicFilter{ +var LowQueueFilter = &Filter{ name: "low queueing filter", - filter: toFilterFunc((queueThresholdPredicate(config.QueueingThresholdLoRA))), + filter: toFilterFunc((queueThresholdPredicate(config.Conf.QueueingThresholdLoRA))), } -var leastKVCacheFilter = &basicFilter{ +var LeastKVCacheFilter = &Filter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, } @@ -176,29 +173,29 @@ var leastKVCacheFilter = &basicFilter{ // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. // TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { +func leastKVCacheFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { min := math.MaxFloat64 var max float64 = 0 - filtered := []*types.PodMetrics{} + filtered := []types.Pod{} for _, pod := range pods { - if pod.KVCacheUsagePercent <= min { - min = pod.KVCacheUsagePercent + if pod.GetMetrics().KVCacheUsagePercent <= min { + min = pod.GetMetrics().KVCacheUsagePercent } - if pod.KVCacheUsagePercent >= max { - max = pod.KVCacheUsagePercent + if pod.GetMetrics().KVCacheUsagePercent >= max { + max = pod.GetMetrics().KVCacheUsagePercent } } for _, pod := range pods { - if pod.KVCacheUsagePercent >= min && pod.KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { + if pod.GetMetrics().KVCacheUsagePercent >= min && pod.GetMetrics().KVCacheUsagePercent <= min+(max-min)/float64(len(pods)) { filtered = append(filtered, pod) } } return filtered, nil } -var loRAAffinityFilter = &basicFilter{ +var LoRAAffinityFilter = &Filter{ name: "affinity LoRA", filter: loRASoftAffinityFilterFunc, } @@ -219,20 +216,20 @@ var loRAAffinityFilter = &basicFilter{ // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { +func loRASoftAffinityFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { // Pre-allocate slices with estimated capacity - filtered_affinity := make([]*types.PodMetrics, 0, len(pods)) - filtered_available := make([]*types.PodMetrics, 0, len(pods)) + filtered_affinity := make([]types.Pod, 0, len(pods)) + filtered_available := make([]types.Pod, 0, len(pods)) // Categorize pods based on affinity and availability for _, pod := range pods { - _, active := 
pod.ActiveModels[ctx.Req.ResolvedTargetModel] - _, waiting := pod.WaitingModels[ctx.Req.ResolvedTargetModel] + _, active := pod.GetMetrics().ActiveModels[ctx.Req.ResolvedTargetModel] + _, waiting := pod.GetMetrics().WaitingModels[ctx.Req.ResolvedTargetModel] if active || waiting { filtered_affinity = append(filtered_affinity, pod) - } else if len(pod.ActiveModels)+len(pod.WaitingModels) < pod.MaxActiveModels { + } else if len(pod.GetMetrics().ActiveModels)+len(pod.GetMetrics().WaitingModels) < pod.GetMetrics().MaxActiveModels { filtered_available = append(filtered_available, pod) } } @@ -243,7 +240,7 @@ func loRASoftAffinityFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([ // If both groups have pods, use probability to select which group to return if len(filtered_affinity) > 0 && len(filtered_available) > 0 { - if randGen.Float64() < config.LoraAffinityThreshold { + if randGen.Float64() < config.Conf.LoraAffinityThreshold { return filtered_affinity, nil } return filtered_available, nil @@ -257,23 +254,38 @@ func loRASoftAffinityFilterFunc(ctx *types.Context, pods []*types.PodMetrics) ([ return filtered_available, nil } +var HasCapacityFilter = &Filter{ + name: "has capacity for sheddable requests", + filter: toFilterFunc(queueThresholdPredicate(config.Conf.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.Conf.KVCacheThreshold))), +} + +var DropRequestFilter = &Filter{ + name: "drop request", + filter: func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + ctx.Logger.V(logutil.DEFAULT).Info("Request dropped", "request", ctx.Req) + return []types.Pod{}, errutil.Error{ + Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", + } + }, +} + // podPredicate is a filter function to check whether a pod is desired. -type podPredicate func(req *types.LLMRequest, pod *types.PodMetrics) bool +type podPredicate func(req *types.LLMRequest, pod types.Pod) bool func queueThresholdPredicate(queueThreshold int) podPredicate { - return func(req *types.LLMRequest, pod *types.PodMetrics) bool { - return pod.WaitingQueueSize <= queueThreshold + return func(req *types.LLMRequest, pod types.Pod) bool { + return pod.GetMetrics().WaitingQueueSize <= queueThreshold } } func kvCacheThresholdPredicate(kvCacheThreshold float64) podPredicate { - return func(req *types.LLMRequest, pod *types.PodMetrics) bool { - return pod.KVCacheUsagePercent <= kvCacheThreshold + return func(req *types.LLMRequest, pod types.Pod) bool { + return pod.GetMetrics().KVCacheUsagePercent <= kvCacheThreshold } } func (pp podPredicate) and(another podPredicate) podPredicate { - return func(req *types.LLMRequest, pod *types.PodMetrics) bool { + return func(req *types.LLMRequest, pod types.Pod) bool { return pp(req, pod) && another(req, pod) } } diff --git a/pkg/epp/scheduling/filter_test.go b/pkg/epp/scheduling/plugins/filter_test.go similarity index 82% rename from pkg/epp/scheduling/filter_test.go rename to pkg/epp/scheduling/plugins/filter_test.go index 543826d0..107b423f 100644 --- a/pkg/epp/scheduling/filter_test.go +++ b/pkg/epp/scheduling/plugins/filter_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package scheduling +package plugins import ( "context" @@ -24,6 +24,7 @@ import ( "github.com/google/go-cmp/cmp" k8stypes "k8s.io/apimachinery/pkg/types" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) @@ -31,17 +32,17 @@ func TestFilter(t *testing.T) { tests := []struct { name string req *types.LLMRequest - input []*types.PodMetrics - output []*types.PodMetrics + input []types.Pod + output []types.Pod err bool - filter *decisionTreeFilter + filter *DecisionTreeFilter }{ { name: "simple filter without successor, failure", - filter: &decisionTreeFilter{ - current: &basicFilter{ + filter: &DecisionTreeFilter{ + Current: &Filter{ name: "error", - filter: func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { + filter: func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { return nil, errors.New("filter error") }, }, @@ -58,7 +59,8 @@ func TestFilter(t *testing.T) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got); diff != "" { + opt := cmp.AllowUnexported(types.PodMetrics{}) + if diff := cmp.Diff(test.output, got, opt); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -70,43 +72,43 @@ func TestFilterFunc(t *testing.T) { name string f filterFunc req *types.LLMRequest - input []*types.PodMetrics - output []*types.PodMetrics + input []types.Pod + output []types.Pod err bool }{ { name: "least queuing empty input", f: leastQueuingFilterFunc, - input: []*types.PodMetrics{}, - output: []*types.PodMetrics{}, + input: []types.Pod{}, + output: []types.Pod{}, }, { name: "least queuing", f: leastQueuingFilterFunc, - input: []*types.PodMetrics{ - { + input: []types.Pod{ + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, }, }, }, - output: []*types.PodMetrics{ - { + output: []types.Pod{ + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, }, @@ -116,36 +118,36 @@ func TestFilterFunc(t *testing.T) { { name: "least kv cache empty input", f: leastKVCacheFilterFunc, - input: []*types.PodMetrics{}, - output: []*types.PodMetrics{}, + input: []types.Pod{}, + output: []types.Pod{}, }, { name: "least kv cache", f: leastKVCacheFilterFunc, - input: []*types.PodMetrics{ - { + input: []types.Pod{ + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0.3, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 1.0, }, }, }, - output: []*types.PodMetrics{ - { + output: []types.Pod{ + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0, }, }, - { + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ KVCacheUsagePercent: 0.3, }, @@ -155,22 +157,22 @@ func TestFilterFunc(t *testing.T) { { name: "lowQueueAndLessThanKVCacheThresholdPredicate", f: toFilterFunc(queueThresholdPredicate(0).and(kvCacheThresholdPredicate(0.8))), - input: []*types.PodMetrics{ - { + input: []types.Pod{ + &types.PodMetrics{ // This pod should be returned. 
Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, }, }, - { + &types.PodMetrics{ // Queue is non zero, despite low kv cache, should not return. Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 1, KVCacheUsagePercent: 0.3, }, }, - { + &types.PodMetrics{ // High kv cache despite zero queue, should not return Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, @@ -178,8 +180,8 @@ func TestFilterFunc(t *testing.T) { }, }, }, - output: []*types.PodMetrics{ - { + output: []types.Pod{ + &types.PodMetrics{ Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0, @@ -197,7 +199,8 @@ func TestFilterFunc(t *testing.T) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got); diff != "" { + opt := cmp.AllowUnexported(types.PodMetrics{}) + if diff := cmp.Diff(test.output, got, opt); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -215,15 +218,15 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { ) // Save original config value to restore later - originalThreshold := config.LoraAffinityThreshold + originalThreshold := config.Conf.LoraAffinityThreshold // Set a specific test value for this test testThreshold := 0.75 // 75% - config.LoraAffinityThreshold = testThreshold + config.Conf.LoraAffinityThreshold = testThreshold // Ensure we restore the original threshold when test completes defer func() { - config.LoraAffinityThreshold = originalThreshold + config.Conf.LoraAffinityThreshold = originalThreshold }() // Create a test request and pods @@ -233,8 +236,8 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { } // Test setup: One affinity pod and one available pod - pods := []*types.PodMetrics{ - { + pods := []types.Pod{ + &types.PodMetrics{ Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "affinity-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, @@ -243,7 +246,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }, }, }, - { + &types.PodMetrics{ Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "available-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, @@ -258,7 +261,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { availableCount := 0 // Use the test threshold value - expectedAffinityPercent := config.LoraAffinityThreshold * 100 + expectedAffinityPercent := config.Conf.LoraAffinityThreshold * 100 expectedAvailabilityPercent := 100 - expectedAffinityPercent for i := 0; i < numIterations; i++ { @@ -292,8 +295,8 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { availableUpperBound := expectedAvailabilityPercent + tolerancePercent t.Logf("Distribution results over %d iterations:", numIterations) - t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, config.LoraAffinityThreshold) - t.Logf("Expected availability percent: %.2f%% (threshold: %.2f)", expectedAvailabilityPercent, config.LoraAffinityThreshold) + t.Logf("Expected affinity percent: %.2f%% (threshold: %.2f)", expectedAffinityPercent, config.Conf.LoraAffinityThreshold) + t.Logf("Expected availability percent: %.2f%% (threshold: %.2f)", expectedAvailabilityPercent, config.Conf.LoraAffinityThreshold) t.Logf("Actual affinity percent: %.2f%% (%d out of %d)", actualAffinityPercent, affinityCount, numIterations) t.Logf("Actual available percent: %.2f%% (%d out of %d)", actualAvailablePercent, availableCount, numIterations) diff --git a/pkg/epp/scheduling/plugins/noop.go 
b/pkg/epp/scheduling/plugins/noop.go new file mode 100644 index 00000000..1abcb95b --- /dev/null +++ b/pkg/epp/scheduling/plugins/noop.go @@ -0,0 +1,38 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package plugins + +import ( + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" +) + +// NoopPlugin provides a default, no-operation implementation of the Plugin interface. +// It can be embedded in other plugin implementations to avoid boilerplate code for +// unused methods. +type NoopPlugin struct{} + +func (p *NoopPlugin) Name() string { return "NoopPlugin" } + +func (p *NoopPlugin) Score(ctx *types.Context, pod types.Pod) (float64, error) { return 0.0, nil } + +func (p *NoopPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + return pods, nil +} + +func (p *NoopPlugin) PreSchedule(ctx *types.Context) {} + +func (p *NoopPlugin) PostSchedule(ctx *types.Context, res *types.Result) {} diff --git a/pkg/epp/scheduling/plugins/picker.go b/pkg/epp/scheduling/plugins/picker.go new file mode 100644 index 00000000..569e4e86 --- /dev/null +++ b/pkg/epp/scheduling/plugins/picker.go @@ -0,0 +1,37 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package plugins + +import ( + "fmt" + "math/rand" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +type RandomPicker struct{} + +func (rp *RandomPicker) Name() string { + return "random" +} + +func (rp *RandomPicker) Pick(ctx *types.Context, pods []types.Pod) (*types.Result, error) { + ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(pods), pods)) + i := rand.Intn(len(pods)) + return &types.Result{TargetPod: pods[i]}, nil +} diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 8679ffba..7cc2bd96 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -20,113 +20,71 @@ package scheduling import ( "context" "fmt" - "math/rand" + "time" "sigs.k8s.io/controller-runtime/pkg/log" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" - envutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/env" - errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) -// Config holds all the configuration values for the scheduler -type Config struct { - KVCacheThreshold float64 - QueueThresholdCritical int - QueueingThresholdLoRA int - LoraAffinityThreshold float64 -} - -const ( - // Default values to use if environment variables are not set - defaultKVCacheThreshold = 0.8 - defaultQueueThresholdCritical = 5 - defaultQueueingThresholdLoRA = 128 - defaultLoraAffinityThreshold = 0.999 -) - -// LoadConfig loads configuration from environment variables -func LoadConfig() Config { - // Use a default logger for initial configuration loading - baseLogger := log.Log.WithName("scheduling-config") - - config := Config{ - KVCacheThreshold: envutil.GetEnvFloat("KV_CACHE_THRESHOLD", defaultKVCacheThreshold, baseLogger), - QueueThresholdCritical: envutil.GetEnvInt("QUEUE_THRESHOLD_CRITICAL", defaultQueueThresholdCritical, baseLogger), - QueueingThresholdLoRA: envutil.GetEnvInt("QUEUING_THRESHOLD_LORA", defaultQueueingThresholdLoRA, baseLogger), - LoraAffinityThreshold: envutil.GetEnvFloat("LORA_AFFINITY_THRESHOLD", defaultLoraAffinityThreshold, baseLogger), - } - - baseLogger.V(logutil.DEFAULT).Info("Scheduler configuration loaded", "config", config) - - return config -} - -var config = LoadConfig() - var ( - lowLatencyFilter = &decisionTreeFilter{ - current: lowQueueFilter, - nextOnSuccess: &decisionTreeFilter{ - current: loRAAffinityFilter, - nextOnSuccessOrFailure: &decisionTreeFilter{ - current: leastQueueFilter, - nextOnSuccessOrFailure: &decisionTreeFilter{ - current: leastKVCacheFilter, + lowLatencyFilter = &plugins.DecisionTreeFilter{ + Current: plugins.LowQueueFilter, + NextOnSuccess: &plugins.DecisionTreeFilter{ + Current: plugins.LoRAAffinityFilter, + NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ + Current: plugins.LeastQueueFilter, + NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ + Current: plugins.LeastKVCacheFilter, }, }, }, - nextOnFailure: &decisionTreeFilter{ - current: leastQueueFilter, - nextOnSuccessOrFailure: &decisionTreeFilter{ - current: loRAAffinityFilter, - nextOnSuccessOrFailure: &decisionTreeFilter{ - current: 
leastKVCacheFilter, + NextOnFailure: &plugins.DecisionTreeFilter{ + Current: plugins.LeastQueueFilter, + NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ + Current: plugins.LoRAAffinityFilter, + NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ + Current: plugins.LeastKVCacheFilter, }, }, }, } - sheddableRequestFilter = &decisionTreeFilter{ + sheddableRequestFilter = &plugins.DecisionTreeFilter{ // When there is at least one model server that's not queuing requests, and still has KV // cache below a certain threshold, we consider this model server has capacity to handle // a sheddable request without impacting critical requests. - current: hasCapacityFilter, - nextOnSuccess: lowLatencyFilter, + Current: plugins.HasCapacityFilter, + NextOnSuccess: lowLatencyFilter, // If all pods are queuing or running above the KVCache threshold, we drop the sheddable // request to make room for critical requests. - nextOnFailure: dropRequestFilter, - } - - hasCapacityFilter = &basicFilter{ - name: "has capacity for sheddable requests", - filter: toFilterFunc(queueThresholdPredicate(config.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.KVCacheThreshold))), - } - - dropRequestFilter = &basicFilter{ - name: "drop request", - filter: func(ctx *types.Context, pods []*types.PodMetrics) ([]*types.PodMetrics, error) { - ctx.Logger.V(logutil.DEFAULT).Info("Request dropped", "request", ctx.Req) - return []*types.PodMetrics{}, errutil.Error{ - Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", - } - }, + NextOnFailure: plugins.DropRequestFilter, } ) func NewScheduler(datastore Datastore) *Scheduler { + defaultPlugin := &defaultPlugin{} + return &Scheduler{ - datastore: datastore, - criticalRequestFilter: lowLatencyFilter, - sheddableRequestFilter: sheddableRequestFilter, + datastore: datastore, + preSchedulePlugins: []types.PreSchedule{}, + postSchedulePlugins: []types.PostSchedule{}, + scorers: []types.Scorer{}, + filters: []types.Filter{defaultPlugin}, + picker: defaultPlugin, } } type Scheduler struct { - datastore Datastore - criticalRequestFilter Filter - sheddableRequestFilter Filter + datastore Datastore + preSchedulePlugins []types.PreSchedule + postSchedulePlugins []types.PostSchedule + filters []types.Filter + scorers []types.Scorer + picker types.Picker } type Datastore interface { @@ -134,27 +92,125 @@ type Datastore interface { } // Schedule finds the target pod based on metrics and the requested lora adapter. -func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (targetPod types.Pod, err error) { +func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types.Result, error) { logger := log.FromContext(ctx).WithValues("request", req) + loggerDebug := logger.V(logutil.DEBUG) // Snapshot pod metrics from the datastore to: // 1. Reduce concurrent access to the datastore. // 2. Ensure consistent data during the scheduling operation of a request. sCtx := types.NewContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll())) - logger.V(logutil.DEBUG).Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", sCtx.PodsSnapshot)) + loggerDebug.Info(fmt.Sprintf("Scheduling a request. 
Metrics: %+v", sCtx.PodsSnapshot)) - var filter Filter - if req.Critical { - filter = s.criticalRequestFilter - } else { - filter = s.sheddableRequestFilter + s.runPreSchedulePlugins(sCtx) + + pods, err := s.runFilterPlugins(sCtx) + if err != nil { + return nil, err + } + + if err := s.runScorerPlugins(sCtx, pods); err != nil { + return nil, err + } + + before := time.Now() + res, err := s.picker.Pick(sCtx, pods) + metrics.RecordSchedulerPluginProcessingLatency(types.PickerPluginType, s.picker.Name(), time.Since(before)) + if err != nil { + return nil, err } + loggerDebug.Info("After running picker plugins", "result", res) - pods, err := filter.Filter(sCtx, sCtx.PodsSnapshot) - if err != nil || len(pods) == 0 { - return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(pods), err) + s.runPostSchedulePlugins(sCtx, res) + + return res, nil +} + +func (s *Scheduler) runPreSchedulePlugins(ctx *types.Context) { + for _, plugin := range s.preSchedulePlugins { + ctx.Logger.V(logutil.DEBUG).Info("Running pre-schedule plugin", "plugin", plugin.Name()) + before := time.Now() + plugin.PreSchedule(ctx) + metrics.RecordSchedulerPluginProcessingLatency(types.PreSchedulerPluginType, plugin.Name(), time.Since(before)) + } +} + +func (s *Scheduler) runPostSchedulePlugins(ctx *types.Context, res *types.Result) { + for _, plugin := range s.postSchedulePlugins { + ctx.Logger.V(logutil.DEBUG).Info("Running post-schedule plugin", "plugin", plugin.Name()) + before := time.Now() + plugin.PostSchedule(ctx, res) + metrics.RecordSchedulerPluginProcessingLatency(types.PostSchedulePluginType, plugin.Name(), time.Since(before)) + } +} + +func (s *Scheduler) runFilterPlugins(ctx *types.Context) ([]types.Pod, error) { + loggerDebug := ctx.Logger.V(logutil.DEBUG) + pods := ctx.PodsSnapshot + loggerDebug.Info("Before running filter plugins", "pods", pods) + for _, filter := range s.filters { + loggerDebug.Info("Running filter plugin", "plugin", filter.Name()) + before := time.Now() + filteredPods, err := filter.Filter(ctx, pods) + metrics.RecordSchedulerPluginProcessingLatency(types.FilterPluginType, filter.Name(), time.Since(before)) + if err != nil || len(filteredPods) == 0 { + return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(filteredPods), err) + } + pods = filteredPods + loggerDebug.Info("Filter plugin result", "plugin", filter.Name(), "pods", pods) + } + loggerDebug.Info("After running filter plugins", "pods", pods) + return pods, nil +} + +func (s *Scheduler) runScorerPlugins(ctx *types.Context, pods []types.Pod) error { + loggerDebug := ctx.Logger.V(logutil.DEBUG) + loggerDebug.Info("Before running score plugins", "pods", pods) + for _, pod := range pods { + score, err := runScorersForPod(ctx, s.scorers, pod) + if err != nil { + return err + } + pod.SetScore(score) + } + loggerDebug.Info("After running score plugins", "pods", pods) + return nil +} + +// Iterate through each scorer in the chain and accumulate the scores. 
+func runScorersForPod(ctx *types.Context, scorers []types.Scorer, pod types.Pod) (float64, error) { + logger := ctx.Logger.WithValues("pod", pod.GetPod().NamespacedName).V(logutil.DEBUG) + score := float64(0) + for _, scorer := range scorers { + logger.Info("Running scorer", "scorer", scorer.Name()) + before := time.Now() + oneScore, err := scorer.Score(ctx, pod) + metrics.RecordSchedulerPluginProcessingLatency(types.ScorerPluginType, scorer.Name(), time.Since(before)) + if err != nil { + logger.Error(err, "Failed to calculate score for scorer", "scorer", scorer.Name()) + return 0, err + } + score += oneScore + logger.Info("After scorer", "scorer", scorer.Name(), "score", oneScore, "total score", score) + } + return score, nil +} + +type defaultPlugin struct { + plugins.RandomPicker +} + +func (p *defaultPlugin) Name() string { + return "DefaultPlugin" +} + +func (p *defaultPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + req := ctx.Req + var filter types.Filter + if req.Critical { + filter = lowLatencyFilter + } else { + filter = sheddableRequestFilter } - logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(pods), pods)) - i := rand.Intn(len(pods)) - return pods[i], nil + return filter.Filter(ctx, pods) } diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index 3fd3fb24..5a2265bf 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -18,22 +18,34 @@ package scheduling import ( "context" + "errors" "testing" "github.com/google/go-cmp/cmp" k8stypes "k8s.io/apimachinery/pkg/types" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) +// Tests the default scheduler configuration and expected behavior. 
func TestSchedule(t *testing.T) { tests := []struct { - name string - req *types.LLMRequest - input []*backendmetrics.FakePodMetrics - output types.Pod - err bool + name string + req *types.LLMRequest + input []*backendmetrics.FakePodMetrics + wantRes *types.Result + err bool }{ + { + name: "no pods in datastore", + req: &types.LLMRequest{ + Model: "any-model", + ResolvedTargetModel: "any-model", + Critical: true, + }, + input: []*backendmetrics.FakePodMetrics{}, + err: true, + }, { name: "critical request", req: &types.LLMRequest{ @@ -80,17 +92,19 @@ func TestSchedule(t *testing.T) { }, }, }, - output: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, + wantRes: &types.Result{ + TargetPod: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + WaitingModels: map[string]int{}, }, - WaitingModels: map[string]int{}, }, }, }, @@ -139,17 +153,19 @@ func TestSchedule(t *testing.T) { }, }, }, - output: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + wantRes: &types.Result{ + TargetPod: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + WaitingModels: map[string]int{}, }, - WaitingModels: map[string]int{}, }, }, }, @@ -199,8 +215,8 @@ func TestSchedule(t *testing.T) { }, }, }, - output: nil, - err: true, + wantRes: nil, + err: true, }, } @@ -212,13 +228,205 @@ func TestSchedule(t *testing.T) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - if diff := cmp.Diff(test.output, got); diff != "" { + opt := cmp.AllowUnexported(types.PodMetrics{}) + if diff := cmp.Diff(test.wantRes, got, opt); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) } } +func TestSchedulePlugins(t *testing.T) { + tp1 := &TestPlugin{ + NameRes: "test1", + ScoreRes: 0.3, + FilterRes: []k8stypes.NamespacedName{{Name: "pod1"}, {Name: "pod2"}, {Name: "pod3"}}, + } + tp2 := &TestPlugin{ + NameRes: "test2", + ScoreRes: 0.8, + FilterRes: []k8stypes.NamespacedName{{Name: "pod1"}, {Name: "pod2"}}, + } + tpFilterErr := &TestPlugin{ + NameRes: "filter err", + FilterErr: errors.New("filter error"), + } + tpScorerErr := &TestPlugin{ + NameRes: "score err", + ScoreErr: errors.New("score err"), + } + pickerPlugin := &TestPlugin{ + NameRes: "picker", + PickRes: k8stypes.NamespacedName{Name: "pod1"}, + } + pickerErr := &TestPlugin{ + NameRes: "picker err", + PickErr: errors.New("picker err"), + } + + tests := []struct { + name string + preSchedulePlugins []types.PreSchedule + postSchedulePlugins []types.PostSchedule + filters []types.Filter + scorers []types.Scorer + picker types.Picker + input []*backendmetrics.FakePodMetrics + wantTargetPod k8stypes.NamespacedName + targetPodScore float64 + // Number of expected pods to score (after 
filter) + numPodsToScore int + err bool + }{ + { + name: "all plugins executed successfully", + preSchedulePlugins: []types.PreSchedule{tp1, tp2}, + postSchedulePlugins: []types.PostSchedule{tp1, tp2}, + filters: []types.Filter{tp1, tp2}, + scorers: []types.Scorer{tp1, tp2}, + picker: pickerPlugin, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + }, + wantTargetPod: k8stypes.NamespacedName{Name: "pod1"}, + targetPodScore: 1.1, + numPodsToScore: 2, + err: false, + }, + { + name: "filter error", + preSchedulePlugins: []types.PreSchedule{tp1, tp2}, + postSchedulePlugins: []types.PostSchedule{tp1, tp2}, + filters: []types.Filter{tp1, tpFilterErr}, + scorers: []types.Scorer{tp1, tp2}, + picker: pickerPlugin, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + }, + err: true, + }, + { + name: "scorer error", + preSchedulePlugins: []types.PreSchedule{tp1, tp2}, + postSchedulePlugins: []types.PostSchedule{tp1, tp2}, + filters: []types.Filter{tp1, tp2}, + scorers: []types.Scorer{tp1, tpScorerErr}, + picker: pickerPlugin, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + }, + err: true, + }, + { + name: "picker error", + preSchedulePlugins: []types.PreSchedule{tp1, tp2}, + postSchedulePlugins: []types.PostSchedule{tp1, tp2}, + filters: []types.Filter{tp1, tp2}, + scorers: []types.Scorer{tp1, tp2}, + picker: pickerErr, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + }, + err: true, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Reset all plugins before each new test case. 
+ for _, plugin := range test.preSchedulePlugins { + plugin.(*TestPlugin).Reset() + } + for _, plugin := range test.postSchedulePlugins { + plugin.(*TestPlugin).Reset() + } + for _, plugin := range test.filters { + plugin.(*TestPlugin).Reset() + } + for _, plugin := range test.scorers { + plugin.(*TestPlugin).Reset() + } + test.picker.(*TestPlugin).Reset() + + // Initialize the scheduler + scheduler := &Scheduler{ + datastore: &fakeDataStore{pods: test.input}, + preSchedulePlugins: test.preSchedulePlugins, + postSchedulePlugins: test.postSchedulePlugins, + filters: test.filters, + scorers: test.scorers, + picker: test.picker, + } + + req := &types.LLMRequest{Model: "test-model"} + got, err := scheduler.Schedule(context.Background(), req) + + // Validate error state + if test.err != (err != nil) { + t.Fatalf("Unexpected error, got %v, want %v", err, test.err) + } + + if err != nil { + return + } + + // Validate output + opt := cmp.AllowUnexported(types.PodMetrics{}) + wantPod := &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: test.wantTargetPod}, + } + wantPod.SetScore(test.targetPodScore) + wantRes := &types.Result{TargetPod: wantPod} + if diff := cmp.Diff(wantRes, got, opt); diff != "" { + t.Errorf("Unexpected output (-want +got): %v", diff) + } + + // Validate plugin execution counts dynamically + for _, plugin := range test.preSchedulePlugins { + tp, _ := plugin.(*TestPlugin) + if tp.PreScheduleCallCount != 1 { + t.Errorf("Plugin %s PreSchedule() called %d times, expected 1", tp.NameRes, tp.PreScheduleCallCount) + } + } + + for _, plugin := range test.postSchedulePlugins { + tp, _ := plugin.(*TestPlugin) + if tp.PostScheduleCallCount != 1 { + t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", tp.NameRes, tp.PostScheduleCallCount) + } + } + + for _, plugin := range test.filters { + tp, _ := plugin.(*TestPlugin) + if tp.FilterCallCount != 1 { + t.Errorf("Plugin %s Filter() called %d times, expected 1", tp.NameRes, tp.FilterCallCount) + } + } + + for _, plugin := range test.scorers { + tp, _ := plugin.(*TestPlugin) + if tp.ScoreCallCount != test.numPodsToScore { + t.Errorf("Plugin %s Score() called %d times, expected 1", tp.NameRes, tp.ScoreCallCount) + } + } + + tp, _ := test.picker.(*TestPlugin) + if tp.PickCallCount != 1 { + t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.NameRes, tp.PickCallCount) + } + + }) + } +} + type fakeDataStore struct { pods []*backendmetrics.FakePodMetrics } @@ -230,3 +438,68 @@ func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics { } return pm } + +// TestPlugin is an implementation useful in unit tests. 
+type TestPlugin struct { + NameRes string + ScoreCallCount int + ScoreRes float64 + ScoreErr error + FilterCallCount int + FilterRes []k8stypes.NamespacedName + FilterErr error + PreScheduleCallCount int + PostScheduleCallCount int + PickCallCount int + PickRes k8stypes.NamespacedName + PickErr error +} + +func (tp *TestPlugin) Name() string { return tp.NameRes } + +func (tp *TestPlugin) Score(ctx *types.Context, pod types.Pod) (float64, error) { + tp.ScoreCallCount++ + return tp.ScoreRes, tp.ScoreErr +} + +func (tp *TestPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + tp.FilterCallCount++ + return findPods(ctx, tp.FilterRes...), tp.FilterErr +} + +func (tp *TestPlugin) PreSchedule(ctx *types.Context) { + tp.PreScheduleCallCount++ +} + +func (tp *TestPlugin) PostSchedule(ctx *types.Context, res *types.Result) { + tp.PostScheduleCallCount++ +} + +func (tp *TestPlugin) Pick(ctx *types.Context, pods []types.Pod) (*types.Result, error) { + tp.PickCallCount++ + if tp.PickErr != nil { + return nil, tp.PickErr + } + pod := findPods(ctx, tp.PickRes)[0] + return &types.Result{TargetPod: pod}, nil +} + +func (tp *TestPlugin) Reset() { + tp.PreScheduleCallCount = 0 + tp.PostScheduleCallCount = 0 + tp.FilterCallCount = 0 + tp.ScoreCallCount = 0 + tp.PickCallCount = 0 +} + +func findPods(ctx *types.Context, names ...k8stypes.NamespacedName) []types.Pod { + res := []types.Pod{} + for _, pod := range ctx.PodsSnapshot { + for _, name := range names { + if pod.GetPod().NamespacedName.String() == name.String() { + res = append(res, pod) + } + } + } + return res +} diff --git a/pkg/epp/scheduling/types/interfaces.go b/pkg/epp/scheduling/types/interfaces.go new file mode 100644 index 00000000..6e954cef --- /dev/null +++ b/pkg/epp/scheduling/types/interfaces.go @@ -0,0 +1,75 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package types + +import ( + backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" +) + +const ( + PreSchedulerPluginType = "PreSchedule" + PostSchedulePluginType = "PostSchedule" + FilterPluginType = "Filter" + ScorerPluginType = "Scorer" + PickerPluginType = "Picker" +) + +type Pod interface { + GetPod() *backendmetrics.Pod + GetMetrics() *backendmetrics.Metrics + SetScore(float64) + Score() float64 + String() string +} + +// Plugin defines the interface for scheduler plugins, combining scoring, filtering, +// and event handling capabilities. +type Plugin interface { + // Name returns the name of the plugin. + Name() string +} + +// PreSchedule is called when the scheduler receives a new request. It can be used for various +// initialization work. +type PreSchedule interface { + Plugin + PreSchedule(ctx *Context) +} + +// PostSchedule is called by the scheduler after it selects a targetPod for the request. +type PostSchedule interface { + Plugin + PostSchedule(ctx *Context, res *Result) +} + +// Filter defines the interface for filtering a list of pods based on context. 
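+// A filter receives the current set of candidate pods and returns the subset that remains eligible for scheduling.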
+type Filter interface { + Plugin + Filter(ctx *Context, pods []Pod) ([]Pod, error) +} + +// Scorer defines the interface for scoring pods based on context. +type Scorer interface { + Plugin + Score(ctx *Context, pod Pod) (float64, error) +} + +// Picker picks the final pod(s) to send the request to. +type Picker interface { + Plugin + Pick(ctx *Context, pods []Pod) (*Result, error) +} diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index 9450652e..e52e9047 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -30,23 +30,22 @@ type LLMRequest struct { Model string // Target models is a map of target model name to weight. TargetModels map[string]int + Prompt string // Resolved target model is the final target model after traffic split. ResolvedTargetModel string Critical bool } +func (r *LLMRequest) String() string { + return fmt.Sprintf("Model: %s, TargetModels: %v, ResolvedTargetModel: %s, Critical: %t, PromptLength: %v", r.Model, r.TargetModels, r.ResolvedTargetModel, r.Critical, len(r.Prompt)) +} + // Context holds contextual information during a scheduling operation. type Context struct { context.Context Logger logr.Logger Req *LLMRequest - PodsSnapshot []*PodMetrics -} - -type Pod interface { - GetPod() *backendmetrics.Pod - GetMetrics() *backendmetrics.Metrics - String() string + PodsSnapshot []Pod } func (pm *PodMetrics) String() string { @@ -64,12 +63,21 @@ func (pm *PodMetrics) GetMetrics() *backendmetrics.Metrics { return pm.Metrics } +func (pm *PodMetrics) SetScore(score float64) { + pm.score = score +} + +func (pm *PodMetrics) Score() float64 { + return pm.score +} + type PodMetrics struct { + score float64 *backendmetrics.Pod *backendmetrics.Metrics } -func NewContext(ctx context.Context, req *LLMRequest, pods []*PodMetrics) *Context { +func NewContext(ctx context.Context, req *LLMRequest, pods []Pod) *Context { logger := log.FromContext(ctx).WithValues("request", req) return &Context{ Context: ctx, @@ -79,10 +87,15 @@ func NewContext(ctx context.Context, req *LLMRequest, pods []*PodMetrics) *Conte } } -func ToSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []*PodMetrics { - pm := make([]*PodMetrics, 0, len(pods)) +func ToSchedulerPodMetrics(pods []backendmetrics.PodMetrics) []Pod { + pm := make([]Pod, 0, len(pods)) for _, pod := range pods { - pm = append(pm, &PodMetrics{pod.GetPod().Clone(), pod.GetMetrics().Clone()}) + pm = append(pm, &PodMetrics{Pod: pod.GetPod().Clone(), Metrics: pod.GetMetrics().Clone()}) } return pm } + +// Result captures the scheduler result. 
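+// For now it carries only the target pod that was picked for the request.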
+type Result struct { + TargetPod Pod +} From 7d238dd720303393c31138db8501225e86c77233 Mon Sep 17 00:00:00 2001 From: Nicole Xin Date: Tue, 22 Apr 2025 17:51:42 -0700 Subject: [PATCH 51/74] Complete the InferencePool documentation (#673) * Initial guide for inference pool * Add extensionReference to the InferencePool spec * Fix list formatting * Remove unused labels * Autogenerate the spec * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Rename llm-pool names in rollout example * Add use cases for replacing an inference pool * Rewording the background section * Create replacing-inference-pool.md * Replace instructions with a link for how to replace an inference pool * Update replacing-inference-pool.md * Update mkdocs.yml * Update replacing-inference-pool.md * Update inferencemodel_types.go * Update inferencepool.md * Update site-src/guides/replacing-inference-pool.md Co-authored-by: Rob Scott --------- Co-authored-by: Rob Scott --- api/v1alpha2/inferencemodel_types.go | 2 +- mkdocs.yml | 1 + site-src/api-types/inferencepool.md | 58 +++- site-src/guides/replacing-inference-pool.md | 59 ++++ site-src/reference/spec.md | 288 +++++++++++++++++--- 5 files changed, 352 insertions(+), 56 deletions(-) create mode 100644 site-src/guides/replacing-inference-pool.md diff --git a/api/v1alpha2/inferencemodel_types.go b/api/v1alpha2/inferencemodel_types.go index 052683d8..7cd98a74 100644 --- a/api/v1alpha2/inferencemodel_types.go +++ b/api/v1alpha2/inferencemodel_types.go @@ -126,7 +126,7 @@ type PoolObjectReference struct { } // Criticality defines how important it is to serve the model compared to other models. -// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default. +// Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional (use a pointer), and set no default. // This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. // +kubebuilder:validation:Enum=Critical;Standard;Sheddable type Criticality string diff --git a/mkdocs.yml b/mkdocs.yml index bdfffe05..e5927ed5 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -63,6 +63,7 @@ nav: - Getting started: guides/index.md - Adapter Rollout: guides/adapter-rollout.md - Metrics: guides/metrics.md + - Replacing an Inference Pool: guides/replacing-inference-pool.md - Implementer's Guide: guides/implementers.md - Performance: - Benchmark: performance/benchmark/index.md diff --git a/site-src/api-types/inferencepool.md b/site-src/api-types/inferencepool.md index baa604b6..1494d314 100644 --- a/site-src/api-types/inferencepool.md +++ b/site-src/api-types/inferencepool.md @@ -7,28 +7,56 @@ ## Background -The InferencePool resource is a logical grouping of compute resources, e.g. Pods, that run model servers. The InferencePool would deploy its own routing, and offer administrative configuration to the Platform Admin. 
+The **InferencePool** API defines a group of Pods (containers) dedicated to serving AI models. Pods within an InferencePool share the same compute configuration, accelerator type, base language model, and model server. This abstraction simplifies the management of AI model serving resources, providing a centralized point of administrative configuration for Platform Admins. -It is expected for the InferencePool to: +An InferencePool is expected to be bundled with an [Endpoint Picker](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) extension. This extension is responsible for tracking key metrics on each model server (e.g. KV-cache utilization, queue length of pending requests, and active LoRA adapters) and routing incoming inference requests to the optimal model server replica based on these metrics. An EPP can only be associated with a single InferencePool. The associated InferencePool is specified by the [poolName](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/manifests/inferencepool-resources.yaml#L54) and [poolNamespace](https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/manifests/inferencepool-resources.yaml#L56) flags. An HTTPRoute may have multiple backendRefs that reference the same InferencePool (and therefore route to the same EPP), or backendRefs that reference different InferencePools (and therefore route to different EPPs). - - Enforce fair consumption of resources across competing workloads - - Efficiently route requests across shared compute (as displayed by the PoC) - -It is _not_ expected for the InferencePool to: +Additionally, any Pod that seeks to join an InferencePool would need to support the [model server protocol](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/docs/proposals/003-model-server-protocol), defined by this project, to ensure the Endpoint Picker has adequate information to intelligently route requests. - - Enforce any common set of adapters or base models are available on the Pods - - Manage Deployments of Pods within the Pool - - Manage Pod lifecycle of pods within the pool +## How to Configure an InferencePool -Additionally, any Pod that seeks to join an InferencePool would need to support a protocol, defined by this project, to ensure the Pool has adequate information to intelligently route requests. +The full spec of the InferencePool is defined [here](/reference/spec/#inferencepool). -`InferencePool` has some small overlap with `Service`, displayed here: +In summary, the InferencePoolSpec consists of 3 major parts: + +- The `selector` field specifies which Pods belong to this pool. The labels in this selector must exactly match the labels applied to your model server Pods. +- The `targetPortNumber` field defines the port number that the Inference Gateway should route to on model server Pods that belong to this pool. +- The `extensionRef` field references the [endpoint picker extension](https://github.com/kubernetes-sigs/gateway-api-inference-extension/tree/main/pkg/epp) (EPP) service that monitors key metrics from model servers within the InferencePool and provides intelligent routing decisions, as sketched below.
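+
+For reference, the EPP Deployment is associated with its InferencePool through command-line flags. A minimal sketch of the relevant container args is shown below, based on the example manifest linked above; the flag values are illustrative and must match your own pool:
+
+```
+# Hypothetical excerpt from an EPP Deployment spec; adjust names to your setup.
+args:
+- -poolName
+- "vllm-llama3-8b-instruct"
+- -poolNamespace
+- "default"
+```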
+ +### Example Configuration + +Here is an example InferencePool configuration: + +``` +apiVersion: inference.networking.x-k8s.io/v1alpha2 +kind: InferencePool +metadata: + name: vllm-llama3-8b-instruct +spec: + targetPortNumber: 8000 + selector: + app: vllm-llama3-8b-instruct + extensionRef: + name: vllm-llama3-8b-instruct-epp + port: 9002 + failureMode: FailClose +``` + +In this example: + +- An InferencePool named `vllm-llama3-8b-instruct` is created in the `default` namespace. +- It will select Pods that have the label `app: vllm-llama3-8b-instruct`. +- Traffic routed to this InferencePool will call out to the EPP service `vllm-llama3-8b-instruct-epp` on port `9002` for routing decisions. If the EPP fails to pick an endpoint or is not responsive, the request will be dropped. +- Traffic routed to this InferencePool will be forwarded to port `8000` on the selected Pods. + +## Overlap with Service + +**InferencePool** has some small overlap with **Service**, displayed here: Comparing InferencePool with Service -The InferencePool is _not_ intended to be a mask of the Service object, simply exposing the absolute bare minimum required to allow the Platform Admin to focus less on networking, and more on Pool management. - -## Spec +The InferencePool is not intended to be a mask of the Service object. It provides a specialized abstraction tailored for managing and routing traffic to groups of LLM model servers, allowing Platform Admins to focus on pool-level management rather than low-level networking details. -The full spec of the InferencePool is defined [here](/reference/spec/#inferencepool). \ No newline at end of file +## Replacing an InferencePool +Please refer to the [Replacing an InferencePool](/guides/replacing-inference-pool) guide for details on use cases and how to replace an InferencePool. diff --git a/site-src/guides/replacing-inference-pool.md b/site-src/guides/replacing-inference-pool.md new file mode 100644 index 00000000..21294570 --- /dev/null +++ b/site-src/guides/replacing-inference-pool.md @@ -0,0 +1,59 @@ +# Replacing an InferencePool + +## Background + +Replacing an InferencePool is a powerful technique for performing various infrastructure and model updates with minimal disruption and built-in rollback capabilities. This method allows you to introduce changes incrementally, monitor their impact, and revert to the previous state if necessary. + +## Use Cases +Common use cases for replacing an InferencePool include: + +- Upgrading or replacing your model server framework +- Upgrading or replacing your base model +- Transitioning to new hardware + +## How to replace an InferencePool + +To replace an InferencePool: + +1. **Deploy new infrastructure**: Create a new InferencePool configured with the new hardware / model server / base model that you chose. +1. **Configure traffic splitting**: Use an HTTPRoute to split traffic between the existing InferencePool and the new InferencePool. The `backendRefs.weight` field controls the traffic percentage allocated to each pool. +1. **Maintain InferenceModel integrity**: Keep your InferenceModel configuration unchanged. This ensures that the system applies the same LoRA adapters consistently across both base model versions. +1. **Preserve rollback capability**: Retain the original nodes and InferencePool during the rollout to facilitate a rollback if necessary. + +### Example + +You start with an existing InferencePool named `llm-pool-v1`. To replace the original InferencePool, you create a new InferencePool named `llm-pool-v2`.
By configuring an **HTTPRoute**, as shown below, you can incrementally split traffic between the original `llm-pool-v1` and new `llm-pool-v2`. + +1. Save the following sample manifest as `httproute.yaml`: + + ```yaml + apiVersion: gateway.networking.k8s.io/v1 + kind: HTTPRoute + metadata: + name: llm-route + spec: + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: inference-gateway + rules: + backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: llm-pool-v1 + weight: 90 + - group: inference.networking.x-k8s.io + kind: InferencePool + name: llm-pool-v2 + weight: 10 + ``` + +1. Apply the sample manifest to your cluster: + + ``` + kubectl apply -f httproute.yaml + ``` + + The original `llm-pool-v1` InferencePool receives most of the traffic, while the `llm-pool-v2` InferencePool receives the rest. + +1. Increase the traffic weight gradually for the `llm-pool-v2` InferencePool to complete the new InferencePool roll out. diff --git a/site-src/reference/spec.md b/site-src/reference/spec.md index e16c113c..d8e0c95b 100644 --- a/site-src/reference/spec.md +++ b/site-src/reference/spec.md @@ -1,12 +1,14 @@ # API Reference ## Packages -- [inference.networking.x-k8s.io/v1alpha1](#inferencenetworkingx-k8siov1alpha1) +- [inference.networking.x-k8s.io/v1alpha2](#inferencenetworkingx-k8siov1alpha2) -## inference.networking.x-k8s.io/v1alpha1 +## inference.networking.x-k8s.io/v1alpha2 + +Package v1alpha2 contains API Schema definitions for the +inference.networking.x-k8s.io API group. -Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API group ### Resource Types - [InferenceModel](#inferencemodel) @@ -18,26 +20,152 @@ Package v1alpha1 contains API Schema definitions for the gateway v1alpha1 API gr _Underlying type:_ _string_ -Defines how important it is to serve the model compared to other models. +Criticality defines how important it is to serve the model compared to other models. +Criticality is intentionally a bounded enum to contain the possibilities that need to be supported by the load balancing algorithm. Any reference to the Criticality field must be optional(use a pointer), and set no default. +This allows us to union this with a oneOf field in the future should we wish to adjust/extend this behavior. _Validation:_ -- Enum: [Critical Default Sheddable] +- Enum: [Critical Standard Sheddable] _Appears in:_ - [InferenceModelSpec](#inferencemodelspec) | Field | Description | | --- | --- | -| `Critical` | Most important. Requests to this band will be shed last.
| -| `Default` | More important than Sheddable, less important than Critical.
Requests in this band will be shed before critical traffic.
+kubebuilder:default=Default
| -| `Sheddable` | Least important. Requests to this band will be shed before all other bands.
| +| `Critical` | Critical defines the highest level of criticality. Requests to this band will be shed last.
| +| `Standard` | Standard defines the base criticality level and is more important than Sheddable but less
important than Critical. Requests in this band will be shed before critical traffic.
Most models are expected to fall within this band.
| +| `Sheddable` | Sheddable defines the lowest level of criticality. Requests to this band will be shed before
all other bands.
| + + +#### EndpointPickerConfig + + + +EndpointPickerConfig specifies the configuration needed by the proxy to discover and connect to the endpoint picker extension. +This type is intended to be a union of mutually exclusive configuration options that we may add in the future. + + + +_Appears in:_ +- [InferencePoolSpec](#inferencepoolspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
| + + +#### Extension + + + +Extension specifies how to configure an extension that runs the endpoint picker. + + + +_Appears in:_ +- [EndpointPickerConfig](#endpointpickerconfig) +- [InferencePoolSpec](#inferencepoolspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent. For example
"Service".
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
| +| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
| + + +#### ExtensionConnection + + + +ExtensionConnection encapsulates options that configures the connection to the extension. + + + +_Appears in:_ +- [Extension](#extension) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `failureMode` _[ExtensionFailureMode](#extensionfailuremode)_ | Configures how the gateway handles the case when the extension is not responsive.
Defaults to failClose. | FailClose | Enum: [FailOpen FailClose]
| + + +#### ExtensionFailureMode + +_Underlying type:_ _string_ + +ExtensionFailureMode defines the options for how the gateway handles the case when the extension is not +responsive. + +_Validation:_ +- Enum: [FailOpen FailClose] + +_Appears in:_ +- [Extension](#extension) +- [ExtensionConnection](#extensionconnection) + +| Field | Description | +| --- | --- | +| `FailOpen` | FailOpen specifies that the proxy should not drop the request, and should instead forward it to an endpoint of its picking.
| +| `FailClose` | FailClose specifies that the proxy should drop the request.
| + + +#### ExtensionReference + + + +ExtensionReference is a reference to the extension deployment. + + + +_Appears in:_ +- [Extension](#extension) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `group` _[Group](#group)_ | Group is the group of the referent.
The default value is "", representing the Core API group. | | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is the Kubernetes resource kind of the referent. For example
"Service".
Defaults to "Service" when not specified.
ExternalName services can refer to CNAME DNS records that may live
outside of the cluster and as such are difficult to reason about in
terms of conformance. They also may not be safe to forward to (see
CVE-2021-25740 for more information). Implementations MUST NOT
support ExternalName Services. | Service | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `portNumber` _[PortNumber](#portnumber)_ | The port number on the service running the extension. When unspecified,
implementations SHOULD infer a default value of 9002 when the Kind is
Service. | | Maximum: 65535
Minimum: 1
| + + +#### Group + +_Underlying type:_ _string_ + +Group refers to a Kubernetes Group. It must either be an empty string or a +RFC 1123 subdomain. + +This validation is based off of the corresponding Kubernetes validation: +https://github.com/kubernetes/apimachinery/blob/02cfb53916346d085a6c6c7c66f882e3c6b0eca6/pkg/util/validation/validation.go#L208 + +Valid values include: + +* "" - empty string implies core Kubernetes API group +* "gateway.networking.k8s.io" +* "foo.example.com" + +Invalid values include: + +* "example.com/bar" - "/" is an invalid character + +_Validation:_ +- MaxLength: 253 +- Pattern: `^$|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$` + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + #### InferenceModel -InferenceModel is the Schema for the InferenceModels API +InferenceModel is the Schema for the InferenceModels API. @@ -45,29 +173,31 @@ InferenceModel is the Schema for the InferenceModels API | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha1` | | | +| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | | | `kind` _string_ | `InferenceModel` | | | | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | | `spec` _[InferenceModelSpec](#inferencemodelspec)_ | | | | | `status` _[InferenceModelStatus](#inferencemodelstatus)_ | | | | + + + + #### InferenceModelSpec -InferenceModelSpec represents a specific model use case. This resource is +InferenceModelSpec represents the desired state of a specific model use case. This resource is managed by the "Inference Workload Owner" persona. - -The Inference Workload Owner persona is: a team that trains, verifies, and +The Inference Workload Owner persona is someone that trains, verifies, and leverages a large language model from a model frontend, drives the lifecycle and rollout of new versions of those models, and defines the specific performance and latency goals for the model. These workloads are expected to operate within an InferencePool sharing compute capacity with other InferenceModels, defined by the Inference Platform Admin. - InferenceModel's modelName (not the ObjectMeta name) is unique for a given InferencePool, if the name is reused, an error will be shown on the status of a InferenceModel that attempted to reuse. The oldest InferenceModel, based on @@ -81,10 +211,10 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `modelName` _string_ | The name of the model as the users set in the "model" parameter in the requests.
The name should be unique among the workloads that reference the same backend pool.
This is the parameter that will be used to match the request with. In the future, we may
allow to match on other request parameters. The other approach to support matching on
on other request parameters is to use a different ModelName per HTTPFilter.
Names can be reserved without implementing an actual model in the pool.
This can be done by specifying a target model and setting the weight to zero,
an error will be returned specifying that no valid target model is found. | | MaxLength: 253
| -| `criticality` _[Criticality](#criticality)_ | Defines how important it is to serve the model compared to other models referencing the same pool. | Default | Enum: [Critical Default Sheddable]
| -| `targetModels` _[TargetModel](#targetmodel) array_ | Allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
modelName is often in reference to a LoRA adapter. | | MaxItems: 10
| -| `poolRef` _[PoolObjectReference](#poolobjectreference)_ | Reference to the inference pool, the pool must exist in the same namespace. | | Required: \{\}
| +| `modelName` _string_ | ModelName is the name of the model as it will be set in the "model" parameter for an incoming request.
ModelNames must be unique for a referencing InferencePool
(names can be reused for a different pool in the same cluster).
The modelName with the oldest creation timestamp is retained, and the incoming
InferenceModel's Ready status is set to false with a corresponding reason.
In the rare case of a race condition, one Model will be selected randomly to be considered valid, and the other rejected.
Names can be reserved without an underlying model configured in the pool.
This can be done by specifying a target model and setting the weight to zero,
an error will be returned specifying that no valid target model is found. | | MaxLength: 256
Required: \{\}
| +| `criticality` _[Criticality](#criticality)_ | Criticality defines how important it is to serve the model compared to other models referencing the same pool.
Criticality impacts how traffic is handled in resource constrained situations. It handles this by
queuing or rejecting requests of lower criticality. InferenceModels of an equivalent Criticality will
fairly share resources over throughput of tokens. In the future, the metric used to calculate fairness,
and the proportionality of fairness will be configurable.
Default values for this field will not be set, to allow for future additions of new fields that may 'one of' with this field.
Any implementations that may consume this field may treat an unset value as the 'Standard' range. | | Enum: [Critical Standard Sheddable]
| +| `targetModels` _[TargetModel](#targetmodel) array_ | TargetModels allow multiple versions of a model for traffic splitting.
If not specified, the target model name is defaulted to the modelName parameter.
modelName is often in reference to a LoRA adapter. | | MaxItems: 10
| +| `poolRef` _[PoolObjectReference](#poolobjectreference)_ | PoolRef is a reference to the inference pool, the pool must exist in the same namespace. | | Required: \{\}
| #### InferenceModelStatus @@ -100,14 +230,14 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferenceModel.
Known condition types are:
* "Accepted" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Ready]] | MaxItems: 8
| #### InferencePool -InferencePool is the Schema for the Inferencepools API +InferencePool is the Schema for the InferencePools API. @@ -115,13 +245,17 @@ InferencePool is the Schema for the Inferencepools API | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha1` | | | +| `apiVersion` _string_ | `inference.networking.x-k8s.io/v1alpha2` | | | | `kind` _string_ | `InferencePool` | | | | `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | | `spec` _[InferencePoolSpec](#inferencepoolspec)_ | | | | | `status` _[InferencePoolStatus](#inferencepoolstatus)_ | | | | + + + + #### InferencePoolSpec @@ -135,8 +269,9 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector uses a map of label to watch model server pods
that should be included in the InferencePool. ModelServers should not
be with any other Service or InferencePool, that behavior is not supported
and will result in sub-optimal utilization.
In some cases, implementations may translate this to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type. | | Required: \{\}
| -| `targetPortNumber` _integer_ | TargetPortNumber is the port number that the model servers within the pool expect
to receive traffic from.
This maps to the TargetPort in: https://pkg.go.dev/k8s.io/api/core/v1#ServicePort | | Maximum: 65535
Minimum: 0
Required: \{\}
| +| `selector` _object (keys:[LabelKey](#labelkey), values:[LabelValue](#labelvalue))_ | Selector defines a map of labels to watch model server pods
that should be included in the InferencePool.
In some cases, implementations may translate this field to a Service selector, so this matches the simple
map used for Service selectors instead of the full Kubernetes LabelSelector type.
If specified, it will be applied to match the model server pods in the same namespace as the InferencePool.
Cross-namespace selectors are not supported. | | Required: \{\}
| +| `targetPortNumber` _integer_ | TargetPortNumber defines the port number to access the selected model servers.
The number must be in the range 1 to 65535. | | Maximum: 65535
Minimum: 1
Required: \{\}
| +| `extensionRef` _[Extension](#extension)_ | Extension configures an endpoint picker as an extension service. | | Required: \{\}
| #### InferencePoolStatus @@ -152,33 +287,56 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool. | | | +| `parent` _[PoolStatus](#poolstatus) array_ | Parents is a list of parent resources (usually Gateways) that are
associated with the InferencePool, and the status of the InferencePool with respect to
each parent.
A maximum of 32 Gateways will be represented in this list. An empty list
means the InferencePool has not been attached to any Gateway. | | MaxItems: 32
| + + +#### Kind + +_Underlying type:_ _string_ + +Kind refers to a Kubernetes Kind. + +Valid values include: + +* "Service" +* "HTTPRoute" + +Invalid values include: + +* "invalid/kind" - "/" is an invalid character + +_Validation:_ +- MaxLength: 63 +- MinLength: 1 +- Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$` + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + #### LabelKey _Underlying type:_ _string_ -Originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 +LabelKey was originally copied from: https://github.com/kubernetes-sigs/gateway-api/blob/99a3934c6bc1ce0874f3a4c5f20cafd8977ffcb4/apis/v1/shared_types.go#L694-L731 Duplicated as to not take an unexpected dependency on gw's API. - LabelKey is the key of a label. This is used for validation of maps. This matches the Kubernetes "qualified name" validation that is used for labels. - +Labels are case sensitive, so: my-label and My-Label are considered distinct. Valid values include: - * example * example.com * example.com/path * example.com/path.html - Invalid values include: - * example~ - "~" is an invalid character * example.com. - can not start or end with "." @@ -202,10 +360,8 @@ of maps. This matches the Kubernetes label validation rules: * unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]), * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. - Valid values include: - * MyValue * my.name * 123-my-value @@ -220,6 +376,25 @@ _Appears in:_ +#### ObjectName + +_Underlying type:_ _string_ + +ObjectName refers to the name of a Kubernetes object. +Object names can have a variety of forms, including RFC 1123 subdomains, +RFC 1123 labels, or RFC 1035 labels. + +_Validation:_ +- MaxLength: 253 +- MinLength: 1 + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) +- [PoolObjectReference](#poolobjectreference) + + + #### PoolObjectReference @@ -234,9 +409,42 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `group` _string_ | Group is the group of the referent. | inference.networking.x-k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| -| `kind` _string_ | Kind is kind of the referent. For example "InferencePool". | InferencePool | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| -| `name` _string_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| +| `group` _[Group](#group)_ | Group is the group of the referent. | inference.networking.x-k8s.io | MaxLength: 253
Pattern: `^$\|^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$`
| +| `kind` _[Kind](#kind)_ | Kind is kind of the referent. For example "InferencePool". | InferencePool | MaxLength: 63
MinLength: 1
Pattern: `^[a-zA-Z]([-a-zA-Z0-9]*[a-zA-Z0-9])?$`
| +| `name` _[ObjectName](#objectname)_ | Name is the name of the referent. | | MaxLength: 253
MinLength: 1
Required: \{\}
| + + +#### PoolStatus + + + +PoolStatus defines the observed state of InferencePool from a Gateway. + + + +_Appears in:_ +- [InferencePoolStatus](#inferencepoolstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `parentRef` _[ObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#objectreference-v1-core)_ | ParentRef indicates the Gateway that observed the state of the InferencePool. | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.31/#condition-v1-meta) array_ | Conditions track the state of the InferencePool.
Known condition types are:
* "Accepted"
* "ResolvedRefs" | [map[lastTransitionTime:1970-01-01T00:00:00Z message:Waiting for controller reason:Pending status:Unknown type:Accepted]] | MaxItems: 8
| + + +#### PortNumber + +_Underlying type:_ _integer_ + +PortNumber defines a network port. + +_Validation:_ +- Maximum: 65535 +- Minimum: 1 + +_Appears in:_ +- [Extension](#extension) +- [ExtensionReference](#extensionreference) + #### TargetModel @@ -246,10 +454,10 @@ _Appears in:_ TargetModel represents a deployed model or a LoRA adapter. The Name field is expected to match the name of the LoRA adapter (or base model) as it is registered within the model server. Inference -Gateway assumes that the model exists on the model server and is the +Gateway assumes that the model exists on the model server and it's the responsibility of the user to validate a correct match. Should a model fail -to exist at request time, the error is processed by the Instance Gateway, -and then emitted on the appropriate InferenceModel object. +to exist at request time, the error is processed by the Inference Gateway +and emitted on the appropriate InferenceModel object. @@ -258,7 +466,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `name` _string_ | The name of the adapter as expected by the ModelServer. | | MaxLength: 253
| -| `weight` _integer_ | Weight is used to determine the proportion of traffic that should be
sent to this target model when multiple versions of the model are specified. | 1 | Maximum: 1e+06
Minimum: 0
| +| `name` _string_ | Name is the name of the adapter or base model, as expected by the ModelServer. | | MaxLength: 253
Required: \{\}
| +| `weight` _integer_ | Weight is used to determine the proportion of traffic that should be
sent to this model when multiple target models are specified.
Weight defines the proportion of requests forwarded to the specified
model. This is computed as weight/(sum of all weights in this
TargetModels list). For non-zero values, there may be some epsilon from
the exact proportion defined here depending on the precision an
implementation supports. Weight is not a percentage and the sum of
weights does not need to equal 100.
If a weight is set for any targetModel, it must be set for all targetModels.
Conversely weights are optional, so long as ALL targetModels do not specify a weight. | | Maximum: 1e+06
Minimum: 1
| From f1d425b7e5d460dd8c64bcf30c0466c079951af3 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 23 Apr 2025 04:27:40 +0300 Subject: [PATCH 52/74] reduce log level in metrics logger not to trash the log (#708) * reduce log level in metrics logger not to trash the log Signed-off-by: Nir Rozenbaum * rename flush metrics to refresh metrics Signed-off-by: Nir Rozenbaum * revert log level Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- cmd/epp/main.go | 9 ++++----- pkg/epp/backend/metrics/logger.go | 10 +++++----- pkg/epp/server/runserver.go | 17 ++++++----------- test/integration/epp/hermetic_test.go | 2 +- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index b5e6fbe6..c0a87e62 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -142,8 +142,8 @@ func run() error { } poolNamespacedName := types.NamespacedName{ - Namespace: *poolNamespace, Name: *poolName, + Namespace: *poolNamespace, } mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg) if err != nil { @@ -151,8 +151,6 @@ func run() error { return err } - ctx := ctrl.SetupSignalHandler() - // Set up mapper for metric scraping. mapping, err := backendmetrics.NewMetricMapping( *totalQueuedRequestsMetric, @@ -167,14 +165,15 @@ func run() error { pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval) // Setup runner. + ctx := ctrl.SetupSignalHandler() + datastore := datastore.NewDatastore(ctx, pmf) serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, DestinationEndpointHintKey: *destinationEndpointHintKey, - PoolName: *poolName, - PoolNamespace: *poolNamespace, + PoolNamespacedName: poolNamespacedName, Datastore: datastore, SecureServing: *secureServing, CertPath: *certPath, diff --git a/pkg/epp/backend/metrics/logger.go b/pkg/epp/backend/metrics/logger.go index d9a93027..7dc1a8b8 100644 --- a/pkg/epp/backend/metrics/logger.go +++ b/pkg/epp/backend/metrics/logger.go @@ -55,8 +55,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh case <-ctx.Done(): logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread") return - case <-ticker.C: // Periodically flush prometheus metrics for inference pool - flushPrometheusMetricsOnce(logger, datastore) + case <-ticker.C: // Periodically refresh prometheus metrics for inference pool + refreshPrometheusMetrics(logger, datastore) } } }() @@ -86,11 +86,11 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh } } -func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) { +func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) { pool, err := datastore.PoolGet() if err != nil { // No inference pool or not initialize. 
- logger.V(logutil.DEFAULT).Info("pool is not initialized, skipping flushing metrics") + logger.V(logutil.DEFAULT).Info("Pool is not initialized, skipping refreshing metrics") return } @@ -98,7 +98,7 @@ func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) { var queueTotal int podMetrics := datastore.PodGetAll() - logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics)) + logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics)) if len(podMetrics) == 0 { return } diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 65a6e787..0c0a6a6d 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -43,8 +43,7 @@ type ExtProcServerRunner struct { GrpcPort int DestinationEndpointHintMetadataNamespace string DestinationEndpointHintKey string - PoolName string - PoolNamespace string + PoolNamespacedName types.NamespacedName Datastore datastore.Datastore SecureServing bool CertPath string @@ -73,8 +72,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner { GrpcPort: DefaultGrpcPort, DestinationEndpointHintKey: DefaultDestinationEndpointHintKey, DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace, - PoolName: DefaultPoolName, - PoolNamespace: DefaultPoolNamespace, + PoolNamespacedName: types.NamespacedName{Name: DefaultPoolName, Namespace: DefaultPoolNamespace}, SecureServing: DefaultSecureServing, RefreshPrometheusMetricsInterval: DefaultRefreshPrometheusMetricsInterval, // Datastore can be assigned later. @@ -93,13 +91,10 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man } if err := (&controller.InferenceModelReconciler{ - Datastore: r.Datastore, - Client: mgr.GetClient(), - PoolNamespacedName: types.NamespacedName{ - Name: r.PoolName, - Namespace: r.PoolNamespace, - }, - Record: mgr.GetEventRecorderFor("InferenceModel"), + Datastore: r.Datastore, + Client: mgr.GetClient(), + PoolNamespacedName: r.PoolNamespacedName, + Record: mgr.GetEventRecorderFor("InferenceModel"), }).SetupWithManager(ctx, mgr); err != nil { return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err) } diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 372158f4..79b619fd 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -1348,7 +1348,7 @@ func BeforeSuite() func() { serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{} pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond) // Adjust from defaults - serverRunner.PoolName = "vllm-llama3-8b-instruct-pool" + serverRunner.PoolNamespacedName = types.NamespacedName{Name: "vllm-llama3-8b-instruct-pool", Namespace: "default"} serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf) serverRunner.SecureServing = false From d935a7cc9bec473d04f10147f2f012e33757da98 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 23 Apr 2025 04:27:47 +0300 Subject: [PATCH 53/74] few updates in datastore (#713) * few updates in datastore Signed-off-by: Nir Rozenbaum * PoolSet documentation Signed-off-by: Nir Rozenbaum * error phrasing Signed-off-by: Nir Rozenbaum * removed unused pool arg from PodUpdateOrAddIfNotExist Signed-off-by: Nir Rozenbaum * linter Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum --- .../inferencemodel_reconciler_test.go | 5 +- 
.../controller/inferencepool_reconciler.go | 24 ++---- pkg/epp/controller/pod_reconciler.go | 12 ++- pkg/epp/controller/pod_reconciler_test.go | 4 +- pkg/epp/datastore/datastore.go | 78 ++++++++++++------- pkg/epp/datastore/datastore_test.go | 21 ++++- pkg/epp/util/pod/pod.go | 3 + 7 files changed, 89 insertions(+), 58 deletions(-) diff --git a/pkg/epp/controller/inferencemodel_reconciler_test.go b/pkg/epp/controller/inferencemodel_reconciler_test.go index 57dc2469..80c30e19 100644 --- a/pkg/epp/controller/inferencemodel_reconciler_test.go +++ b/pkg/epp/controller/inferencemodel_reconciler_test.go @@ -25,6 +25,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -178,6 +179,7 @@ func TestInferenceModelReconciler(t *testing.T) { t.Run(test.name, func(t *testing.T) { // Create a fake client with no InferenceModel objects. scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) _ = v1alpha2.Install(scheme) initObjs := []client.Object{} if test.model != nil { @@ -186,6 +188,7 @@ func TestInferenceModelReconciler(t *testing.T) { for _, m := range test.modelsInAPIServer { initObjs = append(initObjs, m) } + fakeClient := fake.NewClientBuilder(). WithScheme(scheme). WithObjects(initObjs...). @@ -196,7 +199,7 @@ func TestInferenceModelReconciler(t *testing.T) { for _, m := range test.modelsInStore { ds.ModelSetIfOlder(m) } - ds.PoolSet(pool) + _ = ds.PoolSet(context.Background(), fakeClient, pool) reconciler := &InferenceModelReconciler{ Client: fakeClient, Record: record.NewFakeRecorder(10), diff --git a/pkg/epp/controller/inferencepool_reconciler.go b/pkg/epp/controller/inferencepool_reconciler.go index 0738181f..fb7d7727 100644 --- a/pkg/epp/controller/inferencepool_reconciler.go +++ b/pkg/epp/controller/inferencepool_reconciler.go @@ -18,7 +18,6 @@ package controller import ( "context" - "reflect" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/client-go/tools/record" @@ -60,28 +59,15 @@ func (c *InferencePoolReconciler) Reconcile(ctx context.Context, req ctrl.Reques c.Datastore.Clear() return ctrl.Result{}, nil } - - c.updateDatastore(ctx, infPool) + // update pool in datastore + if err := c.Datastore.PoolSet(ctx, c.Client, infPool); err != nil { + logger.Error(err, "Failed to update datastore") + return ctrl.Result{}, err + } return ctrl.Result{}, nil } -func (c *InferencePoolReconciler) updateDatastore(ctx context.Context, newPool *v1alpha2.InferencePool) { - logger := log.FromContext(ctx) - oldPool, err := c.Datastore.PoolGet() - c.Datastore.PoolSet(newPool) - if err != nil || !reflect.DeepEqual(newPool.Spec.Selector, oldPool.Spec.Selector) { - logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", newPool.Spec.Selector) - // A full resync is required to address two cases: - // 1) At startup, the pod events may get processed before the pool is synced with the datastore, - // and hence they will not be added to the store since pool selector is not known yet - // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need - // to resync the whole pool: remove pods in the store that don't match the new selector and add - // the ones that may have existed already to the store. 
- c.Datastore.PodResyncAll(ctx, c.Client, newPool) - } -} - func (c *InferencePoolReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&v1alpha2.InferencePool{}). diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 494adeb7..6d1af8d9 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -27,7 +27,6 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" @@ -41,8 +40,7 @@ type PodReconciler struct { func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { logger := log.FromContext(ctx) - pool, err := c.Datastore.PoolGet() - if err != nil { + if !c.Datastore.PoolHasSynced() { logger.V(logutil.TRACE).Info("Skipping reconciling Pod because the InferencePool is not available yet") // When the inferencePool is initialized it lists the appropriate pods and populates the datastore, so no need to requeue. return ctrl.Result{}, nil @@ -60,7 +58,7 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R return ctrl.Result{}, err } - c.updateDatastore(logger, pod, pool) + c.updateDatastore(logger, pod) return ctrl.Result{}, nil } @@ -70,13 +68,13 @@ func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(c) } -func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod, pool *v1alpha2.InferencePool) { +func (c *PodReconciler) updateDatastore(logger logr.Logger, pod *corev1.Pod) { namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} - if !pod.DeletionTimestamp.IsZero() || !c.Datastore.PoolLabelsMatch(pod.Labels) || !podutil.IsPodReady(pod) { + if !podutil.IsPodReady(pod) || !c.Datastore.PoolLabelsMatch(pod.Labels) { logger.V(logutil.DEBUG).Info("Pod removed or not added", "name", namespacedName) c.Datastore.PodDelete(namespacedName) } else { - if c.Datastore.PodUpdateOrAddIfNotExist(pod, pool) { + if c.Datastore.PodUpdateOrAddIfNotExist(pod) { logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) } else { logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) diff --git a/pkg/epp/controller/pod_reconciler_test.go b/pkg/epp/controller/pod_reconciler_test.go index e4cb0b62..d2bdd5d0 100644 --- a/pkg/epp/controller/pod_reconciler_test.go +++ b/pkg/epp/controller/pod_reconciler_test.go @@ -182,9 +182,9 @@ func TestPodReconciler(t *testing.T) { // Configure the initial state of the datastore. 
store := datastore.NewDatastore(t.Context(), pmf) - store.PoolSet(test.pool) + _ = store.PoolSet(t.Context(), fakeClient, test.pool) for _, pod := range test.existingPods { - store.PodUpdateOrAddIfNotExist(pod, pool) + store.PodUpdateOrAddIfNotExist(pod) } podReconciler := &PodReconciler{Client: fakeClient, Datastore: store} diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index 5435e3af..f8378d25 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "reflect" "sync" corev1 "k8s.io/api/core/v1" @@ -44,7 +45,10 @@ var ( // The datastore is a local cache of relevant data for the given InferencePool (currently all pulled from k8s-api) type Datastore interface { // InferencePool operations - PoolSet(pool *v1alpha2.InferencePool) + // PoolSet sets the given pool in datastore. If the given pool has different label selector than the previous pool + // that was stored, the function triggers a resync of the pods to keep the datastore updated. If the given pool + // is nil, this call triggers the datastore.Clear() function. + PoolSet(ctx context.Context, client client.Client, pool *v1alpha2.InferencePool) error PoolGet() (*v1alpha2.InferencePool, error) PoolHasSynced() bool PoolLabelsMatch(podLabels map[string]string) bool @@ -60,10 +64,9 @@ type Datastore interface { // PodGetAll returns all pods and metrics, including fresh and stale. PodGetAll() []backendmetrics.PodMetrics // PodList lists pods matching the given predicate. - PodList(func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics - PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool + PodList(predicate func(backendmetrics.PodMetrics) bool) []backendmetrics.PodMetrics + PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool PodDelete(namespacedName types.NamespacedName) - PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) // Clears the store state, happens when the pool gets deleted. Clear() @@ -102,10 +105,31 @@ func (ds *datastore) Clear() { } // /// InferencePool APIs /// -func (ds *datastore) PoolSet(pool *v1alpha2.InferencePool) { +func (ds *datastore) PoolSet(ctx context.Context, client client.Client, pool *v1alpha2.InferencePool) error { + if pool == nil { + ds.Clear() + return nil + } + logger := log.FromContext(ctx) ds.poolAndModelsMu.Lock() defer ds.poolAndModelsMu.Unlock() + + oldPool := ds.pool ds.pool = pool + if oldPool == nil || !reflect.DeepEqual(pool.Spec.Selector, oldPool.Spec.Selector) { + logger.V(logutil.DEFAULT).Info("Updating inference pool endpoints", "selector", pool.Spec.Selector) + // A full resync is required to address two cases: + // 1) At startup, the pod events may get processed before the pool is synced with the datastore, + // and hence they will not be added to the store since pool selector is not known yet + // 2) If the selector on the pool was updated, then we will not get any pod events, and so we need + // to resync the whole pool: remove pods in the store that don't match the new selector and add + // the ones that may have existed already to the store. 
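+ // podResyncAll performs that reconciliation: it lists the pods matching the current selector and adds or removes datastore entries accordingly.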
+ if err := ds.podResyncAll(ctx, client); err != nil { + return fmt.Errorf("failed to update pods according to the pool selector - %w", err) + } + } + + return nil } func (ds *datastore) PoolGet() (*v1alpha2.InferencePool, error) { @@ -229,7 +253,7 @@ func (ds *datastore) PodList(predicate func(backendmetrics.PodMetrics) bool) []b return res } -func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.InferencePool) bool { +func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod) bool { namespacedName := types.NamespacedName{ Name: pod.Name, Namespace: pod.Namespace, @@ -247,27 +271,35 @@ func (ds *datastore) PodUpdateOrAddIfNotExist(pod *corev1.Pod, pool *v1alpha2.In return ok } -func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, pool *v1alpha2.InferencePool) { +func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { + v, ok := ds.pods.LoadAndDelete(namespacedName) + if ok { + pmr := v.(backendmetrics.PodMetrics) + pmr.StopRefreshLoop() + } +} + +func (ds *datastore) podResyncAll(ctx context.Context, ctrlClient client.Client) error { logger := log.FromContext(ctx) podList := &corev1.PodList{} if err := ctrlClient.List(ctx, podList, &client.ListOptions{ - LabelSelector: selectorFromInferencePoolSelector(pool.Spec.Selector), - Namespace: pool.Namespace, + LabelSelector: selectorFromInferencePoolSelector(ds.pool.Spec.Selector), + Namespace: ds.pool.Namespace, }); err != nil { - log.FromContext(ctx).V(logutil.DEFAULT).Error(err, "Failed to list clients") - return + return fmt.Errorf("failed to list pods - %w", err) } activePods := make(map[string]bool) for _, pod := range podList.Items { - if podutil.IsPodReady(&pod) { - namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} - activePods[pod.Name] = true - if ds.PodUpdateOrAddIfNotExist(&pod, pool) { - logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) - } else { - logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) - } + if !podutil.IsPodReady(&pod) { + continue + } + namespacedName := types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace} + activePods[pod.Name] = true + if ds.PodUpdateOrAddIfNotExist(&pod) { + logger.V(logutil.DEFAULT).Info("Pod added", "name", namespacedName) + } else { + logger.V(logutil.DEFAULT).Info("Pod already exists", "name", namespacedName) } } @@ -281,14 +313,8 @@ func (ds *datastore) PodResyncAll(ctx context.Context, ctrlClient client.Client, return true } ds.pods.Range(deleteFn) -} -func (ds *datastore) PodDelete(namespacedName types.NamespacedName) { - v, ok := ds.pods.LoadAndDelete(namespacedName) - if ok { - pmr := v.(backendmetrics.PodMetrics) - pmr.StopRefreshLoop() - } + return nil } func selectorFromInferencePoolSelector(selector map[v1alpha2.LabelKey]v1alpha2.LabelValue) labels.Selector { diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index abbff429..e8c77d37 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -27,7 +27,10 @@ import ( "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" testutil 
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/testing" @@ -71,9 +74,15 @@ func TestPool(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second) datastore := NewDatastore(context.Background(), pmf) - datastore.PoolSet(tt.inferencePool) + _ = datastore.PoolSet(context.Background(), fakeClient, tt.inferencePool) gotPool, gotErr := datastore.PoolGet() if diff := cmp.Diff(tt.wantErr, gotErr, cmpopts.EquateErrors()); diff != "" { t.Errorf("Unexpected error diff (+got/-want): %s", diff) @@ -320,11 +329,17 @@ func TestMetrics(t *testing.T) { t.Run(test.name, func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // Set up the scheme. + scheme := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(scheme) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + Build() pmf := backendmetrics.NewPodMetricsFactory(test.pmc, time.Millisecond) ds := NewDatastore(ctx, pmf) - ds.PoolSet(inferencePool) + _ = ds.PoolSet(ctx, fakeClient, inferencePool) for _, pod := range test.storePods { - ds.PodUpdateOrAddIfNotExist(pod, inferencePool) + ds.PodUpdateOrAddIfNotExist(pod) } assert.EventuallyWithT(t, func(t *assert.CollectT) { got := ds.PodGetAll() diff --git a/pkg/epp/util/pod/pod.go b/pkg/epp/util/pod/pod.go index 9f564024..4fcb948f 100644 --- a/pkg/epp/util/pod/pod.go +++ b/pkg/epp/util/pod/pod.go @@ -21,6 +21,9 @@ import ( ) func IsPodReady(pod *corev1.Pod) bool { + if !pod.DeletionTimestamp.IsZero() { + return false + } for _, condition := range pod.Status.Conditions { if condition.Type == corev1.PodReady { if condition.Status == corev1.ConditionTrue { From b24f94834724df5af902d014f1f4d6ca177c89e6 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Wed, 23 Apr 2025 20:15:47 +0300 Subject: [PATCH 54/74] scheduler refactoring (#730) Signed-off-by: Nir Rozenbaum --- pkg/epp/backend/metrics/pod_metrics.go | 11 +- pkg/epp/backend/metrics/types.go | 15 +- .../scheduling/plugins/{ => filter}/filter.go | 85 +++++------ .../plugins/{ => filter}/filter_test.go | 38 ++--- pkg/epp/scheduling/plugins/noop.go | 12 +- .../{picker.go => picker/random_picker.go} | 6 +- .../interfaces.go => plugins/plugins.go} | 42 +++--- pkg/epp/scheduling/scheduler.go | 141 ++++++++---------- pkg/epp/scheduling/scheduler_test.go | 133 ++++++----------- pkg/epp/scheduling/types/types.go | 16 +- 10 files changed, 214 insertions(+), 285 deletions(-) rename pkg/epp/scheduling/plugins/{ => filter}/filter.go (81%) rename pkg/epp/scheduling/plugins/{ => filter}/filter_test.go (90%) rename pkg/epp/scheduling/plugins/{picker.go => picker/random_picker.go} (86%) rename pkg/epp/scheduling/{types/interfaces.go => plugins/plugins.go} (70%) diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index c85d4d79..7339389a 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -41,9 +41,8 @@ type podMetrics struct { ds Datastore interval time.Duration - parentCtx context.Context - once sync.Once // ensure the StartRefreshLoop is only called once. - done chan struct{} + once sync.Once // ensure the StartRefreshLoop is only called once. 
+	done      chan struct{}
 
 	logger logr.Logger
 }
@@ -79,8 +78,8 @@ func toInternalPod(in *corev1.Pod) *Pod {
 }
 
 // start starts a goroutine exactly once to periodically update metrics. The goroutine will be
-// stopped either when stop() is called, or the parentCtx is cancelled.
-func (pm *podMetrics) startRefreshLoop() {
+// stopped either when stop() is called, or the given ctx is cancelled.
+func (pm *podMetrics) startRefreshLoop(ctx context.Context) {
 	pm.once.Do(func() {
 		go func() {
 			pm.logger.V(logutil.DEFAULT).Info("Starting refresher", "pod", pm.GetPod())
@@ -90,7 +89,7 @@ {
 			select {
 			case <-pm.done:
 				return
-			case <-pm.parentCtx.Done():
+			case <-ctx.Done():
 				return
 			case <-ticker.C: // refresh metrics periodically
 				if err := pm.refreshMetrics(); err != nil {
diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go
index 21c0f401..156ac3ed 100644
--- a/pkg/epp/backend/metrics/types.go
+++ b/pkg/epp/backend/metrics/types.go
@@ -43,18 +43,17 @@ type PodMetricsFactory struct {
 func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1.Pod, ds Datastore) PodMetrics {
 	pod := toInternalPod(in)
 	pm := &podMetrics{
-		pmc:       f.pmc,
-		ds:        ds,
-		interval:  f.refreshMetricsInterval,
-		parentCtx: parentCtx,
-		once:      sync.Once{},
-		done:      make(chan struct{}),
-		logger:    log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
+		pmc:      f.pmc,
+		ds:       ds,
+		interval: f.refreshMetricsInterval,
+		once:     sync.Once{},
+		done:     make(chan struct{}),
+		logger:   log.FromContext(parentCtx).WithValues("pod", pod.NamespacedName),
 	}
 	pm.pod.Store(pod)
 	pm.metrics.Store(newMetrics())
 
-	pm.startRefreshLoop()
+	pm.startRefreshLoop(parentCtx)
 	return pm
 }
diff --git a/pkg/epp/scheduling/plugins/filter.go b/pkg/epp/scheduling/plugins/filter/filter.go
similarity index 81%
rename from pkg/epp/scheduling/plugins/filter.go
rename to pkg/epp/scheduling/plugins/filter/filter.go
index efcb6be1..86620aa9 100644
--- a/pkg/epp/scheduling/plugins/filter.go
+++ b/pkg/epp/scheduling/plugins/filter/filter.go
@@ -14,56 +14,55 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-package plugins
+package filter
 
 import (
-	"errors"
 	"math"
 	"math/rand"
 	"time"
 
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"
 	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
-	errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error"
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )
 
-type Filter struct {
+type baseFilter struct {
 	name   string
 	filter filterFunc
 }
 
-func (bf *Filter) Name() string {
-	if bf == nil {
+func (f *baseFilter) Name() string {
+	if f == nil {
 		return "nil"
 	}
-	return bf.name
+	return f.name
 }
 
-func (bf *Filter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) {
+func (f *baseFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
 	loggerTrace := ctx.Logger.V(logutil.TRACE)
-	loggerTrace.Info("Running a filter", "name", bf.Name(), "podCount", len(pods))
+	loggerTrace.Info("Running a filter", "name", f.Name(), "podCount", len(pods))
 
-	return bf.filter(ctx, pods)
+	return f.filter(ctx, pods)
 }
 
 // DecisionTreeFilter applies the current filterFunc, and then recursively applies the next filters
 // depending on the success or failure of the current filter.
 // It can be used to construct a flow chart algorithm.
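To make the flow-chart idea concrete, here is a minimal sketch of such a tree. It is illustrative only, chaining two of the filters defined further down in this file (the full production trees are wired up in scheduler.go later in this patch):

```go
// Illustrative tree: try LoRA affinity first; whatever it yields (the
// filtered pods on success, the original input on failure), narrow the
// candidates by least KV cache utilization.
var exampleTree = &DecisionTreeFilter{
	Current: LoRAAffinityFilter,
	NextOnSuccessOrFailure: &DecisionTreeFilter{
		Current: LeastKVCacheFilter,
	},
}
```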
type DecisionTreeFilter struct { - Current types.Filter + Current plugins.Filter // NextOnSuccess filter will be applied after successfully applying the current filter. // The filtered results will be passed to the next filter. - NextOnSuccess types.Filter + NextOnSuccess plugins.Filter // NextOnFailure filter will be applied if current filter fails. // The original input will be passed to the next filter. - NextOnFailure types.Filter + NextOnFailure plugins.Filter // NextOnSuccessOrFailure is a convenience field to configure the next filter regardless of the // success or failure of the current filter. // NOTE: When using NextOnSuccessOrFailure, both nextOnSuccess and nextOnFailure SHOULD be nil. // However if that's not the case, nextOnSuccess and nextOnFailure will be used, instead of // NextOnSuccessOrFailure, in the success and failure scenarios, respectively. - NextOnSuccessOrFailure types.Filter + NextOnSuccessOrFailure plugins.Filter } func (f *DecisionTreeFilter) Name() string { @@ -73,15 +72,15 @@ func (f *DecisionTreeFilter) Name() string { return f.Current.Name() } -func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func (f *DecisionTreeFilter) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { loggerTrace := ctx.Logger.V(logutil.TRACE) - filtered, err := f.Current.Filter(ctx, pods) + filtered := f.Current.Filter(ctx, pods) next := f.NextOnSuccessOrFailure - if err == nil && len(filtered) > 0 { + if len(filtered) > 0 { if f.NextOnSuccess == nil && f.NextOnSuccessOrFailure == nil { // No succeeding filters to run, return. - return filtered, err + return filtered } if f.NextOnSuccess != nil { next = f.NextOnSuccess @@ -92,7 +91,7 @@ func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]typ } else { if f.NextOnFailure == nil && f.NextOnSuccessOrFailure == nil { // No succeeding filters to run, return. - return filtered, err + return filtered } if f.NextOnFailure != nil { next = f.NextOnFailure @@ -104,11 +103,11 @@ func (f *DecisionTreeFilter) Filter(ctx *types.Context, pods []types.Pod) ([]typ } // filterFunc filters a set of input pods to a subset. -type filterFunc func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) +type filterFunc func(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod // toFilterFunc is a helper function to convert a per pod filter func to the FilterFunc. func toFilterFunc(pp podPredicate) filterFunc { - return func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { + return func(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { filtered := []types.Pod{} for _, pod := range pods { pass := pp(ctx.Req, pod) @@ -116,14 +115,12 @@ func toFilterFunc(pp podPredicate) filterFunc { filtered = append(filtered, pod) } } - if len(filtered) == 0 { - return nil, errors.New("no pods left") - } - return filtered, nil + + return filtered } } -var LeastQueueFilter = &Filter{ +var LeastQueueFilter = &baseFilter{ name: "least queuing", filter: leastQueuingFilterFunc, } @@ -135,7 +132,7 @@ var LeastQueueFilter = &Filter{ // the least one as it gives more choices for the next filter, which on aggregate gave better // results. // TODO: Compare this strategy with other strategies such as top K. 
-func leastQueuingFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func leastQueuingFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { min := math.MaxInt max := 0 filtered := []types.Pod{} @@ -154,15 +151,15 @@ func leastQueuingFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, filtered = append(filtered, pod) } } - return filtered, nil + return filtered } -var LowQueueFilter = &Filter{ +var LowQueueFilter = &baseFilter{ name: "low queueing filter", filter: toFilterFunc((queueThresholdPredicate(config.Conf.QueueingThresholdLoRA))), } -var LeastKVCacheFilter = &Filter{ +var LeastKVCacheFilter = &baseFilter{ name: "least KV cache percent", filter: leastKVCacheFilterFunc, } @@ -173,7 +170,7 @@ var LeastKVCacheFilter = &Filter{ // should consider them all instead of the absolute minimum one. This worked better than picking the // least one as it gives more choices for the next filter, which on aggregate gave better results. // TODO: Compare this strategy with other strategies such as top K. -func leastKVCacheFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func leastKVCacheFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { min := math.MaxFloat64 var max float64 = 0 filtered := []types.Pod{} @@ -192,10 +189,10 @@ func leastKVCacheFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, filtered = append(filtered, pod) } } - return filtered, nil + return filtered } -var LoRAAffinityFilter = &Filter{ +var LoRAAffinityFilter = &baseFilter{ name: "affinity LoRA", filter: loRASoftAffinityFilterFunc, } @@ -216,7 +213,7 @@ var LoRAAffinityFilter = &Filter{ // Returns: // - Filtered slice of pod metrics based on affinity and availability // - Error if any issues occur during filtering -func loRASoftAffinityFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func loRASoftAffinityFilterFunc(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { // Pre-allocate slices with estimated capacity filtered_affinity := make([]types.Pod, 0, len(pods)) @@ -241,34 +238,24 @@ func loRASoftAffinityFilterFunc(ctx *types.Context, pods []types.Pod) ([]types.P // If both groups have pods, use probability to select which group to return if len(filtered_affinity) > 0 && len(filtered_available) > 0 { if randGen.Float64() < config.Conf.LoraAffinityThreshold { - return filtered_affinity, nil + return filtered_affinity } - return filtered_available, nil + return filtered_available } // Return whichever group has pods if len(filtered_affinity) > 0 { - return filtered_affinity, nil + return filtered_affinity } - return filtered_available, nil + return filtered_available } -var HasCapacityFilter = &Filter{ +var HasCapacityFilter = &baseFilter{ name: "has capacity for sheddable requests", filter: toFilterFunc(queueThresholdPredicate(config.Conf.QueueThresholdCritical).and(kvCacheThresholdPredicate(config.Conf.KVCacheThreshold))), } -var DropRequestFilter = &Filter{ - name: "drop request", - filter: func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { - ctx.Logger.V(logutil.DEFAULT).Info("Request dropped", "request", ctx.Req) - return []types.Pod{}, errutil.Error{ - Code: errutil.InferencePoolResourceExhausted, Msg: "dropping request due to limited backend resources", - } - }, -} - // podPredicate is a filter function to check whether a pod is desired. 
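The threshold predicates above are composed with an `and` helper whose definition is not visible in this hunk; a minimal sketch consistent with its call sites (and with the podPredicate type declared just below) could be:

```go
// Sketch of the combinator behind
// queueThresholdPredicate(...).and(kvCacheThresholdPredicate(...)):
// the combined predicate passes only when both predicates pass.
func (pp podPredicate) and(another podPredicate) podPredicate {
	return func(req *types.LLMRequest, pod types.Pod) bool {
		return pp(req, pod) && another(req, pod)
	}
}
```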
type podPredicate func(req *types.LLMRequest, pod types.Pod) bool diff --git a/pkg/epp/scheduling/plugins/filter_test.go b/pkg/epp/scheduling/plugins/filter/filter_test.go similarity index 90% rename from pkg/epp/scheduling/plugins/filter_test.go rename to pkg/epp/scheduling/plugins/filter/filter_test.go index 107b423f..56cccb3b 100644 --- a/pkg/epp/scheduling/plugins/filter_test.go +++ b/pkg/epp/scheduling/plugins/filter/filter_test.go @@ -14,11 +14,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -package plugins +package filter import ( "context" - "errors" "testing" "github.com/google/go-cmp/cmp" @@ -34,30 +33,26 @@ func TestFilter(t *testing.T) { req *types.LLMRequest input []types.Pod output []types.Pod - err bool filter *DecisionTreeFilter }{ { - name: "simple filter without successor, failure", + name: "simple filter without available pods", filter: &DecisionTreeFilter{ - Current: &Filter{ - name: "error", - filter: func(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { - return nil, errors.New("filter error") + Current: &baseFilter{ + name: "filter all", + filter: func(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { + return []types.Pod{} }, }, }, - err: true, + output: []types.Pod{}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ctx := types.NewContext(context.Background(), test.req, test.input) - got, err := test.filter.Filter(ctx, test.input) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } + ctx := types.NewSchedulingContext(context.Background(), test.req, test.input) + got := test.filter.Filter(ctx, test.input) opt := cmp.AllowUnexported(types.PodMetrics{}) if diff := cmp.Diff(test.output, got, opt); diff != "" { @@ -74,7 +69,6 @@ func TestFilterFunc(t *testing.T) { req *types.LLMRequest input []types.Pod output []types.Pod - err bool }{ { name: "least queuing empty input", @@ -193,11 +187,8 @@ func TestFilterFunc(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ctx := types.NewContext(context.Background(), test.req, test.input) - got, err := test.f(ctx, test.input) - if test.err != (err != nil) { - t.Errorf("Unexpected error, got %v, want %v", err, test.err) - } + ctx := types.NewSchedulingContext(context.Background(), test.req, test.input) + got := test.f(ctx, test.input) opt := cmp.AllowUnexported(types.PodMetrics{}) if diff := cmp.Diff(test.output, got, opt); diff != "" { @@ -254,7 +245,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }, }, } - ctx := types.NewContext(context.Background(), req, pods) + ctx := types.NewSchedulingContext(context.Background(), req, pods) // Run the filter function multiple times and count the results affinityCount := 0 @@ -265,10 +256,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { expectedAvailabilityPercent := 100 - expectedAffinityPercent for i := 0; i < numIterations; i++ { - result, err := loRASoftAffinityFilterFunc(ctx, pods) - if err != nil { - t.Fatalf("Unexpected error: %v", err) - } + result := loRASoftAffinityFilterFunc(ctx, pods) // Check which type of pod was returned if len(result) != 1 { diff --git a/pkg/epp/scheduling/plugins/noop.go b/pkg/epp/scheduling/plugins/noop.go index 1abcb95b..8f50ff36 100644 --- a/pkg/epp/scheduling/plugins/noop.go +++ b/pkg/epp/scheduling/plugins/noop.go @@ -27,12 +27,16 @@ type NoopPlugin struct{} func (p *NoopPlugin) Name() string { return "NoopPlugin" } -func (p *NoopPlugin) 
Score(ctx *types.Context, pod types.Pod) (float64, error) { return 0.0, nil } +func (p *NoopPlugin) PreSchedule(ctx *types.SchedulingContext) {} -func (p *NoopPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func (p *NoopPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) ([]types.Pod, error) { return pods, nil } -func (p *NoopPlugin) PreSchedule(ctx *types.Context) {} +func (p *NoopPlugin) Score(ctx *types.SchedulingContext, pod types.Pod) (float64, error) { + return 0.0, nil +} + +func (p *NoopPlugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) {} -func (p *NoopPlugin) PostSchedule(ctx *types.Context, res *types.Result) {} +func (p *NoopPlugin) PostResponse(ctx *types.SchedulingContext, pod types.Pod) {} diff --git a/pkg/epp/scheduling/plugins/picker.go b/pkg/epp/scheduling/plugins/picker/random_picker.go similarity index 86% rename from pkg/epp/scheduling/plugins/picker.go rename to pkg/epp/scheduling/plugins/picker/random_picker.go index 569e4e86..850108e7 100644 --- a/pkg/epp/scheduling/plugins/picker.go +++ b/pkg/epp/scheduling/plugins/picker/random_picker.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package plugins +package picker import ( "fmt" @@ -30,8 +30,8 @@ func (rp *RandomPicker) Name() string { return "random" } -func (rp *RandomPicker) Pick(ctx *types.Context, pods []types.Pod) (*types.Result, error) { +func (rp *RandomPicker) Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result { ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(pods), pods)) i := rand.Intn(len(pods)) - return &types.Result{TargetPod: pods[i]}, nil + return &types.Result{TargetPod: pods[i]} } diff --git a/pkg/epp/scheduling/types/interfaces.go b/pkg/epp/scheduling/plugins/plugins.go similarity index 70% rename from pkg/epp/scheduling/types/interfaces.go rename to pkg/epp/scheduling/plugins/plugins.go index 6e954cef..4b334803 100644 --- a/pkg/epp/scheduling/types/interfaces.go +++ b/pkg/epp/scheduling/plugins/plugins.go @@ -14,28 +14,21 @@ See the License for the specific language governing permissions and limitations under the License. */ -package types +package plugins import ( - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) const ( PreSchedulerPluginType = "PreSchedule" - PostSchedulePluginType = "PostSchedule" FilterPluginType = "Filter" ScorerPluginType = "Scorer" + PostSchedulePluginType = "PostSchedule" PickerPluginType = "Picker" + PostResponsePluginType = "PostResponse" ) -type Pod interface { - GetPod() *backendmetrics.Pod - GetMetrics() *backendmetrics.Metrics - SetScore(float64) - Score() float64 - String() string -} - // Plugin defines the interface for scheduler plugins, combining scoring, filtering, // and event handling capabilities. type Plugin interface { @@ -47,29 +40,36 @@ type Plugin interface { // initialization work. type PreSchedule interface { Plugin - PreSchedule(ctx *Context) -} - -// PostSchedule is called by the scheduler after it selects a targetPod for the request. -type PostSchedule interface { - Plugin - PostSchedule(ctx *Context, res *Result) + PreSchedule(ctx *types.SchedulingContext) } // Filter defines the interface for filtering a list of pods based on context. 
type Filter interface { Plugin - Filter(ctx *Context, pods []Pod) ([]Pod, error) + Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod } // Scorer defines the interface for scoring pods based on context. type Scorer interface { Plugin - Score(ctx *Context, pod Pod) (float64, error) + Score(ctx *types.SchedulingContext, pod types.Pod) float64 +} + +// PostSchedule is called by the scheduler after it selects a targetPod for the request. +type PostSchedule interface { + Plugin + PostSchedule(ctx *types.SchedulingContext, res *types.Result) } // Picker picks the final pod(s) to send the request to. type Picker interface { Plugin - Pick(ctx *Context, pods []Pod) (*Result, error) + Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result +} + +// PostResponse is called by the scheduler after a successful response was sent. +// The given pod argument is the pod that served the request. +type PostResponse interface { + Plugin + PostResponse(ctx *types.SchedulingContext, pod types.Pod) } diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 7cc2bd96..beac5e6b 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -26,42 +26,44 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/filter" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins/picker" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + errutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/error" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) var ( - lowLatencyFilter = &plugins.DecisionTreeFilter{ - Current: plugins.LowQueueFilter, - NextOnSuccess: &plugins.DecisionTreeFilter{ - Current: plugins.LoRAAffinityFilter, - NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ - Current: plugins.LeastQueueFilter, - NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ - Current: plugins.LeastKVCacheFilter, + lowLatencyFilter = &filter.DecisionTreeFilter{ + Current: filter.LowQueueFilter, + NextOnSuccess: &filter.DecisionTreeFilter{ + Current: filter.LoRAAffinityFilter, + NextOnSuccessOrFailure: &filter.DecisionTreeFilter{ + Current: filter.LeastQueueFilter, + NextOnSuccessOrFailure: &filter.DecisionTreeFilter{ + Current: filter.LeastKVCacheFilter, }, }, }, - NextOnFailure: &plugins.DecisionTreeFilter{ - Current: plugins.LeastQueueFilter, - NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ - Current: plugins.LoRAAffinityFilter, - NextOnSuccessOrFailure: &plugins.DecisionTreeFilter{ - Current: plugins.LeastKVCacheFilter, + NextOnFailure: &filter.DecisionTreeFilter{ + Current: filter.LeastQueueFilter, + NextOnSuccessOrFailure: &filter.DecisionTreeFilter{ + Current: filter.LoRAAffinityFilter, + NextOnSuccessOrFailure: &filter.DecisionTreeFilter{ + Current: filter.LeastKVCacheFilter, }, }, }, } - sheddableRequestFilter = &plugins.DecisionTreeFilter{ + sheddableRequestFilter = &filter.DecisionTreeFilter{ // When there is at least one model server that's not queuing requests, and still has KV // cache below a certain threshold, we consider this model server has capacity to handle // a sheddable request without impacting critical requests. 
-		Current: plugins.HasCapacityFilter,
+		Current: filter.HasCapacityFilter,
 		NextOnSuccess: lowLatencyFilter,
 		// If all pods are queuing or running above the KVCache threshold, we drop the sheddable
-		// request to make room for critical requests.
-		NextOnFailure: plugins.DropRequestFilter,
+		// request to make room for critical requests. For this case, we don't define NextOnFailure.
 	}
 )
 
@@ -70,21 +72,21 @@ func NewScheduler(datastore Datastore) *Scheduler {
 	return &Scheduler{
 		datastore:           datastore,
-		preSchedulePlugins:  []types.PreSchedule{},
-		postSchedulePlugins: []types.PostSchedule{},
-		scorers:             []types.Scorer{},
-		filters:             []types.Filter{defaultPlugin},
+		preSchedulePlugins:  []plugins.PreSchedule{},
+		scorers:             []plugins.Scorer{},
+		filters:             []plugins.Filter{defaultPlugin},
+		postSchedulePlugins: []plugins.PostSchedule{},
 		picker:              defaultPlugin,
 	}
 }
 
 type Scheduler struct {
 	datastore           Datastore
-	preSchedulePlugins  []types.PreSchedule
-	postSchedulePlugins []types.PostSchedule
-	filters             []types.Filter
-	scorers             []types.Scorer
-	picker              types.Picker
+	preSchedulePlugins  []plugins.PreSchedule
+	filters             []plugins.Filter
+	scorers             []plugins.Scorer
+	postSchedulePlugins []plugins.PostSchedule
+	picker              plugins.Picker
 }
 
 type Datastore interface {
@@ -99,26 +101,21 @@ func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types
 	// Snapshot pod metrics from the datastore to:
 	// 1. Reduce concurrent access to the datastore.
 	// 2. Ensure consistent data during the scheduling operation of a request.
-	sCtx := types.NewContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll()))
+	sCtx := types.NewSchedulingContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll()))
 	loggerDebug.Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", sCtx.PodsSnapshot))
 
 	s.runPreSchedulePlugins(sCtx)
 
-	pods, err := s.runFilterPlugins(sCtx)
-	if err != nil {
-		return nil, err
+	pods := s.runFilterPlugins(sCtx)
+	if len(pods) == 0 {
+		return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: "failed to find a target pod"}
 	}
 
-	if err := s.runScorerPlugins(sCtx, pods); err != nil {
-		return nil, err
-	}
+	s.runScorerPlugins(sCtx, pods)
 
 	before := time.Now()
-	res, err := s.picker.Pick(sCtx, pods)
-	metrics.RecordSchedulerPluginProcessingLatency(types.PickerPluginType, s.picker.Name(), time.Since(before))
-	if err != nil {
-		return nil, err
-	}
+	res := s.picker.Pick(sCtx, pods)
+	metrics.RecordSchedulerPluginProcessingLatency(plugins.PickerPluginType, s.picker.Name(), time.Since(before))
 	loggerDebug.Info("After running picker plugins", "result", res)
 
 	s.runPostSchedulePlugins(sCtx, res)
@@ -126,91 +123,79 @@ func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types
 	return res, nil
 }
 
-func (s *Scheduler) runPreSchedulePlugins(ctx *types.Context) {
+func (s *Scheduler) runPreSchedulePlugins(ctx *types.SchedulingContext) {
 	for _, plugin := range s.preSchedulePlugins {
 		ctx.Logger.V(logutil.DEBUG).Info("Running pre-schedule plugin", "plugin", plugin.Name())
 		before := time.Now()
 		plugin.PreSchedule(ctx)
-		metrics.RecordSchedulerPluginProcessingLatency(types.PreSchedulerPluginType, plugin.Name(), time.Since(before))
+		metrics.RecordSchedulerPluginProcessingLatency(plugins.PreSchedulerPluginType, plugin.Name(), time.Since(before))
 	}
 }
 
-func (s *Scheduler) runPostSchedulePlugins(ctx *types.Context, res *types.Result) {
+func (s *Scheduler) runPostSchedulePlugins(ctx *types.SchedulingContext, res *types.Result) {
 	for _, plugin := range s.postSchedulePlugins {
ctx.Logger.V(logutil.DEBUG).Info("Running post-schedule plugin", "plugin", plugin.Name()) before := time.Now() plugin.PostSchedule(ctx, res) - metrics.RecordSchedulerPluginProcessingLatency(types.PostSchedulePluginType, plugin.Name(), time.Since(before)) + metrics.RecordSchedulerPluginProcessingLatency(plugins.PostSchedulePluginType, plugin.Name(), time.Since(before)) } } -func (s *Scheduler) runFilterPlugins(ctx *types.Context) ([]types.Pod, error) { +func (s *Scheduler) runFilterPlugins(ctx *types.SchedulingContext) []types.Pod { loggerDebug := ctx.Logger.V(logutil.DEBUG) - pods := ctx.PodsSnapshot - loggerDebug.Info("Before running filter plugins", "pods", pods) + filteredPods := ctx.PodsSnapshot + loggerDebug.Info("Before running filter plugins", "pods", filteredPods) + for _, filter := range s.filters { loggerDebug.Info("Running filter plugin", "plugin", filter.Name()) before := time.Now() - filteredPods, err := filter.Filter(ctx, pods) - metrics.RecordSchedulerPluginProcessingLatency(types.FilterPluginType, filter.Name(), time.Since(before)) - if err != nil || len(filteredPods) == 0 { - return nil, fmt.Errorf("failed to apply filter, resulted %v pods, this should never happen: %w", len(filteredPods), err) + filteredPods = filter.Filter(ctx, filteredPods) + metrics.RecordSchedulerPluginProcessingLatency(plugins.FilterPluginType, filter.Name(), time.Since(before)) + loggerDebug.Info("Filter plugin result", "plugin", filter.Name(), "pods", filteredPods) + if len(filteredPods) == 0 { + break } - pods = filteredPods - loggerDebug.Info("Filter plugin result", "plugin", filter.Name(), "pods", pods) } - loggerDebug.Info("After running filter plugins", "pods", pods) - return pods, nil + return filteredPods } -func (s *Scheduler) runScorerPlugins(ctx *types.Context, pods []types.Pod) error { +func (s *Scheduler) runScorerPlugins(ctx *types.SchedulingContext, pods []types.Pod) { loggerDebug := ctx.Logger.V(logutil.DEBUG) loggerDebug.Info("Before running score plugins", "pods", pods) for _, pod := range pods { - score, err := runScorersForPod(ctx, s.scorers, pod) - if err != nil { - return err - } + score := s.runScorersForPod(ctx, pod) pod.SetScore(score) } loggerDebug.Info("After running score plugins", "pods", pods) - return nil } // Iterate through each scorer in the chain and accumulate the scores. 
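Since scorers now return a bare float64, a concrete scorer that could sit in this chain is easy to sketch. The following is hypothetical, not part of the patch, and assumes the Metrics type exposes WaitingQueueSize as the filters in this package do:

```go
// Hypothetical scorer satisfying the error-free plugins.Scorer interface:
// pods with shorter request queues receive higher scores.
type queueScorer struct{}

func (s *queueScorer) Name() string { return "least-queue scorer" }

func (s *queueScorer) Score(ctx *types.SchedulingContext, pod types.Pod) float64 {
	// +1 avoids division by zero when the queue is empty.
	return 1.0 / float64(pod.GetMetrics().WaitingQueueSize+1)
}
```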
-func runScorersForPod(ctx *types.Context, scorers []types.Scorer, pod types.Pod) (float64, error) { +func (s *Scheduler) runScorersForPod(ctx *types.SchedulingContext, pod types.Pod) float64 { logger := ctx.Logger.WithValues("pod", pod.GetPod().NamespacedName).V(logutil.DEBUG) score := float64(0) - for _, scorer := range scorers { + for _, scorer := range s.scorers { logger.Info("Running scorer", "scorer", scorer.Name()) before := time.Now() - oneScore, err := scorer.Score(ctx, pod) - metrics.RecordSchedulerPluginProcessingLatency(types.ScorerPluginType, scorer.Name(), time.Since(before)) - if err != nil { - logger.Error(err, "Failed to calculate score for scorer", "scorer", scorer.Name()) - return 0, err - } + oneScore := scorer.Score(ctx, pod) + metrics.RecordSchedulerPluginProcessingLatency(plugins.ScorerPluginType, scorer.Name(), time.Since(before)) score += oneScore logger.Info("After scorer", "scorer", scorer.Name(), "score", oneScore, "total score", score) } - return score, nil + return score } type defaultPlugin struct { - plugins.RandomPicker + picker.RandomPicker } func (p *defaultPlugin) Name() string { return "DefaultPlugin" } -func (p *defaultPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { - req := ctx.Req - var filter types.Filter - if req.Critical { - filter = lowLatencyFilter - } else { - filter = sheddableRequestFilter +func (p *defaultPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { + if ctx.Req.Critical { + return lowLatencyFilter.Filter(ctx, pods) } - return filter.Filter(ctx, pods) + + return sheddableRequestFilter.Filter(ctx, pods) } diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index 5a2265bf..cb729038 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -18,12 +18,12 @@ package scheduling import ( "context" - "errors" "testing" "github.com/google/go-cmp/cmp" k8stypes "k8s.io/apimachinery/pkg/types" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" ) @@ -247,30 +247,22 @@ func TestSchedulePlugins(t *testing.T) { ScoreRes: 0.8, FilterRes: []k8stypes.NamespacedName{{Name: "pod1"}, {Name: "pod2"}}, } - tpFilterErr := &TestPlugin{ - NameRes: "filter err", - FilterErr: errors.New("filter error"), - } - tpScorerErr := &TestPlugin{ - NameRes: "score err", - ScoreErr: errors.New("score err"), + tp_filterAll := &TestPlugin{ + NameRes: "filter all", + FilterRes: []k8stypes.NamespacedName{}, } pickerPlugin := &TestPlugin{ NameRes: "picker", PickRes: k8stypes.NamespacedName{Name: "pod1"}, } - pickerErr := &TestPlugin{ - NameRes: "picker err", - PickErr: errors.New("picker err"), - } tests := []struct { name string - preSchedulePlugins []types.PreSchedule - postSchedulePlugins []types.PostSchedule - filters []types.Filter - scorers []types.Scorer - picker types.Picker + preSchedulePlugins []plugins.PreSchedule + filters []plugins.Filter + scorers []plugins.Scorer + postSchedulePlugins []plugins.PostSchedule + picker plugins.Picker input []*backendmetrics.FakePodMetrics wantTargetPod k8stypes.NamespacedName targetPodScore float64 @@ -280,10 +272,10 @@ func TestSchedulePlugins(t *testing.T) { }{ { name: "all plugins executed successfully", - preSchedulePlugins: []types.PreSchedule{tp1, tp2}, - postSchedulePlugins: 
[]types.PostSchedule{tp1, tp2},
-			filters:             []types.Filter{tp1, tp2},
-			scorers:             []types.Scorer{tp1, tp2},
+			preSchedulePlugins:  []plugins.PreSchedule{tp1, tp2},
+			filters:             []plugins.Filter{tp1, tp2},
+			scorers:             []plugins.Scorer{tp1, tp2},
+			postSchedulePlugins: []plugins.PostSchedule{tp1, tp2},
 			picker:              pickerPlugin,
 			input: []*backendmetrics.FakePodMetrics{
 				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
@@ -296,46 +288,19 @@ func TestSchedulePlugins(t *testing.T) {
 			err: false,
 		},
 		{
-			name:                "filter error",
-			preSchedulePlugins:  []types.PreSchedule{tp1, tp2},
-			postSchedulePlugins: []types.PostSchedule{tp1, tp2},
-			filters:             []types.Filter{tp1, tpFilterErr},
-			scorers:             []types.Scorer{tp1, tp2},
-			picker:              pickerPlugin,
-			input: []*backendmetrics.FakePodMetrics{
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
-			},
-			err: true,
-		},
-		{
-			name:                "scorer error",
-			preSchedulePlugins:  []types.PreSchedule{tp1, tp2},
-			postSchedulePlugins: []types.PostSchedule{tp1, tp2},
-			filters:             []types.Filter{tp1, tp2},
-			scorers:             []types.Scorer{tp1, tpScorerErr},
+			name:                "filter all",
+			preSchedulePlugins:  []plugins.PreSchedule{tp1, tp2},
+			filters:             []plugins.Filter{tp1, tp_filterAll},
+			scorers:             []plugins.Scorer{tp1, tp2},
+			postSchedulePlugins: []plugins.PostSchedule{tp1, tp2},
 			picker:              pickerPlugin,
 			input: []*backendmetrics.FakePodMetrics{
 				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
 				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
 				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
 			},
-			err: true,
-		},
-		{
-			name:                "picker error",
-			preSchedulePlugins:  []types.PreSchedule{tp1, tp2},
-			postSchedulePlugins: []types.PostSchedule{tp1, tp2},
-			filters:             []types.Filter{tp1, tp2},
-			scorers:             []types.Scorer{tp1, tp2},
-			picker:              pickerErr,
-			input: []*backendmetrics.FakePodMetrics{
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}},
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}},
-				{Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}},
-			},
-			err: true,
+			numPodsToScore: 0,
+			err:            true, // no available pods to serve after filter all
 		},
 	}
 
@@ -343,26 +308,26 @@ func TestSchedulePlugins(t *testing.T) {
 		t.Run(test.name, func(t *testing.T) {
 			// Reset all plugins before each new test case.
for _, plugin := range test.preSchedulePlugins { - plugin.(*TestPlugin).Reset() + plugin.(*TestPlugin).reset() } for _, plugin := range test.postSchedulePlugins { - plugin.(*TestPlugin).Reset() + plugin.(*TestPlugin).reset() } for _, plugin := range test.filters { - plugin.(*TestPlugin).Reset() + plugin.(*TestPlugin).reset() } for _, plugin := range test.scorers { - plugin.(*TestPlugin).Reset() + plugin.(*TestPlugin).reset() } - test.picker.(*TestPlugin).Reset() + test.picker.(*TestPlugin).reset() // Initialize the scheduler scheduler := &Scheduler{ datastore: &fakeDataStore{pods: test.input}, preSchedulePlugins: test.preSchedulePlugins, - postSchedulePlugins: test.postSchedulePlugins, filters: test.filters, scorers: test.scorers, + postSchedulePlugins: test.postSchedulePlugins, picker: test.picker, } @@ -397,13 +362,6 @@ func TestSchedulePlugins(t *testing.T) { } } - for _, plugin := range test.postSchedulePlugins { - tp, _ := plugin.(*TestPlugin) - if tp.PostScheduleCallCount != 1 { - t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", tp.NameRes, tp.PostScheduleCallCount) - } - } - for _, plugin := range test.filters { tp, _ := plugin.(*TestPlugin) if tp.FilterCallCount != 1 { @@ -418,6 +376,13 @@ func TestSchedulePlugins(t *testing.T) { } } + for _, plugin := range test.postSchedulePlugins { + tp, _ := plugin.(*TestPlugin) + if tp.PostScheduleCallCount != 1 { + t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", tp.NameRes, tp.PostScheduleCallCount) + } + } + tp, _ := test.picker.(*TestPlugin) if tp.PickCallCount != 1 { t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.NameRes, tp.PickCallCount) @@ -444,55 +409,49 @@ type TestPlugin struct { NameRes string ScoreCallCount int ScoreRes float64 - ScoreErr error FilterCallCount int FilterRes []k8stypes.NamespacedName - FilterErr error PreScheduleCallCount int PostScheduleCallCount int PickCallCount int PickRes k8stypes.NamespacedName - PickErr error } func (tp *TestPlugin) Name() string { return tp.NameRes } -func (tp *TestPlugin) Score(ctx *types.Context, pod types.Pod) (float64, error) { - tp.ScoreCallCount++ - return tp.ScoreRes, tp.ScoreErr +func (tp *TestPlugin) PreSchedule(ctx *types.SchedulingContext) { + tp.PreScheduleCallCount++ } -func (tp *TestPlugin) Filter(ctx *types.Context, pods []types.Pod) ([]types.Pod, error) { +func (tp *TestPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod { tp.FilterCallCount++ - return findPods(ctx, tp.FilterRes...), tp.FilterErr + return findPods(ctx, tp.FilterRes...) 
} -func (tp *TestPlugin) PreSchedule(ctx *types.Context) { - tp.PreScheduleCallCount++ +func (tp *TestPlugin) Score(ctx *types.SchedulingContext, pod types.Pod) float64 { + tp.ScoreCallCount++ + return tp.ScoreRes } -func (tp *TestPlugin) PostSchedule(ctx *types.Context, res *types.Result) { +func (tp *TestPlugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) { tp.PostScheduleCallCount++ } -func (tp *TestPlugin) Pick(ctx *types.Context, pods []types.Pod) (*types.Result, error) { +func (tp *TestPlugin) Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result { tp.PickCallCount++ - if tp.PickErr != nil { - return nil, tp.PickErr - } pod := findPods(ctx, tp.PickRes)[0] - return &types.Result{TargetPod: pod}, nil + return &types.Result{TargetPod: pod} } -func (tp *TestPlugin) Reset() { +func (tp *TestPlugin) reset() { tp.PreScheduleCallCount = 0 - tp.PostScheduleCallCount = 0 tp.FilterCallCount = 0 tp.ScoreCallCount = 0 + tp.PostScheduleCallCount = 0 tp.PickCallCount = 0 } -func findPods(ctx *types.Context, names ...k8stypes.NamespacedName) []types.Pod { +func findPods(ctx *types.SchedulingContext, names ...k8stypes.NamespacedName) []types.Pod { res := []types.Pod{} for _, pod := range ctx.PodsSnapshot { for _, name := range names { diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index e52e9047..e66b5fb5 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -40,8 +40,16 @@ func (r *LLMRequest) String() string { return fmt.Sprintf("Model: %s, TargetModels: %v, ResolvedTargetModel: %s, Critical: %t, PromptLength: %v", r.Model, r.TargetModels, r.ResolvedTargetModel, r.Critical, len(r.Prompt)) } -// Context holds contextual information during a scheduling operation. -type Context struct { +type Pod interface { + GetPod() *backendmetrics.Pod + GetMetrics() *backendmetrics.Metrics + SetScore(float64) + Score() float64 + String() string +} + +// SchedulingContext holds contextual information during a scheduling operation. 
+type SchedulingContext struct { context.Context Logger logr.Logger Req *LLMRequest @@ -77,9 +85,9 @@ type PodMetrics struct { *backendmetrics.Metrics } -func NewContext(ctx context.Context, req *LLMRequest, pods []Pod) *Context { +func NewSchedulingContext(ctx context.Context, req *LLMRequest, pods []Pod) *SchedulingContext { logger := log.FromContext(ctx).WithValues("request", req) - return &Context{ + return &SchedulingContext{ Context: ctx, Logger: logger, Req: req, From 9317e9b8abdb078a1bc49ba23adf8de6849b2387 Mon Sep 17 00:00:00 2001 From: nayihz Date: Thu, 24 Apr 2025 01:41:46 +0800 Subject: [PATCH 55/74] filter irrelevant pod in pod_reconciler (#696) --- pkg/epp/controller/pod_reconciler.go | 22 ++++++++++++++++++++++ pkg/epp/datastore/datastore.go | 3 +++ 2 files changed, 25 insertions(+) diff --git a/pkg/epp/controller/pod_reconciler.go b/pkg/epp/controller/pod_reconciler.go index 6d1af8d9..5f1df10d 100644 --- a/pkg/epp/controller/pod_reconciler.go +++ b/pkg/epp/controller/pod_reconciler.go @@ -26,7 +26,9 @@ import ( "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" podutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/pod" @@ -63,8 +65,28 @@ func (c *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R } func (c *PodReconciler) SetupWithManager(mgr ctrl.Manager) error { + filter := predicate.Funcs{ + CreateFunc: func(ce event.CreateEvent) bool { + pod := ce.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + UpdateFunc: func(ue event.UpdateEvent) bool { + oldPod := ue.ObjectOld.(*corev1.Pod) + newPod := ue.ObjectNew.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(oldPod.GetLabels()) || c.Datastore.PoolLabelsMatch(newPod.GetLabels()) + }, + DeleteFunc: func(de event.DeleteEvent) bool { + pod := de.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + GenericFunc: func(ge event.GenericEvent) bool { + pod := ge.Object.(*corev1.Pod) + return c.Datastore.PoolLabelsMatch(pod.GetLabels()) + }, + } return ctrl.NewControllerManagedBy(mgr). For(&corev1.Pod{}). + WithEventFilter(filter). 
Complete(c) } diff --git a/pkg/epp/datastore/datastore.go b/pkg/epp/datastore/datastore.go index f8378d25..22c50022 100644 --- a/pkg/epp/datastore/datastore.go +++ b/pkg/epp/datastore/datastore.go @@ -150,6 +150,9 @@ func (ds *datastore) PoolHasSynced() bool { func (ds *datastore) PoolLabelsMatch(podLabels map[string]string) bool { ds.poolAndModelsMu.RLock() defer ds.poolAndModelsMu.RUnlock() + if ds.pool == nil { + return false + } poolSelector := selectorFromInferencePoolSelector(ds.pool.Spec.Selector) podSet := labels.Set(podLabels) return poolSelector.Matches(podSet) From 9eeb2dccb0c01f8ca8adbd0a8ae94230001eea83 Mon Sep 17 00:00:00 2001 From: Daneyon Hansen Date: Wed, 23 Apr 2025 14:30:31 -0700 Subject: [PATCH 56/74] EPP: Update GetRandomPod() to return nil if no pods exist (#731) Signed-off-by: Daneyon Hansen --- pkg/epp/handlers/request.go | 3 ++ pkg/epp/handlers/server.go | 3 ++ pkg/epp/handlers/streamingserver_test.go | 55 ++++++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 9121b59a..8d30e543 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -138,6 +138,9 @@ func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *Requ // The above PR will address endpoint admission, but currently any request without a body will be // routed to a random upstream pod. pod := GetRandomPod(s.datastore) + if pod == nil { + return errutil.Error{Code: errutil.Internal, Msg: "no pods available in datastore"} + } pool, err := s.datastore.PoolGet() if err != nil { return err diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 2e3a35fe..5e23c7a0 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -449,6 +449,9 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed func GetRandomPod(ds datastore.Datastore) *backendmetrics.Pod { pods := ds.PodGetAll() + if len(pods) == 0 { + return nil + } number := rand.Intn(len(pods)) pod := pods[number] return pod.GetPod() diff --git a/pkg/epp/handlers/streamingserver_test.go b/pkg/epp/handlers/streamingserver_test.go index 72f7031a..23d2b68f 100644 --- a/pkg/epp/handlers/streamingserver_test.go +++ b/pkg/epp/handlers/streamingserver_test.go @@ -18,8 +18,14 @@ package handlers import ( "testing" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -126,6 +132,55 @@ func TestRandomWeightedDraw(t *testing.T) { } } +func TestGetRandomPod(t *testing.T) { + tests := []struct { + name string + storePods []*corev1.Pod + expectNil bool + }{ + { + name: "No pods available", + storePods: []*corev1.Pod{}, + expectNil: true, + }, + { + name: "Single pod available", + storePods: []*corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + }, + expectNil: false, + }, + { + name: "Multiple pods available", + storePods: []*corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "pod1"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "pod2"}}, + {ObjectMeta: metav1.ObjectMeta{Name: "pod3"}}, + }, + expectNil: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + pmf := metrics.NewPodMetricsFactory(&metrics.FakePodMetricsClient{}, time.Millisecond) + 
ds := datastore.NewDatastore(t.Context(), pmf) + for _, pod := range test.storePods { + ds.PodUpdateOrAddIfNotExist(pod) + } + + gotPod := GetRandomPod(ds) + + if test.expectNil && gotPod != nil { + t.Errorf("expected nil pod, got: %v", gotPod) + } + if !test.expectNil && gotPod == nil { + t.Errorf("expected non-nil pod, got nil") + } + }) + } +} + func pointer(v int32) *int32 { return &v } From 4c7fd64da7e0e1b39c89d79ff33cce244e44871a Mon Sep 17 00:00:00 2001 From: Maya Barnea Date: Thu, 24 Apr 2025 18:48:31 +0300 Subject: [PATCH 57/74] Move filter and scorer plugins registration to a separate file (#729) * Move filters and scorers registration to filter/scorer specific files * Default scheduler config contains empty list of scorers Signed-off-by: Maya Barnea * Default plugin is not a scorer any more Signed-off-by: Maya Barnea * fix scheduler test + lint comments Signed-off-by: Maya Barnea --------- Signed-off-by: Maya Barnea --- pkg/epp/scheduling/config.go | 27 ++++++++++ pkg/epp/scheduling/default_config.go | 31 +++++++++++ pkg/epp/scheduling/scheduler.go | 18 ++++--- pkg/epp/scheduling/scheduler_test.go | 81 ++++++++++++++-------------- 4 files changed, 110 insertions(+), 47 deletions(-) create mode 100644 pkg/epp/scheduling/config.go create mode 100644 pkg/epp/scheduling/default_config.go diff --git a/pkg/epp/scheduling/config.go b/pkg/epp/scheduling/config.go new file mode 100644 index 00000000..6c0f4be7 --- /dev/null +++ b/pkg/epp/scheduling/config.go @@ -0,0 +1,27 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package scheduling + +import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + +type SchedulerConfig struct { + preSchedulePlugins []plugins.PreSchedule + scorers []plugins.Scorer + filters []plugins.Filter + postSchedulePlugins []plugins.PostSchedule + picker plugins.Picker +} diff --git a/pkg/epp/scheduling/default_config.go b/pkg/epp/scheduling/default_config.go new file mode 100644 index 00000000..e42f1317 --- /dev/null +++ b/pkg/epp/scheduling/default_config.go @@ -0,0 +1,31 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scheduling + +import ( + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" +) + +var defPlugin = &defaultPlugin{} + +var defaultConfig = &SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{}, + scorers: []plugins.Scorer{}, + filters: []plugins.Filter{defPlugin}, + postSchedulePlugins: []plugins.PostSchedule{}, + picker: defPlugin, +} diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index beac5e6b..322f714f 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -68,16 +68,20 @@ var ( ) func NewScheduler(datastore Datastore) *Scheduler { - defaultPlugin := &defaultPlugin{} + return NewSchedulerWithConfig(datastore, defaultConfig) +} - return &Scheduler{ +func NewSchedulerWithConfig(datastore Datastore, config *SchedulerConfig) *Scheduler { + scheduler := &Scheduler{ datastore: datastore, - preSchedulePlugins: []plugins.PreSchedule{}, - scorers: []plugins.Scorer{}, - filters: []plugins.Filter{defaultPlugin}, - postSchedulePlugins: []plugins.PostSchedule{}, - picker: defaultPlugin, + preSchedulePlugins: config.preSchedulePlugins, + scorers: config.scorers, + filters: config.filters, + postSchedulePlugins: config.postSchedulePlugins, + picker: config.picker, } + + return scheduler } type Scheduler struct { diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index cb729038..2fb26a86 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -220,9 +220,17 @@ func TestSchedule(t *testing.T) { }, } + schedConfig := &SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{}, + scorers: []plugins.Scorer{}, + filters: []plugins.Filter{defPlugin}, + postSchedulePlugins: []plugins.PostSchedule{}, + picker: defPlugin, + } + for _, test := range tests { t.Run(test.name, func(t *testing.T) { - scheduler := NewScheduler(&fakeDataStore{pods: test.input}) + scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, schedConfig) got, err := scheduler.Schedule(context.Background(), test.req) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) @@ -257,26 +265,24 @@ func TestSchedulePlugins(t *testing.T) { } tests := []struct { - name string - preSchedulePlugins []plugins.PreSchedule - filters []plugins.Filter - scorers []plugins.Scorer - postSchedulePlugins []plugins.PostSchedule - picker plugins.Picker - input []*backendmetrics.FakePodMetrics - wantTargetPod k8stypes.NamespacedName - targetPodScore float64 + name string + config SchedulerConfig + input []*backendmetrics.FakePodMetrics + wantTargetPod k8stypes.NamespacedName + targetPodScore float64 // Number of expected pods to score (after filter) numPodsToScore int err bool }{ { - name: "all plugins executed successfully", - preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, - filters: []plugins.Filter{tp1, tp2}, - scorers: []plugins.Scorer{tp1, tp2}, - postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, - picker: pickerPlugin, + name: "all plugins executed successfully", + config: SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, + filters: []plugins.Filter{tp1, tp2}, + scorers: []plugins.Scorer{tp1, tp2}, + postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, + picker: pickerPlugin, + }, input: []*backendmetrics.FakePodMetrics{ {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, @@ 
-288,12 +294,14 @@ func TestSchedulePlugins(t *testing.T) { err: false, }, { - name: "filter all", - preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, - filters: []plugins.Filter{tp1, tp_filterAll}, - scorers: []plugins.Scorer{tp1, tp2}, - postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, - picker: pickerPlugin, + name: "filter all", + config: SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, + filters: []plugins.Filter{tp1, tp_filterAll}, + scorers: []plugins.Scorer{tp1, tp2}, + postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, + picker: pickerPlugin, + }, input: []*backendmetrics.FakePodMetrics{ {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, @@ -307,29 +315,22 @@ func TestSchedulePlugins(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { // Reset all plugins before each new test case. - for _, plugin := range test.preSchedulePlugins { + for _, plugin := range test.config.preSchedulePlugins { plugin.(*TestPlugin).reset() } - for _, plugin := range test.postSchedulePlugins { + for _, plugin := range test.config.postSchedulePlugins { plugin.(*TestPlugin).reset() } - for _, plugin := range test.filters { + for _, plugin := range test.config.filters { plugin.(*TestPlugin).reset() } - for _, plugin := range test.scorers { + for _, plugin := range test.config.scorers { plugin.(*TestPlugin).reset() } - test.picker.(*TestPlugin).reset() + test.config.picker.(*TestPlugin).reset() // Initialize the scheduler - scheduler := &Scheduler{ - datastore: &fakeDataStore{pods: test.input}, - preSchedulePlugins: test.preSchedulePlugins, - filters: test.filters, - scorers: test.scorers, - postSchedulePlugins: test.postSchedulePlugins, - picker: test.picker, - } + scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, &test.config) req := &types.LLMRequest{Model: "test-model"} got, err := scheduler.Schedule(context.Background(), req) @@ -355,35 +356,35 @@ func TestSchedulePlugins(t *testing.T) { } // Validate plugin execution counts dynamically - for _, plugin := range test.preSchedulePlugins { + for _, plugin := range test.config.preSchedulePlugins { tp, _ := plugin.(*TestPlugin) if tp.PreScheduleCallCount != 1 { t.Errorf("Plugin %s PreSchedule() called %d times, expected 1", tp.NameRes, tp.PreScheduleCallCount) } } - for _, plugin := range test.filters { + for _, plugin := range test.config.filters { tp, _ := plugin.(*TestPlugin) if tp.FilterCallCount != 1 { t.Errorf("Plugin %s Filter() called %d times, expected 1", tp.NameRes, tp.FilterCallCount) } } - for _, plugin := range test.scorers { + for _, plugin := range test.config.scorers { tp, _ := plugin.(*TestPlugin) if tp.ScoreCallCount != test.numPodsToScore { t.Errorf("Plugin %s Score() called %d times, expected 1", tp.NameRes, tp.ScoreCallCount) } } - for _, plugin := range test.postSchedulePlugins { + for _, plugin := range test.config.postSchedulePlugins { tp, _ := plugin.(*TestPlugin) if tp.PostScheduleCallCount != 1 { t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", tp.NameRes, tp.PostScheduleCallCount) } } - tp, _ := test.picker.(*TestPlugin) + tp, _ := test.config.picker.(*TestPlugin) if tp.PickCallCount != 1 { t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.NameRes, tp.PickCallCount) } From c8d0d62d0a4584d0557e078263a279f4e86e7c27 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Thu, 24 Apr 2025 14:20:30 -0700 
Subject: [PATCH 58/74] Update issue templates (#738) * Update issue templates * Updates artifacts for v0.3.0-rc.1 release Signed-off-by: Kellen Swain * Updates bbr chart for v0.3.0-rc.1 release Signed-off-by: Kellen Swain * Updates artifacts for v0.3.0 release Signed-off-by: Kellen Swain * Adding blank issue template so that all issues start with label --------- Signed-off-by: Kellen Swain --- .github/ISSUE_TEMPLATE/bug_request.md | 4 +++- .github/ISSUE_TEMPLATE/feature_request.md | 3 +-- .github/ISSUE_TEMPLATE/issue_template.md | 8 ++++++++ .github/ISSUE_TEMPLATE/new-release.md | 1 + 4 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/issue_template.md diff --git a/.github/ISSUE_TEMPLATE/bug_request.md b/.github/ISSUE_TEMPLATE/bug_request.md index c2597eb3..15ed35e1 100644 --- a/.github/ISSUE_TEMPLATE/bug_request.md +++ b/.github/ISSUE_TEMPLATE/bug_request.md @@ -1,7 +1,9 @@ --- name: Bug Report about: Report a bug you encountered -labels: kind/bug +title: '' +labels: kind/bug, needs-triage +assignees: '' --- diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 53a885c7..1eee5871 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -2,7 +2,7 @@ name: Feature request about: Suggest an idea for this project title: '' -labels: '' +labels: needs-triage assignees: '' --- @@ -12,4 +12,3 @@ assignees: '' **What would you like to be added**: **Why is this needed**: - diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/issue_template.md new file mode 100644 index 00000000..1a2c8c6f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/issue_template.md @@ -0,0 +1,8 @@ +--- +name: Blank Issue +about: '' +title: '' +labels: needs-triage +assignees: '' + +--- \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md index be569844..27e83784 100644 --- a/.github/ISSUE_TEMPLATE/new-release.md +++ b/.github/ISSUE_TEMPLATE/new-release.md @@ -4,6 +4,7 @@ about: Propose a new release title: Release v0.x.0 labels: '' assignees: '' + --- - [Introduction](#introduction) From b66a61c4b4753b9c5dedc26c0772490c9da9907e Mon Sep 17 00:00:00 2001 From: Shane Utt Date: Fri, 25 Apr 2025 10:31:02 -0400 Subject: [PATCH 59/74] docs: add concepts and definitions to README.md (#734) Signed-off-by: Shane Utt --- README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f7943d2f..ffd86758 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,56 @@ [![Go Reference](https://pkg.go.dev/badge/sigs.k8s.io/gateway-api-inference-extension.svg)](https://pkg.go.dev/sigs.k8s.io/gateway-api-inference-extension) [![License](https://img.shields.io/github/license/kubernetes-sigs/gateway-api-inference-extension)](/LICENSE) -# Gateway API Inference Extension +# Gateway API Inference Extension (GIE) + +This project offers tools for AI Inference, enabling developers to build [Inference Gateways]. + +[Inference Gateways]:#concepts-and-definitions + +## Concepts and Definitions + +The following are some key industry terms that are important to understand for +this project: + +- **Model**: A generative AI model that has learned patterns from data and is + used for inference. 
Models vary in size and architecture, from smaller
+ domain-specific models to massive multi-billion parameter neural networks that
+ are optimized for diverse language tasks.
+- **Inference**: The process of running a generative AI model, such as a large
+ language model or diffusion model, to generate text, embeddings, or other
+ outputs from input data.
+- **Model server**: A service (in our case, containerized) responsible for
+ receiving inference requests and returning predictions from a model.
+- **Accelerator**: Specialized hardware, such as Graphics Processing Units
+ (GPUs), that can be attached to Kubernetes nodes to speed up computations,
+ particularly for training and inference tasks.
+
+And the following are more specific terms to this project:
+
+- **Scheduler**: Makes decisions about which endpoint is optimal (best cost /
+ best performance) for an inference request based on `Metrics and Capabilities`
+ from [Model Serving](/docs/proposals/003-model-server-protocol/README.md).
+- **Metrics and Capabilities**: Data provided by model serving platforms about
+ performance, availability and capabilities to optimize routing. Includes
+ things like [Prefix Cache] status or [LoRA Adapters] availability.
+- **Endpoint Selector**: A `Scheduler` combined with `Metrics and Capabilities`
+ systems; together these are often referred to as an [Endpoint Selection Extension]
+ (this is also sometimes referred to as an "endpoint picker", or "EPP").
+- **Inference Gateway**: A proxy/load-balancer that has been coupled with an
+ `Endpoint Selector`. It provides optimized routing and load balancing for
+ serving Kubernetes self-hosted generative Artificial Intelligence (AI)
+ workloads. It simplifies the deployment, management, and observability of AI
+ inference workloads.
+
+For deeper insights and more advanced concepts, refer to our [proposals](/docs/proposals).
+
+[Inference]:https://www.digitalocean.com/community/tutorials/llm-inference-optimization
+[Gateway API]:https://github.com/kubernetes-sigs/gateway-api
+[Prefix Cache]:https://docs.vllm.ai/en/stable/design/v1/prefix_caching.html
+[LoRA Adapters]:https://docs.vllm.ai/en/stable/features/lora.html
+[Endpoint Selection Extension]:https://gateway-api-inference-extension.sigs.k8s.io/#endpoint-selection-extension
+
+## Technical Overview

This extension upgrades an [ext-proc](https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_filters/ext_proc_filter)-capable proxy or gateway - such as Envoy Gateway, kGateway, or the GKE Gateway - to become an **inference gateway** - supporting inference platform teams self-hosting large language models on Kubernetes. This integration makes it easy to expose and control access to your local [OpenAI-compatible chat completion endpoints](https://platform.openai.com/docs/api-reference/chat) to other workloads on or off cluster, or to integrate your self-hosted models alongside model-as-a-service providers in a higher-level **AI Gateway** like LiteLLM, Solo AI Gateway, or Apigee.
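To make the Scheduler / Scorer / Endpoint Selector concepts above concrete, here is a minimal, self-contained Go sketch of the filter, score, and pick flow an endpoint selector performs. It is illustrative only and does not use this repository's plugin interfaces; the `Endpoint` type, the queue-depth heuristic, and the 0.9 KV-cache cutoff are all invented for the example.

```go
package main

import "fmt"

// Endpoint is a hypothetical stand-in for a model-server pod and the
// metrics it reports (see "Metrics and Capabilities" above).
type Endpoint struct {
	Name         string
	QueueDepth   int     // requests currently waiting on this server
	KVCacheUsage float64 // fraction of KV cache in use, in [0,1]
}

// pickBest filters out saturated endpoints, scores the remainder, and
// returns the highest-scoring one: the essence of an endpoint selector.
func pickBest(endpoints []Endpoint) (Endpoint, error) {
	best, bestScore := Endpoint{}, -1.0
	for _, e := range endpoints {
		if e.KVCacheUsage > 0.9 { // filter: skip nearly-full KV caches
			continue
		}
		score := 1.0 / float64(1+e.QueueDepth) // scorer: favor short queues
		if score > bestScore {
			best, bestScore = e, score
		}
	}
	if bestScore < 0 {
		return Endpoint{}, fmt.Errorf("no viable endpoint")
	}
	return best, nil
}

func main() {
	candidates := []Endpoint{
		{Name: "pod1", QueueDepth: 3, KVCacheUsage: 0.10}, // score 0.25
		{Name: "pod2", QueueDepth: 0, KVCacheUsage: 0.95}, // filtered out
		{Name: "pod3", QueueDepth: 1, KVCacheUsage: 0.20}, // wins with 0.5
	}
	if chosen, err := pickBest(candidates); err == nil {
		fmt.Println("routing to", chosen.Name)
	}
}
```

A production selector additionally runs these stages as pluggable chains and feeds them live metrics, as the scheduler refactoring patches later in this series show.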
From 772ac4d69da2049304797a749f87511f61380660 Mon Sep 17 00:00:00 2001 From: Radhika Lakhtakia <137429298+rlakhtakia@users.noreply.github.com> Date: Fri, 25 Apr 2025 15:15:24 +0000 Subject: [PATCH 60/74] Add unit tests for pod APIs under pkg/datastore (#712) * Add unit test coverage for pod APIs under datastore/pkg * Add unit test coverage for pod APIs under datastore/pkg * Add unit test coverage for pod APIs under datastore/pkg * Add unit test coverage for pod APIs under datastore/pkg * EPP Architecture proposal (#683) * initial changes * Adding to proposal to give a quick barebones definition to refactor * feedback changes * more feedback addressing * removed unused Fake struct (#723) Signed-off-by: Nir Rozenbaum * epp: return correct response for trailers (#726) This looks like a copy paste error. * Refactor scheduler to run plugins (#677) * Refactor scheduler to run plugins * Add scheduler plugin latency metric * Address comments * Address comments * Complete the InferencePool documentation (#673) * Initial guide for inference pool * Add extensionReference to the InferencePool spec * Fix list formatting * Remove unused labels * Autogenerate the spec * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Update site-src/api-types/inferencepool.md Co-authored-by: Rob Scott * Rename llm-pool names in rollout example * Add use cases for replacing an inference pool * Rewording the background section * Create replacing-inference-pool.md * Replace instructions with a link for how to replace an inference pool * Update replacing-inference-pool.md * Update mkdocs.yml * Update replacing-inference-pool.md * Update inferencemodel_types.go * Update inferencepool.md * Update site-src/guides/replacing-inference-pool.md Co-authored-by: Rob Scott --------- Co-authored-by: Rob Scott * reduce log level in metrics logger not to trash the log (#708) * reduce log level in metrics logger not to trash the log Signed-off-by: Nir Rozenbaum * rename flush metrics to refresh metrics Signed-off-by: Nir Rozenbaum * revert log level Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum * few updates in datastore (#713) * few updates in datastore Signed-off-by: Nir Rozenbaum * PoolSet documentation Signed-off-by: Nir Rozenbaum * error phrasing Signed-off-by: Nir Rozenbaum * removed unused pool arg from PodUpdateOrAddIfNotExist Signed-off-by: Nir Rozenbaum * linter Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum * scheduler refactoring (#730) Signed-off-by: Nir Rozenbaum * filter irrelevant pod in pod_reconciler (#696) * EPP: Update GetRandomPod() to return nil if no pods exist (#731) Signed-off-by: Daneyon Hansen * Move filter and scorer plugins registration to a separate file (#729) * Move filters and scorers registration to filter/scorer specific files * Default scheduler config contains empty list of scorers Signed-off-by: Maya Barnea * Default plugin is not a scorer any more Signed-off-by: Maya Barnea * fix scheduler test + lint comments Signed-off-by: Maya Barnea --------- Signed-off-by: Maya Barnea * Update issue templates (#738) * Update issue templates * Updates artifacts for v0.3.0-rc.1 release Signed-off-by: Kellen Swain * Updates bbr chart for v0.3.0-rc.1 release Signed-off-by: 
Kellen Swain * Updates artifacts for v0.3.0 release Signed-off-by: Kellen Swain * Adding blank issue template so that all issues start with label --------- Signed-off-by: Kellen Swain * Add unit test coverage for pod APIs under datastore/pkg * few updates in datastore (#713) * few updates in datastore Signed-off-by: Nir Rozenbaum * PoolSet documentation Signed-off-by: Nir Rozenbaum * error phrasing Signed-off-by: Nir Rozenbaum * removed unused pool arg from PodUpdateOrAddIfNotExist Signed-off-by: Nir Rozenbaum * linter Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum * few updates in datastore (#713) * few updates in datastore Signed-off-by: Nir Rozenbaum * PoolSet documentation Signed-off-by: Nir Rozenbaum * error phrasing Signed-off-by: Nir Rozenbaum * removed unused pool arg from PodUpdateOrAddIfNotExist Signed-off-by: Nir Rozenbaum * linter Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum * Add unit test coverage for pod APIs under datastore/pkg --------- Signed-off-by: Nir Rozenbaum Signed-off-by: Daneyon Hansen Signed-off-by: Maya Barnea Signed-off-by: Kellen Swain Co-authored-by: Kellen Swain Co-authored-by: Nir Rozenbaum Co-authored-by: John Howard Co-authored-by: Cong Liu Co-authored-by: Nicole Xin Co-authored-by: Rob Scott Co-authored-by: nayihz Co-authored-by: Daneyon Hansen Co-authored-by: Maya Barnea --- pkg/epp/datastore/datastore_test.go | 91 +++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/pkg/epp/datastore/datastore_test.go b/pkg/epp/datastore/datastore_test.go index e8c77d37..b6466e6b 100644 --- a/pkg/epp/datastore/datastore_test.go +++ b/pkg/epp/datastore/datastore_test.go @@ -355,3 +355,94 @@ func TestMetrics(t *testing.T) { }) } } + +func TestPods(t *testing.T) { + updatedPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + }, + Spec: corev1.PodSpec{ + NodeName: "node-1", + }, + } + tests := []struct { + name string + op func(ctx context.Context, ds Datastore) + existingPods []*corev1.Pod + wantPods []*corev1.Pod + }{ + { + name: "Add new pod, no existing pods, should add", + existingPods: []*corev1.Pod{}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(pod1) + }, + }, + { + name: "Add new pod, with existing pods, should add", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1, pod2}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(pod2) + }, + }, + { + name: "Update existing pod, new field, should update", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{updatedPod}, + op: func(ctx context.Context, ds Datastore) { + ds.PodUpdateOrAddIfNotExist(updatedPod) + }, + }, + { + name: "Update existing pod, no new fields, should not update", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + incoming := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod1", + Namespace: "default", + }, + } + ds.PodUpdateOrAddIfNotExist(incoming) + }, + }, + { + name: "Delete the pod", + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodDelete(pod2NamespacedName) + }, + }, + { + name: "Delete the pod that doesn't exist", + existingPods: []*corev1.Pod{pod1}, + wantPods: []*corev1.Pod{pod1}, + op: func(ctx context.Context, ds Datastore) { + ds.PodDelete(pod2NamespacedName) + }, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx := 
context.Background()
+ pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.FakePodMetricsClient{}, time.Second)
+ ds := NewDatastore(ctx, pmf)
+ for _, pod := range test.existingPods {
+ ds.PodUpdateOrAddIfNotExist(pod)
+ }
+
+ test.op(ctx, ds)
+ var gotPods []*corev1.Pod
+ for _, pm := range ds.PodGetAll() {
+ pod := &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: pm.GetPod().NamespacedName.Name, Namespace: pm.GetPod().NamespacedName.Namespace}, Status: corev1.PodStatus{PodIP: pm.GetPod().Address}}
+ gotPods = append(gotPods, pod)
+ }
+ if !cmp.Equal(gotPods, test.wantPods, cmpopts.SortSlices(func(a, b *corev1.Pod) bool { return a.Name < b.Name })) {
+ t.Errorf("got (%v) != want (%v);", gotPods, test.wantPods)
+ }
+ })
+ }
+}

From 60f8c57bb95b656a75d27564d5ff01c060bcdba5 Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Fri, 25 Apr 2025 21:37:23 +0300
Subject: [PATCH 61/74] added a target dedicated for running unit-test only
 (#739)

* added a target dedicated for running unit-test only. This is very useful
during development.

Signed-off-by: Nir Rozenbaum

* code review

Signed-off-by: Nir Rozenbaum

---------

Signed-off-by: Nir Rozenbaum
---
 Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index a1845560..563e0ce9 100644
--- a/Makefile
+++ b/Makefile
@@ -123,8 +123,12 @@ vet: ## Run go vet against code.
 test: manifests generate fmt vet envtest image-build ## Run tests.
 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out

+.PHONY: test-unit
+test-unit: ## Run unit tests.
+ KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./pkg/... -race -coverprofile cover.out
+
 .PHONY: test-integration
-test-integration: ## Run tests.
+test-integration: ## Run integration tests.
 KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test ./test/integration/epp/...
-race -coverprofile cover.out

 .PHONY: test-e2e

From 1a871729f3af64840aac85b4cf7f861880f35a8a Mon Sep 17 00:00:00 2001
From: Kellen Swain
Date: Fri, 25 Apr 2025 12:25:23 -0700
Subject: [PATCH 62/74] Updating proposal directories to match their PR number
 (#741)

---
 .../README.md | 0
 .../images/epp_arch.svg | 0
 docs/proposals/README.md | 5 +++++
 3 files changed, 5 insertions(+)
 rename docs/proposals/{00x-epp-compliance-proposal => 0683-epp-architecture-proposal}/README.md (100%)
 rename docs/proposals/{00x-epp-compliance-proposal => 0683-epp-architecture-proposal}/images/epp_arch.svg (100%)
 create mode 100644 docs/proposals/README.md

diff --git a/docs/proposals/00x-epp-compliance-proposal/README.md b/docs/proposals/0683-epp-architecture-proposal/README.md
similarity index 100%
rename from docs/proposals/00x-epp-compliance-proposal/README.md
rename to docs/proposals/0683-epp-architecture-proposal/README.md
diff --git a/docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg b/docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg
similarity index 100%
rename from docs/proposals/00x-epp-compliance-proposal/images/epp_arch.svg
rename to docs/proposals/0683-epp-architecture-proposal/images/epp_arch.svg
diff --git a/docs/proposals/README.md b/docs/proposals/README.md
new file mode 100644
index 00000000..2b0408d3
--- /dev/null
+++ b/docs/proposals/README.md
@@ -0,0 +1,5 @@
+# Proposals Best Practices


+## Naming
+The directory of the proposal should lead with a 4-digit PR number (this will move to 5 or 6 digits should our PR count get that high), followed by a kebab-cased title. The PR number is not known until the PR is cut, so development can use a placeholder, e.g. XXXX-my-proposal. The PR number is used because it is unique and chronological, allowing the default ordering of proposals to follow the timeline of development.
\ No newline at end of file

From ddc3d6992d41f515ef31d6d67fbba5c8aacb451c Mon Sep 17 00:00:00 2001
From: Kellen Swain
Date: Fri, 25 Apr 2025 12:43:24 -0700
Subject: [PATCH 63/74] fixing errors in new template & disabling the default
 blank template (#742)

---
 .github/ISSUE_TEMPLATE/{issue_template.md => blank_issue.md} | 2 +-
 .github/ISSUE_TEMPLATE/config.yml | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 rename .github/ISSUE_TEMPLATE/{issue_template.md => blank_issue.md} (64%)
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml

diff --git a/.github/ISSUE_TEMPLATE/issue_template.md b/.github/ISSUE_TEMPLATE/blank_issue.md
similarity index 64%
rename from .github/ISSUE_TEMPLATE/issue_template.md
rename to .github/ISSUE_TEMPLATE/blank_issue.md
index 1a2c8c6f..dd6ebabf 100644
--- a/.github/ISSUE_TEMPLATE/issue_template.md
+++ b/.github/ISSUE_TEMPLATE/blank_issue.md
@@ -1,6 +1,6 @@
 ---
 name: Blank Issue
-about: ''
+about: Create a new issue from scratch
 title: ''
 labels: needs-triage
 assignees: ''
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..3ba13e0c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false

From e845173f9488605b941caa532a4e98abd5cca640 Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Sun, 27 Apr 2025 19:41:25 +0300
Subject: [PATCH 64/74] fixed broken link to implementations (#750)

Signed-off-by: Nir Rozenbaum
---
 site-src/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/site-src/index.md b/site-src/index.md
index 04d1fadb..61bece27 100644
--- a/site-src/index.md
+++ b/site-src/index.md
@@ -91,7 +91,7 @@
 This project is being driven by
 [WG-Serving](https://github.com/kubernetes/community/tree/master/wg-serving)
 [SIG-Network](https://github.com/kubernetes/community/tree/master/sig-network)
 to improve and standardize routing to inference workloads in Kubernetes. Check
-out the [implementations reference](implementations.md) to see the latest
+out the [implementations reference](implementations/gateways.md) to see the latest
 projects & products that support this project.
If you are interested in contributing to or building an implementation using Gateway API then don’t hesitate to [get involved!](/contributing)

From 855436e23577a6ef6d2dfe9ea2fe6668c9461838 Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Mon, 28 Apr 2025 12:25:28 +0300
Subject: [PATCH 65/74] Weighted scorers (#737)

* removed unused noop plugin

Signed-off-by: Nir Rozenbaum

* more scheduler refactoring

Signed-off-by: Nir Rozenbaum

* more refactoring

Signed-off-by: Nir Rozenbaum

* added weights to scorers and calculating weighted score

Signed-off-by: Nir Rozenbaum

* addressed code review comments

Signed-off-by: Nir Rozenbaum

---------

Signed-off-by: Nir Rozenbaum
---
 pkg/epp/scheduling/config.go | 18 ++-
 pkg/epp/scheduling/default_config.go | 31 ----
 .../scheduling/plugins/filter/filter_test.go | 6 +-
 pkg/epp/scheduling/plugins/noop.go | 42 ------
 .../plugins/picker/random_picker.go | 12 +-
 pkg/epp/scheduling/plugins/plugins.go | 17 ++-
 pkg/epp/scheduling/scheduler.go | 96 +++++++-----
 pkg/epp/scheduling/scheduler_test.go | 141 ++++++++++++------
 pkg/epp/scheduling/types/types.go | 16 +-
 9 files changed, 190 insertions(+), 189 deletions(-)
 delete mode 100644 pkg/epp/scheduling/default_config.go
 delete mode 100644 pkg/epp/scheduling/plugins/noop.go

diff --git a/pkg/epp/scheduling/config.go b/pkg/epp/scheduling/config.go
index 6c0f4be7..4ed109af 100644
--- a/pkg/epp/scheduling/config.go
+++ b/pkg/epp/scheduling/config.go
@@ -20,8 +20,22 @@ import "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"

 type SchedulerConfig struct {
 preSchedulePlugins []plugins.PreSchedule
- scorers []plugins.Scorer
 filters []plugins.Filter
- postSchedulePlugins []plugins.PostSchedule
+ scorers map[plugins.Scorer]int // map from scorer to weight
 picker plugins.Picker
+ postSchedulePlugins []plugins.PostSchedule
 }
+
+var defPlugin = &defaultPlugin{}
+
+// When the scheduler is initialized with the NewScheduler function, this config will be used as the default.
+// It's possible to call NewSchedulerWithConfig to pass a different configuration.
+
+// For build-time plugin changes, it's recommended to change the defaultConfig variable in this file.
+var defaultConfig = &SchedulerConfig{
+ preSchedulePlugins: []plugins.PreSchedule{},
+ filters: []plugins.Filter{defPlugin},
+ scorers: map[plugins.Scorer]int{},
+ picker: defPlugin,
+ postSchedulePlugins: []plugins.PostSchedule{},
+}
diff --git a/pkg/epp/scheduling/default_config.go b/pkg/epp/scheduling/default_config.go
deleted file mode 100644
index e42f1317..00000000
--- a/pkg/epp/scheduling/default_config.go
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-Copyright 2025 The Kubernetes Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/ - -package scheduling - -import ( - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" -) - -var defPlugin = &defaultPlugin{} - -var defaultConfig = &SchedulerConfig{ - preSchedulePlugins: []plugins.PreSchedule{}, - scorers: []plugins.Scorer{}, - filters: []plugins.Filter{defPlugin}, - postSchedulePlugins: []plugins.PostSchedule{}, - picker: defPlugin, -} diff --git a/pkg/epp/scheduling/plugins/filter/filter_test.go b/pkg/epp/scheduling/plugins/filter/filter_test.go index 56cccb3b..a06ec3ca 100644 --- a/pkg/epp/scheduling/plugins/filter/filter_test.go +++ b/pkg/epp/scheduling/plugins/filter/filter_test.go @@ -54,8 +54,7 @@ func TestFilter(t *testing.T) { ctx := types.NewSchedulingContext(context.Background(), test.req, test.input) got := test.filter.Filter(ctx, test.input) - opt := cmp.AllowUnexported(types.PodMetrics{}) - if diff := cmp.Diff(test.output, got, opt); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -190,8 +189,7 @@ func TestFilterFunc(t *testing.T) { ctx := types.NewSchedulingContext(context.Background(), test.req, test.input) got := test.f(ctx, test.input) - opt := cmp.AllowUnexported(types.PodMetrics{}) - if diff := cmp.Diff(test.output, got, opt); diff != "" { + if diff := cmp.Diff(test.output, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) diff --git a/pkg/epp/scheduling/plugins/noop.go b/pkg/epp/scheduling/plugins/noop.go deleted file mode 100644 index 8f50ff36..00000000 --- a/pkg/epp/scheduling/plugins/noop.go +++ /dev/null @@ -1,42 +0,0 @@ -/* -Copyright 2025 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package plugins - -import ( - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" -) - -// NoopPlugin provides a default, no-operation implementation of the Plugin interface. -// It can be embedded in other plugin implementations to avoid boilerplate code for -// unused methods. 
-type NoopPlugin struct{} - -func (p *NoopPlugin) Name() string { return "NoopPlugin" } - -func (p *NoopPlugin) PreSchedule(ctx *types.SchedulingContext) {} - -func (p *NoopPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) ([]types.Pod, error) { - return pods, nil -} - -func (p *NoopPlugin) Score(ctx *types.SchedulingContext, pod types.Pod) (float64, error) { - return 0.0, nil -} - -func (p *NoopPlugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) {} - -func (p *NoopPlugin) PostResponse(ctx *types.SchedulingContext, pod types.Pod) {} diff --git a/pkg/epp/scheduling/plugins/picker/random_picker.go b/pkg/epp/scheduling/plugins/picker/random_picker.go index 850108e7..6eecbb0d 100644 --- a/pkg/epp/scheduling/plugins/picker/random_picker.go +++ b/pkg/epp/scheduling/plugins/picker/random_picker.go @@ -20,18 +20,22 @@ import ( "fmt" "math/rand" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) +var _ plugins.Picker = &RandomPicker{} + +// RandomPicker picks a random pod from the list of candidates. type RandomPicker struct{} func (rp *RandomPicker) Name() string { return "random" } -func (rp *RandomPicker) Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result { - ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(pods), pods)) - i := rand.Intn(len(pods)) - return &types.Result{TargetPod: pods[i]} +func (rp *RandomPicker) Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result { + ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(scoredPods), scoredPods)) + i := rand.Intn(len(scoredPods)) + return &types.Result{TargetPod: scoredPods[i].Pod} } diff --git a/pkg/epp/scheduling/plugins/plugins.go b/pkg/epp/scheduling/plugins/plugins.go index 4b334803..f3412ab7 100644 --- a/pkg/epp/scheduling/plugins/plugins.go +++ b/pkg/epp/scheduling/plugins/plugins.go @@ -49,22 +49,23 @@ type Filter interface { Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod } -// Scorer defines the interface for scoring pods based on context. +// Scorer defines the interface for scoring a list of pods based on context. +// Scorers must score pods with a value within the range of [0,1] where 1 is the highest score. type Scorer interface { Plugin - Score(ctx *types.SchedulingContext, pod types.Pod) float64 + Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 } -// PostSchedule is called by the scheduler after it selects a targetPod for the request. -type PostSchedule interface { +// Picker picks the final pod(s) to send the request to. +type Picker interface { Plugin - PostSchedule(ctx *types.SchedulingContext, res *types.Result) + Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result } -// Picker picks the final pod(s) to send the request to. -type Picker interface { +// PostSchedule is called by the scheduler after it selects a targetPod for the request. +type PostSchedule interface { Plugin - Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result + PostSchedule(ctx *types.SchedulingContext, res *types.Result) } // PostResponse is called by the scheduler after a successful response was sent. 
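The reworked `Scorer` contract above scores a whole list of pods at once and returns a map of per-pod scores in [0,1]. As a rough illustration, a scorer that favors pods with shorter waiting queues might look like the sketch below. It is not part of this patch: the `queueScorer` name, its package, and the queue-depth heuristic are invented for the example; only the `plugins.Scorer` signature and the [0,1] convention are taken from the interfaces shown above.

```go
package scorer

import (
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins"
	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types"
)

var _ plugins.Scorer = &queueScorer{}

// queueScorer is a hypothetical Scorer that favors pods with shorter
// waiting queues; 1/(1+queue) keeps every score within (0,1].
type queueScorer struct{}

func (s *queueScorer) Name() string { return "queue" }

func (s *queueScorer) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
	scores := make(map[types.Pod]float64, len(pods))
	for _, pod := range pods {
		scores[pod] = 1.0 / float64(1+pod.GetMetrics().WaitingQueueSize)
	}
	return scores
}
```

Under the weighted-scorers model this patch introduces, such a scorer would be registered with a weight in the `scorers` map of `SchedulerConfig`, for example `map[plugins.Scorer]int{&queueScorer{}: 1}`.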
diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go index 322f714f..04d24ea2 100644 --- a/pkg/epp/scheduling/scheduler.go +++ b/pkg/epp/scheduling/scheduler.go @@ -72,25 +72,23 @@ func NewScheduler(datastore Datastore) *Scheduler { } func NewSchedulerWithConfig(datastore Datastore, config *SchedulerConfig) *Scheduler { - scheduler := &Scheduler{ + return &Scheduler{ datastore: datastore, preSchedulePlugins: config.preSchedulePlugins, - scorers: config.scorers, filters: config.filters, - postSchedulePlugins: config.postSchedulePlugins, + scorers: config.scorers, picker: config.picker, + postSchedulePlugins: config.postSchedulePlugins, } - - return scheduler } type Scheduler struct { datastore Datastore preSchedulePlugins []plugins.PreSchedule filters []plugins.Filter - scorers []plugins.Scorer - postSchedulePlugins []plugins.PostSchedule + scorers map[plugins.Scorer]int // map from scorer to its weight picker plugins.Picker + postSchedulePlugins []plugins.PostSchedule } type Datastore interface { @@ -106,7 +104,7 @@ func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types // 1. Reduce concurrent access to the datastore. // 2. Ensure consistent data during the scheduling operation of a request. sCtx := types.NewSchedulingContext(ctx, req, types.ToSchedulerPodMetrics(s.datastore.PodGetAll())) - loggerDebug.Info(fmt.Sprintf("Scheduling a request. Metrics: %+v", sCtx.PodsSnapshot)) + loggerDebug.Info(fmt.Sprintf("Scheduling a request, Metrics: %+v", sCtx.PodsSnapshot)) s.runPreSchedulePlugins(sCtx) @@ -114,17 +112,14 @@ func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types if len(pods) == 0 { return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: "failed to find a target pod"} } + // if we got here, there is at least one pod to score + weightedScorePerPod := s.runScorerPlugins(sCtx, pods) - s.runScorerPlugins(sCtx, pods) - - before := time.Now() - res := s.picker.Pick(sCtx, pods) - metrics.RecordSchedulerPluginProcessingLatency(plugins.PickerPluginType, s.picker.Name(), time.Since(before)) - loggerDebug.Info("After running picker plugins", "result", res) + result := s.runPickerPlugin(sCtx, weightedScorePerPod) - s.runPostSchedulePlugins(sCtx, res) + s.runPostSchedulePlugins(sCtx, result) - return res, nil + return result, nil } func (s *Scheduler) runPreSchedulePlugins(ctx *types.SchedulingContext) { @@ -136,15 +131,6 @@ func (s *Scheduler) runPreSchedulePlugins(ctx *types.SchedulingContext) { } } -func (s *Scheduler) runPostSchedulePlugins(ctx *types.SchedulingContext, res *types.Result) { - for _, plugin := range s.postSchedulePlugins { - ctx.Logger.V(logutil.DEBUG).Info("Running post-schedule plugin", "plugin", plugin.Name()) - before := time.Now() - plugin.PostSchedule(ctx, res) - metrics.RecordSchedulerPluginProcessingLatency(plugins.PostSchedulePluginType, plugin.Name(), time.Since(before)) - } -} - func (s *Scheduler) runFilterPlugins(ctx *types.SchedulingContext) []types.Pod { loggerDebug := ctx.Logger.V(logutil.DEBUG) filteredPods := ctx.PodsSnapshot @@ -160,32 +146,60 @@ func (s *Scheduler) runFilterPlugins(ctx *types.SchedulingContext) []types.Pod { break } } + loggerDebug.Info("After running filter plugins") + return filteredPods } -func (s *Scheduler) runScorerPlugins(ctx *types.SchedulingContext, pods []types.Pod) { +func (s *Scheduler) runScorerPlugins(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 { loggerDebug := ctx.Logger.V(logutil.DEBUG) - 
loggerDebug.Info("Before running score plugins", "pods", pods) + loggerDebug.Info("Before running scorer plugins", "pods", pods) + + weightedScorePerPod := make(map[types.Pod]float64, len(pods)) for _, pod := range pods { - score := s.runScorersForPod(ctx, pod) - pod.SetScore(score) + weightedScorePerPod[pod] = float64(0) // initialize weighted score per pod with 0 value + } + // Iterate through each scorer in the chain and accumulate the weighted scores. + for scorer, weight := range s.scorers { + loggerDebug.Info("Running scorer", "scorer", scorer.Name()) + before := time.Now() + scores := scorer.Score(ctx, pods) + metrics.RecordSchedulerPluginProcessingLatency(plugins.ScorerPluginType, scorer.Name(), time.Since(before)) + for pod, score := range scores { // weight is relative to the sum of weights + weightedScorePerPod[pod] += score * float64(weight) // TODO normalize score before multiply with weight + } + loggerDebug.Info("After running scorer", "scorer", scorer.Name()) + } + loggerDebug.Info("After running scorer plugins") + + return weightedScorePerPod +} + +func (s *Scheduler) runPickerPlugin(ctx *types.SchedulingContext, weightedScorePerPod map[types.Pod]float64) *types.Result { + loggerDebug := ctx.Logger.V(logutil.DEBUG) + scoredPods := make([]*types.ScoredPod, len(weightedScorePerPod)) + i := 0 + for pod, score := range weightedScorePerPod { + scoredPods[i] = &types.ScoredPod{Pod: pod, Score: score} + i++ } - loggerDebug.Info("After running score plugins", "pods", pods) + + loggerDebug.Info("Before running picker plugin", "pods", weightedScorePerPod) + before := time.Now() + result := s.picker.Pick(ctx, scoredPods) + metrics.RecordSchedulerPluginProcessingLatency(plugins.PickerPluginType, s.picker.Name(), time.Since(before)) + loggerDebug.Info("After running picker plugin", "result", result) + + return result } -// Iterate through each scorer in the chain and accumulate the scores. 
-func (s *Scheduler) runScorersForPod(ctx *types.SchedulingContext, pod types.Pod) float64 { - logger := ctx.Logger.WithValues("pod", pod.GetPod().NamespacedName).V(logutil.DEBUG) - score := float64(0) - for _, scorer := range s.scorers { - logger.Info("Running scorer", "scorer", scorer.Name()) +func (s *Scheduler) runPostSchedulePlugins(ctx *types.SchedulingContext, res *types.Result) { + for _, plugin := range s.postSchedulePlugins { + ctx.Logger.V(logutil.DEBUG).Info("Running post-schedule plugin", "plugin", plugin.Name()) before := time.Now() - oneScore := scorer.Score(ctx, pod) - metrics.RecordSchedulerPluginProcessingLatency(plugins.ScorerPluginType, scorer.Name(), time.Since(before)) - score += oneScore - logger.Info("After scorer", "scorer", scorer.Name(), "score", oneScore, "total score", score) + plugin.PostSchedule(ctx, res) + metrics.RecordSchedulerPluginProcessingLatency(plugins.PostSchedulePluginType, plugin.Name(), time.Since(before)) } - return score } type defaultPlugin struct { diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index 2fb26a86..559f53f8 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -220,24 +220,15 @@ func TestSchedule(t *testing.T) { }, } - schedConfig := &SchedulerConfig{ - preSchedulePlugins: []plugins.PreSchedule{}, - scorers: []plugins.Scorer{}, - filters: []plugins.Filter{defPlugin}, - postSchedulePlugins: []plugins.PostSchedule{}, - picker: defPlugin, - } - for _, test := range tests { t.Run(test.name, func(t *testing.T) { - scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, schedConfig) + scheduler := NewScheduler(&fakeDataStore{pods: test.input}) got, err := scheduler.Schedule(context.Background(), test.req) if test.err != (err != nil) { t.Errorf("Unexpected error, got %v, want %v", err, test.err) } - opt := cmp.AllowUnexported(types.PodMetrics{}) - if diff := cmp.Diff(test.wantRes, got, opt); diff != "" { + if diff := cmp.Diff(test.wantRes, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } }) @@ -275,13 +266,16 @@ func TestSchedulePlugins(t *testing.T) { err bool }{ { - name: "all plugins executed successfully", + name: "all plugins executed successfully, all scorers with same weight", config: SchedulerConfig{ - preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, - filters: []plugins.Filter{tp1, tp2}, - scorers: []plugins.Scorer{tp1, tp2}, - postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, + preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, + filters: []plugins.Filter{tp1, tp2}, + scorers: map[plugins.Scorer]int{ + tp1: 1, + tp2: 1, + }, picker: pickerPlugin, + postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, }, input: []*backendmetrics.FakePodMetrics{ {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, @@ -294,13 +288,38 @@ func TestSchedulePlugins(t *testing.T) { err: false, }, { - name: "filter all", + name: "all plugins executed successfully, different scorers weights", config: SchedulerConfig{ - preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, - filters: []plugins.Filter{tp1, tp_filterAll}, - scorers: []plugins.Scorer{tp1, tp2}, + preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, + filters: []plugins.Filter{tp1, tp2}, + scorers: map[plugins.Scorer]int{ + tp1: 60, + tp2: 40, + }, + picker: pickerPlugin, postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, + }, + input: []*backendmetrics.FakePodMetrics{ + {Pod: &backendmetrics.Pod{NamespacedName: 
k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + }, + wantTargetPod: k8stypes.NamespacedName{Name: "pod1"}, + targetPodScore: 50, + numPodsToScore: 2, + err: false, + }, + { + name: "filter all", + config: SchedulerConfig{ + preSchedulePlugins: []plugins.PreSchedule{tp1, tp2}, + filters: []plugins.Filter{tp1, tp_filterAll}, + scorers: map[plugins.Scorer]int{ + tp1: 1, + tp2: 1, + }, picker: pickerPlugin, + postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, }, input: []*backendmetrics.FakePodMetrics{ {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, @@ -318,16 +337,16 @@ func TestSchedulePlugins(t *testing.T) { for _, plugin := range test.config.preSchedulePlugins { plugin.(*TestPlugin).reset() } - for _, plugin := range test.config.postSchedulePlugins { - plugin.(*TestPlugin).reset() - } for _, plugin := range test.config.filters { plugin.(*TestPlugin).reset() } - for _, plugin := range test.config.scorers { + for plugin := range test.config.scorers { plugin.(*TestPlugin).reset() } test.config.picker.(*TestPlugin).reset() + for _, plugin := range test.config.postSchedulePlugins { + plugin.(*TestPlugin).reset() + } // Initialize the scheduler scheduler := NewSchedulerWithConfig(&fakeDataStore{pods: test.input}, &test.config) @@ -345,13 +364,11 @@ func TestSchedulePlugins(t *testing.T) { } // Validate output - opt := cmp.AllowUnexported(types.PodMetrics{}) wantPod := &types.PodMetrics{ Pod: &backendmetrics.Pod{NamespacedName: test.wantTargetPod}, } - wantPod.SetScore(test.targetPodScore) wantRes := &types.Result{TargetPod: wantPod} - if diff := cmp.Diff(wantRes, got, opt); diff != "" { + if diff := cmp.Diff(wantRes, got); diff != "" { t.Errorf("Unexpected output (-want +got): %v", diff) } @@ -359,36 +376,44 @@ func TestSchedulePlugins(t *testing.T) { for _, plugin := range test.config.preSchedulePlugins { tp, _ := plugin.(*TestPlugin) if tp.PreScheduleCallCount != 1 { - t.Errorf("Plugin %s PreSchedule() called %d times, expected 1", tp.NameRes, tp.PreScheduleCallCount) + t.Errorf("Plugin %s PreSchedule() called %d times, expected 1", plugin.Name(), tp.PreScheduleCallCount) } } for _, plugin := range test.config.filters { tp, _ := plugin.(*TestPlugin) if tp.FilterCallCount != 1 { - t.Errorf("Plugin %s Filter() called %d times, expected 1", tp.NameRes, tp.FilterCallCount) + t.Errorf("Plugin %s Filter() called %d times, expected 1", plugin.Name(), tp.FilterCallCount) } } - for _, plugin := range test.config.scorers { + for plugin := range test.config.scorers { tp, _ := plugin.(*TestPlugin) - if tp.ScoreCallCount != test.numPodsToScore { - t.Errorf("Plugin %s Score() called %d times, expected 1", tp.NameRes, tp.ScoreCallCount) + if tp.ScoreCallCount != 1 { + t.Errorf("Plugin %s Score() called %d times, expected 1", plugin.Name(), tp.ScoreCallCount) } - } - - for _, plugin := range test.config.postSchedulePlugins { - tp, _ := plugin.(*TestPlugin) - if tp.PostScheduleCallCount != 1 { - t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", tp.NameRes, tp.PostScheduleCallCount) + if test.numPodsToScore != tp.NumOfScoredPods { + t.Errorf("Plugin %s Score() called with %d pods, expected %d", plugin.Name(), tp.NumOfScoredPods, test.numPodsToScore) } } tp, _ := test.config.picker.(*TestPlugin) + if tp.NumOfPickerCandidates != test.numPodsToScore { + t.Errorf("Picker plugin %s Pick() called with 
%d candidates, expected %d", tp.Name(), tp.NumOfPickerCandidates, tp.NumOfScoredPods)
+ }
 if tp.PickCallCount != 1 {
- t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.NameRes, tp.PickCallCount)
+ t.Errorf("Picker plugin %s Pick() called %d times, expected 1", tp.Name(), tp.PickCallCount)
+ }
+ if tp.WinnerPodScore != test.targetPodScore {
+ t.Errorf("winner pod score %v, expected %v", tp.WinnerPodScore, test.targetPodScore)
 }

+ for _, plugin := range test.config.postSchedulePlugins {
+ tp, _ := plugin.(*TestPlugin)
+ if tp.PostScheduleCallCount != 1 {
+ t.Errorf("Plugin %s PostSchedule() called %d times, expected 1", plugin.Name(), tp.PostScheduleCallCount)
+ }
+ }
 })
 }
}
@@ -409,13 +434,16 @@ func (fds *fakeDataStore) PodGetAll() []backendmetrics.PodMetrics {

 type TestPlugin struct {
 NameRes string
 ScoreCallCount int
+ NumOfScoredPods int
 ScoreRes float64
 FilterCallCount int
 FilterRes []k8stypes.NamespacedName
 PreScheduleCallCount int
 PostScheduleCallCount int
 PickCallCount int
+ NumOfPickerCandidates int
 PickRes k8stypes.NamespacedName
+ WinnerPodScore float64
}

func (tp *TestPlugin) Name() string { return tp.NameRes }
@@ -427,29 +455,39 @@ func (tp *TestPlugin) PreSchedule(ctx *types.SchedulingContext) {
 func (tp *TestPlugin) Filter(ctx *types.SchedulingContext, pods []types.Pod) []types.Pod {
 tp.FilterCallCount++
 return findPods(ctx, tp.FilterRes...)
-}
-func (tp *TestPlugin) Score(ctx *types.SchedulingContext, pod types.Pod) float64 {
- tp.ScoreCallCount++
- return tp.ScoreRes
}

-func (tp *TestPlugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) {
- tp.PostScheduleCallCount++
+func (tp *TestPlugin) Score(ctx *types.SchedulingContext, pods []types.Pod) map[types.Pod]float64 {
+ tp.ScoreCallCount++
+ scoredPods := make(map[types.Pod]float64, len(pods))
+ for _, pod := range pods {
+ scoredPods[pod] += tp.ScoreRes
+ }
+ tp.NumOfScoredPods = len(scoredPods)
+ return scoredPods
}

-func (tp *TestPlugin) Pick(ctx *types.SchedulingContext, pods []types.Pod) *types.Result {
+func (tp *TestPlugin) Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result {
 tp.PickCallCount++
+ tp.NumOfPickerCandidates = len(scoredPods)
 pod := findPods(ctx, tp.PickRes)[0]
+ tp.WinnerPodScore = getPodScore(scoredPods, pod)
 return &types.Result{TargetPod: pod}
}

+func (tp *TestPlugin) PostSchedule(ctx *types.SchedulingContext, res *types.Result) {
+ tp.PostScheduleCallCount++
+}
+
 func (tp *TestPlugin) reset() {
 tp.PreScheduleCallCount = 0
 tp.FilterCallCount = 0
 tp.ScoreCallCount = 0
+ tp.NumOfScoredPods = 0
 tp.PostScheduleCallCount = 0
 tp.PickCallCount = 0
+ tp.NumOfPickerCandidates = 0
}

func findPods(ctx *types.SchedulingContext, names ...k8stypes.NamespacedName) []types.Pod {
@@ -463,3 +501,14 @@ func findPods(ctx *types.SchedulingContext, names ...k8stypes.NamespacedName) []
 }
 return res
}
+
+func getPodScore(scoredPods []*types.ScoredPod, selectedPod types.Pod) float64 {
+ finalScore := 0.0
+ for _, scoredPod := range scoredPods {
+ if scoredPod.Pod.GetPod().NamespacedName.String() == selectedPod.GetPod().NamespacedName.String() {
+ finalScore = scoredPod.Score
+ break
+ }
+ }
+ return finalScore
+}
diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go
index e66b5fb5..5ccfbdce 100644
--- a/pkg/epp/scheduling/types/types.go
+++ b/pkg/epp/scheduling/types/types.go
@@ -43,11 +43,14 @@ func (r *LLMRequest) String() string {
 type Pod interface {
 GetPod() *backendmetrics.Pod
 GetMetrics() *backendmetrics.Metrics
-
SetScore(float64) - Score() float64 String() string } +type ScoredPod struct { + Pod Pod + Score float64 +} + // SchedulingContext holds contextual information during a scheduling operation. type SchedulingContext struct { context.Context @@ -71,16 +74,7 @@ func (pm *PodMetrics) GetMetrics() *backendmetrics.Metrics { return pm.Metrics } -func (pm *PodMetrics) SetScore(score float64) { - pm.score = score -} - -func (pm *PodMetrics) Score() float64 { - return pm.score -} - type PodMetrics struct { - score float64 *backendmetrics.Pod *backendmetrics.Metrics } From cea06e2a02f6500f23758c2359ff64f4eb53e887 Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 29 Apr 2025 00:07:54 +0300 Subject: [PATCH 66/74] add max score picker (#752) * embedded Pod interface into ScoredPod struct. updated tests and picker accordingly Signed-off-by: Nir Rozenbaum * implemented max-score picker Signed-off-by: Maroon Ayoub * minor changes in max score picker Signed-off-by: Nir Rozenbaum --------- Signed-off-by: Nir Rozenbaum Signed-off-by: Maroon Ayoub Co-authored-by: Maroon Ayoub --- .../plugins/picker/max_score_picker.go | 49 +++++++++++++++++++ .../plugins/picker/random_picker.go | 6 +-- pkg/epp/scheduling/scheduler_test.go | 46 +++++++++-------- pkg/epp/scheduling/types/types.go | 2 +- 4 files changed, 78 insertions(+), 25 deletions(-) create mode 100644 pkg/epp/scheduling/plugins/picker/max_score_picker.go diff --git a/pkg/epp/scheduling/plugins/picker/max_score_picker.go b/pkg/epp/scheduling/plugins/picker/max_score_picker.go new file mode 100644 index 00000000..1705b7dd --- /dev/null +++ b/pkg/epp/scheduling/plugins/picker/max_score_picker.go @@ -0,0 +1,49 @@ +package picker + +import ( + "fmt" + + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" + logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" +) + +var _ plugins.Picker = &MaxScorePicker{} + +func NewMaxScorePicker() plugins.Picker { + return &MaxScorePicker{ + random: &RandomPicker{}, + } +} + +// MaxScorePicker picks the pod with the maximum score from the list of candidates. +type MaxScorePicker struct { + random *RandomPicker +} + +// Name returns the name of the picker. +func (p *MaxScorePicker) Name() string { + return "max_score" +} + +// Pick selects the pod with the maximum score from the list of candidates. 
+func (p *MaxScorePicker) Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result { + ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a pod with the max score from %d candidates: %+v", len(scoredPods), scoredPods)) + + highestScorePods := []*types.ScoredPod{} + maxScore := -1.0 // pods min score is 0, putting value lower than 0 in order to find at least one pod as highest + for _, pod := range scoredPods { + if pod.Score > maxScore { + maxScore = pod.Score + highestScorePods = []*types.ScoredPod{pod} + } else if pod.Score == maxScore { + highestScorePods = append(highestScorePods, pod) + } + } + + if len(highestScorePods) > 1 { + return p.random.Pick(ctx, highestScorePods) // pick randomly from the highest score pods + } + + return &types.Result{TargetPod: highestScorePods[0]} +} diff --git a/pkg/epp/scheduling/plugins/picker/random_picker.go b/pkg/epp/scheduling/plugins/picker/random_picker.go index 6eecbb0d..fb9f9a29 100644 --- a/pkg/epp/scheduling/plugins/picker/random_picker.go +++ b/pkg/epp/scheduling/plugins/picker/random_picker.go @@ -30,12 +30,12 @@ var _ plugins.Picker = &RandomPicker{} // RandomPicker picks a random pod from the list of candidates. type RandomPicker struct{} -func (rp *RandomPicker) Name() string { +func (p *RandomPicker) Name() string { return "random" } -func (rp *RandomPicker) Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result { +func (p *RandomPicker) Pick(ctx *types.SchedulingContext, scoredPods []*types.ScoredPod) *types.Result { ctx.Logger.V(logutil.DEBUG).Info(fmt.Sprintf("Selecting a random pod from %d candidates: %+v", len(scoredPods), scoredPods)) i := rand.Intn(len(scoredPods)) - return &types.Result{TargetPod: scoredPods[i].Pod} + return &types.Result{TargetPod: scoredPods[i]} } diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index 559f53f8..311f44e9 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -93,17 +93,19 @@ func TestSchedule(t *testing.T) { }, }, wantRes: &types.Result{ - TargetPod: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 3, - KVCacheUsagePercent: 0.1, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "critical": 1, + TargetPod: &types.ScoredPod{ + Pod: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 3, + KVCacheUsagePercent: 0.1, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "critical": 1, + }, + WaitingModels: map[string]int{}, }, - WaitingModels: map[string]int{}, }, }, }, @@ -154,17 +156,19 @@ func TestSchedule(t *testing.T) { }, }, wantRes: &types.Result{ - TargetPod: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, - Metrics: &backendmetrics.Metrics{ - WaitingQueueSize: 0, - KVCacheUsagePercent: 0.2, - MaxActiveModels: 2, - ActiveModels: map[string]int{ - "foo": 1, - "bar": 1, + TargetPod: &types.ScoredPod{ + Pod: &types.PodMetrics{ + Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Metrics: &backendmetrics.Metrics{ + WaitingQueueSize: 0, + KVCacheUsagePercent: 0.2, + MaxActiveModels: 2, + ActiveModels: map[string]int{ + "foo": 1, + "bar": 1, + }, + WaitingModels: map[string]int{}, }, - WaitingModels: map[string]int{}, }, }, }, @@ -505,7 
+509,7 @@ func findPods(ctx *types.SchedulingContext, names ...k8stypes.NamespacedName) [] func getPodScore(scoredPods []*types.ScoredPod, selectedPod types.Pod) float64 { finalScore := 0.0 for _, scoredPod := range scoredPods { - if scoredPod.Pod.GetPod().NamespacedName.String() == selectedPod.GetPod().NamespacedName.String() { + if scoredPod.GetPod().NamespacedName.String() == selectedPod.GetPod().NamespacedName.String() { finalScore = scoredPod.Score break } diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index 5ccfbdce..5198515b 100644 --- a/pkg/epp/scheduling/types/types.go +++ b/pkg/epp/scheduling/types/types.go @@ -47,7 +47,7 @@ type Pod interface { } type ScoredPod struct { - Pod Pod + Pod Score float64 } From 06bd4223e28bb576b0b7ac51d3e0d9805d4cbd14 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Mon, 28 Apr 2025 16:53:53 -0700 Subject: [PATCH 67/74] Add GetEnvString helper function (#758) --- pkg/epp/util/env/env.go | 24 +++++++++----- pkg/epp/util/env/env_test.go | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 7 deletions(-) diff --git a/pkg/epp/util/env/env.go b/pkg/epp/util/env/env.go index 11e3bde1..0c6d1c6d 100644 --- a/pkg/epp/util/env/env.go +++ b/pkg/epp/util/env/env.go @@ -5,26 +5,25 @@ import ( "strconv" "github.com/go-logr/logr" - logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // getEnvFloat gets a float64 from an environment variable with a default value func GetEnvFloat(key string, defaultVal float64, logger logr.Logger) float64 { val, exists := os.LookupEnv(key) if !exists { - logger.V(logutil.VERBOSE).Info("Environment variable not set, using default value", + logger.Info("Environment variable not set, using default value", "key", key, "defaultValue", defaultVal) return defaultVal } floatVal, err := strconv.ParseFloat(val, 64) if err != nil { - logger.V(logutil.VERBOSE).Info("Failed to parse environment variable as float, using default value", + logger.Info("Failed to parse environment variable as float, using default value", "key", key, "value", val, "error", err, "defaultValue", defaultVal) return defaultVal } - logger.V(logutil.VERBOSE).Info("Successfully loaded environment variable", + logger.Info("Successfully loaded environment variable", "key", key, "value", floatVal) return floatVal } @@ -33,19 +32,30 @@ func GetEnvFloat(key string, defaultVal float64, logger logr.Logger) float64 { func GetEnvInt(key string, defaultVal int, logger logr.Logger) int { val, exists := os.LookupEnv(key) if !exists { - logger.V(logutil.VERBOSE).Info("Environment variable not set, using default value", + logger.Info("Environment variable not set, using default value", "key", key, "defaultValue", defaultVal) return defaultVal } intVal, err := strconv.Atoi(val) if err != nil { - logger.V(logutil.VERBOSE).Info("Failed to parse environment variable as int, using default value", + logger.Info("Failed to parse environment variable as int, using default value", "key", key, "value", val, "error", err, "defaultValue", defaultVal) return defaultVal } - logger.V(logutil.VERBOSE).Info("Successfully loaded environment variable", + logger.Info("Successfully loaded environment variable", "key", key, "value", intVal) return intVal } + +// GetEnvString gets a string from an environment variable with a default value +func GetEnvString(key string, defaultVal string, logger logr.Logger) string { + val, exists := os.LookupEnv(key) + if !exists { + logger.Info("Environment variable not set, using default value", 
+ "key", key, "defaultValue", defaultVal) + return defaultVal + } + return val +} diff --git a/pkg/epp/util/env/env_test.go b/pkg/epp/util/env/env_test.go index 02513e28..105beb28 100644 --- a/pkg/epp/util/env/env_test.go +++ b/pkg/epp/util/env/env_test.go @@ -142,3 +142,64 @@ func TestGetEnvInt(t *testing.T) { }) } } + +func TestGetEnvString(t *testing.T) { + logger := testr.New(t) + + tests := []struct { + name string + key string + value string + defaultVal string + expected string + setup func() + teardown func() + }{ + { + name: "env variable exists and is valid", + key: "TEST_STR", + value: "123", + defaultVal: "default", + expected: "123", + setup: func() { + os.Setenv("TEST_STR", "123") + }, + teardown: func() { + os.Unsetenv("TEST_STR") + }, + }, + { + name: "env variable does not exist", + key: "TEST_STR_MISSING", + defaultVal: "default", + expected: "default", + setup: func() {}, + teardown: func() {}, + }, + { + name: "env variable is empty string", + key: "TEST_STR_EMPTY", + value: "", + defaultVal: "default", + expected: "", + setup: func() { + os.Setenv("TEST_STR_EMPTY", "") + }, + teardown: func() { + os.Unsetenv("TEST_STR_EMPTY") + }, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + tc.setup() + defer tc.teardown() + + result := GetEnvString(tc.key, tc.defaultVal, logger.V(logutil.VERBOSE)) + if result != tc.expected { + t.Errorf("GetEnvString(%s, %s) = %s, expected %s", tc.key, tc.defaultVal, result, tc.expected) + } + }) + } +} From e12c61718367e058788fbe851104cd7de64754e1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Apr 2025 20:29:53 -0700 Subject: [PATCH 68/74] Bump the kubernetes group with 6 updates (#754) Bumps the kubernetes group with 6 updates: | Package | From | To | | --- | --- | --- | | [k8s.io/api](https://github.com/kubernetes/api) | `0.32.3` | `0.32.4` | | [k8s.io/apiextensions-apiserver](https://github.com/kubernetes/apiextensions-apiserver) | `0.32.3` | `0.32.4` | | [k8s.io/apimachinery](https://github.com/kubernetes/apimachinery) | `0.32.3` | `0.32.4` | | [k8s.io/client-go](https://github.com/kubernetes/client-go) | `0.32.3` | `0.32.4` | | [k8s.io/code-generator](https://github.com/kubernetes/code-generator) | `0.32.3` | `0.32.4` | | [k8s.io/component-base](https://github.com/kubernetes/component-base) | `0.32.3` | `0.32.4` | Updates `k8s.io/api` from 0.32.3 to 0.32.4 - [Commits](https://github.com/kubernetes/api/compare/v0.32.3...v0.32.4) Updates `k8s.io/apiextensions-apiserver` from 0.32.3 to 0.32.4 - [Release notes](https://github.com/kubernetes/apiextensions-apiserver/releases) - [Commits](https://github.com/kubernetes/apiextensions-apiserver/compare/v0.32.3...v0.32.4) Updates `k8s.io/apimachinery` from 0.32.3 to 0.32.4 - [Commits](https://github.com/kubernetes/apimachinery/compare/v0.32.3...v0.32.4) Updates `k8s.io/client-go` from 0.32.3 to 0.32.4 - [Changelog](https://github.com/kubernetes/client-go/blob/master/CHANGELOG.md) - [Commits](https://github.com/kubernetes/client-go/compare/v0.32.3...v0.32.4) Updates `k8s.io/code-generator` from 0.32.3 to 0.32.4 - [Commits](https://github.com/kubernetes/code-generator/compare/v0.32.3...v0.32.4) Updates `k8s.io/component-base` from 0.32.3 to 0.32.4 - [Commits](https://github.com/kubernetes/component-base/compare/v0.32.3...v0.32.4) --- updated-dependencies: - dependency-name: k8s.io/api dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: 
kubernetes - dependency-name: k8s.io/apiextensions-apiserver dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/apimachinery dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/client-go dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/code-generator dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes - dependency-name: k8s.io/component-base dependency-version: 0.32.4 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: kubernetes ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 14 +++++++------- go.sum | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index fcfb60af..076bdf4b 100644 --- a/go.mod +++ b/go.mod @@ -17,12 +17,12 @@ require ( go.uber.org/zap v1.27.0 google.golang.org/grpc v1.71.1 google.golang.org/protobuf v1.36.6 - k8s.io/api v0.32.3 - k8s.io/apiextensions-apiserver v0.32.3 - k8s.io/apimachinery v0.32.3 - k8s.io/client-go v0.32.3 - k8s.io/code-generator v0.32.3 - k8s.io/component-base v0.32.3 + k8s.io/api v0.32.4 + k8s.io/apiextensions-apiserver v0.32.4 + k8s.io/apimachinery v0.32.4 + k8s.io/client-go v0.32.4 + k8s.io/code-generator v0.32.4 + k8s.io/component-base v0.32.4 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/structured-merge-diff/v4 v4.7.0 @@ -123,7 +123,7 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiserver v0.32.3 // indirect + k8s.io/apiserver v0.32.4 // indirect k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect diff --git a/go.sum b/go.sum index b2c05a61..0258fc7a 100644 --- a/go.sum +++ b/go.sum @@ -300,20 +300,20 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -k8s.io/api v0.32.3 h1:Hw7KqxRusq+6QSplE3NYG4MBxZw1BZnq4aP4cJVINls= -k8s.io/api v0.32.3/go.mod h1:2wEDTXADtm/HA7CCMD8D8bK4yuBUptzaRhYcYEEYA3k= -k8s.io/apiextensions-apiserver v0.32.3 h1:4D8vy+9GWerlErCwVIbcQjsWunF9SUGNu7O7hiQTyPY= -k8s.io/apiextensions-apiserver v0.32.3/go.mod h1:8YwcvVRMVzw0r1Stc7XfGAzB/SIVLunqApySV5V7Dss= -k8s.io/apimachinery v0.32.3 h1:JmDuDarhDmA/Li7j3aPrwhpNBA94Nvk5zLeOge9HH1U= -k8s.io/apimachinery v0.32.3/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= -k8s.io/apiserver v0.32.3 h1:kOw2KBuHOA+wetX1MkmrxgBr648ksz653j26ESuWNY8= -k8s.io/apiserver v0.32.3/go.mod h1:q1x9B8E/WzShF49wh3ADOh6muSfpmFL0I2t+TG0Zdgc= -k8s.io/client-go v0.32.3 h1:RKPVltzopkSgHS7aS98QdscAgtgah/+zmpAogooIqVU= -k8s.io/client-go v0.32.3/go.mod h1:3v0+3k4IcT9bXTc4V2rt+d2ZPPG700Xy6Oi0Gdl2PaY= -k8s.io/code-generator v0.32.3 h1:31p2TVzC9+hVdSkAFruAk3JY+iSfzrJ83Qij1yZutyw= -k8s.io/code-generator 
v0.32.3/go.mod h1:+mbiYID5NLsBuqxjQTygKM/DAdKpAjvBzrJd64NU1G8= -k8s.io/component-base v0.32.3 h1:98WJvvMs3QZ2LYHBzvltFSeJjEx7t5+8s71P7M74u8k= -k8s.io/component-base v0.32.3/go.mod h1:LWi9cR+yPAv7cu2X9rZanTiFKB2kHA+JjmhkKjCZRpI= +k8s.io/api v0.32.4 h1:kw8Y/G8E7EpNy7gjB8gJZl3KJkNz8HM2YHrZPtAZsF4= +k8s.io/api v0.32.4/go.mod h1:5MYFvLvweRhyKylM3Es/6uh/5hGp0dg82vP34KifX4g= +k8s.io/apiextensions-apiserver v0.32.4 h1:IA+CoR63UDOijR/vEpow6wQnX4V6iVpzazJBskHrpHE= +k8s.io/apiextensions-apiserver v0.32.4/go.mod h1:Y06XO/b92H8ymOdG1HlA1submf7gIhbEDc3RjriqZOs= +k8s.io/apimachinery v0.32.4 h1:8EEksaxA7nd7xWJkkwLDN4SvWS5ot9g6Z/VZb3ju25I= +k8s.io/apimachinery v0.32.4/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= +k8s.io/apiserver v0.32.4 h1:Yf7sd/y+GOQKH1Qf6wUeayZrYXe2SKZ17Bcq7VQM5HQ= +k8s.io/apiserver v0.32.4/go.mod h1:JFUMNtE2M5yqLZpIsgCb06SkVSW1YcxW1oyLSTfjXR8= +k8s.io/client-go v0.32.4 h1:zaGJS7xoYOYumoWIFXlcVrsiYioRPrXGO7dBfVC5R6M= +k8s.io/client-go v0.32.4/go.mod h1:k0jftcyYnEtwlFW92xC7MTtFv5BNcZBr+zn9jPlT9Ic= +k8s.io/code-generator v0.32.4 h1:d4dm/43RD6xhPBX22JgJw9JUpwTKzVR6tAxJD7pz83o= +k8s.io/code-generator v0.32.4/go.mod h1:R0bKdIg1smtvsKvj9q7SxTeKq5X9ko6PuICCGt4yqxg= +k8s.io/component-base v0.32.4 h1:HuF+2JVLbFS5GODLIfPCb1Td6b+G2HszJoArcWOSr5I= +k8s.io/component-base v0.32.4/go.mod h1:10KloJEYw1keU/Xmjfy9TKJqUq7J2mYdiD1VDXoco4o= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9 h1:si3PfKm8dDYxgfbeA6orqrtLkvvIeH8UqffFJDl0bz4= k8s.io/gengo/v2 v2.0.0-20240911193312-2b36238f13e9/go.mod h1:EJykeLsmFC60UQbYJezXkEsG2FLrt0GPNkU5iK5GWxU= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= From 28c7484cc93eb5b6110ce50c4467b390a564b05c Mon Sep 17 00:00:00 2001 From: Nir Rozenbaum Date: Tue, 29 Apr 2025 18:36:00 +0300 Subject: [PATCH 69/74] extract pod representation from backend/metrics to backend (#751) Signed-off-by: Nir Rozenbaum --- pkg/epp/backend/metrics/fake.go | 7 +-- pkg/epp/backend/metrics/metrics.go | 12 ++--- pkg/epp/backend/metrics/metrics_test.go | 3 +- pkg/epp/backend/metrics/pod_metrics.go | 11 ++--- pkg/epp/backend/metrics/types.go | 29 +----------- pkg/epp/backend/pod.go | 45 +++++++++++++++++++ pkg/epp/handlers/server.go | 4 +- .../scheduling/plugins/filter/filter_test.go | 5 ++- pkg/epp/scheduling/scheduler_test.go | 43 +++++++++--------- pkg/epp/scheduling/types/types.go | 7 +-- test/integration/epp/hermetic_test.go | 29 ++++++------ 11 files changed, 108 insertions(+), 87 deletions(-) create mode 100644 pkg/epp/backend/pod.go diff --git a/pkg/epp/backend/metrics/fake.go b/pkg/epp/backend/metrics/fake.go index ec97c6de..58d05026 100644 --- a/pkg/epp/backend/metrics/fake.go +++ b/pkg/epp/backend/metrics/fake.go @@ -24,12 +24,13 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) // FakePodMetrics is an implementation of PodMetrics that doesn't run the async refresh loop. 
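 // Tests construct it with fixed Pod and Metrics values, so no scraping or
 // background refresh goroutine is involved.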
type FakePodMetrics struct { - Pod *Pod + Pod *backend.Pod Metrics *Metrics } @@ -37,7 +38,7 @@ func (fpm *FakePodMetrics) String() string { return fmt.Sprintf("Pod: %v; Metrics: %v", fpm.GetPod(), fpm.GetMetrics()) } -func (fpm *FakePodMetrics) GetPod() *Pod { +func (fpm *FakePodMetrics) GetPod() *backend.Pod { return fpm.Pod } func (fpm *FakePodMetrics) GetMetrics() *Metrics { @@ -55,7 +56,7 @@ type FakePodMetricsClient struct { Res map[types.NamespacedName]*Metrics } -func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) { +func (f *FakePodMetricsClient) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *Metrics, port int32) (*Metrics, error) { f.errMu.RLock() err, ok := f.Err[pod.NamespacedName] f.errMu.RUnlock() diff --git a/pkg/epp/backend/metrics/metrics.go b/pkg/epp/backend/metrics/metrics.go index 96814b4b..4cf56179 100644 --- a/pkg/epp/backend/metrics/metrics.go +++ b/pkg/epp/backend/metrics/metrics.go @@ -26,6 +26,7 @@ import ( dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" "go.uber.org/multierr" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" ) const ( @@ -39,15 +40,8 @@ type PodMetricsClientImpl struct { MetricMapping *MetricMapping } -// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an -// updated one. -func (p *PodMetricsClientImpl) FetchMetrics( - ctx context.Context, - pod *Pod, - existing *Metrics, - port int32, -) (*Metrics, error) { - +// FetchMetrics fetches metrics from a given pod, clones the existing metrics object and returns an updated one. +func (p *PodMetricsClientImpl) FetchMetrics(ctx context.Context, pod *backend.Pod, existing *Metrics, port int32) (*Metrics, error) { // Currently the metrics endpoint is hard-coded, which works with vLLM. // TODO(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/16): Consume this from InferencePool config. url := "http://" + pod.Address + ":" + strconv.Itoa(int(port)) + "/metrics" diff --git a/pkg/epp/backend/metrics/metrics_test.go b/pkg/epp/backend/metrics/metrics_test.go index e3b45b94..53127010 100644 --- a/pkg/epp/backend/metrics/metrics_test.go +++ b/pkg/epp/backend/metrics/metrics_test.go @@ -30,6 +30,7 @@ import ( "google.golang.org/protobuf/proto" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -486,7 +487,7 @@ func TestPromToPodMetrics(t *testing.T) { // there's no server running on the specified port. 
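 // With no listener on that address, FetchMetrics is expected to return an error.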
func TestFetchMetrics(t *testing.T) { ctx := logutil.NewTestLoggerIntoContext(context.Background()) - pod := &Pod{ + pod := &backend.Pod{ Address: "127.0.0.1", NamespacedName: types.NamespacedName{ Namespace: "test", diff --git a/pkg/epp/backend/metrics/pod_metrics.go b/pkg/epp/backend/metrics/pod_metrics.go index 7339389a..bdeb28ba 100644 --- a/pkg/epp/backend/metrics/pod_metrics.go +++ b/pkg/epp/backend/metrics/pod_metrics.go @@ -27,6 +27,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -35,7 +36,7 @@ const ( ) type podMetrics struct { - pod atomic.Pointer[Pod] + pod atomic.Pointer[backend.Pod] metrics atomic.Pointer[Metrics] pmc PodMetricsClient ds Datastore @@ -48,14 +49,14 @@ type podMetrics struct { } type PodMetricsClient interface { - FetchMetrics(ctx context.Context, pod *Pod, existing *Metrics, port int32) (*Metrics, error) + FetchMetrics(ctx context.Context, pod *backend.Pod, existing *Metrics, port int32) (*Metrics, error) } func (pm *podMetrics) String() string { return fmt.Sprintf("Pod: %v; Metrics: %v", pm.GetPod(), pm.GetMetrics()) } -func (pm *podMetrics) GetPod() *Pod { +func (pm *podMetrics) GetPod() *backend.Pod { return pm.pod.Load() } @@ -67,8 +68,8 @@ func (pm *podMetrics) UpdatePod(in *corev1.Pod) { pm.pod.Store(toInternalPod(in)) } -func toInternalPod(in *corev1.Pod) *Pod { - return &Pod{ +func toInternalPod(in *corev1.Pod) *backend.Pod { + return &backend.Pod{ NamespacedName: types.NamespacedName{ Name: in.Name, Namespace: in.Namespace, diff --git a/pkg/epp/backend/metrics/types.go b/pkg/epp/backend/metrics/types.go index 156ac3ed..4932e3ac 100644 --- a/pkg/epp/backend/metrics/types.go +++ b/pkg/epp/backend/metrics/types.go @@ -24,8 +24,8 @@ import ( "time" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" ) func NewPodMetricsFactory(pmc PodMetricsClient, refreshMetricsInterval time.Duration) *PodMetricsFactory { @@ -58,38 +58,13 @@ func (f *PodMetricsFactory) NewPodMetrics(parentCtx context.Context, in *corev1. } type PodMetrics interface { - GetPod() *Pod + GetPod() *backend.Pod GetMetrics() *Metrics UpdatePod(*corev1.Pod) StopRefreshLoop() String() string } -type Pod struct { - NamespacedName types.NamespacedName - Address string -} - -func (p *Pod) String() string { - if p == nil { - return "" - } - return fmt.Sprintf("%+v", *p) -} - -func (p *Pod) Clone() *Pod { - if p == nil { - return nil - } - return &Pod{ - NamespacedName: types.NamespacedName{ - Name: p.NamespacedName.Name, - Namespace: p.NamespacedName.Namespace, - }, - Address: p.Address, - } -} - type Metrics struct { // ActiveModels is a set of models(including LoRA adapters) that are currently cached to GPU. ActiveModels map[string]int diff --git a/pkg/epp/backend/pod.go b/pkg/epp/backend/pod.go new file mode 100644 index 00000000..a63a0a83 --- /dev/null +++ b/pkg/epp/backend/pod.go @@ -0,0 +1,45 @@ +/* +Copyright 2025 The Kubernetes Authors. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package backend + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/types" +) + +type Pod struct { + NamespacedName types.NamespacedName + Address string +} + +func (p *Pod) String() string { + if p == nil { + return "" + } + return fmt.Sprintf("%+v", *p) +} + +func (p *Pod) Clone() *Pod { + if p == nil { + return nil + } + return &Pod{ + NamespacedName: types.NamespacedName{ + Name: p.NamespacedName.Name, + Namespace: p.NamespacedName.Namespace, + }, + Address: p.Address, + } +} diff --git a/pkg/epp/handlers/server.go b/pkg/epp/handlers/server.go index 5e23c7a0..630baef3 100644 --- a/pkg/epp/handlers/server.go +++ b/pkg/epp/handlers/server.go @@ -34,7 +34,7 @@ import ( "google.golang.org/protobuf/types/known/structpb" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" - backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" schedulingtypes "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" @@ -447,7 +447,7 @@ func RandomWeightedDraw(logger logr.Logger, model *v1alpha2.InferenceModel, seed return "" } -func GetRandomPod(ds datastore.Datastore) *backendmetrics.Pod { +func GetRandomPod(ds datastore.Datastore) *backend.Pod { pods := ds.PodGetAll() if len(pods) == 0 { return nil diff --git a/pkg/epp/scheduling/plugins/filter/filter_test.go b/pkg/epp/scheduling/plugins/filter/filter_test.go index a06ec3ca..2354c3ef 100644 --- a/pkg/epp/scheduling/plugins/filter/filter_test.go +++ b/pkg/epp/scheduling/plugins/filter/filter_test.go @@ -22,6 +22,7 @@ import ( "github.com/google/go-cmp/cmp" k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/config" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" @@ -227,7 +228,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { // Test setup: One affinity pod and one available pod pods := []types.Pod{ &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "affinity-pod"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "affinity-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{ @@ -236,7 +237,7 @@ func TestLoRASoftAffinityDistribution(t *testing.T) { }, }, &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "available-pod"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "available-pod"}}, Metrics: &backendmetrics.Metrics{ MaxActiveModels: 2, ActiveModels: map[string]int{}, diff --git a/pkg/epp/scheduling/scheduler_test.go b/pkg/epp/scheduling/scheduler_test.go index 311f44e9..b44c7ac2 100644 --- a/pkg/epp/scheduling/scheduler_test.go +++ b/pkg/epp/scheduling/scheduler_test.go @@ -22,6 +22,7 @@ import ( 
"github.com/google/go-cmp/cmp" k8stypes "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" // Import config for thresholds "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/plugins" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling/types" @@ -57,7 +58,7 @@ func TestSchedule(t *testing.T) { // model being active, and has low KV cache. input: []*backendmetrics.FakePodMetrics{ { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -69,7 +70,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -81,7 +82,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, @@ -95,7 +96,7 @@ func TestSchedule(t *testing.T) { wantRes: &types.Result{ TargetPod: &types.ScoredPod{ Pod: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -120,7 +121,7 @@ func TestSchedule(t *testing.T) { // pod1 will be picked because it has capacity for the sheddable request. input: []*backendmetrics.FakePodMetrics{ { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -132,7 +133,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.1, @@ -144,7 +145,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, @@ -158,7 +159,7 @@ func TestSchedule(t *testing.T) { wantRes: &types.Result{ TargetPod: &types.ScoredPod{ Pod: &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -184,7 +185,7 @@ func TestSchedule(t *testing.T) { // dropped. 
input: []*backendmetrics.FakePodMetrics{ { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.9, @@ -196,7 +197,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 3, KVCacheUsagePercent: 0.85, @@ -208,7 +209,7 @@ func TestSchedule(t *testing.T) { }, }, { - Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, + Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}, Metrics: &backendmetrics.Metrics{ WaitingQueueSize: 10, KVCacheUsagePercent: 0.85, @@ -282,9 +283,9 @@ func TestSchedulePlugins(t *testing.T) { postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, }, input: []*backendmetrics.FakePodMetrics{ - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, }, wantTargetPod: k8stypes.NamespacedName{Name: "pod1"}, targetPodScore: 1.1, @@ -304,9 +305,9 @@ func TestSchedulePlugins(t *testing.T) { postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, }, input: []*backendmetrics.FakePodMetrics{ - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, }, wantTargetPod: k8stypes.NamespacedName{Name: "pod1"}, targetPodScore: 50, @@ -326,9 +327,9 @@ func TestSchedulePlugins(t *testing.T) { postSchedulePlugins: []plugins.PostSchedule{tp1, tp2}, }, input: []*backendmetrics.FakePodMetrics{ - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, - {Pod: &backendmetrics.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod1"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod2"}}}, + {Pod: &backend.Pod{NamespacedName: k8stypes.NamespacedName{Name: "pod3"}}}, }, numPodsToScore: 0, err: true, // no available pods to server after filter all @@ -369,7 +370,7 @@ func TestSchedulePlugins(t *testing.T) { // Validate output wantPod := &types.PodMetrics{ - Pod: &backendmetrics.Pod{NamespacedName: test.wantTargetPod}, + Pod: &backend.Pod{NamespacedName: test.wantTargetPod}, } wantRes := &types.Result{TargetPod: wantPod} if diff := cmp.Diff(wantRes, got); diff != "" { diff --git a/pkg/epp/scheduling/types/types.go b/pkg/epp/scheduling/types/types.go index 5198515b..4f69fae0 100644 --- a/pkg/epp/scheduling/types/types.go 
+++ b/pkg/epp/scheduling/types/types.go @@ -22,6 +22,7 @@ import ( "github.com/go-logr/logr" "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" ) @@ -41,7 +42,7 @@ func (r *LLMRequest) String() string { } type Pod interface { - GetPod() *backendmetrics.Pod + GetPod() *backend.Pod GetMetrics() *backendmetrics.Metrics String() string } @@ -66,7 +67,7 @@ func (pm *PodMetrics) String() string { return fmt.Sprintf("%+v", *pm) } -func (pm *PodMetrics) GetPod() *backendmetrics.Pod { +func (pm *PodMetrics) GetPod() *backend.Pod { return pm.Pod } @@ -75,7 +76,7 @@ func (pm *PodMetrics) GetMetrics() *backendmetrics.Metrics { } type PodMetrics struct { - *backendmetrics.Pod + *backend.Pod *backendmetrics.Metrics } diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 79b619fd..35361329 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -61,6 +61,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend" backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" @@ -96,7 +97,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { tests := []struct { name string requests []*extProcPb.ProcessingRequest - pods map[backendmetrics.Pod]*backendmetrics.Metrics + pods map[backend.Pod]*backendmetrics.Metrics wantResponses []*extProcPb.ProcessingResponse wantMetrics map[string]string wantErr bool @@ -107,7 +108,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { name: "select lower queue and kv cache, no active lora", requests: integrationutils.GenerateStreamedRequestSet(logger, "test1", "my-model"), // pod-1 will be picked because it has relatively low queue size and low KV cache. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 3, KVCacheUsagePercent: 0.2, @@ -182,7 +183,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { requests: integrationutils.GenerateStreamedRequestSet(logger, "test2", "sql-lora"), // pod-1 will be picked because it has relatively low queue size, with the requested // model being active, and has low KV cache. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 0, KVCacheUsagePercent: 0.2, @@ -267,7 +268,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // pod-2 will be picked despite it NOT having the requested model being active // as it's above the affinity for queue size. 
Also is critical, so we should // still honor request despite all queues > 5 - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 10, KVCacheUsagePercent: 0.2, @@ -350,7 +351,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { requests: integrationutils.GenerateStreamedRequestSet(logger, "test4", "sql-lora-sheddable"), // no pods will be picked as all models are either above kv threshold, // queue threshold, or both. - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 6, KVCacheUsagePercent: 0.2, @@ -398,7 +399,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { name: "noncritical, but one server has capacity, do not shed", requests: integrationutils.GenerateStreamedRequestSet(logger, "test5", "sql-lora-sheddable"), // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -509,7 +510,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -620,7 +621,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -732,7 +733,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -831,7 +832,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { // // pod 0 will be picked as all other models are above threshold - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -1179,7 +1180,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { DynamicMetadata: makeMetadata("192.168.1.1:8000"), }, }, - pods: map[backendmetrics.Pod]*backendmetrics.Metrics{ + pods: map[backend.Pod]*backendmetrics.Metrics{ fakePod(0): { WaitingQueueSize: 4, KVCacheUsagePercent: 0.2, @@ -1225,7 +1226,7 @@ func TestFullDuplexStreamed_KubeInferenceModelRequest(t *testing.T) { } } -func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { +func setUpHermeticServer(t *testing.T, podAndMetrics map[backend.Pod]*backendmetrics.Metrics, streamed bool) (client extProcPb.ExternalProcessor_ProcessClient, cleanup func()) { // Reconfigure the TestPodMetricsClient. 
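 	// Metrics are keyed by pod NamespacedName, which is how FakePodMetricsClient looks them up.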
res := map[types.NamespacedName]*backendmetrics.Metrics{} for pod, metrics := range podAndMetrics { @@ -1303,8 +1304,8 @@ func setUpHermeticServer(t *testing.T, podAndMetrics map[backendmetrics.Pod]*bac } } -func fakePod(index int) backendmetrics.Pod { - return backendmetrics.Pod{ +func fakePod(index int) backend.Pod { + return backend.Pod{ NamespacedName: types.NamespacedName{Name: fmt.Sprintf("pod-%v", index), Namespace: "default"}, Address: fmt.Sprintf("192.168.1.%d", index+1), } From cb0524ba0a7f0c6cdad2afacdcc2fd63f9ca1cb4 Mon Sep 17 00:00:00 2001 From: Hang Yin Date: Tue, 29 Apr 2025 23:51:55 +0800 Subject: [PATCH 70/74] Request for adding Alibaba Cloud Container Service for Kubernetes (ACK) into implementations (#748) * add ack gie to implementations Signed-off-by: Hang Yin * fix documentation links * supply a github issue to track GIE support of ACK --------- Signed-off-by: Hang Yin --- site-src/implementations/gateways.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/site-src/implementations/gateways.md b/site-src/implementations/gateways.md index b44dca6f..950c0833 100644 --- a/site-src/implementations/gateways.md +++ b/site-src/implementations/gateways.md @@ -6,11 +6,13 @@ This project has several implementations that are planned or in progress: * [Kgateway][2] * [Google Kubernetes Engine][3] * [Istio][4] +* [Alibaba Cloud Container Service for Kubernetes][5] [1]:#envoy-gateway [2]:#kgateway [3]:#google-kubernetes-engine [4]:#istio +[5]:#alibaba-cloud-container-service-for-kubernetes ## Envoy AI Gateway @@ -65,3 +67,22 @@ For service mesh users, Istio also fully supports east-west (including [GAMMA](h Gateway API Inference Extension support is being tracked by this [GitHub Issue](https://github.com/istio/istio/issues/55768). + +## Alibaba Cloud Container Service for Kubernetes + +[Alibaba Cloud Container Service for Kubernetes (ACK)][ack] is a managed Kubernetes platform +offered by Alibaba Cloud. The implementation of the Gateway API in ACK is through the +[ACK Gateway with Inference Extension][ack-gie] component, which introduces model-aware, +GPU-efficient load balancing for AI workloads beyond basic HTTP routing. + +The ACK Gateway with Inference Extension implements the Gateway API Inference Extension +and provides optimized routing for serving generative AI workloads, +including weighted traffic splitting, mirroring, advanced routing, etc. +See the docs for the [usage][ack-gie-usage]. + +Progress towards supporting Gateway API Inference Extension is being tracked +by [this Issue](https://github.com/AliyunContainerService/ack-gateway-api/issues/1). 
+
+[ack]:https://www.alibabacloud.com/help/en/ack
+[ack-gie]:https://www.alibabacloud.com/help/en/ack/product-overview/ack-gateway-with-inference-extension
+[ack-gie-usage]:https://www.alibabacloud.com/help/en/ack/ack-managed-and-ack-dedicated/user-guide/intelligent-routing-and-traffic-management-with-ack-gateway-inference-extension
\ No newline at end of file

From ea75ca135364e136cee8ab7f310930270a759e9c Mon Sep 17 00:00:00 2001
From: Nir Rozenbaum
Date: Tue, 29 Apr 2025 18:52:02 +0300
Subject: [PATCH 71/74] fixed error message in scheduler when no pods are
 available (#759)

Signed-off-by: Nir Rozenbaum
---
 pkg/epp/scheduling/scheduler.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/epp/scheduling/scheduler.go b/pkg/epp/scheduling/scheduler.go
index 04d24ea2..1a1d67b5 100644
--- a/pkg/epp/scheduling/scheduler.go
+++ b/pkg/epp/scheduling/scheduler.go
@@ -110,7 +110,7 @@ func (s *Scheduler) Schedule(ctx context.Context, req *types.LLMRequest) (*types
 	pods := s.runFilterPlugins(sCtx)
 	if len(pods) == 0 {
-		return nil, errutil.Error{Code: errutil.InferencePoolResourceExhausted, Msg: "failed to find a target pod"}
+		return nil, errutil.Error{Code: errutil.Internal, Msg: "no pods available for the given request"}
 	}
 	// if we got here, there is at least one pod to score
 	weightedScorePerPod := s.runScorerPlugins(sCtx, pods)

From ef3d01a07e3b1598e8929ffef46144d91b49eb77 Mon Sep 17 00:00:00 2001
From: sina chavoshi
Date: Tue, 29 Apr 2025 09:21:55 -0700
Subject: [PATCH 72/74] feat: Initial setup for conformance test suite (#720)

* feat: Initial setup for conformance test suite
* fix missing go.sum entry
* Fix the API version and basic Inferencepool-basic-accepted Yaml definition.
* exclude conformance tests from github acceptance test run.
* Add support for multiple profiles, remove release channel and update version to use semver.
* Adding another layer to the report hierarchy for category of conformance (gateway, epp, model server).
* Add trailing new line to yaml files.
* switch to use InferencePoolMustHaveCondition from /conformance/utils/kubernetes
* remove extra godoc comments
* Remove references to ExtensionChannel from reports readme
* format readme
* remove the service for the conformance backend.
* update the namespace and EPP names to match the manifest.
* Update PR based on review feedback including, change dir name to lower case, remove unused manifest, remove NamespaceLabels and NamespaceAnnotations * add a comment to clarify use of echo server --- Makefile | 2 +- conformance/conformance.go | 230 ++++++++++++++++++ conformance/conformance_test.go | 29 +++ conformance/embed.go | 25 ++ conformance/reports/README.md | 93 +++++++ .../resources/manifests/manifests.yaml | 49 ++++ .../tests/basic/inferencepool_accepted.go | 60 +++++ .../tests/basic/inferencepool_accepted.yaml | 27 ++ conformance/tests/main.go | 35 +++ conformance/utils/assertions.go | 25 ++ conformance/utils/kubernetes/helpers.go | 49 ++++ conformance/utils/traffic/traffic.go | 22 ++ go.mod | 15 +- go.sum | 41 ++-- 14 files changed, 671 insertions(+), 31 deletions(-) create mode 100644 conformance/conformance.go create mode 100644 conformance/conformance_test.go create mode 100644 conformance/embed.go create mode 100644 conformance/reports/README.md create mode 100644 conformance/resources/manifests/manifests.yaml create mode 100644 conformance/tests/basic/inferencepool_accepted.go create mode 100644 conformance/tests/basic/inferencepool_accepted.yaml create mode 100644 conformance/tests/main.go create mode 100644 conformance/utils/assertions.go create mode 100644 conformance/utils/kubernetes/helpers.go create mode 100644 conformance/utils/traffic/traffic.go diff --git a/Makefile b/Makefile index 563e0ce9..4826a029 100644 --- a/Makefile +++ b/Makefile @@ -121,7 +121,7 @@ vet: ## Run go vet against code. .PHONY: test test: manifests generate fmt vet envtest image-build ## Run tests. - KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e) -race -coverprofile cover.out + KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./... | grep -v /e2e | grep -v /conformance) -race -coverprofile cover.out .PHONY: test-unit test-unit: ## Run unit tests. diff --git a/conformance/conformance.go b/conformance/conformance.go new file mode 100644 index 00000000..20d80fde --- /dev/null +++ b/conformance/conformance.go @@ -0,0 +1,230 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package conformance contains the core setup and execution logic +// for the Gateway API Inference Extension conformance test suite. +package conformance + +import ( + "fmt" + "io/fs" + "os" + "testing" + + "github.com/stretchr/testify/require" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + clientset "k8s.io/client-go/kubernetes" + + // Import runtime package for scheme creation + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/yaml" + + // Import necessary types and utilities from the core Gateway API conformance suite. 
+ // Assumes sigs.k8s.io/gateway-api is a dependency in the go.mod. + gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" // Import core Gateway API types + confapis "sigs.k8s.io/gateway-api/conformance/apis/v1" // Report struct definition + confconfig "sigs.k8s.io/gateway-api/conformance/utils/config" + confflags "sigs.k8s.io/gateway-api/conformance/utils/flags" + confsuite "sigs.k8s.io/gateway-api/conformance/utils/suite" + "sigs.k8s.io/gateway-api/pkg/features" // Using core features definitions if applicable + + // Import the test definitions package to access the ConformanceTests slice + "sigs.k8s.io/gateway-api-inference-extension/conformance/tests" + + // Import test packages using blank identifier + // This triggers the init() functions in these packages, which register the tests + // by appending them to the tests.ConformanceTests slice. + _ "sigs.k8s.io/gateway-api-inference-extension/conformance/tests/basic" + // TODO: Add blank imports for other test categories as they are created. + // _ "sigs.k8s.io/gateway-api-inference-extension/conformance/tests/model_routing" + + // Import the Inference Extension API types + inferencev1alpha2 "sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2" +) + +// GatewayLayerProfileName defines the name for the conformance profile that tests +// the Gateway API layer aspects of the Inference Extension (e.g., InferencePool, InferenceModel CRDs). +// Future profiles will cover EPP and ModelServer layers. +const GatewayLayerProfileName confsuite.ConformanceProfileName = "Gateway" + +var InferenceCoreFeatures = sets.New[features.FeatureName]() // Placeholder - Populate with actual features specific to this profile or manage features per profile + +// GatewayLayerProfile defines the conformance profile for the Gateway API layer +// of the Inference Extension. +// In future iterations, we will add constants and ConformanceProfile structs for +// EPPProfileName ("EPP") and ModelServerProfileName ("ModelServer") +// to cover their respective conformance layers. +var GatewayLayerProfile = confsuite.ConformanceProfile{ + Name: GatewayLayerProfileName, + CoreFeatures: InferenceCoreFeatures, +} + +// DefaultOptions parses command line flags and sets up the suite options. +// Adapted from the core Gateway API conformance suite. +func DefaultOptions(t *testing.T) confsuite.ConformanceOptions { + t.Helper() + + cfg, err := config.GetConfig() + require.NoError(t, err, "error loading Kubernetes config") + + // Initialize client options. The scheme must include Gateway API types + // and the Inference Extension types. + clientOptions := client.Options{} + scheme := clientOptions.Scheme + if scheme == nil { + // If default options don't provide a scheme, create one using runtime.NewScheme(). 
+ scheme = runtime.NewScheme() + clientOptions.Scheme = scheme + } + + // Register necessary API Types + require.NoError(t, gatewayv1.Install(scheme)) // Add core Gateway API types + // Add the Inference Extension API types to the scheme using the correct import alias + require.NoError(t, inferencev1alpha2.Install(scheme)) + require.NoError(t, apiextensionsv1.AddToScheme(scheme)) // Needed for CRD checks + + // Create the Kubernetes clients + c, err := client.New(cfg, clientOptions) + require.NoError(t, err, "error initializing Kubernetes client") + cs, err := clientset.NewForConfig(cfg) + require.NoError(t, err, "error initializing Kubernetes clientset") + + exemptFeatures := confsuite.ParseSupportedFeatures(*confflags.ExemptFeatures) + skipTests := confsuite.ParseSkipTests(*confflags.SkipTests) + // Initially, run the GatewayLayerProfile. This will expand as other profiles + // (EPP, ModelServer) are added and can be selected via flags in future iterations. + conformanceProfiles := sets.New(GatewayLayerProfileName) + + // Implementation details from flags + implementation := confsuite.ParseImplementation( + *confflags.ImplementationOrganization, + *confflags.ImplementationProject, + *confflags.ImplementationURL, + *confflags.ImplementationVersion, + *confflags.ImplementationContact, + ) + + // Inference Extension Specific Report Fields + inferenceExtensionVersion := "v0.3.0" + _ = inferenceExtensionVersion // Avoid unused variable error until implemented + + // Create ConformanceOptions + opts := confsuite.ConformanceOptions{ + Client: c, + Clientset: cs, + RestConfig: cfg, + GatewayClassName: *confflags.GatewayClassName, + Debug: *confflags.ShowDebug, + CleanupBaseResources: *confflags.CleanupBaseResources, + SupportedFeatures: sets.New[features.FeatureName](), // Initialize empty, will be populated below + TimeoutConfig: confconfig.DefaultTimeoutConfig(), + SkipTests: skipTests, + ExemptFeatures: exemptFeatures, + RunTest: *confflags.RunTest, + Mode: *confflags.Mode, + Implementation: implementation, + ConformanceProfiles: conformanceProfiles, + ManifestFS: []fs.FS{&Manifests}, // Assumes embed.go defines `Manifests` + ReportOutputPath: *confflags.ReportOutput, + SkipProvisionalTests: *confflags.SkipProvisionalTests, + // TODO: Add the inference extension specific fields to ConformanceOptions struct if needed, + // or handle them during report generation. + // GatewayAPIInferenceExtensionChannel: inferenceExtensionChannel, + // GatewayAPIInferenceExtensionVersion: inferenceExtensionVersion, + } + + // Populate SupportedFeatures based on the GatewayLayerProfile. + // Since all features are mandatory for this profile, add all defined core features. + if opts.ConformanceProfiles.Has(GatewayLayerProfileName) { + for feature := range GatewayLayerProfile.CoreFeatures { + opts.SupportedFeatures.Insert(feature) + } + } + + // Remove any features explicitly exempted via flags. + for feature := range opts.ExemptFeatures { + opts.SupportedFeatures.Delete(feature) + } + + return opts +} + +// RunConformance runs the Inference Extension conformance tests using default options. +func RunConformance(t *testing.T) { + RunConformanceWithOptions(t, DefaultOptions(t)) +} + +// RunConformanceWithOptions runs the Inference Extension conformance tests with specific options. 
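+// Callers typically build opts with DefaultOptions and may override fields such as
+// ReportOutputPath before calling this function.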
+func RunConformanceWithOptions(t *testing.T, opts confsuite.ConformanceOptions) { + t.Logf("Running Inference Extension conformance tests with GatewayClass %s", opts.GatewayClassName) + + // Register the GatewayLayerProfile with the suite runner. + // In the future, other profiles (EPP, ModelServer) will also be registered here, + // and the suite runner will execute tests based on the selected profiles. + confsuite.RegisterConformanceProfile(GatewayLayerProfile) + + // Initialize the test suite. + cSuite, err := confsuite.NewConformanceTestSuite(opts) + require.NoError(t, err, "error initializing conformance suite") + + t.Log("Setting up Inference Extension conformance tests") + // Setup requires the list of tests, which is populated by the init() functions + // triggered by the blank imports at the top of this file. + cSuite.Setup(t, tests.ConformanceTests) + + t.Log("Running Inference Extension conformance tests") + // Run the tests. + err = cSuite.Run(t, tests.ConformanceTests) + require.NoError(t, err, "error running conformance tests") + + // Generate and write the report if requested. + if opts.ReportOutputPath != "" { + t.Log("Generating Inference Extension conformance report") + report, err := cSuite.Report() // Use the existing report generation logic. + require.NoError(t, err, "error generating conformance report") + + // TODO: Modify the report struct here if channel, version need to be modified. + // Example (requires adding fields to confapis.ConformanceReport): + // report.GatewayAPIInferenceExtensionChannel = opts.GatewayAPIInferenceExtensionChannel + // report.GatewayAPIInferenceExtensionVersion = opts.GatewayAPIInferenceExtensionVersion + + err = writeReport(t.Logf, *report, opts.ReportOutputPath) + require.NoError(t, err, "error writing conformance report") + } +} + +// writeReport writes the generated conformance report to the specified output file or logs it. +// Adapted from the core Gateway API suite. +func writeReport(logf func(string, ...any), report confapis.ConformanceReport, output string) error { + rawReport, err := yaml.Marshal(report) + if err != nil { + return fmt.Errorf("error marshaling report: %w", err) + } + + if output != "" { + if err = os.WriteFile(output, rawReport, 0o600); err != nil { + return fmt.Errorf("error writing report file %s: %w", output, err) + } + logf("Conformance report written to %s", output) + } else { + // Log the report YAML to stdout if no output file is specified. + logf("Conformance report:\n%s", string(rawReport)) + } + return nil +} diff --git a/conformance/conformance_test.go b/conformance/conformance_test.go new file mode 100644 index 00000000..de82d5ec --- /dev/null +++ b/conformance/conformance_test.go @@ -0,0 +1,29 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package conformance + +import ( + "testing" +) + +// TestConformance is the top-level function that runs the conformance tests. +// It calls the RunConformance function which sets up the suite and executes +// the registered tests. 
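+// The target GatewayClass, report output path, and similar settings are supplied
+// through the shared Gateway API conformance flags read by DefaultOptions.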
+func TestConformance(t *testing.T) { + // RunConformance is defined in conformance.go + RunConformance(t) +} diff --git a/conformance/embed.go b/conformance/embed.go new file mode 100644 index 00000000..f7fa64c9 --- /dev/null +++ b/conformance/embed.go @@ -0,0 +1,25 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package conformance + +import "embed" + +// Manifests embeds the contents of the conformance/resources directory making +// the YAML files within them available to the test suite at runtime. +// +//go:embed resources/* tests/* +var Manifests embed.FS diff --git a/conformance/reports/README.md b/conformance/reports/README.md new file mode 100644 index 00000000..81652b1c --- /dev/null +++ b/conformance/reports/README.md @@ -0,0 +1,93 @@ +# Conformance Reports for Gateway API Inference Extension + +This directory stores conformance reports submitted by various implementations of the Gateway API Inference Extension. This structure closely follows the [kubernetes-sigs/gateway-api/conformance/reports](https://github.com/kubernetes-sigs/gateway-api/blob/main/conformance/reports/README.md). + +## How this folder is structured + +This folder stores conformance reports organized first by the version of the Gateway API Inference Extension specification they were tested against, and then by the specific conformance profile (e.g., Gateway, EPP, Model Server): + +|-- conformance/reports +| |-- v0.3.0 # Example extension version +| | |-- gateway # Conformance profile/category +| | | |-- my-inference-gateway +| | | | |-- README.md +| | | | |-- experimental-v1.2.3-default-gateway-report.yaml # Example report file +| | | |-- another-implementation +| | | | |-- README.md +| | | | |-- ... +| | |-- epp # Future conformance profile/category +| | | |-- my-epp-implementation +| | | | |-- ... +| | |-- model-server # Future conformance profile/category +| | | |-- ... +| |-- v0.4.0 # Future extension version +| | |-- ... + +## Implementation Submissions + +Each implementation conformant with a specific profile of a specific version of the Gateway API Inference Extension should have its own folder within the corresponding version and profile directory (e.g., `/conformance/reports/v0.3.0/Gateway/my-implementation/`). + +The implementation is the owner of its folder and is responsible for: + +1. Uploading one or more conformance reports (YAML files). +2. Maintaining a mandatory `README.md` file within their folder, structured as follows: + + # My Inference Gateway Implementation (Gateway Profile Conformance) + + General information about the My/Implementation project. + + ## Table of Contents + +| Extension Version Tested | Profile Tested | Implementation Version | Mode | Report | +|--------------------------|----------------|------------------------|---------|----------------------------------------------------------------------------| +| v0.3.0 | Gateway | v1.2.3 | default | [v1.2.3 Gateway report](./experimental-v1.2.3-default-gateway-report.yaml) | +| ... | ... | ... | ... 
| ... |
+
+    ## Reproduce
+
+    Instructions on how to reproduce the claimed report(s).
+
+### Table of Contents (within Implementation README)
+
+The table of contents within an implementation's `README.md` should contain one row for each submitted report and include the following columns:
+
+* **Extension Version Tested**: The version of the Gateway API Inference Extension specification tested against (e.g., `v0.3.0`). Must correspond to the `gatewayAPIInferenceExtensionVersion` field in the report.
+* **Profile Tested**: The specific conformance profile tested (e.g., `Gateway`, `EPP`, `ModelServer`). Must correspond to the `name` of the profile in the `profiles` list within the report.
+* **Implementation Version**: A link to the GitHub/website page for the specific release/commit of the implementation tested. The version value MUST correspond to the `implementation.version` field in the report.
+* **Mode**: The operating mode of the implementation used for the test run (default is `default`). Must correspond to the `mode` field in the report. If a mode other than `default` is used, the "Reproduce" section must explain how to configure it.
+* **Report**: A link to the corresponding report YAML file. Reports MUST be named according to the pattern: `<channel>-<implementation version>-<mode>-<profile>-report.yaml` (e.g., `experimental-v1.2.3-default-gateway-report.yaml`).
+
+### Reproduce Section (within Implementation README)
+
+This section MUST exist and contain the manual or automatic steps required to reproduce the results claimed by the uploaded conformance reports for that specific implementation. If reproduction steps differ significantly between implementation versions, use sub-sections.
+
+## Report Files
+
+Conformance reports MUST be uploaded exactly as generated by the official Gateway API Inference Extension conformance test suite, without any modifications. The "Reproduce" section allows for verification of the submitted report against a fresh run.
+
+### Report Rules
+
+To be accepted, submitted conformance reports must comply with the following rules:
+
+1. **Implementation Details:** All fields within the `implementation` block must have meaningful values:
+    * `organization`: The entity maintaining the implementation (company, open source org, individual).
+    * `project`: The name of the implementation project, unique within the organization.
+    * `url`: A valid URL for the project (e.g., GitHub repository, product page).
+    * `version`: A specific, reproducible snapshot of the implementation (e.g., tag, commit hash, release version). Branch names are not acceptable.
+    * `contact`: A list of contact points (GitHub handles like `@maintainer`, team handles like `@org/team`, email addresses, or support URLs like an issue tracker).
+2. **Inference Extension Versioning:** The report MUST include:
+    * `gatewayAPIInferenceExtensionVersion`: The specific version of the Gateway API Inference Extension specification tested against (e.g., `v0.3.0`).
+3. **Mode:** The `mode` field indicates the implementation's operating mode during the test run.
+4. **Test Profile & Result:**
+    * The report MUST contain exactly one profile result under the `profiles` list for the specific conformance category being submitted (e.g., a report for "Gateway" conformance should only contain the "Gateway" profile result).
+    * The profile's `name` MUST match the conformance category (e.g., `Gateway`, `EPP`, `ModelServer`).
+    * The profile's `result` field MUST be `success`. A `success` result indicates that **all** tests defined within the Gateway API Inference Extension conformance suite for that specific profile and version passed.
+
+## Submission Process
+
+Conformance reports demonstrating a `success` result for a specific profile (e.g., `Gateway`) should be submitted via Pull Request directly to this repository (`kubernetes-sigs/gateway-api-inference-extension`).
+
+1. Create a new folder structure under `/conformance/reports/<version>/<profile>/` named after your implementation (e.g., `/conformance/reports/v0.3.0/Gateway/my-implementation/`).
+2. Add your implementation's `README.md` to this folder, following the structure described above.
+3. Add your generated conformance report YAML file(s) to this folder, ensuring they follow the naming convention `<channel>-<implementation version>-<mode>-<profile>-report.yaml`.
+4. Submit the Pull Request.
diff --git a/conformance/resources/manifests/manifests.yaml b/conformance/resources/manifests/manifests.yaml
new file mode 100644
index 00000000..7b43b784
--- /dev/null
+++ b/conformance/resources/manifests/manifests.yaml
@@ -0,0 +1,49 @@
+# Base Kubernetes resources for the Gateway API Inference Extension conformance tests.
+# This includes namespaces and a minimal set of resources (Gateway, Backend)
+# required by many tests. More specific resources should be defined within
+# individual test files or other resource directories (e.g., sample_backends).
+
+---
+# Namespace for core infrastructure like Gateways.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gateway-conformance-infra
+  labels:
+    gateway-conformance: infra
+
+---
+# Namespace for application backends (potentially simulating model servers
+# or where InferencePools might reside in some tests).
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: gateway-conformance-app-backend
+  labels:
+    gateway-conformance: backend
+
+---
+# A basic Gateway resource that allows HTTPRoutes from the same namespace.
+# Tests can use this as a parent reference for routes that target InferencePools.
+# Using a simple echo server instead of an actual model server to simplify test
+# execution; this design may need to be revised based on test case needs.
+apiVersion: gateway.networking.k8s.io/v1 # Using v1 as per latest Gateway API standard
+kind: Gateway
+metadata:
+  name: same-namespace
+  namespace: gateway-conformance-infra
+spec:
+  # The conformance suite runner will replace this placeholder
+  # with the actual GatewayClass name provided via flags.
+  gatewayClassName: "{GATEWAY_CLASS_NAME}"
+  listeners:
+  - name: http # Standard listener name
+    port: 80
+    protocol: HTTP
+    allowedRoutes:
+      namespaces:
+        from: Same # Restrict to same namespace initially for simplicity
+      kinds:
+      # Allows HTTPRoutes to attach, which can then reference InferencePools.
+      - group: gateway.networking.k8s.io
+        kind: HTTPRoute
diff --git a/conformance/tests/basic/inferencepool_accepted.go b/conformance/tests/basic/inferencepool_accepted.go
new file mode 100644
index 00000000..eae59404
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_accepted.go
@@ -0,0 +1,60 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package basic
+
+import (
+	"testing"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	gatewayv1 "sigs.k8s.io/gateway-api/apis/v1" // For standard condition types
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	"sigs.k8s.io/gateway-api/pkg/features" // For standard feature names
+
+	// Import the tests package to append to ConformanceTests
+	"sigs.k8s.io/gateway-api-inference-extension/conformance/tests"
+	infrakubernetes "sigs.k8s.io/gateway-api-inference-extension/conformance/utils/kubernetes"
+)
+
+func init() {
+	// Register the InferencePoolAccepted test case with the conformance suite.
+	// This ensures it will be discovered and run by the test runner.
+	tests.ConformanceTests = append(tests.ConformanceTests, InferencePoolAccepted)
+}
+
+// InferencePoolAccepted defines the test case for verifying basic InferencePool acceptance.
+var InferencePoolAccepted = suite.ConformanceTest{
+	ShortName:   "InferencePoolAccepted",
+	Description: "A minimal InferencePool resource should be accepted by the controller and report an Accepted condition",
+	Manifests:   []string{"tests/basic/inferencepool_accepted.yaml"},
+	Features:    []features.FeatureName{},
+	Test: func(t *testing.T, s *suite.ConformanceTestSuite) {
+		// poolNN identifies the InferencePool created by the associated manifest file.
+		poolNN := types.NamespacedName{Name: "inferencepool-basic-accepted", Namespace: "gateway-conformance-app-backend"}
+
+		t.Run("InferencePool should have Accepted condition set to True", func(t *testing.T) {
+			// Define the expected status condition. We use the standard "Accepted"
+			// condition type from the Gateway API for consistency.
+			acceptedCondition := metav1.Condition{
+				Type:   string(gatewayv1.GatewayConditionAccepted), // Standard condition type
+				Status: metav1.ConditionTrue,
+				Reason: "", // "" means we don't strictly check the Reason for this basic test.
+			}
+			infrakubernetes.InferencePoolMustHaveCondition(t, s.Client, s.TimeoutConfig, poolNN, acceptedCondition)
+		})
+	},
+}
diff --git a/conformance/tests/basic/inferencepool_accepted.yaml b/conformance/tests/basic/inferencepool_accepted.yaml
new file mode 100644
index 00000000..8ae327d8
--- /dev/null
+++ b/conformance/tests/basic/inferencepool_accepted.yaml
@@ -0,0 +1,27 @@
+# Basic InferencePool for acceptance testing.
+# This manifest defines the minimal required fields to create a valid
+# InferencePool resource, which the InferencePoolAccepted test will use
+# to verify that the controller recognizes and accepts the resource.
+
+apiVersion: inference.networking.x-k8s.io/v1alpha2
+kind: InferencePool
+metadata:
+  # This name must match the 'poolNN' variable defined in the
+  # conformance/tests/basic/inferencepool_accepted.go test file.
+  name: inferencepool-basic-accepted
+  # This namespace should be one created by the base manifests.
+  namespace: gateway-conformance-app-backend
+spec:
+  # --- Selector (Required) ---
+  # Selects the Pods belonging to this pool.
+  selector:
+    app: "infra-backend-v1"
+
+  # --- Target Port (Required) ---
+  # The port the model server container listens on.
+  targetPortNumber: 3000
+
+  # --- Extension Reference ---
+  # References the endpoint picker (EPP) extension service for this pool.
+  extensionRef:
+    name: infra-backend-v1-epp
diff --git a/conformance/tests/main.go b/conformance/tests/main.go
new file mode 100644
index 00000000..fc66c765
--- /dev/null
+++ b/conformance/tests/main.go
@@ -0,0 +1,35 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package tests is the root package for all Gateway API Inference Extension
+// conformance test implementations.
+package tests

+import (
+	// Import the suite package to access the ConformanceTest struct definition.
+	// For the initial version we import directly from the core gateway-api repo;
+	// this may be adjusted in the future if we need to create a copy of
+	// the suite utilities.
+	"sigs.k8s.io/gateway-api/conformance/utils/suite"
+	// Do NOT add blank imports for specific test packages here.
+	// They should be added to the main conformance package instead
+	// to avoid import cycles.
+)
+
+// ConformanceTests holds all the conformance test definitions for the
+// Gateway API Inference Extension suite. Tests are registered from other packages
+// using init() functions like the one in the basic package.
+var ConformanceTests []suite.ConformanceTest
diff --git a/conformance/utils/assertions.go b/conformance/utils/assertions.go
new file mode 100644
index 00000000..c77d0fc5
--- /dev/null
+++ b/conformance/utils/assertions.go
@@ -0,0 +1,25 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package assertions contains custom assertion helper functions used within
+// the Gateway API Inference Extension conformance test suite.
+package assertions
+
+// TODO: Implement custom assertion functions specific to Inference Extension testing.
+// Examples might include:
+// - Asserting specific fields or structures within an inference API response body.
+// - Asserting specific metrics reported by mock model servers or EPPs.
+// - Asserting specific conditions or status fields unique to InferencePool or InferenceModel.
diff --git a/conformance/utils/kubernetes/helpers.go b/conformance/utils/kubernetes/helpers.go
new file mode 100644
index 00000000..3d517863
--- /dev/null
+++ b/conformance/utils/kubernetes/helpers.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package kubernetes contains helper functions for interacting with
+// Kubernetes objects within the conformance test suite.
+package kubernetes
+
+import (
+	"testing"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	// Import necessary utilities from the core Gateway API conformance suite.
+	"sigs.k8s.io/gateway-api/conformance/utils/config"
+)
+
+// InferencePoolMustHaveCondition waits for the specified InferencePool resource
+// to exist and report the expected status condition.
+// This is a placeholder and needs full implementation.
+//
+// TODO: Implement the actual logic for this helper function.
+// It should fetch the InferencePool using the provided client and check its
+// Status.Conditions field, polling until the condition is met or a timeout
+// occurs, similar to the upstream HTTPRouteMustHaveCondition helper.
+func InferencePoolMustHaveCondition(t *testing.T, c client.Client, timeoutConfig config.TimeoutConfig, poolNN types.NamespacedName, expectedCondition metav1.Condition) {
+	t.Helper() // Marks this function as a test helper.
+
+	// Placeholder implementation: log and skip the check.
+	t.Logf("Verification for InferencePool condition (%s=%s) on %s - Placeholder: Skipping check.",
+		expectedCondition.Type, expectedCondition.Status, poolNN.String())
+
+	// Skip the test using this helper until it's fully implemented.
+	t.Skip("InferencePoolMustHaveCondition helper not yet implemented")
+}
diff --git a/conformance/utils/traffic/traffic.go b/conformance/utils/traffic/traffic.go
new file mode 100644
index 00000000..4f13f980
--- /dev/null
+++ b/conformance/utils/traffic/traffic.go
@@ -0,0 +1,22 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package traffic contains helper functions specifically for generating,
+// sending, and validating network traffic related to inference workloads
+// within the Gateway API Inference Extension conformance tests.
+package traffic
+
+// TODO: Add helpers for specific inference protocols or request patterns as needed.
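[Editor's note] For context on where `InferencePoolMustHaveCondition` is headed, the sketch below shows one possible shape for the polling logic described in the TODO above. It is an illustration, not part of this patch: it assumes the `InferencePool` type from this repo's `api/v1alpha2` package surfaces conditions at `Status.Conditions` (as the TODO suggests) and that `DefaultTestTimeout` from the upstream conformance `config.TimeoutConfig` is an appropriate bound; the function name `inferencePoolMustHaveConditionSketch` is hypothetical.

```go
package kubernetes

import (
	"context"
	"testing"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/gateway-api/conformance/utils/config"

	"sigs.k8s.io/gateway-api-inference-extension/api/v1alpha2"
)

// inferencePoolMustHaveConditionSketch is a hypothetical implementation of the
// placeholder helper. It polls the InferencePool until the expected condition
// appears or the timeout elapses, mirroring HTTPRouteMustHaveCondition upstream.
func inferencePoolMustHaveConditionSketch(t *testing.T, c client.Client, timeoutConfig config.TimeoutConfig, poolNN types.NamespacedName, expected metav1.Condition) {
	t.Helper()

	err := wait.PollUntilContextTimeout(context.Background(), time.Second, timeoutConfig.DefaultTestTimeout, true,
		func(ctx context.Context) (bool, error) {
			pool := &v1alpha2.InferencePool{}
			if err := c.Get(ctx, poolNN, pool); err != nil {
				return false, nil // Tolerate transient errors; keep polling.
			}
			// NOTE: assumes conditions live at pool.Status.Conditions; adjust
			// the lookup if the API nests them under per-parent status.
			for _, cond := range pool.Status.Conditions {
				if cond.Type == expected.Type && cond.Status == expected.Status &&
					(expected.Reason == "" || expected.Reason == cond.Reason) {
					return true, nil
				}
			}
			return false, nil
		})
	if err != nil {
		t.Fatalf("InferencePool %s never reported condition %s=%s: %v",
			poolNN, expected.Type, expected.Status, err)
	}
}
```

Whatever the final implementation looks like, keeping the signature identical to the placeholder lets the basic acceptance test above switch over without changes.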
diff --git a/go.mod b/go.mod index 076bdf4b..30d0487e 100644 --- a/go.mod +++ b/go.mod @@ -25,7 +25,8 @@ require ( k8s.io/component-base v0.32.4 k8s.io/utils v0.0.0-20241210054802-24370beab758 sigs.k8s.io/controller-runtime v0.20.4 - sigs.k8s.io/structured-merge-diff/v4 v4.7.0 + sigs.k8s.io/gateway-api v1.2.1 + sigs.k8s.io/structured-merge-diff/v4 v4.6.0 sigs.k8s.io/yaml v1.4.0 ) @@ -42,17 +43,17 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/emicklei/go-restful/v3 v3.11.0 // indirect + github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect - github.com/fatih/color v1.16.0 // indirect + github.com/fatih/color v1.17.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.7.0 // indirect github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect - github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.0 // indirect @@ -67,10 +68,10 @@ require ( github.com/google/gofuzz v1.2.0 // indirect github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 // indirect github.com/google/uuid v1.6.0 // indirect - github.com/gorilla/websocket v1.5.0 // indirect + github.com/gorilla/websocket v1.5.1 // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect github.com/huandu/xstrings v1.3.3 // indirect - github.com/imdario/mergo v0.3.11 // indirect + github.com/imdario/mergo v0.3.16 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -128,6 +129,6 @@ require ( k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 // indirect - sigs.k8s.io/controller-tools v0.14.0 // indirect + sigs.k8s.io/controller-tools v0.16.3 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect ) diff --git a/go.sum b/go.sum index 0258fc7a..6688c578 100644 --- a/go.sum +++ b/go.sum @@ -23,25 +23,24 @@ github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XL github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 h1:boJj011Hh+874zpIySeApCX4GeOjPl9qhRF3QuIZq+Q= github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/elastic/crd-ref-docs v0.1.0 
h1:Cr5kz89QB3Iuuj7dhAfLMApCrChEGAaIBTxGk/xuRKw= github.com/elastic/crd-ref-docs v0.1.0/go.mod h1:X83mMBdJt05heJUYiS3T0yJ/JkCuliuhSUNav5Gjo/U= -github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= -github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.12.0 h1:y2DdzBAURM29NFF94q6RaY4vjIH1rtwDapwQtU84iWk= +github.com/emicklei/go-restful/v3 v3.12.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= -github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= -github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= +github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= -github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= -github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= +github.com/fatih/color v1.17.0 h1:GlRw1BRJxkpqUCBKzKOw098ed57fEsKeNjpTe3cSjK4= +github.com/fatih/color v1.17.0/go.mod h1:YZ7TlrGPkiz6ku9fK3TLD/pl3CpsiFyu8N92HLgmosI= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= @@ -55,12 +54,10 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= -github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= -github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= -github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= +github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs= @@ -96,14 +93,14 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 
h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/gorilla/websocket v1.5.0 h1:PPwGk2jz7EePpoHN/+ClbZu8SPxiqlu12wZP/3sWmnc= -github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gorilla/websocket v1.5.1 h1:gmztn0JnHVt9JZquRuzLw3g4wouNVzKL15iLr/zn/QY= +github.com/gorilla/websocket v1.5.1/go.mod h1:x3kM2JMyaluk02fnUJpQuwD2dCS5NDG2ZHL0uE0tcaY= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4= github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE= -github.com/imdario/mergo v0.3.11 h1:3tnifQM4i+fbajXKBHXWEH+KvNHqojZ778UH75j3bGA= -github.com/imdario/mergo v0.3.11/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= +github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= +github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -114,11 +111,8 @@ github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= -github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= @@ -294,7 +288,6 @@ gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= -gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= @@ -326,13 +319,15 @@ sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0 h1:CPT0ExVicCzcp sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.0/go.mod 
h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU= sigs.k8s.io/controller-runtime v0.20.4/go.mod h1:xg2XB0K5ShQzAgsoujxuKN4LNXR2LfwwHsPj7Iaw+XY= -sigs.k8s.io/controller-tools v0.14.0 h1:rnNoCC5wSXlrNoBKKzL70LNJKIQKEzT6lloG6/LF73A= -sigs.k8s.io/controller-tools v0.14.0/go.mod h1:TV7uOtNNnnR72SpzhStvPkoS/U5ir0nMudrkrC4M9Sc= +sigs.k8s.io/controller-tools v0.16.3 h1:z48C5/d4jCVQQvtiSBL5MYyZ3EO2eFIOXrIKMgHVhFY= +sigs.k8s.io/controller-tools v0.16.3/go.mod h1:AEj6k+w1kYpLZv2einOH3mj52ips4W/6FUjnB5tkJGs= +sigs.k8s.io/gateway-api v1.2.1 h1:fZZ/+RyRb+Y5tGkwxFKuYuSRQHu9dZtbjenblleOLHM= +sigs.k8s.io/gateway-api v1.2.1/go.mod h1:EpNfEXNjiYfUJypf0eZ0P5iXA9ekSGWaS1WgPaM42X0= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 h1:/Rv+M11QRah1itp8VhT6HoVx1Ray9eB4DBr+K+/sCJ8= sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3/go.mod h1:18nIHnGi6636UCz6m8i4DhaJ65T6EruyzmoQqI2BVDo= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016 h1:kXv6kKdoEtedwuqMmkqhbkgvYKeycVbC8+iPCP9j5kQ= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= -sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0 h1:IUA9nvMmnKWcj5jl84xn+T5MnlZKThmUW1TdblaLVAc= +sigs.k8s.io/structured-merge-diff/v4 v4.6.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= From fc3c173fd4f9ddad4364cdc82dc73592e66ff905 Mon Sep 17 00:00:00 2001 From: Cong Liu Date: Tue, 29 Apr 2025 09:22:02 -0700 Subject: [PATCH 73/74] Move scheduler initialization up to the main (#757) --- cmd/epp/main.go | 3 +++ pkg/epp/handlers/request.go | 5 +++++ pkg/epp/server/runserver.go | 4 ++-- test/integration/epp/hermetic_test.go | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/cmd/epp/main.go b/cmd/epp/main.go index c0a87e62..bac4b852 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -41,6 +41,7 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" ) @@ -169,6 +170,7 @@ func run() error { datastore := datastore.NewDatastore(ctx, pmf) + scheduler := scheduling.NewScheduler(datastore) serverRunner := &runserver.ExtProcServerRunner{ GrpcPort: *grpcPort, DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace, @@ -178,6 +180,7 @@ func run() error { SecureServing: *secureServing, CertPath: *certPath, RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval, + Scheduler: scheduler, } if err := serverRunner.SetupWithManager(ctx, mgr); err != nil { setupLog.Error(err, "Failed to setup ext-proc controllers") diff --git a/pkg/epp/handlers/request.go b/pkg/epp/handlers/request.go index 8d30e543..cfcd82ec 100644 --- a/pkg/epp/handlers/request.go +++ b/pkg/epp/handlers/request.go @@ -46,6 +46,10 @@ func (s *StreamingServer) HandleRequestBody( if !ok { return reqCtx, 
errutil.Error{Code: errutil.BadRequest, Msg: "model not found in request"} } + prompt, ok := requestBodyMap["prompt"].(string) + if !ok { + return reqCtx, errutil.Error{Code: errutil.BadRequest, Msg: "prompt not found in request"} + } modelName := model @@ -66,6 +70,7 @@ func (s *StreamingServer) HandleRequestBody( Model: model, ResolvedTargetModel: modelName, Critical: modelObj.Spec.Criticality != nil && *modelObj.Spec.Criticality == v1alpha2.Critical, + Prompt: prompt, } logger.V(logutil.DEBUG).Info("LLM request assembled", "request", llmReq) diff --git a/pkg/epp/server/runserver.go b/pkg/epp/server/runserver.go index 0c0a6a6d..687a555c 100644 --- a/pkg/epp/server/runserver.go +++ b/pkg/epp/server/runserver.go @@ -35,7 +35,6 @@ import ( "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/controller" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/handlers" - "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" ) // ExtProcServerRunner provides methods to manage an external process server. @@ -49,6 +48,7 @@ type ExtProcServerRunner struct { CertPath string UseStreaming bool RefreshPrometheusMetricsInterval time.Duration + Scheduler handlers.Scheduler // This should only be used in tests. We won't need this once we don't inject metrics in the tests. // TODO:(https://github.com/kubernetes-sigs/gateway-api-inference-extension/issues/432) Cleanup @@ -137,7 +137,7 @@ func (r *ExtProcServerRunner) AsRunnable(logger logr.Logger) manager.Runnable { } else { srv = grpc.NewServer() } - extProcServer := handlers.NewStreamingServer(scheduling.NewScheduler(r.Datastore), r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) + extProcServer := handlers.NewStreamingServer(r.Scheduler, r.DestinationEndpointHintMetadataNamespace, r.DestinationEndpointHintKey, r.Datastore) extProcPb.RegisterExternalProcessorServer( srv, extProcServer, diff --git a/test/integration/epp/hermetic_test.go b/test/integration/epp/hermetic_test.go index 35361329..c63fd017 100644 --- a/test/integration/epp/hermetic_test.go +++ b/test/integration/epp/hermetic_test.go @@ -65,6 +65,7 @@ import ( backendmetrics "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/backend/metrics" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/datastore" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/metrics" + "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/scheduling" "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" runserver "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/server" logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging" @@ -1351,6 +1352,7 @@ func BeforeSuite() func() { // Adjust from defaults serverRunner.PoolNamespacedName = types.NamespacedName{Name: "vllm-llama3-8b-instruct-pool", Namespace: "default"} serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf) + serverRunner.Scheduler = scheduling.NewScheduler(serverRunner.Datastore) serverRunner.SecureServing = false if err := serverRunner.SetupWithManager(context.Background(), mgr); err != nil { From 927c700d6ff876e758b96a50f69d99d00e25277f Mon Sep 17 00:00:00 2001 From: Jeff Luo Date: Tue, 29 Apr 2025 14:09:54 -0400 Subject: [PATCH 74/74] Add inference_extension_info metric for project metadata (#744) Start with just commit, version information will be added in a follow-up change. 
Verified: ``` inference_extension_info{commit="60f8c57bb95b656a75d27564d5ff01c060bcdba5"} 1 ``` --- Dockerfile | 3 ++- cmd/epp/main.go | 2 ++ pkg/epp/metrics/metrics.go | 44 ++++++++++++++++++++++++++++++++++++++ site-src/guides/metrics.md | 2 ++ 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 8fb00dfb..d050b869 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,8 +19,9 @@ COPY cmd ./cmd COPY pkg ./pkg COPY internal ./internal COPY api ./api +COPY .git ./.git WORKDIR /src/cmd/epp -RUN go build -o /epp +RUN go build -buildvcs=true -o /epp ## Multistage deploy FROM ${BASE_IMAGE} diff --git a/cmd/epp/main.go b/cmd/epp/main.go index bac4b852..2bd779c5 100644 --- a/cmd/epp/main.go +++ b/cmd/epp/main.go @@ -250,6 +250,8 @@ func registerHealthServer(mgr manager.Manager, logger logr.Logger, ds datastore. func registerMetricsHandler(mgr manager.Manager, port int, cfg *rest.Config) error { metrics.Register() + metrics.RecordInferenceExtensionInfo() + // Init HTTP server. h, err := metricsHandlerWithAuthenticationAndAuthorization(cfg) if err != nil { diff --git a/pkg/epp/metrics/metrics.go b/pkg/epp/metrics/metrics.go index 56dcfca8..6df3dab3 100644 --- a/pkg/epp/metrics/metrics.go +++ b/pkg/epp/metrics/metrics.go @@ -18,6 +18,7 @@ package metrics import ( "context" + "runtime/debug" "sync" "time" @@ -31,6 +32,12 @@ const ( InferenceModelComponent = "inference_model" InferencePoolComponent = "inference_pool" EPPComponent = "endpoint_picker" + InferenceExtension = "inference_extension" +) + +var ( + // The git hash of the latest commit in the build. + CommitHash string ) var ( @@ -191,6 +198,17 @@ var ( }, []string{"plugin_type", "plugin_name"}, ) + + // Info Metrics + InferenceExtensionInfo = compbasemetrics.NewGaugeVec( + &compbasemetrics.GaugeOpts{ + Subsystem: InferenceExtension, + Name: "info", + Help: "General information of the current build of Inference Extension.", + StabilityLevel: compbasemetrics.ALPHA, + }, + []string{"commit"}, + ) ) var registerMetrics sync.Once @@ -213,6 +231,8 @@ func Register() { legacyregistry.MustRegister(inferencePoolReadyPods) legacyregistry.MustRegister(SchedulerPluginProcessingLatencies) + + legacyregistry.MustRegister(InferenceExtensionInfo) }) } @@ -315,3 +335,27 @@ func RecordinferencePoolReadyPods(name string, runningPods float64) { func RecordSchedulerPluginProcessingLatency(pluginType, pluginName string, duration time.Duration) { SchedulerPluginProcessingLatencies.WithLabelValues(pluginType, pluginName).Observe(duration.Seconds()) } + +func RecordInferenceExtensionInfo() { + if CommitHash != "" { + InferenceExtensionInfo.WithLabelValues(CommitHash).Set(1) + } +} + +func init() { + info, ok := debug.ReadBuildInfo() + if !ok { + return + } + + var Commit = func(i *debug.BuildInfo) string { + for _, setting := range i.Settings { + if setting.Key == "vcs.revision" { + return setting.Value + } + } + return "" + }(info) + + CommitHash = Commit +} diff --git a/site-src/guides/metrics.md b/site-src/guides/metrics.md index d16c7d47..ab3ba3fd 100644 --- a/site-src/guides/metrics.md +++ b/site-src/guides/metrics.md @@ -35,6 +35,8 @@ curl -i ${IP}:${PORT}/v1/completions -H 'Content-Type: application/json' -d '{ | inference_pool_average_kv_cache_utilization | Gauge | The average kv cache utilization for an inference server pool. | `name`=<inference-pool-name> | ALPHA | | inference_pool_average_queue_size | Gauge | The average number of requests pending in the model server queue. 
| `name`=<inference-pool-name> | ALPHA |
| inference_pool_ready_pods | Gauge | The number of ready pods for an inference server pool. | `name`=<inference-pool-name> | ALPHA |
+| inference_extension_info | Gauge | General information about the current build. | `commit`=<hash-of-the-build> | ALPHA |
+
 ## Scrape Metrics
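[Editor's note] As context for the metric above: the `commit` label comes from Go's VCS build stamping. Building with `-buildvcs=true` (with the `.git` directory available, which is why the Dockerfile now copies it) embeds `vcs.revision` into the binary, and `debug.ReadBuildInfo()` reads it back at runtime, exactly as the `init()` in `pkg/epp/metrics/metrics.go` does. A minimal, self-contained illustration of that mechanism (not part of the patch):

```go
package main

import (
	"fmt"
	"runtime/debug"
)

// Prints the git commit stamped into the binary by `go build -buildvcs=true`.
// When built outside a git checkout (or with -buildvcs=false), no revision
// is embedded and this prints "unknown".
func main() {
	commit := "unknown"
	if info, ok := debug.ReadBuildInfo(); ok {
		for _, setting := range info.Settings {
			if setting.Key == "vcs.revision" {
				commit = setting.Value
			}
		}
	}
	fmt.Println("vcs.revision:", commit)
}
```

Once the EPP is running, the resulting series can be scraped from the metrics endpoint as `inference_extension_info{commit="<hash>"} 1`, matching the verification output quoted in the commit message.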