diff --git a/.coveragerc b/.coveragerc index 9d801989cb..94ecfe88ff 100644 --- a/.coveragerc +++ b/.coveragerc @@ -34,6 +34,5 @@ exclude_lines = omit = */gapic/*.py */proto/*.py - */core/*.py */site-packages/*.py google/cloud/__init__.py diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 597e0c3261..ae437bcb2f 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:e8dcfd7cbfd8beac3a3ff8d3f3185287ea0625d859168cc80faccfc9a7a00455 -# created: 2024-09-16T21:04:09.091105552Z + digest: sha256:5efdf8d38e5a22c1ec9e5541cbdfde56399bdffcb6f531183f84ac66052a8024 +# created: 2024-10-23T18:04:53.195998718Z diff --git a/.kokoro/continuous/doctest.cfg b/.kokoro/continuous/doctest.cfg index dca21d43fd..6016700408 100644 --- a/.kokoro/continuous/doctest.cfg +++ b/.kokoro/continuous/doctest.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "doctest" + value: "doctest cleanup" } env_vars: { diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt index 7129c77155..66eacc82f0 100644 --- a/.kokoro/docker/docs/requirements.txt +++ b/.kokoro/docker/docs/requirements.txt @@ -4,39 +4,39 @@ # # pip-compile --allow-unsafe --generate-hashes requirements.in # -argcomplete==3.4.0 \ - --hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 \ - --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f +argcomplete==3.5.1 \ + --hash=sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363 \ + --hash=sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4 # via nox colorlog==6.8.2 \ --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ --hash=sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33 # via nox -distlib==0.3.8 \ - --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ - --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 +distlib==0.3.9 \ + --hash=sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 \ + --hash=sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403 # via virtualenv -filelock==3.15.4 \ - --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ - --hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 +filelock==3.16.1 \ + --hash=sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 \ + --hash=sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435 # via virtualenv -nox==2024.4.15 \ - --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ - --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f +nox==2024.10.9 \ + --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \ + --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95 # via -r requirements.in packaging==24.1 \ --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 # via nox -platformdirs==4.2.2 \ - --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ - --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 +platformdirs==4.3.6 \ + 
--hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \ + --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb # via virtualenv -tomli==2.0.1 \ - --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ - --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f +tomli==2.0.2 \ + --hash=sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38 \ + --hash=sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed # via nox -virtualenv==20.26.3 \ - --hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ - --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 +virtualenv==20.26.6 \ + --hash=sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48 \ + --hash=sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2 # via nox diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index e6e409f29c..5f7559f9da 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -30,7 +30,7 @@ env_vars: { env_vars: { key: "V2_STAGING_BUCKET" - # Push non-cloud library docs to `docs-staging-v2-staging` instead of the + # Push non-cloud library docs to `docs-staging-v2-dev` instead of the # Cloud RAD bucket `docs-staging-v2` value: "docs-staging-v2" } @@ -64,4 +64,4 @@ before_action { keyname: "docuploader_service_account" } } -} \ No newline at end of file +} diff --git a/.kokoro/presubmit/doctest.cfg b/.kokoro/presubmit/doctest.cfg index dca21d43fd..6016700408 100644 --- a/.kokoro/presubmit/doctest.cfg +++ b/.kokoro/presubmit/doctest.cfg @@ -3,7 +3,7 @@ # Only run this nox session. env_vars: { key: "NOX_SESSION" - value: "doctest" + value: "doctest cleanup" } env_vars: { diff --git a/.kokoro/release.sh b/.kokoro/release.sh index b1dd5f09ec..a2eae5fda1 100755 --- a/.kokoro/release.sh +++ b/.kokoro/release.sh @@ -23,7 +23,7 @@ python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source / export PYTHONUNBUFFERED=1 # Move into the package, build the distribution and upload. -TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-2") +TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-3") cd github/python-bigquery-dataframes python3 setup.py sdist bdist_wheel twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg index 824d62f257..146dd8f451 100644 --- a/.kokoro/release/common.cfg +++ b/.kokoro/release/common.cfg @@ -28,17 +28,11 @@ before_action { fetch_keystore { keystore_resource { keystore_config_id: 73713 - keyname: "google-cloud-pypi-token-keystore-2" + keyname: "google-cloud-pypi-token-keystore-3" } } } -# Tokens needed to report release status back to GitHub -env_vars: { - key: "SECRET_MANAGER_KEYS" - value: "releasetool-publish-reporter-app,releasetool-publish-reporter-googleapis-installation,releasetool-publish-reporter-pem" -} - # Store the packages we uploaded to PyPI. That way, we have a record of exactly # what we published, which we can use to generate SBOMs and attestations. 
action { diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 9622baf0ba..006d8ef931 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -4,79 +4,94 @@ # # pip-compile --allow-unsafe --generate-hashes requirements.in # -argcomplete==3.4.0 \ - --hash=sha256:69a79e083a716173e5532e0fa3bef45f793f4e61096cf52b5a42c0211c8b8aa5 \ - --hash=sha256:c2abcdfe1be8ace47ba777d4fce319eb13bf8ad9dace8d085dcad6eded88057f +argcomplete==3.5.1 \ + --hash=sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363 \ + --hash=sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4 # via nox -attrs==23.2.0 \ - --hash=sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30 \ - --hash=sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1 +attrs==24.2.0 \ + --hash=sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346 \ + --hash=sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2 # via gcp-releasetool backports-tarfile==1.2.0 \ --hash=sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34 \ --hash=sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991 # via jaraco-context -cachetools==5.3.3 \ - --hash=sha256:0abad1021d3f8325b2fc1d2e9c8b9c9d57b04c3932657a72465447332c24d945 \ - --hash=sha256:ba29e2dfa0b8b556606f097407ed1aa62080ee108ab0dc5ec9d6a723a007d105 +cachetools==5.5.0 \ + --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \ + --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a # via google-auth -certifi==2024.7.4 \ - --hash=sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b \ - --hash=sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90 +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 # via requests -cffi==1.16.0 \ - --hash=sha256:0c9ef6ff37e974b73c25eecc13952c55bceed9112be2d9d938ded8e856138bcc \ - --hash=sha256:131fd094d1065b19540c3d72594260f118b231090295d8c34e19a7bbcf2e860a \ - --hash=sha256:1b8ebc27c014c59692bb2664c7d13ce7a6e9a629be20e54e7271fa696ff2b417 \ - --hash=sha256:2c56b361916f390cd758a57f2e16233eb4f64bcbeee88a4881ea90fca14dc6ab \ - --hash=sha256:2d92b25dbf6cae33f65005baf472d2c245c050b1ce709cc4588cdcdd5495b520 \ - --hash=sha256:31d13b0f99e0836b7ff893d37af07366ebc90b678b6664c955b54561fc36ef36 \ - --hash=sha256:32c68ef735dbe5857c810328cb2481e24722a59a2003018885514d4c09af9743 \ - --hash=sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8 \ - --hash=sha256:582215a0e9adbe0e379761260553ba11c58943e4bbe9c36430c4ca6ac74b15ed \ - --hash=sha256:5b50bf3f55561dac5438f8e70bfcdfd74543fd60df5fa5f62d94e5867deca684 \ - --hash=sha256:5bf44d66cdf9e893637896c7faa22298baebcd18d1ddb6d2626a6e39793a1d56 \ - --hash=sha256:6602bc8dc6f3a9e02b6c22c4fc1e47aa50f8f8e6d3f78a5e16ac33ef5fefa324 \ - --hash=sha256:673739cb539f8cdaa07d92d02efa93c9ccf87e345b9a0b556e3ecc666718468d \ - --hash=sha256:68678abf380b42ce21a5f2abde8efee05c114c2fdb2e9eef2efdb0257fba1235 \ - --hash=sha256:68e7c44931cc171c54ccb702482e9fc723192e88d25a0e133edd7aff8fcd1f6e \ - --hash=sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088 \ - --hash=sha256:748dcd1e3d3d7cd5443ef03ce8685043294ad6bd7c02a38d1bd367cfd968e000 \ - --hash=sha256:7651c50c8c5ef7bdb41108b7b8c5a83013bfaa8a935590c5d74627c047a583c7 \ - 
--hash=sha256:7b78010e7b97fef4bee1e896df8a4bbb6712b7f05b7ef630f9d1da00f6444d2e \ - --hash=sha256:7e61e3e4fa664a8588aa25c883eab612a188c725755afff6289454d6362b9673 \ - --hash=sha256:80876338e19c951fdfed6198e70bc88f1c9758b94578d5a7c4c91a87af3cf31c \ - --hash=sha256:8895613bcc094d4a1b2dbe179d88d7fb4a15cee43c052e8885783fac397d91fe \ - --hash=sha256:88e2b3c14bdb32e440be531ade29d3c50a1a59cd4e51b1dd8b0865c54ea5d2e2 \ - --hash=sha256:8f8e709127c6c77446a8c0a8c8bf3c8ee706a06cd44b1e827c3e6a2ee6b8c098 \ - --hash=sha256:9cb4a35b3642fc5c005a6755a5d17c6c8b6bcb6981baf81cea8bfbc8903e8ba8 \ - --hash=sha256:9f90389693731ff1f659e55c7d1640e2ec43ff725cc61b04b2f9c6d8d017df6a \ - --hash=sha256:a09582f178759ee8128d9270cd1344154fd473bb77d94ce0aeb2a93ebf0feaf0 \ - --hash=sha256:a6a14b17d7e17fa0d207ac08642c8820f84f25ce17a442fd15e27ea18d67c59b \ - --hash=sha256:a72e8961a86d19bdb45851d8f1f08b041ea37d2bd8d4fd19903bc3083d80c896 \ - --hash=sha256:abd808f9c129ba2beda4cfc53bde801e5bcf9d6e0f22f095e45327c038bfe68e \ - --hash=sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9 \ - --hash=sha256:b29ebffcf550f9da55bec9e02ad430c992a87e5f512cd63388abb76f1036d8d2 \ - --hash=sha256:b2ca4e77f9f47c55c194982e10f058db063937845bb2b7a86c84a6cfe0aefa8b \ - --hash=sha256:b7be2d771cdba2942e13215c4e340bfd76398e9227ad10402a8767ab1865d2e6 \ - --hash=sha256:b84834d0cf97e7d27dd5b7f3aca7b6e9263c56308ab9dc8aae9784abb774d404 \ - --hash=sha256:b86851a328eedc692acf81fb05444bdf1891747c25af7529e39ddafaf68a4f3f \ - --hash=sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0 \ - --hash=sha256:c0f31130ebc2d37cdd8e44605fb5fa7ad59049298b3f745c74fa74c62fbfcfc4 \ - --hash=sha256:c6a164aa47843fb1b01e941d385aab7215563bb8816d80ff3a363a9f8448a8dc \ - --hash=sha256:d8a9d3ebe49f084ad71f9269834ceccbf398253c9fac910c4fd7053ff1386936 \ - --hash=sha256:db8e577c19c0fda0beb7e0d4e09e0ba74b1e4c092e0e40bfa12fe05b6f6d75ba \ - --hash=sha256:dc9b18bf40cc75f66f40a7379f6a9513244fe33c0e8aa72e2d56b0196a7ef872 \ - --hash=sha256:e09f3ff613345df5e8c3667da1d918f9149bd623cd9070c983c013792a9a62eb \ - --hash=sha256:e4108df7fe9b707191e55f33efbcb2d81928e10cea45527879a4749cbe472614 \ - --hash=sha256:e6024675e67af929088fda399b2094574609396b1decb609c55fa58b028a32a1 \ - --hash=sha256:e70f54f1796669ef691ca07d046cd81a29cb4deb1e5f942003f401c0c4a2695d \ - --hash=sha256:e715596e683d2ce000574bae5d07bd522c781a822866c20495e52520564f0969 \ - --hash=sha256:e760191dd42581e023a68b758769e2da259b5d52e3103c6060ddc02c9edb8d7b \ - --hash=sha256:ed86a35631f7bfbb28e108dd96773b9d5a6ce4811cf6ea468bb6a359b256b1e4 \ - --hash=sha256:ee07e47c12890ef248766a6e55bd38ebfb2bb8edd4142d56db91b21ea68b7627 \ - --hash=sha256:fa3a0128b152627161ce47201262d3140edb5a5c3da88d73a1b790a959126956 \ - --hash=sha256:fcc8eb6d5902bb1cf6dc4f187ee3ea80a1eba0a89aba40a5cb20a5087d961357 +cffi==1.17.1 \ + --hash=sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8 \ + --hash=sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2 \ + --hash=sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1 \ + --hash=sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15 \ + --hash=sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36 \ + --hash=sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824 \ + --hash=sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8 \ + --hash=sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36 \ + 
--hash=sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17 \ + --hash=sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf \ + --hash=sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc \ + --hash=sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3 \ + --hash=sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed \ + --hash=sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702 \ + --hash=sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1 \ + --hash=sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8 \ + --hash=sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903 \ + --hash=sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6 \ + --hash=sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d \ + --hash=sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b \ + --hash=sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e \ + --hash=sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be \ + --hash=sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c \ + --hash=sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683 \ + --hash=sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9 \ + --hash=sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c \ + --hash=sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8 \ + --hash=sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1 \ + --hash=sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4 \ + --hash=sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655 \ + --hash=sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67 \ + --hash=sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595 \ + --hash=sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0 \ + --hash=sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65 \ + --hash=sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41 \ + --hash=sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6 \ + --hash=sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401 \ + --hash=sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6 \ + --hash=sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3 \ + --hash=sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16 \ + --hash=sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93 \ + --hash=sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e \ + --hash=sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4 \ + --hash=sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964 \ + --hash=sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c \ + --hash=sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576 \ + --hash=sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0 \ + --hash=sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3 \ + --hash=sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662 \ + --hash=sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3 \ + 
--hash=sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff \ + --hash=sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5 \ + --hash=sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd \ + --hash=sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f \ + --hash=sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5 \ + --hash=sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14 \ + --hash=sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d \ + --hash=sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9 \ + --hash=sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7 \ + --hash=sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382 \ + --hash=sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a \ + --hash=sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e \ + --hash=sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a \ + --hash=sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4 \ + --hash=sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99 \ + --hash=sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87 \ + --hash=sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b # via cryptography charset-normalizer==2.1.1 \ --hash=sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845 \ @@ -97,72 +112,67 @@ colorlog==6.8.2 \ # via # gcp-docuploader # nox -cryptography==42.0.8 \ - --hash=sha256:013629ae70b40af70c9a7a5db40abe5d9054e6f4380e50ce769947b73bf3caad \ - --hash=sha256:2346b911eb349ab547076f47f2e035fc8ff2c02380a7cbbf8d87114fa0f1c583 \ - --hash=sha256:2f66d9cd9147ee495a8374a45ca445819f8929a3efcd2e3df6428e46c3cbb10b \ - --hash=sha256:2f88d197e66c65be5e42cd72e5c18afbfae3f741742070e3019ac8f4ac57262c \ - --hash=sha256:31f721658a29331f895a5a54e7e82075554ccfb8b163a18719d342f5ffe5ecb1 \ - --hash=sha256:343728aac38decfdeecf55ecab3264b015be68fc2816ca800db649607aeee648 \ - --hash=sha256:5226d5d21ab681f432a9c1cf8b658c0cb02533eece706b155e5fbd8a0cdd3949 \ - --hash=sha256:57080dee41209e556a9a4ce60d229244f7a66ef52750f813bfbe18959770cfba \ - --hash=sha256:5a94eccb2a81a309806027e1670a358b99b8fe8bfe9f8d329f27d72c094dde8c \ - --hash=sha256:6b7c4f03ce01afd3b76cf69a5455caa9cfa3de8c8f493e0d3ab7d20611c8dae9 \ - --hash=sha256:7016f837e15b0a1c119d27ecd89b3515f01f90a8615ed5e9427e30d9cdbfed3d \ - --hash=sha256:81884c4d096c272f00aeb1f11cf62ccd39763581645b0812e99a91505fa48e0c \ - --hash=sha256:81d8a521705787afe7a18d5bfb47ea9d9cc068206270aad0b96a725022e18d2e \ - --hash=sha256:8d09d05439ce7baa8e9e95b07ec5b6c886f548deb7e0f69ef25f64b3bce842f2 \ - --hash=sha256:961e61cefdcb06e0c6d7e3a1b22ebe8b996eb2bf50614e89384be54c48c6b63d \ - --hash=sha256:9c0c1716c8447ee7dbf08d6db2e5c41c688544c61074b54fc4564196f55c25a7 \ - --hash=sha256:a0608251135d0e03111152e41f0cc2392d1e74e35703960d4190b2e0f4ca9c70 \ - --hash=sha256:a0c5b2b0585b6af82d7e385f55a8bc568abff8923af147ee3c07bd8b42cda8b2 \ - --hash=sha256:ad803773e9df0b92e0a817d22fd8a3675493f690b96130a5e24f1b8fabbea9c7 \ - --hash=sha256:b297f90c5723d04bcc8265fc2a0f86d4ea2e0f7ab4b6994459548d3a6b992a14 \ - --hash=sha256:ba4f0a211697362e89ad822e667d8d340b4d8d55fae72cdd619389fb5912eefe \ - --hash=sha256:c4783183f7cb757b73b2ae9aed6599b96338eb957233c58ca8f49a49cc32fd5e \ - --hash=sha256:c9bb2ae11bfbab395bdd072985abde58ea9860ed84e59dbc0463a5d0159f5b71 \ - 
--hash=sha256:cafb92b2bc622cd1aa6a1dce4b93307792633f4c5fe1f46c6b97cf67073ec961 \ - --hash=sha256:d45b940883a03e19e944456a558b67a41160e367a719833c53de6911cabba2b7 \ - --hash=sha256:dc0fdf6787f37b1c6b08e6dfc892d9d068b5bdb671198c72072828b80bd5fe4c \ - --hash=sha256:dea567d1b0e8bc5764b9443858b673b734100c2871dc93163f58c46a97a83d28 \ - --hash=sha256:dec9b018df185f08483f294cae6ccac29e7a6e0678996587363dc352dc65c842 \ - --hash=sha256:e3ec3672626e1b9e55afd0df6d774ff0e953452886e06e0f1eb7eb0c832e8902 \ - --hash=sha256:e599b53fd95357d92304510fb7bda8523ed1f79ca98dce2f43c115950aa78801 \ - --hash=sha256:fa76fbb7596cc5839320000cdd5d0955313696d9511debab7ee7278fc8b5c84a \ - --hash=sha256:fff12c88a672ab9c9c1cf7b0c80e3ad9e2ebd9d828d955c126be4fd3e5578c9e +cryptography==43.0.1 \ + --hash=sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494 \ + --hash=sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806 \ + --hash=sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d \ + --hash=sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062 \ + --hash=sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2 \ + --hash=sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4 \ + --hash=sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1 \ + --hash=sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85 \ + --hash=sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84 \ + --hash=sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042 \ + --hash=sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d \ + --hash=sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962 \ + --hash=sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2 \ + --hash=sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa \ + --hash=sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d \ + --hash=sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365 \ + --hash=sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96 \ + --hash=sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47 \ + --hash=sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d \ + --hash=sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d \ + --hash=sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c \ + --hash=sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb \ + --hash=sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277 \ + --hash=sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172 \ + --hash=sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034 \ + --hash=sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a \ + --hash=sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289 # via # -r requirements.in # gcp-releasetool # secretstorage -distlib==0.3.8 \ - --hash=sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784 \ - --hash=sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64 +distlib==0.3.9 \ + --hash=sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 \ + --hash=sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403 # via virtualenv docutils==0.21.2 \ --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \ 
--hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 # via readme-renderer -filelock==3.15.4 \ - --hash=sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb \ - --hash=sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7 +filelock==3.16.1 \ + --hash=sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 \ + --hash=sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435 # via virtualenv gcp-docuploader==0.6.5 \ --hash=sha256:30221d4ac3e5a2b9c69aa52fdbef68cc3f27d0e6d0d90e220fc024584b8d2318 \ --hash=sha256:b7458ef93f605b9d46a4bf3a8dc1755dad1f31d030c8679edf304e343b347eea # via -r requirements.in -gcp-releasetool==2.0.1 \ - --hash=sha256:34314a910c08e8911d9c965bd44f8f2185c4f556e737d719c33a41f6a610de96 \ - --hash=sha256:b0d5863c6a070702b10883d37c4bdfd74bf930fe417f36c0c965d3b7c779ae62 +gcp-releasetool==2.1.1 \ + --hash=sha256:25639269f4eae510094f9dbed9894977e1966933211eb155a451deebc3fc0b30 \ + --hash=sha256:845f4ded3d9bfe8cc7fdaad789e83f4ea014affa77785259a7ddac4b243e099e # via -r requirements.in -google-api-core==2.19.1 \ - --hash=sha256:f12a9b8309b5e21d92483bbd47ce2c445861ec7d269ef6784ecc0ea8c1fa6125 \ - --hash=sha256:f4695f1e3650b316a795108a76a1c416e6afb036199d1c1f1f110916df479ffd +google-api-core==2.21.0 \ + --hash=sha256:4a152fd11a9f774ea606388d423b68aa7e6d6a0ffe4c8266f74979613ec09f81 \ + --hash=sha256:6869eacb2a37720380ba5898312af79a4d30b8bca1548fb4093e0697dc4bdf5d # via # google-cloud-core # google-cloud-storage -google-auth==2.31.0 \ - --hash=sha256:042c4702efa9f7d3c48d3a69341c209381b125faa6dbf3ebe56bc7e40ae05c23 \ - --hash=sha256:87805c36970047247c8afe614d4e3af8eceafc1ebba0c679fe75ddd1d575e871 +google-auth==2.35.0 \ + --hash=sha256:25df55f327ef021de8be50bad0dfd4a916ad0de96da86cd05661c9297723ad3f \ + --hash=sha256:f4c64ed4e01e8e8b646ef34c018f8bf3338df0c8e37d8b3bba40e7f574a3278a # via # gcp-releasetool # google-api-core @@ -172,97 +182,56 @@ google-cloud-core==2.4.1 \ --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 # via google-cloud-storage -google-cloud-storage==2.17.0 \ - --hash=sha256:49378abff54ef656b52dca5ef0f2eba9aa83dc2b2c72c78714b03a1a95fe9388 \ - --hash=sha256:5b393bc766b7a3bc6f5407b9e665b2450d36282614b7945e570b3480a456d1e1 +google-cloud-storage==2.18.2 \ + --hash=sha256:97a4d45c368b7d401ed48c4fdfe86e1e1cb96401c9e199e419d289e2c0370166 \ + --hash=sha256:aaf7acd70cdad9f274d29332673fcab98708d0e1f4dceb5a5356aaef06af4d99 # via gcp-docuploader -google-crc32c==1.5.0 \ - --hash=sha256:024894d9d3cfbc5943f8f230e23950cd4906b2fe004c72e29b209420a1e6b05a \ - --hash=sha256:02c65b9817512edc6a4ae7c7e987fea799d2e0ee40c53ec573a692bee24de876 \ - --hash=sha256:02ebb8bf46c13e36998aeaad1de9b48f4caf545e91d14041270d9dca767b780c \ - --hash=sha256:07eb3c611ce363c51a933bf6bd7f8e3878a51d124acfc89452a75120bc436289 \ - --hash=sha256:1034d91442ead5a95b5aaef90dbfaca8633b0247d1e41621d1e9f9db88c36298 \ - --hash=sha256:116a7c3c616dd14a3de8c64a965828b197e5f2d121fedd2f8c5585c547e87b02 \ - --hash=sha256:19e0a019d2c4dcc5e598cd4a4bc7b008546b0358bd322537c74ad47a5386884f \ - --hash=sha256:1c7abdac90433b09bad6c43a43af253e688c9cfc1c86d332aed13f9a7c7f65e2 \ - --hash=sha256:1e986b206dae4476f41bcec1faa057851f3889503a70e1bdb2378d406223994a \ - --hash=sha256:272d3892a1e1a2dbc39cc5cde96834c236d5327e2122d3aaa19f6614531bb6eb \ - --hash=sha256:278d2ed7c16cfc075c91378c4f47924c0625f5fc84b2d50d921b18b7975bd210 \ - 
--hash=sha256:2ad40e31093a4af319dadf503b2467ccdc8f67c72e4bcba97f8c10cb078207b5 \ - --hash=sha256:2e920d506ec85eb4ba50cd4228c2bec05642894d4c73c59b3a2fe20346bd00ee \ - --hash=sha256:3359fc442a743e870f4588fcf5dcbc1bf929df1fad8fb9905cd94e5edb02e84c \ - --hash=sha256:37933ec6e693e51a5b07505bd05de57eee12f3e8c32b07da7e73669398e6630a \ - --hash=sha256:398af5e3ba9cf768787eef45c803ff9614cc3e22a5b2f7d7ae116df8b11e3314 \ - --hash=sha256:3b747a674c20a67343cb61d43fdd9207ce5da6a99f629c6e2541aa0e89215bcd \ - --hash=sha256:461665ff58895f508e2866824a47bdee72497b091c730071f2b7575d5762ab65 \ - --hash=sha256:4c6fdd4fccbec90cc8a01fc00773fcd5fa28db683c116ee3cb35cd5da9ef6c37 \ - --hash=sha256:5829b792bf5822fd0a6f6eb34c5f81dd074f01d570ed7f36aa101d6fc7a0a6e4 \ - --hash=sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13 \ - --hash=sha256:5ae44e10a8e3407dbe138984f21e536583f2bba1be9491239f942c2464ac0894 \ - --hash=sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31 \ - --hash=sha256:64e52e2b3970bd891309c113b54cf0e4384762c934d5ae56e283f9a0afcd953e \ - --hash=sha256:66741ef4ee08ea0b2cc3c86916ab66b6aef03768525627fd6a1b34968b4e3709 \ - --hash=sha256:67b741654b851abafb7bc625b6d1cdd520a379074e64b6a128e3b688c3c04740 \ - --hash=sha256:6ac08d24c1f16bd2bf5eca8eaf8304812f44af5cfe5062006ec676e7e1d50afc \ - --hash=sha256:6f998db4e71b645350b9ac28a2167e6632c239963ca9da411523bb439c5c514d \ - --hash=sha256:72218785ce41b9cfd2fc1d6a017dc1ff7acfc4c17d01053265c41a2c0cc39b8c \ - --hash=sha256:74dea7751d98034887dbd821b7aae3e1d36eda111d6ca36c206c44478035709c \ - --hash=sha256:759ce4851a4bb15ecabae28f4d2e18983c244eddd767f560165563bf9aefbc8d \ - --hash=sha256:77e2fd3057c9d78e225fa0a2160f96b64a824de17840351b26825b0848022906 \ - --hash=sha256:7c074fece789b5034b9b1404a1f8208fc2d4c6ce9decdd16e8220c5a793e6f61 \ - --hash=sha256:7c42c70cd1d362284289c6273adda4c6af8039a8ae12dc451dcd61cdabb8ab57 \ - --hash=sha256:7f57f14606cd1dd0f0de396e1e53824c371e9544a822648cd76c034d209b559c \ - --hash=sha256:83c681c526a3439b5cf94f7420471705bbf96262f49a6fe546a6db5f687a3d4a \ - --hash=sha256:8485b340a6a9e76c62a7dce3c98e5f102c9219f4cfbf896a00cf48caf078d438 \ - --hash=sha256:84e6e8cd997930fc66d5bb4fde61e2b62ba19d62b7abd7a69920406f9ecca946 \ - --hash=sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7 \ - --hash=sha256:8b87e1a59c38f275c0e3676fc2ab6d59eccecfd460be267ac360cc31f7bcde96 \ - --hash=sha256:8f24ed114432de109aa9fd317278518a5af2d31ac2ea6b952b2f7782b43da091 \ - --hash=sha256:98cb4d057f285bd80d8778ebc4fde6b4d509ac3f331758fb1528b733215443ae \ - --hash=sha256:998679bf62b7fb599d2878aa3ed06b9ce688b8974893e7223c60db155f26bd8d \ - --hash=sha256:9ba053c5f50430a3fcfd36f75aff9caeba0440b2d076afdb79a318d6ca245f88 \ - --hash=sha256:9c99616c853bb585301df6de07ca2cadad344fd1ada6d62bb30aec05219c45d2 \ - --hash=sha256:a1fd716e7a01f8e717490fbe2e431d2905ab8aa598b9b12f8d10abebb36b04dd \ - --hash=sha256:a2355cba1f4ad8b6988a4ca3feed5bff33f6af2d7f134852cf279c2aebfde541 \ - --hash=sha256:b1f8133c9a275df5613a451e73f36c2aea4fe13c5c8997e22cf355ebd7bd0728 \ - --hash=sha256:b8667b48e7a7ef66afba2c81e1094ef526388d35b873966d8a9a447974ed9178 \ - --hash=sha256:ba1eb1843304b1e5537e1fca632fa894d6f6deca8d6389636ee5b4797affb968 \ - --hash=sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346 \ - --hash=sha256:c02ec1c5856179f171e032a31d6f8bf84e5a75c45c33b2e20a3de353b266ebd8 \ - --hash=sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93 \ - 
--hash=sha256:c6c777a480337ac14f38564ac88ae82d4cd238bf293f0a22295b66eb89ffced7 \ - --hash=sha256:cae0274952c079886567f3f4f685bcaf5708f0a23a5f5216fdab71f81a6c0273 \ - --hash=sha256:cd67cf24a553339d5062eff51013780a00d6f97a39ca062781d06b3a73b15462 \ - --hash=sha256:d3515f198eaa2f0ed49f8819d5732d70698c3fa37384146079b3799b97667a94 \ - --hash=sha256:d5280312b9af0976231f9e317c20e4a61cd2f9629b7bfea6a693d1878a264ebd \ - --hash=sha256:de06adc872bcd8c2a4e0dc51250e9e65ef2ca91be023b9d13ebd67c2ba552e1e \ - --hash=sha256:e1674e4307fa3024fc897ca774e9c7562c957af85df55efe2988ed9056dc4e57 \ - --hash=sha256:e2096eddb4e7c7bdae4bd69ad364e55e07b8316653234a56552d9c988bd2d61b \ - --hash=sha256:e560628513ed34759456a416bf86b54b2476c59144a9138165c9a1575801d0d9 \ - --hash=sha256:edfedb64740750e1a3b16152620220f51d58ff1b4abceb339ca92e934775c27a \ - --hash=sha256:f13cae8cc389a440def0c8c52057f37359014ccbc9dc1f0827936bcd367c6100 \ - --hash=sha256:f314013e7dcd5cf45ab1945d92e713eec788166262ae8deb2cfacd53def27325 \ - --hash=sha256:f583edb943cf2e09c60441b910d6a20b4d9d626c75a36c8fcac01a6c96c01183 \ - --hash=sha256:fd8536e902db7e365f49e7d9029283403974ccf29b13fc7028b97e2295b33556 \ - --hash=sha256:fe70e325aa68fa4b5edf7d1a4b6f691eb04bbccac0ace68e34820d283b5f80d4 +google-crc32c==1.6.0 \ + --hash=sha256:05e2d8c9a2f853ff116db9706b4a27350587f341eda835f46db3c0a8c8ce2f24 \ + --hash=sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d \ + --hash=sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e \ + --hash=sha256:35834855408429cecf495cac67ccbab802de269e948e27478b1e47dfb6465e57 \ + --hash=sha256:386122eeaaa76951a8196310432c5b0ef3b53590ef4c317ec7588ec554fec5d2 \ + --hash=sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8 \ + --hash=sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc \ + --hash=sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42 \ + --hash=sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f \ + --hash=sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa \ + --hash=sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b \ + --hash=sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc \ + --hash=sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760 \ + --hash=sha256:91ca8145b060679ec9176e6de4f89b07363d6805bd4760631ef254905503598d \ + --hash=sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7 \ + --hash=sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d \ + --hash=sha256:bb0966e1c50d0ef5bc743312cc730b533491d60585a9a08f897274e57c3f70e0 \ + --hash=sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3 \ + --hash=sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3 \ + --hash=sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00 \ + --hash=sha256:d2952396dc604544ea7476b33fe87faedc24d666fb0c2d5ac971a2b9576ab871 \ + --hash=sha256:d8797406499f28b5ef791f339594b0b5fdedf54e203b5066675c406ba69d705c \ + --hash=sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9 \ + --hash=sha256:e2806553238cd076f0a55bddab37a532b53580e699ed8e5606d0de1f856b5205 \ + --hash=sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc \ + --hash=sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d \ + --hash=sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4 # via # google-cloud-storage # 
google-resumable-media -google-resumable-media==2.7.1 \ - --hash=sha256:103ebc4ba331ab1bfdac0250f8033627a2cd7cde09e7ccff9181e31ba4315b2c \ - --hash=sha256:eae451a7b2e2cdbaaa0fd2eb00cc8a1ee5e95e16b55597359cbc3d27d7d90e33 +google-resumable-media==2.7.2 \ + --hash=sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa \ + --hash=sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0 # via google-cloud-storage -googleapis-common-protos==1.63.2 \ - --hash=sha256:27a2499c7e8aff199665b22741997e485eccc8645aa9176c7c988e6fae507945 \ - --hash=sha256:27c5abdffc4911f28101e635de1533fb4cfd2c37fbaa9174587c799fac90aa87 +googleapis-common-protos==1.65.0 \ + --hash=sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63 \ + --hash=sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0 # via google-api-core -idna==3.7 \ - --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ - --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 # via requests -importlib-metadata==8.0.0 \ - --hash=sha256:15584cf2b1bf449d98ff8a6ff1abef57bf20f3ac6454f431736cd3e660921b2f \ - --hash=sha256:188bd24e4c346d3f0a933f275c2fec67050326a856b9a359881d7c2a697e8812 +importlib-metadata==8.5.0 \ + --hash=sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b \ + --hash=sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7 # via # -r requirements.in # keyring @@ -271,13 +240,13 @@ jaraco-classes==3.4.0 \ --hash=sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd \ --hash=sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790 # via keyring -jaraco-context==5.3.0 \ - --hash=sha256:3e16388f7da43d384a1a7cd3452e72e14732ac9fe459678773a3608a812bf266 \ - --hash=sha256:c2f67165ce1f9be20f32f650f25d8edfc1646a8aeee48ae06fb35f90763576d2 +jaraco-context==6.0.1 \ + --hash=sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3 \ + --hash=sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4 # via keyring -jaraco-functools==4.0.1 \ - --hash=sha256:3b24ccb921d6b593bdceb56ce14799204f473976e2a9d4b15b04d0f2c2326664 \ - --hash=sha256:d33fa765374c0611b52f8b3a795f8900869aa88c84769d4d1746cd68fb28c3e8 +jaraco-functools==4.1.0 \ + --hash=sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d \ + --hash=sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649 # via keyring jeepney==0.8.0 \ --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \ @@ -289,9 +258,9 @@ jinja2==3.1.4 \ --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \ --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d # via gcp-releasetool -keyring==25.2.1 \ - --hash=sha256:2458681cdefc0dbc0b7eb6cf75d0b98e59f9ad9b2d4edd319d18f68bdca95e50 \ - --hash=sha256:daaffd42dbda25ddafb1ad5fec4024e5bbcfe424597ca1ca452b299861e49f1b +keyring==25.4.1 \ + --hash=sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf \ + --hash=sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b # via # gcp-releasetool # twine @@ -299,75 +268,76 @@ markdown-it-py==3.0.0 \ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ 
--hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb # via rich -markupsafe==2.1.5 \ - --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ - --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ - --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ - --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ - --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ - --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ - --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ - --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ - --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \ - --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ - --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ - --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ - --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ - --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ - --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ - --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ - --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ - --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ - --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ - --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ - --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ - --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ - --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ - --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ - --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ - --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ - --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ - --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ - --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ - --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ - --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ - --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ - --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ - --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ - --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ - --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ - --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ - --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ - --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ - --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ - --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ - 
--hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ - --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ - --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ - --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ - --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ - --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ - --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ - --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ - --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ - --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ - --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ - --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ - --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ - --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ - --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ - --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ - --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ - --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ - --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 +markupsafe==3.0.1 \ + --hash=sha256:0778de17cff1acaeccc3ff30cd99a3fd5c50fc58ad3d6c0e0c4c58092b859396 \ + --hash=sha256:0f84af7e813784feb4d5e4ff7db633aba6c8ca64a833f61d8e4eade234ef0c38 \ + --hash=sha256:17b2aea42a7280db02ac644db1d634ad47dcc96faf38ab304fe26ba2680d359a \ + --hash=sha256:242d6860f1fd9191aef5fae22b51c5c19767f93fb9ead4d21924e0bcb17619d8 \ + --hash=sha256:244dbe463d5fb6d7ce161301a03a6fe744dac9072328ba9fc82289238582697b \ + --hash=sha256:26627785a54a947f6d7336ce5963569b5d75614619e75193bdb4e06e21d447ad \ + --hash=sha256:2a4b34a8d14649315c4bc26bbfa352663eb51d146e35eef231dd739d54a5430a \ + --hash=sha256:2ae99f31f47d849758a687102afdd05bd3d3ff7dbab0a8f1587981b58a76152a \ + --hash=sha256:312387403cd40699ab91d50735ea7a507b788091c416dd007eac54434aee51da \ + --hash=sha256:3341c043c37d78cc5ae6e3e305e988532b072329639007fd408a476642a89fd6 \ + --hash=sha256:33d1c36b90e570ba7785dacd1faaf091203d9942bc036118fab8110a401eb1a8 \ + --hash=sha256:3e683ee4f5d0fa2dde4db77ed8dd8a876686e3fc417655c2ece9a90576905344 \ + --hash=sha256:3ffb4a8e7d46ed96ae48805746755fadd0909fea2306f93d5d8233ba23dda12a \ + --hash=sha256:40621d60d0e58aa573b68ac5e2d6b20d44392878e0bfc159012a5787c4e35bc8 \ + --hash=sha256:40f1e10d51c92859765522cbd79c5c8989f40f0419614bcdc5015e7b6bf97fc5 \ + --hash=sha256:45d42d132cff577c92bfba536aefcfea7e26efb975bd455db4e6602f5c9f45e7 \ + --hash=sha256:48488d999ed50ba8d38c581d67e496f955821dc183883550a6fbc7f1aefdc170 \ + --hash=sha256:4935dd7883f1d50e2ffecca0aa33dc1946a94c8f3fdafb8df5c330e48f71b132 \ + --hash=sha256:4c2d64fdba74ad16138300815cfdc6ab2f4647e23ced81f59e940d7d4a1469d9 \ + --hash=sha256:4c8817557d0de9349109acb38b9dd570b03cc5014e8aabf1cbddc6e81005becd \ + --hash=sha256:4ffaaac913c3f7345579db4f33b0020db693f302ca5137f106060316761beea9 \ + --hash=sha256:5a4cb365cb49b750bdb60b846b0c0bc49ed62e59a76635095a179d440540c346 \ + --hash=sha256:62fada2c942702ef8952754abfc1a9f7658a4d5460fabe95ac7ec2cbe0d02abc \ + 
--hash=sha256:67c519635a4f64e495c50e3107d9b4075aec33634272b5db1cde839e07367589 \ + --hash=sha256:6a54c43d3ec4cf2a39f4387ad044221c66a376e58c0d0e971d47c475ba79c6b5 \ + --hash=sha256:7044312a928a66a4c2a22644147bc61a199c1709712069a344a3fb5cfcf16915 \ + --hash=sha256:730d86af59e0e43ce277bb83970530dd223bf7f2a838e086b50affa6ec5f9295 \ + --hash=sha256:800100d45176652ded796134277ecb13640c1a537cad3b8b53da45aa96330453 \ + --hash=sha256:80fcbf3add8790caddfab6764bde258b5d09aefbe9169c183f88a7410f0f6dea \ + --hash=sha256:82b5dba6eb1bcc29cc305a18a3c5365d2af06ee71b123216416f7e20d2a84e5b \ + --hash=sha256:852dc840f6d7c985603e60b5deaae1d89c56cb038b577f6b5b8c808c97580f1d \ + --hash=sha256:8ad4ad1429cd4f315f32ef263c1342166695fad76c100c5d979c45d5570ed58b \ + --hash=sha256:8ae369e84466aa70f3154ee23c1451fda10a8ee1b63923ce76667e3077f2b0c4 \ + --hash=sha256:93e8248d650e7e9d49e8251f883eed60ecbc0e8ffd6349e18550925e31bd029b \ + --hash=sha256:973a371a55ce9ed333a3a0f8e0bcfae9e0d637711534bcb11e130af2ab9334e7 \ + --hash=sha256:9ba25a71ebf05b9bb0e2ae99f8bc08a07ee8e98c612175087112656ca0f5c8bf \ + --hash=sha256:a10860e00ded1dd0a65b83e717af28845bb7bd16d8ace40fe5531491de76b79f \ + --hash=sha256:a4792d3b3a6dfafefdf8e937f14906a51bd27025a36f4b188728a73382231d91 \ + --hash=sha256:a7420ceda262dbb4b8d839a4ec63d61c261e4e77677ed7c66c99f4e7cb5030dd \ + --hash=sha256:ad91738f14eb8da0ff82f2acd0098b6257621410dcbd4df20aaa5b4233d75a50 \ + --hash=sha256:b6a387d61fe41cdf7ea95b38e9af11cfb1a63499af2759444b99185c4ab33f5b \ + --hash=sha256:b954093679d5750495725ea6f88409946d69cfb25ea7b4c846eef5044194f583 \ + --hash=sha256:bbde71a705f8e9e4c3e9e33db69341d040c827c7afa6789b14c6e16776074f5a \ + --hash=sha256:beeebf760a9c1f4c07ef6a53465e8cfa776ea6a2021eda0d0417ec41043fe984 \ + --hash=sha256:c91b394f7601438ff79a4b93d16be92f216adb57d813a78be4446fe0f6bc2d8c \ + --hash=sha256:c97ff7fedf56d86bae92fa0a646ce1a0ec7509a7578e1ed238731ba13aabcd1c \ + --hash=sha256:cb53e2a99df28eee3b5f4fea166020d3ef9116fdc5764bc5117486e6d1211b25 \ + --hash=sha256:cbf445eb5628981a80f54087f9acdbf84f9b7d862756110d172993b9a5ae81aa \ + --hash=sha256:d06b24c686a34c86c8c1fba923181eae6b10565e4d80bdd7bc1c8e2f11247aa4 \ + --hash=sha256:d98e66a24497637dd31ccab090b34392dddb1f2f811c4b4cd80c230205c074a3 \ + --hash=sha256:db15ce28e1e127a0013dfb8ac243a8e392db8c61eae113337536edb28bdc1f97 \ + --hash=sha256:db842712984e91707437461930e6011e60b39136c7331e971952bb30465bc1a1 \ + --hash=sha256:e24bfe89c6ac4c31792793ad9f861b8f6dc4546ac6dc8f1c9083c7c4f2b335cd \ + --hash=sha256:e81c52638315ff4ac1b533d427f50bc0afc746deb949210bc85f05d4f15fd772 \ + --hash=sha256:e9393357f19954248b00bed7c56f29a25c930593a77630c719653d51e7669c2a \ + --hash=sha256:ee3941769bd2522fe39222206f6dd97ae83c442a94c90f2b7a25d847d40f4729 \ + --hash=sha256:f31ae06f1328595d762c9a2bf29dafd8621c7d3adc130cbb46278079758779ca \ + --hash=sha256:f94190df587738280d544971500b9cafc9b950d32efcb1fba9ac10d84e6aa4e6 \ + --hash=sha256:fa7d686ed9883f3d664d39d5a8e74d3c5f63e603c2e3ff0abcba23eac6542635 \ + --hash=sha256:fb532dd9900381d2e8f48172ddc5a59db4c445a11b9fab40b3b786da40d3b56b \ + --hash=sha256:fe32482b37b4b00c7a52a07211b479653b7fe4f22b2e481b9a9b099d8a430f2f # via jinja2 mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via markdown-it-py -more-itertools==10.3.0 \ - --hash=sha256:e5d93ef411224fbcef366a6e8ddc4c5781bc6359d43412a65dd5964e46111463 \ - --hash=sha256:ea6a02e24a9161e51faad17a8782b92a0df82c12c1c8886fec7f0c3fa1a1b320 
+more-itertools==10.5.0 \ + --hash=sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef \ + --hash=sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6 # via # jaraco-classes # jaraco-functools @@ -389,9 +359,9 @@ nh3==0.2.18 \ --hash=sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307 \ --hash=sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe # via readme-renderer -nox==2024.4.15 \ - --hash=sha256:6492236efa15a460ecb98e7b67562a28b70da006ab0be164e8821177577c0565 \ - --hash=sha256:ecf6700199cdfa9e5ea0a41ff5e6ef4641d09508eda6edb89d9987864115817f +nox==2024.10.9 \ + --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \ + --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95 # via -r requirements.in packaging==24.1 \ --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ @@ -403,41 +373,41 @@ pkginfo==1.10.0 \ --hash=sha256:5df73835398d10db79f8eecd5cd86b1f6d29317589ea70796994d49399af6297 \ --hash=sha256:889a6da2ed7ffc58ab5b900d888ddce90bce912f2d2de1dc1c26f4cb9fe65097 # via twine -platformdirs==4.2.2 \ - --hash=sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee \ - --hash=sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3 +platformdirs==4.3.6 \ + --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \ + --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb # via virtualenv proto-plus==1.24.0 \ --hash=sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445 \ --hash=sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12 # via google-api-core -protobuf==5.27.2 \ - --hash=sha256:0e341109c609749d501986b835f667c6e1e24531096cff9d34ae411595e26505 \ - --hash=sha256:176c12b1f1c880bf7a76d9f7c75822b6a2bc3db2d28baa4d300e8ce4cde7409b \ - --hash=sha256:354d84fac2b0d76062e9b3221f4abbbacdfd2a4d8af36bab0474f3a0bb30ab38 \ - --hash=sha256:4fadd8d83e1992eed0248bc50a4a6361dc31bcccc84388c54c86e530b7f58863 \ - --hash=sha256:54330f07e4949d09614707c48b06d1a22f8ffb5763c159efd5c0928326a91470 \ - --hash=sha256:610e700f02469c4a997e58e328cac6f305f649826853813177e6290416e846c6 \ - --hash=sha256:7fc3add9e6003e026da5fc9e59b131b8f22b428b991ccd53e2af8071687b4fce \ - --hash=sha256:9e8f199bf7f97bd7ecebffcae45ebf9527603549b2b562df0fbc6d4d688f14ca \ - --hash=sha256:a109916aaac42bff84702fb5187f3edadbc7c97fc2c99c5ff81dd15dcce0d1e5 \ - --hash=sha256:b848dbe1d57ed7c191dfc4ea64b8b004a3f9ece4bf4d0d80a367b76df20bf36e \ - --hash=sha256:f3ecdef226b9af856075f28227ff2c90ce3a594d092c39bee5513573f25e2714 +protobuf==5.28.2 \ + --hash=sha256:2c69461a7fcc8e24be697624c09a839976d82ae75062b11a0972e41fd2cd9132 \ + --hash=sha256:35cfcb15f213449af7ff6198d6eb5f739c37d7e4f1c09b5d0641babf2cc0c68f \ + --hash=sha256:52235802093bd8a2811abbe8bf0ab9c5f54cca0a751fdd3f6ac2a21438bffece \ + --hash=sha256:59379674ff119717404f7454647913787034f03fe7049cbef1d74a97bb4593f0 \ + --hash=sha256:5e8a95246d581eef20471b5d5ba010d55f66740942b95ba9b872d918c459452f \ + --hash=sha256:87317e9bcda04a32f2ee82089a204d3a2f0d3c8aeed16568c7daf4756e4f1fe0 \ + --hash=sha256:8ddc60bf374785fb7cb12510b267f59067fa10087325b8e1855b898a0d81d276 \ + --hash=sha256:a8b9403fc70764b08d2f593ce44f1d2920c5077bf7d311fefec999f8c40f78b7 \ + --hash=sha256:c0ea0123dac3399a2eeb1a1443d82b7afc9ff40241433296769f7da42d142ec3 \ + --hash=sha256:ca53faf29896c526863366a52a8f4d88e69cd04ec9571ed6082fa117fac3ab36 \ + 
--hash=sha256:eeea10f3dc0ac7e6b4933d32db20662902b4ab81bf28df12218aa389e9c2102d # via # gcp-docuploader # gcp-releasetool # google-api-core # googleapis-common-protos # proto-plus -pyasn1==0.6.0 \ - --hash=sha256:3a35ab2c4b5ef98e17dfdec8ab074046fbda76e281c5a706ccd82328cfc8f64c \ - --hash=sha256:cca4bb0f2df5504f02f6f8a775b6e416ff9b0b3b16f7ee80b5a3153d9b804473 +pyasn1==0.6.1 \ + --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \ + --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034 # via # pyasn1-modules # rsa -pyasn1-modules==0.4.0 \ - --hash=sha256:831dbcea1b177b28c9baddf4c6d1013c24c3accd14a1873fffaa6a2e905f17b6 \ - --hash=sha256:be04f15b66c206eed667e0bb5ab27e2b1855ea54a842e5037738099e8ca4ae0b +pyasn1-modules==0.4.1 \ + --hash=sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd \ + --hash=sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c # via google-auth pycparser==2.22 \ --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \ @@ -449,9 +419,9 @@ pygments==2.18.0 \ # via # readme-renderer # rich -pyjwt==2.8.0 \ - --hash=sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de \ - --hash=sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320 +pyjwt==2.9.0 \ + --hash=sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850 \ + --hash=sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c # via gcp-releasetool pyperclip==1.9.0 \ --hash=sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310 @@ -481,9 +451,9 @@ rfc3986==2.0.0 \ --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ --hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c # via twine -rich==13.7.1 \ - --hash=sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222 \ - --hash=sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432 +rich==13.9.2 \ + --hash=sha256:51a2c62057461aaf7152b4d611168f93a9fc73068f8ded2790f29fe2b5366d0c \ + --hash=sha256:8c82a3d3f8dcfe9e734771313e606b39d8247bb6b826e196f4914b333b743cf1 # via twine rsa==4.9 \ --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ @@ -499,9 +469,9 @@ six==1.16.0 \ # via # gcp-docuploader # python-dateutil -tomli==2.0.1 \ - --hash=sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc \ - --hash=sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f +tomli==2.0.2 \ + --hash=sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38 \ + --hash=sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed # via nox twine==5.1.1 \ --hash=sha256:215dbe7b4b94c2c50a7315c0275d2258399280fbb7d04182c7e55e24b5f93997 \ @@ -510,28 +480,30 @@ twine==5.1.1 \ typing-extensions==4.12.2 \ --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 - # via -r requirements.in -urllib3==2.2.2 \ - --hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \ - --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168 + # via + # -r requirements.in + # rich +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 # via # requests # twine -virtualenv==20.26.3 \ - 
--hash=sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a \ - --hash=sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589 +virtualenv==20.26.6 \ + --hash=sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48 \ + --hash=sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2 # via nox -wheel==0.43.0 \ - --hash=sha256:465ef92c69fa5c5da2d1cf8ac40559a8c940886afcef87dcf14b9470862f1d85 \ - --hash=sha256:55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81 +wheel==0.44.0 \ + --hash=sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f \ + --hash=sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49 # via -r requirements.in -zipp==3.19.2 \ - --hash=sha256:bf1dcf6450f873a13e952a29504887c89e6de7506209e5b1bcc3460135d4de19 \ - --hash=sha256:f091755f667055f2d02b32c53771a7a6c8b47e1fdbc4b72a8b9072b3eef8015c +zipp==3.20.2 \ + --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -setuptools==70.2.0 \ - --hash=sha256:b8b8060bb426838fbe942479c90296ce976249451118ef566a5a0b7d8b78fb05 \ - --hash=sha256:bd63e505105011b25c3c11f753f7e3b8465ea739efddaccef8f0efac2137bac1 +setuptools==75.1.0 \ + --hash=sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2 \ + --hash=sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538 # via -r requirements.in diff --git a/.kokoro/samples/python3.13/common.cfg b/.kokoro/samples/python3.13/common.cfg new file mode 100644 index 0000000000..6a5d9a2080 --- /dev/null +++ b/.kokoro/samples/python3.13/common.cfg @@ -0,0 +1,40 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +# Specify which tests to run +env_vars: { + key: "RUN_TESTS_SESSION" + value: "py-3.13" +} + +# Declare build specific Cloud project. +env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-313" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" +} + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" +} + +# Download secrets for samples +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + +# Use the trampoline script to run in docker. 
+build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" diff --git a/.kokoro/samples/python3.13/continuous.cfg b/.kokoro/samples/python3.13/continuous.cfg new file mode 100644 index 0000000000..a1c8d9759c --- /dev/null +++ b/.kokoro/samples/python3.13/continuous.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.13/periodic-head.cfg b/.kokoro/samples/python3.13/periodic-head.cfg new file mode 100644 index 0000000000..123a35fbd3 --- /dev/null +++ b/.kokoro/samples/python3.13/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.13/periodic.cfg b/.kokoro/samples/python3.13/periodic.cfg new file mode 100644 index 0000000000..71cd1e597e --- /dev/null +++ b/.kokoro/samples/python3.13/periodic.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "False" +} diff --git a/.kokoro/samples/python3.13/presubmit.cfg b/.kokoro/samples/python3.13/presubmit.cfg new file mode 100644 index 0000000000..a1c8d9759c --- /dev/null +++ b/.kokoro/samples/python3.13/presubmit.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 55e295f06a..00f942f128 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,51 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.24.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.23.0...v1.24.0) (2024-10-24) + + +### Features + +* Support series items method ([#1089](https://github.com/googleapis/python-bigquery-dataframes/issues/1089)) ([245a89c](https://github.com/googleapis/python-bigquery-dataframes/commit/245a89c36544faf2bcecb5735abbc00c0b4dd687)) + + +### Documentation + +* Update docstrings of DataFrame and related files ([#1092](https://github.com/googleapis/python-bigquery-dataframes/issues/1092)) ([15e9fd5](https://github.com/googleapis/python-bigquery-dataframes/commit/15e9fd547a01572cbda3d21de04d5548c7a4a82c)) + +## [1.23.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.22.0...v1.23.0) (2024-10-23) + + +### Features + +* Add `bigframes.bigquery.create_vector_index` to assist in creating vector index on `ARRAY` columns ([#1024](https://github.com/googleapis/python-bigquery-dataframes/issues/1024)) ([863d694](https://github.com/googleapis/python-bigquery-dataframes/commit/863d6942eaf0cc435c3b76dc5d579c68fd478aa4)) +* Add gemini-1.5-pro-002 and gemini-1.5-flash-002 to known Gemini model list. ([#1105](https://github.com/googleapis/python-bigquery-dataframes/issues/1105)) ([7094c85](https://github.com/googleapis/python-bigquery-dataframes/commit/7094c85945efeb57067640404f7b98969401191b)) +* Add support for pandas series & data frames as inputs for ml models. 
([#1088](https://github.com/googleapis/python-bigquery-dataframes/issues/1088)) ([30c8883](https://github.com/googleapis/python-bigquery-dataframes/commit/30c8883ff19db2c223d84c099c7b822467e9eb9a)) +* Cleanup temp resources with session deletion ([#1068](https://github.com/googleapis/python-bigquery-dataframes/issues/1068)) ([1d5373d](https://github.com/googleapis/python-bigquery-dataframes/commit/1d5373dd531c95b4a6a4132ef9b0ead0ecab14b4)) +* Show possible correct key(s) in `.__getitem__` KeyError message ([#1097](https://github.com/googleapis/python-bigquery-dataframes/issues/1097)) ([32fab96](https://github.com/googleapis/python-bigquery-dataframes/commit/32fab9626b9278e20c70c2ada8702e28e167a539)) +* Support uploading local geo data ([#1036](https://github.com/googleapis/python-bigquery-dataframes/issues/1036)) ([51cdd33](https://github.com/googleapis/python-bigquery-dataframes/commit/51cdd33e9f8377b3b992e0392eeb212aed499e3b)) + + +### Bug Fixes + +* Escape ids more consistently in ml module ([#1074](https://github.com/googleapis/python-bigquery-dataframes/issues/1074)) ([103e998](https://github.com/googleapis/python-bigquery-dataframes/commit/103e99823d442a36b2aaa5113950b988f6d3ba1e)) +* Model.fit metric not collected issue. ([#1085](https://github.com/googleapis/python-bigquery-dataframes/issues/1085)) ([06cec00](https://github.com/googleapis/python-bigquery-dataframes/commit/06cec00c51ba4b8df591e0988379db75b20c450b)) +* Remove index requirement from some dataframe APIs ([#1073](https://github.com/googleapis/python-bigquery-dataframes/issues/1073)) ([2d16f6d](https://github.com/googleapis/python-bigquery-dataframes/commit/2d16f6d1e9519e228533a67084000568a61c086e)) +* Update session metrics in `read_gbq_query` ([#1084](https://github.com/googleapis/python-bigquery-dataframes/issues/1084)) ([dced460](https://github.com/googleapis/python-bigquery-dataframes/commit/dced46070ee4212b5585a1eb53ae341dc0bf63ba)) + + +### Performance Improvements + +* Speed up tree transforms during sql compile ([#1071](https://github.com/googleapis/python-bigquery-dataframes/issues/1071)) ([d73fe9d](https://github.com/googleapis/python-bigquery-dataframes/commit/d73fe9d5fd2907aeaaa892a329221c10bb390da0)) +* Utilize ORDER BY LIMIT over ROW_NUMBER where possible ([#1077](https://github.com/googleapis/python-bigquery-dataframes/issues/1077)) ([7003d1a](https://github.com/googleapis/python-bigquery-dataframes/commit/7003d1ae6fddd535f6c206081e85f82bb6006f17)) + + +### Documentation + +* Add ml tutorial for Evaluate the model ([#1038](https://github.com/googleapis/python-bigquery-dataframes/issues/1038)) ([a120bae](https://github.com/googleapis/python-bigquery-dataframes/commit/a120bae2a8039d6115369b1f4a9047d4f0586120)) +* Show best practice of closing the session to cleanup resources in sample notebooks ([#1095](https://github.com/googleapis/python-bigquery-dataframes/issues/1095)) ([62a88e8](https://github.com/googleapis/python-bigquery-dataframes/commit/62a88e87f55f9cc109aa38f4b7ac10dd45ca41fd)) +* Update docstrings of Session and related files ([#1087](https://github.com/googleapis/python-bigquery-dataframes/issues/1087)) ([bf93e80](https://github.com/googleapis/python-bigquery-dataframes/commit/bf93e808daad2454e5c1aa933e0d2164d63084e7)) + ## [1.22.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.21.0...v1.22.0) (2024-10-09) diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 75f91b28d3..db860e6b1d 100644 --- a/bigframes/_config/__init__.py +++ 
b/bigframes/_config/__init__.py @@ -107,7 +107,7 @@ def sampling(self) -> sampling_options.SamplingOptions: The data can be downloaded into memory explicitly (e.g., to_pandas, to_numpy, values) or implicitly (e.g., - matplotlib plotting). This option can be overriden by + matplotlib plotting). This option can be overridden by parameters in specific functions. Returns: diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 28a818e709..0b2d2d5aeb 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -12,489 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. - """This module integrates BigQuery built-in functions for use with DataFrame objects, such as array functions: https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions. """ - -from __future__ import annotations - -import typing -from typing import Literal, Optional, Union - -import bigframes_vendored.constants as constants - -import bigframes.core.groupby as groupby -import bigframes.core.sql -import bigframes.ml.utils as utils -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops -import bigframes.series - -if typing.TYPE_CHECKING: - import bigframes.dataframe as dataframe - import bigframes.series as series - - -# Array functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions - - -def array_length(series: series.Series) -> series.Series: - """Compute the length of each array element in the Series. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) - >>> bbq.array_length(s) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - You can also apply this function directly to Series. - - >>> s.apply(bbq.array_length, by_row=False) - 0 4 - 1 0 - 2 2 - dtype: Int64 - - Args: - series (bigframes.series.Series): A Series with array columns. - - Returns: - bigframes.series.Series: A Series of integer values indicating - the length of each element in the Series. - - """ - return series._apply_unary_op(ops.len_op) - - -def array_agg( - obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, -) -> series.Series | dataframe.DataFrame: - """Group data and create arrays from selected columns, omitting NULLs to avoid - BigQuery errors (NULLs not allowed in arrays). - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - For a SeriesGroupBy object: - - >>> lst = ['a', 'a', 'b', 'b', 'a'] - >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) - >>> bbq.array_agg(s.groupby(level=0)) - a [1. 2.] - b [3. 4.] - dtype: list[pyarrow] - - For a DataFrameGroupBy object: - - >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) - >>> bbq.array_agg(df.groupby(by=["b"])) - a c - b - 1.0 [2] [3] - 2.0 [1 1] [3 2] - - [2 rows x 2 columns] - - Args: - obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): - A GroupBy object to be applied the function. - - Returns: - bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or - DataFrame containing aggregated array columns, and indexed by the - original group columns. 
- """ - if isinstance(obj, groupby.SeriesGroupBy): - return obj._aggregate(agg_ops.ArrayAggOp()) - elif isinstance(obj, groupby.DataFrameGroupBy): - return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) - else: - raise ValueError( - f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" - ) - - -def array_to_string(series: series.Series, delimiter: str) -> series.Series: - """Converts array elements within a Series into delimited strings. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) - >>> bbq.array_to_string(s, delimiter=", ") - 0 H, i, ! - 1 Hello, World - 2 - 3 - 4 Hi - dtype: string - - Args: - series (bigframes.series.Series): A Series containing arrays. - delimiter (str): The string used to separate array elements. - - Returns: - bigframes.series.Series: A Series containing delimited strings. - - """ - return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) - - -# JSON functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions - - -def json_set( - series: series.Series, - json_path_value_pairs: typing.Sequence[typing.Tuple[str, typing.Any]], -) -> series.Series: - """Produces a new JSON value within a Series by inserting or replacing values at - specified paths. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import numpy as np - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] - >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) - 0 {"a":100,"b":"hi"} - Name: data, dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path_value_pairs (Sequence[Tuple[str, typing.Any]]): - Pairs of JSON path and the new value to insert/replace. - - Returns: - bigframes.series.Series: A new Series with the transformed JSON data. - - """ - # SQLGlot parser does not support the "create_if_missing => true" syntax, so - # create_if_missing is not currently implemented. - - for json_path_value_pair in json_path_value_pairs: - if len(json_path_value_pair) != 2: - raise ValueError( - "Incorrect format: Expected (, ), but found: " - + f"{json_path_value_pair}" - ) - - json_path, json_value = json_path_value_pair - series = series._apply_binary_op( - json_value, ops.JSONSet(json_path=json_path), alignment="left" - ) - return series - - -def json_extract( - series: series.Series, - json_path: str, -) -> series.Series: - """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` - value. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) - >>> bbq.json_extract(s, json_path="$.class") - 0 {"students":[{"id":5},{"id":12}]} - dtype: string - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. 
- - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. - """ - return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) - - -def json_extract_array( - series: series.Series, - json_path: str = "$", -) -> series.Series: - """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` - values. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) - >>> bbq.json_extract_array(s) - 0 ['1' '2' '3'] - 1 ['4' '5'] - dtype: list[pyarrow] - - Args: - series (bigframes.series.Series): - The Series containing JSON data (as native JSON objects or JSON-formatted strings). - json_path (str): - The JSON path identifying the data that you want to obtain from the input. - - Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. - """ - return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) - - -# Approximate aggrgate functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions - - -def approx_top_count( - series: series.Series, - number: int, -) -> series.Series: - """Returns the approximate top elements of `expression` as an array of STRUCTs. - The number parameter specifies the number of elements returned. - - Each `STRUCT` contains two fields. The first field (named `value`) contains an input - value. The second field (named `count`) contains an `INT64` specifying the number - of times the value was returned. - - Returns `NULL` if there are zero input rows. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) - >>> bbq.approx_top_count(s, number=2) - [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] - - Args: - series (bigframes.series.Series): - The Series with any data type that the `GROUP BY` clause supports. - number (int): - An integer specifying the number of times the value was returned. - - Returns: - bigframes.series.Series: A new Series with the result data. 
- """ - if number < 1: - raise ValueError("The number of approx_top_count must be at least 1") - return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) - - -def struct(value: dataframe.DataFrame) -> series.Series: - """Takes a DataFrame and converts it into a Series of structs with each - struct entry corresponding to a DataFrame row and each struct field - corresponding to a DataFrame column - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> import bigframes.series as series - >>> bpd.options.display.progress_bar = None - - >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) - >>> df = srs.struct.explode() - >>> bbq.struct(df) - 0 {'project': 'pandas', 'version': 1} - 1 {'project': 'numpy', 'version': 2} - dtype: struct[pyarrow] - - Args: - value (bigframes.dataframe.DataFrame): - The DataFrame to be converted to a Series of structs - - Returns: - bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame - """ - block = value._block - block, result_id = block.apply_nary_op( - block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) - ) - block = block.select_column(result_id) - return bigframes.series.Series(block) - - -# Search functions defined from -# https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions - - -def vector_search( - base_table: str, - column_to_search: str, - query: Union[dataframe.DataFrame, series.Series], - *, - query_column_to_search: Optional[str] = None, - top_k: Optional[int] = 10, - distance_type: Literal["euclidean", "cosine"] = "euclidean", - fraction_lists_to_search: Optional[float] = None, - use_brute_force: bool = False, -) -> dataframe.DataFrame: - """ - Conduct vector search which searches embeddings to find semantically similar entities. - - **Examples:** - - - >>> import bigframes.pandas as bpd - >>> import bigframes.bigquery as bbq - >>> bpd.options.display.progress_bar = None - - DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column - is used as the search query: - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - query_id embedding id my_embedding distance - 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 - 0 dog [1. 2.] 1 [1. 2.] 0.0 - 0 dog [1. 2.] 4 [1. 3.2] 1.2 - 1 cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 5 columns] - - Series embeddings for which to find nearest neighbors: - - >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], - ... index=["dog", "cat"], - ... name="embedding") - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... top_k=2) - embedding id my_embedding distance - dog [1. 2.] 1 [1. 2.] 0.0 - cat [3. 5.2] 5 [5. 5.4] 2.009975 - dog [1. 2.] 4 [1. 3.2] 1.2 - cat [3. 5.2] 2 [2. 4.] 1.56205 - - [4 rows x 4 columns] - - You can specify the name of the column in the query DataFrame embeddings and distance type. - If you specify query_column_to_search_value, it will use the provided column which contains - the embeddings for which to find nearest neighbors. Otherwiese, it uses the column_to_search value. - - >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], - ... 
"embedding": [[1.0, 2.0], [3.0, 5.2]], - ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) - >>> bbq.vector_search( - ... base_table="bigframes-dev.bigframes_tests_sys.base_table", - ... column_to_search="my_embedding", - ... query=search_query, - ... distance_type="cosine", - ... query_column_to_search="another_embedding", - ... top_k=2) - query_id embedding another_embedding id my_embedding distance - 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 - 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 - 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 - - [4 rows x 6 columns] - - Args: - base_table (str): - The table to search for nearest neighbor embeddings. - column_to_search (str): - The name of the base table column to search for nearest neighbor embeddings. - The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. - query (bigframes.dataframe.DataFrame | bigframes.dataframe.Series): - A Series or DataFrame that provides the embeddings for which to find nearest neighbors. - query_column_to_search (str): - Specifies the name of the column in the query that contains the embeddings for which to - find nearest neighbors. The column must have a type of ``ARRAY``. All elements in - the array must be non-NULL and all values in the column must have the same array dimensions - as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. - top_k (int, default 10): - Sepecifies the number of nearest neighbors to return. Default to 10. - distance_type (str, defalt "euclidean"): - Specifies the type of metric to use to compute the distance between two vectors. - Possible values are "euclidean" and "cosine". Default to "euclidean". - fraction_lists_to_search (float, range in [0.0, 1.0]): - Specifies the percentage of lists to search. Specifying a higher percentage leads to - higher recall and slower performance, and the converse is true when specifying a lower - percentage. It is only used when a vector index is also used. You can only specify - ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. - use_brute_force (bool, default False): - Determines whether to use brute force search by skipping the vector index if one is available. - Default to False. - - Returns: - bigframes.dataframe.DataFrame: A DataFrame containing vector search result. - """ - if not fraction_lists_to_search and use_brute_force is True: - raise ValueError( - "You can't specify fraction_lists_to_search when use_brute_force is set to True." - ) - if ( - isinstance(query, bigframes.series.Series) - and query_column_to_search is not None - ): - raise ValueError( - "You can't specify query_column_to_search when query is a Series." - ) - # TODO(ashleyxu): Support options in vector search. b/344019989 - if fraction_lists_to_search is not None or use_brute_force is True: - raise NotImplementedError( - f"fraction_lists_to_search and use_brute_force is not supported. 
{constants.FEEDBACK_LINK}" - ) - options = { - "base_table": base_table, - "column_to_search": column_to_search, - "query_column_to_search": query_column_to_search, - "distance_type": distance_type, - "top_k": top_k, - "fraction_lists_to_search": fraction_lists_to_search, - "use_brute_force": use_brute_force, - } - - (query,) = utils.convert_to_dataframe(query) - sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) - - sql = bigframes.core.sql.create_vector_search_sql( - sql_string=sql_string, options=options # type: ignore - ) - if index_col_ids is not None: - df = query._session.read_gbq(sql, index_col=index_col_ids) - else: - df = query._session.read_gbq(sql) - df.index.names = index_labels - - return df +from bigframes.bigquery._operations.approx_agg import approx_top_count +from bigframes.bigquery._operations.array import ( + array_agg, + array_length, + array_to_string, +) +from bigframes.bigquery._operations.json import ( + json_extract, + json_extract_array, + json_set, +) +from bigframes.bigquery._operations.search import create_vector_index, vector_search +from bigframes.bigquery._operations.struct import struct + +__all__ = [ + "array_length", + "array_agg", + "array_to_string", + "json_set", + "json_extract", + "json_extract_array", + "approx_top_count", + "struct", + "create_vector_index", + "vector_search", +] diff --git a/bigframes/bigquery/_operations/__init__.py b/bigframes/bigquery/_operations/__init__.py new file mode 100644 index 0000000000..6d5e14bcf4 --- /dev/null +++ b/bigframes/bigquery/_operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/bigframes/bigquery/_operations/approx_agg.py b/bigframes/bigquery/_operations/approx_agg.py new file mode 100644 index 0000000000..696f8f5a66 --- /dev/null +++ b/bigframes/bigquery/_operations/approx_agg.py @@ -0,0 +1,59 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import bigframes.operations.aggregations as agg_ops +import bigframes.series as series + +""" +Approximate functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions +""" + + +def approx_top_count( + series: series.Series, + number: int, +) -> series.Series: + """Returns the approximate top elements of `expression` as an array of STRUCTs. + The number parameter specifies the number of elements returned. + + Each `STRUCT` contains two fields. 
The first field (named `value`) contains an input + value. The second field (named `count`) contains an `INT64` specifying the number + of times the value was returned. + + Returns `NULL` if there are zero input rows. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"]) + >>> bbq.approx_top_count(s, number=2) + [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}] + + Args: + series (bigframes.series.Series): + The Series with any data type that the `GROUP BY` clause supports. + number (int): + An integer specifying the number of times the value was returned. + + Returns: + bigframes.series.Series: A new Series with the result data. + """ + if number < 1: + raise ValueError("The number of approx_top_count must be at least 1") + return series._apply_aggregation(agg_ops.ApproxTopCountOp(number=number)) diff --git a/bigframes/bigquery/_operations/array.py b/bigframes/bigquery/_operations/array.py new file mode 100644 index 0000000000..4af1416127 --- /dev/null +++ b/bigframes/bigquery/_operations/array.py @@ -0,0 +1,151 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Array functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions +""" + + +from __future__ import annotations + +import typing + +import bigframes_vendored.constants as constants + +import bigframes.core.groupby as groupby +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops +import bigframes.series as series + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + + +def array_length(series: series.Series) -> series.Series: + """Compute the length of each array element in the Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]]) + >>> bbq.array_length(s) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + You can also apply this function directly to Series. + + >>> s.apply(bbq.array_length, by_row=False) + 0 4 + 1 0 + 2 2 + dtype: Int64 + + Args: + series (bigframes.series.Series): A Series with array columns. + + Returns: + bigframes.series.Series: A Series of integer values indicating + the length of each element in the Series. + + """ + return series._apply_unary_op(ops.len_op) + + +def array_agg( + obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy, +) -> series.Series | dataframe.DataFrame: + """Group data and create arrays from selected columns, omitting NULLs to avoid + BigQuery errors (NULLs not allowed in arrays). 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + For a SeriesGroupBy object: + + >>> lst = ['a', 'a', 'b', 'b', 'a'] + >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst) + >>> bbq.array_agg(s.groupby(level=0)) + a [1. 2.] + b [3. 4.] + dtype: list[pyarrow] + + For a DataFrameGroupBy object: + + >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] + >>> df = bpd.DataFrame(l, columns=["a", "b", "c"]) + >>> bbq.array_agg(df.groupby(by=["b"])) + a c + b + 1.0 [2] [3] + 2.0 [1 1] [3 2] + + [2 rows x 2 columns] + + Args: + obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy): + A GroupBy object to which the function is applied. + + Returns: + bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or + DataFrame containing aggregated array columns, and indexed by the + original group columns. + """ + if isinstance(obj, groupby.SeriesGroupBy): + return obj._aggregate(agg_ops.ArrayAggOp()) + elif isinstance(obj, groupby.DataFrameGroupBy): + return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False) + else: + raise ValueError( + f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}" + ) + + +def array_to_string(series: series.Series, delimiter: str) -> series.Series: + """Converts array elements within a Series into delimited strings. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]]) + >>> bbq.array_to_string(s, delimiter=", ") + 0 H, i, ! + 1 Hello, World + 2 + 3 + 4 Hi + dtype: string + + Args: + series (bigframes.series.Series): A Series containing arrays. + delimiter (str): The string used to separate array elements. + + Returns: + bigframes.series.Series: A Series containing delimited strings. + + """ + return series._apply_unary_op(ops.ArrayToStringOp(delimiter=delimiter)) diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py new file mode 100644 index 0000000000..d3c3c97a9c --- /dev/null +++ b/bigframes/bigquery/_operations/json.py @@ -0,0 +1,136 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +JSON functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions +""" + + +from __future__ import annotations + +from typing import Any, Sequence, Tuple + +import bigframes.operations as ops +import bigframes.series as series + + +def json_set( + series: series.Series, + json_path_value_pairs: Sequence[Tuple[str, Any]], +) -> series.Series: + """Produces a new JSON value within a Series by inserting or replacing values at + specified paths.
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import numpy as np + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"] + >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]) + 0 {"a":100,"b":"hi"} + Name: data, dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path_value_pairs (Sequence[Tuple[str, Any]]): + Pairs of JSON path and the new value to insert/replace. + + Returns: + bigframes.series.Series: A new Series with the transformed JSON data. + + """ + # SQLGlot parser does not support the "create_if_missing => true" syntax, so + # create_if_missing is not currently implemented. + + for json_path_value_pair in json_path_value_pairs: + if len(json_path_value_pair) != 2: + raise ValueError( + "Incorrect format: Expected (, ), but found: " + + f"{json_path_value_pair}" + ) + + json_path, json_value = json_path_value_pair + series = series._apply_binary_op( + json_value, ops.JSONSet(json_path=json_path), alignment="left" + ) + return series + + +def json_extract( + series: series.Series, + json_path: str, +) -> series.Series: + """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` + value. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}']) + >>> bbq.json_extract(s, json_path="$.class") + 0 {"students":[{"id":5},{"id":12}]} + dtype: string + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + """ + return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + + +def json_extract_array( + series: series.Series, + json_path: str = "$", +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` + values. This function uses single quotes and brackets to escape invalid JSONPath + characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_extract_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + Args: + series (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + + Returns: + bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
+ """ + return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) diff --git a/bigframes/bigquery/_operations/search.py b/bigframes/bigquery/_operations/search.py new file mode 100644 index 0000000000..496e259944 --- /dev/null +++ b/bigframes/bigquery/_operations/search.py @@ -0,0 +1,245 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import json +import typing +from typing import Collection, Literal, Mapping, Optional, Union + +import bigframes_vendored.constants as constants +import google.cloud.bigquery as bigquery + +import bigframes.core.sql +import bigframes.ml.utils as utils + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + import bigframes.series as series + import bigframes.session + +""" +Search functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/search_functions +""" + + +def create_vector_index( + table_id: str, + column_name: str, + *, + replace: bool = False, + index_name: Optional[str] = None, + distance_type="cosine", + stored_column_names: Collection[str] = (), + index_type: str = "ivf", + ivf_options: Optional[Mapping] = None, + tree_ah_options: Optional[Mapping] = None, + session: Optional[bigframes.session.Session] = None, +) -> None: + """ + Creates a new vector index on a column of a table. + + This method calls the `CREATE VECTOR INDEX DDL statement + `_. + + """ + import bigframes.pandas + + if index_name is None: + table_ref = bigquery.TableReference.from_string(table_id) + index_name = table_ref.table_id + + options = { + "index_type": index_type.upper(), + "distance_type": distance_type.upper(), + } + + if ivf_options is not None: + options["ivf_options"] = json.dumps(ivf_options) + + if tree_ah_options is not None: + options["tree_ah_options"] = json.dumps(tree_ah_options) + + sql = bigframes.core.sql.create_vector_index_ddl( + replace=replace, + index_name=index_name, + table_name=table_id, + column_name=column_name, + stored_column_names=stored_column_names, + options=options, + ) + + # Use global read_gbq to execute this for better location autodetection. + if session is None: + read_gbq_query = bigframes.pandas.read_gbq_query + else: + read_gbq_query = session.read_gbq_query + + read_gbq_query(sql) + + +def vector_search( + base_table: str, + column_to_search: str, + query: Union[dataframe.DataFrame, series.Series], + *, + query_column_to_search: Optional[str] = None, + top_k: Optional[int] = 10, + distance_type: Literal["euclidean", "cosine"] = "euclidean", + fraction_lists_to_search: Optional[float] = None, + use_brute_force: bool = False, +) -> dataframe.DataFrame: + """ + Conduct vector search which searches embeddings to find semantically similar entities. + + This method calls the `VECTOR_SEARCH() SQL function + `_. 
+ + **Examples:** + + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + DataFrame embeddings for which to find nearest neighbors. The ``ARRAY`` column + is used as the search query: + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + query_id embedding id my_embedding distance + 1 cat [3. 5.2] 5 [5. 5.4] 2.009975 + 0 dog [1. 2.] 1 [1. 2.] 0.0 + 0 dog [1. 2.] 4 [1. 3.2] 1.2 + 1 cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 5 columns] + + Series embeddings for which to find nearest neighbors: + + >>> search_query = bpd.Series([[1.0, 2.0], [3.0, 5.2]], + ... index=["dog", "cat"], + ... name="embedding") + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... top_k=2) + embedding id my_embedding distance + dog [1. 2.] 1 [1. 2.] 0.0 + cat [3. 5.2] 5 [5. 5.4] 2.009975 + dog [1. 2.] 4 [1. 3.2] 1.2 + cat [3. 5.2] 2 [2. 4.] 1.56205 + + [4 rows x 4 columns] + + You can specify the name of the column in the query DataFrame that contains the embeddings, as well as the distance type. + If you specify ``query_column_to_search``, it will use the provided column, which contains + the embeddings for which to find nearest neighbors. Otherwise, it uses the ``column_to_search`` value. + + >>> search_query = bpd.DataFrame({"query_id": ["dog", "cat"], + ... "embedding": [[1.0, 2.0], [3.0, 5.2]], + ... "another_embedding": [[0.7, 2.2], [3.3, 5.2]]}) + >>> bbq.vector_search( + ... base_table="bigframes-dev.bigframes_tests_sys.base_table", + ... column_to_search="my_embedding", + ... query=search_query, + ... distance_type="cosine", + ... query_column_to_search="another_embedding", + ... top_k=2) + query_id embedding another_embedding id my_embedding distance + 1 cat [3. 5.2] [3.3 5.2] 2 [2. 4.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 4 [1. 3.2] 0.000013 + 1 cat [3. 5.2] [3.3 5.2] 1 [1. 2.] 0.005181 + 0 dog [1. 2.] [0.7 2.2] 3 [1.5 7. ] 0.004697 + + [4 rows x 6 columns] + + Args: + base_table (str): + The table to search for nearest neighbor embeddings. + column_to_search (str): + The name of the base table column to search for nearest neighbor embeddings. + The column must have a type of ``ARRAY``. All elements in the array must be non-NULL. + query (bigframes.dataframe.DataFrame | bigframes.series.Series): + A Series or DataFrame that provides the embeddings for which to find nearest neighbors. + query_column_to_search (str): + Specifies the name of the column in the query that contains the embeddings for which to + find nearest neighbors. The column must have a type of ``ARRAY``. All elements in + the array must be non-NULL and all values in the column must have the same array dimensions + as the values in the ``column_to_search`` column. Can only be set when query is a DataFrame. + top_k (int, default 10): + Specifies the number of nearest neighbors to return. Defaults to 10. + distance_type (str, default "euclidean"): + Specifies the type of metric to use to compute the distance between two vectors. + Possible values are "euclidean" and "cosine". Defaults to "euclidean". + fraction_lists_to_search (float, range in [0.0, 1.0]): + Specifies the percentage of lists to search.
Specifying a higher percentage leads to + higher recall and slower performance, and the converse is true when specifying a lower + percentage. It is only used when a vector index is also used. You can only specify + ``fraction_lists_to_search`` when ``use_brute_force`` is set to False. + use_brute_force (bool, default False): + Determines whether to use brute force search by skipping the vector index if one is available. + Defaults to False. + + Returns: + bigframes.dataframe.DataFrame: A DataFrame containing vector search results. + """ + import bigframes.series + + if not fraction_lists_to_search and use_brute_force is True: + raise ValueError( + "You can't specify fraction_lists_to_search when use_brute_force is set to True." + ) + if ( + isinstance(query, bigframes.series.Series) + and query_column_to_search is not None + ): + raise ValueError( + "You can't specify query_column_to_search when query is a Series." + ) + # TODO(ashleyxu): Support options in vector search. b/344019989 + if fraction_lists_to_search is not None or use_brute_force is True: + raise NotImplementedError( + f"fraction_lists_to_search and use_brute_force are not supported. {constants.FEEDBACK_LINK}" + ) + options = { + "base_table": base_table, + "column_to_search": column_to_search, + "query_column_to_search": query_column_to_search, + "distance_type": distance_type, + "top_k": top_k, + "fraction_lists_to_search": fraction_lists_to_search, + "use_brute_force": use_brute_force, + } + + (query,) = utils.convert_to_dataframe(query) + sql_string, index_col_ids, index_labels = query._to_sql_query(include_index=True) + + sql = bigframes.core.sql.create_vector_search_sql( + sql_string=sql_string, options=options # type: ignore + ) + if index_col_ids is not None: + df = query._session.read_gbq(sql, index_col=index_col_ids) + df.index.names = index_labels + else: + df = query._session.read_gbq(sql) + + return df diff --git a/bigframes/bigquery/_operations/struct.py b/bigframes/bigquery/_operations/struct.py new file mode 100644 index 0000000000..7cb826351c --- /dev/null +++ b/bigframes/bigquery/_operations/struct.py @@ -0,0 +1,63 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +"""This module integrates BigQuery built-in functions for use with DataFrame objects, +such as array functions: +https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions.
""" + + +from __future__ import annotations + +import typing + +import bigframes.operations as ops +import bigframes.series as series + +if typing.TYPE_CHECKING: + import bigframes.dataframe as dataframe + + +def struct(value: dataframe.DataFrame) -> series.Series: + """Takes a DataFrame and converts it into a Series of structs with each + struct entry corresponding to a DataFrame row and each struct field + corresponding to a DataFrame column + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> import bigframes.series as series + >>> bpd.options.display.progress_bar = None + + >>> srs = series.Series([{"version": 1, "project": "pandas"}, {"version": 2, "project": "numpy"},]) + >>> df = srs.struct.explode() + >>> bbq.struct(df) + 0 {'project': 'pandas', 'version': 1} + 1 {'project': 'numpy', 'version': 2} + dtype: struct[pyarrow] + + Args: + value (bigframes.dataframe.DataFrame): + The DataFrame to be converted to a Series of structs + + Returns: + bigframes.series.Series: A new Series with struct entries representing rows of the original DataFrame + """ + block = value._block + block, result_id = block.apply_nary_op( + block.value_columns, ops.StructOp(column_names=tuple(block.column_labels)) + ) + block = block.select_column(result_id) + return series.Series(block) diff --git a/bigframes/core/compile/api.py b/bigframes/core/compile/api.py index 4e833411ae..86c8fca25a 100644 --- a/bigframes/core/compile/api.py +++ b/bigframes/core/compile/api.py @@ -18,6 +18,7 @@ import google.cloud.bigquery as bigquery import bigframes.core.compile.compiler as compiler +import bigframes.core.rewrite as rewrites if TYPE_CHECKING: import bigframes.core.nodes @@ -42,6 +43,7 @@ def compile_unordered( col_id_overrides: Mapping[str, str] = {}, ) -> str: """Compile node into sql where rows are unsorted, and no ordering information is preserved.""" + # TODO: Enable limit pullup, but only if not being used to write to clustered table. return self._compiler.compile_unordered_ir(node).to_sql( col_id_overrides=col_id_overrides ) @@ -53,8 +55,10 @@ def compile_ordered( col_id_overrides: Mapping[str, str] = {}, ) -> str: """Compile node into sql where rows are sorted with ORDER BY.""" - return self._compiler.compile_ordered_ir(node).to_sql( - col_id_overrides=col_id_overrides, ordered=True + # If we are ordering the query anyways, compiling the slice as a limit is probably a good idea. + new_node, limit = rewrites.pullup_limit_from_slice(node) + return self._compiler.compile_ordered_ir(new_node).to_sql( + col_id_overrides=col_id_overrides, ordered=True, limit=limit ) def compile_raw( diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index f4afdaa97c..d02a2c444c 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -943,8 +943,9 @@ def to_sql( self, col_id_overrides: typing.Mapping[str, str] = {}, ordered: bool = False, + limit: Optional[int] = None, ) -> str: - if ordered: + if ordered or limit: # Need to bake ordering expressions into the selected column in order for our ordering clause builder to work. 
baked_ir = self._bake_ordering() sql = ibis_bigquery.Backend().compile( @@ -969,7 +970,11 @@ def to_sql( order_by_clause = bigframes.core.sql.ordering_clause( baked_ir._ordering.all_ordering_columns ) - sql += f"{order_by_clause}\n" + sql += f"\n{order_by_clause}" + if limit is not None: + if not isinstance(limit, int): + raise TypeError(f"Limit param: {limit} must be an int.") + sql += f"\nLIMIT {limit}" else: sql = ibis_bigquery.Backend().compile( self._to_ibis_expr( diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index fd1514d7b7..19c18798c0 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -36,6 +36,7 @@ import bigframes.core.identifiers as ids import bigframes.core.nodes as nodes import bigframes.core.ordering as bf_ordering +import bigframes.core.rewrite as rewrites if typing.TYPE_CHECKING: import bigframes.core @@ -48,20 +49,32 @@ class Compiler: # In unstrict mode, ordering from ReadTable or after joins may be ambiguous to improve query performance. strict: bool = True scalar_op_compiler = compile_scalar.ScalarOpCompiler() + enable_pruning: bool = False + + def _preprocess(self, node: nodes.BigFrameNode): + if self.enable_pruning: + used_fields = frozenset(field.id for field in node.fields) + node = node.prune(used_fields) + node = functools.cache(rewrites.replace_slice_ops)(node) + return node def compile_ordered_ir(self, node: nodes.BigFrameNode) -> compiled.OrderedIR: - ir = typing.cast(compiled.OrderedIR, self.compile_node(node, True)) + ir = typing.cast( + compiled.OrderedIR, self.compile_node(self._preprocess(node), True) + ) if self.strict: assert ir.has_total_order return ir def compile_unordered_ir(self, node: nodes.BigFrameNode) -> compiled.UnorderedIR: - return typing.cast(compiled.UnorderedIR, self.compile_node(node, False)) + return typing.cast( + compiled.UnorderedIR, self.compile_node(self._preprocess(node), False) + ) def compile_peak_sql( self, node: nodes.BigFrameNode, n_rows: int ) -> typing.Optional[str]: - return self.compile_unordered_ir(node).peek_sql(n_rows) + return self.compile_unordered_ir(self._preprocess(node)).peek_sql(n_rows) # TODO: Remove cache when schema no longer requires compilation to derive schema (and therefor only compiles for execution) @functools.lru_cache(maxsize=5000) diff --git a/bigframes/core/eval.py b/bigframes/core/eval.py index 692ca1c7bb..82add99258 100644 --- a/bigframes/core/eval.py +++ b/bigframes/core/eval.py @@ -38,12 +38,15 @@ def eval(df: dataframe.DataFrame, expr: str, target: Optional[dataframe.DataFram Returns: Result of evaluation. 
""" - index_resolver = { - vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( - df.index.get_level_values(level).to_series() - ) - for level, name in enumerate(df.index.names) - } + if df._has_index: + index_resolver = { + vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries( + df.index.get_level_values(level).to_series() + ) + for level, name in enumerate(df.index.names) + } + else: + index_resolver = {} column_resolver = { vendored_pandas_eval_parsing.clean_column_name(str(name)): EvalSeries(series) for name, series in df.items() diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py index 3187c5c11b..e70cdad59e 100644 --- a/bigframes/core/global_session.py +++ b/bigframes/core/global_session.py @@ -30,7 +30,7 @@ _global_session_state.thread_local_session = None -def _try_close_session(session): +def _try_close_session(session: bigframes.session.Session): """Try to close the session and warn if couldn't.""" try: session.close() diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2d351cf82d..dfbe2ddea2 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -19,6 +19,7 @@ import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby +import jellyfish import pandas as pd from bigframes.core import log_adapter @@ -91,8 +92,21 @@ def __getitem__( bad_keys = [key for key in keys if key not in self._block.column_labels] + # Raise a KeyError message with the possible correct key(s) if len(bad_keys) > 0: - raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") + possible_key = [] + for bad_key in bad_keys: + possible_key.append( + min( + self._block.column_labels, + key=lambda item: jellyfish.damerau_levenshtein_distance( + bad_key, item + ), + ) + ) + raise KeyError( + f"Columns not found: {str(bad_keys)[1:-1]}. Did you mean {str(possible_key)[1:-1]}?" 
+ ) columns = [ col_id for col_id, label in self._col_id_labels.items() if label in keys diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 1d01936509..2e23f529e2 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,7 +20,7 @@ import functools import itertools import typing -from typing import Callable, Iterable, Optional, Sequence, Tuple +from typing import Callable, cast, Iterable, Optional, Sequence, Tuple import google.cloud.bigquery as bq @@ -30,6 +30,7 @@ import bigframes.core.identifiers as bfet_ids from bigframes.core.ordering import OrderingExpression import bigframes.core.schema as schemata +import bigframes.core.slices as slices import bigframes.core.window_spec as window import bigframes.dtypes import bigframes.operations.aggregations as agg_ops @@ -82,6 +83,11 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: """Direct children of this node""" return tuple([]) + @property + @abc.abstractmethod + def row_count(self) -> typing.Optional[int]: + return None + @functools.cached_property def session(self): sessions = [] @@ -263,7 +269,11 @@ def explicitly_ordered(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: - return replace(self, child=t(self.child)) + transformed = replace(self, child=t(self.child)) + if self == transformed: + # reusing existing object speeds up eq, and saves a small amount of memory + return self + return transformed @property def order_ambiguous(self) -> bool: @@ -300,6 +310,26 @@ def variables_introduced(self) -> int: def relation_ops_created(self) -> int: return 2 + @property + def is_limit(self) -> bool: + """Returns whether this is equivalent to a ORDER BY ... LIMIT N.""" + # TODO: Handle tail case. + return ( + (not self.start) + and (self.step == 1) + and (self.stop is not None) + and (self.stop > 0) + ) + + @property + def row_count(self) -> typing.Optional[int]: + child_length = self.child.row_count + if child_length is None: + return None + return slices.slice_output_rows( + (self.start, self.stop, self.step), child_length + ) + @dataclass(frozen=True, eq=False) class JoinNode(BigFrameNode): @@ -347,12 +377,25 @@ def variables_introduced(self) -> int: def joins(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + if self.type == "cross": + if self.left_child.row_count is None or self.right_child.row_count is None: + return None + return self.left_child.row_count * self.right_child.row_count + + return None + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: - return replace( + transformed = replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) + if self == transformed: + # reusing existing object speeds up eq, and saves a small amount of memory + return self + return transformed @property def defines_namespace(self) -> bool: @@ -404,10 +447,24 @@ def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. 
Used to estimate query planning complexity.""" return len(self.schema.items) + OVERHEAD_VARIABLES + @property + def row_count(self) -> Optional[int]: + sub_counts = [node.row_count for node in self.child_nodes] + total = 0 + for count in sub_counts: + if count is None: + return None + total += count + return total + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: - return replace(self, children=tuple(t(child) for child in self.children)) + transformed = replace(self, children=tuple(t(child) for child in self.children)) + if self == transformed: + # reusing existing object speeds up eq, and saves a small amount of memory + return self + return transformed def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # TODO: Make concat prunable, probably by redefining @@ -448,10 +505,18 @@ def variables_introduced(self) -> int: """Defines the number of variables generated by the current node. Used to estimate query planning complexity.""" return len(self.schema.items) + OVERHEAD_VARIABLES + @property + def row_count(self) -> Optional[int]: + return None + def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] ) -> BigFrameNode: - return replace(self, start=t(self.start), end=t(self.end)) + transformed = replace(self, start=t(self.start), end=t(self.end)) + if self == transformed: + # reusing existing object speeds up eq, and saves a small amount of memory + return self + return transformed def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # TODO: Make FromRangeNode prunable (or convert to other node types) @@ -468,7 +533,11 @@ def roots(self) -> typing.Set[BigFrameNode]: return {self} @property - def supports_fast_head(self) -> bool: + def fast_offsets(self) -> bool: + return False + + @property + def fast_ordered_limit(self) -> bool: return False def transform_children( @@ -476,11 +545,6 @@ def transform_children( ) -> BigFrameNode: return self - @property - def row_count(self) -> typing.Optional[int]: - """How many rows are in the data source. None means unknown.""" - return None - class ScanItem(typing.NamedTuple): id: bfet_ids.ColumnId @@ -512,7 +576,11 @@ def variables_introduced(self) -> int: return len(self.scan_list.items) + 1 @property - def supports_fast_head(self) -> bool: + def fast_offsets(self) -> bool: + return True + + @property + def fast_ordered_limit(self) -> bool: return True @property @@ -619,12 +687,27 @@ def relation_ops_created(self) -> int: return 3 @property - def supports_fast_head(self) -> bool: - # Fast head is only supported when row offsets are available. - # In the future, ORDER BY+LIMIT optimizations may allow fast head when - # clustered and/or partitioned on ordering key + def fast_offsets(self) -> bool: + # Fast head is only supported when row offsets are available or data is clustered over ordering key. 
return (self.source.ordering is not None) and self.source.ordering.is_sequential + @property + def fast_ordered_limit(self) -> bool: + if self.source.ordering is None: + return False + order_cols = self.source.ordering.all_ordering_columns + # monotonicity would probably be fine + if not all(col.scalar_expression.is_identity for col in order_cols): + return False + order_col_ids = tuple( + cast(ex.DerefOp, col.scalar_expression).id.name for col in order_cols + ) + cluster_col_ids = self.source.table.cluster_cols + if cluster_col_ids is None: + return False + + return order_col_ids == cluster_col_ids[: len(order_col_ids)] + @property def order_ambiguous(self) -> bool: return ( @@ -690,6 +773,10 @@ def relation_ops_created(self) -> int: def variables_introduced(self) -> int: return 1 + @property + def row_count(self) -> Optional[int]: + return self.child.row_count + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: if self.col_id not in used_cols: return self.child.prune(used_cols) @@ -710,6 +797,10 @@ def row_preserving(self) -> bool: def variables_introduced(self) -> int: return 1 + @property + def row_count(self) -> Optional[int]: + return None + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: consumed_ids = used_cols.union(self.predicate.column_references) pruned_child = self.child.prune(consumed_ids) @@ -733,6 +824,10 @@ def relation_ops_created(self) -> int: def explicitly_ordered(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + return self.child.row_count + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: ordering_cols = itertools.chain.from_iterable( map(lambda x: x.referenced_columns, self.by) @@ -756,6 +851,10 @@ def relation_ops_created(self) -> int: # Doesnt directly create any relational operations return 0 + @property + def row_count(self) -> Optional[int]: + return self.child.row_count + @dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): @@ -782,6 +881,10 @@ def variables_introduced(self) -> int: def defines_namespace(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + return self.child.row_count + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: pruned_selections = tuple( select for select in self.input_output_pairs if select[1] in used_cols @@ -826,6 +929,10 @@ def variables_introduced(self) -> int: new_vars = sum(1 for i in self.assignments if not i[0].is_identity) return new_vars + @property + def row_count(self) -> Optional[int]: + return self.child.row_count + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) if len(pruned_assignments) == 0: @@ -861,6 +968,10 @@ def variables_introduced(self) -> int: def defines_namespace(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + return 1 + @dataclass(frozen=True, eq=False) class AggregateNode(UnaryNode): @@ -910,6 +1021,12 @@ def explicitly_ordered(self) -> bool: def defines_namespace(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + if not self.by_column_ids: + return 1 + return None + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: by_ids = (ref.id for ref in self.by_column_ids) pruned_aggs = tuple(agg for agg in self.aggregations if agg[1] in used_cols) @@ -947,6 +1064,10 @@ def relation_ops_created(self) -> int: # Assume that if not reprojecting, that there is a sequence of window operations sharing the same window return 0 if self.skip_reproject_unsafe else 4 + @property 
+ def row_count(self) -> Optional[int]: + return self.child.row_count + @functools.cached_property def added_field(self) -> Field: input_type = self.child.get_type(self.column_name.id) @@ -978,6 +1099,10 @@ def row_preserving(self) -> bool: def variables_introduced(self) -> int: return 1 + @property + def row_count(self) -> Optional[int]: + return None + # TODO: Explode should create a new column instead of overriding the existing one @dataclass(frozen=True, eq=False) @@ -1014,6 +1139,10 @@ def variables_introduced(self) -> int: def defines_namespace(self) -> bool: return True + @property + def row_count(self) -> Optional[int]: + return None + def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: # Cannot prune explode op return self.transform_children( diff --git a/bigframes/core/rewrite.py b/bigframes/core/rewrite.py index d4e530fff3..9c0eb81450 100644 --- a/bigframes/core/rewrite.py +++ b/bigframes/core/rewrite.py @@ -24,7 +24,7 @@ import bigframes.core.join_def as join_defs import bigframes.core.nodes as nodes import bigframes.core.ordering as order -import bigframes.core.tree_properties as traversals +import bigframes.core.slices as slices import bigframes.operations as ops Selection = Tuple[Tuple[scalar_exprs.Expression, ids.ColumnId], ...] @@ -385,46 +385,71 @@ def common_selection_root( return None +def pullup_limit_from_slice( + root: nodes.BigFrameNode, +) -> Tuple[nodes.BigFrameNode, Optional[int]]: + """ + This is a BQ-sql specific optimization that can be helpful as ORDER BY LIMIT is more efficient than WHERE + ROW_NUMBER(). + + Only use this if writing to an unclustered table. Clustering is not compatible with ORDER BY. + """ + if isinstance(root, nodes.SliceNode): + # head case + # More cases could be handled, but this is by far the most important, as it is used by df.head(), df[:N] + if root.is_limit: + assert not root.start + assert root.step == 1 + assert root.stop is not None + limit = root.stop + new_root, prior_limit = pullup_limit_from_slice(root.child) + if (prior_limit is not None) and (prior_limit < limit): + limit = prior_limit + return new_root, limit + elif ( + isinstance(root, (nodes.SelectionNode, nodes.ProjectionNode)) + and root.row_preserving + ): + new_child, prior_limit = pullup_limit_from_slice(root.child) + if prior_limit is not None: + return root.transform_children(lambda _: new_child), prior_limit + # Most ops don't support pulling up slice, like filter, agg, join, etc. + return root, None + + def replace_slice_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode: # TODO: we want to pull up some slices into limit op if near root. if isinstance(root, nodes.SliceNode): root = root.transform_children(replace_slice_ops) - return convert_slice_to_filter(cast(nodes.SliceNode, root)) + return rewrite_slice(cast(nodes.SliceNode, root)) else: return root.transform_children(replace_slice_ops) -def get_simplified_slice(node: nodes.SliceNode): - """Attempts to simplify the slice.""" - row_count = traversals.row_count(node) - start, stop, step = node.start, node.stop, node.step +def rewrite_slice(node: nodes.SliceNode): + slice_def = (node.start, node.stop, node.step) + + # no-op (eg. df[::1]) + if slices.is_noop(slice_def, node.child.row_count): + return node.child - if start is None: - start = 0 if step > 0 else -1 - if row_count and step > 0: - if start and start < 0: - start = row_count + start - if stop and stop < 0: - stop = row_count + stop - return start, stop, step + # No filtering, just reverse (eg. 
df[::-1]) + if slices.is_reverse(slice_def, node.child.row_count): + return nodes.ReversedNode(node.child) + if node.child.row_count: + slice_def = slices.to_forward_offsets(slice_def, node.child.row_count) + return slice_as_filter(node.child, *slice_def) -def convert_slice_to_filter(node: nodes.SliceNode): - start, stop, step = get_simplified_slice(node) - # no-op (eg. df[::1]) +def slice_as_filter( + node: nodes.BigFrameNode, start: Optional[int], stop: Optional[int], step: int +) -> nodes.BigFrameNode: if ( - ((start == 0) or (start is None)) - and ((stop is None) or (stop == -1)) - and (step == 1) + ((start is None) or (start >= 0)) + and ((stop is None) or (stop >= 0)) + and (step > 0) ): - return node.child - # No filtering, just reverse (eg. df[::-1]) - if ((start is None) or (start == -1)) and (not stop) and (step == -1): - return nodes.ReversedNode(node.child) - # if start/stop/step are all non-negative, and do a simple predicate on forward offsets - if ((start is None) or (start >= 0)) and ((stop is None) or (stop >= 0)): - node_w_offset = add_offsets(node.child) + node_w_offset = add_offsets(node) predicate = convert_simple_slice( scalar_exprs.DerefOp(node_w_offset.col_id), start or 0, stop, step ) @@ -433,17 +458,18 @@ def convert_slice_to_filter(node: nodes.SliceNode): # fallback cases, generate both forward and backward offsets if step < 0: - forward_offsets = add_offsets(node.child) + forward_offsets = add_offsets(node) reversed_offsets = add_offsets(nodes.ReversedNode(forward_offsets)) dual_indexed = reversed_offsets else: - reversed_offsets = add_offsets(nodes.ReversedNode(node.child)) + reversed_offsets = add_offsets(nodes.ReversedNode(node)) forward_offsets = add_offsets(nodes.ReversedNode(reversed_offsets)) dual_indexed = forward_offsets + default_start = 0 if step >= 0 else -1 predicate = convert_complex_slice( scalar_exprs.DerefOp(forward_offsets.col_id), scalar_exprs.DerefOp(reversed_offsets.col_id), - start, + start if (start is not None) else default_start, stop, step, ) @@ -505,7 +531,7 @@ def convert_complex_slice( if start or ((start is not None) and step < 0): if start > 0 and step > 0: start_cond = ops.ge_op.as_expr(forward_offsets, scalar_exprs.const(start)) - elif start > 0 and step < 0: + elif start >= 0 and step < 0: start_cond = ops.le_op.as_expr(forward_offsets, scalar_exprs.const(start)) elif start < 0 and step > 0: start_cond = ops.le_op.as_expr( diff --git a/bigframes/core/slices.py b/bigframes/core/slices.py new file mode 100644 index 0000000000..97f90d3349 --- /dev/null +++ b/bigframes/core/slices.py @@ -0,0 +1,106 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
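The slice helpers introduced in bigframes/core/slices.py below mirror Python's own slicing arithmetic, which makes them easy to sanity-check; this is a hedged, standalone sketch (expected_output_rows is illustrative, not a bigframes function):

def expected_output_rows(start, stop, step, input_size):
    # Python's range/slice semantics give the reference behaviour: df[:5]
    # keeps 5 rows, df[::-1] keeps all rows reversed, and negative bounds
    # wrap around the known row count.
    return len(range(input_size)[slice(start, stop, step)])

assert expected_output_rows(None, 5, 1, 100) == 5        # df[:5]   -> prefix of 5 rows
assert expected_output_rows(None, None, -1, 100) == 100  # df[::-1] -> pure reverse
assert expected_output_rows(-10, None, 1, 100) == 10     # negative start normalized to offset 90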
+ +from __future__ import annotations + +from typing import Optional + + +def to_forward_offsets( + slice: tuple[Optional[int], Optional[int], Optional[int]], input_rows: int +) -> tuple[int, Optional[int], int]: + """Redefine the slice to use forward offsets for start and stop indices.""" + step = slice[2] or 1 + stop = slice[1] + start = slice[0] + + # normalize start to positive number + if start is None: + start = 0 if (step > 0) else (input_rows - 1) + elif start < 0: + start = max(0, input_rows + start) + else: + start = min(start, input_rows) + + if stop is None: + stop = None + elif stop < 0: + stop = max(0, input_rows + stop) + else: + stop = min(stop, input_rows) + + return (start, stop, step) + + +def remove_unused_parts( + slice: tuple[Optional[int], Optional[int], Optional[int]], input_rows: int +) -> tuple[Optional[int], Optional[int], Optional[int]]: + """Makes a slice component null if it doesn't impact slice semantics.""" + start, stop, step = slice + is_forward = (step is None) or (step > 0) + if start is not None: + if is_forward and ((start == 0) or (start <= -input_rows)): + start = None + elif (not is_forward) and ((start == -1) or (start >= (input_rows - 1))): + start = None + if stop is not None: + if is_forward and (stop >= input_rows): + stop = None + elif (not is_forward) and (stop <= (-input_rows - 1)): + stop = None + if step == 1: + step = None + return start, stop, step + + +def slice_output_rows( + slice: tuple[Optional[int], Optional[int], Optional[int]], input_size: int +) -> int: + """Given input_size, returns the number of rows returned after the slice operation.""" + slice = to_forward_offsets(slice, input_size) + start, stop, step = slice + + if step > 0: + if stop is None: + stop = input_size + length = max(0, (stop - start + step - 1) // step) + else: + if stop is None: + stop = -1 + length = max(0, (start - stop - step - 1) // -step) + return length + + +def is_noop( + slice_def: tuple[Optional[int], Optional[int], Optional[int]], + input_size: Optional[int], +) -> bool: + """Returns true iff the slice op is a no-op returning the input array.""" + if input_size: + start, stop, step = remove_unused_parts(slice_def, input_size) + else: + start, stop, step = slice_def + return (not start) and (stop is None) and ((step is None) or (step == 1)) + + +def is_reverse( + slice_def: tuple[Optional[int], Optional[int], Optional[int]], + input_size: Optional[int], +) -> bool: + """Returns true iff the slice op is a pure reverse op, equivalent to df[::-1]""" + if input_size: + start, stop, step = remove_unused_parts(slice_def, input_size) + else: + start, stop, step = slice_def + return (start is None) and (stop is None) and (step == -1) diff --git a/bigframes/core/sql.py b/bigframes/core/sql.py index e44091e7b1..d5dfc64ddd 100644 --- a/bigframes/core/sql.py +++ b/bigframes/core/sql.py @@ -19,7 +19,7 @@ import datetime import math -from typing import Iterable, Mapping, TYPE_CHECKING, Union +from typing import cast, Collection, Iterable, Mapping, TYPE_CHECKING, Union import bigframes.core.compile.googlesql as googlesql @@ -116,6 +116,45 @@ def ordering_clause( return f"ORDER BY {' ,'.join(parts)}" +def create_vector_index_ddl( + *, + replace: bool, + index_name: str, + table_name: str, + column_name: str, + stored_column_names: Collection[str], + options: Mapping[str, Union[str | int | bool | float]] = {}, +) -> str: + """Encode the VECTOR INDEX statement for BigQuery Vector Search.""" + + if replace: + create = "CREATE OR REPLACE VECTOR INDEX " + else: + create 
= "CREATE VECTOR INDEX IF NOT EXISTS " + + if len(stored_column_names) > 0: + escaped_stored = [ + f"{googlesql.identifier(name)}" for name in stored_column_names + ] + storing = f"STORING({', '.join(escaped_stored)}) " + else: + storing = "" + + rendered_options = ", ".join( + [ + f"{option_name} = {simple_literal(option_value)}" + for option_name, option_value in options.items() + ] + ) + + return f""" + {create} {googlesql.identifier(index_name)} + ON {googlesql.identifier(table_name)}({googlesql.identifier(column_name)}) + {storing} + OPTIONS({rendered_options}); + """ + + def create_vector_search_sql( sql_string: str, options: Mapping[str, Union[str | int | bool | float]] = {}, @@ -135,7 +174,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE {googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), {simple_literal(query_column_to_search)}, @@ -150,7 +189,7 @@ def create_vector_search_sql( base.*, distance, FROM VECTOR_SEARCH( - TABLE `{base_table}`, + TABLE {googlesql.identifier(cast(str, base_table))}, {simple_literal(column_to_search)}, ({sql_string}), distance_type => {simple_literal(distance_type)}, diff --git a/bigframes/core/tree_properties.py b/bigframes/core/tree_properties.py index 1b0fe0d072..0a4339ee06 100644 --- a/bigframes/core/tree_properties.py +++ b/bigframes/core/tree_properties.py @@ -42,32 +42,35 @@ def can_fast_peek(node: nodes.BigFrameNode) -> bool: def can_fast_head(node: nodes.BigFrameNode) -> bool: """Can get head fast if can push head operator down to leafs and operators preserve rows.""" + # To do fast head operation: + # (1) the underlying data must be arranged/indexed according to the logical ordering + # (2) transformations must support pushing down LIMIT or a filter on row numbers + return has_fast_offset_address(node) or has_fast_offset_address(node) + + +def has_fast_orderby_limit(node: nodes.BigFrameNode) -> bool: + """True iff ORDER BY LIMIT can be performed without a large full table scan.""" + # TODO: In theory compatible with some Slice nodes, potentially by adding OFFSET if isinstance(node, nodes.LeafNode): - return node.supports_fast_head + return node.fast_ordered_limit if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): - return can_fast_head(node.child) + return has_fast_orderby_limit(node.child) + return False + + +def has_fast_offset_address(node: nodes.BigFrameNode) -> bool: + """True iff specific offsets can be scanned without a large full table scan.""" + # TODO: In theory can push offset lookups through slice operators by translating indices + if isinstance(node, nodes.LeafNode): + return node.fast_offsets + if isinstance(node, (nodes.ProjectionNode, nodes.SelectionNode)): + return has_fast_offset_address(node.child) return False def row_count(node: nodes.BigFrameNode) -> Optional[int]: """Determine row count from local metadata, return None if unknown.""" - if isinstance(node, nodes.LeafNode): - return node.row_count - if isinstance(node, nodes.AggregateNode): - if len(node.by_column_ids) == 0: - return 1 - return None - if isinstance(node, nodes.ConcatNode): - sub_counts = list(map(row_count, node.child_nodes)) - total = 0 - for count in sub_counts: - if count is None: - return None - total += count - return total - if isinstance(node, nodes.UnaryNode) and node.row_preserving: - return row_count(node.child) - return None + return node.row_count # Replace modified_cost(node) = cost(apply_cache(node)) @@ -113,6 +116,9 @@ def 
_node_counts_inner( node_counts = _node_counts_inner(root) + if len(node_counts) == 0: + raise ValueError("node counts should be non-zero") + return max( node_counts.keys(), key=lambda node: heuristic( diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 43c05c6c83..e684ac55a4 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -116,9 +116,9 @@ def label_to_identifier(label: typing.Hashable, strict: bool = False) -> str: """ # Column values will be loaded as null if the column name has spaces. # https://github.com/googleapis/python-bigquery/issues/1566 - identifier = str(label).replace(" ", "_") - + identifier = str(label) if strict: + identifier = str(label).replace(" ", "_") identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier) if not identifier: identifier = "id" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0cfa5a2154..f803b66ab6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -81,6 +81,8 @@ import bigframes.session._io.bigquery if typing.TYPE_CHECKING: + from _typeshed import SupportsRichComparison + import bigframes.session SingleItemValue = Union[bigframes.series.Series, int, float, Callable] @@ -377,7 +379,7 @@ def _to_sql_query( whether to include index columns. Returns: - a tuple of (sql_string, index_column_id_list, index_column_label_list). + Tuple[sql_string, index_column_id_list, index_column_label_list]: If include_index is set to False, index_column_id_list and index_column_label_list return empty lists. """ @@ -385,7 +387,12 @@ def _to_sql_query( @property def sql(self) -> str: - """Compiles this DataFrame's expression tree to SQL.""" + """Compiles this DataFrame's expression tree to SQL. + + Returns: + str: + string representing the compiled SQL. + """ include_index = self._has_index and ( self.index.name is not None or len(self.index.names) > 1 ) @@ -397,8 +404,9 @@ def query_job(self) -> Optional[bigquery.QueryJob]: """BigQuery job metadata for the most recent query. Returns: - The most recent `QueryJob - `_. + None or google.cloud.bigquery.QueryJob: + The most recent `QueryJob + `_. """ if self._query_job is None: self._set_internal_query_job(self._compute_dry_run()) @@ -768,7 +776,7 @@ def _apply_series_binop_axis_0( reverse: bool = False, ) -> DataFrame: bf_series = bigframes.core.convert.to_bf_series( - other, self.index, self._session + other, self.index if self._has_index else None, self._session ) aligned_block, columns, expr_pairs = self._block._align_axis_0( bf_series._block, how=how @@ -2464,7 +2472,7 @@ def pivot_table( values = [values] # Unlike pivot, pivot_table has values always ordered. - values.sort() + values.sort(key=lambda val: typing.cast("SupportsRichComparison", val)) keys = index + columns agged = self.groupby(keys, dropna=True)[values].agg(aggfunc) @@ -3179,6 +3187,7 @@ def to_gbq( clustering_columns: Union[pandas.Index, Iterable[typing.Hashable]] = (), labels: dict[str, str] = {}, ) -> str: + index = index and self._has_index temp_table_ref = None if destination_table is None: @@ -3760,7 +3769,7 @@ def cache(self): Useful if the dataframe will be used multiple times, as this will avoid recomputating the shared intermediate value. 
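A runnable sketch of the strict-mode sanitization that label_to_identifier (changed in bigframes/core/utils.py above) now applies only when strict=True; it covers only the lines shown in the hunk, and the helper name is illustrative:

import re

def to_identifier(label, strict: bool = False) -> str:
    identifier = str(label)
    if strict:
        # Only strict mode rewrites the label: spaces become underscores and
        # any character outside [a-zA-Z0-9_] is dropped.
        identifier = identifier.replace(" ", "_")
        identifier = re.sub(r"[^a-zA-Z0-9_]", "", identifier)
        if not identifier:
            identifier = "id"
    return identifier

assert to_identifier("total sales ($)") == "total sales ($)"  # non-strict keeps the label
assert to_identifier("total sales ($)", strict=True) == "total_sales_"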
Returns: - DataFrame: Self + bigframes.pandas.DataFrame: DataFrame """ return self._cached(force=True) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 81181b58cf..550b4a8178 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,11 +22,12 @@ """ import abc -from typing import cast, Optional, TypeVar, Union +from typing import cast, Optional, TypeVar import bigframes_vendored.sklearn.base from bigframes.ml import core +import bigframes.ml.utils as utils import bigframes.pandas as bpd @@ -157,8 +158,8 @@ class SupervisedTrainablePredictor(TrainablePredictor): def fit( self: _T, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> _T: return self._fit(X, y) @@ -172,8 +173,8 @@ class UnsupervisedTrainablePredictor(TrainablePredictor): def fit( self: _T, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.ArrayType, + y: Optional[utils.ArrayType] = None, ) -> _T: return self._fit(X, y) @@ -243,8 +244,8 @@ def transform(self, X): def fit_transform( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.ArrayType, + y: Optional[utils.ArrayType] = None, ) -> bpd.DataFrame: return self.fit(X, y).transform(X) @@ -264,6 +265,6 @@ def transform(self, y): def fit_transform( self, - y: Union[bpd.DataFrame, bpd.Series], + y: utils.ArrayType, ) -> bpd.DataFrame: return self.fit(y).transform(y) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 43cfbdd424..a221ea8e89 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -21,6 +21,7 @@ import bigframes_vendored.sklearn.cluster._kmeans from google.cloud import bigquery +import pandas as pd import bigframes from bigframes.core import log_adapter @@ -101,7 +102,7 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored transforms: Optional[List[str]] = None, ) -> KMeans: @@ -125,17 +126,20 @@ def cluster_centers_(self) -> bpd.DataFrame: def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def detect_anomalies( - self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 + self, + X: Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series], + *, + contamination: float = 0.1, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. 
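An illustrative use of the widened input types above (utils.ArrayType also admits plain pandas objects), assuming a configured bigframes session; the column names and cluster count are placeholders, and the BigQuery-backed calls are left commented:

import pandas as pd

from bigframes.ml.cluster import KMeans

points = pd.DataFrame({"x": [1.0, 2.0, 10.0, 11.0], "y": [1.0, 2.0, 10.0, 11.0]})
model = KMeans(n_clusters=2)

# fit() accepts the pandas frame directly; predict() and detect_anomalies()
# convert their input using the fitted model's own session.
# model.fit(points)
# labels = model.predict(points)
# anomalies = model.detect_anomalies(points, contamination=0.1)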
@@ -156,7 +160,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"contamination": contamination} @@ -181,12 +185,12 @@ def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: def score( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.evaluate(X) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 08c9761cc3..27d9bfb4f4 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -28,6 +28,7 @@ from google.cloud import bigquery from bigframes.core import log_adapter +import bigframes.core.compile.googlesql as sql_utils from bigframes.ml import base, core, globals, impute, preprocessing, utils import bigframes.pandas as bpd @@ -98,16 +99,11 @@ class SQLScalarColumnTransformer: def __init__(self, sql: str, target_column: str = "transformed_{0}"): super().__init__() self._sql = sql + # TODO: More robust unescaping self._target_column = target_column.replace("`", "") PLAIN_COLNAME_RX = re.compile("^[a-z][a-z0-9_]*$", re.IGNORECASE) - def escape(self, colname: str): - colname = colname.replace("`", "") - if self.PLAIN_COLNAME_RX.match(colname): - return colname - return f"`{colname}`" - def _compile_to_sql( self, X: bpd.DataFrame, columns: Optional[Iterable[str]] = None ) -> List[str]: @@ -115,8 +111,10 @@ def _compile_to_sql( columns = X.columns result = [] for column in columns: - current_sql = self._sql.format(self.escape(column)) - current_target_column = self.escape(self._target_column.format(column)) + current_sql = self._sql.format(sql_utils.identifier(column)) + current_target_column = sql_utils.identifier( + self._target_column.format(column) + ) result.append(f"{current_sql} AS {current_target_column}") return result @@ -239,6 +237,7 @@ def camel_to_snake(name): transformers_set.add( ( camel_to_snake(transformer_cls.__name__), + # TODO: This is very fragile, use real SQL parser *transformer_cls._parse_from_sql(transform_sql), # type: ignore ) ) @@ -253,7 +252,7 @@ def camel_to_snake(name): target_column = transform_col_dict["name"] sql_transformer = SQLScalarColumnTransformer( - transform_sql, target_column=target_column + transform_sql.strip(), target_column=target_column ) input_column_name = f"?{target_column}" transformers_set.add( @@ -333,7 +332,7 @@ def _compile_to_sql( def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> ColumnTransformer: (X,) = utils.convert_to_dataframe(X) @@ -348,11 +347,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 02ccc9d6a5..4bc61c5015 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -47,8 +47,10 @@ class 
BqmlModel(BaseBqml): def __init__(self, session: bigframes.Session, model: bigquery.Model): self._session = session self._model = model + model_ref = self._model.reference + assert model_ref is not None self._model_manipulation_sql_generator = ml_sql.ModelManipulationSqlGenerator( - self.model_name + model_ref ) def _apply_ml_tvf( diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 41dea7617f..aaf06ef5c9 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -84,7 +84,7 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, transforms: Optional[List[str]] = None, ) -> PCA: @@ -129,16 +129,19 @@ def explained_variance_ratio_(self) -> bpd.DataFrame: ["principal_component_id", "explained_variance_ratio"] ] - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def detect_anomalies( - self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 + self, + X: utils.ArrayType, + *, + contamination: float = 0.1, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. @@ -159,7 +162,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"contamination": contamination} diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 0194d768b8..91c14e4336 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional import bigframes_vendored.sklearn.ensemble._forest import bigframes_vendored.xgboost.sklearn @@ -142,8 +142,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> XGBRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -158,24 +158,24 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): - X, y = utils.convert_to_dataframe(X, y) - if not self._bqml_model: raise RuntimeError("A model must be fitted before score") + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) + input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None ) @@ -291,8 +291,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, 
transforms: Optional[List[str]] = None, ) -> XGBClassifier: X, y = utils.convert_to_dataframe(X, y) @@ -305,22 +305,22 @@ def _fit( ) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None @@ -427,8 +427,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> RandomForestRegressor: X, y = utils.convert_to_dataframe(X, y) @@ -443,18 +443,18 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): """Calculate evaluation metrics of the model. @@ -476,7 +476,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None @@ -583,8 +583,8 @@ def _bqml_options(self) -> Dict[str, str | int | bool | float | List[str]]: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> RandomForestClassifier: X, y = utils.convert_to_dataframe(X, y) @@ -599,18 +599,18 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ): """Calculate evaluation metrics of the model. 
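The same session-aware conversion pattern applies to the ensemble models; a hedged usage sketch where the feature/label names are placeholders and the fit/score calls require a BigQuery-backed session, so they are commented:

import pandas as pd

from bigframes.ml.ensemble import XGBRegressor

X = pd.DataFrame({"feature": [1.0, 2.0, 3.0, 4.0]})
y = pd.DataFrame({"label": [2.0, 4.0, 6.0, 8.0]})
model = XGBRegressor()

# The fitted-model check now happens before conversion, and pandas inputs are
# converted with the fitted model's session:
# model.fit(X, y)
# metrics = model.score(X, y)  # joins X and y, then evaluates the model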
@@ -632,7 +632,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = ( X.join(y, how="outer") if (X is not None) and (y is not None) else None diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index a1ae8435d5..d27801caa3 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import List, Optional, Union +from typing import List, Optional from google.cloud import bigquery @@ -65,7 +65,7 @@ class ARIMAPlus(base.SupervisedTrainablePredictor): The data frequency of the input time series. Possible values are "auto_frequency", "per_minute", "hourly", "daily", "weekly", "monthly", "quarterly", "yearly" - include_drift (bool, defalut False): + include_drift (bool, default False): Determines whether the model should include a linear drift term or not. The drift term is applicable when non-seasonal d is 1. holiday_region (str or None, default None): @@ -180,8 +180,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ): """Fit the model to training data. @@ -276,14 +276,14 @@ def coef_( def detect_anomalies( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, anomaly_prob_threshold: float = 0.95, ) -> bpd.DataFrame: """Detect the anomaly data points of the input. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to detect anomalies. anomaly_prob_threshold (float, default 0.95): Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95. @@ -298,7 +298,7 @@ def detect_anomalies( if not self._bqml_model: raise RuntimeError("A model must be fitted before detect_anomalies") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.detect_anomalies( X, options={"anomaly_prob_threshold": anomaly_prob_threshold} @@ -306,8 +306,8 @@ def detect_anomalies( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: """Calculate evaluation metrics of the model. @@ -318,11 +318,11 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame only contains 1 column as evaluation timestamp. The timestamp must be within the horizon of the model, which by default is 1000 data points. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame only contains 1 column as evaluation numeric values. 
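An illustrative shape for the ARIMAPlus score() inputs described above: one timestamp column in X and one numeric column in y, both acceptable as plain pandas frames. The column names are placeholders, and the call is left commented because it needs an already-fitted model:

import pandas as pd

X_eval = pd.DataFrame(
    {"parsed_date": pd.to_datetime(["2017-08-01", "2017-08-02", "2017-08-03"])}
)
y_eval = pd.DataFrame({"total_visits": [2500, 2675, 2590]})

# model is an already-fitted bigframes.ml.forecasting.ARIMAPlus instance
# metrics = model.score(X_eval, y_eval)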
@@ -331,7 +331,7 @@ def score( """ if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index cb8fe7a96e..dfee12f523 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast, Mapping, Optional, Union +from typing import cast, Mapping, Optional from google.cloud import bigquery @@ -64,11 +64,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame. Schema is defined by the model. Returns: @@ -143,11 +143,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series. Schema is defined by the model. Returns: @@ -159,7 +159,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: self._bqml_model = self._create_bqml_model() self._bqml_model = cast(core.BqmlModel, self._bqml_model) - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) @@ -259,11 +259,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series. Schema is defined by the model. 
Returns: @@ -275,7 +275,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: self._bqml_model = self._create_bqml_model() self._bqml_model = cast(core.BqmlModel, self._bqml_model) - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) diff --git a/bigframes/ml/impute.py b/bigframes/ml/impute.py index 4955eb5de5..37b9849b4f 100644 --- a/bigframes/ml/impute.py +++ b/bigframes/ml/impute.py @@ -18,7 +18,7 @@ from __future__ import annotations import typing -from typing import Iterable, List, Literal, Optional, Union +from typing import Iterable, List, Literal, Optional import bigframes_vendored.sklearn.impute._base @@ -80,11 +80,11 @@ def _parse_from_sql(cls, sql: str) -> tuple[SimpleImputer, str]: tuple(SimpleImputer, column_label)""" s = sql[sql.find("(") + 1 : sql.find(")")] col_label, strategy = s.split(", ") - return cls(strategy[1:-1]), col_label # type: ignore[arg-type] + return cls(strategy[1:-1]), _unescape_id(col_label) # type: ignore[arg-type] def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> SimpleImputer: (X,) = utils.convert_to_dataframe(X) @@ -99,14 +99,22 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( bpd.DataFrame, df[self._output_names], ) + + +def _unescape_id(id: str) -> str: + """Very simple conversion to removed ` characters from ids. + + A proper sql parser should be used instead. 
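A standalone sketch of the backtick-trimming helper described above; as its docstring notes, a real SQL parser would be more robust, and this only strips a single outer pair:

def unescape_id(id: str) -> str:
    # Strip one surrounding pair of backticks, if present.
    return id.removeprefix("`").removesuffix("`")

assert unescape_id("`my column`") == "my column"
assert unescape_id("plain_col") == "plain_col"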
+ """ + return id.removeprefix("`").removesuffix("`") diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index 8fe1d6ec27..5665507286 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -128,8 +128,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> LinearRegression: X, y = utils.convert_to_dataframe(X, y) @@ -142,7 +142,7 @@ def _fit( ) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") @@ -152,13 +152,13 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) @@ -280,8 +280,8 @@ def _bqml_options(self) -> dict: def _fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, transforms: Optional[List[str]] = None, ) -> LogisticRegression: """Fit model with transforms.""" @@ -297,24 +297,24 @@ def _fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) return self._bqml_model.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) input_data = X.join(y, how="outer") return self._bqml_model.evaluate(input_data) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 3920da6c71..5c5153e163 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import cast, Literal, Optional, Union +from typing import cast, Literal, Optional import warnings import bigframes_vendored.constants as constants @@ -58,13 +58,17 @@ _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" _GEMINI_1P5_PRO_001_ENDPOINT = "gemini-1.5-pro-001" +_GEMINI_1P5_PRO_002_ENDPOINT = "gemini-1.5-pro-002" _GEMINI_1P5_FLASH_001_ENDPOINT = "gemini-1.5-flash-001" +_GEMINI_1P5_FLASH_002_ENDPOINT = "gemini-1.5-flash-002" _GEMINI_ENDPOINTS = ( _GEMINI_PRO_ENDPOINT, _GEMINI_1P5_PRO_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT, _GEMINI_1P5_PRO_001_ENDPOINT, + _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_001_ENDPOINT, + _GEMINI_1P5_FLASH_002_ENDPOINT, ) _CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet" @@ -214,8 +218,8 @@ def _bqml_options(self) -> dict: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - 
y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> PaLM2TextGenerator: """Fine tune PaLM2TextGenerator model. @@ -227,9 +231,9 @@ def fit( (https://cloud.google.com/products#product-launch-stages). Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series: + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Training labels. Returns: @@ -251,7 +255,7 @@ def fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, temperature: float = 0.0, max_output_tokens: int = 128, @@ -261,7 +265,7 @@ def predict( """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. @@ -323,7 +327,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -350,8 +354,8 @@ def predict( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, task_type: Literal[ "text_generation", "classification", "summarization", "question_answering" ] = "text_generation", @@ -372,10 +376,10 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation data, which contains only one column of input_text that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. task_type (str): @@ -388,7 +392,7 @@ def score( if not self._bqml_model: raise RuntimeError("A model must be fitted before score") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) if len(X.columns) != 1 or len(y.columns) != 1: raise ValueError( @@ -538,11 +542,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. Returns: @@ -550,7 +554,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -694,11 +698,11 @@ def _from_bq( model._bqml_model = core.BqmlModel(session, bq_model) return model - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. Returns: @@ -706,7 +710,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: """ # Params reference: https://cloud.google.com/vertex-ai/docs/generative-ai/learn/models - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -749,7 +753,7 @@ class GeminiTextGenerator(base.BaseEstimator): Args: model_name (str, Default to "gemini-pro"): - The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001" and "gemini-1.5-flash-001". Default to "gemini-pro". + The model for natural language tasks. Accepted values are "gemini-pro", "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", "gemini-1.5-pro-002", "gemini-1.5-flash-001" and "gemini-1.5-flash-002". Default to "gemini-pro". .. note:: "gemini-1.5-pro-preview-0514" and "gemini-1.5-flash-preview-0514" is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the @@ -775,7 +779,9 @@ def __init__( "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", + "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", ] = "gemini-pro", session: Optional[bigframes.Session] = None, connection_name: Optional[str] = None, @@ -861,8 +867,8 @@ def _bqml_options(self) -> dict: def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, ) -> GeminiTextGenerator: """Fine tune GeminiTextGenerator model. Only support "gemini-pro" model for now. @@ -901,7 +907,7 @@ def fit( def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, temperature: float = 0.9, max_output_tokens: int = 8192, @@ -911,7 +917,7 @@ def predict( """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. 
If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. @@ -955,7 +961,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name @@ -982,8 +988,8 @@ def predict( def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, + y: utils.ArrayType, task_type: Literal[ "text_generation", "classification", "summarization", "question_answering" ] = "text_generation", @@ -1004,10 +1010,10 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation data, which contains only one column of input_text that contains the prompt text to use when evaluating the model. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): A BigQuery DataFrame as evaluation labels, which contains only one column of output_text that you would expect to be returned by the model. task_type (str): @@ -1024,7 +1030,7 @@ def score( if self._bqml_model.model_name.startswith("gemini-1.5"): raise NotImplementedError("Score is not supported for gemini-1.5 model.") - X, y = utils.convert_to_dataframe(X, y) + X, y = utils.convert_to_dataframe(X, y, session=self._bqml_model.session) if len(X.columns) != 1 or len(y.columns) != 1: raise ValueError( @@ -1074,7 +1080,7 @@ class Claude3TextGenerator(base.BaseEstimator): .. note:: - The models only availabe in specific regions. Check https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#regions for details. + The models only available in specific regions. Check https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-claude#regions for details. Args: model_name (str, Default to "claude-3-sonnet"): @@ -1189,7 +1195,7 @@ def _bqml_options(self) -> dict: def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, *, max_output_tokens: int = 128, top_k: int = 40, @@ -1198,7 +1204,7 @@ def predict( """Predict the result from input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. 
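An illustrative use of the newly registered Gemini 1.5 "002" endpoints above; creating the remote model and calling predict both require a configured bigframes session and BigQuery connection, so those steps are sketched as comments:

import pandas as pd

from bigframes.ml.llm import GeminiTextGenerator

prompts = pd.DataFrame(
    {"prompt": ["Summarize BigQuery DataFrames in one sentence."]}
)

# model = GeminiTextGenerator(model_name="gemini-1.5-flash-002")
# pandas input is accepted and converted into the model's session:
# result = model.predict(prompts, temperature=0.2, max_output_tokens=128)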
@@ -1238,7 +1244,7 @@ def predict( if top_p < 0.0 or top_p > 1.0: raise ValueError(f"top_p must be [0.0, 1.0], but is {top_p}.") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) if len(X.columns) == 1: # BQML identified the column by name diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index de9681660e..0ebf65b893 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -64,7 +64,9 @@ llm._GEMINI_1P5_PRO_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_PRO_001_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_1P5_PRO_002_ENDPOINT: llm.GeminiTextGenerator, llm._GEMINI_1P5_FLASH_001_ENDPOINT: llm.GeminiTextGenerator, + llm._GEMINI_1P5_FLASH_002_ENDPOINT: llm.GeminiTextGenerator, llm._CLAUDE_3_HAIKU_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_SONNET_ENDPOINT: llm.Claude3TextGenerator, llm._CLAUDE_3_5_SONNET_ENDPOINT: llm.Claude3TextGenerator, diff --git a/bigframes/ml/model_selection.py b/bigframes/ml/model_selection.py index f9d7e6cf73..8fc0095931 100644 --- a/bigframes/ml/model_selection.py +++ b/bigframes/ml/model_selection.py @@ -23,6 +23,7 @@ import bigframes_vendored.sklearn.model_selection._split as vendored_model_selection_split import bigframes_vendored.sklearn.model_selection._validation as vendored_model_selection_validation +import pandas as pd from bigframes.core import log_adapter from bigframes.ml import utils @@ -30,7 +31,7 @@ def train_test_split( - *arrays: Union[bpd.DataFrame, bpd.Series], + *arrays: utils.ArrayType, test_size: Union[float, None] = None, train_size: Union[float, None] = None, random_state: Union[int, None] = None, @@ -125,9 +126,9 @@ def get_n_splits(self) -> int: def split( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series, None] = None, - ) -> Generator[tuple[Union[bpd.DataFrame, bpd.Series, None]], None, None]: + X: utils.ArrayType, + y: Union[utils.ArrayType, None] = None, + ) -> Generator[tuple[Union[bpd.DataFrame, bpd.Series, None], ...], None, None]: X_df = next(utils.convert_to_dataframe(X)) y_df_or = next(utils.convert_to_dataframe(y)) if y is not None else None joined_df = X_df.join(y_df_or, how="outer") if y_df_or is not None else X_df @@ -146,15 +147,35 @@ def split( X_test = test_df[X_df.columns] y_test = test_df[y_df_or.columns] if y_df_or is not None else None - yield utils.convert_to_types( - [X_train, X_test, y_train, y_test], [X, X, y, y] + yield ( + KFold._convert_to_bf_type(X_train, X), + KFold._convert_to_bf_type(X_test, X), + KFold._convert_to_bf_type(y_train, y), + KFold._convert_to_bf_type(y_test, y), ) + @staticmethod + def _convert_to_bf_type( + input, + type_instance: Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series, None], + ) -> Union[bpd.DataFrame, bpd.Series, None]: + if isinstance(type_instance, pd.Series) or isinstance( + type_instance, bpd.Series + ): + return next(utils.convert_to_series(input)) + + if isinstance(type_instance, pd.DataFrame) or isinstance( + type_instance, bpd.DataFrame + ): + return next(utils.convert_to_dataframe(input)) + + return None + def cross_validate( estimator, - X: Union[bpd.DataFrame, bpd.Series], - y: Union[bpd.DataFrame, bpd.Series, None] = None, + X: utils.ArrayType, + y: Union[utils.ArrayType, None] = None, *, cv: Optional[Union[int, KFold]] = None, ) -> dict[str, list]: diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index dc3bd1f3f4..4313a05acf 100644 --- 
a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -18,7 +18,7 @@ from __future__ import annotations -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple import bigframes_vendored.constants as constants import bigframes_vendored.sklearn.pipeline @@ -101,8 +101,8 @@ def _from_bq(cls, session: bigframes.Session, bq_model: bigquery.Model) -> Pipel def fit( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.BigFramesArrayType, + y: Optional[utils.BigFramesArrayType] = None, ) -> Pipeline: (X,) = utils.convert_to_dataframe(X) @@ -115,13 +115,13 @@ def fit( self._estimator._fit(X=X, y=y, transforms=transform_sqls) return self - def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def predict(self, X: utils.ArrayType) -> bpd.DataFrame: return self._estimator.predict(X) def score( self, - X: Union[bpd.DataFrame, bpd.Series], - y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + X: utils.BigFramesArrayType, + y: Optional[utils.BigFramesArrayType] = None, ) -> bpd.DataFrame: (X,) = utils.convert_to_dataframe(X) if y is not None: diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 2c327f63f8..94b3a601d4 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -76,11 +76,11 @@ def _parse_from_sql(cls, sql: str) -> tuple[StandardScaler, str]: Returns: tuple(StandardScaler, column_label)""" col_label = sql[sql.find("(") + 1 : sql.find(")")] - return cls(), col_label + return cls(), _unescape_id(col_label) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> StandardScaler: (X,) = utils.convert_to_dataframe(X) @@ -95,11 +95,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -152,12 +152,13 @@ def _parse_from_sql(cls, sql: str) -> tuple[MaxAbsScaler, str]: Returns: tuple(MaxAbsScaler, column_label)""" + # TODO: Use real sql parser col_label = sql[sql.find("(") + 1 : sql.find(")")] - return cls(), col_label + return cls(), _unescape_id(col_label) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> MaxAbsScaler: (X,) = utils.convert_to_dataframe(X) @@ -172,11 +173,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -229,12 +230,13 @@ def _parse_from_sql(cls, sql: str) -> tuple[MinMaxScaler, str]: Returns: tuple(MinMaxScaler, column_label)""" + # TODO: Use real sql parser col_label = sql[sql.find("(") + 1 : sql.find(")")] - return cls(), col_label + return cls(), _unescape_id(col_label) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> MinMaxScaler: (X,) = utils.convert_to_dataframe(X) @@ -249,11 +251,11 @@ def fit( 
self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -349,15 +351,15 @@ def _parse_from_sql(cls, sql: str) -> tuple[KBinsDiscretizer, str]: if sql.startswith("ML.QUANTILE_BUCKETIZE"): num_bins = s.split(",")[1] - return cls(int(num_bins), "quantile"), col_label + return cls(int(num_bins), "quantile"), _unescape_id(col_label) else: array_split_points = s[s.find("[") + 1 : s.find("]")] n_bins = array_split_points.count(",") + 2 - return cls(n_bins, "uniform"), col_label + return cls(n_bins, "uniform"), _unescape_id(col_label) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> KBinsDiscretizer: (X,) = utils.convert_to_dataframe(X) @@ -372,11 +374,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -469,11 +471,11 @@ def _parse_from_sql(cls, sql: str) -> tuple[OneHotEncoder, str]: max_categories = int(top_k) + 1 min_frequency = int(frequency_threshold) - return cls(drop, min_frequency, max_categories), col_label + return cls(drop, min_frequency, max_categories), _unescape_id(col_label) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> OneHotEncoder: (X,) = utils.convert_to_dataframe(X) @@ -488,11 +490,11 @@ def fit( self._extract_output_names() return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -578,11 +580,11 @@ def _parse_from_sql(cls, sql: str) -> tuple[LabelEncoder, str]: max_categories = int(top_k) + 1 min_frequency = int(frequency_threshold) - return cls(min_frequency, max_categories), col_label + return cls(min_frequency, max_categories), _unescape_id(col_label) def fit( self, - y: Union[bpd.DataFrame, bpd.Series], + y: utils.ArrayType, ) -> LabelEncoder: (y,) = utils.convert_to_dataframe(y) @@ -596,11 +598,11 @@ def fit( self._extract_output_names() return self - def transform(self, y: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, y: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (y,) = utils.convert_to_dataframe(y) + (y,) = utils.convert_to_dataframe(y, session=self._bqml_model.session) df = self._bqml_model.transform(y) return typing.cast( @@ -661,11 +663,11 @@ def _parse_from_sql(cls, sql: str) -> tuple[PolynomialFeatures, tuple[str, ...]] col_labels = sql[sql.find("STRUCT(") + 7 : sql.find(")")].split(",") col_labels = [label.strip() for label in col_labels] degree = int(sql[sql.rfind(",") + 1 : 
sql.rfind(")")]) - return cls(degree), tuple(col_labels) + return cls(degree), tuple(map(_unescape_id, col_labels)) def fit( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, y=None, # ignored ) -> PolynomialFeatures: (X,) = utils.convert_to_dataframe(X) @@ -681,11 +683,11 @@ def fit( return self - def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: + def transform(self, X: utils.ArrayType) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("Must be fitted before transform") - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.transform(X) return typing.cast( @@ -694,6 +696,14 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: ) +def _unescape_id(id: str) -> str: + """Very simple conversion to removed ` characters from ids. + + A proper sql parser should be used instead. + """ + return id.removeprefix("`").removesuffix("`") + + PreprocessingType = Union[ OneHotEncoder, StandardScaler, diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index 8fb6d9db4c..05e6354f9f 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -16,7 +16,7 @@ from __future__ import annotations -from typing import Mapping, Optional, Union +from typing import Mapping, Optional import warnings import bigframes @@ -121,19 +121,19 @@ def standardize_type(v: str): def predict( self, - X: Union[bpd.DataFrame, bpd.Series], + X: utils.ArrayType, ) -> bpd.DataFrame: """Predict the result from the input DataFrame. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Input DataFrame or Series, which needs to comply with the input parameter of the model. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. 
""" - (X,) = utils.convert_to_dataframe(X) + (X,) = utils.convert_to_dataframe(X, session=self._bqml_model.session) df = self._bqml_model.predict(X) diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 1cb327f19c..b7d550ac63 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -21,6 +21,9 @@ import bigframes_vendored.constants as constants import google.cloud.bigquery +import bigframes.core.compile.googlesql as sql_utils +import bigframes.core.sql as sql_vals + # TODO: Add proper escaping logic from core/compile module class BaseSqlGenerator: @@ -29,10 +32,8 @@ class BaseSqlGenerator: # General methods def encode_value(self, v: Union[str, int, float, Iterable[str]]) -> str: """Encode a parameter value for SQL""" - if isinstance(v, str): - return f'"{v}"' - elif isinstance(v, int) or isinstance(v, float): - return f"{v}" + if isinstance(v, (str, int, float)): + return sql_vals.simple_literal(v) elif isinstance(v, Iterable): inner = ", ".join([self.encode_value(x) for x in v]) return f"[{inner}]" @@ -50,7 +51,10 @@ def build_parameters(self, **kwargs: Union[str, int, float, Iterable[str]]) -> s def build_structs(self, **kwargs: Union[int, float]) -> str: """Encode a dict of values into a formatted STRUCT items for SQL""" indent_str = " " - param_strs = [f"{v} AS {k}" for k, v in kwargs.items()] + param_strs = [ + f"{sql_vals.simple_literal(v)} AS {sql_utils.identifier(k)}" + for k, v in kwargs.items() + ] return "\n" + indent_str + f",\n{indent_str}".join(param_strs) def build_expressions(self, *expr_sqls: str) -> str: @@ -61,7 +65,7 @@ def build_expressions(self, *expr_sqls: str) -> str: def build_schema(self, **kwargs: str) -> str: """Encode a dict of values into a formatted schema type items for SQL""" indent_str = " " - param_strs = [f"{k} {v}" for k, v in kwargs.items()] + param_strs = [f"{sql_utils.identifier(k)} {v}" for k, v in kwargs.items()] return "\n" + indent_str + f",\n{indent_str}".join(param_strs) def options(self, **kwargs: Union[str, int, float, Iterable[str]]) -> str: @@ -74,7 +78,7 @@ def struct_options(self, **kwargs: Union[int, float]) -> str: def struct_columns(self, columns: Iterable[str]) -> str: """Encode a BQ Table columns to a STRUCT.""" - columns_str = ", ".join(columns) + columns_str = ", ".join(map(sql_utils.identifier, columns)) return f"STRUCT({columns_str})" def input(self, **kwargs: str) -> str: @@ -97,30 +101,30 @@ def transform(self, *expr_sqls: str) -> str: def ml_standard_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.STANDARD_SCALER for BQML""" - return f"""ML.STANDARD_SCALER({numeric_expr_sql}) OVER() AS {name}""" + return f"""ML.STANDARD_SCALER({sql_utils.identifier(numeric_expr_sql)}) OVER() AS {sql_utils.identifier(name)}""" def ml_max_abs_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MAX_ABS_SCALER for BQML""" - return f"""ML.MAX_ABS_SCALER({numeric_expr_sql}) OVER() AS {name}""" + return f"""ML.MAX_ABS_SCALER({sql_utils.identifier(numeric_expr_sql)}) OVER() AS {sql_utils.identifier(name)}""" def ml_min_max_scaler(self, numeric_expr_sql: str, name: str) -> str: """Encode ML.MIN_MAX_SCALER for BQML""" - return f"""ML.MIN_MAX_SCALER({numeric_expr_sql}) OVER() AS {name}""" + return f"""ML.MIN_MAX_SCALER({sql_utils.identifier(numeric_expr_sql)}) OVER() AS {sql_utils.identifier(name)}""" def ml_imputer( self, - expr_sql: str, + col_name: str, strategy: str, name: str, ) -> str: """Encode ML.IMPUTER for BQML""" - return f"""ML.IMPUTER({expr_sql}, '{strategy}') OVER() AS {name}""" + return 
f"""ML.IMPUTER({sql_utils.identifier(col_name)}, '{strategy}') OVER() AS {sql_utils.identifier(name)}""" def ml_bucketize( self, - numeric_expr_sql: str, + input_id: str, array_split_points: Iterable[Union[int, float]], - name: str, + output_id: str, ) -> str: """Encode ML.BUCKETIZE for BQML""" # Use Python value rather than Numpy value to serialization. @@ -128,7 +132,7 @@ def ml_bucketize( point.item() if hasattr(point, "item") else point for point in array_split_points ] - return f"""ML.BUCKETIZE({numeric_expr_sql}, {points}, FALSE) AS {name}""" + return f"""ML.BUCKETIZE({sql_utils.identifier(input_id)}, {points}, FALSE) AS {sql_utils.identifier(output_id)}""" def ml_quantile_bucketize( self, @@ -137,7 +141,7 @@ def ml_quantile_bucketize( name: str, ) -> str: """Encode ML.QUANTILE_BUCKETIZE for BQML""" - return f"""ML.QUANTILE_BUCKETIZE({numeric_expr_sql}, {num_bucket}) OVER() AS {name}""" + return f"""ML.QUANTILE_BUCKETIZE({sql_utils.identifier(numeric_expr_sql)}, {num_bucket}) OVER() AS {sql_utils.identifier(name)}""" def ml_one_hot_encoder( self, @@ -149,7 +153,7 @@ def ml_one_hot_encoder( ) -> str: """Encode ML.ONE_HOT_ENCODER for BQML. https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder for params.""" - return f"""ML.ONE_HOT_ENCODER({numeric_expr_sql}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {name}""" + return f"""ML.ONE_HOT_ENCODER({sql_utils.identifier(numeric_expr_sql)}, '{drop}', {top_k}, {frequency_threshold}) OVER() AS {sql_utils.identifier(name)}""" def ml_label_encoder( self, @@ -160,14 +164,14 @@ def ml_label_encoder( ) -> str: """Encode ML.LABEL_ENCODER for BQML. https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-label-encoder for params.""" - return f"""ML.LABEL_ENCODER({numeric_expr_sql}, {top_k}, {frequency_threshold}) OVER() AS {name}""" + return f"""ML.LABEL_ENCODER({sql_utils.identifier(numeric_expr_sql)}, {top_k}, {frequency_threshold}) OVER() AS {sql_utils.identifier(name)}""" def ml_polynomial_expand( self, columns: Iterable[str], degree: int, name: str ) -> str: """Encode ML.POLYNOMIAL_EXPAND. https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-polynomial-expand""" - return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {name}""" + return f"""ML.POLYNOMIAL_EXPAND({self.struct_columns(columns)}, {degree}) AS {sql_utils.identifier(name)}""" def ml_distance( self, @@ -179,7 +183,7 @@ def ml_distance( ) -> str: """Encode ML.DISTANCE for BQML. https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance""" - return f"""SELECT *, ML.DISTANCE({col_x}, {col_y}, '{type}') AS {name} FROM ({source_sql})""" + return f"""SELECT *, ML.DISTANCE({sql_utils.identifier(col_x)}, {sql_utils.identifier(col_y)}, '{type}') AS {sql_utils.identifier(name)} FROM ({source_sql})""" class ModelCreationSqlGenerator(BaseSqlGenerator): @@ -189,7 +193,7 @@ def _model_id_sql( self, model_ref: google.cloud.bigquery.ModelReference, ): - return f"`{model_ref.project}`.`{model_ref.dataset_id}`.`{model_ref.model_id}`" + return f"{sql_utils.identifier(model_ref.project)}.{sql_utils.identifier(model_ref.dataset_id)}.{sql_utils.identifier(model_ref.model_id)}" # Model create and alter def create_model( @@ -276,8 +280,11 @@ def create_xgboost_imported_model( class ModelManipulationSqlGenerator(BaseSqlGenerator): """Sql generator for manipulating a model entity. 
Model name is the full model path of project_id.dataset_id.model_id.""" - def __init__(self, model_name: str): - self._model_name = model_name + def __init__(self, model_ref: google.cloud.bigquery.ModelReference): + self._model_ref = model_ref + + def _model_ref_sql(self) -> str: + return f"{sql_utils.identifier(self._model_ref.project)}.{sql_utils.identifier(self._model_ref.dataset_id)}.{sql_utils.identifier(self._model_ref.model_id)}" # Alter model def alter_model( @@ -287,20 +294,20 @@ def alter_model( """Encode the ALTER MODEL statement for BQML""" options_sql = self.options(**options) - parts = [f"ALTER MODEL `{self._model_name}`"] + parts = [f"ALTER MODEL {self._model_ref_sql()}"] parts.append(f"SET {options_sql}") return "\n".join(parts) # ML prediction TVFs def ml_predict(self, source_sql: str) -> str: """Encode ML.PREDICT for BQML""" - return f"""SELECT * FROM ML.PREDICT(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.PREDICT(MODEL {self._model_ref_sql()}, ({source_sql}))""" def ml_forecast(self, struct_options: Mapping[str, Union[int, float]]) -> str: """Encode ML.FORECAST for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.FORECAST(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.FORECAST(MODEL {self._model_ref_sql()}, {struct_options_sql})""" def ml_generate_text( @@ -308,7 +315,7 @@ def ml_generate_text( ) -> str: """Encode ML.GENERATE_TEXT for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.GENERATE_TEXT(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" def ml_generate_embedding( @@ -316,7 +323,7 @@ def ml_generate_embedding( ) -> str: """Encode ML.GENERATE_EMBEDDING for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.GENERATE_EMBEDDING(MODEL {self._model_ref_sql()}, ({source_sql}), {struct_options_sql})""" def ml_detect_anomalies( @@ -324,51 +331,51 @@ def ml_detect_anomalies( ) -> str: """Encode ML.DETECT_ANOMALIES for BQML""" struct_options_sql = self.struct_options(**struct_options) - return f"""SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.DETECT_ANOMALIES(MODEL {self._model_ref_sql()}, {struct_options_sql}, ({source_sql}))""" # ML evaluation TVFs def ml_evaluate(self, source_sql: Optional[str] = None) -> str: """Encode ML.EVALUATE for BQML""" if source_sql is None: - return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`)""" + return f"""SELECT * FROM ML.EVALUATE(MODEL {self._model_ref_sql()})""" else: - return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.EVALUATE(MODEL {self._model_ref_sql()}, ({source_sql}))""" def ml_arima_coefficients(self) -> str: """Encode ML.ARIMA_COEFFICIENTS for BQML""" - return f"""SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL `{self._model_name}`)""" + return f"""SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL {self._model_ref_sql()})""" # ML evaluation TVFs def ml_llm_evaluate(self, source_sql: str, task_type: Optional[str] = None) -> str: """Encode ML.EVALUATE for BQML""" # Note: don't need index as evaluate returns a new table - return f"""SELECT * FROM ML.EVALUATE(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.EVALUATE(MODEL {self._model_ref_sql()}, ({source_sql}), STRUCT("{task_type}" AS task_type))""" # ML 
evaluation TVFs def ml_arima_evaluate(self, show_all_candidate_models: bool = False) -> str: """Encode ML.ARMIA_EVALUATE for BQML""" - return f"""SELECT * FROM ML.ARIMA_EVALUATE(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.ARIMA_EVALUATE(MODEL {self._model_ref_sql()}, STRUCT({show_all_candidate_models} AS show_all_candidate_models))""" def ml_centroids(self) -> str: """Encode ML.CENTROIDS for BQML""" - return f"""SELECT * FROM ML.CENTROIDS(MODEL `{self._model_name}`)""" + return f"""SELECT * FROM ML.CENTROIDS(MODEL {self._model_ref_sql()})""" def ml_principal_components(self) -> str: """Encode ML.PRINCIPAL_COMPONENTS for BQML""" - return f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `{self._model_name}`)""" + return ( + f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL {self._model_ref_sql()})""" + ) def ml_principal_component_info(self) -> str: """Encode ML.PRINCIPAL_COMPONENT_INFO for BQML""" - return ( - f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{self._model_name}`)""" - ) + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL {self._model_ref_sql()})""" # ML transform TVF, that require a transform_only type model def ml_transform(self, source_sql: str) -> str: """Encode ML.TRANSFORM for BQML""" - return f"""SELECT * FROM ML.TRANSFORM(MODEL `{self._model_name}`, + return f"""SELECT * FROM ML.TRANSFORM(MODEL {self._model_ref_sql()}, ({source_sql}))""" diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index 96f0bc31e9..bdca45e457 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -13,37 +13,75 @@ # limitations under the License. import typing -from typing import Any, Generator, Iterable, Literal, Mapping, Optional, Union +from typing import Any, Generator, Literal, Mapping, Optional, Union import bigframes_vendored.constants as constants from google.cloud import bigquery +import pandas as pd from bigframes.core import blocks import bigframes.pandas as bpd +from bigframes.session import Session # Internal type alias -ArrayType = Union[bpd.DataFrame, bpd.Series] +ArrayType = Union[bpd.DataFrame, bpd.Series, pd.DataFrame, pd.Series] +BigFramesArrayType = Union[bpd.DataFrame, bpd.Series] -def convert_to_dataframe(*input: ArrayType) -> Generator[bpd.DataFrame, None, None]: - return (_convert_to_dataframe(frame) for frame in input) +def convert_to_dataframe( + *input: ArrayType, + session: Optional[Session] = None, +) -> Generator[bpd.DataFrame, None, None]: + """Converts the input to BigFrames DataFrame. + Args: + session: + The session to convert local pandas instances to BigFrames counter-parts. + It is not used if the input itself is already a BigFrame data frame or series. -def _convert_to_dataframe(frame: ArrayType) -> bpd.DataFrame: + """ + return (_convert_to_dataframe(frame, session) for frame in input) + + +def _convert_to_dataframe( + frame: ArrayType, session: Optional[Session] = None +) -> bpd.DataFrame: if isinstance(frame, bpd.DataFrame): return frame if isinstance(frame, bpd.Series): return frame.to_frame() + if isinstance(frame, pd.DataFrame): + if session is None: + return bpd.read_pandas(frame) + else: + return session.read_pandas(frame) + if isinstance(frame, pd.Series): + if session is None: + return bpd.read_pandas(frame).to_frame() + else: + return session.read_pandas(frame).to_frame() raise ValueError( f"Unsupported type {type(frame)} to convert to DataFrame. 
{constants.FEEDBACK_LINK}" ) -def convert_to_series(*input: ArrayType) -> Generator[bpd.Series, None, None]: - return (_convert_to_series(frame) for frame in input) +def convert_to_series( + *input: ArrayType, session: Optional[Session] = None +) -> Generator[bpd.Series, None, None]: + """Converts the input to BigFrames Series. + + Args: + session: + The session to convert local pandas instances to BigFrames counter-parts. + It is not used if the input itself is already a BigFrame data frame or series. + """ + return (_convert_to_series(frame, session) for frame in input) -def _convert_to_series(frame: ArrayType) -> bpd.Series: + +def _convert_to_series( + frame: ArrayType, session: Optional[Session] = None +) -> bpd.Series: if isinstance(frame, bpd.DataFrame): if len(frame.columns) != 1: raise ValueError( @@ -55,44 +93,22 @@ def _convert_to_series(frame: ArrayType) -> bpd.Series: return typing.cast(bpd.Series, frame[label]) if isinstance(frame, bpd.Series): return frame + if isinstance(frame, pd.DataFrame): + # Recursively call this method to re-use the length-checking logic + if session is None: + return _convert_to_series(bpd.read_pandas(frame)) + else: + return _convert_to_series(session.read_pandas(frame), session) + if isinstance(frame, pd.Series): + if session is None: + return bpd.read_pandas(frame) + else: + return session.read_pandas(frame) raise ValueError( f"Unsupported type {type(frame)} to convert to Series. {constants.FEEDBACK_LINK}" ) -def convert_to_types( - inputs: Iterable[Union[ArrayType, None]], - type_instances: Iterable[Union[ArrayType, None]], -) -> tuple[Union[ArrayType, None]]: - """Convert the DF, Series and None types of the input to corresponding type_instances types.""" - results = [] - for input, type_instance in zip(inputs, type_instances): - results.append(_convert_to_type(input, type_instance)) - return tuple(results) - - -def _convert_to_type( - input: Union[ArrayType, None], type_instance: Union[ArrayType, None] -): - if type_instance is None: - if input is not None: - raise ValueError( - f"Trying to convert not None type to None. {constants.FEEDBACK_LINK}" - ) - return None - if input is None: - raise ValueError( - f"Trying to convert None type to not None. {constants.FEEDBACK_LINK}" - ) - if isinstance(type_instance, bpd.DataFrame): - return _convert_to_dataframe(input) - if isinstance(type_instance, bpd.Series): - return _convert_to_series(input) - raise ValueError( - f"Unsupport converting to {type(type_instance)}. {constants.FEEDBACK_LINK}" - ) - - def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: """Parse model endpoint string to model_name and version.""" model_name = model_endpoint diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 9ff7ea38b2..3e0be74889 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -17,17 +17,21 @@ import typing from typing import List, Optional -import bigframes -import bigframes.core.guid +import numpy as np + +import bigframes.core.guid as guid import bigframes.dtypes as dtypes class Semantics: def __init__(self, df) -> None: + import bigframes + import bigframes.dataframe + if not bigframes.options.experiments.semantic_operators: raise NotImplementedError() - self._df = df + self._df: bigframes.dataframe.DataFrame = df def agg( self, @@ -94,12 +98,21 @@ def agg( ValueError: when the instruction refers to a non-existing column, or when more than one columns are referred to. 
""" - self._validate_model(model) + import bigframes.bigquery as bbq + import bigframes.dataframe + import bigframes.series + self._validate_model(model) columns = self._parse_columns(instruction) + + df: bigframes.dataframe.DataFrame = self._df.copy() for column in columns: if column not in self._df.columns: raise ValueError(f"Column {column} not found.") + + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + if len(columns) > 1: raise NotImplementedError( "Semantic aggregations are limited to a single column." @@ -112,11 +125,6 @@ def agg( "It must be greater than 1." ) - import bigframes.bigquery as bbq - import bigframes.dataframe - import bigframes.series - - df: bigframes.dataframe.DataFrame = self._df.copy() user_instruction = self._format_instruction(instruction, columns) num_cluster = 1 @@ -130,15 +138,15 @@ def agg( f"{type(df[cluster_column])}" ) - num_cluster = len(df[cluster_column].unique()) + num_cluster = df[cluster_column].unique().shape[0] df = df.sort_values(cluster_column) else: - cluster_column = bigframes.core.guid.generate_guid("pid") + cluster_column = guid.generate_guid("pid") df[cluster_column] = 0 - aggregation_group_id = bigframes.core.guid.generate_guid("agg") - group_row_index = bigframes.core.guid.generate_guid("gid") - llm_prompt = bigframes.core.guid.generate_guid("prompt") + aggregation_group_id = guid.generate_guid("agg") + group_row_index = guid.generate_guid("gid") + llm_prompt = guid.generate_guid("prompt") df = ( df.reset_index(drop=True) .reset_index() @@ -183,7 +191,7 @@ def agg( # Run model predict_df = typing.cast( - bigframes.dataframe.DataFrame, model.predict(prompt_s) + bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0) ) agg_df[column] = predict_df["ml_generate_text_llm_result"].combine_first( single_row_df @@ -221,7 +229,7 @@ def cluster_by( >>> df = bpd.DataFrame({ ... "Product": ["Smartphone", "Laptop", "T-shirt", "Jeans"], ... }) - >>> df.semantics.cluster_by("Product", "Cluster ID", model, n_clusters=2) + >>> df.semantics.cluster_by("Product", "Cluster ID", model, n_clusters=2) # doctest: +SKIP Product Cluster ID 0 Smartphone 2 1 Laptop 2 @@ -315,21 +323,28 @@ def filter(self, instruction: str, model): ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ + import bigframes.dataframe + import bigframes.series + self._validate_model(model) columns = self._parse_columns(instruction) for column in columns: if column not in self._df.columns: raise ValueError(f"Column {column} not found.") + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + for column in columns: + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + user_instruction = self._format_instruction(instruction, columns) output_instruction = "Based on the provided context, reply to the following claim by only True or False:" - from bigframes.dataframe import DataFrame - results = typing.cast( - DataFrame, + bigframes.dataframe.DataFrame, model.predict( - self._make_prompt(columns, user_instruction, output_instruction) + self._make_prompt(df, columns, user_instruction, output_instruction), + temperature=0.0, ), ) @@ -368,7 +383,7 @@ def map(self, instruction: str, output_column: str, model): in the instructions like: "Get the ingredients of {food}." - result_column_name: + output_column: The column name of the mapping result. 
model: @@ -382,23 +397,30 @@ def map(self, instruction: str, output_column: str, model): ValueError: when the instruction refers to a non-existing column, or when no columns are referred to. """ + import bigframes.dataframe + import bigframes.series + self._validate_model(model) columns = self._parse_columns(instruction) for column in columns: if column not in self._df.columns: raise ValueError(f"Column {column} not found.") + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + for column in columns: + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + user_instruction = self._format_instruction(instruction, columns) output_instruction = ( "Based on the provided contenxt, answer the following instruction:" ) - from bigframes.series import Series - results = typing.cast( - Series, + bigframes.series.Series, model.predict( - self._make_prompt(columns, user_instruction, output_instruction) + self._make_prompt(df, columns, user_instruction, output_instruction), + temperature=0.0, )["ml_generate_text_llm_result"], ) @@ -440,10 +462,10 @@ def join(self, other, instruction: str, model, max_rows: int = 1000): An instruction on how left and right rows can be joined. This value must contain column references by name. which should be wrapped in a pair of braces. For example: "The {city} belongs to the {country}". - For column names that are shared between two dataframes, you need to add "_left" - and "_right" suffix for differentiation. This is especially important when you do - self joins. For example: "The {employee_name_left} reports to {employee_name_right}" - You must not add "_left" or "_right" suffix to non-overlapping columns. + For column names that are shared between two dataframes, you need to add "left." + and "right." prefix for differentiation. This is especially important when you do + self joins. For example: "The {left.employee_name} reports to {right.employee_name}" + For unique column names, this prefix is optional. model: A GeminiTextGenerator provided by Bigframes ML package. @@ -481,35 +503,45 @@ def join(self, other, instruction: str, model, max_rows: int = 1000): elif col in other.columns: right_columns.append(col) - elif col.endswith("_left"): - original_col_name = col[: -len("_left")] + elif col.startswith("left."): + original_col_name = col[len("left.") :] if ( original_col_name in self._df.columns and original_col_name in other.columns ): left_columns.append(col) elif original_col_name in self._df.columns: - raise ValueError(f"Unnecessary suffix for {col}") + left_columns.append(col) + instruction = instruction.replace(col, original_col_name) else: raise ValueError(f"Column {col} not found") - elif col.endswith("_right"): - original_col_name = col[: -len("_right")] + elif col.startswith("right."): + original_col_name = col[len("right.") :] if ( original_col_name in self._df.columns and original_col_name in other.columns ): right_columns.append(col) elif original_col_name in other.columns: - raise ValueError(f"Unnecessary suffix for {col}") + right_columns.append(col) + instruction = instruction.replace(col, original_col_name) else: raise ValueError(f"Column {col} not found") else: raise ValueError(f"Column {col} not found") - if not left_columns or not right_columns: - raise ValueError() + if not left_columns: + raise ValueError("No left column references.") + + if not right_columns: + raise ValueError("No right column references.") + + # Update column references to be compatible with internal naming scheme. 
+ # That is, "left.col" -> "col_left" and "right.col" -> "col_right" + instruction = re.sub(r"(?>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> bpd.options.experiments.semantic_operators = True + + >>> import bigframes.ml.llm as llm + >>> model = llm.GeminiTextGenerator(model_name="gemini-1.5-flash-001") + + >>> df = bpd.DataFrame({"Animals": ["Dog", "Bird", "Cat", "Horse"]}) + >>> df.semantics.top_k("{Animals} are more popular as pets", model=model, k=2) + Animals + 0 Dog + 2 Cat + + [2 rows x 1 columns] + + Args: + instruction (str): + An instruction on how to map the data. This value must contain + column references by name enclosed in braces. + For example, to reference a column named "Animals", use "{Animals}" in the + instruction, like: "{Animals} are more popular as pets" + + model (bigframes.ml.llm.GeminiTextGenerator): + A GeminiTextGenerator provided by the Bigframes ML package. + + k (int, default 10): + The number of rows to return. + + Returns: + bigframes.dataframe.DataFrame: A new DataFrame with the top k rows. + + Raises: + NotImplementedError: when the semantic operator experiment is off. + ValueError: when the instruction refers to a non-existing column, or when no + columns are referred to. + """ + import bigframes.dataframe + import bigframes.series + + self._validate_model(model) + columns = self._parse_columns(instruction) + for column in columns: + if column not in self._df.columns: + raise ValueError(f"Column {column} not found.") + if len(columns) > 1: + raise NotImplementedError( + "Semantic aggregations are limited to a single column." + ) + + df: bigframes.dataframe.DataFrame = self._df[columns].copy() + column = columns[0] + if df[column].dtype != dtypes.STRING_DTYPE: + df[column] = df[column].astype(dtypes.STRING_DTYPE) + + # `index` is reserved for the `reset_index` below. + if column == "index": + raise ValueError( + "Column name 'index' is reserved. Please choose a different name." + ) + + if k < 1: + raise ValueError("k must be an integer greater than or equal to 1.") + + user_instruction = self._format_instruction(instruction, columns) + + n = df.shape[0] + if k >= n: + return df + + # Create a unique index and duplicate it as the "index" column. This workaround + # is needed for the select search algorithm due to unimplemented bigFrame methods. + df = df.reset_index().rename(columns={"index": "old_index"}).reset_index() + + # Initialize a status column to track the selection status of each item. + # - None: Unknown/not yet processed + # - 1.0: Selected as part of the top-k items + # - -1.0: Excluded from the top-k items + status_column = guid.generate_guid("status") + df[status_column] = bigframes.series.Series(None, dtype=dtypes.FLOAT_DTYPE) + + num_selected = 0 + while num_selected < k: + df, num_new_selected = self._topk_partition( + df, + column, + status_column, + user_instruction, + model, + k - num_selected, + ) + num_selected += num_new_selected + + df = ( + df[df[status_column] > 0] + .drop(["index", status_column], axis=1) + .rename(columns={"old_index": "index"}) + .set_index("index") + ) + df.index.name = None + return df + + @staticmethod + def _topk_partition( + df, column: str, status_column: str, user_instruction: str, model, k + ): + output_instruction = ( + "Given a question and two documents, choose the document that best answers " + "the question. Respond with 'Document 1' or 'Document 2'. You must choose " + "one, even if neither is ideal. 
" + ) + + # Random pivot selection for improved average quickselect performance. + pending_df = df[df[status_column].isna()] + pivot_iloc = np.random.randint(0, pending_df.shape[0]) + pivot_index = pending_df.iloc[pivot_iloc]["index"] + pivot_df = pending_df[pending_df["index"] == pivot_index] + + # Build a prompt to compare the pivot item's relevance to other pending items. + prompt_s = pending_df[pending_df["index"] != pivot_index][column] + prompt_s = ( + f"{output_instruction}\n\nQuestion: {user_instruction}\n" + + f"\nDocument 1: {column} " + + pivot_df.iloc[0][column] + + f"\nDocument 2: {column} " + + prompt_s # type:ignore + ) + + import bigframes.dataframe + + predict_df = typing.cast( + bigframes.dataframe.DataFrame, model.predict(prompt_s, temperature=0.0) + ) + + marks = predict_df["ml_generate_text_llm_result"].str.contains("2") + more_relavant: bigframes.dataframe.DataFrame = df[marks] + less_relavent: bigframes.dataframe.DataFrame = df[~marks] + + num_more_relavant = more_relavant.shape[0] + if k < num_more_relavant: + less_relavent[status_column] = -1.0 + pivot_df[status_column] = -1.0 + df = df.combine_first(less_relavent).combine_first(pivot_df) + return df, 0 + else: # k >= num_more_relavant + more_relavant[status_column] = 1.0 + df = df.combine_first(more_relavant) + if k >= num_more_relavant + 1: + pivot_df[status_column] = 1.0 + df = df.combine_first(pivot_df) + return df, num_more_relavant + 1 + else: + return df, num_more_relavant + def sim_join( self, other, @@ -688,7 +886,10 @@ def sim_join( f"Number of rows that need processing is {joined_table_rows}, which exceeds row limit {max_rows}." ) - base_table_embedding_column = bigframes.core.guid.generate_guid() + if top_k < 1: + raise ValueError("top_k must be an integer greater than or equal to 1.") + + base_table_embedding_column = guid.generate_guid() base_table = self._attach_embedding( other, right_on, base_table_embedding_column, model ).to_gbq() @@ -724,9 +925,8 @@ def _attach_embedding(dataframe, source_column: str, embedding_column: str, mode return result_df def _make_prompt( - self, columns: List[str], user_instruction: str, output_instruction: str + self, prompt_df, columns, user_instruction: str, output_instruction: str ): - prompt_df = self._df[columns].copy() prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: " # Combine context from multiple columns. @@ -759,4 +959,4 @@ def _validate_model(model): from bigframes.ml.llm import GeminiTextGenerator if not isinstance(model, GeminiTextGenerator): - raise ValueError("Model is not GeminiText Generator") + raise TypeError("Model is not GeminiText Generator") diff --git a/bigframes/series.py b/bigframes/series.py index 1a913f18d7..215f4473ee 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1117,6 +1117,14 @@ def ne(self, other: object) -> Series: # TODO: enforce stricter alignment return self._apply_binary_op(other, ops.ne_op) + def items(self): + for batch_df in self._block.to_pandas_batches(): + assert ( + batch_df.shape[1] == 1 + ), f"Expected 1 column in the dataframe, but got {batch_df.shape[1]}." 
+ for item in batch_df.squeeze(axis=1).items(): + yield item + def where(self, cond, other=None): value_id, cond_id, other_id, block = self._align3(cond, other) block, result_id = block.project_expr( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 0d7a90c250..419e9d9a3b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -274,6 +274,10 @@ def __init__( metrics=self._metrics, ) + def __del__(self): + """Automatic cleanup of internal resources""" + self.close() + @property def bqclient(self): return self._clients_provider.bqclient @@ -473,6 +477,14 @@ def read_gbq_query( [2 rows x 3 columns] See also: :meth:`Session.read_gbq`. + + Returns: + bigframes.pandas.DataFrame: + A DataFrame representing results of the query or table. + + Raises: + ValueError: + When both columns (preferred) and col_order are specified. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. @@ -517,6 +529,14 @@ def read_gbq_table( >>> df = bpd.read_gbq_table("bigquery-public-data.ml_datasets.penguins") See also: :meth:`Session.read_gbq`. + + Returns: + bigframes.pandas.DataFrame: + A DataFrame representing results of the query or table. + + Raises: + ValueError: + When both columns (preferred) and col_order are specified. """ # NOTE: This method doesn't (yet) exist in pandas or pandas-gbq, so # these docstrings are inline. @@ -553,6 +573,10 @@ def read_gbq_table_streaming( >>> bpd.options.display.progress_bar = None >>> sdf = bst.read_gbq_table("bigquery-public-data.ml_datasets.penguins") + + Returns: + bigframes.streaming.dataframe.StreamingDataFrame: + A StreamingDataFrame representing results of the table. """ warnings.warn( "The bigframes.streaming module is a preview feature, and subject to change.", @@ -650,6 +674,10 @@ def read_pandas( Returns: An equivalent bigframes.pandas.(DataFrame/Series/Index) object + + Raises: + ValueError: + When the object is not a Pandas DataFrame. """ import bigframes.series as series @@ -703,10 +731,8 @@ def _read_pandas_inline( try: local_block = blocks.Block.from_local(pandas_dataframe, self) inline_df = dataframe.DataFrame(local_block) - except pa.ArrowInvalid as e: - raise pa.ArrowInvalid( - f"Could not convert with a BigQuery type: `{e}`. " - ) from e + except pa.ArrowInvalid: # Thrown by arrow for unsupported types, such as geo. + return None except ValueError: # Thrown by ibis for some unhandled types return None except pa.ArrowTypeError: # Thrown by arrow for types without mapping (geo). 
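The bigframes/series.py hunk above adds a `Series.items()` generator that pages through the series with `to_pandas_batches()` and yields `(index, value)` pairs, mirroring `pandas.Series.items()`. A small sketch of how it might be used, assuming an initialized BigQuery DataFrames session; the data is illustrative.

```python
import bigframes.pandas as bpd

s = bpd.Series(["apple", "banana", "cherry"], name="fruit")

# Each pandas batch is squeezed to a single-column Series and iterated lazily,
# so large series are streamed rather than materialized all at once.
for idx, value in s.items():
    print(idx, value)
```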
@@ -1369,7 +1395,9 @@ def _start_query_ml_ddl( # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model job_config.destination_encryption_configuration = None - return bf_io_bigquery.start_query_with_client(self.bqclient, sql, job_config) + return bf_io_bigquery.start_query_with_client( + self.bqclient, sql, job_config, metrics=self._metrics + ) def _export( self, diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index ab2ebed0d4..170f0ac086 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -40,17 +40,14 @@ import bigframes.core import bigframes.core.compile -import bigframes.core.expression as ex import bigframes.core.guid import bigframes.core.identifiers import bigframes.core.nodes as nodes import bigframes.core.ordering as order -import bigframes.core.rewrite as rewrites import bigframes.core.schema import bigframes.core.tree_properties as tree_properties import bigframes.features import bigframes.formatting_helpers as formatting_helpers -import bigframes.operations as ops import bigframes.session._io.bigquery as bq_io import bigframes.session.metrics import bigframes.session.planner @@ -128,7 +125,7 @@ def to_sql( col_id_overrides = dict(col_id_overrides) col_id_overrides[internal_offset_col] = offset_column node = ( - self._get_optimized_plan(array_value.node) + self.replace_cached_subtrees(array_value.node) if enable_cache else array_value.node ) @@ -208,6 +205,9 @@ def export_gbq( """ Export the ArrayValue to an existing BigQuery table. """ + if bigframes.options.compute.enable_multi_query_execution: + self._simplify_with_caching(array_value) + dispositions = { "fail": bigquery.WriteDisposition.WRITE_EMPTY, "replace": bigquery.WriteDisposition.WRITE_TRUNCATE, @@ -279,7 +279,7 @@ def peek( """ A 'peek' efficiently accesses a small number of rows in the dataframe. """ - plan = self._get_optimized_plan(array_value.node) + plan = self.replace_cached_subtrees(array_value.node) if not tree_properties.can_fast_peek(plan): warnings.warn("Peeking this value cannot be done efficiently.") @@ -314,7 +314,7 @@ def head( # No user-provided ordering, so just get any N rows, its faster! return self.peek(array_value, n_rows) - plan = self._get_optimized_plan(array_value.node) + plan = self.replace_cached_subtrees(array_value.node) if not tree_properties.can_fast_head(plan): # If can't get head fast, we are going to need to execute the whole query # Will want to do this in a way such that the result is reusable, but the first @@ -322,7 +322,7 @@ def head( # This currently requires clustering on offsets. self._cache_with_offsets(array_value) # Get a new optimized plan after caching - plan = self._get_optimized_plan(array_value.node) + plan = self.replace_cached_subtrees(array_value.node) assert tree_properties.can_fast_head(plan) head_plan = generate_head_plan(plan, n_rows) @@ -347,7 +347,7 @@ def get_row_count(self, array_value: bigframes.core.ArrayValue) -> int: if count is not None: return count else: - row_count_plan = self._get_optimized_plan( + row_count_plan = self.replace_cached_subtrees( generate_row_count_plan(array_value.node) ) sql = self.compiler.compile_unordered(row_count_plan) @@ -359,7 +359,7 @@ def _local_get_row_count( ) -> Optional[int]: # optimized plan has cache materializations which will have row count metadata # that is more likely to be usable than original leaf nodes. 
- plan = self._get_optimized_plan(array_value.node) + plan = self.replace_cached_subtrees(array_value.node) return tree_properties.row_count(plan) # Helpers @@ -424,21 +424,8 @@ def _wait_on_job( self.metrics.count_job_stats(query_job) return results_iterator - def _get_optimized_plan(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: - """ - Takes the original expression tree and applies optimizations to accelerate execution. - - At present, the only optimization is to replace subtress with cached previous materializations. - """ - # Apply any rewrites *after* applying cache, as cache is sensitive to exact tree structure - optimized_plan = tree_properties.replace_nodes( - node, (dict(self._cached_executions)) - ) - if ENABLE_PRUNING: - used_fields = frozenset(field.id for field in optimized_plan.fields) - optimized_plan = optimized_plan.prune(used_fields) - optimized_plan = rewrites.replace_slice_ops(optimized_plan) - return optimized_plan + def replace_cached_subtrees(self, node: nodes.BigFrameNode) -> nodes.BigFrameNode: + return tree_properties.replace_nodes(node, (dict(self._cached_executions))) def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): """ @@ -448,7 +435,7 @@ def _is_trivially_executable(self, array_value: bigframes.core.ArrayValue): # Once rewriting is available, will want to rewrite before # evaluating execution cost. return tree_properties.is_trivially_executable( - self._get_optimized_plan(array_value.node) + self.replace_cached_subtrees(array_value.node) ) def _cache_with_cluster_cols( @@ -457,7 +444,7 @@ def _cache_with_cluster_cols( """Executes the query and uses the resulting table to rewrite future executions.""" sql, schema, ordering_info = self.compiler.compile_raw( - self._get_optimized_plan(array_value.node) + self.replace_cached_subtrees(array_value.node) ) tmp_table = self._sql_as_cached_temp_table( sql, @@ -474,7 +461,9 @@ def _cache_with_offsets(self, array_value: bigframes.core.ArrayValue): """Executes the query and uses the resulting table to rewrite future executions.""" offset_column = bigframes.core.guid.generate_guid("bigframes_offsets") w_offsets, offset_column = array_value.promote_offsets() - sql = self.compiler.compile_unordered(self._get_optimized_plan(w_offsets.node)) + sql = self.compiler.compile_unordered( + self.replace_cached_subtrees(w_offsets.node) + ) tmp_table = self._sql_as_cached_temp_table( sql, @@ -510,7 +499,7 @@ def _simplify_with_caching(self, array_value: bigframes.core.ArrayValue): """Attempts to handle the complexity by caching duplicated subtrees and breaking the query into pieces.""" # Apply existing caching first for _ in range(MAX_SUBTREE_FACTORINGS): - node_with_cache = self._get_optimized_plan(array_value.node) + node_with_cache = self.replace_cached_subtrees(array_value.node) if node_with_cache.planning_complexity < QUERY_COMPLEXITY_LIMIT: return @@ -567,7 +556,7 @@ def _validate_result_schema( ): actual_schema = tuple(bq_schema) ibis_schema = bigframes.core.compile.test_only_ibis_inferred_schema( - self._get_optimized_plan(array_value.node) + self.replace_cached_subtrees(array_value.node) ) internal_schema = array_value.schema if not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: @@ -583,20 +572,7 @@ def _validate_result_schema( def generate_head_plan(node: nodes.BigFrameNode, n: int): - offsets_id = bigframes.core.guid.generate_guid("offsets_") - plan_w_offsets = nodes.PromoteOffsetsNode( - node, bigframes.core.identifiers.ColumnId(offsets_id) - ) - predicate = 
ops.lt_op.as_expr(ex.deref(offsets_id), ex.const(n)) - plan_w_head = nodes.FilterNode(plan_w_offsets, predicate) - # Finally, drop the offsets column - return nodes.SelectionNode( - plan_w_head, - tuple( - (ex.deref(i), bigframes.core.identifiers.ColumnId(i)) - for i in node.schema.names - ), - ) + return nodes.SliceNode(node, start=None, stop=n) def generate_row_count_plan(node: nodes.BigFrameNode): diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 923605627d..21d454d72f 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -300,7 +300,9 @@ def read_gbq_table( ): # TODO(b/338111344): If we are running a query anyway, we might as # well generate ROW_NUMBER() at the same time. - all_columns = itertools.chain(index_cols, columns) if columns else () + all_columns: Iterable[str] = ( + itertools.chain(index_cols, columns) if columns else () + ) query = bf_io_bigquery.to_query( query, columns=all_columns, @@ -527,6 +529,9 @@ def read_gbq_query( configuration=configuration, ) + if self._metrics is not None: + self._metrics.count_job_stats(query_job) + # If there was no destination table, that means the query must have # been DDL or DML. Return some job metadata, instead. if not destination: diff --git a/bigframes/session/metrics.py b/bigframes/session/metrics.py index 579cac1ac3..9be7119368 100644 --- a/bigframes/session/metrics.py +++ b/bigframes/session/metrics.py @@ -44,7 +44,7 @@ def count_job_stats(self, query_job: bq_job.QueryJob): def get_performance_stats( query_job: bigquery.QueryJob, -) -> Optional[Tuple[int, int, float]]: +) -> Optional[Tuple[int, int, Optional[float]]]: """Parse the query job for performance stats. Return None if the stats do not reflect real work done in bigquery. @@ -73,7 +73,9 @@ def get_performance_stats( return bytes_processed, slot_millis, exec_seconds -def write_stats_to_disk(bytes_processed: int, slot_millis: int, exec_seconds: float): +def write_stats_to_disk( + bytes_processed: int, slot_millis: int, exec_seconds: Optional[float] +): """For pytest runs only, log information about the query job to a file in order to create a performance report. """ diff --git a/bigframes/version.py b/bigframes/version.py index 75f66191ca..501aa2bd9d 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.22.0" +__version__ = "1.24.0" diff --git a/notebooks/experimental/semantic_operators.ipynb b/notebooks/experimental/semantic_operators.ipynb index bfaad69ce2..831fc2ab9b 100644 --- a/notebooks/experimental/semantic_operators.ipynb +++ b/notebooks/experimental/semantic_operators.ipynb @@ -4,7 +4,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Preparation" + "# BigFrames Semantic Operator Demo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook gives you a hands-on preview of semantic operator APIs powered by LLM. The demonstration is devided into two sections: \n", + "\n", + "The first section introduces the API syntax with some simple examples. We aim to get you familiar with how BigFrames semantic operators work. \n", + "\n", + "The second section talks about applying semantic operators on real-world large datasets. The examples are designed to benchmark the performance of the operators, and to (maybe) spark some ideas for your next application scenarios.\n", + "\n", + "Without further ado, let's get started." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's import BigFrames packages." ] }, { @@ -21,13 +48,31 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Enable the semantic operator experiment" + "Make sure the BigFrames version is at least `1.22.0`" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "from packaging.version import Version\n", + "\n", + "assert Version(bigframes.__version__) >= Version(\"1.22.0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turn on the semantic operator experiment. You will see a warning sign saying that these operators are still under experiments. This is a necessary step. Otherwise you will see `NotImplementedError` when calling these operators." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -46,12 +91,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Prepare the LLM model. Here we are going to use Gemini 1.5 Flash." + "Optional: turn off the display of progress bar so that only the operation results will be printed out" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# bpd.options.display.progress_bar = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's also create some LLM instances for these operators. They will be passed in as paramters in each method call." + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -65,7 +126,7 @@ { "data": { "text/html": [ - "Query job 1494d834-8b38-4928-9911-ba3bb9b1228b is DONE. 0 Bytes processed. Open Job" + "Query job aadc79c5-5402-4922-a694-adb3848e3193 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -77,7 +138,7 @@ { "data": { "text/html": [ - "Query job 6caa309b-492d-4ad3-94e3-cb2b9522ef1e is DONE. 0 Bytes processed. Open Job" + "Query job 16757654-a541-47ed-ac48-84b4b549f3bd is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -93,6 +154,20 @@ "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-004\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# API Syntax" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we will go through the semantic operator APIs with small examples." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -100,15 +175,116 @@ "## Semantic Filtering" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic filtering allows you to filter your dataframe based on the instruction (i.e. prompt) you provided. Let's first create a small dataframe:" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2640c2d3-ceb0-4f8a-8bb2-a3ec5b3c8eb8 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrycity
0USASeattle
1GermanyBerlin
2JapanKyoto
\n", + "

3 rows × 2 columns

\n", + "
[3 rows x 2 columns in total]" + ], + "text/plain": [ + " country city\n", + "0 USA Seattle\n", + "1 Germany Berlin\n", + "2 Japan Kyoto\n", + "\n", + "[3 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = bpd.DataFrame({'country': ['USA', 'Germany', 'Japan'], 'city': ['Seattle', 'Berlin', 'Kyoto']})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's filter this dataframe by keeping only the rows where the value in `city` column is the capital of the value in `country` column. The column references could be \"escaped\" by using a pair of braces in your instruction. In this example, our instruction should be like this:\n", + "```\n", + "The {city} is the capital of the {country}.\n", + "```\n", + "\n", + "Note that this is not a Python f-string, so you shouldn't prefix your instruction with an `f`. Let's give it a try:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job d56e32bd-f06a-4086-aac2-560ed03dceca is DONE. 0 Bytes processed. Open Job" + "Query job 0aeff7d2-c2f3-45f1-8f8f-5be572392822 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -128,7 +304,7 @@ { "data": { "text/html": [ - "Query job 0b96351f-5a48-4059-b830-1aebd330599f is DONE. 4 Bytes processed. Open Job" + "Query job 0f3bfbe3-30c5-4bf9-8cf2-a3ea694f878d is DONE. 6 Bytes processed. Open Job" ], "text/plain": [ "" @@ -140,7 +316,7 @@ { "data": { "text/html": [ - "Query job 34b2ce70-b9be-49bb-a06d-f228b0e5937c is DONE. 33 Bytes processed. Open Job" + "Query job dfef2ffb-7196-4c94-bcae-d050021ebb5f is DONE. 50 Bytes processed. Open Job" ], "text/plain": [ "" @@ -152,7 +328,7 @@ { "data": { "text/html": [ - "Query job a4f799eb-24d6-4fcf-8661-371226788b53 is DONE. 33 Bytes processed. Open Job" + "Query job f6397298-dfe2-4e9d-ab91-e790c04ccddc is DONE. 33 Bytes processed. Open Job" ], "text/plain": [ "" @@ -204,14 +380,20 @@ "[1 rows x 2 columns]" ] }, - "execution_count": 4, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = bpd.DataFrame({'country': ['USA', 'Germany'], 'city': ['Seattle', 'Berlin']})\n", - "df.semantics.filter(\"{city} is the capital of {country}\", gemini_model)" + "df.semantics.filter(\"The {city} is the capital of the {country}\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The filter operator extracts the information from the referenced column to enrich your instruction with context. The instruction is then sent for the designated model for evaluation. For filtering operations, the LLM is asked to return only `True` and `False` for each row, and the operator removes the rows accordingly." ] }, { @@ -222,69 +404,21 @@ ] }, { - "cell_type": "code", - "execution_count": 5, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df = bpd.DataFrame(\n", - " data={\"ingredient_1\": [\"Burger Bun\", \"Soy Bean\"], \"ingredient_2\": [\"Beef Patty\", \"Bittern\"]}\n", - " )" + "Semantic mapping allows to you to combine values from multiple columns into a single output based your instruction. To demonstrate this, let's create an example dataframe:" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 04a27084-a71e-4c2d-9a73-46b768615c94 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 1a4c0d7f-0bb4-4f16-b2c0-ebb930fa6cd1 is DONE. 4 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 87bf5653-d3d8-4c0a-8017-af43907465de is DONE. 34 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 12822e33-0ca3-4968-a685-7fcb2bdb0790 is DONE. 93 Bytes processed. Open Job" + "Query job 2d6b0f90-20b3-419e-8d38-68be9e7ee1ae is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -316,72 +450,67 @@ " \n", " ingredient_1\n", " ingredient_2\n", - " food\n", " \n", " \n", " \n", " \n", " 0\n", - " Burger Bun\n", + " Bun\n", " Beef Patty\n", - " Burger\n", " \n", " \n", " 1\n", " Soy Bean\n", " Bittern\n", - " Tofu\n", + " \n", + " \n", + " 2\n", + " Sausage\n", + " Long Bread\n", " \n", " \n", "\n", - "
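The same call shape works for any yes/no-style instruction over the existing columns. As a minimal sketch (the instruction text below is invented for illustration; `df` and `gemini_model` are the objects already defined in this notebook):

```python
# Hedged sketch: same filter API as above, with a hypothetical instruction.
# The braces are resolved per row by the operator itself -- this is not an f-string.
df.semantics.filter("{city} is a coastal city", model=gemini_model)
```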

2 rows × 3 columns

\n", - "[2 rows x 3 columns in total]" + "

3 rows × 2 columns

\n", + "[3 rows x 2 columns in total]" ], "text/plain": [ - " ingredient_1 ingredient_2 food\n", - "0 Burger Bun Beef Patty Burger \n", - "\n", - "1 Soy Bean Bittern Tofu \n", - "\n", + " ingredient_1 ingredient_2\n", + "0 Bun Beef Patty\n", + "1 Soy Bean Bittern\n", + "2 Sausage Long Bread\n", "\n", - "[2 rows x 3 columns]" + "[3 rows x 2 columns]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + "df = bpd.DataFrame({\n", + " \"ingredient_1\": [\"Bun\", \"Soy Bean\", \"Sausage\"], \n", + " \"ingredient_2\": [\"Beef Patty\", \"Bittern\", \"Long Bread\"]\n", + " })\n", + "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Semantic Joining" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", - "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" + "Now, let's ask LLM what kind of food can be made from the two ingredients in each row. The column reference syntax in your instruction stays the same. In addition, you need to specify the column name by setting the `output_column` parameter to hold the mapping results." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job fcda7d35-d969-47a8-b611-0c516e2e39e8 is DONE. 0 Bytes processed. Open Job" + "Query job a90d785f-29d7-4818-b595-9326657cc865 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -401,7 +530,7 @@ { "data": { "text/html": [ - "Query job c532592c-c4ce-4f08-9397-21b1b8b1f347 is DONE. 30 Bytes processed. Open Job" + "Query job de201200-4135-487b-8582-12d5137ddc24 is DONE. 6 Bytes processed. Open Job" ], "text/plain": [ "" @@ -413,7 +542,7 @@ { "data": { "text/html": [ - "Query job a11bd20f-7a75-462c-b6a5-64d954645e1b is DONE. 251 Bytes processed. Open Job" + "Query job ec36bc3a-773b-4443-be63-72e4c1063168 is DONE. 52 Bytes processed. Open Job" ], "text/plain": [ "" @@ -425,7 +554,7 @@ { "data": { "text/html": [ - "Query job 4703c2a9-ab08-46f1-a612-3354c5df391f is DONE. 144 Bytes processed. Open Job" + "Query job ec204378-936a-4dc9-8e4a-5d4b17caad78 is DONE. 133 Bytes processed. Open Job" ], "text/plain": [ "" @@ -455,53 +584,230 @@ " \n", " \n", " \n", - " city\n", - " continent\n", + " ingredient_1\n", + " ingredient_2\n", + " food\n", " \n", " \n", " \n", " \n", " 0\n", - " Seattle\n", - " North America\n", + " Bun\n", + " Beef Patty\n", + " Burger\n", " \n", " \n", " 1\n", - " Ottawa\n", - " North America\n", + " Soy Bean\n", + " Bittern\n", + " Tofu\n", " \n", " \n", " 2\n", - " Shanghai\n", - " Asia\n", - " \n", - " \n", - " 3\n", - " New Delhi\n", - " Asia\n", + " Sausage\n", + " Long Bread\n", + " Hotdog\n", " \n", " \n", "\n", - "

4 rows × 2 columns

\n", - "[4 rows x 2 columns in total]" + "

3 rows × 3 columns

\n", + "[3 rows x 3 columns in total]" ], "text/plain": [ - " city continent\n", - "0 Seattle North America\n", - "1 Ottawa North America\n", + " ingredient_1 ingredient_2 food\n", + "0 Bun Beef Patty Burger \n", + "\n", + "1 Soy Bean Bittern Tofu \n", + "\n", + "2 Sausage Long Bread Hotdog \n", + "\n", + "\n", + "[3 rows x 3 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.semantics.map(\"What is the food made from {ingredient_1} and {ingredient_2}? One word only.\", output_column=\"food\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The mechanism behind semantic mapping is very similar with semantic filtering. The one major difference: instead of asking LLM to reply true or false to each row, the operator lets LLM reply free-form strings and attach them as a new column to the dataframe." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Joining" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic joining can join two dataframes based on the instruction you provided. First, let's prepare two dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "cities = bpd.DataFrame({'city': ['Seattle', 'Ottawa', 'Berlin', 'Shanghai', 'New Delhi']})\n", + "continents = bpd.DataFrame({'continent': ['North America', 'Africa', 'Asia']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to join the `cities` with `continents` to form a new dataframe such that, in each row the city from the `cities` data frame is in the continent from the `continents` dataframe. We could re-use the aforementioned column reference syntax:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d1195b35-ca65-474c-9847-8e315447a941 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 721d1ef5-810e-4d0f-bb4a-250dc12bbe82 is DONE. 30 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 54502b90-fea2-4545-8542-a6be6f6837ac is DONE. 251 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 00326a4e-2371-4572-a842-e14ba99ac02c is DONE. 144 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
citycontinent
0SeattleNorth America
1OttawaNorth America
2ShanghaiAsia
3New DelhiAsia
\n", + "

4 rows × 2 columns

\n", + "
[4 rows x 2 columns in total]" + ], + "text/plain": [ + " city continent\n", + "0 Seattle North America\n", + "1 Ottawa North America\n", "2 Shanghai Asia\n", "3 New Delhi Asia\n", "\n", "[4 rows x 2 columns]" ] }, - "execution_count": 8, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cities.semantics.join(continents, \"{city} is in {continent}\", gemini_model)" + "cities.semantics.join(continents, \"{city} is in {continent}\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "!! **Important:** Semantic join can trigger probihitively expensitve operations! This operation first cross joins two dataframes, then invokes semantic filter on each row. That means if you have two dataframes of sizes `M` and `N`, the total amount of queries sent to the LLM is on the scale of `M * N`. Therefore, we have added a parameter `max_rows`, a threshold that guards against unexpected expensive calls. With this parameter, the operator first calculates the size of your cross-joined data, and compares it with the threshold. If the size exceeds your threshold, the fuction will abort early with a `ValueError`. You can manually set the value of `max_rows` to raise or lower the threshold." ] }, { @@ -511,24 +817,40 @@ "### Self Joins" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use a self-join example to demonstrate a special case: what happens when the joining columns exist in both data frames? It turns out that you need to provide extra information in your column references: by attaching \"left.\" and \"right.\" prefixes to your column names. \n", + "\n", + "Let's create an example data frame:" + ] + }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "animals = bpd.DataFrame({'animal': ['cow', 'cat', 'spider', 'elephant']})" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to compare the weights of these animals, and output all the pairs where the animal on the left is heavier than the animal on the right. In this case, we use `left.animal` and `right.animal` to differentiate the data sources:" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 8c1f1313-3eee-47dc-ad2d-27a49dc831dc is DONE. 0 Bytes processed. Open Job" + "Query job 2a87f5a4-927d-472f-808d-bc86e008dbaf is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -548,7 +870,7 @@ { "data": { "text/html": [ - "Query job 08dda435-13bd-49d0-a941-1cf91a9a1c96 is DONE. 32 Bytes processed. Open Job" + "Query job 591da923-319f-43f9-bf24-20e34abf899f is DONE. 32 Bytes processed. Open Job" ], "text/plain": [ "" @@ -560,7 +882,7 @@ { "data": { "text/html": [ - "Query job abf33f67-0056-499b-b7fe-583391c6bc02 is DONE. 266 Bytes processed. Open Job" + "Query job 1422bfcf-23a9-4484-96d7-16d5b56dfe26 is DONE. 266 Bytes processed. Open Job" ], "text/plain": [ "" @@ -572,7 +894,7 @@ { "data": { "text/html": [ - "Query job 74249b99-8975-4fc4-b599-1b682edf8aeb is DONE. 180 Bytes processed. Open Job" + "Query job a659dcaf-9bd4-4776-bb82-c0d3408b41a2 is DONE. 180 Bytes processed. 
Open Job" ], "text/plain": [ "" @@ -654,31 +976,38 @@ "[6 rows x 2 columns]" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "animals.semantics.join(animals, \"{animal_left} generally weighs heavier than {animal_right}\", gemini_model)" + "animals.semantics.join(animals, \"{left.animal} generally weighs heavier than {right.animal}\", model=gemini_model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Semantic Search" + "## Semantic Aggregation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic aggregation merges all the values in a column into one. At this moment you can only aggregate a single column in each oeprator call. Let's create an example:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 8be41631-537e-4b73-b3c8-1cad09dffb95 is DONE. 0 Bytes processed. Open Job" + "Query job 6606270b-b734-494c-a602-544db495b4c1 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -708,65 +1037,101 @@ " \n", " \n", " \n", - " creatures\n", + " Movies\n", + " Year\n", " \n", " \n", " \n", " \n", " 0\n", - " salmon\n", + " Titanic\n", + " 1997\n", " \n", " \n", " 1\n", - " sea urchin\n", + " The Wolf of Wall Street\n", + " 2013\n", " \n", " \n", " 2\n", - " baboons\n", + " Killers of the Flower Moon\n", + " 2023\n", " \n", " \n", " 3\n", - " frog\n", + " The Revenant\n", + " 2015\n", " \n", " \n", " 4\n", - " chimpanzee\n", + " Inception\n", + " 2010\n", + " \n", + " \n", + " 5\n", + " Shuttle Island\n", + " 2010\n", + " \n", + " \n", + " 6\n", + " The Great Gatsby\n", + " 2013\n", " \n", " \n", "\n", - "
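To make the cost guard concrete, here is a minimal sketch of passing the `max_rows` threshold described above. The data and the threshold value are invented for illustration; only the parameter name and the abort-with-`ValueError` behavior come from the text:

```python
# Hedged sketch: cap the size of the cross join before any LLM calls are issued.
us_cities = bpd.DataFrame({"city": ["Seattle", "Portland"]})
states = bpd.DataFrame({"state": ["Washington", "Oregon", "Texas"]})

us_cities.semantics.join(
    states,
    "{city} is located in {state}",
    model=gemini_model,
    max_rows=100,  # assumed threshold; the join aborts with ValueError if the cross join is larger
)
```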

5 rows × 1 columns

\n", - "[5 rows x 1 columns in total]" + "

7 rows × 2 columns

\n", + "[7 rows x 2 columns in total]" ], "text/plain": [ - " creatures\n", - "0 salmon\n", - "1 sea urchin\n", - "2 baboons\n", - "3 frog\n", - "4 chimpanzee\n", + " Movies Year\n", + "0 Titanic 1997\n", + "1 The Wolf of Wall Street 2013\n", + "2 Killers of the Flower Moon 2023\n", + "3 The Revenant 2015\n", + "4 Inception 2010\n", + "5 Shuttle Island 2010\n", + "6 The Great Gatsby 2013\n", "\n", - "[5 rows x 1 columns]" + "[7 rows x 2 columns]" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "df = bpd.DataFrame({\n", + " \"Movies\": [\n", + " \"Titanic\",\n", + " \"The Wolf of Wall Street\",\n", + " \"Killers of the Flower Moon\",\n", + " \"The Revenant\",\n", + " \"Inception\",\n", + " \"Shuttle Island\",\n", + " \"The Great Gatsby\",\n", + " ],\n", + " \"Year\": [1997, 2013, 2023, 2015, 2010, 2010, 2013],\n", + "})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's ask LLM to find the actor/actress that starred in all movies:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 56d5f17f-f64a-46ca-8d30-74f8e2ad5dec is DONE. 0 Bytes processed. Open Job" + "Query job 6918cec6-9ac3-49d5-82b3-61eeedbd54dc is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -786,7 +1151,127 @@ { "data": { "text/html": [ - "Query job fe75b64a-41a3-4675-ae1e-d2db6b2270d3 is DONE. 10 Bytes processed. Open Job" + "Query job d95e8f6d-bbe2-4149-86a5-f5e69ad07c9e is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a6d7545d-91bc-4336-beed-a67295bdfa13 is DONE. 16 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job aafc86f8-2bc3-44da-821e-891ca4c75d46 is DONE. 37 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 8ca9c614-268c-4592-aaf0-a5df0143e1e7 is DONE. 37 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Leonardo DiCaprio \n", + "\n", + "Name: Movies, dtype: string" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agg_df = df.semantics.agg(\"Find the actors/actresses who starred in all {Movies}. Reply with their names only.\", model=gemini_model)\n", + "agg_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instead of going through each row one by one, this operator batches multiple rows in a single request towards LLM. It then aggregates all the batched results with the same technique, until there is only one value left. You could set the batch size with `max_agg_rows` parameter, which defaults to 10." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Top K" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic Top K selects the top K values based on your instruction. 
Here is an example:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "df = bpd.DataFrame({\"Animals\": [\"Corgi\", \"Orange Cat\", \"Parrot\", \"Tarantula\"]})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to find the top two most popular pets:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 6cb9ade5-c3fd-468c-b590-c59d8c51c68c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a56383c1-3bd0-4af1-b0d6-1de829f4e371 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -798,7 +1283,7 @@ { "data": { "text/html": [ - "Query job 9f06c24e-d931-4e59-a444-1a6013c43290 is DONE. 30.9 kB processed. Open Job" + "Query job 2f58a6fe-4a80-481b-b590-9d657f965d91 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -810,7 +1295,7 @@ { "data": { "text/html": [ - "Query job 627b8206-b3f9-4c25-a5d9-dde7c0042a4d is DONE. 0 Bytes processed. Open Job" + "Query job 281c1181-fcdf-407b-9a5d-03c975f05687 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -830,7 +1315,7 @@ { "data": { "text/html": [ - "Query job d01597bb-30ef-495f-be5d-c9fb16d4c112 is DONE. 2 Bytes processed. Open Job" + "Query job 593b3b9c-e2b3-4eb9-a233-56454e2a8419 is DONE. 6 Bytes processed. Open Job" ], "text/plain": [ "" @@ -842,7 +1327,7 @@ { "data": { "text/html": [ - "Query job bbc67bc3-830d-4ede-829d-16d4829dec33 is RUNNING. Open Job" + "Query job 5fe0c094-740d-47ec-a4c5-9b4cdd4c66af is DONE. 66 Bytes processed. Open Job" ], "text/plain": [ "" @@ -854,7 +1339,7 @@ { "data": { "text/html": [ - "Query job 0c844655-b7d9-494b-8073-925b4e0743ce is DONE. 37.2 kB processed. Open Job" + "Query job bab92744-1ec8-42fc-aa8f-ced99ae2fc66 is DONE. 66 Bytes processed. Open Job" ], "text/plain": [ "" @@ -866,7 +1351,7 @@ { "data": { "text/html": [ - "Query job 1993f0be-bfc2-4dad-ba85-92f5bba44945 is DONE. 0 Bytes processed. Open Job" + "Query job 5a27f6f8-7e7b-45bf-8d17-580ef5c73432 is DONE. 52 Bytes processed. Open Job" ], "text/plain": [ "" @@ -896,69 +1381,165 @@ " \n", " \n", " \n", - " creatures\n", - " similarity score\n", + " Animals\n", " \n", " \n", " \n", " \n", - " 2\n", - " baboons\n", - " 0.773411\n", + " 1\n", + " Orange Cat\n", " \n", " \n", - " 4\n", - " chimpanzee\n", - " 0.781101\n", + " 2\n", + " Parrot\n", " \n", " \n", "\n", - "
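For reference, a minimal sketch of tuning the aggregation batch size described above. The DataFrame below is invented for illustration; only the `max_agg_rows` parameter (default 10) comes from the text:

```python
# Hedged sketch: aggregate a single column, batching only 2 rows per LLM request.
titles = bpd.DataFrame({"Movies": ["Titanic", "Inception", "The Revenant"]})
titles.semantics.agg(
    "Find the actor who starred in all {Movies}. Reply with the name only.",
    model=gemini_model,
    max_agg_rows=2,
)
```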

2 rows × 2 columns

\n", - "[2 rows x 2 columns in total]" + "

2 rows × 1 columns

\n", + "[2 rows x 1 columns in total]" ], "text/plain": [ - " creatures similarity score\n", - "2 baboons 0.773411\n", - "4 chimpanzee 0.781101\n", + " Animals\n", + "1 Orange Cat\n", + "2 Parrot\n", "\n", - "[2 rows x 2 columns]" + "[2 rows x 1 columns]" ] }, - "execution_count": 12, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.semantics.search(\"creatures\", \"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" + "df.semantics.top_k(\"{Animals} are more popular as pets\", model=gemini_model, k=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Semantic Similarity Join" + "Under the hood, the semantic top K operator performs pair-wise comparisons with LLM. It also adopts the quick select algorithm, which means the top K results are returns in the order of their indices instead of their ranks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic search searches the most similar values to your qury within a single column. Here is an example:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job f9e814b1-a3e4-47f4-b966-0177f879c2a9 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
creatures
0salmon
1sea urchin
2baboons
3frog
4chimpanzee
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " creatures\n", + "0 salmon\n", + "1 sea urchin\n", + "2 baboons\n", + "3 frog\n", + "4 chimpanzee\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", - "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" + "df = bpd.DataFrame({\"creatures\": [\"salmon\", \"sea urchin\", \"baboons\", \"frog\", \"chimpanzee\"]})\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to get the top 2 creatures that are most similar to \"monkey\":" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 222a9dcb-2389-4ad3-a1e6-c2b197f3a409 is DONE. 0 Bytes processed. Open Job" + "Query job 8ae9b68c-43e7-449d-8709-b2b6108981b9 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -978,7 +1559,7 @@ { "data": { "text/html": [ - "Query job 24afcd9d-6be5-44d9-aa89-6fbe71f5e9a7 is DONE. 10 Bytes processed. Open Job" + "Query job ff50ee3b-e005-4608-9e3f-37e48f924152 is DONE. 10 Bytes processed. Open Job" ], "text/plain": [ "" @@ -990,7 +1571,7 @@ { "data": { "text/html": [ - "Query job 6bc36226-7bbb-4954-b042-044e9fd98a47 is DONE. 30.8 kB processed. Open Job" + "Query job bbc89422-ee33-4afb-8c66-b08c0b6754b9 is DONE. 30.9 kB processed. Open Job" ], "text/plain": [ "" @@ -1002,7 +1583,7 @@ { "data": { "text/html": [ - "Query job f247f63d-1d8a-4f81-a833-628143fda463 is DONE. 0 Bytes processed. Open Job" + "Query job dfea7010-9fb5-4459-b789-b26dc98cb94d is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1022,7 +1603,7 @@ { "data": { "text/html": [ - "Query job 9bec5633-8ba1-4453-b9c7-6cb555d3c60e is DONE. 10 Bytes processed. Open Job" + "Query job 66c3b84e-7eb6-408a-a244-20713ad18f1b is DONE. 2 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1034,7 +1615,7 @@ { "data": { "text/html": [ - "Query job d7df7004-b499-436b-898c-15abee330d9e is RUNNING. Open Job" + "Query job 62f2a822-4b7f-4072-8957-e7539c7a6646 is RUNNING. Open Job" ], "text/plain": [ "" @@ -1046,7 +1627,7 @@ { "data": { "text/html": [ - "Query job 9012c011-b4e7-4fba-85a6-e439fe3c32d3 is DONE. 61.5 kB processed. Open Job" + "Query job 286ee4e0-ee7f-4ba7-b057-de17818d0ea1 is DONE. 37.2 kB processed. Open Job" ], "text/plain": [ "" @@ -1058,7 +1639,7 @@ { "data": { "text/html": [ - "Query job bb9987eb-aa37-42ca-bcf1-1ea575a147a8 is DONE. 0 Bytes processed. Open Job" + "Query job 9d56d05c-ad52-47ea-b911-0abf67b6489f is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1088,76 +1669,92 @@ " \n", " \n", " \n", - " animal\n", - " animal_1\n", - " distance\n", + " creatures\n", + " similarity score\n", " \n", " \n", " \n", " \n", - " 0\n", - " monkey\n", - " baboon\n", - " 0.747665\n", - " \n", - " \n", - " 1\n", - " spider\n", - " scorpion\n", - " 0.890909\n", - " \n", - " \n", " 2\n", - " salmon\n", - " tuna\n", - " 0.925461\n", - " \n", - " \n", - " 3\n", - " giraffe\n", - " elephant\n", - " 0.887858\n", + " baboons\n", + " 0.773411\n", " \n", " \n", " 4\n", - " sparrow\n", - " owl\n", - " 0.932959\n", + " chimpanzee\n", + " 0.781101\n", " \n", " \n", "\n", - "

5 rows × 3 columns

\n", - "[5 rows x 3 columns in total]" + "

2 rows × 2 columns

\n", + "[2 rows x 2 columns in total]" ], "text/plain": [ - " animal animal_1 distance\n", - "0 monkey baboon 0.747665\n", - "1 spider scorpion 0.890909\n", - "2 salmon tuna 0.925461\n", - "3 giraffe elephant 0.887858\n", - "4 sparrow owl 0.932959\n", + " creatures similarity score\n", + "2 baboons 0.773411\n", + "4 chimpanzee 0.781101\n", "\n", - "[5 rows x 3 columns]" + "[2 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model, score_column='distance')" + "df.semantics.search(\"creatures\", query=\"monkey\", top_k = 2, model = text_embedding_model, score_column='similarity score')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that we are using a text embedding model this time. This model generates embedding vectors for both your query as well as the values in the search space. The operator then uses BigQuery's built-in VECTOR_SEARCH function to find the nearest neighbors of your query.\n", + "\n", + "In addition, `score_column` is an optional parameter for storing the distances between the results and your query. If not set, the score column won't be attached to the result." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Similarity Join" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you have multiple queries to search in the same value space, you could use similarity join to simplify your call. For example:" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df1 = bpd.DataFrame({'animal': ['monkey', 'spider', 'salmon', 'giraffe', 'sparrow']})\n", + "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon', 'owl', 'elephant', 'tuna']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we want to pick the most related animal from `df2` for each value in `df1`, and this is how it's done:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 46e1cbb4-2b4a-4578-b3fd-7caba80d5dcc is DONE. 0 Bytes processed. Open Job" + "Query job 292332da-b5ec-45c2-aa34-322afbce8102 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1177,7 +1774,7 @@ { "data": { "text/html": [ - "Query job 356840f4-840c-41fc-9c9e-8bbaf9ffa02c is DONE. 4 Bytes processed. Open Job" + "Query job 95e9743a-2965-4d66-888c-49d5359253d0 is DONE. 10 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1189,7 +1786,7 @@ { "data": { "text/html": [ - "Query job 428d070e-fd5c-4b2f-b651-b3de9836c02a is DONE. 12.3 kB processed. Open Job" + "Query job 9bdaa6c6-0bdc-41c2-bf4d-aecd677a4991 is DONE. 30.8 kB processed. Open Job" ], "text/plain": [ "" @@ -1201,7 +1798,7 @@ { "data": { "text/html": [ - "Query job bf566989-7bd4-4560-952e-34d007ee1e7e is DONE. 0 Bytes processed. Open Job" + "Query job b148669b-1071-4128-8f7d-4c4c37b23d43 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1221,7 +1818,7 @@ { "data": { "text/html": [ - "Query job 92818df7-d4e9-4cea-884e-304126e78b71 is DONE. 4 Bytes processed. Open Job" + "Query job 0c09280c-69ad-4692-9559-c48a656d9cf2 is DONE. 10 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1233,7 +1830,7 @@ { "data": { "text/html": [ - "Query job e8619330-7b91-4ae2-99b3-f4386de4c512 is RUNNING. 
Open Job" + "Query job 9e31d888-d333-4bbd-9410-0444115e022b is RUNNING. Open Job" ], "text/plain": [ "" @@ -1245,7 +1842,7 @@ { "data": { "text/html": [ - "Query job be89eca7-462a-4b1c-95ed-0b0c031aaaac is DONE. 24.6 kB processed. Open Job" + "Query job bb8988ec-a2b7-4f5b-95c0-f8b89f53f4cc is DONE. 61.5 kB processed. Open Job" ], "text/plain": [ "" @@ -1257,7 +1854,7 @@ { "data": { "text/html": [ - "Query job 40dcd8ed-1262-459a-b6b3-7471722da078 is DONE. 0 Bytes processed. Open Job" + "Query job 6657a931-d0c5-4c8f-a12e-71719394740b is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1289,6 +1886,7 @@ " \n", " animal\n", " animal_1\n", + " distance\n", " \n", " \n", " \n", @@ -1296,170 +1894,103 @@ " 0\n", " monkey\n", " baboon\n", + " 0.747665\n", " \n", " \n", " 1\n", " spider\n", " scorpion\n", + " 0.890909\n", + " \n", + " \n", + " 2\n", + " salmon\n", + " tuna\n", + " 0.925461\n", + " \n", + " \n", + " 3\n", + " giraffe\n", + " elephant\n", + " 0.887858\n", + " \n", + " \n", + " 4\n", + " sparrow\n", + " owl\n", + " 0.932959\n", " \n", " \n", "\n", - "
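A minimal sketch of the optional `score_column` behavior described above: leaving it out simply means no distance column is attached to the result. The query string is invented; `df` is the creatures DataFrame and `text_embedding_model` is the embedding model created earlier in this notebook:

```python
# Hedged sketch: same search as above, but without attaching a similarity score column.
df.semantics.search(
    "creatures",
    query="bird",  # hypothetical query
    top_k=2,
    model=text_embedding_model,
)
```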

2 rows × 2 columns

\n", - "[2 rows x 2 columns in total]" + "

5 rows × 3 columns

\n", + "[5 rows x 3 columns in total]" ], "text/plain": [ - " animal animal_1\n", - "0 monkey baboon\n", - "1 spider scorpion\n", + " animal animal_1 distance\n", + "0 monkey baboon 0.747665\n", + "1 spider scorpion 0.890909\n", + "2 salmon tuna 0.925461\n", + "3 giraffe elephant 0.887858\n", + "4 sparrow owl 0.932959\n", "\n", - "[2 rows x 2 columns]" + "[5 rows x 3 columns]" ] }, - "execution_count": 15, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1 = bpd.DataFrame({'animal': ['monkey', 'spider']})\n", - "df2 = bpd.DataFrame({'animal': ['scorpion', 'baboon']})\n", - "\n", - "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model)" + "df1.semantics.sim_join(df2, left_on='animal', right_on='animal', top_k=1, model= text_embedding_model, score_column='distance')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Semantic Aggregation" + "!! **Important** Like semantic join, this operator can also be very expensive. To guard against unexpected processing of large dataset, use the `max_rows` parameter to provide a threshold. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic Cluster" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Semantic Cluster group similar values together. For example:" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ea1e5180-a13a-4ec7-a6b4-8eca042ac9a6 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MoviesYear
0Titanic1997
1The Wolf of Wall Street2013
2Killers of the Flower Moon2023
3The Revenant2015
4Inception2010
5Shuttle Island2010
6The Great Gatsby2013
\n", - "

7 rows × 2 columns

\n", - "
[7 rows x 2 columns in total]" - ], - "text/plain": [ - " Movies Year\n", - "0 Titanic 1997\n", - "1 The Wolf of Wall Street 2013\n", - "2 Killers of the Flower Moon 2023\n", - "3 The Revenant 2015\n", - "4 Inception 2010\n", - "5 Shuttle Island 2010\n", - "6 The Great Gatsby 2013\n", - "\n", - "[7 rows x 2 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df = bpd.DataFrame({\n", - " \"Movies\": [\n", - " \"Titanic\",\n", - " \"The Wolf of Wall Street\",\n", - " \"Killers of the Flower Moon\",\n", - " \"The Revenant\",\n", - " \"Inception\",\n", - " \"Shuttle Island\",\n", - " \"The Great Gatsby\",\n", - " ],\n", - " \"Year\": [1997, 2013, 2023, 2015, 2010, 2010, 2013],\n", - "})\n", - "df" + "df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to cluster these products into 3 groups, and this is how:" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query job 274df4fb-06ee-49d8-8e7f-2c7eaee3440f is DONE. 0 Bytes processed. Open Job" + "Query job 1b1d59bd-3bfe-4887-a595-f281d745cf2a is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1479,7 +2010,7 @@ { "data": { "text/html": [ - "Query job 878b41c8-6428-4f05-aa0b-dcba14761ac0 is DONE. 2 Bytes processed. Open Job" + "Query job 1f01891d-4177-450d-a647-85eb7461a1ec is DONE. 10 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1491,7 +2022,7 @@ { "data": { "text/html": [ - "Query job 5a909cb7-fcbf-43d5-aac2-79b7ba466dd3 is DONE. 16 Bytes processed. Open Job" + "Query job eee27d1d-e3c9-4efd-a0ae-bfb982e51cb1 is DONE. 30.8 kB processed. Open Job" ], "text/plain": [ "" @@ -1503,7 +2034,7 @@ { "data": { "text/html": [ - "Query job 10f97d30-101c-447c-876c-d329d3a6d89b is DONE. 28 Bytes processed. Open Job" + "Query job b98ed0ab-903f-4e4a-b450-d800cef3cabf is DONE. 30.7 kB processed. Open Job" ], "text/plain": [ "" @@ -1515,7 +2046,7 @@ { "data": { "text/html": [ - "Query job b1b94183-6ad4-4014-94da-7d585d45bc6d is DONE. 28 Bytes processed. Open Job" + "Query job ec7f3a21-cf30-4f5b-88e8-c7721da60066 is DONE. 138.9 kB processed. Open Job" ], "text/plain": [ "" @@ -1526,58 +2057,1028 @@ }, { "data": { + "text/html": [ + "Query job 0766e72b-cb0f-48e2-8475-91d55e95ff42 is DONE. 80 Bytes processed. Open Job" + ], "text/plain": [ - "0 Leonardo \n", - "\n", - "Name: Movies, dtype: string" + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9c6275c9-f368-47bf-a9ce-7e4b64f97b2c is DONE. 170 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ProductCluster ID
0Smartphone3
1Laptop3
2Coffee Maker1
3T-shirt2
4Jeans2
\n", + "

5 rows × 2 columns

\n", + "
[5 rows x 2 columns in total]" + ], + "text/plain": [ + " Product Cluster ID\n", + "0 Smartphone 3\n", + "1 Laptop 3\n", + "2 Coffee Maker 1\n", + "3 T-shirt 2\n", + "4 Jeans 2\n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This operator uses the the embedding model to generate vectors for each value, and then uses KMeans algorithm to group them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Performance Analyses" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we will use BigQuery's public data of hacker news to perform some heavy work. First, let's load 3K rows from the table:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 6622d251-728a-428f-8eb9-55c450460fe4 is DONE. 16.7 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 409b18ea-bf23-44ac-84eb-deb33feeaa89 is DONE. 1.2 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
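Returning to the clustering result for a moment: the cluster IDs come back as an ordinary column, so regular DataFrame operations can be chained on the output. A sketch (the `n_clusters` value and the follow-up count are illustrative, not from the notebook's own cells):

```python
# Hedged sketch: cluster the products into an assumed 2 groups, then count members per cluster.
clustered = df.semantics.cluster_by(
    column="Product",
    output_column="Cluster ID",
    model=text_embedding_model,
    n_clusters=2,
)
clustered.groupby("Cluster ID").count()
```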
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletextbyscoretimestamptype
0<NA>Well, most people aren&#x27;t alcoholics, so I...slipframe<NA>2021-06-26 02:37:56+00:00comment
1<NA>No, you don&#x27;t really <i>need</i> a smartp...vetinari<NA>2023-04-19 15:56:34+00:00comment
2<NA>It&#x27;s for the late Paul Allen RIP. Should&...lsr_ssri<NA>2018-10-16 01:07:55+00:00comment
3<NA>Yup they are dangerous. Be careful Donald Trump.Sven7<NA>2015-08-10 16:05:54+00:00comment
4<NA>Sure, it&#x27;s totally reasonable. Just point...nicoburns<NA>2020-10-05 11:20:51+00:00comment
5<NA>I wonder how long before special forces start ...autisticcurio<NA>2020-09-01 15:38:50+00:00comment
6The Impending NY Tech Apocalypse: Here's What ...<NA>gaoprea32011-09-27 22:43:27+00:00story
7<NA>Where would you relocate to? I'm assuming that...pavel_lishin<NA>2011-09-16 19:02:01+00:00comment
8Eureca beta is live. A place for your business...<NA>ricardos12012-10-15 13:09:32+00:00story
9<NA>It doesn’t work on Safari, and WebKit based br...archiewood<NA>2023-04-21 16:45:13+00:00comment
10<NA>I guess I don’t see the relevance. Vegans eat ...stevula<NA>2023-01-19 20:05:54+00:00comment
11<NA>I remember watching the American news media go...fareesh<NA>2019-06-17 19:49:17+00:00comment
12<NA>This article is incorrectly using the current ...stale2002<NA>2018-03-18 18:57:21+00:00comment
13<NA>In the firm I made my internship, we have to u...iserlohnmage<NA>2019-10-22 10:41:01+00:00comment
14<NA>The main reason it requires unsafe is for memo...comex<NA>2017-05-05 20:45:37+00:00comment
15Discord vs. IRC Rough Notes<NA>todsacerdoti482024-07-12 18:39:52+00:00story
16<NA>you have to auth again when you use apple pay.empath75<NA>2017-09-12 18:58:20+00:00comment
17<NA>It goes consumer grade, automotive, military, ...moftz<NA>2021-04-13 01:24:03+00:00comment
18<NA>I don&#x27;t have a link handy but the differe...KennyBlanken<NA>2022-05-13 16:08:38+00:00comment
19<NA>&gt; I don&#x27;t think the use case you menti...colanderman<NA>2017-09-28 05:16:06+00:00comment
20<NA>I think you need to watch it again, because yo...vladimirralev<NA>2018-12-07 11:25:52+00:00comment
21Oh dear: new Yahoo anti-spoofing measures brea...<NA>joshreads12014-04-08 13:29:50+00:00story
22How Much Warmer Was Your City in 2016?<NA>smb0612017-02-16 23:26:34+00:00story
23<NA>Except that they clearly never tried to incent...aenis<NA>2022-01-31 17:08:57+00:00comment
24Working Best at Coffee Shops<NA>GiraffeNecktie2492011-04-19 14:25:17+00:00story
\n", + "

25 rows × 6 columns

\n", + "
[3000 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 The Impending NY Tech Apocalypse: Here's What ... \n", + "7 \n", + "8 Eureca beta is live. A place for your business... \n", + "9 \n", + "10 \n", + "11 \n", + "12 \n", + "13 \n", + "14 \n", + "15 Discord vs. IRC Rough Notes \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 Oh dear: new Yahoo anti-spoofing measures brea... \n", + "22 How Much Warmer Was Your City in 2016? \n", + "23 \n", + "24 Working Best at Coffee Shops \n", + "\n", + " text by score \\\n", + "0 Well, most people aren't alcoholics, so I... slipframe \n", + "1 No, you don't really need a smartp... vetinari \n", + "2 It's for the late Paul Allen RIP. Should&... lsr_ssri \n", + "3 Yup they are dangerous. Be careful Donald Trump. Sven7 \n", + "4 Sure, it's totally reasonable. Just point... nicoburns \n", + "5 I wonder how long before special forces start ... autisticcurio \n", + "6 gaoprea 3 \n", + "7 Where would you relocate to? I'm assuming that... pavel_lishin \n", + "8 ricardos 1 \n", + "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", + "10 I guess I don’t see the relevance. Vegans eat ... stevula \n", + "11 I remember watching the American news media go... fareesh \n", + "12 This article is incorrectly using the current ... stale2002 \n", + "13 In the firm I made my internship, we have to u... iserlohnmage \n", + "14 The main reason it requires unsafe is for memo... comex \n", + "15 todsacerdoti 48 \n", + "16 you have to auth again when you use apple pay. empath75 \n", + "17 It goes consumer grade, automotive, military, ... moftz \n", + "18 I don't have a link handy but the differe... KennyBlanken \n", + "19 > I don't think the use case you menti... colanderman \n", + "20 I think you need to watch it again, because yo... vladimirralev \n", + "21 joshreads 1 \n", + "22 smb06 1 \n", + "23 Except that they clearly never tried to incent... 
aenis \n", + "24 GiraffeNecktie 249 \n", + "\n", + " timestamp type \n", + "0 2021-06-26 02:37:56+00:00 comment \n", + "1 2023-04-19 15:56:34+00:00 comment \n", + "2 2018-10-16 01:07:55+00:00 comment \n", + "3 2015-08-10 16:05:54+00:00 comment \n", + "4 2020-10-05 11:20:51+00:00 comment \n", + "5 2020-09-01 15:38:50+00:00 comment \n", + "6 2011-09-27 22:43:27+00:00 story \n", + "7 2011-09-16 19:02:01+00:00 comment \n", + "8 2012-10-15 13:09:32+00:00 story \n", + "9 2023-04-21 16:45:13+00:00 comment \n", + "10 2023-01-19 20:05:54+00:00 comment \n", + "11 2019-06-17 19:49:17+00:00 comment \n", + "12 2018-03-18 18:57:21+00:00 comment \n", + "13 2019-10-22 10:41:01+00:00 comment \n", + "14 2017-05-05 20:45:37+00:00 comment \n", + "15 2024-07-12 18:39:52+00:00 story \n", + "16 2017-09-12 18:58:20+00:00 comment \n", + "17 2021-04-13 01:24:03+00:00 comment \n", + "18 2022-05-13 16:08:38+00:00 comment \n", + "19 2017-09-28 05:16:06+00:00 comment \n", + "20 2018-12-07 11:25:52+00:00 comment \n", + "21 2014-04-08 13:29:50+00:00 story \n", + "22 2017-02-16 23:26:34+00:00 story \n", + "23 2022-01-31 17:08:57+00:00 comment \n", + "24 2011-04-19 14:25:17+00:00 story \n", + "...\n", + "\n", + "[3000 rows x 6 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", + "hacker_news" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, let's keep only the rows that have text content:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d98d2f69-ece3-4e58-b608-8114e59d275b is DONE. 1.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "2558" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news_with_texts = hacker_news[hacker_news['text'].isnull() == False]\n", + "len(hacker_news_with_texts)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's calculate the average text length in all the rows:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0211925e-0599-454d-a1d5-e145e6db79c4 is DONE. 1.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "390.7251759186865" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news_with_texts['text'].str.len().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it's LLM's turn. Let's keep the rows in which the text is talking about iPhone. This will take several minutes to finish." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d06977e3-3c83-4360-a920-4815dd1fbb0d is DONE. 1.0 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job af69426d-ebea-4338-a742-cef524ecca72 is DONE. 5.7 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/ml/llm.py:976: RuntimeWarning: Some predictions failed. Check column ml_generate_text_status for detailed status. You may want to filter the failed rows and retry.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job ccd12132-da6c-45ed-9bd3-a199d4c47118 is DONE. 1.2 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 824d2d55-cc04-4794-9f18-9eb67c639b19 is DONE. 1.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
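One practical way to keep this fast is to pre-filter with ordinary, cheap string operations before handing rows to the LLM. A sketch under the assumption that a simple keyword match is an acceptable first pass (the keyword step and the chaining are illustrative additions; the semantic filter call itself is the one used in this notebook):

```python
# Hedged sketch: cheap keyword pre-filter first, expensive semantic filter second.
maybe_iphone = hacker_news_with_texts[
    hacker_news_with_texts["text"].str.lower().str.contains("iphone")
]
maybe_iphone.semantics.filter("The {text} is mainly focused on iPhone", gemini_model)
```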
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletextbyscoretimestamptype
16<NA>you have to auth again when you use apple pay.empath75<NA>2017-09-12 18:58:20+00:00comment
413<NA>Well last time I got angry down votes for sayi...drieddust<NA>2021-01-11 19:27:27+00:00comment
797<NA>New iPhone should be announced on September. L...meerita<NA>2019-07-30 20:54:42+00:00comment
1484<NA>Why would this take a week? i(phone)OS was ori...TheOtherHobbes<NA>2021-06-08 09:25:24+00:00comment
1529<NA>&gt;or because Apple drama brings many clicks?...weberer<NA>2022-09-05 13:16:02+00:00comment
1561<NA>Location: Sydney, AU<p>Remote: Yes<p>Willing t...drEv0<NA>2016-05-03 23:55:26+00:00comment
\n", + "

6 rows × 6 columns

\n", + "
[6 rows x 6 columns in total]" + ], + "text/plain": [ + " title text by \\\n", + "16 you have to auth again when you use apple pay. empath75 \n", + "413 Well last time I got angry down votes for sayi... drieddust \n", + "797 New iPhone should be announced on September. L... meerita \n", + "1484 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1529 >or because Apple drama brings many clicks?... weberer \n", + "1561 Location: Sydney, AU
<p>Remote: Yes<p>
Willing t... drEv0 \n", + "\n", + " score timestamp type \n", + "16 2017-09-12 18:58:20+00:00 comment \n", + "413 2021-01-11 19:27:27+00:00 comment \n", + "797 2019-07-30 20:54:42+00:00 comment \n", + "1484 2021-06-08 09:25:24+00:00 comment \n", + "1529 2022-09-05 13:16:02+00:00 comment \n", + "1561 2016-05-03 23:55:26+00:00 comment \n", + "\n", + "[6 rows x 6 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iphone_comments=hacker_news_with_texts.semantics.filter(\"The {text} is mainly focused on iPhone\", gemini_model)\n", + "iphone_comments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The performance of the semantic operators depends on the length of your input as well as your quota. Here are my benchmarks for running the previous operation over data of different sizes.\n", + "\n", + "* 800 Rows -> 1m 21.3s\n", + "* 2550 Rows -> 5m 9s\n", + "* 8500 Rows -> 16m 34.4s\n", + "\n", + "These numbers can give you a general idea of how fast the operators run." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's use LLM to summarize the sentiments towards iPhone:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 6d7d7de8-0ec8-4966-a8d1-1dc09e61af6a is DONE. 1.6 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 5aa9d5a4-aa2f-4320-91ab-3cf7479ab883 is DONE. 12 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d64f52ab-6575-43a5-870d-418decc99e49 is DONE. 2.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9e11e90f-1ae1-4fee-b5bf-932436e4ea92 is DONE. 2.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletextbyscoretimestamptypesentiment
16<NA>you have to auth again when you use apple pay.empath75<NA>2017-09-12 18:58:20+00:00commentFrustrated, Negative, Annoyed
413<NA>Well last time I got angry down votes for sayi...drieddust<NA>2021-01-11 19:27:27+00:00commentFrustrated, feeling cheated.
797<NA>New iPhone should be announced on September. L...meerita<NA>2019-07-30 20:54:42+00:00commentExcited anticipation.
1484<NA>Why would this take a week? i(phone)OS was ori...TheOtherHobbes<NA>2021-06-08 09:25:24+00:00commentFrustrated, critical, obvious.
1529<NA>&gt;or because Apple drama brings many clicks?...weberer<NA>2022-09-05 13:16:02+00:00commentNegative, clickbait, controversy.
1561<NA>Location: Sydney, AU<p>Remote: Yes<p>Willing t...drEv0<NA>2016-05-03 23:55:26+00:00commentSeeking employment in Australia.
\n", + "

6 rows × 7 columns

\n", + "
[6 rows x 7 columns in total]" + ], + "text/plain": [ + " title text by \\\n", + "16 you have to auth again when you use apple pay. empath75 \n", + "413 Well last time I got angry down votes for sayi... drieddust \n", + "797 New iPhone should be announced on September. L... meerita \n", + "1484 Why would this take a week? i(phone)OS was ori... TheOtherHobbes \n", + "1529 >or because Apple drama brings many clicks?... weberer \n", + "1561 Location: Sydney, AU
<p>Remote: Yes<p>
Willing t... drEv0 \n", + "\n", + " score timestamp type \\\n", + "16 2017-09-12 18:58:20+00:00 comment \n", + "413 2021-01-11 19:27:27+00:00 comment \n", + "797 2019-07-30 20:54:42+00:00 comment \n", + "1484 2021-06-08 09:25:24+00:00 comment \n", + "1529 2022-09-05 13:16:02+00:00 comment \n", + "1561 2016-05-03 23:55:26+00:00 comment \n", + "\n", + " sentiment \n", + "16 Frustrated, Negative, Annoyed \n", + " \n", + "413 Frustrated, feeling cheated. \n", + " \n", + "797 Excited anticipation. \n", + " \n", + "1484 Frustrated, critical, obvious. \n", + " \n", + "1529 Negative, clickbait, controversy. \n", + " \n", + "1561 Seeking employment in Australia. \n", + " \n", + "\n", + "[6 rows x 7 columns]" ] }, - "execution_count": 17, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "agg_df = df.semantics.agg(\"Find the shared first name of actors in {Movies}. One word answer.\", model=gemini_model)\n", - "agg_df" + "iphone_comments.semantics.map(\"Summarize the sentiment of the {text}. Your answer should have at most 3 words\", output_column=\"sentiment\", model=gemini_model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Semantic Cluster" + "Here is another example: we count the number of rows whose authors have animals in their names." ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 29, "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "Query job e52f886a-1f87-45fc-990d-e66c23417a66 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", - " warnings.warn(\n" + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py:3577: UserWarning: Reading cached table from 2024-10-10 20:04:54.370456+00:00 to avoid incompatibilies with previous reads of this table. To read the latest version, set `use_cache=False` or close the current session with Session.close() or bigframes.pandas.close_session().\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "data": { "text/html": [ - "Query job 82ac6302-78a1-41f7-8665-769887a47d42 is DONE. 10 Bytes processed. Open Job" + "Query job ec9a3769-cc0a-4f45-9565-c1af98ae98e5 is DONE. 0 Bytes processed. Open Job" ], "text/plain": [ "" @@ -1589,19 +3090,368 @@ { "data": { "text/html": [ - "Query job cd42b04e-e9ea-4b56-a891-78608dbef215 is DONE. 30.8 kB processed. Open Job" - ], - "text/plain": [ - "" + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
   | title | text | by | score | timestamp | type
0  | <NA> | Well, most people aren't alcoholics, so I... | slipframe | <NA> | 2021-06-26 02:37:56+00:00 | comment
1  | <NA> | No, you don't really <i>need</i> a smartp... | vetinari | <NA> | 2023-04-19 15:56:34+00:00 | comment
2  | <NA> | It's for the late Paul Allen RIP. Should&... | lsr_ssri | <NA> | 2018-10-16 01:07:55+00:00 | comment
3  | <NA> | Yup they are dangerous. Be careful Donald Trump. | Sven7 | <NA> | 2015-08-10 16:05:54+00:00 | comment
4  | <NA> | Sure, it's totally reasonable. Just point... | nicoburns | <NA> | 2020-10-05 11:20:51+00:00 | comment
5  | <NA> | I wonder how long before special forces start ... | autisticcurio | <NA> | 2020-09-01 15:38:50+00:00 | comment
6  | The Impending NY Tech Apocalypse: Here's What ... | <NA> | gaoprea | 3 | 2011-09-27 22:43:27+00:00 | story
7  | <NA> | Where would you relocate to? I'm assuming that... | pavel_lishin | <NA> | 2011-09-16 19:02:01+00:00 | comment
8  | Eureca beta is live. A place for your business... | <NA> | ricardos | 1 | 2012-10-15 13:09:32+00:00 | story
9  | <NA> | It doesn’t work on Safari, and WebKit based br... | archiewood | <NA> | 2023-04-21 16:45:13+00:00 | comment
10 | <NA> | I guess I don’t see the relevance. Vegans eat ... | stevula | <NA> | 2023-01-19 20:05:54+00:00 | comment
11 | <NA> | I remember watching the American news media go... | fareesh | <NA> | 2019-06-17 19:49:17+00:00 | comment
12 | <NA> | This article is incorrectly using the current ... | stale2002 | <NA> | 2018-03-18 18:57:21+00:00 | comment
13 | <NA> | In the firm I made my internship, we have to u... | iserlohnmage | <NA> | 2019-10-22 10:41:01+00:00 | comment
14 | <NA> | The main reason it requires unsafe is for memo... | comex | <NA> | 2017-05-05 20:45:37+00:00 | comment
15 | Discord vs. IRC Rough Notes | <NA> | todsacerdoti | 48 | 2024-07-12 18:39:52+00:00 | story
16 | <NA> | you have to auth again when you use apple pay. | empath75 | <NA> | 2017-09-12 18:58:20+00:00 | comment
17 | <NA> | It goes consumer grade, automotive, military, ... | moftz | <NA> | 2021-04-13 01:24:03+00:00 | comment
18 | <NA> | I don't have a link handy but the differe... | KennyBlanken | <NA> | 2022-05-13 16:08:38+00:00 | comment
19 | <NA> | > I don't think the use case you menti... | colanderman | <NA> | 2017-09-28 05:16:06+00:00 | comment
20 | <NA> | I think you need to watch it again, because yo... | vladimirralev | <NA> | 2018-12-07 11:25:52+00:00 | comment
21 | Oh dear: new Yahoo anti-spoofing measures brea... | <NA> | joshreads | 1 | 2014-04-08 13:29:50+00:00 | story
22 | How Much Warmer Was Your City in 2016? | <NA> | smb06 | 1 | 2017-02-16 23:26:34+00:00 | story
23 | <NA> | Except that they clearly never tried to incent... | aenis | <NA> | 2022-01-31 17:08:57+00:00 | comment
24 | Working Best at Coffee Shops | <NA> | GiraffeNecktie | 249 | 2011-04-19 14:25:17+00:00 | story
\n", + "

25 rows × 6 columns

\n", + "
[3000 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "5 \n", + "6 The Impending NY Tech Apocalypse: Here's What ... \n", + "7 \n", + "8 Eureca beta is live. A place for your business... \n", + "9 \n", + "10 \n", + "11 \n", + "12 \n", + "13 \n", + "14 \n", + "15 Discord vs. IRC Rough Notes \n", + "16 \n", + "17 \n", + "18 \n", + "19 \n", + "20 \n", + "21 Oh dear: new Yahoo anti-spoofing measures brea... \n", + "22 How Much Warmer Was Your City in 2016? \n", + "23 \n", + "24 Working Best at Coffee Shops \n", + "\n", + " text by score \\\n", + "0 Well, most people aren't alcoholics, so I... slipframe \n", + "1 No, you don't really need a smartp... vetinari \n", + "2 It's for the late Paul Allen RIP. Should&... lsr_ssri \n", + "3 Yup they are dangerous. Be careful Donald Trump. Sven7 \n", + "4 Sure, it's totally reasonable. Just point... nicoburns \n", + "5 I wonder how long before special forces start ... autisticcurio \n", + "6 gaoprea 3 \n", + "7 Where would you relocate to? I'm assuming that... pavel_lishin \n", + "8 ricardos 1 \n", + "9 It doesn’t work on Safari, and WebKit based br... archiewood \n", + "10 I guess I don’t see the relevance. Vegans eat ... stevula \n", + "11 I remember watching the American news media go... fareesh \n", + "12 This article is incorrectly using the current ... stale2002 \n", + "13 In the firm I made my internship, we have to u... iserlohnmage \n", + "14 The main reason it requires unsafe is for memo... comex \n", + "15 todsacerdoti 48 \n", + "16 you have to auth again when you use apple pay. empath75 \n", + "17 It goes consumer grade, automotive, military, ... moftz \n", + "18 I don't have a link handy but the differe... KennyBlanken \n", + "19 > I don't think the use case you menti... colanderman \n", + "20 I think you need to watch it again, because yo... vladimirralev \n", + "21 joshreads 1 \n", + "22 smb06 1 \n", + "23 Except that they clearly never tried to incent... 
aenis \n", + "24 GiraffeNecktie 249 \n", + "\n", + " timestamp type \n", + "0 2021-06-26 02:37:56+00:00 comment \n", + "1 2023-04-19 15:56:34+00:00 comment \n", + "2 2018-10-16 01:07:55+00:00 comment \n", + "3 2015-08-10 16:05:54+00:00 comment \n", + "4 2020-10-05 11:20:51+00:00 comment \n", + "5 2020-09-01 15:38:50+00:00 comment \n", + "6 2011-09-27 22:43:27+00:00 story \n", + "7 2011-09-16 19:02:01+00:00 comment \n", + "8 2012-10-15 13:09:32+00:00 story \n", + "9 2023-04-21 16:45:13+00:00 comment \n", + "10 2023-01-19 20:05:54+00:00 comment \n", + "11 2019-06-17 19:49:17+00:00 comment \n", + "12 2018-03-18 18:57:21+00:00 comment \n", + "13 2019-10-22 10:41:01+00:00 comment \n", + "14 2017-05-05 20:45:37+00:00 comment \n", + "15 2024-07-12 18:39:52+00:00 story \n", + "16 2017-09-12 18:58:20+00:00 comment \n", + "17 2021-04-13 01:24:03+00:00 comment \n", + "18 2022-05-13 16:08:38+00:00 comment \n", + "19 2017-09-28 05:16:06+00:00 comment \n", + "20 2018-12-07 11:25:52+00:00 comment \n", + "21 2014-04-08 13:29:50+00:00 story \n", + "22 2017-02-16 23:26:34+00:00 story \n", + "23 2022-01-31 17:08:57+00:00 comment \n", + "24 2011-04-19 14:25:17+00:00 story \n", + "...\n", + "\n", + "[3000 rows x 6 columns]" ] }, + "execution_count": 29, "metadata": {}, - "output_type": "display_data" - }, + "output_type": "execute_result" + } + ], + "source": [ + "hacker_news = bpd.read_gbq(\"bigquery-public-data.hacker_news.full\")[['title', 'text', 'by', 'score', 'timestamp', 'type']].head(3000)\n", + "hacker_news" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ { "data": { "text/html": [ - "Query job dced08f2-12ee-4b52-b5b2-b7dd177dae12 is DONE. 30.7 kB processed. Open Job" + "Query job bff6ac16-2641-495a-a624-11883435e06c is DONE. 54.2 kB processed. Open Job" ], "text/plain": [ "" @@ -1610,10 +3460,18 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/core/__init__.py:112: PreviewWarning: Interpreting JSON column(s) as StringDtype. This behavior may change in future versions.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ - "Query job 5cbdac9b-f5dd-488c-8262-7a96f8501faa is DONE. 138.9 kB processed. Open Job" + "Query job aa390314-5b6e-4c7e-9502-8497dc27429a is DONE. 5.9 kB processed. Open Job" ], "text/plain": [ "" @@ -1622,10 +3480,18 @@ "metadata": {}, "output_type": "display_data" }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/sycai/src/python-bigquery-dataframes/bigframes/ml/llm.py:976: RuntimeWarning: Some predictions failed. Check column ml_generate_text_status for detailed status. You may want to filter the failed rows and retry.\n", + " warnings.warn(\n" + ] + }, { "data": { "text/html": [ - "Query job e30ff06e-b561-4ea2-b150-8cd91d4f827c is DONE. 80 Bytes processed. Open Job" + "Query job 3591e287-5c45-4ec2-a11b-afe95fd956e3 is DONE. 1.2 MB processed. Open Job" ], "text/plain": [ "" @@ -1637,7 +3503,7 @@ { "data": { "text/html": [ - "Query job 278d8a51-711a-42fe-86aa-408b2b44d4c7 is DONE. 170 Bytes processed. Open Job" + "Query job b0801c71-d44a-498a-ab3a-6302de52729f is DONE. 45.7 kB processed. 
Open Job" ], "text/plain": [ "" @@ -1667,61 +3533,347 @@ " \n", " \n", " \n", - " Product\n", - " Cluster ID\n", + " title\n", + " text\n", + " by\n", + " score\n", + " timestamp\n", + " type\n", " \n", " \n", " \n", " \n", - " 0\n", - " Smartphone\n", - " 3\n", + " 24\n", + " Working Best at Coffee Shops\n", + " <NA>\n", + " GiraffeNecktie\n", + " 249\n", + " 2011-04-19 14:25:17+00:00\n", + " story\n", " \n", " \n", - " 1\n", - " Laptop\n", - " 3\n", + " 96\n", + " <NA>\n", + " i resisted switching to chrome for months beca...\n", + " catshirt\n", + " <NA>\n", + " 2011-04-06 08:02:24+00:00\n", + " comment\n", " \n", " \n", - " 2\n", - " Coffee Maker\n", - " 1\n", + " 106\n", + " <NA>\n", + " I was about to say the same thing myself. For ...\n", + " geophile\n", + " <NA>\n", + " 2011-12-08 21:13:08+00:00\n", + " comment\n", " \n", " \n", - " 3\n", - " T-shirt\n", - " 2\n", + " 184\n", + " <NA>\n", + " I think it&#x27;s more than hazing. It may be ...\n", + " bayesianhorse\n", + " <NA>\n", + " 2015-06-18 16:42:53+00:00\n", + " comment\n", " \n", " \n", - " 4\n", - " Jeans\n", - " 2\n", + " 223\n", + " <NA>\n", + " I don&#x27;t understand why a beginner would s...\n", + " wolco\n", + " <NA>\n", + " 2019-02-03 14:35:43+00:00\n", + " comment\n", + " \n", + " \n", + " 284\n", + " <NA>\n", + " I leaerned more with one minute of this than a...\n", + " agumonkey\n", + " <NA>\n", + " 2016-07-16 06:19:39+00:00\n", + " comment\n", + " \n", + " \n", + " 297\n", + " <NA>\n", + " I've suggested a <i>rationale</i> for the tabo...\n", + " mechanical_fish\n", + " <NA>\n", + " 2008-12-17 04:42:02+00:00\n", + " comment\n", + " \n", + " \n", + " 306\n", + " <NA>\n", + " Do you have any reference for this?<p>I&#x27;m...\n", + " banashark\n", + " <NA>\n", + " 2023-11-13 19:57:00+00:00\n", + " comment\n", + " \n", + " \n", + " 316\n", + " <NA>\n", + " Default search scope is an option in the Finde...\n", + " kitsunesoba\n", + " <NA>\n", + " 2017-08-13 17:15:19+00:00\n", + " comment\n", + " \n", + " \n", + " 386\n", + " <NA>\n", + " Orthogonality and biology aren&#x27;t friends.\n", + " agumonkey\n", + " <NA>\n", + " 2016-04-24 16:33:41+00:00\n", + " comment\n", + " \n", + " \n", + " 391\n", + " <NA>\n", + " I chose some random physics book that was good...\n", + " prawn\n", + " <NA>\n", + " 2011-03-27 22:29:51+00:00\n", + " comment\n", + " \n", + " \n", + " 417\n", + " <NA>\n", + " Seeing this get huge on Twitter. It&#x27;s the...\n", + " shenanigoat\n", + " <NA>\n", + " 2016-01-09 03:04:22+00:00\n", + " comment\n", + " \n", + " \n", + " 421\n", + " <NA>\n", + " Looking through the comments there are a numbe...\n", + " moomin\n", + " <NA>\n", + " 2024-10-01 14:37:04+00:00\n", + " comment\n", + " \n", + " \n", + " 422\n", + " <NA>\n", + " Legacy media is a tough business. 
GBTC is payi...\n", + " arcticbull\n", + " <NA>\n", + " 2021-04-16 16:30:33+00:00\n", + " comment\n", + " \n", + " \n", + " 429\n", + " <NA>\n", + " Same thing if you sell unsafe food, yet we hav...\n", + " jabradoodle\n", + " <NA>\n", + " 2023-08-03 20:47:52+00:00\n", + " comment\n", + " \n", + " \n", + " 431\n", + " <NA>\n", + " There was briefly a thing called HSCSD (&quot;...\n", + " LeoPanthera\n", + " <NA>\n", + " 2019-02-11 19:49:29+00:00\n", + " comment\n", + " \n", + " \n", + " 439\n", + " <NA>\n", + " &gt; This article is a bit comical to read and...\n", + " lapcat\n", + " <NA>\n", + " 2023-01-02 16:00:49+00:00\n", + " comment\n", + " \n", + " \n", + " 446\n", + " <NA>\n", + " Large positions are most likely sold off in sm...\n", + " meowkit\n", + " <NA>\n", + " 2021-01-27 23:22:48+00:00\n", + " comment\n", + " \n", + " \n", + " 500\n", + " <NA>\n", + " A US-based VPN (or really any VPN) is only goi...\n", + " RandomBacon\n", + " <NA>\n", + " 2019-04-05 00:58:58+00:00\n", + " comment\n", + " \n", + " \n", + " 533\n", + " <NA>\n", + " <a href=\"https:&#x2F;&#x2F;codeberg.org&#x2F;A...\n", + " ElectronBadger\n", + " <NA>\n", + " 2023-12-13 08:13:15+00:00\n", + " comment\n", + " \n", + " \n", + " 589\n", + " <NA>\n", + " &gt; To me, a point of view is to do with your...\n", + " dragonwriter\n", + " <NA>\n", + " 2019-02-13 23:05:50+00:00\n", + " comment\n", + " \n", + " \n", + " 601\n", + " <NA>\n", + " So by using ADMIN_SL0T instead was it just set...\n", + " minitoar\n", + " <NA>\n", + " 2021-03-05 16:07:56+00:00\n", + " comment\n", + " \n", + " \n", + " 616\n", + " <NA>\n", + " I completely agree that this sets a bad preced...\n", + " save_ferris\n", + " <NA>\n", + " 2019-05-08 14:55:22+00:00\n", + " comment\n", + " \n", + " \n", + " 634\n", + " <NA>\n", + " How are guitar playing skills useful if you do...\n", + " Yajirobe\n", + " <NA>\n", + " 2019-06-14 10:18:19+00:00\n", + " comment\n", + " \n", + " \n", + " 647\n", + " <NA>\n", + " Outstanding!\n", + " cafard\n", + " <NA>\n", + " 2022-06-09 09:51:54+00:00\n", + " comment\n", " \n", " \n", "\n", - "

5 rows × 2 columns

\n", - "[5 rows x 2 columns in total]" - ], - "text/plain": [ - " Product Cluster ID\n", - "0 Smartphone 3\n", - "1 Laptop 3\n", - "2 Coffee Maker 1\n", - "3 T-shirt 2\n", - "4 Jeans 2\n", + "

25 rows × 6 columns

\n", + "[121 rows x 6 columns in total]" + ], + "text/plain": [ + " title \\\n", + "24 Working Best at Coffee Shops \n", + "96 \n", + "106 \n", + "184 \n", + "223 \n", + "284 \n", + "297 \n", + "306 \n", + "316 \n", + "386 \n", + "391 \n", + "417 \n", + "421 \n", + "422 \n", + "429 \n", + "431 \n", + "439 \n", + "446 \n", + "500 \n", + "533 \n", + "589 \n", + "601 \n", + "616 \n", + "634 \n", + "647 \n", "\n", - "[5 rows x 2 columns]" + " text by \\\n", + "24 GiraffeNecktie \n", + "96 i resisted switching to chrome for months beca... catshirt \n", + "106 I was about to say the same thing myself. For ... geophile \n", + "184 I think it's more than hazing. It may be ... bayesianhorse \n", + "223 I don't understand why a beginner would s... wolco \n", + "284 I leaerned more with one minute of this than a... agumonkey \n", + "297 I've suggested a rationale for the tabo... mechanical_fish \n", + "306 Do you have any reference for this?

I'm... banashark \n", + "316 Default search scope is an option in the Finde... kitsunesoba \n", + "386 Orthogonality and biology aren't friends. agumonkey \n", + "391 I chose some random physics book that was good... prawn \n", + "417 Seeing this get huge on Twitter. It's the... shenanigoat \n", + "421 Looking through the comments there are a numbe... moomin \n", + "422 Legacy media is a tough business. GBTC is payi... arcticbull \n", + "429 Same thing if you sell unsafe food, yet we hav... jabradoodle \n", + "431 There was briefly a thing called HSCSD ("... LeoPanthera \n", + "439 > This article is a bit comical to read and... lapcat \n", + "446 Large positions are most likely sold off in sm... meowkit \n", + "500 A US-based VPN (or really any VPN) is only goi... RandomBacon \n", + "533 2011-04-06 08:02:24+00:00 comment \n", + "106 2011-12-08 21:13:08+00:00 comment \n", + "184 2015-06-18 16:42:53+00:00 comment \n", + "223 2019-02-03 14:35:43+00:00 comment \n", + "284 2016-07-16 06:19:39+00:00 comment \n", + "297 2008-12-17 04:42:02+00:00 comment \n", + "306 2023-11-13 19:57:00+00:00 comment \n", + "316 2017-08-13 17:15:19+00:00 comment \n", + "386 2016-04-24 16:33:41+00:00 comment \n", + "391 2011-03-27 22:29:51+00:00 comment \n", + "417 2016-01-09 03:04:22+00:00 comment \n", + "421 2024-10-01 14:37:04+00:00 comment \n", + "422 2021-04-16 16:30:33+00:00 comment \n", + "429 2023-08-03 20:47:52+00:00 comment \n", + "431 2019-02-11 19:49:29+00:00 comment \n", + "439 2023-01-02 16:00:49+00:00 comment \n", + "446 2021-01-27 23:22:48+00:00 comment \n", + "500 2019-04-05 00:58:58+00:00 comment \n", + "533 2023-12-13 08:13:15+00:00 comment \n", + "589 2019-02-13 23:05:50+00:00 comment \n", + "601 2021-03-05 16:07:56+00:00 comment \n", + "616 2019-05-08 14:55:22+00:00 comment \n", + "634 2019-06-14 10:18:19+00:00 comment \n", + "647 2022-06-09 09:51:54+00:00 comment \n", + "...\n", + "\n", + "[121 rows x 6 columns]" ] }, - "execution_count": 19, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = bpd.DataFrame({'Product': ['Smartphone', 'Laptop', 'Coffee Maker', 'T-shirt', 'Jeans']})\n", - "\n", - "df.semantics.cluster_by(column='Product', output_column='Cluster ID', model=text_embedding_model, n_clusters=3)" + "hacker_news.semantics.filter(\"{by} contains animal name\", model=gemini_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are my performance numbers:\n", + "* 3000 rows -> 6m 9.2s\n", + "* 10000 rows -> 26m 42.4s" ] } ], @@ -1741,7 +3893,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index c0c3c58a3c..f56472f1b5 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -1217,6 +1217,15 @@ "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bf.close_session()" + ] + }, { "cell_type": "code", "execution_count": 24, diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index b59ccbb8ac..e79db455fc 100644 --- 
a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -1704,6 +1704,16 @@ "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delete the temporary cloud artifacts created during the bigframes session \n", + "bpd.close_session()" + ] + }, { "cell_type": "code", "execution_count": 25, diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index c383a22609..5a8239a42a 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -7,8 +7,9 @@ "source": [ "# README\n", "\n", - "This Notebook runs differently depending on the following environent variable:\n", - "1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`" + "This Notebook runs requiring the following environent variable:\n", + "1. GOOGLE_CLOUD_PROJECT - The google cloud project id.\n", + "1. BIGQUERY_LOCATION - can take values as per https://cloud.google.com/bigquery/docs/locations, e.g. `us`, `asia-east1`." ] }, { @@ -1420,8 +1421,8 @@ } ], "source": [ - "import bigframes.pandas as pd\n", - "help(pd.remote_function)" + "import bigframes.pandas as bpd\n", + "help(bpd.remote_function)" ] }, { @@ -1460,7 +1461,7 @@ } ], "source": [ - "@pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", + "@bpd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", "def get_bucket(num):\n", " if not num: return \"NA\"\n", " boundary = 4000\n", @@ -2784,6 +2785,22 @@ "source": [ "model.to_gbq(f\"{DATASET}.penguins_model\", replace=True)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpd.close_session()" + ] } ], "metadata": { diff --git a/notebooks/remote_functions/remote_function_usecases.ipynb b/notebooks/remote_functions/remote_function_usecases.ipynb index 9317e4b8fe..b897def4e8 100644 --- a/notebooks/remote_functions/remote_function_usecases.ipynb +++ b/notebooks/remote_functions/remote_function_usecases.ipynb @@ -25,7 +25,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Setup" + "# Set Up" ] }, { @@ -1379,6 +1379,22 @@ "df1 = df.assign(duration_cat=df[\"duration_minutes\"].apply(duration_category))\n", "df1.peek()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clean Up" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bpd.close_session()" + ] } ], "metadata": { diff --git a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb index 650bb92e50..a5769a2285 100644 --- a/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb +++ b/notebooks/remote_functions/remote_function_vertex_claude_model.ipynb @@ -452,12 +452,21 @@ "df" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Clean Up" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "bpd.close_session()" + ] } ], "metadata": { diff --git a/noxfile.py b/noxfile.py index 92f8acad7f..f537005e57 100644 --- 
a/noxfile.py +++ b/noxfile.py @@ -105,6 +105,7 @@ "system-3.9", "system-3.12", "cover", + "cleanup", ] # Error if a python version is missing @@ -432,7 +433,15 @@ def cover(session): (including system test runs), and then erases coverage data. """ session.install("coverage", "pytest-cov") - session.run("coverage", "report", "--show-missing", "--fail-under=90") + + # Create a coverage report that includes only the product code. + session.run( + "coverage", + "report", + "--include=bigframes/*", + "--show-missing", + "--fail-under=86", + ) # Make sure there is no dead code in our test directories. session.run( @@ -697,8 +706,8 @@ def system_prerelease(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def notebook(session: nox.Session): - GOOGLE_CLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") - if not GOOGLE_CLOUD_PROJECT: + google_cloud_project = os.getenv("GOOGLE_CLOUD_PROJECT") + if not google_cloud_project: session.error( "Set GOOGLE_CLOUD_PROJECT environment variable to run notebook session." ) @@ -744,6 +753,7 @@ def notebook(session: nox.Session): # The experimental notebooks imagine features that don't yet # exist or only exist as temporary prototypes. "notebooks/experimental/longer_ml_demo.ipynb", + "notebooks/experimental/semantic_operators.ipynb", # The notebooks that are added for more use cases, such as backing a # blog post, which may take longer to execute and need not be # continuously tested. @@ -937,3 +947,30 @@ def release_dry_run(session): ): env["PROJECT_ROOT"] = "." session.run(".kokoro/release-nightly.sh", "--dry-run", env=env) + + +@nox.session(python=DEFAULT_PYTHON_VERSION) +def cleanup(session): + """Clean up stale and/or temporary resources in the test project.""" + google_cloud_project = os.getenv("GOOGLE_CLOUD_PROJECT") + cleanup_options = [] + if google_cloud_project: + cleanup_options.append(f"--project-id={google_cloud_project}") + + # Cleanup a few stale (more than 12 hours old) temporary cloud run + # functions created by bigframems. This will help keeping the test GCP + # project within the "Number of functions" quota + # https://cloud.google.com/functions/quotas#resource_limits + recency_cutoff_hours = 12 + cleanup_count_per_location = 20 + cleanup_options.extend( + [ + f"--recency-cutoff={recency_cutoff_hours}", + "cleanup", + f"--number={cleanup_count_per_location}", + ] + ) + + session.install("-e", ".") + + session.run("python", "scripts/manage_cloud_functions.py", *cleanup_options) diff --git a/owlbot.py b/owlbot.py index b29384d462..3daf24e18c 100644 --- a/owlbot.py +++ b/owlbot.py @@ -61,7 +61,7 @@ # ---------------------------------------------------------------------------- # Encourage sharring all relevant versions in bug reports. -assert 1 == s.replace( +assert 1 == s.replace( # bug_report.md [".github/ISSUE_TEMPLATE/bug_report.md"], re.escape("#### Steps to reproduce\n"), textwrap.dedent( @@ -90,7 +90,7 @@ ) # Make sure build includes all necessary files. -assert 1 == s.replace( +assert 1 == s.replace( # MANIFEST.in ["MANIFEST.in"], re.escape("recursive-include google"), "recursive-include third_party/bigframes_vendored *\nrecursive-include bigframes", @@ -98,15 +98,15 @@ # Even though BigQuery DataFrames isn't technically a client library, we are # opting into Cloud RAD for docs hosting. 
-assert 1 == s.replace( +assert 1 == s.replace( # common.cfg [".kokoro/docs/common.cfg"], - re.escape('value: "docs-staging-v2-staging"'), + re.escape('value: "docs-staging-v2-dev"'), 'value: "docs-staging-v2"', ) # Use a custom table of contents since the default one isn't organized well # enough for the number of classes we have. -assert 1 == s.replace( +assert 1 == s.replace( # publish-docs.sh [".kokoro/publish-docs.sh"], ( re.escape("# upload docs") @@ -124,12 +124,19 @@ ) # Fixup the documentation. -assert 1 == s.replace( +assert 1 == s.replace( # docs/conf.py ["docs/conf.py"], re.escape("Google Cloud Client Libraries for bigframes"), "BigQuery DataFrames provides DataFrame APIs on the BigQuery engine", ) +# Don't omit `*/core/*.py` when counting test coverages +assert 1 == s.replace( # .coveragerc + [".coveragerc"], + re.escape(" */core/*.py\n"), + "", +) + # ---------------------------------------------------------------------------- # Samples templates # ---------------------------------------------------------------------------- diff --git a/samples/polars/noxfile.py b/samples/polars/noxfile.py index c36d5f2d81..494639d2fa 100644 --- a/samples/polars/noxfile.py +++ b/samples/polars/noxfile.py @@ -88,7 +88,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] # Any default versions that should be ignored. IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/samples/snippets/linear_regression_tutorial_test.py b/samples/snippets/linear_regression_tutorial_test.py index 0c861d1120..9a4908dbf5 100644 --- a/samples/snippets/linear_regression_tutorial_test.py +++ b/samples/snippets/linear_regression_tutorial_test.py @@ -37,6 +37,24 @@ def test_linear_regression(random_model_id: str) -> None: replace=True, ) # [END bigquery_dataframes_bqml_linear_regression] + # [START bigquery_dataframes_bqml_linear_evaluate] + import bigframes.pandas as bpd + + # Select the model you will be evaluating. `read_gbq_model` loads model data from + # BigQuery, but you could also use the `model` object from the previous steps. + model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.penguins_model" + ) + + # Score the model with input data defined in an earlier step to compare + # model predictions on feature_columns to true labels in label_columns. + score = model.score(feature_columns, label_columns) + # Expected output results: + # index mean_absolute_error mean_squared_error mean_squared_log_error median_absolute_error r2_score explained_variance + # 0 227.012237 81838.159892 0.00507 173.080816 0.872377 0.872377 + # 1 rows x columns + # [END bigquery_dataframes_bqml_linear_evaluate] assert feature_columns is not None assert label_columns is not None assert model is not None + assert score is not None diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index c36d5f2d81..494639d2fa 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -88,7 +88,7 @@ def get_pytest_env_vars() -> Dict[str, str]: # DO NOT EDIT - automatically generated. # All versions used to test samples. -ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] +ALL_VERSIONS = ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] # Any default versions that should be ignored. 
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"] diff --git a/scripts/manage_cloud_functions.py b/scripts/manage_cloud_functions.py index 6b69089089..145e178f4d 100644 --- a/scripts/manage_cloud_functions.py +++ b/scripts/manage_cloud_functions.py @@ -13,7 +13,7 @@ # limitations under the License. import argparse -from datetime import datetime +import datetime as dt import sys import time @@ -94,8 +94,10 @@ def summarize_gcfs(args): # Count how many GCFs are newer than a day recent = 0 for f in functions: - age = datetime.now() - datetime.fromtimestamp(f.update_time.timestamp()) - if age.days <= 0: + age = dt.datetime.now() - dt.datetime.fromtimestamp( + f.update_time.timestamp() + ) + if age.total_seconds() < args.recency_cutoff: recent += 1 region_counts[region] = (functions_count, recent) @@ -106,7 +108,7 @@ def summarize_gcfs(args): region = item[0] count, recent = item[1] print( - "{}: Total={}, Recent={}, OlderThanADay={}".format( + "{}: Total={}, Recent={}, Older={}".format( region, count, recent, count - recent ) ) @@ -120,8 +122,10 @@ def cleanup_gcfs(args): functions = get_bigframes_functions(args.project_id, region) count = 0 for f in functions: - age = datetime.now() - datetime.fromtimestamp(f.update_time.timestamp()) - if age.days > 0: + age = dt.datetime.now() - dt.datetime.fromtimestamp( + f.update_time.timestamp() + ) + if age.total_seconds() >= args.recency_cutoff: try: count += 1 GCF_CLIENT.delete_function(name=f.name) @@ -134,18 +138,27 @@ def cleanup_gcfs(args): # that for this clean-up, i.e. 6 mutations per minute. So wait for # 60/6 = 10 seconds time.sleep(10) + except google.api_core.exceptions.NotFound: + # Most likely the function was deleted otherwise + pass except google.api_core.exceptions.ResourceExhausted: # Stop deleting in this region for now print( - f"Cannot delete any more functions in region {region} due to quota exhaustion. Please try again later." + f"Failed to delete function in region {region} due to quota exhaustion. Pausing for 2 minutes." ) - break + time.sleep(120) def list_str(values): return [val for val in values.split(",") if val] +def get_project_from_environment(): + from google.cloud import bigquery + + return bigquery.Client().project + + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Manage cloud functions created to serve bigframes remote functions." @@ -154,9 +167,10 @@ def list_str(values): "-p", "--project-id", type=str, - required=True, + required=False, action="store", - help="GCP project-id.", + help="GCP project-id. If not provided, the project-id resolved by the" + " BigQuery client from the user environment would be used.", ) parser.add_argument( "-r", @@ -168,6 +182,19 @@ def list_str(values): help="Cloud functions region(s). If multiple regions, Specify comma separated (e.g. 
region1,region2)", ) + def hours_to_timedelta(hrs): + return dt.timedelta(hours=int(hrs)).total_seconds() + + parser.add_argument( + "-c", + "--recency-cutoff", + type=hours_to_timedelta, + required=False, + default=hours_to_timedelta("24"), + action="store", + help="Number of hours, cloud functions older than which should be considered stale (worthy of cleanup).", + ) + subparsers = parser.add_subparsers(title="subcommands", required=True) parser_summary = subparsers.add_parser( "summary", @@ -192,4 +219,10 @@ def list_str(values): parser_cleanup.set_defaults(func=cleanup_gcfs) args = parser.parse_args(sys.argv[1:]) + if args.project_id is None: + args.project_id = get_project_from_environment() + if args.project_id is None: + raise ValueError( + "Could not resolve a project. Plese set it via --project-id option." + ) args.func(args) diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 217cf71e0c..ba8f350c73 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -241,6 +241,11 @@ def dataset_id_permanent_tokyo( return dataset_id +@pytest.fixture(scope="session") +def table_id_not_created(dataset_id: str): + return f"{dataset_id}.{prefixer.create_prefix()}" + + @pytest.fixture(scope="session") def scalars_schema(bigquery_client: bigquery.Client): # TODO(swast): Add missing scalar data types such as BIGNUMERIC. diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index ba963837e5..cbc702018a 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -90,12 +90,14 @@ def test_columntransformer_standalone_fit_and_transform( def test_columntransformer_standalone_fit_transform(new_penguins_df): + # rename column to ensure robustness to column names that must be escaped + new_penguins_df = new_penguins_df.rename(columns={"species": "123 'species'"}) transformer = compose.ColumnTransformer( [ ( "onehot", preprocessing.OneHotEncoder(), - "species", + "123 'species'", ), ( "standard_scale", @@ -108,7 +110,7 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): "CASE WHEN {0} IS NULL THEN -1 ELSE LENGTH({0}) END", target_column="len_{0}", ), - "species", + "123 'species'", ), ( "identity", @@ -119,16 +121,16 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): ) result = transformer.fit_transform( - new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + new_penguins_df[["123 'species'", "culmen_length_mm", "flipper_length_mm"]] ).to_pandas() utils.check_pandas_df_schema_and_index( result, columns=[ - "onehotencoded_species", + "onehotencoded_123 'species'", "standard_scaled_culmen_length_mm", "standard_scaled_flipper_length_mm", - "len_species", + "len_123 'species'", "culmen_length_mm", "flipper_length_mm", ], @@ -194,7 +196,7 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ( "sql_scalar_column_transformer", compose.SQLScalarColumnTransformer( - "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END", + "CASE WHEN `species` IS NULL THEN -1 ELSE LENGTH(`species`) END", target_column="len_species", ), "?len_species", @@ -202,21 +204,21 @@ def test_columntransformer_save_load(new_penguins_df, dataset_id): ( "sql_scalar_column_transformer", compose.SQLScalarColumnTransformer( - "flipper_length_mm", target_column="flipper_length_mm" + "`flipper_length_mm`", target_column="flipper_length_mm" ), "?flipper_length_mm", ), ( "sql_scalar_column_transformer", compose.SQLScalarColumnTransformer( - "culmen_length_mm", 
target_column="culmen_length_mm" + "`culmen_length_mm`", target_column="culmen_length_mm" ), "?culmen_length_mm", ), ( "sql_scalar_column_transformer", compose.SQLScalarColumnTransformer( - "CASE WHEN species IS NULL THEN -1 ELSE LENGTH(species) END ", + "CASE WHEN `species` IS NULL THEN -1 ELSE LENGTH(`species`) END", target_column="Flex species Name", ), "?Flex species Name", diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index be5eea925f..c1e1cc19d9 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -146,10 +146,15 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): "ML.ONE_HOT_ENCODER(species, 'none', 1000000, 0) OVER() AS onehotencoded_species", ], ) + start_execution_count = model.session._metrics.execution_count + + transformed = model.transform(new_penguins_df) + + end_execution_count = model.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 - transformed = model.transform(new_penguins_df).to_pandas() utils.check_pandas_df_schema_and_index( - transformed, + transformed.to_pandas(), columns=["scaled_culmen_length_mm", "onehotencoded_species"], index=[1633, 1672, 1690], col_exact=False, diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index f593ac2983..273da97bc5 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -128,10 +128,21 @@ def test_unordered_mode_linear_regression_configure_fit_score_predict( ] ] y_train = df[["body_mass_g"]] + + start_execution_count = df._block._expr.session._metrics.execution_count model.fit(X_train, y_train) + end_execution_count = df._block._expr.session._metrics.execution_count + # The fit function initiates two queries: the first generates and caches + # the training data, while the second creates and fits the model. + assert end_execution_count - start_execution_count == 2 # Check score to ensure the model was fitted + start_execution_count = end_execution_count result = model.score(X_train, y_train).to_pandas() + end_execution_count = df._block._expr.session._metrics.execution_count + # The score function and to_pandas each initiate one query. 
+ assert end_execution_count - start_execution_count == 2 + utils.check_pandas_df_schema_and_index( result, columns=utils.ML_REGRESSION_METRICS, index=1 ) @@ -154,7 +165,10 @@ def test_unordered_mode_linear_regression_configure_fit_score_predict( assert reloaded_model.max_iterations == 20 assert reloaded_model.tol == 0.01 + start_execution_count = df._block._expr.session._metrics.execution_count pred = reloaded_model.predict(df) + end_execution_count = df._block._expr.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 utils.check_pandas_df_schema_and_index( pred, columns=("predicted_body_mass_g",), diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 2d7f4756af..37489d0e53 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -38,12 +38,12 @@ def test_semantics_experiment_off_raise_error(): pytest.param(2, None, id="two"), pytest.param(3, None, id="three"), pytest.param(4, None, id="four"), - pytest.param(5, "Year", id="two_w_cluster_column"), - pytest.param(6, "Year", id="three_w_cluster_column"), - pytest.param(7, "Year", id="four_w_cluster_column"), + pytest.param(5, "Years", id="two_w_cluster_column"), + pytest.param(6, "Years", id="three_w_cluster_column"), + pytest.param(7, "Years", id="four_w_cluster_column"), ], ) -def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_column): +def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ @@ -56,7 +56,7 @@ def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_c "Shuttle Island", "The Great Gatsby", ], - "Year": [1997, 2013, 2023, 2015, 2010, 2010, 2013], + "Years": [1997, 2013, 2023, 2015, 2010, 2010, 2013], }, session=session, ) @@ -73,6 +73,29 @@ def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_c pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) +def test_agg_w_int_column(session, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={ + "Movies": [ + "Killers of the Flower Moon", + "The Great Gatsby", + ], + "Years": [2023, 2013], + }, + session=session, + ) + instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." 
+ actual_s = df.semantics.agg( + instruction, + model=gemini_flash_model, + ).to_pandas() + + expected_s = pd.Series(["2013 \n"], dtype=dtypes.STRING_DTYPE) + expected_s.name = "Years" + pandas.testing.assert_series_equal(actual_s, expected_s, check_index_type=False) + + @pytest.mark.parametrize( "instruction", [ @@ -82,12 +105,12 @@ def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_c marks=pytest.mark.xfail(raises=ValueError), ), pytest.param( - "{city} is in the {non_existing_column}", + "{Movies} is good", id="non_existing_column", marks=pytest.mark.xfail(raises=ValueError), ), pytest.param( - "{city} is in the {country}", + "{Movies} is better than {Movies}", id="two_columns", marks=pytest.mark.xfail(raises=NotImplementedError), ), @@ -96,7 +119,14 @@ def test_agg_w_max_agg_rows(session, gemini_flash_model, max_agg_rows, cluster_c def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( - {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} + data={ + "Movies": [ + "Titanic", + "The Wolf of Wall Street", + "Killers of the Flower Moon", + ], + "Year": [1997, 2013, 2023], + }, ) df.semantics.agg(instruction, gemini_flash_model) @@ -195,15 +225,21 @@ def test_cluster_by_invalid_model(session, gemini_flash_model): def test_filter(session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( - data={"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, + data={ + "country": ["USA", "Germany"], + "city": ["Seattle", "Berlin"], + "year": [2023, 2024], + }, session=session, ) actual_df = df.semantics.filter( - "{city} is the capital of {country}", gemini_flash_model + "{city} is the capital of {country} in {year}", gemini_flash_model ).to_pandas() - expected_df = pd.DataFrame({"country": ["Germany"], "city": ["Berlin"]}, index=[1]) + expected_df = pd.DataFrame( + {"country": ["Germany"], "city": ["Berlin"], "year": [2024]}, index=[1] + ) pandas.testing.assert_frame_equal( actual_df, expected_df, check_dtype=False, check_index_type=False ) @@ -229,15 +265,26 @@ def test_filter_single_column_reference(session, gemini_flash_model): @pytest.mark.parametrize( "instruction", [ - "No column reference", - "{city} is in the {non_existing_column}", + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{city} is in the {non_existing_column}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), ], ) def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True - df = dataframe.DataFrame( - {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} - ) + df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) with pytest.raises(ValueError): df.semantics.filter(instruction, gemini_flash_model) @@ -249,7 +296,7 @@ def test_filter_invalid_model_raise_error(): {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with pytest.raises(ValueError): + with pytest.raises(TypeError): df.semantics.filter("{city} is the capital of {country}", None) @@ -259,12 +306,13 @@ def test_map(session, gemini_flash_model): data={ "ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"], + 
"gluten-free": [True, True], }, session=session, ) actual_df = df.semantics.map( - "What is the food made from {ingredient_1} and {ingredient_2}? One word only.", + "What is the {gluten-free} food made from {ingredient_1} and {ingredient_2}? One word only.", "food", gemini_flash_model, ).to_pandas() @@ -275,6 +323,7 @@ def test_map(session, gemini_flash_model): { "ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"], + "gluten-free": [True, True], "food": ["burger", "tofu"], } ) @@ -290,14 +339,28 @@ def test_map(session, gemini_flash_model): @pytest.mark.parametrize( "instruction", [ - "No column reference", - "What is the food made from {ingredient_1} and {non_existing_column}?}", + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "What is the food made from {ingredient_1} and {non_existing_column}?}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{id}", + id="invalid_type", + marks=pytest.mark.xfail(raises=TypeError), + ), ], ) def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): bigframes.options.experiments.semantic_operators = True df = dataframe.DataFrame( data={ + "id": [1, 2], "ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"], } @@ -316,7 +379,7 @@ def test_map_invalid_model_raise_error(): }, ) - with pytest.raises(ValueError): + with pytest.raises(TypeError): df.semantics.map( "What is the food made from {ingredient_1} and {ingredient_2}? One word only.", "food", @@ -324,7 +387,21 @@ def test_map_invalid_model_raise_error(): ) -def test_join(session, gemini_flash_model): +@pytest.mark.parametrize( + "instruction", + [ + pytest.param("{city} is in {country}", id="no_dataframe_reference"), + pytest.param("{left.city} is in {country}", id="has_left_dataframe_reference"), + pytest.param( + "{city} is in {right.country}", + id="has_right_dataframe_reference", + ), + pytest.param( + "{left.city} is in {right.country}", id="has_both_dataframe_references" + ), + ], +) +def test_join(instruction, session, gemini_flash_model): bigframes.options.experiments.semantic_operators = True cities = dataframe.DataFrame( data={ @@ -339,7 +416,7 @@ def test_join(session, gemini_flash_model): actual_df = cities.semantics.join( countries, - "{city} belongs to {country}", + instruction, gemini_flash_model, ).to_pandas() @@ -369,7 +446,7 @@ def test_self_join(session, gemini_flash_model): actual_df = animals.semantics.join( animals, - "{animal_left} is heavier than {animal_right}", + "{left.animal} is heavier than {right.animal}", gemini_flash_model, ).to_pandas() @@ -420,22 +497,12 @@ def test_join_data_too_large_raise_error(session, gemini_flash_model): id="ambiguous_column", ), pytest.param( - "{city_left} is in {country}", - r"Unnecessary suffix for .+", - id="suffix_on_left_unique_column", + "{right.city} is in {country}", r"Column .+ not found", id="wrong_prefix" ), pytest.param( - "{city} is in {region_right}", - r"Unnecessary suffix for .+", - id="suffix_on_right_unique_column", - ), - pytest.param( - "{city_right} is in {country}", r"Column .+ not found", id="wrong_suffix" - ), - pytest.param( - "{city} is in {continent_right}", + "{city} is in {right.continent}", r"Column .+ not found", - id="suffix_on_non_existing_column", + id="prefix_on_non_existing_column", ), ], ) @@ -462,7 +529,7 @@ def test_join_invalid_model_raise_error(): cities = dataframe.DataFrame({"city": 
["Seattle", "Berlin"]}) countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) - with pytest.raises(ValueError): + with pytest.raises(TypeError): cities.semantics.join(countries, "{city} is in {country}", None) @@ -528,6 +595,19 @@ def test_search_invalid_model_raises_error(session): df.semantics.search("creatures", "monkey", top_k=2, model=None) +def test_search_invalid_top_k_raises_error(session, text_embedding_generator): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + data={"creatures": ["salmon", "sea urchin", "baboons", "frog", "chimpanzee"]}, + session=session, + ) + + with pytest.raises(ValueError): + df.semantics.search( + "creatures", "monkey", top_k=0, model=text_embedding_generator + ) + + @pytest.mark.parametrize( "score_column", [ @@ -614,6 +694,27 @@ def test_sim_join_invalid_model_raises_error(session): ) +def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): + bigframes.options.experiments.semantic_operators = True + df1 = dataframe.DataFrame( + data={"creatures": ["salmon", "cat"]}, + session=session, + ) + df2 = dataframe.DataFrame( + data={"creatures": ["dog", "tuna"]}, + session=session, + ) + + with pytest.raises(ValueError): + df1.semantics.sim_join( + df2, + left_on="creatures", + right_on="creatures", + top_k=0, + model=text_embedding_generator, + ) + + def test_sim_join_data_too_large_raises_error(session, text_embedding_generator): bigframes.options.experiments.semantic_operators = True df1 = dataframe.DataFrame( @@ -633,3 +734,51 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) model=text_embedding_generator, max_rows=1, ) + + +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + "No column reference", + id="zero_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals}", + id="non_existing_column", + marks=pytest.mark.xfail(raises=ValueError), + ), + pytest.param( + "{Animals} and {Animals}", + id="two_columns", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + pytest.param( + "{index}", + id="preserved", + marks=pytest.mark.xfail(raises=ValueError), + ), + ], +) +def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame( + { + "Animals": ["Dog", "Cat", "Bird", "Horse"], + "ID": [1, 2, 3, 4], + "index": ["a", "b", "c", "d"], + } + ) + df.semantics.top_k(instruction, model=gemini_flash_model, k=2) + + +def test_top_k_invalid_k_raise_error(gemini_flash_model): + bigframes.options.experiments.semantic_operators = True + df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) + with pytest.raises(ValueError): + df.semantics.top_k( + "{Animals} are more popular as pets", + gemini_flash_model, + k=0, + ) diff --git a/tests/system/small/bigquery/test_vector_search.py b/tests/system/small/bigquery/test_vector_search.py index 4280c0a888..b6a6d59c4c 100644 --- a/tests/system/small/bigquery/test_vector_search.py +++ b/tests/system/small/bigquery/test_vector_search.py @@ -12,12 +12,108 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random +from typing import Any, cast, Dict, Iterable + +import google.cloud.bigquery import numpy as np import pandas as pd +import pyarrow +import pytest import bigframes.bigquery as bbq import bigframes.pandas as bpd +# Need at least 5,000 rows to create a vector index. 
+VECTOR_DF = pd.DataFrame( + { + "rowid": np.arange(9_999), + # 3D values, clustered around the three unit vector axes. + "my_embedding": pd.Series( + [ + [ + 1 + (random.random() - 0.5) if (row % 3) == 0 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 1 else 0, + 1 + (random.random() - 0.5) if (row % 3) == 2 else 0, + ] + for row in range(9_999) + ], + dtype=pd.ArrowDtype(pyarrow.list_(pyarrow.float64())), + ), + # Three groups of animal, vegetable, and mineral, corresponding to + # the embeddings above. + "mystery_word": [ + "aarvark", + "broccoli", + "calcium", + "dog", + "eggplant", + "ferrite", + "gopher", + "huckleberry", + "ice", + ] + * 1_111, + }, +) + + +@pytest.fixture +def vector_table_id( + bigquery_client: google.cloud.bigquery.Client, + # Use non-US location to ensure location autodetection works. + table_id_not_created: str, +): + table = google.cloud.bigquery.Table( + table_id_not_created, + [ + {"name": "rowid", "type": "INT64"}, + {"name": "my_embedding", "type": "FLOAT64", "mode": "REPEATED"}, + {"name": "mystery_word", "type": "STRING"}, + ], + ) + bigquery_client.create_table(table) + bigquery_client.load_table_from_json( + cast(Iterable[Dict[str, Any]], VECTOR_DF.to_dict(orient="records")), + table_id_not_created, + ).result() + yield table_id_not_created + bigquery_client.delete_table(table_id_not_created, not_found_ok=True) + + +def test_create_vector_index_ivf( + session, vector_table_id: str, bigquery_client: google.cloud.bigquery.Client +): + bbq.create_vector_index( + vector_table_id, + "my_embedding", + distance_type="cosine", + stored_column_names=["mystery_word"], + index_type="ivf", + ivf_options={"num_lists": 3}, + session=session, + ) + + # Check that the index was created successfully. + project_id, dataset_id, table_name = vector_table_id.split(".") + indexes = bigquery_client.query_and_wait( + f""" + SELECT index_catalog, index_schema, table_name, index_name, index_column_name + FROM `{project_id}`.`{dataset_id}`.INFORMATION_SCHEMA.VECTOR_INDEX_COLUMNS + WHERE table_name = '{table_name}'; + """ + ).to_dataframe() + + # There should only be one vector index. + assert len(indexes.index) == 1 + assert indexes["index_catalog"].iloc[0] == project_id + assert indexes["index_schema"].iloc[0] == dataset_id + assert indexes["table_name"].iloc[0] == table_name + assert indexes["index_column_name"].iloc[0] == "my_embedding" + + # If no name is specified, use the table name as the index name + assert indexes["index_name"].iloc[0] == table_name + def test_vector_search_basic_params_with_df(): search_query = bpd.DataFrame( diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 6b852e87af..65540e7e81 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -383,8 +383,14 @@ def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel): def test_model_register(ephemera_penguins_bqml_linear_model: core.BqmlModel): model = ephemera_penguins_bqml_linear_model + + start_execution_count = model.session._metrics.execution_count + model.register() + end_execution_count = model.session._metrics.execution_count + assert end_execution_count - start_execution_count == 1 + assert model.model.model_id is not None model_name = "bigframes_" + model.model.model_id # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. 
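The execution-count assertions added to the ml system tests above all repeat the same pattern: snapshot session._metrics.execution_count, run the operation, then assert on the delta. A minimal sketch of that pattern as a context-manager helper follows; it only assumes the session object and its _metrics.execution_count counter that the hunks above already use, and the helper name itself is hypothetical rather than part of the bigframes API.

import contextlib

@contextlib.contextmanager
def assert_query_count(session, expected: int):
    # Snapshot the query-execution counter before the block runs.
    start = session._metrics.execution_count
    yield
    # Compare the counter after the block with the expected number of queries.
    executed = session._metrics.execution_count - start
    assert executed == expected, f"expected {expected} queries, got {executed}"

# Usage mirroring test_model_register: registering a model should issue one query.
# with assert_query_count(model.session, 1):
#     model.register()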
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 78fed6b82f..40862b3086 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -264,7 +264,9 @@ def test_text_embedding_generator_multi_cols_predict_success( "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", + "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", ), ) def test_create_load_gemini_text_generator_model( @@ -292,7 +294,9 @@ def test_create_load_gemini_text_generator_model( "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", + "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", ), ) @pytest.mark.flaky(retries=2) @@ -315,7 +319,9 @@ def test_gemini_text_generator_predict_default_params_success( "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", + "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", ), ) @pytest.mark.flaky(retries=2) @@ -340,7 +346,9 @@ def test_gemini_text_generator_predict_with_params_success( "gemini-1.5-pro-preview-0514", "gemini-1.5-flash-preview-0514", "gemini-1.5-pro-001", + "gemini-1.5-pro-002", "gemini-1.5-flash-001", + "gemini-1.5-flash-002", ), ) @pytest.mark.flaky(retries=2) diff --git a/tests/system/small/ml/test_model_selection.py b/tests/system/small/ml/test_model_selection.py index e6b5f8cdc2..c1a1e073b9 100644 --- a/tests/system/small/ml/test_model_selection.py +++ b/tests/system/small/ml/test_model_selection.py @@ -434,10 +434,10 @@ def test_KFold_seeded_correct_rows(session, penguins_pandas_df_default_index): y = df["body_mass_g"] X_train, X_test, y_train, y_test = next(kf.split(X, y)) # type: ignore - X_train_sorted = X_train.to_pandas().sort_index() - X_test_sorted = X_test.to_pandas().sort_index() - y_train_sorted = y_train.to_pandas().sort_index() - y_test_sorted = y_test.to_pandas().sort_index() + X_train_sorted = X_train.to_pandas().sort_index() # type: ignore + X_test_sorted = X_test.to_pandas().sort_index() # type: ignore + y_train_sorted = y_train.to_pandas().sort_index() # type: ignore + y_test_sorted = y_test.to_pandas().sort_index() # type: ignore train_index: pd.Index = pd.Index( [ diff --git a/tests/system/small/ml/test_utils.py b/tests/system/small/ml/test_utils.py new file mode 100644 index 0000000000..0543f36852 --- /dev/null +++ b/tests/system/small/ml/test_utils.py @@ -0,0 +1,80 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pandas.testing +import pytest + +import bigframes.ml.utils as utils + +_DATA_FRAME = pd.DataFrame({"column": [1, 2, 3]}) +_SERIES = pd.Series([1, 2, 3], name="column") + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_to_dataframe(session, data): + bf_data = session.read_pandas(data) + + (actual_result,) = utils.convert_to_dataframe(bf_data) + + pandas.testing.assert_frame_equal( + actual_result.to_pandas(), + _DATA_FRAME, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_pandas_to_dataframe(data, session): + (actual_result,) = utils.convert_to_dataframe(data, session=session) + + pandas.testing.assert_frame_equal( + actual_result.to_pandas(), + _DATA_FRAME, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_to_series(session, data): + bf_data = session.read_pandas(data) + + (actual_result,) = utils.convert_to_series(bf_data) + + pandas.testing.assert_series_equal( + actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + "data", + [pytest.param(_DATA_FRAME, id="dataframe"), pytest.param(_SERIES, id="series")], +) +def test_convert_pandas_to_series(data, session): + (actual_result,) = utils.convert_to_series(data, session=session) + + pandas.testing.assert_series_equal( + actual_result.to_pandas(), _SERIES, check_index_type=False, check_dtype=False + ) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 2d5ae21bb4..cbf6e1269d 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -426,24 +426,12 @@ def test_dataframe_groupby_getitem_error( scalars_pandas_df_index, ): col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - with pytest.raises(KeyError, match="\"Columns not found: 'not_in_group'\""): - ( - scalars_df_index[col_names] - .groupby("string_col")["not_in_group"] - .min() - .to_pandas() - ) - - -def test_dataframe_groupby_getitem_multiple_columns_error( - scalars_df_index, - scalars_pandas_df_index, -): - col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - with pytest.raises(KeyError, match="\"Columns not found: 'col1', 'col2'\""): + with pytest.raises( + KeyError, match=r"Columns not found: 'not_in_group'. Did you mean 'string_col'?" + ): ( scalars_df_index[col_names] - .groupby("string_col")["col1", "col2"] + .groupby("bool_col")["not_in_group"] .min() .to_pandas() ) @@ -464,6 +452,23 @@ def test_dataframe_groupby_getitem_list( pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_getitem_list_error( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + with pytest.raises( + KeyError, + match=r"Columns not found: 'col1', 'float'. 
Did you mean 'bool_col', 'float64_col'?", + ): + ( + scalars_df_index[col_names] + .groupby("string_col")["col1", "float"] + .min() + .to_pandas() + ) + + def test_dataframe_groupby_nonnumeric_with_mean(): df = pd.DataFrame( { diff --git a/tests/system/small/test_null_index.py b/tests/system/small/test_null_index.py index a1e360f73d..c5be49a56b 100644 --- a/tests/system/small/test_null_index.py +++ b/tests/system/small/test_null_index.py @@ -21,6 +21,23 @@ from tests.system.utils import skip_legacy_pandas +def test_null_index_to_gbq(session, scalars_df_null_index, dataset_id_not_created): + dataset_id = dataset_id_not_created + destination_table = f"{dataset_id}.scalars_df_unindexed" + + result_table = scalars_df_null_index.to_gbq( + destination_table, clustering_columns=["int64_col"] + ) + assert ( + result_table == destination_table + if destination_table + else result_table is not None + ) + + loaded_scalars_df_index = session.read_gbq(result_table) + assert not loaded_scalars_df_index.empty + + def test_null_index_materialize(scalars_df_null_index, scalars_pandas_df_default_index): bf_result = scalars_df_null_index.to_pandas() pd.testing.assert_frame_equal( @@ -83,6 +100,23 @@ def test_null_index_aggregate(scalars_df_null_index, scalars_pandas_df_default_i ) +def test_null_index_binop_series_axis_0( + scalars_df_null_index, scalars_pandas_df_default_index +): + bf_result = ( + scalars_df_null_index[["int64_col", "int64_too"]] + .add(scalars_df_null_index["int64_col"], axis=0) + .to_pandas() + ) + pd_result = scalars_pandas_df_default_index[["int64_col", "int64_too"]].add( + scalars_pandas_df_default_index.int64_col, axis=0 + ) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + def test_null_index_groupby_aggregate( scalars_df_null_index, scalars_pandas_df_default_index ): @@ -139,6 +173,25 @@ def test_null_index_merge_left_null_index_object( assert got.shape == expected.shape +@skip_legacy_pandas +@pytest.mark.parametrize( + ("expr",), + [ + ("new_col = int64_col + int64_too",), + ("new_col = (rowindex > 3) | bool_col",), + ("int64_too = bool_col\nnew_col2 = rowindex",), + ], +) +def test_null_index_df_eval( + scalars_df_null_index, scalars_pandas_df_default_index, expr +): + + bf_result = scalars_df_null_index.eval(expr).to_pandas() + pd_result = scalars_pandas_df_default_index.eval(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + def test_null_index_merge_right_null_index_object( scalars_df_null_index, scalars_df_default_index, scalars_pandas_df_default_index ): diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index f1c60664a1..c29f91bc5c 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -23,6 +23,7 @@ import pandas as pd import pyarrow as pa # type: ignore import pytest +import shapely # type: ignore import bigframes.pandas import bigframes.series as series @@ -213,6 +214,43 @@ def test_series_construct_from_list_escaped_strings(): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +def test_series_construct_geodata(): + pd_series = pd.Series( + [shapely.Point(1, 1), shapely.Point(2, 2), shapely.Point(3, 3)], + dtype=gpd.array.GeometryDtype(), + ) + + series = bigframes.pandas.Series(pd_series) + + pd.testing.assert_series_equal( + pd_series, series.to_pandas(), check_index_type=False + ) + + +@pytest.mark.parametrize( + ["data", "index"], + [ + (["a", "b", "c"], None), + ([1, 2, 3], ["a", "b", 
"c"]), + ([1, 2, None], ["a", "b", "c"]), + ([1, 2, 3], [pd.NA, "b", "c"]), + ([numpy.nan, 2, 3], ["a", "b", "c"]), + ], +) +def test_series_items(data, index): + bf_series = series.Series(data, index=index) + pd_series = pd.Series(data, index=index) + + for (bf_index, bf_value), (pd_index, pd_value) in zip( + bf_series.items(), pd_series.items() + ): + # TODO(jialuo): Remove the if conditions after b/373699458 is addressed. + if not pd.isna(bf_index) or not pd.isna(pd_index): + assert bf_index == pd_index + if not pd.isna(bf_value) or not pd.isna(pd_value): + assert bf_value == pd_value + + @pytest.mark.parametrize( ["col_name", "expected_dtype"], [ diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py index 10ce1fd09e..248b6796e2 100644 --- a/tests/unit/core/test_bf_utils.py +++ b/tests/unit/core/test_bf_utils.py @@ -26,7 +26,7 @@ def test_get_standardized_ids_columns(): utils.UNNAMED_COLUMN_ID, "duplicate", "duplicate_1", - "with_space", + "with space", ] assert idx_ids == [] @@ -35,7 +35,7 @@ def test_get_standardized_ids_indexes(): col_labels = ["duplicate"] idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] - col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels, strict=True) assert col_ids == ["duplicate_2"] assert idx_ids == [ @@ -53,4 +53,4 @@ def test_get_standardized_ids_tuple(): col_ids, _ = utils.get_standardized_ids(col_labels) - assert col_ids == ["('foo',_1)", "('foo',_2)", "('bar',_1)"] + assert col_ids == ["('foo', 1)", "('foo', 2)", "('bar', 1)"] diff --git a/tests/unit/core/test_rewrite.py b/tests/unit/core/test_rewrite.py new file mode 100644 index 0000000000..0965238fcd --- /dev/null +++ b/tests/unit/core/test_rewrite.py @@ -0,0 +1,57 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest.mock as mock + +import google.cloud.bigquery + +import bigframes.core as core +import bigframes.core.nodes as nodes +import bigframes.core.rewrite as rewrites +import bigframes.core.schema + +TABLE_REF = google.cloud.bigquery.TableReference.from_string("project.dataset.table") +SCHEMA = ( + google.cloud.bigquery.SchemaField("col_a", "INTEGER"), + google.cloud.bigquery.SchemaField("col_b", "INTEGER"), +) +TABLE = google.cloud.bigquery.Table( + table_ref=TABLE_REF, + schema=SCHEMA, +) +FAKE_SESSION = mock.create_autospec(bigframes.Session, instance=True) +type(FAKE_SESSION)._strictly_ordered = mock.PropertyMock(return_value=True) +LEAF = core.ArrayValue.from_table( + session=FAKE_SESSION, + table=TABLE, + schema=bigframes.core.schema.ArraySchema.from_bq_table(TABLE), +).node + + +def test_rewrite_noop_slice(): + slice = nodes.SliceNode(LEAF, None, None) + result = rewrites.rewrite_slice(slice) + assert result == LEAF + + +def test_rewrite_reverse_slice(): + slice = nodes.SliceNode(LEAF, None, None, -1) + result = rewrites.rewrite_slice(slice) + assert result == nodes.ReversedNode(LEAF) + + +def test_rewrite_filter_slice(): + slice = nodes.SliceNode(LEAF, None, 2) + result = rewrites.rewrite_slice(slice) + assert list(result.fields) == list(LEAF.fields) + assert isinstance(result.child, nodes.FilterNode) diff --git a/tests/unit/core/test_slices.py b/tests/unit/core/test_slices.py new file mode 100644 index 0000000000..745db45eab --- /dev/null +++ b/tests/unit/core/test_slices.py @@ -0,0 +1,61 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import bigframes.core.slices as slices + + +@pytest.mark.parametrize( + ["slice", "input_rows", "expected"], + [ + ((1, 2, 3), 3, 1), + ((-3, 400, None), 401, 2), + ((5, 505, None), 300, 295), + ((1, 10, 4), 10, 3), + ((1, 9, 4), 10, 2), + ((-1, -10, -4), 10, 3), + ((-1, -10, 4), 10, 0), + ((99, 100, 1), 9, 0), + ], +) +def test_slice_row_count(slice, input_rows, expected): + assert expected == slices.slice_output_rows(slice, input_rows) + + +@pytest.mark.parametrize( + ["slice", "input_rows", "expected"], + [ + ((1, 2, 3), 3, (1, 2, 3)), + ((-3, 400, None), 401, (-3, 400, None)), + ((5, 505, None), 300, (5, None, None)), + ((99, 100, 1), 9, (99, None, None)), + ], +) +def test_remove_unused_parts(slice, input_rows, expected): + assert expected == slices.remove_unused_parts(slice, input_rows) + + +@pytest.mark.parametrize( + ["slice", "input_rows", "expected"], + [ + ((1, 2, 3), 3, (1, 2, 3)), + ((-3, 400, None), 401, (398, 400, 1)), + ((5, 505, None), 300, (5, 300, 1)), + ((None, None, None), 300, (0, None, 1)), + ((None, None, -1), 300, (299, None, -1)), + ], +) +def test_to_forward_offsets(slice, input_rows, expected): + assert expected == slices.to_forward_offsets(slice, input_rows) diff --git a/tests/unit/ml/test_compose.py b/tests/unit/ml/test_compose.py index 7643f76e56..395296f3e4 100644 --- a/tests/unit/ml/test_compose.py +++ b/tests/unit/ml/test_compose.py @@ -258,8 +258,8 @@ def test_customtransformer_compile_sql(mock_X): ident_trafo = SQLScalarColumnTransformer("{0}", target_column="ident_{0}") sqls = ident_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ - "col1 AS ident_col1", - "col2 AS ident_col2", + "`col1` AS `ident_col1`", + "`col2` AS `ident_col2`", ] len1_trafo = SQLScalarColumnTransformer( @@ -267,8 +267,8 @@ def test_customtransformer_compile_sql(mock_X): ) sqls = len1_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ - "CASE WHEN col1 IS NULL THEN -5 ELSE LENGTH(col1) END AS len1_col1", - "CASE WHEN col2 IS NULL THEN -5 ELSE LENGTH(col2) END AS len1_col2", + "CASE WHEN `col1` IS NULL THEN -5 ELSE LENGTH(`col1`) END AS `len1_col1`", + "CASE WHEN `col2` IS NULL THEN -5 ELSE LENGTH(`col2`) END AS `len1_col2`", ] len2_trafo = SQLScalarColumnTransformer( @@ -276,8 +276,8 @@ def test_customtransformer_compile_sql(mock_X): ) sqls = len2_trafo._compile_to_sql(X=mock_X, columns=["col1", "col2"]) assert sqls == [ - "CASE WHEN col1 IS NULL THEN 99 ELSE LENGTH(col1) END AS len2_col1", - "CASE WHEN col2 IS NULL THEN 99 ELSE LENGTH(col2) END AS len2_col2", + "CASE WHEN `col1` IS NULL THEN 99 ELSE LENGTH(`col1`) END AS `len2_col1`", + "CASE WHEN `col2` IS NULL THEN 99 ELSE LENGTH(`col2`) END AS `len2_col2`", ] @@ -524,11 +524,11 @@ def test_columntransformer_compile_to_sql(mock_X): ) sqls = column_transformer._compile_to_sql(mock_X) assert sqls == [ - "culmen_length_mm AS ident_culmen_length_mm", - "flipper_length_mm AS ident_flipper_length_mm", - "CASE WHEN species IS NULL THEN -2 ELSE LENGTH(species) END AS len1_species", - "CASE WHEN species IS NULL THEN 99 ELSE LENGTH(species) END AS len2_species", - "ML.LABEL_ENCODER(species, 1000000, 0) OVER() AS labelencoded_species", + "`culmen_length_mm` AS `ident_culmen_length_mm`", + "`flipper_length_mm` AS `ident_flipper_length_mm`", + "CASE WHEN `species` IS NULL THEN -2 ELSE LENGTH(`species`) END AS `len1_species`", + "CASE WHEN `species` IS NULL THEN 99 ELSE LENGTH(`species`) END AS `len2_species`", + "ML.LABEL_ENCODER(`species`, 1000000, 0) OVER() AS 
`labelencoded_species`", ] @@ -548,13 +548,13 @@ def test_columntransformer_flexible_column_names(mock_X): ["culmen_length_mm", "flipper_length_mm"], ), ("len1_trafo", len1_transformer, ["species shortname"]), - ("len2_trafo", len2_transformer, ["`species longname`"]), + ("len2_trafo", len2_transformer, ["species longname"]), ] ) sqls = column_transformer._compile_to_sql(mock_X) assert sqls == [ - "culmen_length_mm AS `ident culmen_length_mm`", - "flipper_length_mm AS `ident flipper_length_mm`", + "`culmen_length_mm` AS `ident culmen_length_mm`", + "`flipper_length_mm` AS `ident flipper_length_mm`", "CASE WHEN `species shortname` IS NULL THEN -2 ELSE LENGTH(`species shortname`) END AS `len1_species shortname`", "CASE WHEN `species longname` IS NULL THEN 99 ELSE LENGTH(`species longname`) END AS `len2_species longname`", ] @@ -576,6 +576,6 @@ def test_columntransformer_extract_from_bq_model_flexnames(bq_model_flexnames): SQLScalarColumnTransformer(sql='culmen_length_mm', target_column='Flex Name culmen_length_mm'), '?Flex Name culmen_length_mm'), ('sql_scalar_column_transformer', - SQLScalarColumnTransformer(sql='flipper_length_mm ', target_column='Flex Name flipper_length_mm'), + SQLScalarColumnTransformer(sql='flipper_length_mm', target_column='Flex Name flipper_length_mm'), '?Flex Name flipper_length_mm')])""" assert expected == actual diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index aa7e919b24..ce05011546 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -36,6 +36,7 @@ def mock_session(): TEMP_MODEL_ID.project, TEMP_MODEL_ID.dataset_id ) mock_session._bq_kms_key_name = None + mock_session._metrics = None query_job = mock.create_autospec(bigquery.QueryJob) type(query_job).destination = mock.PropertyMock( @@ -106,7 +107,7 @@ def test_linear_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n optimize_strategy='auto_strategy',\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" ) @@ -116,7 +117,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="auto_strategy",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LINEAR_REG',\n data_split_method='NO_SPLIT',\n 
optimize_strategy='auto_strategy',\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" ) @@ -126,7 +127,7 @@ def test_linear_regression_predict(mock_session, bqml_model, mock_X): model.predict(mock_X) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_id`,\n (input_X_sql))", + "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", index_col=["index_column_id"], ) @@ -137,7 +138,7 @@ def test_linear_regression_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_id`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" ) @@ -149,7 +150,7 @@ def test_logistic_regression_default_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy="auto_strategy",\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=True,\n auto_class_weights=False,\n optimize_strategy='auto_strategy',\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy='line_search',\n min_rel_progress=0.01,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" ) @@ -171,7 +172,7 @@ def test_logistic_regression_params_fit( model.fit(mock_X, mock_y) mock_session._start_query_ml_ddl.assert_called_once_with( - 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy="batch_gradient_descent",\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy="constant",\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' + "CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type='LOGISTIC_REG',\n data_split_method='NO_SPLIT',\n fit_intercept=False,\n auto_class_weights=True,\n optimize_strategy='batch_gradient_descent',\n l2_reg=0.2,\n max_iterations=30,\n learn_rate_strategy='constant',\n min_rel_progress=0.02,\n calculate_p_values=False,\n enable_global_explain=False,\n l1_reg=0.2,\n learn_rate=0.2,\n INPUT_LABEL_COLS=['input_column_label'])\nAS input_X_y_sql" ) @@ -181,7 +182,7 @@ def test_logistic_regression_predict(mock_session, bqml_model, mock_X): model.predict(mock_X) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.PREDICT(MODEL `model_project.model_dataset.model_id`,\n (input_X_sql))", + "SELECT * FROM ML.PREDICT(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_sql))", 
index_col=["index_column_id"], ) @@ -192,5 +193,5 @@ def test_logistic_regression_score(mock_session, bqml_model, mock_X, mock_y): model.score(mock_X, mock_y) mock_session.read_gbq.assert_called_once_with( - "SELECT * FROM ML.EVALUATE(MODEL `model_project.model_dataset.model_id`,\n (input_X_y_sql))" + "SELECT * FROM ML.EVALUATE(MODEL `model_project`.`model_dataset`.`model_id`,\n (input_X_y_sql))" ) diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index cdf2d0b2e4..ee0821dfe9 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -34,7 +34,9 @@ def model_creation_sql_generator() -> ml_sql.ModelCreationSqlGenerator: @pytest.fixture(scope="session") def model_manipulation_sql_generator() -> ml_sql.ModelManipulationSqlGenerator: return ml_sql.ModelManipulationSqlGenerator( - model_name="my_project_id.my_dataset_id.my_model_id" + model_ref=bigquery.ModelReference.from_string( + "my_project_id.my_dataset_id.my_model_id" + ) ) @@ -53,7 +55,7 @@ def test_ml_arima_coefficients( sql = model_manipulation_sql_generator.ml_arima_coefficients() assert ( sql - == """SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + == """SELECT * FROM ML.ARIMA_COEFFICIENTS(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`)""" ) @@ -64,8 +66,8 @@ def test_options_correct(base_sql_generator: ml_sql.BaseSqlGenerator): assert ( sql == """OPTIONS( - model_type="lin_reg", - input_label_cols=["col_a"], + model_type='lin_reg', + input_label_cols=['col_a'], l1_reg=0.6)""" ) @@ -89,42 +91,42 @@ def test_standard_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_standard_scaler("col_a", "scaled_col_a") - assert sql == "ML.STANDARD_SCALER(col_a) OVER() AS scaled_col_a" + assert sql == "ML.STANDARD_SCALER(`col_a`) OVER() AS `scaled_col_a`" def test_max_abs_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_max_abs_scaler("col_a", "scaled_col_a") - assert sql == "ML.MAX_ABS_SCALER(col_a) OVER() AS scaled_col_a" + assert sql == "ML.MAX_ABS_SCALER(`col_a`) OVER() AS `scaled_col_a`" def test_min_max_scaler_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_min_max_scaler("col_a", "scaled_col_a") - assert sql == "ML.MIN_MAX_SCALER(col_a) OVER() AS scaled_col_a" + assert sql == "ML.MIN_MAX_SCALER(`col_a`) OVER() AS `scaled_col_a`" def test_imputer_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_imputer("col_a", "mean", "scaled_col_a") - assert sql == "ML.IMPUTER(col_a, 'mean') OVER() AS scaled_col_a" + assert sql == "ML.IMPUTER(`col_a`, 'mean') OVER() AS `scaled_col_a`" def test_k_bins_discretizer_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_bucketize("col_a", [1, 2, 3, 4], "scaled_col_a") - assert sql == "ML.BUCKETIZE(col_a, [1, 2, 3, 4], FALSE) AS scaled_col_a" + assert sql == "ML.BUCKETIZE(`col_a`, [1, 2, 3, 4], FALSE) AS `scaled_col_a`" def test_k_bins_discretizer_quantile_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_quantile_bucketize("col_a", 5, "scaled_col_a") - assert sql == "ML.QUANTILE_BUCKETIZE(col_a, 5) OVER() AS scaled_col_a" + assert sql == "ML.QUANTILE_BUCKETIZE(`col_a`, 5) OVER() AS `scaled_col_a`" def test_one_hot_encoder_correct( @@ -134,7 +136,8 @@ def test_one_hot_encoder_correct( "col_a", "none", 1000000, 0, "encoded_col_a" ) assert ( - sql == "ML.ONE_HOT_ENCODER(col_a, 'none', 1000000, 0) OVER() AS 
encoded_col_a" + sql + == "ML.ONE_HOT_ENCODER(`col_a`, 'none', 1000000, 0) OVER() AS `encoded_col_a`" ) @@ -142,14 +145,14 @@ def test_label_encoder_correct( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_label_encoder("col_a", 1000000, 0, "encoded_col_a") - assert sql == "ML.LABEL_ENCODER(col_a, 1000000, 0) OVER() AS encoded_col_a" + assert sql == "ML.LABEL_ENCODER(`col_a`, 1000000, 0) OVER() AS `encoded_col_a`" def test_polynomial_expand( base_sql_generator: ml_sql.BaseSqlGenerator, ): sql = base_sql_generator.ml_polynomial_expand(["col_a", "col_b"], 2, "poly_exp") - assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(col_a, col_b), 2) AS poly_exp" + assert sql == "ML.POLYNOMIAL_EXPAND(STRUCT(`col_a`, `col_b`), 2) AS `poly_exp`" def test_create_model_correct( @@ -167,7 +170,7 @@ def test_create_model_correct( sql == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_model_correct_sql` OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2) AS input_X_y_sql""" ) @@ -195,7 +198,7 @@ def test_create_model_transform_correct( ML.STANDARD_SCALER(col_a) OVER(col_a) AS scaled_col_a, ML.ONE_HOT_ENCODER(col_b) OVER(col_b) AS encoded_col_b) OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2) AS input_X_y_sql""" ) @@ -218,7 +221,7 @@ def test_create_llm_remote_model_correct( == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` REMOTE WITH CONNECTION `my_project.us.my_connection` OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2) AS input_X_y_sql""" ) @@ -239,7 +242,7 @@ def test_create_remote_model_correct( == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` REMOTE WITH CONNECTION `my_project.us.my_connection` OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2)""" ) @@ -260,12 +263,12 @@ def test_create_remote_model_with_params_correct( sql == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_remote_model` INPUT( - column1 int64) + `column1` int64) OUTPUT( - result array) + `result` array) REMOTE WITH CONNECTION `my_project.us.my_connection` OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2)""" ) @@ -283,7 +286,7 @@ def test_create_imported_model_correct( sql == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_imported_model` OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2)""" ) @@ -303,11 +306,11 @@ def test_create_xgboost_imported_model_produces_correct_sql( sql == """CREATE OR REPLACE MODEL `test-proj`.`_anonXYZ`.`create_xgboost_imported_model` INPUT( - column1 int64) + `column1` int64) OUTPUT( - result array) + `result` array) OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2)""" ) @@ -320,9 +323,9 @@ def test_alter_model_correct_sql( ) assert ( sql - == """ALTER MODEL `my_project_id.my_dataset_id.my_model_id` + == """ALTER MODEL `my_project_id`.`my_dataset_id`.`my_model_id` SET OPTIONS( - option_key1="option_value1", + option_key1='option_value1', option_key2=2)""" ) @@ -334,7 +337,7 @@ def test_ml_predict_correct( sql = model_manipulation_sql_generator.ml_predict(source_sql=mock_df.sql) assert ( sql - == """SELECT * FROM ML.PREDICT(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.PREDICT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, (input_X_y_sql))""" ) @@ -348,7 +351,7 @@ def test_ml_llm_evaluate_correct( ) assert ( sql - == """SELECT * 
FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, (input_X_y_sql), STRUCT("CLASSIFICATION" AS task_type))""" ) @@ -360,7 +363,7 @@ def test_ml_evaluate_correct( sql = model_manipulation_sql_generator.ml_evaluate(source_sql=mock_df.sql) assert ( sql - == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, (input_X_y_sql))""" ) @@ -373,7 +376,7 @@ def test_ml_arima_evaluate_correct( ) assert ( sql - == """SELECT * FROM ML.ARIMA_EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.ARIMA_EVALUATE(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, STRUCT(True AS show_all_candidate_models))""" ) @@ -384,7 +387,7 @@ def test_ml_evaluate_no_source_correct( sql = model_manipulation_sql_generator.ml_evaluate() assert ( sql - == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + == """SELECT * FROM ML.EVALUATE(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`)""" ) @@ -394,7 +397,7 @@ def test_ml_centroids_correct( sql = model_manipulation_sql_generator.ml_centroids() assert ( sql - == """SELECT * FROM ML.CENTROIDS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + == """SELECT * FROM ML.CENTROIDS(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`)""" ) @@ -406,10 +409,10 @@ def test_ml_forecast_correct_sql( ) assert ( sql - == """SELECT * FROM ML.FORECAST(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.FORECAST(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, STRUCT( - 1 AS option_key1, - 2.2 AS option_key2))""" + 1 AS `option_key1`, + 2.2 AS `option_key2`))""" ) @@ -423,10 +426,10 @@ def test_ml_generate_text_correct( ) assert ( sql - == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, (input_X_y_sql), STRUCT( - 1 AS option_key1, - 2.2 AS option_key2))""" + 1 AS `option_key1`, + 2.2 AS `option_key2`))""" ) @@ -440,10 +443,10 @@ def test_ml_generate_embedding_correct( ) assert ( sql - == """SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.GENERATE_EMBEDDING(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, (input_X_y_sql), STRUCT( - 1 AS option_key1, - 2.2 AS option_key2))""" + 1 AS `option_key1`, + 2.2 AS `option_key2`))""" ) @@ -457,10 +460,10 @@ def test_ml_detect_anomalies_correct_sql( ) assert ( sql - == """SELECT * FROM ML.DETECT_ANOMALIES(MODEL `my_project_id.my_dataset_id.my_model_id`, + == """SELECT * FROM ML.DETECT_ANOMALIES(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`, STRUCT( - 1 AS option_key1, - 2.2 AS option_key2), (input_X_y_sql))""" + 1 AS `option_key1`, + 2.2 AS `option_key2`), (input_X_y_sql))""" ) @@ -470,7 +473,7 @@ def test_ml_principal_components_correct( sql = model_manipulation_sql_generator.ml_principal_components() assert ( sql - == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_project_id.my_dataset_id.my_model_id`)""" + == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`)""" ) @@ -480,5 +483,5 @@ def test_ml_principal_component_info_correct( sql = model_manipulation_sql_generator.ml_principal_component_info() assert ( sql - == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL 
`my_project_id.my_dataset_id.my_model_id`)""" + == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_project_id`.`my_dataset_id`.`my_model_id`)""" ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 970883257c..a6c11ed1b9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -45,6 +45,10 @@ def shape(self) -> tuple[int, int]: ... 'col2': [4, 5, 6]}) >>> df.shape (3, 2) + + Returns: + Tuple[int, int]: + Tuple of array dimensions. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -89,6 +93,10 @@ def values(self) -> np.ndarray: on another array. na_value (default None): The value to use for missing values. + + Returns: + numpy.ndarray: + The values of the DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -119,7 +127,7 @@ def T(self) -> DataFrame: [2 rows x 2 columns] Returns: - DataFrame: The transposed DataFrame. + bigframes.pandas.DataFrame: The transposed DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -170,7 +178,7 @@ def transpose(self) -> DataFrame: dtype: object Returns: - DataFrame: The transposed DataFrame. + bigframes.pandas.DataFrame: The transposed DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -216,7 +224,8 @@ def info( shows the counts, and False never shows the counts. Returns: - None: This method prints a summary of a DataFrame and returns None.""" + None: This method prints a summary of a DataFrame and returns None. + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def memory_usage(self, index: bool = True): @@ -236,7 +245,7 @@ def memory_usage(self, index: bool = True): the index is the first item in the output. Returns: - Series: A Series whose index is the original column names and whose values is the memory usage of each column in bytes. + bigframes.pandas.Series: A Series whose index is the original column names and whose values is the memory usage of each column in bytes. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -272,7 +281,7 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame: A selection of dtypes or strings to be excluded. Returns: - DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``. + bigframes.pandas.DataFrame: The subset of the frame including the dtypes in ``include`` and excluding the dtypes in ``exclude``. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -304,11 +313,14 @@ def from_dict( dtype (dtype, default None): Data type to force after DataFrame construction, otherwise infer. columns (list, default None): - Column labels to use when ``orient='index'``. Raises a ValueError - if used with ``orient='columns'`` or ``orient='tight'``. + Column labels to use when ``orient='index'``. + + Raises: + ValueError: + If used with ``orient='columns'`` or ``orient='tight'``. Returns: - DataFrame: DataFrame. + bigframes.pandas.DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -349,7 +361,7 @@ def from_records( Number of rows to read if data is an iterator. Returns: - DataFrame: DataFrame. + bigframes.pandas.DataFrame: DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -519,7 +531,7 @@ def to_parquet( If ``False``, they will not be written to the file. 
Returns: - bytes if no path argument is provided else None + None or bytes: bytes if no path argument is provided else None """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -662,6 +674,10 @@ def to_latex( it is assumed to be aliases for the column names. index (bool, default True): Write row names (index). + + Returns: + str or None: If buf is None, returns the result as a string. Otherwise returns + None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -952,7 +968,8 @@ def to_markdown( These parameters will be passed to `tabulate `_. Returns: - DataFrame: DataFrame in Markdown-friendly format. + str: + DataFrame in Markdown-friendly format. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -994,6 +1011,11 @@ def to_orc(self, path=None, **kwargs) -> bytes | None: we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. + + Returns: + bytes or None: + If buf is None, returns the result as bytes. Otherwise returns + None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1020,7 +1042,8 @@ def assign(self, **kwargs) -> DataFrame: are simply assigned to the column. Returns: - bigframes.dataframe.DataFrame: A new DataFrame with the new columns + bigframes.pandas.DataFrame: + A new DataFrame with the new columns in addition to all the existing columns. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1054,7 +1077,7 @@ def reindex( Axis to target. Can be either the axis name ('index', 'columns') or number (0, 1). Returns: - DataFrame: DataFrame with changed index. + bigframes.pandas.DataFrame: DataFrame with changed index. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1071,16 +1094,14 @@ def reindex_like(self, other): of this object. Returns: - Series or DataFrame: Same type as caller, but with changed indices on each axis. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Same type as caller, but with changed indices on each axis. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def insert(self, loc, column, value, allow_duplicates=False): """Insert column into DataFrame at specified location. - Raises a ValueError if `column` is already contained in the DataFrame, - unless `allow_duplicates` is set to True. - **Examples:** >>> import bigframes.pandas as bpd @@ -1117,6 +1138,11 @@ def insert(self, loc, column, value, allow_duplicates=False): Content of the inserted column. allow_duplicates (bool, default False): Allow duplicate column labels to be created. + + Raises: + ValueError: + If `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1233,7 +1259,7 @@ def drop( level: For MultiIndex, level from which the labels will be removed. Returns: - bigframes.dataframe.DataFrame: DataFrame without the removed column labels. + bigframes.pandas.DataFrame: DataFrame without the removed column labels. Raises: KeyError: If any of the labels is not found in the selected axis. @@ -1266,7 +1292,8 @@ def align( Align on index (0), columns (1), or both (None). Returns: - tuple of (DataFrame, type of other): Aligned objects. + Tuple[bigframes.pandas.DataFrame or bigframes.pandas.Series, type of other]: + Aligned objects. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1309,7 +1336,7 @@ def rename( Dict-like from old column labels to new column labels. Returns: - bigframes.dataframe.DataFrame: DataFrame with the renamed axis labels. + bigframes.pandas.DataFrame: DataFrame with the renamed axis labels. Raises: KeyError: If any of the labels is not found. @@ -1328,7 +1355,7 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Value to set the axis name attribute. Returns: - bigframes.dataframe.DataFrame: DataFrame with the new index name + bigframes.pandas.DataFrame: DataFrame with the new index name """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1392,7 +1419,7 @@ def set_index( Delete columns to be used as the new index. Returns: - DataFrame: Changed row labels. + bigframes.pandas.DataFrame: Changed row labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1410,7 +1437,7 @@ def reorder_levels( Where to reorder levels. Returns: - DataFrame: DataFrame of rearranged index. + bigframes.pandas.DataFrame: DataFrame of rearranged index. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1428,7 +1455,7 @@ def swaplevel(self, i, j, axis: str | int = 0) -> DataFrame: 'columns' for column-wise. Returns: - DataFrame: DataFrame with levels swapped in MultiIndex. + bigframes.pandas.DataFrame: DataFrame with levels swapped in MultiIndex. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1447,7 +1474,7 @@ def droplevel(self, level, axis: str | int = 0): * 0 or 'index': remove level(s) in column. * 1 or 'columns': remove level(s) in row. Returns: - DataFrame: DataFrame with requested index / column level(s) removed. + bigframes.pandas.DataFrame: DataFrame with requested index / column level(s) removed. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1553,7 +1580,7 @@ class name speed max the index to the default integer index. Returns: - bigframes.dataframe.DataFrame: DataFrame with the new index. + bigframes.pandas.DataFrame: DataFrame with the new index. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1580,7 +1607,7 @@ def drop_duplicates( - ``False`` : Drop all duplicates. Returns: - bigframes.dataframe.DataFrame: DataFrame with duplicates removed + bigframes.pandas.DataFrame: DataFrame with duplicates removed """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1602,7 +1629,7 @@ def duplicated(self, subset=None, keep="first"): - False : Mark all duplicates as ``True``. Returns: - bigframes.series.Series: Boolean series for each duplicated rows. + bigframes.pandas.Series: Boolean series for each duplicated rows. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1697,7 +1724,7 @@ def dropna( Returns: - bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it. + bigframes.pandas.DataFrame: DataFrame with NA entries dropped from it. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1745,7 +1772,7 @@ def isin(self, values): the column names, which must match. Returns: - DataFrame: DataFrame of booleans showing whether each element + bigframes.pandas.DataFrame: DataFrame of booleans showing whether each element in the DataFrame is contained in values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1769,7 +1796,7 @@ def keys(self): Index(['A', 'B'], dtype='object') Returns: - Index: Info axis. 
+ pandas.Index: Info axis. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1777,9 +1804,6 @@ def iterrows(self): """ Iterate over DataFrame rows as (index, Series) pairs. - Yields: - a tuple (index, data) where data contains row values as a Series - **Examples:** >>> import bigframes.pandas as bpd @@ -1795,6 +1819,10 @@ def iterrows(self): A 1 B 4 Name: 0, dtype: object + + Returns: + Iterable[Tuple]: + A tuple where data contains row values as a Series """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1821,7 +1849,7 @@ def itertuples(self, index: bool = True, name: str | None = "Pandas"): tuples. Returns: - iterator: + Iterable[Tuple]: An object to iterate over namedtuples for each row in the DataFrame with the first field possibly being the index and following fields being the column values. @@ -1976,7 +2004,7 @@ def sort_values( if `first`; `last` puts NaNs at the end. Returns: - DataFrame: DataFrame with sorted values. + bigframes.pandas.DataFrame: DataFrame with sorted values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1986,7 +2014,7 @@ def sort_index( """Sort object by labels (along an axis). Returns: - DataFrame: The original DataFrame sorted by the labels. + bigframes.pandas.DataFrame: The original DataFrame sorted by the labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2035,7 +2063,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - DataFrame: Result of the comparison. + bigframes.pandas.DataFrame: Result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2068,7 +2096,7 @@ def __eq__(self, other): Object to be compared to the DataFrame for equality. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2092,7 +2120,7 @@ def __invert__(self) -> DataFrame: [3 rows x 2 columns] Returns: - DataFrame: The result of inverting elements in the input. + bigframes.pandas.DataFrame: The result of inverting elements in the input. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2137,7 +2165,7 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Whether to compare by the index (0 or 'index') or columns (1 or 'columns'). Returns: - DataFrame: Result of the comparison. + bigframes.pandas.DataFrame: Result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2170,7 +2198,7 @@ def __ne__(self, other): Object to be compared to the DataFrame for inequality. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2220,7 +2248,7 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - DataFrame: DataFrame of bool. The result of the comparison. + bigframes.pandas.DataFrame: DataFrame of bool. The result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2253,7 +2281,7 @@ def __le__(self, other): Object to be compared to the DataFrame. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. 
""" raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2303,7 +2331,7 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - DataFrame: DataFrame of bool. The result of the comparison. + bigframes.pandas.DataFrame: DataFrame of bool. The result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2336,7 +2364,7 @@ def __lt__(self, other): Object to be compared to the DataFrame. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2386,7 +2414,7 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - DataFrame: DataFrame of bool. The result of the comparison. + bigframes.pandas.DataFrame: DataFrame of bool. The result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2419,7 +2447,7 @@ def __ge__(self, other): Object to be compared to the DataFrame. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2467,7 +2495,7 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). Returns: - DataFrame: DataFrame of bool: The result of the comparison. + bigframes.pandas.DataFrame: DataFrame of bool: The result of the comparison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2500,7 +2528,7 @@ def __gt__(self, other): Object to be compared to the DataFrame. Returns: - DataFrame: The result of comparing `other` to DataFrame. + bigframes.pandas.DataFrame: The result of comparing `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2549,7 +2577,7 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2619,7 +2647,7 @@ def __add__(self, other) -> DataFrame: Object to be added to the DataFrame. Returns: - DataFrame: The result of adding `other` to DataFrame. + bigframes.pandas.DataFrame: The result of adding `other` to DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2668,7 +2696,7 @@ def radd(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2717,7 +2745,7 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2760,7 +2788,7 @@ def __sub__(self, other): Object to subtract from the DataFrame. Returns: - DataFrame: The result of the subtraction. 
+ bigframes.pandas.DataFrame: The result of the subtraction. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2806,7 +2834,7 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2821,7 +2849,7 @@ def __rsub__(self, other): Object to subtract the DataFrame from. Returns: - DataFrame: The result of the subtraction. + bigframes.pandas.DataFrame: The result of the subtraction. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2870,7 +2898,7 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2913,7 +2941,7 @@ def __mul__(self, other): Object to multiply with the DataFrame. Returns: - DataFrame: The result of the multiplication. + bigframes.pandas.DataFrame: The result of the multiplication. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -2962,7 +2990,7 @@ def rmul(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3005,7 +3033,7 @@ def __rmul__(self, other): Object to multiply the DataFrame with. Returns: - DataFrame: The result of the multiplication. + bigframes.pandas.DataFrame: The result of the multiplication. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3054,7 +3082,7 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3097,7 +3125,7 @@ def __truediv__(self, other): Object to divide the DataFrame by. Returns: - DataFrame: The result of the division. + bigframes.pandas.DataFrame: The result of the division. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3143,7 +3171,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3158,7 +3186,7 @@ def __rtruediv__(self, other): Object to divide by the DataFrame. Returns: - DataFrame: The result of the division. + bigframes.pandas.DataFrame: The result of the division. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3207,7 +3235,7 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. 
Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3250,7 +3278,7 @@ def __floordiv__(self, other): Object to divide the DataFrame by. Returns: - DataFrame: The result of the integer divison. + bigframes.pandas.DataFrame: The result of the integer divison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3296,7 +3324,7 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3311,7 +3339,7 @@ def __rfloordiv__(self, other): Object to divide by the DataFrame. Returns: - DataFrame: The result of the integer divison. + bigframes.pandas.DataFrame: The result of the integer divison. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3360,7 +3388,7 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3403,7 +3431,7 @@ def __mod__(self, other): Object to modulo the DataFrame by. Returns: - DataFrame: The result of the modulo. + bigframes.pandas.DataFrame: The result of the modulo. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3449,7 +3477,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3464,7 +3492,7 @@ def __rmod__(self, other): Object to modulo by the DataFrame. Returns: - DataFrame: The result of the modulo. + bigframes.pandas.DataFrame: The result of the modulo. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3514,7 +3542,7 @@ def pow(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3558,7 +3586,7 @@ def __pow__(self, other): Object to exponentiate the DataFrame with. Returns: - DataFrame: The result of the exponentiation. + bigframes.pandas.DataFrame: The result of the exponentiation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3605,7 +3633,7 @@ def rpow(self, other, axis: str | int = "columns") -> DataFrame: (1 or 'columns'). For Series input, axis to match Series index on. Returns: - DataFrame: DataFrame result of the arithmetic operation. + bigframes.pandas.DataFrame: DataFrame result of the arithmetic operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3699,7 +3727,7 @@ def combine( overwritten with NaNs. Returns: - DataFrame: Combination of the provided DataFrames. 
+ bigframes.pandas.DataFrame: Combination of the provided DataFrames. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3734,7 +3762,8 @@ def combine_first(self, other) -> DataFrame: Provided DataFrame to use to fill null values. Returns: - DataFrame: The result of combining the provided DataFrame with the other object. + bigframes.pandas.DataFrame: + The result of combining the provided DataFrame with the other object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3784,7 +3813,7 @@ def explode( If True, the resulting index will be labeled 0, 1, …, n - 1. Returns: - bigframes.series.DataFrame: Exploded lists to rows of the subset columns; + bigframes.pandas.DataFrame: Exploded lists to rows of the subset columns; index will be duplicated for these rows. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3819,7 +3848,7 @@ def corr(self, method, min_periods, numeric_only) -> DataFrame: Include only float, int, boolean, decimal data. Returns: - DataFrame: Correlation matrix. + bigframes.pandas.DataFrame: Correlation matrix. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -3848,7 +3877,7 @@ def cov(self, *, numeric_only) -> DataFrame: Include only float, int, boolean, decimal data. Returns: - DataFrame: The covariance matrix of the series of the DataFrame. + bigframes.pandas.DataFrame: The covariance matrix of the series of the DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4080,7 +4109,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: values, without passing them to func. Returns: - bigframes.dataframe.DataFrame: Transformed DataFrame. + bigframes.pandas.DataFrame: Transformed DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4180,7 +4209,7 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: the order of the left keys. Returns: - bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. + bigframes.pandas.DataFrame: A dataframe containing columns from both the caller and `other`. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4325,7 +4354,7 @@ def merge( no suffix. At least one of the values must not be None. Returns: - bigframes.dataframe.DataFrame: A DataFrame of the two merged objects. + bigframes.pandas.DataFrame: A DataFrame of the two merged objects. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4438,7 +4467,8 @@ def apply(self, func, *, axis=0, args=(), **kwargs): `func`. Returns: - pandas.Series or bigframes.DataFrame: Result of applying ``func`` along the given axis of the DataFrame. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Result of applying ``func`` along the given axis of the DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4488,7 +4518,7 @@ def any(self, *, axis=0, bool_only: bool = False): Include only boolean columns. Returns: - bigframes.series.Series: Series indicating if any element is True per column. + bigframes.pandas.Series: Series indicating if any element is True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4535,7 +4565,7 @@ def all(self, axis=0, *, bool_only: bool = False): Include only boolean columns. Returns: - bigframes.series.Series: Series indicating if all elements are True per column. 
+ bigframes.pandas.Series: Series indicating if all elements are True per column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4580,7 +4610,7 @@ def prod(self, axis=0, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with the product of the values. + bigframes.pandas.Series: Series with the product of the values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4625,7 +4655,7 @@ def min(self, axis=0, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with the minimum of the values. + bigframes.pandas.Series: Series with the minimum of the values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4670,7 +4700,7 @@ def max(self, axis=0, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series after the maximum of values. + bigframes.pandas.Series: Series after the maximum of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4714,7 +4744,7 @@ def sum(self, axis=0, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with the sum of values. + bigframes.pandas.Series: Series with the sum of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4756,7 +4786,7 @@ def mean(self, axis=0, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with the mean of values. + bigframes.pandas.Series: Series with the mean of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4791,7 +4821,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = True): one. Returns: - bigframes.series.Series: Series with the median of values. + bigframes.pandas.Series: Series with the median of values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4825,7 +4855,7 @@ def quantile( Include only `float`, `int` or `boolean` data. Returns: - Series or DataFrame: + bigframes.pandas.DataFrame or bigframes.pandas.Series: If ``q`` is an array, a DataFrame will be returned where the index is ``q``, the columns are the columns of self, and the values are the quantiles. @@ -4875,7 +4905,7 @@ def var(self, axis=0, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with unbiased variance over requested axis. + bigframes.pandas.Series: Series with unbiased variance over requested axis. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4915,7 +4945,7 @@ def skew(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series: Series. + bigframes.pandas.Series: Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4956,7 +4986,7 @@ def kurt(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series: Series. + bigframes.pandas.Series: Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4996,7 +5026,7 @@ def std(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - bigframes.series.Series: Series with sample standard deviation. 
+ bigframes.pandas.Series: Series with sample standard deviation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5038,7 +5068,7 @@ def count(self, *, numeric_only: bool = False): Include only `float`, `int` or `boolean` data. Returns: - bigframes.series.Series: For each column/row the number of + bigframes.pandas.Series: For each column/row the number of non-NA/null entries. If `level` is specified returns a `DataFrame`. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5126,7 +5156,7 @@ def nlargest(self, n: int, columns, keep: str = "first"): selecting more than `n` items. Returns: - DataFrame: The first `n` rows ordered by the given columns in descending order. + bigframes.pandas.DataFrame: The first `n` rows ordered by the given columns in descending order. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5214,7 +5244,7 @@ def nsmallest(self, n: int, columns, keep: str = "first"): selecting more than `n` items. Returns: - DataFrame: The first `n` rows ordered by the given columns in ascending order. + bigframes.pandas.DataFrame: The first `n` rows ordered by the given columns in ascending order. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5244,7 +5274,7 @@ def idxmin(self): dtype: Int64 Returns: - Series: Indexes of minima along the columns. + bigframes.pandas.Series: Indexes of minima along the columns. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5274,7 +5304,7 @@ def idxmax(self): dtype: Int64 Returns: - Series: Indexes of maxima along the columns. + bigframes.pandas.Series: Indexes of maxima along the columns. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5359,7 +5389,7 @@ def melt(self, id_vars, value_vars, var_name, value_name): Name to use for the 'value' column. Returns: - DataFrame: Unpivoted DataFrame. + bigframes.pandas.DataFrame: Unpivoted DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5387,7 +5417,7 @@ def nunique(self): dtype: Int64 Returns: - bigframes.series.Series: Series with number of distinct elements. + bigframes.pandas.Series: Series with number of distinct elements. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5419,7 +5449,7 @@ def cummin(self) -> DataFrame: [3 rows x 2 columns] Returns: - bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame. + bigframes.pandas.DataFrame: Return cumulative minimum of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5451,7 +5481,7 @@ def cummax(self) -> DataFrame: [3 rows x 2 columns] Returns: - bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame. + bigframes.pandas.DataFrame: Return cumulative maximum of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5483,7 +5513,7 @@ def cumsum(self) -> DataFrame: [3 rows x 2 columns] Returns: - bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame. + bigframes.pandas.DataFrame: Return cumulative sum of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5515,7 +5545,7 @@ def cumprod(self) -> DataFrame: [3 rows x 2 columns] Returns: - bigframes.dataframe.DataFrame: Return cumulative product of DataFrame. + bigframes.pandas.DataFrame: Return cumulative product of DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5568,7 +5598,7 @@ def diff( values. 
Returns: - bigframes.dataframe.DataFrame: First differences of the Series. + bigframes.pandas.DataFrame: First differences of the Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5613,7 +5643,7 @@ def agg(self, func): function names, e.g. ``['sum', 'mean']``. Returns: - DataFrame or bigframes.series.Series: Aggregated results. + bigframes.pandas.DataFrame or bigframes.pandas.Series: Aggregated results. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5665,7 +5695,7 @@ def describe(self): [8 rows x 2 columns] Returns: - bigframes.dataframe.DataFrame: Summary statistics of the Series or Dataframe provided. + bigframes.pandas.DataFrame: Summary statistics of the Series or Dataframe provided. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5747,7 +5777,7 @@ def pivot(self, *, columns, index=None, values=None): have hierarchically indexed columns. Returns: - DataFrame: Returns reshaped DataFrame. + bigframes.pandas.DataFrame: Returns reshaped DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5829,7 +5859,7 @@ def pivot_table(self, values=None, index=None, columns=None, aggfunc="mean"): Aggregation function name to compute summary statistics (e.g., 'sum', 'mean'). Returns: - DataFrame: An Excel style pivot table. + bigframes.pandas.DataFrame: An Excel style pivot table. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5876,7 +5906,7 @@ def stack(self, level=-1): Level(s) to stack from the column axis onto the index axis. Returns: - DataFrame or Series: Stacked dataframe or series. + bigframes.pandas.DataFrame or bigframes.pandas.Series: Stacked dataframe or series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -5915,7 +5945,7 @@ def unstack(self, level=-1): Level(s) of index to unstack, can pass level name. Returns: - DataFrame or Series: DataFrame or Series. + bigframes.pandas.DataFrame or bigframes.pandas.Series: DataFrame or Series. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6101,7 +6131,7 @@ def value_counts( Don’t include counts of rows that contain NA values. Returns: - Series: Series containing counts of unique rows in the DataFrame + bigframes.pandas.Series: Series containing counts of unique rows in the DataFrame """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6181,7 +6211,7 @@ def eval(self, expr: str) -> DataFrame: The expression string to evaluate. Returns: - DataFrame + bigframes.pandas.DataFrame: DataFrame result after the operation. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6255,7 +6285,8 @@ def query(self, expr: str) -> DataFrame | None: to sum it with ``b``, your query should be ```a a` + b``. Returns: - DataFrame + None or bigframes.pandas.DataFrame: + DataFrame result after the query operation, otherwise None. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6303,7 +6334,7 @@ def interpolate(self, method: str = "linear"): 'nearest', 'zero', 'slinear': Emulates `scipy.interpolate.interp1d` Returns: - DataFrame: + bigframes.pandas.DataFrame: Returns the same object type as the caller, interpolated at some or all ``NaN`` values """ @@ -6372,7 +6403,7 @@ def fillna(self, value): be a list. 
Returns: - DataFrame: Object with missing values filled + bigframes.pandas.DataFrame: Object with missing values filled """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6460,7 +6491,8 @@ def replace( string. Returns: - Series/DataFrame: Object after replacement. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Object after replacement. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -6643,7 +6675,7 @@ def dot(self, other): The other object to compute the matrix product with. Returns: - Series or DataFrame: + bigframes.pandas.DataFrame or bigframes.pandas.Series: If `other` is a Series, return the matrix product between self and other as a Series. If other is a DataFrame, return the matrix product of self and other in a DataFrame. @@ -6837,7 +6869,7 @@ def __getitem__(self, key): column labels Returns: - Series or Value: Value(s) at the requested index(es). + bigframes.pandas.Series or Any: Value(s) at the requested index(es). """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 6734fb6aa9..0ac527e2ff 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -57,7 +57,8 @@ def __iter__(self) -> Iterator: Iterate over column axis for DataFrame, or values for Series. Returns: - iterator + Iterator: + Iterator of DataFrame or Series values. **Examples:** @@ -91,8 +92,8 @@ def abs(self): This function only applies to elements that are all numeric. Returns: - Series/DataFrame containing the absolute value of each element. - Returns a Series/DataFrame containing the absolute value of each element. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + A Series or DataFrame containing the absolute value of each element. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -181,7 +182,8 @@ def astype(self, dtype): ``pd.ArrowDtype(pa.timestamp("us", tz="UTC"))``. Returns: - same type as caller + bigframes.pandas.DataFrame: + A BigQuery DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -370,7 +372,8 @@ def get(self, key, default=None): key: object Returns: - same type as items contained in object + Any: + same type as items contained in object """ try: return self[key] @@ -391,7 +394,8 @@ def add_prefix(self, prefix: str, axis: int | str | None = None): to add prefix on. Returns: - New Series or DataFrame with updated labels. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + New Series or DataFrame with updated labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -409,7 +413,8 @@ def add_suffix(self, suffix: str, axis: int | str | None = None): to add suffix on Returns: - New Series or DataFrame with updated labels. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + New Series or DataFrame with updated labels. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -486,7 +491,8 @@ def head(self, n: int = 5): Default 5. Number of rows to select. Returns: - same type as caller: The first ``n`` rows of the caller object. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + The first ``n`` rows of the caller object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -507,7 +513,8 @@ def tail(self, n: int = 5): Number of rows to select. 
Returns: - The last `n` rows of the caller object. + bigframes.pandas.DataFrame: + The last `n` rows of the caller object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -584,8 +591,9 @@ def sample( - 'False': The sample will retain the original object's order. Returns: - A new object of same type as caller containing `n` items randomly - sampled from the caller object. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + A new object of same type as caller containing `n` items randomly + sampled from the caller object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -613,7 +621,8 @@ def dtypes(self): dtype: object Returns: - A *pandas* Series with the data type of each column. + pandas.Series: + A *pandas* Series with the data type of each column. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -686,7 +695,8 @@ def copy(self): [2 rows x 2 columns] Returns: - Object type matches caller. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Object type matches caller. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -749,7 +759,8 @@ def ffill(self, *, limit: Optional[int] = None): Returns: - Series/DataFrame or None: Object with missing values filled. + bigframes.pandas.DataFrame or bigframes.pandas.Series or None: + Object with missing values filled. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -766,7 +777,8 @@ def bfill(self, *, limit: Optional[int] = None): filled. Must be greater than 0 if not None. Returns: - Series/DataFrame or None: Object with missing values filled. + bigframes.pandas.DataFrame or bigframes.pandas.Series or None: + Object with missing values filled. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -844,8 +856,9 @@ def isna(self) -> NDFrame: dtype: boolean Returns: - Mask of bool values for each element that indicates whether an - element is an NA value. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Mask of bool values for each element that indicates whether an + element is an NA value. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -893,7 +906,8 @@ def filter( DataFrame. For `Series` this parameter is unused and defaults to `None`. Returns: - same type as input object + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Same type as input object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -934,7 +948,7 @@ def pct_change(self, periods: int = 1): Periods to shift for forming percent change. Returns: - Series or DataFrame: The same type as the calling object. + bigframes.pandas.DataFrame or bigframes.pandas.Series: The same type as the calling object. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -972,7 +986,8 @@ def rank( Whether or not the elements should be ranked in ascending order. Returns: - same type as caller: Return a Series or DataFrame with data ranks as values. + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Return a Series or DataFrame with data ranks as values. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -1126,7 +1141,8 @@ def pipe( A dictionary of keyword arguments passed into ``func``. 
Returns: - same type as caller + bigframes.pandas.DataFrame or bigframes.pandas.Series: + Object of same type as caller """ return common.pipe(self, func, *args, **kwargs) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index a6363e3285..845d623e2a 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3332,6 +3332,42 @@ def kurt(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def items(self): + """ + Iterate over (index, value) pairs of a Series. + + Iterates over the Series contents, returning a tuple with + the index and the value of a Series. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['bear', 'bear', 'marsupial'], + ... index=['panda', 'polar', 'koala']) + >>> s + panda bear + polar bear + koala marsupial + dtype: string + + >>> for index, value in s.items(): + ... print(f'--> index: {index}') + ... print(f'--> value: {value}') + ... + --> index: panda + --> value: bear + --> index: polar + --> value: bear + --> index: koala + --> value: marsupial + + Returns: + Iterator: Iterator of index, value for each content of the Series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def where(self, cond, other): """Replace values where the condition is False. diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 47a6013c4c..4bd4353413 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -162,6 +162,6 @@ def read_gbq( or partitioned tables without primary keys. Returns: - bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. + bigframes.pandas.DataFrame: A DataFrame representing results of the query or table. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 1f5563c962..aec911d2fe 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -41,6 +41,6 @@ def read_parquet( order is not preserved. Default, ``'auto'``. Returns: - bigframes.dataframe.DataFrame: A BigQuery DataFrames. + bigframes.pandas.DataFrame: A BigQuery DataFrames. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index 35b2a1982a..90154d8a00 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -145,9 +145,13 @@ def read_csv( **kwargs: keyword arguments for `pandas.read_csv` when not using the BigQuery engine. - Returns: - bigframes.dataframe.DataFrame: A BigQuery DataFrames. + bigframes.pandas.DataFrame: A BigQuery DataFrames. + + Raises: + bigframes.exceptions.DefaultIndexWarning: + Using the default index is discouraged, such as with clustered + or partitioned tables without primary keys. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -226,7 +230,12 @@ def read_json( keyword arguments for `pandas.read_json` when not using the BigQuery engine. 
Returns: - bigframes.dataframe.DataFrame: + bigframes.pandas.DataFrame: The DataFrame representing JSON contents. + + Raises: + bigframes.exceptions.DefaultIndexWarning: + Using the default index is discouraged, such as with clustered + or partitioned tables without primary keys. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index 88684309f9..32ff2666c0 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -64,7 +64,7 @@ def read_pickle( examples on storage options refer here. Returns: - bigframes.dataframe.DataFrame or bigframes.series.Series: same type as object + bigframes.pandas.DataFrame or bigframes.pandas.Series: same type as object stored in file. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index aaf43dbcfe..b3c83c8d96 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -69,7 +69,7 @@ def fit( """Compute k-means clustering. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). Training data. y (default None): Not used, present here for API consistency by convention. @@ -86,7 +86,7 @@ def predict( """Predict the closest cluster each sample in X belongs to. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). New data to predict. Returns: @@ -108,7 +108,7 @@ def score( for the outputs relevant to this model type. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples, n_features). New Data. y (default None) Not used, present here for API consistency by convention. diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index 4b0bd42706..e4e71c1ff9 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -37,7 +37,7 @@ def fit( """Fit all transformers using X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. @@ -54,7 +54,7 @@ def transform( """Transform X separately by each transformer, concatenate results. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Series or DataFrame to be transformed by subset. 
Returns: diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index ae6f0b0561..a0cccdcb4e 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -34,7 +34,7 @@ def fit(self, X, y=None): """Fit the model according to the given training data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. @@ -71,7 +71,7 @@ def predict(self, X): """Predict the closest cluster for each sample in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or a DataFrame to predict. Returns: diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 92794bb68e..1f6284c146 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -47,10 +47,10 @@ def fit(self, X, y): """Build a forest of trees from the training set (X, y). Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. @@ -73,7 +73,7 @@ def predict(self, X): mean predicted regression targets of the trees in the forest. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. diff --git a/third_party/bigframes_vendored/sklearn/impute/_base.py b/third_party/bigframes_vendored/sklearn/impute/_base.py index 3064e8a118..4e33d976a9 100644 --- a/third_party/bigframes_vendored/sklearn/impute/_base.py +++ b/third_party/bigframes_vendored/sklearn/impute/_base.py @@ -30,7 +30,7 @@ def fit(self, X, y=None): """Fit the imputer on X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -45,7 +45,7 @@ def transform(self, X): """Impute all missing values in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. 
Returns: diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 69f98697af..fa8f28a656 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -31,7 +31,7 @@ def predict(self, X): """Predict using the linear model. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Samples. Returns: @@ -45,7 +45,7 @@ def predict(self, X): """Predict class labels for samples in X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). The data matrix for which we want to get the predictions. @@ -101,10 +101,10 @@ def fit( """Fit linear model. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training data. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_targets). Target values. Will be cast to X's dtype if necessary. diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index c52a37018c..f3419ba8a9 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -71,12 +71,12 @@ def fit( """Fit the model according to the given training data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): Series or DataFrame of shape (n_samples, n_features). Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): DataFrame of shape (n_samples,). Target vector relative to X. diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 1ff83aa640..b051cb24b4 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -48,7 +48,7 @@ def fit(self, X, y=None): """Compute the mean and std to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -63,7 +63,7 @@ def transform(self, X): """Perform standardization by centering and scaling. 
Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: @@ -85,7 +85,7 @@ def fit(self, X, y=None): """Compute the maximum absolute value to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -100,7 +100,7 @@ def transform(self, X): """Scale the data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: @@ -121,7 +121,7 @@ def fit(self, X, y=None): """Compute the minimum and maximum to be used for later scaling. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -136,7 +136,7 @@ def transform(self, X): """Scale the data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 54c81af71d..5fa84d2d15 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -25,7 +25,7 @@ def fit(self, X, y=None): """Fit the estimator. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -40,7 +40,7 @@ def transform(self, X): """Discretize the data. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 7cdca9229a..5476a9fb3c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -65,7 +65,7 @@ def fit(self, X, y=None): """Fit OneHotEncoder to X. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series with training data. y (default None): @@ -80,7 +80,7 @@ def transform(self, X): """Transform X using one-hot encoding. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. 
Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 61a44db92f..74b3ca347a 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -33,7 +33,7 @@ def fit(self, y): """Fit label encoder. Args: - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series with training data. Returns: @@ -45,7 +45,7 @@ def transform(self, y): """Transform y using label encoding. Args: - y (bigframes.dataframe.DataFrame or bigframes.series.Series): + y (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py index 9ad43b7956..aeed4dce92 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_polynomial.py @@ -19,7 +19,7 @@ def fit(self, X, y=None): """Compute number of output features. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The Dataframe or Series with training data. y (default None): @@ -34,7 +34,7 @@ def transform(self, X): """Transform data to polynomial features. Args: - X (bigframes.dataframe.DataFrame or bigframes.series.Series): + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): The DataFrame or Series to be transformed. Returns: diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 75f66191ca..501aa2bd9d 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.22.0" +__version__ = "1.24.0"
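For readers of the docstring changes above, the following is a minimal, illustrative sketch (not part of the patch) of what the updated annotations describe: arithmetic and reduction results resolving to bigframes.pandas.DataFrame / bigframes.pandas.Series, the Series.items() iterator documented in this release, and ML estimators documented as accepting plain pandas inputs. It assumes bigframes 1.24.0 is installed and a Google Cloud project with BigQuery access is configured; the sample data and variable names are invented for illustration.

# Illustrative sketch only; not part of the patch above.
import pandas as pd

import bigframes.pandas as bpd
from bigframes.ml.cluster import KMeans

bpd.options.display.progress_bar = None

# Per the updated docstrings, DataFrame arithmetic such as `mul` returns a
# bigframes.pandas.DataFrame, and reductions such as `sum` return a
# bigframes.pandas.Series.
df = bpd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [10.0, 20.0, 30.0]})
doubled = df.mul(2)   # bigframes.pandas.DataFrame
totals = df.sum()     # bigframes.pandas.Series

# Series.items() iterates (index, value) pairs, as documented above.
for index, value in totals.items():
    print(index, value)

# The ML signature updates document that plain pandas objects are accepted
# alongside bigframes objects, e.g. for KMeans.fit/predict.
local_training_data = pd.DataFrame(
    {"x": [1.0, 1.2, 9.8, 10.1], "y": [0.9, 1.1, 10.2, 9.9]}
)
model = KMeans(n_clusters=2)
model.fit(local_training_data)
clusters = model.predict(local_training_data)  # bigframes DataFrame of labels
print(clusters.to_pandas())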