diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml
index 4c0027ff1c..3983728ba1 100644
--- a/.github/.OwlBot.lock.yaml
+++ b/.github/.OwlBot.lock.yaml
@@ -13,5 +13,5 @@
 # limitations under the License.
 docker:
   image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest
-  digest: sha256:04c35dc5f49f0f503a306397d6d043685f8d2bb822ab515818c4208d7fb2db3a
+  digest: sha256:5581906b957284864632cde4e9c51d1cc66b0094990b27e689132fe5cd036046
 # created: 2025-01-16T15:24:11.364245182Z
diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml
index c2f3673fcc..37874a6888 100644
--- a/.github/sync-repo-settings.yaml
+++ b/.github/sync-repo-settings.yaml
@@ -12,6 +12,7 @@ branchProtectionRules:
       - 'cla/google'
       - 'docs'
       - 'lint'
+      - 'mypy'
       - 'unit (3.9)'
       - 'unit (3.10)'
       - 'unit (3.11)'
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
new file mode 100644
index 0000000000..e6a79291d0
--- /dev/null
+++ b/.github/workflows/mypy.yml
@@ -0,0 +1,22 @@
+on:
+  pull_request:
+    branches:
+      - main
+name: mypy
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install nox
+        run: |
+          python -m pip install --upgrade setuptools pip wheel
+          python -m pip install nox
+      - name: Run mypy
+        run: |
+          nox -s mypy
diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile
deleted file mode 100644
index e5410e296b..0000000000
--- a/.kokoro/docker/docs/Dockerfile
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright 2024 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ubuntu:24.04
-
-ENV DEBIAN_FRONTEND noninteractive
-
-# Ensure local Python is preferred over distribution Python.
-ENV PATH /usr/local/bin:$PATH
-
-# Install dependencies.
-RUN apt-get update \
-  && apt-get install -y --no-install-recommends \
-    apt-transport-https \
-    build-essential \
-    ca-certificates \
-    curl \
-    dirmngr \
-    git \
-    gpg-agent \
-    graphviz \
-    libbz2-dev \
-    libdb5.3-dev \
-    libexpat1-dev \
-    libffi-dev \
-    liblzma-dev \
-    libreadline-dev \
-    libsnappy-dev \
-    libssl-dev \
-    libsqlite3-dev \
-    portaudio19-dev \
-    redis-server \
-    software-properties-common \
-    ssh \
-    sudo \
-    tcl \
-    tcl-dev \
-    tk \
-    tk-dev \
-    uuid-dev \
-    wget \
-    zlib1g-dev \
-  && add-apt-repository universe \
-  && apt-get update \
-  && apt-get -y install jq \
-  && apt-get clean autoclean \
-  && apt-get autoremove -y \
-  && rm -rf /var/lib/apt/lists/* \
-  && rm -f /var/cache/apt/archives/*.deb
-
-
-###################### Install python 3.10.14 for docs/docfx session
-
-# Download python 3.10.14
-RUN wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz
-
-# Extract files
-RUN tar -xvf Python-3.10.14.tgz
-
-# Install python 3.10.14
-RUN ./Python-3.10.14/configure --enable-optimizations
-RUN make altinstall
-
-ENV PATH /usr/local/bin/python3.10:$PATH
-
-###################### Install pip
-RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \
-  && python3.10 /tmp/get-pip.py \
-  && rm /tmp/get-pip.py
-
-# Test pip
-RUN python3.10 -m pip
-
-# Install build requirements
-COPY requirements.txt /requirements.txt
-RUN python3.10 -m pip install --require-hashes -r requirements.txt
-
-CMD ["python3.10"]
diff --git a/.kokoro/docker/docs/fetch_gpg_keys.sh b/.kokoro/docker/docs/fetch_gpg_keys.sh
deleted file mode 100644
index c4a92a33ea..0000000000
--- a/.kokoro/docker/docs/fetch_gpg_keys.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# A script to fetch gpg keys with retry.
-# Avoid jinja parsing the file.
-#
-
-function retry {
-  if [[ "${#}" -le 1 ]]; then
-    echo "Usage: ${0} retry_count commands.."
-    exit 1
-  fi
-  local retries=${1}
-  local command="${@:2}"
-  until [[ "${retries}" -le 0 ]]; do
-    $command && return 0
-    if [[ $?
-ne 0 ]]; then - echo "command failed, retrying" - ((retries--)) - fi - done - return 1 -} - -# 3.6.9, 3.7.5 (Ned Deily) -retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \ - 0D96DF4D4110E5C43FBFB17F2D347EA6AA65421D - -# 3.8.0 (Ɓukasz Langa) -retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \ - E3FF2839C048B25C084DEBE9B26995E310250568 - -# diff --git a/.kokoro/docker/docs/requirements.in b/.kokoro/docker/docs/requirements.in deleted file mode 100644 index 586bd07037..0000000000 --- a/.kokoro/docker/docs/requirements.in +++ /dev/null @@ -1,2 +0,0 @@ -nox -gcp-docuploader diff --git a/.kokoro/docker/docs/requirements.txt b/.kokoro/docker/docs/requirements.txt deleted file mode 100644 index a9360a25b7..0000000000 --- a/.kokoro/docker/docs/requirements.txt +++ /dev/null @@ -1,297 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile --allow-unsafe --generate-hashes requirements.in -# -argcomplete==3.5.3 \ - --hash=sha256:2ab2c4a215c59fd6caaff41a869480a23e8f6a5f910b266c1808037f4e375b61 \ - --hash=sha256:c12bf50eded8aebb298c7b7da7a5ff3ee24dffd9f5281867dfe1424b58c55392 - # via nox -cachetools==5.5.0 \ - --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \ - --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a - # via google-auth -certifi==2024.12.14 \ - --hash=sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56 \ - --hash=sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db - # via requests -charset-normalizer==3.4.1 \ - --hash=sha256:0167ddc8ab6508fe81860a57dd472b2ef4060e8d378f0cc555707126830f2537 \ - --hash=sha256:01732659ba9b5b873fc117534143e4feefecf3b2078b0a6a2e925271bb6f4cfa \ - --hash=sha256:01ad647cdd609225c5350561d084b42ddf732f4eeefe6e678765636791e78b9a \ - --hash=sha256:04432ad9479fa40ec0f387795ddad4437a2b50417c69fa275e212933519ff294 \ - --hash=sha256:0907f11d019260cdc3f94fbdb23ff9125f6b5d1039b76003b5b0ac9d6a6c9d5b \ - --hash=sha256:0924e81d3d5e70f8126529951dac65c1010cdf117bb75eb02dd12339b57749dd \ - --hash=sha256:09b26ae6b1abf0d27570633b2b078a2a20419c99d66fb2823173d73f188ce601 \ - --hash=sha256:09b5e6733cbd160dcc09589227187e242a30a49ca5cefa5a7edd3f9d19ed53fd \ - --hash=sha256:0af291f4fe114be0280cdd29d533696a77b5b49cfde5467176ecab32353395c4 \ - --hash=sha256:0f55e69f030f7163dffe9fd0752b32f070566451afe180f99dbeeb81f511ad8d \ - --hash=sha256:1a2bc9f351a75ef49d664206d51f8e5ede9da246602dc2d2726837620ea034b2 \ - --hash=sha256:22e14b5d70560b8dd51ec22863f370d1e595ac3d024cb8ad7d308b4cd95f8313 \ - --hash=sha256:234ac59ea147c59ee4da87a0c0f098e9c8d169f4dc2a159ef720f1a61bbe27cd \ - --hash=sha256:2369eea1ee4a7610a860d88f268eb39b95cb588acd7235e02fd5a5601773d4fa \ - --hash=sha256:237bdbe6159cff53b4f24f397d43c6336c6b0b42affbe857970cefbb620911c8 \ - --hash=sha256:28bf57629c75e810b6ae989f03c0828d64d6b26a5e205535585f96093e405ed1 \ - --hash=sha256:2967f74ad52c3b98de4c3b32e1a44e32975e008a9cd2a8cc8966d6a5218c5cb2 \ - --hash=sha256:2a75d49014d118e4198bcee5ee0a6f25856b29b12dbf7cd012791f8a6cc5c496 \ - --hash=sha256:2bdfe3ac2e1bbe5b59a1a63721eb3b95fc9b6817ae4a46debbb4e11f6232428d \ - --hash=sha256:2d074908e1aecee37a7635990b2c6d504cd4766c7bc9fc86d63f9c09af3fa11b \ - --hash=sha256:2fb9bd477fdea8684f78791a6de97a953c51831ee2981f8e4f583ff3b9d9687e \ - --hash=sha256:311f30128d7d333eebd7896965bfcfbd0065f1716ec92bd5638d7748eb6f936a \ - --hash=sha256:329ce159e82018d646c7ac45b01a430369d526569ec08516081727a20e9e4af4 \ - 
--hash=sha256:345b0426edd4e18138d6528aed636de7a9ed169b4aaf9d61a8c19e39d26838ca \ - --hash=sha256:363e2f92b0f0174b2f8238240a1a30142e3db7b957a5dd5689b0e75fb717cc78 \ - --hash=sha256:3a3bd0dcd373514dcec91c411ddb9632c0d7d92aed7093b8c3bbb6d69ca74408 \ - --hash=sha256:3bed14e9c89dcb10e8f3a29f9ccac4955aebe93c71ae803af79265c9ca5644c5 \ - --hash=sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3 \ - --hash=sha256:44ecbf16649486d4aebafeaa7ec4c9fed8b88101f4dd612dcaf65d5e815f837f \ - --hash=sha256:4532bff1b8421fd0a320463030c7520f56a79c9024a4e88f01c537316019005a \ - --hash=sha256:49402233c892a461407c512a19435d1ce275543138294f7ef013f0b63d5d3765 \ - --hash=sha256:4c0907b1928a36d5a998d72d64d8eaa7244989f7aaaf947500d3a800c83a3fd6 \ - --hash=sha256:4d86f7aff21ee58f26dcf5ae81a9addbd914115cdebcbb2217e4f0ed8982e146 \ - --hash=sha256:5777ee0881f9499ed0f71cc82cf873d9a0ca8af166dfa0af8ec4e675b7df48e6 \ - --hash=sha256:5df196eb874dae23dcfb968c83d4f8fdccb333330fe1fc278ac5ceeb101003a9 \ - --hash=sha256:619a609aa74ae43d90ed2e89bdd784765de0a25ca761b93e196d938b8fd1dbbd \ - --hash=sha256:6e27f48bcd0957c6d4cb9d6fa6b61d192d0b13d5ef563e5f2ae35feafc0d179c \ - --hash=sha256:6ff8a4a60c227ad87030d76e99cd1698345d4491638dfa6673027c48b3cd395f \ - --hash=sha256:73d94b58ec7fecbc7366247d3b0b10a21681004153238750bb67bd9012414545 \ - --hash=sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176 \ - --hash=sha256:75832c08354f595c760a804588b9357d34ec00ba1c940c15e31e96d902093770 \ - --hash=sha256:7709f51f5f7c853f0fb938bcd3bc59cdfdc5203635ffd18bf354f6967ea0f824 \ - --hash=sha256:78baa6d91634dfb69ec52a463534bc0df05dbd546209b79a3880a34487f4b84f \ - --hash=sha256:7974a0b5ecd505609e3b19742b60cee7aa2aa2fb3151bc917e6e2646d7667dcf \ - --hash=sha256:7a4f97a081603d2050bfaffdefa5b02a9ec823f8348a572e39032caa8404a487 \ - --hash=sha256:7b1bef6280950ee6c177b326508f86cad7ad4dff12454483b51d8b7d673a2c5d \ - --hash=sha256:7d053096f67cd1241601111b698f5cad775f97ab25d81567d3f59219b5f1adbd \ - --hash=sha256:804a4d582ba6e5b747c625bf1255e6b1507465494a40a2130978bda7b932c90b \ - --hash=sha256:807f52c1f798eef6cf26beb819eeb8819b1622ddfeef9d0977a8502d4db6d534 \ - --hash=sha256:80ed5e856eb7f30115aaf94e4a08114ccc8813e6ed1b5efa74f9f82e8509858f \ - --hash=sha256:8417cb1f36cc0bc7eaba8ccb0e04d55f0ee52df06df3ad55259b9a323555fc8b \ - --hash=sha256:8436c508b408b82d87dc5f62496973a1805cd46727c34440b0d29d8a2f50a6c9 \ - --hash=sha256:89149166622f4db9b4b6a449256291dc87a99ee53151c74cbd82a53c8c2f6ccd \ - --hash=sha256:8bfa33f4f2672964266e940dd22a195989ba31669bd84629f05fab3ef4e2d125 \ - --hash=sha256:8c60ca7339acd497a55b0ea5d506b2a2612afb2826560416f6894e8b5770d4a9 \ - --hash=sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de \ - --hash=sha256:955f8851919303c92343d2f66165294848d57e9bba6cf6e3625485a70a038d11 \ - --hash=sha256:97f68b8d6831127e4787ad15e6757232e14e12060bec17091b85eb1486b91d8d \ - --hash=sha256:9b23ca7ef998bc739bf6ffc077c2116917eabcc901f88da1b9856b210ef63f35 \ - --hash=sha256:9f0b8b1c6d84c8034a44893aba5e767bf9c7a211e313a9605d9c617d7083829f \ - --hash=sha256:aabfa34badd18f1da5ec1bc2715cadc8dca465868a4e73a0173466b688f29dda \ - --hash=sha256:ab36c8eb7e454e34e60eb55ca5d241a5d18b2c6244f6827a30e451c42410b5f7 \ - --hash=sha256:b010a7a4fd316c3c484d482922d13044979e78d1861f0e0650423144c616a46a \ - --hash=sha256:b1ac5992a838106edb89654e0aebfc24f5848ae2547d22c2c3f66454daa11971 \ - --hash=sha256:b7b2d86dd06bfc2ade3312a83a5c364c7ec2e3498f8734282c6c3d4b07b346b8 \ - 
--hash=sha256:b97e690a2118911e39b4042088092771b4ae3fc3aa86518f84b8cf6888dbdb41 \ - --hash=sha256:bc2722592d8998c870fa4e290c2eec2c1569b87fe58618e67d38b4665dfa680d \ - --hash=sha256:c0429126cf75e16c4f0ad00ee0eae4242dc652290f940152ca8c75c3a4b6ee8f \ - --hash=sha256:c30197aa96e8eed02200a83fba2657b4c3acd0f0aa4bdc9f6c1af8e8962e0757 \ - --hash=sha256:c4c3e6da02df6fa1410a7680bd3f63d4f710232d3139089536310d027950696a \ - --hash=sha256:c75cb2a3e389853835e84a2d8fb2b81a10645b503eca9bcb98df6b5a43eb8886 \ - --hash=sha256:c96836c97b1238e9c9e3fe90844c947d5afbf4f4c92762679acfe19927d81d77 \ - --hash=sha256:d7f50a1f8c450f3925cb367d011448c39239bb3eb4117c36a6d354794de4ce76 \ - --hash=sha256:d973f03c0cb71c5ed99037b870f2be986c3c05e63622c017ea9816881d2dd247 \ - --hash=sha256:d98b1668f06378c6dbefec3b92299716b931cd4e6061f3c875a71ced1780ab85 \ - --hash=sha256:d9c3cdf5390dcd29aa8056d13e8e99526cda0305acc038b96b30352aff5ff2bb \ - --hash=sha256:dad3e487649f498dd991eeb901125411559b22e8d7ab25d3aeb1af367df5efd7 \ - --hash=sha256:dccbe65bd2f7f7ec22c4ff99ed56faa1e9f785482b9bbd7c717e26fd723a1d1e \ - --hash=sha256:dd78cfcda14a1ef52584dbb008f7ac81c1328c0f58184bf9a84c49c605002da6 \ - --hash=sha256:e218488cd232553829be0664c2292d3af2eeeb94b32bea483cf79ac6a694e037 \ - --hash=sha256:e358e64305fe12299a08e08978f51fc21fac060dcfcddd95453eabe5b93ed0e1 \ - --hash=sha256:ea0d8d539afa5eb2728aa1932a988a9a7af94f18582ffae4bc10b3fbdad0626e \ - --hash=sha256:eab677309cdb30d047996b36d34caeda1dc91149e4fdca0b1a039b3f79d9a807 \ - --hash=sha256:eb8178fe3dba6450a3e024e95ac49ed3400e506fd4e9e5c32d30adda88cbd407 \ - --hash=sha256:ecddf25bee22fe4fe3737a399d0d177d72bc22be6913acfab364b40bce1ba83c \ - --hash=sha256:eea6ee1db730b3483adf394ea72f808b6e18cf3cb6454b4d86e04fa8c4327a12 \ - --hash=sha256:f08ff5e948271dc7e18a35641d2f11a4cd8dfd5634f55228b691e62b37125eb3 \ - --hash=sha256:f30bf9fd9be89ecb2360c7d94a711f00c09b976258846efe40db3d05828e8089 \ - --hash=sha256:fa88b843d6e211393a37219e6a1c1df99d35e8fd90446f1118f4216e307e48cd \ - --hash=sha256:fc54db6c8593ef7d4b2a331b58653356cf04f67c960f584edb7c3d8c97e8f39e \ - --hash=sha256:fd4ec41f914fa74ad1b8304bbc634b3de73d2a0889bd32076342a573e0779e00 \ - --hash=sha256:ffc9202a29ab3920fa812879e95a9e78b2465fd10be7fcbd042899695d75e616 - # via requests -click==8.1.8 \ - --hash=sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2 \ - --hash=sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a - # via gcp-docuploader -colorlog==6.9.0 \ - --hash=sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff \ - --hash=sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2 - # via - # gcp-docuploader - # nox -distlib==0.3.9 \ - --hash=sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 \ - --hash=sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403 - # via virtualenv -filelock==3.16.1 \ - --hash=sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 \ - --hash=sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435 - # via virtualenv -gcp-docuploader==0.6.5 \ - --hash=sha256:30221d4ac3e5a2b9c69aa52fdbef68cc3f27d0e6d0d90e220fc024584b8d2318 \ - --hash=sha256:b7458ef93f605b9d46a4bf3a8dc1755dad1f31d030c8679edf304e343b347eea - # via -r requirements.in -google-api-core==2.24.0 \ - --hash=sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9 \ - --hash=sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf - # via - # google-cloud-core - # google-cloud-storage 
-google-auth==2.37.0 \ - --hash=sha256:0054623abf1f9c83492c63d3f47e77f0a544caa3d40b2d98e099a611c2dd5d00 \ - --hash=sha256:42664f18290a6be591be5329a96fe30184be1a1badb7292a7f686a9659de9ca0 - # via - # google-api-core - # google-cloud-core - # google-cloud-storage -google-cloud-core==2.4.1 \ - --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ - --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 - # via google-cloud-storage -google-cloud-storage==2.19.0 \ - --hash=sha256:aeb971b5c29cf8ab98445082cbfe7b161a1f48ed275822f59ed3f1524ea54fba \ - --hash=sha256:cd05e9e7191ba6cb68934d8eb76054d9be4562aa89dbc4236feee4d7d51342b2 - # via gcp-docuploader -google-crc32c==1.6.0 \ - --hash=sha256:05e2d8c9a2f853ff116db9706b4a27350587f341eda835f46db3c0a8c8ce2f24 \ - --hash=sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d \ - --hash=sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e \ - --hash=sha256:35834855408429cecf495cac67ccbab802de269e948e27478b1e47dfb6465e57 \ - --hash=sha256:386122eeaaa76951a8196310432c5b0ef3b53590ef4c317ec7588ec554fec5d2 \ - --hash=sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8 \ - --hash=sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc \ - --hash=sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42 \ - --hash=sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f \ - --hash=sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa \ - --hash=sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b \ - --hash=sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc \ - --hash=sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760 \ - --hash=sha256:91ca8145b060679ec9176e6de4f89b07363d6805bd4760631ef254905503598d \ - --hash=sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7 \ - --hash=sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d \ - --hash=sha256:bb0966e1c50d0ef5bc743312cc730b533491d60585a9a08f897274e57c3f70e0 \ - --hash=sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3 \ - --hash=sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3 \ - --hash=sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00 \ - --hash=sha256:d2952396dc604544ea7476b33fe87faedc24d666fb0c2d5ac971a2b9576ab871 \ - --hash=sha256:d8797406499f28b5ef791f339594b0b5fdedf54e203b5066675c406ba69d705c \ - --hash=sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9 \ - --hash=sha256:e2806553238cd076f0a55bddab37a532b53580e699ed8e5606d0de1f856b5205 \ - --hash=sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc \ - --hash=sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d \ - --hash=sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 \ - --hash=sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa \ - --hash=sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0 - # via google-cloud-storage -googleapis-common-protos==1.66.0 \ - --hash=sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c \ - --hash=sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed - # via google-api-core -idna==3.10 \ - 
--hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ - --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 - # via requests -nox==2024.10.9 \ - --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \ - --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95 - # via -r requirements.in -packaging==24.2 \ - --hash=sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759 \ - --hash=sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f - # via nox -platformdirs==4.3.6 \ - --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \ - --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb - # via virtualenv -proto-plus==1.25.0 \ - --hash=sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961 \ - --hash=sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91 - # via google-api-core -protobuf==5.29.3 \ - --hash=sha256:0a18ed4a24198528f2333802eb075e59dea9d679ab7a6c5efb017a59004d849f \ - --hash=sha256:0eb32bfa5219fc8d4111803e9a690658aa2e6366384fd0851064b963b6d1f2a7 \ - --hash=sha256:3ea51771449e1035f26069c4c7fd51fba990d07bc55ba80701c78f886bf9c888 \ - --hash=sha256:5da0f41edaf117bde316404bad1a486cb4ededf8e4a54891296f648e8e076620 \ - --hash=sha256:6ce8cc3389a20693bfde6c6562e03474c40851b44975c9b2bf6df7d8c4f864da \ - --hash=sha256:84a57163a0ccef3f96e4b6a20516cedcf5bb3a95a657131c5c3ac62200d23252 \ - --hash=sha256:a4fa6f80816a9a0678429e84973f2f98cbc218cca434abe8db2ad0bffc98503a \ - --hash=sha256:a8434404bbf139aa9e1300dbf989667a83d42ddda9153d8ab76e0d5dcaca484e \ - --hash=sha256:b89c115d877892a512f79a8114564fb435943b59067615894c3b13cd3e1fa107 \ - --hash=sha256:c027e08a08be10b67c06bf2370b99c811c466398c357e615ca88c91c07f0910f \ - --hash=sha256:daaf63f70f25e8689c072cfad4334ca0ac1d1e05a92fc15c54eb9cf23c3efd84 - # via - # gcp-docuploader - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.1 \ - --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \ - --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 \ - --hash=sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd \ - --hash=sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c - # via google-auth -requests==2.32.3 \ - --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ - --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 - # via - # google-api-core - # google-cloud-storage -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth -six==1.17.0 \ - --hash=sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274 \ - --hash=sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81 - # via gcp-docuploader -tomli==2.2.1 \ - --hash=sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6 \ - --hash=sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd \ - --hash=sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c \ - --hash=sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b \ - --hash=sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8 \ - 
--hash=sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6 \ - --hash=sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77 \ - --hash=sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff \ - --hash=sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea \ - --hash=sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192 \ - --hash=sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249 \ - --hash=sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee \ - --hash=sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4 \ - --hash=sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98 \ - --hash=sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8 \ - --hash=sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4 \ - --hash=sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281 \ - --hash=sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744 \ - --hash=sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69 \ - --hash=sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13 \ - --hash=sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140 \ - --hash=sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e \ - --hash=sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e \ - --hash=sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc \ - --hash=sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff \ - --hash=sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec \ - --hash=sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2 \ - --hash=sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222 \ - --hash=sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106 \ - --hash=sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272 \ - --hash=sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a \ - --hash=sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7 - # via nox -urllib3==2.3.0 \ - --hash=sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df \ - --hash=sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d - # via requests -virtualenv==20.28.1 \ - --hash=sha256:412773c85d4dab0409b83ec36f7a6499e72eaf08c80e81e9576bca61831c71cb \ - --hash=sha256:5d34ab240fdb5d21549b76f9e8ff3af28252f5499fb6d6f031adac4e5a8c5329 - # via nox diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg deleted file mode 100644 index 5f7559f9da..0000000000 --- a/.kokoro/docs/common.cfg +++ /dev/null @@ -1,67 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" - -# Configure the docker image for kokoro-trampoline. 
-env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-lib-docs" -} -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/publish-docs.sh" -} - -env_vars: { - key: "STAGING_BUCKET" - value: "docs-staging" -} - -env_vars: { - key: "V2_STAGING_BUCKET" - # Push non-cloud library docs to `docs-staging-v2-dev` instead of the - # Cloud RAD bucket `docs-staging-v2` - value: "docs-staging-v2" -} - -# It will upload the docker image after successful builds. -env_vars: { - key: "TRAMPOLINE_IMAGE_UPLOAD" - value: "true" -} - -# It will always build the docker image. -env_vars: { - key: "TRAMPOLINE_DOCKERFILE" - value: ".kokoro/docker/docs/Dockerfile" -} - -# Fetch the token needed for reporting release status to GitHub -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "yoshi-automation-github-key" - } - } -} - -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "docuploader_service_account" - } - } -} diff --git a/.kokoro/docs/docs-presubmit-gerrit.cfg b/.kokoro/docs/docs-presubmit-gerrit.cfg deleted file mode 100644 index 1d0dc4b499..0000000000 --- a/.kokoro/docs/docs-presubmit-gerrit.cfg +++ /dev/null @@ -1,23 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "V2_STAGING_BUCKET" - value: "gcloud-python-test" -} - -# We only upload the image in the main `docs` build. -env_vars: { - key: "TRAMPOLINE_IMAGE_UPLOAD" - value: "false" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: ".kokoro/build.sh" -} - -# Only run this nox session. -env_vars: { - key: "NOX_SESSION" - value: "docfx" -} diff --git a/.kokoro/docs/docs-presubmit.cfg b/.kokoro/docs/docs-presubmit.cfg deleted file mode 100644 index 805cfd162b..0000000000 --- a/.kokoro/docs/docs-presubmit.cfg +++ /dev/null @@ -1,28 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -env_vars: { - key: "STAGING_BUCKET" - value: "gcloud-python-test" -} - -env_vars: { - key: "V2_STAGING_BUCKET" - value: "gcloud-python-test" -} - -# We only upload the image in the main `docs` build. -env_vars: { - key: "TRAMPOLINE_IMAGE_UPLOAD" - value: "false" -} - -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/build.sh" -} - -# Only run this nox session. -env_vars: { - key: "NOX_SESSION" - value: "docs docfx" -} diff --git a/.kokoro/docs/docs.cfg b/.kokoro/docs/docs.cfg deleted file mode 100644 index 8f43917d92..0000000000 --- a/.kokoro/docs/docs.cfg +++ /dev/null @@ -1 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh deleted file mode 100755 index 2d5ba47549..0000000000 --- a/.kokoro/publish-docs.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -eo pipefail - -# Disable buffering, so that the logs stream through. -export PYTHONUNBUFFERED=1 - -export PATH="${HOME}/.local/bin:${PATH}" - -# build docs -nox -s docs - -# create metadata -python3.10 -m docuploader create-metadata \ - --name=$(jq --raw-output '.name // empty' .repo-metadata.json) \ - --version=$(python3.10 setup.py --version) \ - --language=$(jq --raw-output '.language // empty' .repo-metadata.json) \ - --distribution-name=$(python3.10 setup.py --name) \ - --product-page=$(jq --raw-output '.product_documentation // empty' .repo-metadata.json) \ - --github-repository=$(jq --raw-output '.repo // empty' .repo-metadata.json) \ - --issue-tracker=$(jq --raw-output '.issue_tracker // empty' .repo-metadata.json) - -cat docs.metadata - -# upload docs -python3.10 -m docuploader upload docs/_build/html --metadata-file docs.metadata --staging-bucket "${STAGING_BUCKET}" - - -# docfx yaml files -nox -s docfx - -# create metadata. -python3.10 -m docuploader create-metadata \ - --name=$(jq --raw-output '.name // empty' .repo-metadata.json) \ - --version=$(python3.10 setup.py --version) \ - --language=$(jq --raw-output '.language // empty' .repo-metadata.json) \ - --distribution-name=$(python3.10 setup.py --name) \ - --product-page=$(jq --raw-output '.product_documentation // empty' .repo-metadata.json) \ - --github-repository=$(jq --raw-output '.repo // empty' .repo-metadata.json) \ - --issue-tracker=$(jq --raw-output '.issue_tracker // empty' .repo-metadata.json) - -cat docs.metadata - -# Replace toc.yml template file -mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml - -# upload docs -python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}" diff --git a/.kokoro/release.sh b/.kokoro/release.sh deleted file mode 100755 index a2eae5fda1..0000000000 --- a/.kokoro/release.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -eo pipefail - -# Start the releasetool reporter -python3 -m pip install --require-hashes -r github/python-bigquery-dataframes/.kokoro/requirements.txt -python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source /tmp/publisher-script - -# Disable buffering, so that the logs stream through. -export PYTHONUNBUFFERED=1 - -# Move into the package, build the distribution and upload. 
-TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-3") -cd github/python-bigquery-dataframes -python3 setup.py sdist bdist_wheel -twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg deleted file mode 100644 index 146dd8f451..0000000000 --- a/.kokoro/release/common.cfg +++ /dev/null @@ -1,43 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto - -# Build logs will be here -action { - define_artifacts { - regex: "**/*sponge_log.xml" - } -} - -# Download trampoline resources. -gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" - -# Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline.sh" - -# Configure the docker image for kokoro-trampoline. -env_vars: { - key: "TRAMPOLINE_IMAGE" - value: "gcr.io/cloud-devrel-kokoro-resources/python-multi" -} -env_vars: { - key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/release.sh" -} - -# Fetch PyPI password -before_action { - fetch_keystore { - keystore_resource { - keystore_config_id: 73713 - keyname: "google-cloud-pypi-token-keystore-3" - } - } -} - -# Store the packages we uploaded to PyPI. That way, we have a record of exactly -# what we published, which we can use to generate SBOMs and attestations. -action { - define_artifacts { - regex: "github/python-bigquery-dataframes/**/*.tar.gz" - strip_prefix: "github/python-bigquery-dataframes" - } -} diff --git a/.kokoro/release/release.cfg b/.kokoro/release/release.cfg deleted file mode 100644 index 8f43917d92..0000000000 --- a/.kokoro/release/release.cfg +++ /dev/null @@ -1 +0,0 @@ -# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file diff --git a/.kokoro/requirements.in b/.kokoro/requirements.in deleted file mode 100644 index fff4d9ce0d..0000000000 --- a/.kokoro/requirements.in +++ /dev/null @@ -1,11 +0,0 @@ -gcp-docuploader -gcp-releasetool>=2 # required for compatibility with cryptography>=42.x -importlib-metadata -typing-extensions -twine -wheel -setuptools -nox>=2022.11.21 # required to remove dependency on py -charset-normalizer<3 -click<8.1.0 -cryptography>=42.0.5 diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt deleted file mode 100644 index 006d8ef931..0000000000 --- a/.kokoro/requirements.txt +++ /dev/null @@ -1,509 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --allow-unsafe --generate-hashes requirements.in -# -argcomplete==3.5.1 \ - --hash=sha256:1a1d148bdaa3e3b93454900163403df41448a248af01b6e849edc5ac08e6c363 \ - --hash=sha256:eb1ee355aa2557bd3d0145de7b06b2a45b0ce461e1e7813f5d066039ab4177b4 - # via nox -attrs==24.2.0 \ - --hash=sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346 \ - --hash=sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2 - # via gcp-releasetool -backports-tarfile==1.2.0 \ - --hash=sha256:77e284d754527b01fb1e6fa8a1afe577858ebe4e9dad8919e34c862cb399bc34 \ - --hash=sha256:d75e02c268746e1b8144c278978b6e98e85de6ad16f8e4b0844a154557eca991 - # via jaraco-context -cachetools==5.5.0 \ - --hash=sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292 \ - --hash=sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a - # via google-auth -certifi==2024.8.30 \ - --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ - 
--hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 - # via requests -cffi==1.17.1 \ - --hash=sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8 \ - --hash=sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2 \ - --hash=sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1 \ - --hash=sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15 \ - --hash=sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36 \ - --hash=sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824 \ - --hash=sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8 \ - --hash=sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36 \ - --hash=sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17 \ - --hash=sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf \ - --hash=sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc \ - --hash=sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3 \ - --hash=sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed \ - --hash=sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702 \ - --hash=sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1 \ - --hash=sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8 \ - --hash=sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903 \ - --hash=sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6 \ - --hash=sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d \ - --hash=sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b \ - --hash=sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e \ - --hash=sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be \ - --hash=sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c \ - --hash=sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683 \ - --hash=sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9 \ - --hash=sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c \ - --hash=sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8 \ - --hash=sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1 \ - --hash=sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4 \ - --hash=sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655 \ - --hash=sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67 \ - --hash=sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595 \ - --hash=sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0 \ - --hash=sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65 \ - --hash=sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41 \ - --hash=sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6 \ - --hash=sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401 \ - --hash=sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6 \ - --hash=sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3 \ - --hash=sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16 \ - --hash=sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93 \ - 
--hash=sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e \ - --hash=sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4 \ - --hash=sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964 \ - --hash=sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c \ - --hash=sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576 \ - --hash=sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0 \ - --hash=sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3 \ - --hash=sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662 \ - --hash=sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3 \ - --hash=sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff \ - --hash=sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5 \ - --hash=sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd \ - --hash=sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f \ - --hash=sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5 \ - --hash=sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14 \ - --hash=sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d \ - --hash=sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9 \ - --hash=sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7 \ - --hash=sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382 \ - --hash=sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a \ - --hash=sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e \ - --hash=sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a \ - --hash=sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4 \ - --hash=sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99 \ - --hash=sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87 \ - --hash=sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b - # via cryptography -charset-normalizer==2.1.1 \ - --hash=sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845 \ - --hash=sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f - # via - # -r requirements.in - # requests -click==8.0.4 \ - --hash=sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1 \ - --hash=sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb - # via - # -r requirements.in - # gcp-docuploader - # gcp-releasetool -colorlog==6.8.2 \ - --hash=sha256:3e3e079a41feb5a1b64f978b5ea4f46040a94f11f0e8bbb8261e3dbbeca64d44 \ - --hash=sha256:4dcbb62368e2800cb3c5abd348da7e53f6c362dda502ec27c560b2e58a66bd33 - # via - # gcp-docuploader - # nox -cryptography==43.0.1 \ - --hash=sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494 \ - --hash=sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806 \ - --hash=sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d \ - --hash=sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062 \ - --hash=sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2 \ - --hash=sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4 \ - --hash=sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1 \ - 
--hash=sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85 \ - --hash=sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84 \ - --hash=sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042 \ - --hash=sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d \ - --hash=sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962 \ - --hash=sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2 \ - --hash=sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa \ - --hash=sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d \ - --hash=sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365 \ - --hash=sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96 \ - --hash=sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47 \ - --hash=sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d \ - --hash=sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d \ - --hash=sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c \ - --hash=sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb \ - --hash=sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277 \ - --hash=sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172 \ - --hash=sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034 \ - --hash=sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a \ - --hash=sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289 - # via - # -r requirements.in - # gcp-releasetool - # secretstorage -distlib==0.3.9 \ - --hash=sha256:47f8c22fd27c27e25a65601af709b38e4f0a45ea4fc2e710f65755fa8caaaf87 \ - --hash=sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403 - # via virtualenv -docutils==0.21.2 \ - --hash=sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f \ - --hash=sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2 - # via readme-renderer -filelock==3.16.1 \ - --hash=sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0 \ - --hash=sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435 - # via virtualenv -gcp-docuploader==0.6.5 \ - --hash=sha256:30221d4ac3e5a2b9c69aa52fdbef68cc3f27d0e6d0d90e220fc024584b8d2318 \ - --hash=sha256:b7458ef93f605b9d46a4bf3a8dc1755dad1f31d030c8679edf304e343b347eea - # via -r requirements.in -gcp-releasetool==2.1.1 \ - --hash=sha256:25639269f4eae510094f9dbed9894977e1966933211eb155a451deebc3fc0b30 \ - --hash=sha256:845f4ded3d9bfe8cc7fdaad789e83f4ea014affa77785259a7ddac4b243e099e - # via -r requirements.in -google-api-core==2.21.0 \ - --hash=sha256:4a152fd11a9f774ea606388d423b68aa7e6d6a0ffe4c8266f74979613ec09f81 \ - --hash=sha256:6869eacb2a37720380ba5898312af79a4d30b8bca1548fb4093e0697dc4bdf5d - # via - # google-cloud-core - # google-cloud-storage -google-auth==2.35.0 \ - --hash=sha256:25df55f327ef021de8be50bad0dfd4a916ad0de96da86cd05661c9297723ad3f \ - --hash=sha256:f4c64ed4e01e8e8b646ef34c018f8bf3338df0c8e37d8b3bba40e7f574a3278a - # via - # gcp-releasetool - # google-api-core - # google-cloud-core - # google-cloud-storage -google-cloud-core==2.4.1 \ - --hash=sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073 \ - --hash=sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61 - # via google-cloud-storage -google-cloud-storage==2.18.2 \ - 
--hash=sha256:97a4d45c368b7d401ed48c4fdfe86e1e1cb96401c9e199e419d289e2c0370166 \ - --hash=sha256:aaf7acd70cdad9f274d29332673fcab98708d0e1f4dceb5a5356aaef06af4d99 - # via gcp-docuploader -google-crc32c==1.6.0 \ - --hash=sha256:05e2d8c9a2f853ff116db9706b4a27350587f341eda835f46db3c0a8c8ce2f24 \ - --hash=sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d \ - --hash=sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e \ - --hash=sha256:35834855408429cecf495cac67ccbab802de269e948e27478b1e47dfb6465e57 \ - --hash=sha256:386122eeaaa76951a8196310432c5b0ef3b53590ef4c317ec7588ec554fec5d2 \ - --hash=sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8 \ - --hash=sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc \ - --hash=sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42 \ - --hash=sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f \ - --hash=sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa \ - --hash=sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b \ - --hash=sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc \ - --hash=sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760 \ - --hash=sha256:91ca8145b060679ec9176e6de4f89b07363d6805bd4760631ef254905503598d \ - --hash=sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7 \ - --hash=sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d \ - --hash=sha256:bb0966e1c50d0ef5bc743312cc730b533491d60585a9a08f897274e57c3f70e0 \ - --hash=sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3 \ - --hash=sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3 \ - --hash=sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00 \ - --hash=sha256:d2952396dc604544ea7476b33fe87faedc24d666fb0c2d5ac971a2b9576ab871 \ - --hash=sha256:d8797406499f28b5ef791f339594b0b5fdedf54e203b5066675c406ba69d705c \ - --hash=sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9 \ - --hash=sha256:e2806553238cd076f0a55bddab37a532b53580e699ed8e5606d0de1f856b5205 \ - --hash=sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc \ - --hash=sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d \ - --hash=sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.2 \ - --hash=sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa \ - --hash=sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0 - # via google-cloud-storage -googleapis-common-protos==1.65.0 \ - --hash=sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63 \ - --hash=sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0 - # via google-api-core -idna==3.10 \ - --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ - --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 - # via requests -importlib-metadata==8.5.0 \ - --hash=sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b \ - --hash=sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7 - # via - # -r requirements.in - # keyring - # twine -jaraco-classes==3.4.0 \ - --hash=sha256:47a024b51d0239c0dd8c8540c6c7f484be3b8fcf0b2d85c13825780d3b3f3acd \ - 
--hash=sha256:f662826b6bed8cace05e7ff873ce0f9283b5c924470fe664fff1c2f00f581790 - # via keyring -jaraco-context==6.0.1 \ - --hash=sha256:9bae4ea555cf0b14938dc0aee7c9f32ed303aa20a3b73e7dc80111628792d1b3 \ - --hash=sha256:f797fc481b490edb305122c9181830a3a5b76d84ef6d1aef2fb9b47ab956f9e4 - # via keyring -jaraco-functools==4.1.0 \ - --hash=sha256:70f7e0e2ae076498e212562325e805204fc092d7b4c17e0e86c959e249701a9d \ - --hash=sha256:ad159f13428bc4acbf5541ad6dec511f91573b90fba04df61dafa2a1231cf649 - # via keyring -jeepney==0.8.0 \ - --hash=sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806 \ - --hash=sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755 - # via - # keyring - # secretstorage -jinja2==3.1.4 \ - --hash=sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369 \ - --hash=sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d - # via gcp-releasetool -keyring==25.4.1 \ - --hash=sha256:5426f817cf7f6f007ba5ec722b1bcad95a75b27d780343772ad76b17cb47b0bf \ - --hash=sha256:b07ebc55f3e8ed86ac81dd31ef14e81ace9dd9c3d4b5d77a6e9a2016d0d71a1b - # via - # gcp-releasetool - # twine -markdown-it-py==3.0.0 \ - --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ - --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb - # via rich -markupsafe==3.0.1 \ - --hash=sha256:0778de17cff1acaeccc3ff30cd99a3fd5c50fc58ad3d6c0e0c4c58092b859396 \ - --hash=sha256:0f84af7e813784feb4d5e4ff7db633aba6c8ca64a833f61d8e4eade234ef0c38 \ - --hash=sha256:17b2aea42a7280db02ac644db1d634ad47dcc96faf38ab304fe26ba2680d359a \ - --hash=sha256:242d6860f1fd9191aef5fae22b51c5c19767f93fb9ead4d21924e0bcb17619d8 \ - --hash=sha256:244dbe463d5fb6d7ce161301a03a6fe744dac9072328ba9fc82289238582697b \ - --hash=sha256:26627785a54a947f6d7336ce5963569b5d75614619e75193bdb4e06e21d447ad \ - --hash=sha256:2a4b34a8d14649315c4bc26bbfa352663eb51d146e35eef231dd739d54a5430a \ - --hash=sha256:2ae99f31f47d849758a687102afdd05bd3d3ff7dbab0a8f1587981b58a76152a \ - --hash=sha256:312387403cd40699ab91d50735ea7a507b788091c416dd007eac54434aee51da \ - --hash=sha256:3341c043c37d78cc5ae6e3e305e988532b072329639007fd408a476642a89fd6 \ - --hash=sha256:33d1c36b90e570ba7785dacd1faaf091203d9942bc036118fab8110a401eb1a8 \ - --hash=sha256:3e683ee4f5d0fa2dde4db77ed8dd8a876686e3fc417655c2ece9a90576905344 \ - --hash=sha256:3ffb4a8e7d46ed96ae48805746755fadd0909fea2306f93d5d8233ba23dda12a \ - --hash=sha256:40621d60d0e58aa573b68ac5e2d6b20d44392878e0bfc159012a5787c4e35bc8 \ - --hash=sha256:40f1e10d51c92859765522cbd79c5c8989f40f0419614bcdc5015e7b6bf97fc5 \ - --hash=sha256:45d42d132cff577c92bfba536aefcfea7e26efb975bd455db4e6602f5c9f45e7 \ - --hash=sha256:48488d999ed50ba8d38c581d67e496f955821dc183883550a6fbc7f1aefdc170 \ - --hash=sha256:4935dd7883f1d50e2ffecca0aa33dc1946a94c8f3fdafb8df5c330e48f71b132 \ - --hash=sha256:4c2d64fdba74ad16138300815cfdc6ab2f4647e23ced81f59e940d7d4a1469d9 \ - --hash=sha256:4c8817557d0de9349109acb38b9dd570b03cc5014e8aabf1cbddc6e81005becd \ - --hash=sha256:4ffaaac913c3f7345579db4f33b0020db693f302ca5137f106060316761beea9 \ - --hash=sha256:5a4cb365cb49b750bdb60b846b0c0bc49ed62e59a76635095a179d440540c346 \ - --hash=sha256:62fada2c942702ef8952754abfc1a9f7658a4d5460fabe95ac7ec2cbe0d02abc \ - --hash=sha256:67c519635a4f64e495c50e3107d9b4075aec33634272b5db1cde839e07367589 \ - --hash=sha256:6a54c43d3ec4cf2a39f4387ad044221c66a376e58c0d0e971d47c475ba79c6b5 \ - --hash=sha256:7044312a928a66a4c2a22644147bc61a199c1709712069a344a3fb5cfcf16915 \ - 
--hash=sha256:730d86af59e0e43ce277bb83970530dd223bf7f2a838e086b50affa6ec5f9295 \ - --hash=sha256:800100d45176652ded796134277ecb13640c1a537cad3b8b53da45aa96330453 \ - --hash=sha256:80fcbf3add8790caddfab6764bde258b5d09aefbe9169c183f88a7410f0f6dea \ - --hash=sha256:82b5dba6eb1bcc29cc305a18a3c5365d2af06ee71b123216416f7e20d2a84e5b \ - --hash=sha256:852dc840f6d7c985603e60b5deaae1d89c56cb038b577f6b5b8c808c97580f1d \ - --hash=sha256:8ad4ad1429cd4f315f32ef263c1342166695fad76c100c5d979c45d5570ed58b \ - --hash=sha256:8ae369e84466aa70f3154ee23c1451fda10a8ee1b63923ce76667e3077f2b0c4 \ - --hash=sha256:93e8248d650e7e9d49e8251f883eed60ecbc0e8ffd6349e18550925e31bd029b \ - --hash=sha256:973a371a55ce9ed333a3a0f8e0bcfae9e0d637711534bcb11e130af2ab9334e7 \ - --hash=sha256:9ba25a71ebf05b9bb0e2ae99f8bc08a07ee8e98c612175087112656ca0f5c8bf \ - --hash=sha256:a10860e00ded1dd0a65b83e717af28845bb7bd16d8ace40fe5531491de76b79f \ - --hash=sha256:a4792d3b3a6dfafefdf8e937f14906a51bd27025a36f4b188728a73382231d91 \ - --hash=sha256:a7420ceda262dbb4b8d839a4ec63d61c261e4e77677ed7c66c99f4e7cb5030dd \ - --hash=sha256:ad91738f14eb8da0ff82f2acd0098b6257621410dcbd4df20aaa5b4233d75a50 \ - --hash=sha256:b6a387d61fe41cdf7ea95b38e9af11cfb1a63499af2759444b99185c4ab33f5b \ - --hash=sha256:b954093679d5750495725ea6f88409946d69cfb25ea7b4c846eef5044194f583 \ - --hash=sha256:bbde71a705f8e9e4c3e9e33db69341d040c827c7afa6789b14c6e16776074f5a \ - --hash=sha256:beeebf760a9c1f4c07ef6a53465e8cfa776ea6a2021eda0d0417ec41043fe984 \ - --hash=sha256:c91b394f7601438ff79a4b93d16be92f216adb57d813a78be4446fe0f6bc2d8c \ - --hash=sha256:c97ff7fedf56d86bae92fa0a646ce1a0ec7509a7578e1ed238731ba13aabcd1c \ - --hash=sha256:cb53e2a99df28eee3b5f4fea166020d3ef9116fdc5764bc5117486e6d1211b25 \ - --hash=sha256:cbf445eb5628981a80f54087f9acdbf84f9b7d862756110d172993b9a5ae81aa \ - --hash=sha256:d06b24c686a34c86c8c1fba923181eae6b10565e4d80bdd7bc1c8e2f11247aa4 \ - --hash=sha256:d98e66a24497637dd31ccab090b34392dddb1f2f811c4b4cd80c230205c074a3 \ - --hash=sha256:db15ce28e1e127a0013dfb8ac243a8e392db8c61eae113337536edb28bdc1f97 \ - --hash=sha256:db842712984e91707437461930e6011e60b39136c7331e971952bb30465bc1a1 \ - --hash=sha256:e24bfe89c6ac4c31792793ad9f861b8f6dc4546ac6dc8f1c9083c7c4f2b335cd \ - --hash=sha256:e81c52638315ff4ac1b533d427f50bc0afc746deb949210bc85f05d4f15fd772 \ - --hash=sha256:e9393357f19954248b00bed7c56f29a25c930593a77630c719653d51e7669c2a \ - --hash=sha256:ee3941769bd2522fe39222206f6dd97ae83c442a94c90f2b7a25d847d40f4729 \ - --hash=sha256:f31ae06f1328595d762c9a2bf29dafd8621c7d3adc130cbb46278079758779ca \ - --hash=sha256:f94190df587738280d544971500b9cafc9b950d32efcb1fba9ac10d84e6aa4e6 \ - --hash=sha256:fa7d686ed9883f3d664d39d5a8e74d3c5f63e603c2e3ff0abcba23eac6542635 \ - --hash=sha256:fb532dd9900381d2e8f48172ddc5a59db4c445a11b9fab40b3b786da40d3b56b \ - --hash=sha256:fe32482b37b4b00c7a52a07211b479653b7fe4f22b2e481b9a9b099d8a430f2f - # via jinja2 -mdurl==0.1.2 \ - --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ - --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba - # via markdown-it-py -more-itertools==10.5.0 \ - --hash=sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef \ - --hash=sha256:5482bfef7849c25dc3c6dd53a6173ae4795da2a41a80faea6700d9f5846c5da6 - # via - # jaraco-classes - # jaraco-functools -nh3==0.2.18 \ - --hash=sha256:0411beb0589eacb6734f28d5497ca2ed379eafab8ad8c84b31bb5c34072b7164 \ - --hash=sha256:14c5a72e9fe82aea5fe3072116ad4661af5cf8e8ff8fc5ad3450f123e4925e86 \ - 
--hash=sha256:19aaba96e0f795bd0a6c56291495ff59364f4300d4a39b29a0abc9cb3774a84b \ - --hash=sha256:34c03fa78e328c691f982b7c03d4423bdfd7da69cd707fe572f544cf74ac23ad \ - --hash=sha256:36c95d4b70530b320b365659bb5034341316e6a9b30f0b25fa9c9eff4c27a204 \ - --hash=sha256:3a157ab149e591bb638a55c8c6bcb8cdb559c8b12c13a8affaba6cedfe51713a \ - --hash=sha256:42c64511469005058cd17cc1537578eac40ae9f7200bedcfd1fc1a05f4f8c200 \ - --hash=sha256:5f36b271dae35c465ef5e9090e1fdaba4a60a56f0bb0ba03e0932a66f28b9189 \ - --hash=sha256:6955369e4d9f48f41e3f238a9e60f9410645db7e07435e62c6a9ea6135a4907f \ - --hash=sha256:7b7c2a3c9eb1a827d42539aa64091640bd275b81e097cd1d8d82ef91ffa2e811 \ - --hash=sha256:8ce0f819d2f1933953fca255db2471ad58184a60508f03e6285e5114b6254844 \ - --hash=sha256:94a166927e53972a9698af9542ace4e38b9de50c34352b962f4d9a7d4c927af4 \ - --hash=sha256:a7f1b5b2c15866f2db413a3649a8fe4fd7b428ae58be2c0f6bca5eefd53ca2be \ - --hash=sha256:c8b3a1cebcba9b3669ed1a84cc65bf005728d2f0bc1ed2a6594a992e817f3a50 \ - --hash=sha256:de3ceed6e661954871d6cd78b410213bdcb136f79aafe22aa7182e028b8c7307 \ - --hash=sha256:f0eca9ca8628dbb4e916ae2491d72957fdd35f7a5d326b7032a345f111ac07fe - # via readme-renderer -nox==2024.10.9 \ - --hash=sha256:1d36f309a0a2a853e9bccb76bbef6bb118ba92fa92674d15604ca99adeb29eab \ - --hash=sha256:7aa9dc8d1c27e9f45ab046ffd1c3b2c4f7c91755304769df231308849ebded95 - # via -r requirements.in -packaging==24.1 \ - --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ - --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 - # via - # gcp-releasetool - # nox -pkginfo==1.10.0 \ - --hash=sha256:5df73835398d10db79f8eecd5cd86b1f6d29317589ea70796994d49399af6297 \ - --hash=sha256:889a6da2ed7ffc58ab5b900d888ddce90bce912f2d2de1dc1c26f4cb9fe65097 - # via twine -platformdirs==4.3.6 \ - --hash=sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907 \ - --hash=sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb - # via virtualenv -proto-plus==1.24.0 \ - --hash=sha256:30b72a5ecafe4406b0d339db35b56c4059064e69227b8c3bda7462397f966445 \ - --hash=sha256:402576830425e5f6ce4c2a6702400ac79897dab0b4343821aa5188b0fab81a12 - # via google-api-core -protobuf==5.28.2 \ - --hash=sha256:2c69461a7fcc8e24be697624c09a839976d82ae75062b11a0972e41fd2cd9132 \ - --hash=sha256:35cfcb15f213449af7ff6198d6eb5f739c37d7e4f1c09b5d0641babf2cc0c68f \ - --hash=sha256:52235802093bd8a2811abbe8bf0ab9c5f54cca0a751fdd3f6ac2a21438bffece \ - --hash=sha256:59379674ff119717404f7454647913787034f03fe7049cbef1d74a97bb4593f0 \ - --hash=sha256:5e8a95246d581eef20471b5d5ba010d55f66740942b95ba9b872d918c459452f \ - --hash=sha256:87317e9bcda04a32f2ee82089a204d3a2f0d3c8aeed16568c7daf4756e4f1fe0 \ - --hash=sha256:8ddc60bf374785fb7cb12510b267f59067fa10087325b8e1855b898a0d81d276 \ - --hash=sha256:a8b9403fc70764b08d2f593ce44f1d2920c5077bf7d311fefec999f8c40f78b7 \ - --hash=sha256:c0ea0123dac3399a2eeb1a1443d82b7afc9ff40241433296769f7da42d142ec3 \ - --hash=sha256:ca53faf29896c526863366a52a8f4d88e69cd04ec9571ed6082fa117fac3ab36 \ - --hash=sha256:eeea10f3dc0ac7e6b4933d32db20662902b4ab81bf28df12218aa389e9c2102d - # via - # gcp-docuploader - # gcp-releasetool - # google-api-core - # googleapis-common-protos - # proto-plus -pyasn1==0.6.1 \ - --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \ - --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034 - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.4.1 \ - 
--hash=sha256:49bfa96b45a292b711e986f222502c1c9a5e1f4e568fc30e2574a6c7d07838fd \ - --hash=sha256:c28e2dbf9c06ad61c71a075c7e0f9fd0f1b0bb2d2ad4377f240d33ac2ab60a7c - # via google-auth -pycparser==2.22 \ - --hash=sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6 \ - --hash=sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc - # via cffi -pygments==2.18.0 \ - --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ - --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a - # via - # readme-renderer - # rich -pyjwt==2.9.0 \ - --hash=sha256:3b02fb0f44517787776cf48f2ae25d8e14f300e6d7545a4315cee571a415e850 \ - --hash=sha256:7e1e5b56cc735432a7369cbfa0efe50fa113ebecdc04ae6922deba8b84582d0c - # via gcp-releasetool -pyperclip==1.9.0 \ - --hash=sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310 - # via gcp-releasetool -python-dateutil==2.9.0.post0 \ - --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ - --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 - # via gcp-releasetool -readme-renderer==44.0 \ - --hash=sha256:2fbca89b81a08526aadf1357a8c2ae889ec05fb03f5da67f9769c9a592166151 \ - --hash=sha256:8712034eabbfa6805cacf1402b4eeb2a73028f72d1166d6f5cb7f9c047c5d1e1 - # via twine -requests==2.32.3 \ - --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ - --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 - # via - # gcp-releasetool - # google-api-core - # google-cloud-storage - # requests-toolbelt - # twine -requests-toolbelt==1.0.0 \ - --hash=sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6 \ - --hash=sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06 - # via twine -rfc3986==2.0.0 \ - --hash=sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd \ - --hash=sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c - # via twine -rich==13.9.2 \ - --hash=sha256:51a2c62057461aaf7152b4d611168f93a9fc73068f8ded2790f29fe2b5366d0c \ - --hash=sha256:8c82a3d3f8dcfe9e734771313e606b39d8247bb6b826e196f4914b333b743cf1 - # via twine -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth -secretstorage==3.3.3 \ - --hash=sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77 \ - --hash=sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99 - # via keyring -six==1.16.0 \ - --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ - --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 - # via - # gcp-docuploader - # python-dateutil -tomli==2.0.2 \ - --hash=sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38 \ - --hash=sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed - # via nox -twine==5.1.1 \ - --hash=sha256:215dbe7b4b94c2c50a7315c0275d2258399280fbb7d04182c7e55e24b5f93997 \ - --hash=sha256:9aa0825139c02b3434d913545c7b847a21c835e11597f5255842d457da2322db - # via -r requirements.in -typing-extensions==4.12.2 \ - --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ - --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 - # via - # -r requirements.in - # rich -urllib3==2.2.3 \ - 
--hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ - --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 - # via - # requests - # twine -virtualenv==20.26.6 \ - --hash=sha256:280aede09a2a5c317e409a00102e7077c6432c5a38f0ef938e643805a7ad2c48 \ - --hash=sha256:7345cc5b25405607a624d8418154577459c3e0277f5466dd79c49d5e492995f2 - # via nox -wheel==0.44.0 \ - --hash=sha256:2376a90c98cc337d18623527a97c31797bd02bad0033d41547043a1cbfbe448f \ - --hash=sha256:a29c3f2817e95ab89aa4660681ad547c0e9547f20e75b0562fe7723c9a2a9d49 - # via -r requirements.in -zipp==3.20.2 \ - --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ - --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 - # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -setuptools==75.1.0 \ - --hash=sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2 \ - --hash=sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538 - # via -r requirements.in diff --git a/CHANGELOG.md b/CHANGELOG.md index 78ecfa53d9..2bc43072b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,31 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.41.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.40.0...v1.41.0) (2025-03-19) + + +### Features + +* Add support for the 'right' parameter in 'pandas.cut' ([#1496](https://github.com/googleapis/python-bigquery-dataframes/issues/1496)) ([8aff128](https://github.com/googleapis/python-bigquery-dataframes/commit/8aff1285b26754118cc8ee906c4ac3076456a791)) +* Support BQ managed functions through `read_gbq_function` ([#1476](https://github.com/googleapis/python-bigquery-dataframes/issues/1476)) ([802183d](https://github.com/googleapis/python-bigquery-dataframes/commit/802183dc000ad2ce5559d14181dd3f7d036b3fed)) +* Warn when the BigFrames version is more than a year old ([#1455](https://github.com/googleapis/python-bigquery-dataframes/issues/1455)) ([00e0750](https://github.com/googleapis/python-bigquery-dataframes/commit/00e07508cfb0d8798e079b86a14834b3b593aa54)) + + +### Bug Fixes + +* Fix pandas.cut errors with empty bins ([#1499](https://github.com/googleapis/python-bigquery-dataframes/issues/1499)) ([434fb5d](https://github.com/googleapis/python-bigquery-dataframes/commit/434fb5dd60d11f09b808ea656394790aba43fdde)) +* Fix read_gbq with ORDER BY query and index_col set ([#963](https://github.com/googleapis/python-bigquery-dataframes/issues/963)) ([de46d2f](https://github.com/googleapis/python-bigquery-dataframes/commit/de46d2fdf7a1a30b2be07dbaa1cb127f10f5fe30)) + + +### Performance Improvements + +* Eliminate count queries in llm retry ([#1489](https://github.com/googleapis/python-bigquery-dataframes/issues/1489)) ([1c934c2](https://github.com/googleapis/python-bigquery-dataframes/commit/1c934c2fe2374c9abaaa79696f5e5f349248f3b7)) + + +### Documentation + +* Add a sample notebook for vector search ([#1500](https://github.com/googleapis/python-bigquery-dataframes/issues/1500)) 
([f3bf139](https://github.com/googleapis/python-bigquery-dataframes/commit/f3bf139d33ed00ca3081e4e0315f409fdb2ad84d)) + ## [1.40.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.39.0...v1.40.0) (2025-03-11) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 66d9d6772f..db6007b41a 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2616,7 +2616,7 @@ def _get_rows_as_json_values(self) -> Block: # The only ways this code is used is through df.apply(axis=1) cope path # TODO: Stop using internal API destination, query_job = self.session._loader._query_to_destination( - json_sql, index_cols=[ordering_column_name], api_name="apply" + json_sql, cluster_candidates=[ordering_column_name], api_name="apply" ) if not destination: raise ValueError(f"Query job {query_job} did not produce result table") diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 93fddf196e..f96471e200 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -364,8 +364,12 @@ def _( if op.labels is False: for this_bin in range(op.bins - 1): + if op.right: + case_expr = x <= (col_min + (this_bin + 1) * bin_width) + else: + case_expr = x < (col_min + (this_bin + 1) * bin_width) out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), + case_expr, compile_ibis_types.literal_to_ibis_scalar( this_bin, force_dtype=pd.Int64Dtype() ), @@ -375,32 +379,49 @@ def _( interval_struct = None adj = (col_max - col_min) * 0.001 for this_bin in range(op.bins): - left_edge = ( - col_min + this_bin * bin_width - (0 if this_bin > 0 else adj) - ) - right_edge = col_min + (this_bin + 1) * bin_width - interval_struct = ibis_types.struct( - { - "left_exclusive": left_edge, - "right_inclusive": right_edge, - } - ) + left_edge_adj = adj if this_bin == 0 and op.right else 0 + right_edge_adj = adj if this_bin == op.bins - 1 and not op.right else 0 + + left_edge = col_min + this_bin * bin_width - left_edge_adj + right_edge = col_min + (this_bin + 1) * bin_width + right_edge_adj + + if op.right: + interval_struct = ibis_types.struct( + { + "left_exclusive": left_edge, + "right_inclusive": right_edge, + } + ) + else: + interval_struct = ibis_types.struct( + { + "left_inclusive": left_edge, + "right_exclusive": right_edge, + } + ) if this_bin < op.bins - 1: - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - interval_struct, - ) + if op.right: + case_expr = x <= (col_min + (this_bin + 1) * bin_width) + else: + case_expr = x < (col_min + (this_bin + 1) * bin_width) + out = out.when(case_expr, interval_struct) else: out = out.when(x.notnull(), interval_struct) else: # Interpret as intervals for interval in op.bins: left = compile_ibis_types.literal_to_ibis_scalar(interval[0]) right = compile_ibis_types.literal_to_ibis_scalar(interval[1]) - condition = (x > left) & (x <= right) - interval_struct = ibis_types.struct( - {"left_exclusive": left, "right_inclusive": right} - ) + if op.right: + condition = (x > left) & (x <= right) + interval_struct = ibis_types.struct( + {"left_exclusive": left, "right_inclusive": right} + ) + else: + condition = (x >= left) & (x < right) + interval_struct = ibis_types.struct( + {"left_inclusive": left, "right_exclusive": right} + ) out = out.when(condition, interval_struct) return out.end() diff --git a/bigframes/core/reshape/tile.py b/bigframes/core/reshape/tile.py index 2a2ca9de95..d9a5a87145 
100644 --- a/bigframes/core/reshape/tile.py +++ b/bigframes/core/reshape/tile.py @@ -15,7 +15,6 @@ from __future__ import annotations import typing -from typing import Iterable, Optional, Union import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -33,33 +32,42 @@ def cut( x: bigframes.series.Series, - bins: Union[ + bins: typing.Union[ int, pd.IntervalIndex, - Iterable, + typing.Iterable, ], *, - labels: Union[Iterable[str], bool, None] = None, + right: typing.Optional[bool] = True, + labels: typing.Union[typing.Iterable[str], bool, None] = None, ) -> bigframes.series.Series: - if isinstance(bins, int) and bins <= 0: - raise ValueError("`bins` should be a positive integer.") + if labels is not None and labels is not False: + raise NotImplementedError( + "The 'labels' parameter must be either False or None. " + "Please provide a valid value for 'labels'." + ) - if isinstance(bins, Iterable): + if isinstance(bins, int): + if bins <= 0: + raise ValueError("`bins` should be a positive integer.") + op = agg_ops.CutOp(bins, right=right, labels=labels) + return x._apply_window_op(op, window_spec=window_specs.unbound()) + elif isinstance(bins, typing.Iterable): if isinstance(bins, pd.IntervalIndex): as_index: pd.IntervalIndex = bins bins = tuple((bin.left.item(), bin.right.item()) for bin in bins) + # To maintain consistency with pandas' behavior + right = True elif len(list(bins)) == 0: - raise ValueError("`bins` iterable should have at least one item") + as_index = pd.IntervalIndex.from_tuples(list(bins)) + bins = tuple() elif isinstance(list(bins)[0], tuple): as_index = pd.IntervalIndex.from_tuples(list(bins)) bins = tuple(bins) + # To maintain consistency with pandas' behavior + right = True elif pd.api.types.is_number(list(bins)[0]): bins_list = list(bins) - if len(bins_list) < 2: - raise ValueError( - "`bins` iterable of numeric breaks should have" - " at least two items" - ) as_index = pd.IntervalIndex.from_breaks(bins_list) single_type = all([isinstance(n, type(bins_list[0])) for n in bins_list]) numeric_type = type(bins_list[0]) if single_type else float @@ -70,20 +78,20 @@ def cut( ] ) else: - raise ValueError("`bins` iterable should contain tuples or numerics") + raise ValueError("`bins` iterable should contain tuples or numerics.") if as_index.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") - - if labels is not None and labels is not False: - raise NotImplementedError( - "The 'labels' parameter must be either False or None. " - "Please provide a valid value for 'labels'." 
-        )
-
-    return x._apply_window_op(
-        agg_ops.CutOp(bins, labels=labels), window_spec=window_specs.unbound()
-    )
+        elif len(as_index) == 0:
+            op = agg_ops.CutOp(bins, right=right, labels=labels)
+            return bigframes.series.Series(
+                [pd.NA] * len(x), dtype=op.output_type(), name=x.name
+            )
+        else:
+            op = agg_ops.CutOp(bins, right=right, labels=labels)
+            return x._apply_window_op(op, window_spec=window_specs.unbound())
+    else:
+        raise ValueError("`bins` must be an integer or iterable.")
 cut.__doc__ = vendored_pandas_tile.cut.__doc__
@@ -93,7 +101,7 @@ def qcut(
     x: bigframes.series.Series,
     q: typing.Union[int, typing.Sequence[float]],
     *,
-    labels: Optional[bool] = None,
+    labels: typing.Optional[bool] = None,
     duplicates: typing.Literal["drop", "error"] = "error",
 ) -> bigframes.series.Series:
     if isinstance(q, int) and q <= 0:
diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 262b23abd2..abab9fd268 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -4108,7 +4108,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
         )
     def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
-        # In Bigframes remote function, DataFrame '.apply' method is specifically
+        # In Bigframes BigQuery function, DataFrame '.apply' method is specifically
         # designed to work with row-wise or column-wise operations, where the input
         # to the applied function should be a Series, not a scalar.
@@ -4116,24 +4116,18 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
             msg = bfe.format_message("axis=1 scenario is in preview.")
             warnings.warn(msg, category=bfe.PreviewWarning)
-            # TODO(jialuo): Deprecate the "bigframes_remote_function" attribute.
-            # We have some tests using pre-defined remote_function that were
-            # defined based on "bigframes_remote_function" instead of
-            # "bigframes_bigquery_function". So we need to fix those pre-defined
-            # remote functions before deprecating the "bigframes_remote_function"
-            # attribute. Check if the function is a remote function.
-            if not hasattr(func, "bigframes_remote_function") and not hasattr(
-                func, "bigframes_bigquery_function"
-            ):
-                raise ValueError("For axis=1 a bigframes function must be used.")
+            if not hasattr(func, "bigframes_bigquery_function"):
+                raise ValueError(
+                    "For axis=1 a BigFrames BigQuery function must be used."
+                )
             is_row_processor = getattr(func, "is_row_processor")
             if is_row_processor:
                 # Early check whether the dataframe dtypes are currently supported
-                # in the remote function
+                # in the bigquery function
                 # NOTE: Keep in sync with the value converters used in the gcf code
                 # generated in function_template.py
-                remote_function_supported_dtypes = (
+                bigquery_function_supported_dtypes = (
                     bigframes.dtypes.INT_DTYPE,
                     bigframes.dtypes.FLOAT_DTYPE,
                     bigframes.dtypes.BOOL_DTYPE,
@@ -4142,18 +4136,18 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs):
                 supported_dtypes_types = tuple(
                     type(dtype)
-                    for dtype in remote_function_supported_dtypes
+                    for dtype in bigquery_function_supported_dtypes
                     if not isinstance(dtype, pandas.ArrowDtype)
                 )
                 # Check ArrowDtype separately since multiple BigQuery types map to
                 # ArrowDtype, including BYTES and TIMESTAMP.
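As an aside to the `cut` changes above, here is a minimal usage sketch of the new `right` parameter in `bigframes.pandas.cut`. It is illustrative only, assumes a configured BigQuery session, and is not part of the diff itself:

import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3, 4, 5])

# Default right=True: equal-width buckets are left-exclusive / right-inclusive,
# so the boundary value 3 lands in the first of the two buckets.
bpd.cut(s, bins=2, labels=False)

# right=False flips the edges to left-inclusive / right-exclusive, so 3 now
# lands in the second bucket, mirroring pandas.cut(..., right=False).
bpd.cut(s, bins=2, right=False, labels=False)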
supported_arrow_types = tuple( dtype.pyarrow_dtype - for dtype in remote_function_supported_dtypes + for dtype in bigquery_function_supported_dtypes if isinstance(dtype, pandas.ArrowDtype) ) supported_dtypes_hints = tuple( - str(dtype) for dtype in remote_function_supported_dtypes + str(dtype) for dtype in bigquery_function_supported_dtypes ) for dtype in self.dtypes: @@ -4186,10 +4180,11 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): ) else: # This is a special case where we are providing not-pandas-like - # extension. If the remote function can take one or more params - # then we assume that here the user intention is to use the - # column values of the dataframe as arguments to the function. - # For this to work the following condition must be true: + # extension. If the bigquery function can take one or more + # params then we assume that here the user intention is to use + # the column values of the dataframe as arguments to the + # function. For this to work the following condition must be + # true: # 1. The number or input params in the function must be same # as the number of columns in the dataframe # 2. The dtypes of the columns in the dataframe must be @@ -4231,14 +4226,16 @@ def apply(self, func, *, axis=0, args: typing.Tuple = (), **kwargs): return result_series - # At this point column-wise or element-wise remote function operation will + # At this point column-wise or element-wise bigquery function operation will # be performed (not supported). - if hasattr(func, "bigframes_remote_function"): - raise NotImplementedError( - "BigFrames DataFrame '.apply()' does not support remote function " - "for column-wise (i.e. with axis=0) operations, please use a " - "regular python function instead. For element-wise operations of " - "the remote function, please use '.map()'." + if hasattr(func, "bigframes_bigquery_function"): + raise formatter.create_exception_with_feedback_link( + NotImplementedError, + "BigFrames DataFrame '.apply()' does not support BigFrames " + "BigQuery function for column-wise (i.e. with axis=0) " + "operations, please use a regular python function instead. For " + "element-wise operations of the BigFrames BigQuery function, " + "please use '.map()'.", ) # Per-column apply diff --git a/bigframes/exceptions.py b/bigframes/exceptions.py index 8b35d9122b..6197481253 100644 --- a/bigframes/exceptions.py +++ b/bigframes/exceptions.py @@ -91,9 +91,8 @@ class BadIndexerKeyWarning(Warning): """The indexer key is not used correctly.""" -class ColorFormatter: - WARNING = "\033[93m" - ENDC = "\033[0m" +class ObsoleteVersionWarning(Warning): + """The BigFrames version is too old.""" def format_message(message: str, fill: bool = True): @@ -106,10 +105,9 @@ def format_message(message: str, fill: bool = True): especially if the message already contains newlines. Returns: - The formatted message string, with ANSI color codes for warning color - if color is supported, otherwise the original message. If `fill` is - True, the message will be wrapped to fit the terminal width. + The formatted message string. If `fill` is True, the message will be wrapped + to fit the terminal width. 
""" if fill: message = textwrap.fill(message) - return ColorFormatter.WARNING + message + ColorFormatter.ENDC + return message diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 63249b1a8a..48afb4fdbd 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -17,7 +17,7 @@ import datetime import random -from typing import Any, Optional, Union +from typing import Any, Optional, Type, Union import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions @@ -48,6 +48,16 @@ def add_feedback_link( exception.message = exception.message + f" {constants.FEEDBACK_LINK}" +def create_exception_with_feedback_link( + exception: Type[Exception], + arg: str = "", +): + if arg: + return exception(arg + f" {constants.FEEDBACK_LINK}") + + return exception(constants.FEEDBACK_LINK) + + def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): """Return query job in html format. Args: diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 3e69563db6..37b435eeec 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -26,9 +26,9 @@ import types from typing import cast, Tuple, TYPE_CHECKING -from bigframes_vendored import constants import requests +import bigframes.formatting_helpers as bf_formatting import bigframes.functions.function_template as bff_template if TYPE_CHECKING: @@ -366,10 +366,9 @@ def create_cloud_function( headers={"content-type": "application/zip"}, ) if response.status_code != 200: - raise RuntimeError( - "Failed to upload user code. code={}, reason={}, text={}".format( - response.status_code, response.reason, response.text - ) + raise bf_formatting.create_exception_with_feedback_link( + RuntimeError, + f"Failed to upload user code. code={response.status_code}, reason={response.reason}, text={response.text}", ) # Deploy Cloud Function @@ -399,10 +398,11 @@ def create_cloud_function( function.service_config.available_memory = f"{memory_mib}Mi" if timeout_seconds is not None: if timeout_seconds > 1200: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "BigQuery remote function can wait only up to 20 minutes" ", see for more details " - "https://cloud.google.com/bigquery/quotas#remote_function_limits." + "https://cloud.google.com/bigquery/quotas#remote_function_limits.", ) function.service_config.timeout_seconds = timeout_seconds if max_instance_count is not None: @@ -413,10 +413,9 @@ def create_cloud_function( self._cloud_function_service_account ) if ingress_settings not in _INGRESS_SETTINGS_MAP: - raise ValueError( - "'{}' not one of the supported ingress settings values: {}".format( - ingress_settings, list(_INGRESS_SETTINGS_MAP) - ) + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + f"'{ingress_settings}' not one of the supported ingress settings values: {list(_INGRESS_SETTINGS_MAP)}", ) function.service_config.ingress_settings = cast( functions_v2.ServiceConfig.IngressSettings, @@ -447,8 +446,8 @@ def create_cloud_function( # Fetch the endpoint of the just created function endpoint = self.get_cloud_function_endpoint(cf_name) if not endpoint: - raise ValueError( - f"Couldn't fetch the http endpoint. {constants.FEEDBACK_LINK}" + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "Couldn't fetch the http endpoint." 
) logger.info( @@ -541,8 +540,9 @@ def provision_bq_remote_function( ): input_args = inspect.getargs(def_.__code__).args if len(input_args) != len(input_types): - raise ValueError( - "Exactly one type should be provided for every input arg." + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "Exactly one type should be provided for every input arg.", ) self.create_bq_remote_function( input_args, diff --git a/bigframes/functions/_function_session.py b/bigframes/functions/_function_session.py index a66f619cf9..1444457c90 100644 --- a/bigframes/functions/_function_session.py +++ b/bigframes/functions/_function_session.py @@ -33,7 +33,6 @@ ) import warnings -import bigframes_vendored.constants as constants import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes import bigframes_vendored.ibis.expr.operations.udf as ibis_udf @@ -49,6 +48,7 @@ from bigframes import clients import bigframes.core.compile.ibis_types import bigframes.exceptions as bfe +import bigframes.formatting_helpers as bf_formatting import bigframes.series as bf_series if TYPE_CHECKING: @@ -87,9 +87,10 @@ def _resolve_bigquery_client( if not bigquery_client: bigquery_client = session.bqclient if not bigquery_client: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "A bigquery client must be provided, either directly or via " - f"session. {constants.FEEDBACK_LINK}" + "session.", ) return bigquery_client @@ -104,9 +105,10 @@ def _resolve_bigquery_connection_client( if not bigquery_connection_client: bigquery_connection_client = session.bqconnectionclient if not bigquery_connection_client: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "A bigquery connection client must be provided, either " - f"directly or via session. {constants.FEEDBACK_LINK}" + "directly or via session.", ) return bigquery_connection_client @@ -119,9 +121,10 @@ def _resolve_resource_manager_client( if not resource_manager_client: resource_manager_client = session.resourcemanagerclient if not resource_manager_client: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "A resource manager client must be provided, either directly " - f"or via session. {constants.FEEDBACK_LINK}" + "or via session.", ) return resource_manager_client @@ -149,9 +152,10 @@ def _resolve_cloud_functions_client( if not cloud_functions_client: cloud_functions_client = session.cloudfunctionsclient if not cloud_functions_client: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "A cloud functions client must be provided, either directly " - f"or via session. {constants.FEEDBACK_LINK}" + "or via session.", ) return cloud_functions_client @@ -178,14 +182,16 @@ def _resolve_bigquery_connection_id( bq_connection_id, ) = bigquery_connection.split(".") if gcp_project_id.casefold() != dataset_ref.project.casefold(): - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "The project_id does not match BigQuery connection " - f"gcp_project_id: {dataset_ref.project}." + f"gcp_project_id: {dataset_ref.project}.", ) if bq_connection_location.casefold() != bq_location.casefold(): - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "The location does not match BigQuery connection location: " - f"{bq_location}." 
+ f"{bq_location}.", ) return bq_connection_id @@ -506,9 +512,10 @@ def remote_function( cloud_function_kms_key_name is not None and cloud_function_docker_repository is None ): - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "cloud_function_docker_repository must be specified with cloud_function_kms_key_name." - " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin" + " For more details see https://cloud.google.com/functions/docs/securing/cmek#before_you_begin.", ) if cloud_function_ingress_settings is None: @@ -521,13 +528,25 @@ def remote_function( ) warnings.warn(msg, category=FutureWarning, stacklevel=2) + if cloud_function_ingress_settings is None: + cloud_function_ingress_settings = "all" + msg = bfe.format_message( + "The `cloud_function_ingress_settings` are set to 'all' by default, " + "which will change to 'internal-only' for enhanced security in future version 2.0 onwards. " + "However, you will be able to explicitly pass cloud_function_ingress_settings='all' if you need. " + "See https://cloud.google.com/functions/docs/networking/network-settings#ingress_settings for details." + ) + warnings.warn(msg, category=FutureWarning, stacklevel=2) + bq_connection_manager = session.bqconnectionmanager def wrapper(func): nonlocal input_types, output_type if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) + raise bf_formatting.create_exception_with_feedback_link( + TypeError, f"func must be a callable, got {func}" + ) if sys.version_info >= (3, 10): # Add `eval_str = True` so that deferred annotations are turned into their @@ -547,10 +566,11 @@ def wrapper(func): input_types = [] for parameter in signature.parameters.values(): if (param_type := parameter.annotation) is inspect.Signature.empty: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "'input_types' was not set and parameter " f"'{parameter.name}' is missing a type annotation. " - "Types are required to use @remote_function." + "Types are required to use @remote_function.", ) input_types.append(param_type) elif not isinstance(input_types, collections.abc.Sequence): @@ -560,10 +580,11 @@ def wrapper(func): if ( output_type := signature.return_annotation ) is inspect.Signature.empty: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "'output_type' was not set and function is missing a " "return type annotation. Types are required to use " - "@remote_function." + "@remote_function.", ) # The function will actually be receiving a pandas Series, but allow both @@ -720,7 +741,7 @@ def wrapper(func): # with that name and would directly manage their lifecycle. if created_new and (not name): self._update_temp_artifacts( - func.bigframes_remote_function, func.bigframes_cloud_function + func.bigframes_bigquery_function, func.bigframes_cloud_function ) return func @@ -789,14 +810,15 @@ def udf( https://pip.pypa.io/en/stable/reference/requirements-file-format/. """ if not bigframes.options.experiments.udf: - raise NotImplementedError() + raise bf_formatting.create_exception_with_feedback_link(NotImplementedError) # Check the Python version. 
python_version = _utils.get_python_version() if python_version not in _MANAGED_FUNC_PYTHON_VERSIONS: - raise RuntimeError( + raise bf_formatting.create_exception_with_feedback_link( + RuntimeError, f"Python version {python_version} is not supported yet for " - "BigFrames managed function." + "BigFrames managed function.", ) # Some defaults may be used from the session if not provided otherwise. @@ -823,7 +845,9 @@ def wrapper(func): nonlocal input_types, output_type if not callable(func): - raise TypeError("f must be callable, got {}".format(func)) + raise bf_formatting.create_exception_with_feedback_link( + TypeError, f"func must be a callable, got {func}" + ) # Managed function supports version >= 3.11. signature_kwargs: Mapping[str, Any] = {"eval_str": True} @@ -834,10 +858,11 @@ def wrapper(func): input_types = [] for parameter in signature.parameters.values(): if (param_type := parameter.annotation) is inspect.Signature.empty: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "'input_types' was not set and parameter " f"'{parameter.name}' is missing a type annotation. " - "Types are required to use managed function." + "Types are required to use managed function.", ) input_types.append(param_type) elif not isinstance(input_types, collections.abc.Sequence): @@ -847,10 +872,11 @@ def wrapper(func): if ( output_type := signature.return_annotation ) is inspect.Signature.empty: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "'output_type' was not set and function is missing a " "return type annotation. Types are required to use " - "managed function." + "managed function.", ) # The function will actually be receiving a pandas Series, but allow diff --git a/bigframes/functions/_utils.py b/bigframes/functions/_utils.py index bd6bd920b8..9247017380 100644 --- a/bigframes/functions/_utils.py +++ b/bigframes/functions/_utils.py @@ -30,6 +30,7 @@ import bigframes.core.compile.ibis_types import bigframes.dtypes +import bigframes.formatting_helpers as bf_formatting # Naming convention for the function artifacts _BIGFRAMES_FUNCTION_PREFIX = "bigframes" @@ -276,8 +277,8 @@ def get_bigframes_metadata(*, python_output_type: Optional[type] = None) -> str: get_python_output_type_from_bigframes_metadata(metadata_ser) != python_output_type ): - raise ValueError( - f"python_output_type {python_output_type} is not serializable." + raise bf_formatting.create_exception_with_feedback_link( + ValueError, f"python_output_type {python_output_type} is not serializable." ) return metadata_ser diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 16416eb864..fd2f512f97 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -26,7 +26,6 @@ if TYPE_CHECKING: from bigframes.session import Session -import bigframes_vendored.constants as constants import google.api_core.exceptions import google.api_core.retry from google.cloud import bigquery @@ -35,6 +34,7 @@ import bigframes.core.compile.ibis_types import bigframes.dtypes import bigframes.exceptions as bfe +import bigframes.formatting_helpers as bf_formatting import bigframes.functions.function_template from . 
import _function_session as bff_session @@ -69,16 +69,18 @@ def ibis_signature_from_routine(routine: bigquery.Routine) -> _utils.IbisSignatu routine.description ): if not isinstance(ibis_output_type, ibis_dtypes.String): - raise TypeError( - "An explicit output_type should be provided only for a BigQuery function with STRING output." + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "An explicit output_type should be provided only for a BigQuery function with STRING output.", ) if typing.get_origin(python_output_type) is list: ibis_output_type_override = bigframes.core.compile.ibis_types.ibis_array_output_type_from_python_type( cast(type, python_output_type) ) else: - raise TypeError( - "Currently only list of a type is supported as python output type." + raise bf_formatting.create_exception_with_feedback_link( + TypeError, + "Currently only list of a type is supported as python output type.", ) return _utils.IbisSignature( @@ -153,33 +155,36 @@ def read_gbq_function( try: routine_ref = get_routine_reference(function_name, bigquery_client, session) except DatasetMissingError: - raise ValueError( - "Project and dataset must be provided, either directly or via session. " - f"{constants.FEEDBACK_LINK}" + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + "Project and dataset must be provided, either directly or via session.", ) # Find the routine and get its arguments. try: routine = bigquery_client.get_routine(routine_ref) except google.api_core.exceptions.NotFound: - raise ValueError(f"Unknown function '{routine_ref}'. {constants.FEEDBACK_LINK}") + raise bf_formatting.create_exception_with_feedback_link( + ValueError, f"Unknown function '{routine_ref}'." + ) if is_row_processor and len(routine.arguments) > 1: - raise ValueError( + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "A multi-input function cannot be a row processor. A row processor function " - "takes in a single input representing the row." + "takes in a single input representing the row.", ) try: ibis_signature = ibis_signature_from_routine(routine) except ReturnTypeMissingError: - raise ValueError( - f"Function return type must be specified. {constants.FEEDBACK_LINK}" + raise bf_formatting.create_exception_with_feedback_link( + ValueError, "Function return type must be specified." ) except bigframes.core.compile.ibis_types.UnsupportedTypeError as e: - raise ValueError( - f"Type {e.type} not supported, supported types are {e.supported_types}. " - f"{constants.FEEDBACK_LINK}" + raise bf_formatting.create_exception_with_feedback_link( + ValueError, + f"Type {e.type} not supported, supported types are {e.supported_types}.", ) # The name "args" conflicts with the Ibis operator, so we use @@ -214,7 +219,11 @@ def func(*bigframes_args, **bigframes_kwargs): database=routine_ref.dataset_id, signature=(ibis_signature.input_types, ibis_signature.output_type), ) # type: ignore - func.bigframes_remote_function = str(routine_ref) # type: ignore + func.bigframes_bigquery_function = str(routine_ref) # type: ignore + + # We will keep the "bigframes_remote_function" attr for remote function. 
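For context, a hedged sketch of how the attribute change above is expected to look from the user side of `read_gbq_function`. The project, dataset, and routine names are placeholders, and a configured session is assumed:

import bigframes.pandas as bpd

# Works for remote functions and for BigQuery managed (Python UDF) routines.
func = bpd.read_gbq_function("my-project.my_dataset.my_routine")

# Set for every function loaded this way, regardless of the routine type.
print(func.bigframes_bigquery_function)

# Only present when the routine is a true remote function, i.e. it carries
# remote_function_options pointing at a Cloud Functions endpoint.
print(hasattr(func, "bigframes_remote_function"))

# Element-wise Series.apply keys off bigframes_bigquery_function (assuming the
# routine takes a single scalar argument).
bpd.Series([1, 2, 3]).apply(func)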
+ if hasattr(routine, "remote_function_options") and routine.remote_function_options: + func.bigframes_remote_function = func.bigframes_bigquery_function # type: ignore # set input bigframes data types has_unknown_dtypes = False diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index a0800c19e6..2b25bc82f0 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,7 +22,7 @@ """ import abc -from typing import Callable, cast, Mapping, Optional, TypeVar +from typing import Callable, cast, Mapping, Optional, TypeVar, Union import warnings import bigframes_vendored.sklearn.base @@ -259,38 +259,29 @@ def _predict_and_retry( ) -> bpd.DataFrame: assert self._bqml_model is not None - df_result = bpd.DataFrame(session=self._bqml_model.session) # placeholder - df_fail = X - for _ in range(max_retries + 1): + df_result: Union[bpd.DataFrame, None] = None # placeholder + df_succ = df_fail = X + for i in range(max_retries + 1): + if i > 0 and df_fail.empty: + break + if i > 0 and df_succ.empty: + msg = bfe.format_message("Can't make any progress, stop retrying.") + warnings.warn(msg, category=RuntimeWarning) + break + df = self._predict_func(df_fail, options) success = df[self._status_col].str.len() == 0 df_succ = df[success] df_fail = df[~success] - if df_succ.empty: - if max_retries > 0: - msg = bfe.format_message("Can't make any progress, stop retrying.") - warnings.warn(msg, category=RuntimeWarning) - break - df_result = ( - bpd.concat([df_result, df_succ]) if not df_result.empty else df_succ - ) - - if df_fail.empty: - break - - if not df_fail.empty: - msg = bfe.format_message( - f"Some predictions failed. Check column {self._status_col} for detailed " - "status. You may want to filter the failed rows and retry." + bpd.concat([df_result, df_succ]) if df_result is not None else df_succ ) - warnings.warn(msg, category=RuntimeWarning) df_result = cast( bpd.DataFrame, - bpd.concat([df_result, df_fail]) if not df_result.empty else df_fail, + bpd.concat([df_result, df_fail]) if df_result is not None else df_fail, ) return df_result diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 0ae4516dfd..d25791d3e4 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -339,6 +339,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT class CutOp(UnaryWindowOp): # TODO: Unintuitive, refactor into multiple ops? 
bins: typing.Union[int, Iterable] + right: Optional[bool] labels: Optional[bool] @property @@ -350,17 +351,27 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT return dtypes.INT_DTYPE else: # Assumption: buckets use same numeric type - interval_dtype = ( - pa.float64() - if isinstance(self.bins, int) - else dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) - ) + if isinstance(self.bins, int): + interval_dtype = pa.float64() + elif len(list(self.bins)) == 0: + interval_dtype = pa.int64() + else: + interval_dtype = dtypes.infer_literal_arrow_type(list(self.bins)[0][0]) pa_type = pa.struct( [ - pa.field("left_exclusive", interval_dtype, nullable=True), - pa.field("right_inclusive", interval_dtype, nullable=True), + pa.field( + "left_exclusive" if self.right else "left_inclusive", + interval_dtype, + nullable=True, + ), + pa.field( + "right_inclusive" if self.right else "right_exclusive", + interval_dtype, + nullable=True, + ), ] ) + return pd.ArrowDtype(pa_type) @property diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 88a58acbfa..5e786f8d22 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -201,7 +201,7 @@ def display(self, n: int = 3, *, content_type: str = ""): content_type (str, default ""): content type of the blob. If unset, use the blob metadata of the storage. Possible values are "image", "audio" and "video". """ # col name doesn't matter here. Rename to avoid column name conflicts - df = bigframes.series.Series(self._block).rename("blob_col").head(n).to_frame() + df = bigframes.series.Series(self._block).rename("blob_col").to_frame() df["read_url"] = df["blob_col"].blob.read_url() @@ -210,6 +210,9 @@ def display(self, n: int = 3, *, content_type: str = ""): else: df["content_type"] = df["blob_col"].blob.content_type() + pandas_df, _, query_job = df._block.retrieve_repr_request_results(n) + df._set_internal_query_job(query_job) + def display_single_url( read_url: str, content_type: Union[str, pd._libs.missing.NAType] ): @@ -232,7 +235,7 @@ def display_single_url( response = requests.get(read_url) ipy_display.display(response.content) - for _, row in df.iterrows(): + for _, row in pandas_df.iterrows(): display_single_url(row["read_url"], row["content_type"]) def _resolve_connection(self, connection: Optional[str] = None) -> str: diff --git a/bigframes/operations/semantics.py b/bigframes/operations/semantics.py index 686db50a43..f4b9d85103 100644 --- a/bigframes/operations/semantics.py +++ b/bigframes/operations/semantics.py @@ -381,21 +381,42 @@ def filter(self, instruction: str, model, ground_with_google_search: bool = Fals self._confirm_operation(len(self._df)) df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + if df[column].dtype != dtypes.STRING_DTYPE: df[column] = df[column].astype(dtypes.STRING_DTYPE) user_instruction = self._format_instruction(instruction, columns) output_instruction = "Based on the provided context, reply to the following claim by only True or False:" - results = typing.cast( - bigframes.dataframe.DataFrame, - model.predict( - self._make_prompt(df, columns, user_instruction, output_instruction), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - ), - ) + if has_blob_column: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + df, + 
prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) + else: + results = typing.cast( + bigframes.dataframe.DataFrame, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + ), + ) return self._df[ results["ml_generate_text_llm_result"].str.lower().str.contains("true") @@ -480,7 +501,13 @@ def map( self._confirm_operation(len(self._df)) df: bigframes.dataframe.DataFrame = self._df[columns].copy() + has_blob_column = False for column in columns: + if df[column].dtype == dtypes.OBJ_REF_DTYPE: + # Don't cast blob columns to string + has_blob_column = True + continue + if df[column].dtype != dtypes.STRING_DTYPE: df[column] = df[column].astype(dtypes.STRING_DTYPE) @@ -489,14 +516,29 @@ def map( "Based on the provided contenxt, answer the following instruction:" ) - results = typing.cast( - bigframes.series.Series, - model.predict( - self._make_prompt(df, columns, user_instruction, output_instruction), - temperature=0.0, - ground_with_google_search=ground_with_google_search, - )["ml_generate_text_llm_result"], - ) + if has_blob_column: + results = typing.cast( + bigframes.series.Series, + model.predict( + df, + prompt=self._make_multimodel_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) + else: + results = typing.cast( + bigframes.series.Series, + model.predict( + self._make_text_prompt( + df, columns, user_instruction, output_instruction + ), + temperature=0.0, + ground_with_google_search=ground_with_google_search, + )["ml_generate_text_llm_result"], + ) from bigframes.core.reshape.api import concat @@ -1060,8 +1102,19 @@ def _attach_embedding(dataframe, source_column: str, embedding_column: str, mode result_df[embedding_column] = embeddings return result_df - def _make_prompt( - self, prompt_df, columns, user_instruction: str, output_instruction: str + @staticmethod + def _make_multimodel_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str + ): + prompt = [f"{output_instruction}\n{user_instruction}\nContext: "] + for col in columns: + prompt.extend([f"{col} is ", prompt_df[col]]) + + return prompt + + @staticmethod + def _make_text_prompt( + prompt_df, columns, user_instruction: str, output_instruction: str ): prompt_df["prompt"] = f"{output_instruction}\n{user_instruction}\nContext: " @@ -1071,7 +1124,8 @@ def _make_prompt( return prompt_df["prompt"] - def _parse_columns(self, instruction: str) -> List[str]: + @staticmethod + def _parse_columns(instruction: str) -> List[str]: """Extracts column names enclosed in curly braces from the user instruction. For example, _parse_columns("{city} is in {continent}") == ["city", "continent"] """ diff --git a/bigframes/series.py b/bigframes/series.py index 2c37913679..34ac3c3de9 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -68,9 +68,9 @@ LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] -_remote_function_recommendation_message = ( +_bigquery_function_recommendation_message = ( "Your functions could not be applied directly to the Series." - " Try converting it to a remote function." + " Try converting it to a BigFrames BigQuery function." 
) _list = list # Type alias to escape Series.list property @@ -1530,25 +1530,20 @@ def apply( if not callable(func): raise ValueError( - "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." + "Only a ufunc (a function that applies to the entire Series) or" + " a BigFrames BigQuery function that only works on single values" + " are supported." ) - # TODO(jialuo): Deprecate the "bigframes_remote_function" attribute. - # We have some tests using pre-defined remote_function that were defined - # based on "bigframes_remote_function" instead of - # "bigframes_bigquery_function". So we need to fix those pre-defined - # remote functions before deprecating the "bigframes_remote_function" - # attribute. - if not hasattr(func, "bigframes_remote_function") and not hasattr( - func, "bigframes_bigquery_function" - ): + if not hasattr(func, "bigframes_bigquery_function"): # It is neither a remote function nor a managed function. # Then it must be a vectorized function that applies to the Series # as a whole. if by_row: raise ValueError( - "A vectorized non-remote function can be provided only with by_row=False." - " For element-wise operation it must be a remote function." + "A vectorized non-BigFrames BigQuery function can be " + "provided only with by_row=False. For element-wise operation " + "it must be a BigFrames BigQuery function." ) try: @@ -1556,12 +1551,12 @@ def apply( except Exception as ex: # This could happen if any of the operators in func is not # supported on a Series. Let's guide the customer to use a - # remote function instead + # bigquery function instead if hasattr(ex, "message"): - ex.message += f"\n{_remote_function_recommendation_message}" + ex.message += f"\n{_bigquery_function_recommendation_message}" raise - # We are working with remote function at this point + # We are working with bigquery function at this point result_series = self._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) @@ -1590,21 +1585,21 @@ def combine( ) -> Series: if not callable(func): raise ValueError( - "Only a ufunc (a function that applies to the entire Series) or a remote function that only works on single values are supported." + "Only a ufunc (a function that applies to the entire Series) or" + " a BigFrames BigQuery function that only works on single values" + " are supported." ) - if not hasattr(func, "bigframes_remote_function") and not hasattr( - func, "bigframes_bigquery_function" - ): + if not hasattr(func, "bigframes_bigquery_function"): # Keep this in sync with .apply try: return func(self, other) except Exception as ex: # This could happen if any of the operators in func is not # supported on a Series. 
Let's guide the customer to use a
-                # remote function instead
+                # bigquery function instead
                 if hasattr(ex, "message"):
-                    ex.message += f"\n{_remote_function_recommendation_message}"
+                    ex.message += f"\n{_bigquery_function_recommendation_message}"
                 raise
         result_series = self._apply_binary_op(
@@ -1749,10 +1744,10 @@ def duplicated(self, keep: str = "first") -> Series:
     def mask(self, cond, other=None) -> Series:
         if callable(cond):
-            if hasattr(cond, "bigframes_remote_function"):
+            if hasattr(cond, "bigframes_bigquery_function"):
                 cond = self.apply(cond)
             else:
-                # For non-remote function assume that it is applicable on Series
+                # For non-BigQuery function assume that it is applicable on Series
                 cond = self.apply(cond, by_row=False)
         if not isinstance(cond, Series):
diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py
index 3f081e2177..7b416d4424 100644
--- a/bigframes/session/__init__.py
+++ b/bigframes/session/__init__.py
@@ -16,6 +16,7 @@
 from __future__ import annotations
+import datetime
 import logging
 import os
 import secrets
@@ -54,6 +55,8 @@
 )
 import pyarrow as pa
+from bigframes import exceptions as bfe
+from bigframes import version
 import bigframes._config.bigquery_options as bigquery_options
 import bigframes.clients
 import bigframes.core.blocks as blocks
@@ -65,8 +68,6 @@
 # to register new and replacement ops with the Ibis BigQuery backend.
 import bigframes.dataframe
 import bigframes.dtypes
-import bigframes.exceptions
-import bigframes.exceptions as bfe
 import bigframes.functions._function_session as bff_session
 import bigframes.functions.function as bff
 import bigframes.session._io.bigquery as bf_io_bigquery
@@ -77,7 +78,6 @@
 import bigframes.session.planner
 import bigframes.session.temp_storage
 import bigframes.session.validation
-import bigframes.version
 # Avoid circular imports.
 if typing.TYPE_CHECKING:
@@ -147,6 +147,8 @@ def __init__(
         context: Optional[bigquery_options.BigQueryOptions] = None,
         clients_provider: Optional[bigframes.session.clients.ClientsProvider] = None,
     ):
+        _warn_if_bf_version_is_obsolete()
+
         if context is None:
             context = bigquery_options.BigQueryOptions()
@@ -1813,3 +1815,11 @@ def read_gbq_object_table(
 def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session:
     return Session(context)
+
+
+def _warn_if_bf_version_is_obsolete():
+    today = datetime.datetime.today()
+    release_date = datetime.datetime.strptime(version.__release_date__, "%Y-%m-%d")
+    if today - release_date > datetime.timedelta(days=365):
+        msg = f"Your BigFrames version {version.__version__} is more than 1 year old. Please update to the latest version."
+ warnings.warn(msg, bfe.ObsoleteVersionWarning) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 7c2586fe76..b9859e92a2 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -606,9 +606,10 @@ def read_gbq_query( time_travel_timestamp=None, ) + # No cluster candidates as user query might not be clusterable (eg because of ORDER BY clause) destination, query_job = self._query_to_destination( query, - index_cols, + cluster_candidates=[], api_name=api_name, configuration=configuration, ) @@ -645,7 +646,7 @@ def read_gbq_query( def _query_to_destination( self, query: str, - index_cols: List[str], + cluster_candidates: List[str], api_name: str, configuration: dict = {"query": {"useQueryCache": True}}, do_clustering=True, @@ -668,7 +669,7 @@ def _query_to_destination( assert schema is not None if do_clustering: cluster_cols = bf_io_bigquery.select_cluster_cols( - schema, cluster_candidates=index_cols + schema, cluster_candidates=cluster_candidates ) else: cluster_cols = [] diff --git a/bigframes/version.py b/bigframes/version.py index e4062aa0c6..4d0f809a6f 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.40.0" +__version__ = "1.41.0" # {x-release-please-start-date} -__release_date__ = "2025-03-11" +__release_date__ = "2025-03-19" # {x-release-please-end} diff --git a/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb new file mode 100644 index 0000000000..20d5b4161d --- /dev/null +++ b/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb @@ -0,0 +1,2169 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TpJu6BBeooES" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EQbZKS7_ooET" + }, + "source": [ + "## Build a Vector Search application using BigQuery DataFrames (aka BigFrames)\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**Author:** Sudipto Guha (Google)\n", + "\n", + "**Last updated:** March 16th 2025" + ], + "metadata": { + "id": "vFMjpPBo9aVv" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SHQ3Gx-oooEU" + }, + "source": [ + "## Overview\n", + "\n", + "This notebook will guide you through a practical example of using [BigFrames](https://github.com/googleapis/python-bigquery-dataframes/issues) to perform [vector search](https://cloud.google.com/bigquery/docs/vector-search-intro) and analysis on a patent dataset within BigQuery. We will leverage Python and BigFrames to efficiently process, analyze, and gain insights from a large-scale dataset without moving data from BigQuery.\n", + "\n", + "Here's a breakdown of what we'll cover:\n", + "\n", + "1. **Data Ingestion and Embedding Generation:**\n", + "We will start by reading a public patent dataset directly from BigQuery into a BigFrames DataFrame.\n", + "We'll demonstrate how to use BigFrames' `TextEmbeddingGenerator` to create text embeddings for the patent abstracts. This process converts the textual data into numerical vectors that capture the semantic meaning of each abstract.\n", + "We'll show how BigFrames efficiently performs this embedding generation within BigQuery, avoiding data transfer to the client-side.\n", + "Finally, we'll store the generated embeddings back into a new BigQuery table for subsequent analysis.\n", + "\n", + "2. **Indexing and Similarity Search:**\n", + "Here we'll create a vector index using BigFrames to enable fast and scalable similarity searches.\n", + "We'll demonstrate how to create an IVF index for efficient approximate nearest neighbor searches.\n", + "We'll then perform a vector search using a sample query string to find patents that are semantically similar to the query. This showcases how vector search goes beyond keyword matching to find relevant results based on meaning.\n", + "\n", + "3. **AI-Powered Summarization with Retrieval Augmented Generation (RAG):**\n", + "To further enhance the analysis, we'll implement a RAG pipeline.\n", + "We'll retrieve the top most similar patents based on the vector search results from step 2.\n", + "We'll use BigFrames' `GeminiTextGenerator` to create a prompt for an LLM to generate a concise summary of the retrieved patents.\n", + "This demonstrates how to combine vector search with generative AI to extract and synthesize meaningful insights from complex patent data.\n", + "\n", + "\n", + "We will tie these pieces together in Python using BigQuery DataFrames. [Click here](https://cloud.google.com/bigquery/docs/dataframes-quickstart) to learn more about BigQuery DataFrames!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EHjmqb-0ooEU" + }, + "source": [ + "### Dataset\n", + "\n", + "This notebook uses the [BQ Patents Public Dataset](https://bigquery.cloud.google.com/dataset/patents-public-data:patentsview)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AqdihIDJooEU" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (compute)\n", + "* BigQuery ML\n", + "* Generative AI support on Vertex AI\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models), [Generative AI support on Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing#generative_ai_models),\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Setup & initialization\n", + "\n", + "Make sure you have the required roles and permissions listed below:\n", + "\n", + "For [Vector embedding generation](https://cloud.google.com/bigquery/docs/generate-text-embedding#required_roles)\n", + "\n", + "For [Vector Index creation](https://cloud.google.com/bigquery/docs/vector-index#roles_and_permissions)" + ], + "metadata": { + "id": "GqLjnm1hsKGU" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z-mvYJUCooEV" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xn-v3mSvooEV" + }, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,aiplatform.googleapis.com) to enable the following APIs:\n", + "\n", + " * BigQuery API\n", + " * BigQuery Connection API\n", + " * Vertex AI API\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ioydzb_8ooEV" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, see the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "b8bKCfIiooEV", + "executionInfo": { + "status": "ok", + "timestamp": 1742191597773, + "user_tz": -480, + "elapsed": 2, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "outputs": [], + "source": [ + "# set your project ID below\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# set your region\n", + "REGION = \"US\" # @param {type: \"string\"}\n", + "\n", + "# Set the project id in gcloud\n", + "#! 
gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GbUgWr6LooEV" + }, + "source": [ + "#### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U7ChP8jUooEV" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VfHOYcZZooEW" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3cGhUVM0ooEW" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AoHnXlg-ooEW" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j3lmnsh7ooEW", + "executionInfo": { + "status": "ok", + "timestamp": 1742191608487, + "user_tz": -480, + "elapsed": 2, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "eb68daf5-5558-487a-91d2-4b4f9e476da0" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "WARNING: google.colab.auth.authenticate_user() is not supported in Colab Enterprise.\n" + ] + } + ], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a9gsyttuooEW" + }, + "source": [ + "Now we are ready to use BigQuery DataFrames!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xckgWno6ouHY" + }, + "source": [ + "## Step 1: Data Ingestion and Embedding Generation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Hjg9jDN-ooEW" + }, + "source": [ + "Install libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "R7STCS8xB5d2", + "executionInfo": { + "status": "ok", + "timestamp": 1742195413800, + "user_tz": -480, + "elapsed": 947, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "import bigframes.ml as bf_ml\n", + "import bigframes.bigquery as bf_bq\n", + "import bigframes.ml.llm as bf_llm\n", + "\n", + "\n", + "from google.cloud import bigquery\n", + "from google.cloud import storage\n", + "\n", + "# Construct a BigQuery client object.\n", + "client = bigquery.Client()\n", + "\n", + "import pandas as pd\n", + "from IPython.display import Image, display\n", + "from PIL import Image as PILImage\n", + "import io\n", + "\n", + "import json\n", + "from IPython.display import Markdown\n", + "\n", + "# Note: The project option is not required in all environments.\n", + "# On BigQuery Studio, the project ID is automatically detected.\n", + "bf.options.bigquery.project = PROJECT_ID\n", + "bf.options.bigquery.location = REGION\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Partial ordering mode allows BigQuery DataFrames to push down many more row and column filters. On large clustered and partitioned tables, this can greatly reduce the number of bytes scanned and computation slots used. 
This [blog post](https://medium.com/google-cloud/introducing-partial-ordering-mode-for-bigquery-dataframes-bigframes-ec35841d95c0) goes over it in more detail." + ], + "metadata": { + "id": "iOFF9hrvs5WE" + } + }, + { + "cell_type": "code", + "source": [ + "bf.options.bigquery.ordering_mode = \"partial\"" + ], + "metadata": { + "id": "9Gil1Oaas7KA", + "executionInfo": { + "status": "ok", + "timestamp": 1742191620533, + "user_tz": -480, + "elapsed": 2, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XGaGyyZsooEW" + }, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.close_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v6FGschEowht" + }, + "source": [ + "Data Input - read the data from a publicly available BigQuery dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "id": "zDSwoBo1CU3G", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "executionInfo": { + "status": "ok", + "timestamp": 1742192516923, + "user_tz": -480, + "elapsed": 468, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "83edbc2f-5a23-407b-8890-f968eb31be44" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py:3553: UserWarning: \u001b[93mReading cached table from 2025-03-17 06:07:09.526507+00:00 to avoid\n", + "incompatibilies with previous reads of this table. To read the latest\n", + "version, set `use_cache=False` or close the current session with\n", + "Session.close() or bigframes.pandas.close_session().\u001b[0m\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + } + ], + "source": [ + "publications = bf.read_gbq('patents-public-data.google_patents_research.publications')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "id": "tYDoaKgJChiq", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "executionInfo": { + "status": "ok", + "timestamp": 1742192524632, + "user_tz": -480, + "elapsed": 6697, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "9174da29-a051-4a99-e38f-6a2b09cfe4e9" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 6f15ad71-cc7b-49c1-90e9-274bea7afbb9 is DONE. 477.4 GB processed. 
Open Job" + ] + }, + "metadata": {} + } + ], + "source": [ + "## create patents base table (subset of 10k out of ~110M records)\n", + "\n", + "keep = (publications.embedding_v1.str.len() > 0) & (publications.title.str.len() > 0) & (publications.abstract.str.len() > 30)\n", + "\n", + "## Choose 10000 random rows to analyze\n", + "publications = publications[keep].peek(10000)" + ] + }, + { + "cell_type": "code", + "source": [ + "## take a look at the sample dataset\n", + "\n", + "publications.head(5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 556 + }, + "id": "XmqdJInztzPl", + "executionInfo": { + "status": "ok", + "timestamp": 1742191801044, + "user_tz": -480, + "elapsed": 6, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "ae05f3a6-edeb-423a-c061-c416717e1ec5" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " publication_number title \\\n", + "0 AU-338190-S Compressor wheel \n", + "1 CN-100525651-C Method for processing egg products \n", + "2 TW-I725505-B Improved carbon molecular sieve adsorbent \n", + "3 EP-0248026-B1 A system for supplying strip to a processing line \n", + "4 MY-135762-A Method for producing acrylic acid \n", + "\n", + " title_translated abstract \\\n", + "0 False Newness and distinctiveness is claimed in the ... \n", + "1 False The invention discloses a processing method of... \n", + "2 False Disclosed herein are rapid cycle pressure swin... \n", + "3 False A system (10) for supplying strip material (S)... \n", + "4 False A PROCESS FOR THE FRACTIONAL CONDENSATION OF A... \n", + "\n", + " abstract_translated cpc \\\n", + "0 False [] \n", + "1 False [] \n", + "2 False [{'code': 'B01D2253/116', 'inventive': False, ... \n", + "3 False [{'code': 'B65H2701/37', 'inventive': False, '... \n", + "4 False [{'code': 'C07C51/50', 'inventive': True, 'fir... \n", + "\n", + " cpc_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " cpc_inventive_low \\\n", + "0 [] \n", + "1 [] \n", + "2 ['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B... \n", + "3 ['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6... \n", + "4 ['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C... \n", + "\n", + " top_terms \\\n", + "0 ['compressor wheel' 'newness' 'distinctiveness... \n", + "1 ['egg' 'processing method' 'egg body' 'pack' '... \n", + "2 ['swing adsorption' 'pressure swing' 'molecula... \n", + "3 ['strip material' 'assembly' 'coil' 'take' 'pr... \n", + "4 ['acrylic acid' 'producing acrylic' 'stabilize... \n", + "\n", + " similar \\\n", + "0 [{'publication_number': 'AU-338190-S', 'applic... \n", + "1 [{'publication_number': 'CN-101396133-B', 'app... \n", + "2 [{'publication_number': 'EP-1867379-B1', 'appl... \n", + "3 [{'publication_number': 'EP-0248026-B1', 'appl... \n", + "4 [{'publication_number': 'SG-157371-A1', 'appli... 
\n", + "\n", + " url country \\\n", + "0 https://patents.google.com/patent/AU338190S Australia \n", + "1 https://patents.google.com/patent/CN100525651C China \n", + "2 https://patents.google.com/patent/TWI725505B Taiwan \n", + "3 https://patents.google.com/patent/EP0248026B1 European Patent Office \n", + "4 https://patents.google.com/patent/MY135762A Malaysia \n", + "\n", + " publication_description cited_by \\\n", + "0 Design [] \n", + "1 Granted Patent [] \n", + "2 Granted Patent or patent of addition [] \n", + "3 Granted patent [] \n", + "4 Granted patent / Utility model [] \n", + "\n", + " embedding_v1 \n", + "0 [ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ... \n", + "1 [-0.05154578 -0.00437102 0.01365495 -0.168424... \n", + "2 [ 0.0163008 -0.20972364 0.02052403 -0.003073... \n", + "3 [-0.04377723 0.04111805 -0.0929429 0.043924... \n", + "4 [ 0.10407669 0.01262973 -0.22623734 -0.171453... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
publication_numbertitletitle_translatedabstractabstract_translatedcpccpc_lowcpc_inventive_lowtop_termssimilarurlcountrypublication_descriptioncited_byembedding_v1
0AU-338190-SCompressor wheelFalseNewness and distinctiveness is claimed in the ...False[][][]['compressor wheel' 'newness' 'distinctiveness...[{'publication_number': 'AU-338190-S', 'applic...https://patents.google.com/patent/AU338190SAustraliaDesign[][ 5.2067090e-02 -1.5462303e-01 -1.3415462e-01 ...
1CN-100525651-CMethod for processing egg productsFalseThe invention discloses a processing method of...False[][][]['egg' 'processing method' 'egg body' 'pack' '...[{'publication_number': 'CN-101396133-B', 'app...https://patents.google.com/patent/CN100525651CChinaGranted Patent[][-0.05154578 -0.00437102 0.01365495 -0.168424...
2TW-I725505-BImproved carbon molecular sieve adsorbentFalseDisclosed herein are rapid cycle pressure swin...False[{'code': 'B01D2253/116', 'inventive': False, ...['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B...['B01D2253/116' 'B01D2253/10' 'B01D2253/00' 'B...['swing adsorption' 'pressure swing' 'molecula...[{'publication_number': 'EP-1867379-B1', 'appl...https://patents.google.com/patent/TWI725505BTaiwanGranted Patent or patent of addition[][ 0.0163008 -0.20972364 0.02052403 -0.003073...
3EP-0248026-B1A system for supplying strip to a processing lineFalseA system (10) for supplying strip material (S)...False[{'code': 'B65H2701/37', 'inventive': False, '...['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6...['B65H2701/37' 'B65H2701/30' 'B65H2701/00' 'B6...['strip material' 'assembly' 'coil' 'take' 'pr...[{'publication_number': 'EP-0248026-B1', 'appl...https://patents.google.com/patent/EP0248026B1European Patent OfficeGranted patent[][-0.04377723 0.04111805 -0.0929429 0.043924...
4MY-135762-AMethod for producing acrylic acidFalseA PROCESS FOR THE FRACTIONAL CONDENSATION OF A...False[{'code': 'C07C51/50', 'inventive': True, 'fir...['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C...['C07C51/50' 'C07C51/42' 'C07C51/00' 'C07C' 'C...['acrylic acid' 'producing acrylic' 'stabilize...[{'publication_number': 'SG-157371-A1', 'appli...https://patents.google.com/patent/MY135762AMalaysiaGranted patent / Utility model[][ 0.10407669 0.01262973 -0.22623734 -0.171453...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "publications", + "repr_error": "Function 'unique' has no kernel matching input types (list not null>>)" + } + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wl2o-NYMoygb" + }, + "source": [ + "Generate the text embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "li38q8FzDDMu", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "executionInfo": { + "status": "ok", + "timestamp": 1742192047236, + "user_tz": -480, + "elapsed": 4528, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "b8c1bd38-b484-4f71-bd38-927c8677d0c5" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 127fb090-1c9e-4d7a-acdd-86f077a87b07 is DONE. 0 Bytes processed. Open Job" + ] + }, + "metadata": {} + } + ], + "source": [ + "from bigframes.ml.llm import TextEmbeddingGenerator\n", + "\n", + "text_model = TextEmbeddingGenerator() # No connection id needed" + ] + }, + { + "cell_type": "code", + "source": [ + "## rename abstract column to content as the desired column on which embedding will be generated\n", + "publications = publications[[\"publication_number\", \"title\", \"abstract\"]].rename(columns={'abstract': 'content'})\n", + "\n", + "## generate the embeddings\n", + "## takes ~2-3 mins to run\n", + "embedding = text_model.predict(publications)[[\"publication_number\", \"title\", \"content\", \"ml_generate_embedding_result\",\"ml_generate_embedding_status\"]]\n", + "\n", + "## filter out rows where the embedding generation failed. the embedding status value is empty if the embedding generation was successful\n", + "embedding = embedding[~embedding[\"ml_generate_embedding_status\"].isnull()]\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 139 + }, + "id": "b5HHZob_u61B", + "executionInfo": { + "status": "ok", + "timestamp": 1742192656608, + "user_tz": -480, + "elapsed": 126632, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "c9ecc5fd-5d11-4fd8-f59b-9dce4e12e371" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Load job b8079d70-7d99-4198-898f-2921915f305f is DONE. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 17338b11-420c-4d3d-bd55-0bba1247f705 is DONE. 8.9 MB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job ebf3eb36-3199-4551-ad07-5fa5abb200be is DONE. 20.0 kB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 9e9c5aae-9045-4750-a34e-c98493369a90 is DONE. 20.0 kB processed. 
Open Job" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "embedding.head(5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 464 + }, + "id": "OIT5FbqAwqG5", + "executionInfo": { + "status": "ok", + "timestamp": 1742192727525, + "user_tz": -480, + "elapsed": 6715, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "d04c994a-a0c8-44b0-e897-d871036eeb1f" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:238: AmbiguousWindowWarning: \u001b[93mWindow ordering may be ambiguous, this can cause unstable results.\u001b[0m\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n", + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:262: AmbiguousWindowWarning: \u001b[93mWindow ordering may be ambiguous, this can cause unstable results.\u001b[0m\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 1bc3517f-df67-456c-8d31-14a6432b8629 is DONE. 70.4 MB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job ae92602b-0eab-437f-a02d-102a4defa99a is DONE. 31.3 kB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " publication_number title \\\n", + "5753 HN-1996000102-A NEW PESTICIDES \n", + "8115 AU-325874-S Baby sling \n", + "5415 AU-2016256863-A1 Microbial compositions and methods for denitri... \n", + "8886 FR-2368509-A1 NEW DEODORANTS OR FRESHENERS AND COMPOSITIONS ... \n", + "5661 US-2006051255-A1 Gas generator \n", + "\n", + " content \\\n", + "5753 THE PRESENT INVENTION REFERS TO \n", + "8115 Adjustable baby sling with velcro. \n", + "5415 The present invention provides compositions an... \n", + "8886 Polyanionic polyamide salts comprising a conca... \n", + "5661 A gas generator insulated by a vacuum-jacket v... \n", + "\n", + " ml_generate_embedding_result \\\n", + "5753 [-0.02709213 0.0366395 0.03931784 -0.003942... \n", + "8115 [ 6.44167811e-02 -2.01051459e-02 -3.39564607e-... \n", + "5415 [-5.90537786e-02 2.38401629e-03 7.22754598e-... \n", + "8886 [-3.44522446e-02 5.64815439e-02 -1.35829514e-... \n", + "5661 [-1.50892800e-02 6.56989636e-03 2.34969519e-... \n", + "\n", + " ml_generate_embedding_status \n", + "5753 \n", + "8115 \n", + "5415 \n", + "8886 \n", + "5661 \n", + "\n", + "[5 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
publication_numbertitlecontentml_generate_embedding_resultml_generate_embedding_status
5753HN-1996000102-ANEW PESTICIDESTHE PRESENT INVENTION REFERS TO[-0.02709213 0.0366395 0.03931784 -0.003942...
8115AU-325874-SBaby slingAdjustable baby sling with velcro.[ 6.44167811e-02 -2.01051459e-02 -3.39564607e-...
5415AU-2016256863-A1Microbial compositions and methods for denitri...The present invention provides compositions an...[-5.90537786e-02 2.38401629e-03 7.22754598e-...
8886FR-2368509-A1NEW DEODORANTS OR FRESHENERS AND COMPOSITIONS ...Polyanionic polyamide salts comprising a conca...[-3.44522446e-02 5.64815439e-02 -1.35829514e-...
5661US-2006051255-A1Gas generatorA gas generator insulated by a vacuum-jacket v...[-1.50892800e-02 6.56989636e-03 2.34969519e-...
\n", + "

5 rows × 5 columns

\n", + "
[5 rows x 5 columns in total]" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# store embeddings in a BQ table\n", + "DATASET_ID = \"\" # @param {type:\"string\"}\n", + "TEXT_EMBEDDING_TABLE_ID = \"\" # @param {type:\"string\"}\n", + "embedding.to_gbq(f\"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}\", if_exists='replace')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 53 + }, + "id": "GP3ZqX_bxLGq", + "executionInfo": { + "status": "ok", + "timestamp": 1742192833667, + "user_tz": -480, + "elapsed": 6590, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "fb823ea2-e47c-415f-84d4-543dd3291e15" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 7370fb69-9589-4a9a-a5cf-7f7c8d50c53c is DONE. 70.3 MB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'bqml_llm_trial.patent_embedding_BF-n'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OUZ3NNbzo1Tb" + }, + "source": [ + "## Step 2: Indexing and Similarity Search" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### [Create a Vector Index](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_create_vector_index) using BigFrames\n", + "\n", + "\n", + "**Index Type**\n", + "\n", + "The algorithm to use to build the vector index.\n", + "The supported values are IVF and TREE_AH." + ], + "metadata": { + "id": "mvJH2FCmynMm" + } + }, + { + "cell_type": "code", + "source": [ + "## create vector index (note only works of tables >5000 rows)\n", + "\n", + "bf_bq.create_vector_index(\n", + " table_id = f\"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}\",\n", + " column_name = \"ml_generate_embedding_result\",\n", + " replace= True,\n", + " index_name = \"bf_python_index\",\n", + " distance_type=\"cosine\",\n", + " index_type= \"ivf\"\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "id": "6SBVdv6gyU5A", + "executionInfo": { + "status": "ok", + "timestamp": 1742193028877, + "user_tz": -480, + "elapsed": 3882, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "6583e113-de27-4b44-972d-c1cc061e3c76" + }, + "execution_count": 22, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 775f872d-ea2d-48f3-8b65-a85ed573dac0 is DONE. 61.4 MB processed. 
Open Job" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Vector Search (semantic search) using Vector Index\n", + "\n", + "ANN (approx nearest neighbor) search using the created vector index" + ], + "metadata": { + "id": "bo8mBbRLzCOA" + } + }, + { + "cell_type": "code", + "source": [ + "## Set variable for vector search\n", + "\n", + "TEXT_SEARCH_STRING = \"Chip assemblies employing solder bonds to back-side lands including an electrolytic nickel layer\" ## replace with whatever search string you want to use for the vector search\n", + "FRACTION_LISTS_TO_SEARCH = 0.01" + ], + "metadata": { + "id": "v19BJm_wzPdZ", + "executionInfo": { + "status": "ok", + "timestamp": 1742194606771, + "user_tz": -480, + "elapsed": 639, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# convert search string to dataframe\n", + "TEXT_SEARCH_DF = bf.DataFrame([TEXT_SEARCH_STRING], columns=['search_string'])\n", + "\n", + "#generate embedding of search query\n", + "search_query = bf.DataFrame(text_model.predict(TEXT_SEARCH_DF))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 121 + }, + "id": "pAQY1ejpzPap", + "executionInfo": { + "status": "ok", + "timestamp": 1742194625774, + "user_tz": -480, + "elapsed": 6927, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "485698ad-ac6e-4c93-844e-5d0f30aff13a" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 3a352b3b-b968-4347-80fe-6a9ef9045358 is DONE. 0 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 0e6d609b-9818-45fe-b26d-7247722bbea4 is DONE. 2 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 66602cee-78a8-4955-96fb-6a2d603d5d7d is DONE. 2 Bytes processed. 
Open Job" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "## search the base table for the user's query\n", + "\n", + "vector_search_results = bf_bq.vector_search(\n", + " base_table=f\"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}\",\n", + " column_to_search=\"ml_generate_embedding_result\",\n", + " query=search_query,\n", + " distance_type=\"COSINE\",\n", + " query_column_to_search=\"ml_generate_embedding_result\",\n", + " top_k=5)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 104 + }, + "id": "sx0AGAdn5FYX", + "executionInfo": { + "status": "ok", + "timestamp": 1742194670801, + "user_tz": -480, + "elapsed": 5110, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "551ebac3-594f-4303-ca97-5301dfee72bb" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 4768061f-d5a6-4638-8396-5c15a098ad7b is RUNNING. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job e6175f9a-6bbd-4cbe-967b-b04421b33b02 is DONE. 132.7 MB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "## View the returned results based on simalirity with the user's query\n", + "\n", + "vector_search_results[['content', 'publication_number',\n", + " 'title', 'content_1', 'distance']].rename(columns={'content': 'query', 'content_1':'abstract (relevant match)' , 'title':'title (relevant match)'})" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 270 + }, + "id": "px1v4iJM5L0c", + "executionInfo": { + "status": "ok", + "timestamp": 1742195090670, + "user_tz": -480, + "elapsed": 3511, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "d107b6e3-a362-42db-c0c2-084d02acd244" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 61c1138d-f4da-4971-a7dd-aa7150bafe50 is DONE. 0 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " query publication_number \\\n", + "0 Chip assemblies employing solder bonds to back... KR-102569815-B1 \n", + "0 Chip assemblies employing solder bonds to back... US-8962389-B2 \n", + "0 Chip assemblies employing solder bonds to back... TW-I256279-B \n", + "0 Chip assemblies employing solder bonds to back... US-2005230147-A1 \n", + "0 Chip assemblies employing solder bonds to back... US-6686652-B1 \n", + "\n", + " title (relevant match) \\\n", + "0 electronic device package \n", + "0 Microelectronic packages including patterned d... \n", + "0 Substrate for electrical device and methods of... \n", + "0 Wiring board, and electronic device with an el... \n", + "0 Locking lead tips and die attach pad for a lea... \n", + "\n", + " abstract (relevant match) distance \n", + "0 An electronic device package technology is dis... 
0.357673 \n", + "0 Embodiments of microelectronic packages and me... 0.344263 \n", + "0 Substrate for electrical devices and methods o... 0.3687 \n", + "0 An electronic device is mounted on a wiring bo... 0.304293 \n", + "0 An assembly and method suitable for use in pac... 0.364334 \n", + "\n", + "[5 rows x 5 columns]" + ], + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
querypublication_numbertitle (relevant match)abstract (relevant match)distance
0Chip assemblies employing solder bonds to back...KR-102569815-B1electronic device packageAn electronic device package technology is dis...0.357673
0Chip assemblies employing solder bonds to back...US-8962389-B2Microelectronic packages including patterned d...Embodiments of microelectronic packages and me...0.344263
0Chip assemblies employing solder bonds to back...TW-I256279-BSubstrate for electrical device and methods of...Substrate for electrical devices and methods o...0.3687
0Chip assemblies employing solder bonds to back...US-2005230147-A1Wiring board, and electronic device with an el...An electronic device is mounted on a wiring bo...0.304293
0Chip assemblies employing solder bonds to back...US-6686652-B1Locking lead tips and die attach pad for a lea...An assembly and method suitable for use in pac...0.364334
\n", + "

5 rows × 5 columns

\n", + "
[5 rows x 5 columns in total]" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "source": [ + "## Brute force result (for comparison)\n", + "\n", + "\n", + "brute_force_result = bf_bq.vector_search(\n", + " table_id = f\"{DATASET_ID}.{TEXT_EMBEDDING_TABLE_ID}\",\n", + " column_to_search=\"ml_generate_embedding_result\",\n", + " query=search_query,\n", + " top_k=5,\n", + " distance_type=\"COSINE\",\n", + " use_brute_force=True)\n" + ], + "metadata": { + "id": "5fb_O-ne5cvH", + "executionInfo": { + "status": "ok", + "timestamp": 1742195139318, + "user_tz": -480, + "elapsed": 1622, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "21rNsFMHo8hO" + }, + "source": [ + "## Step 3: AI-Powered Summarization with Retrieval Augmented Generation (RAG)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Patent documents can be dense and time-consuming to digest. AI-Powered Patent Summarization utilizes Retrieval Augmented Generation (RAG) to streamline this process. By retrieving relevant patent information through vector search and then synthesizing it with a large language model, we can generate concise, human-readable summaries, saving valuable time and effort. The code sample below walks through how to set this up continuing with the same user query as the previous use case." + ], + "metadata": { + "id": "K3pIQrzB7T_G" + } + }, + { + "cell_type": "code", + "source": [ + "## gemini model\n", + "\n", + "llm_model = bf_llm.GeminiTextGenerator(model_name = \"gemini-1.5-flash-002\") ## replace with other model as needed" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "id": "jb5rueqU7T5J", + "executionInfo": { + "status": "ok", + "timestamp": 1742195565658, + "user_tz": -480, + "elapsed": 4827, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "43732836-ebae-4fb3-b28e-bfea51146c72" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 093debfb-08f1-4bba-8b39-c3da575793a4 is DONE. 0 Bytes processed. 
Open Job" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We will use the same user query from Section 2, and pass the list of abstracts returned by the vector search into the prompt for the RAG application" + ], + "metadata": { + "id": "41e12JTf70sr" + } + }, + { + "cell_type": "code", + "source": [ + "TEMPERATURE = 0.4" + ], + "metadata": { + "id": "EyP-ZFJK8h-2", + "executionInfo": { + "status": "ok", + "timestamp": 1742195536109, + "user_tz": -480, + "elapsed": 1474, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "execution_count": 35, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Extract strings into a list of JSON strings\n", + "json_strings = [json.dumps({'abstract': s}) for s in vector_search_results['content_1']]\n", + "ALL_ABSTRACTS = json_strings\n", + "\n", + "# Print the result (optional)\n", + "print(ALL_ABSTRACTS)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 72 + }, + "id": "eP99R6SV7Tug", + "executionInfo": { + "status": "ok", + "timestamp": 1742195421813, + "user_tz": -480, + "elapsed": 3371, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "c34bc931-5be8-410e-ac1f-604df31ef533" + }, + "execution_count": 34, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job f77bbae5-ea1f-4ba9-92bc-bfc7bc474cd9 is DONE. 0 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['{\"abstract\": \"Substrate for electrical devices and methods of fabricating such substrate are disclosed. An embodiment for an electrical device with substrate comprised of a chip having an active surface; a substrate being coupled with the chip; and a plurality of conductive wires (bumps) electrically connecting the chip to the substrate. In additionally, the present invention of the substrate for electrical devices may be comprised of an adhesive mean or a submember as required, and furthermore, by mean of using substrate, the present invention may be capable of affording a number of advantages, it is possible to include a thinner electrical device thickness, enhanced reliability, and a decreased cost in production.\"}', '{\"abstract\": \"An electronic device is mounted on a wiring board, which includes: a substrate having through holes, and lands extending on surfaces of the substrate and adjacent to openings of the through holes. Further, at least one coating layer is provided, which coats at least one part of an outer peripheral region of the at least one land, in order to cause that the at least one part is separated from a lead-less solder, thereby preventing any peel of the land from the surface of the substrate.\"}', '{\"abstract\": \"An assembly and method suitable for use in packaging integrated circuits including a support substrate for supporting an integrated circuit die embedded in a molded encapsulating cap. The substrate includes a conductive die attach pad adapted to be molded into the encapsulating cap. The pad includes an interior facing support surface and a spaced-apart exterior facing exposed surface defined by a peripheral edge. The support surface is adapted to support the embedded die, while the exposed surface is to be exposed from the encapsulating cap. 
The attach pad further includes a locking ledge portion extending outward peripherally beyond at least a portion of the exposed surface peripheral edge. This ledge is adapted to be subtended in the encapsulating cap in a manner substantially preventing a pull-out of the attach pad in a direction away from the encapsulating cap.\"}', '{\"abstract\": \"Embodiments of microelectronic packages and methods for fabricating microelectronic packages are provided. In one embodiment, the fabrication method includes printing a patterned die attach material onto the backside of a wafer including an array of non-singulated microelectronic die each having an interior keep-out area, such as a central keep-out area. The die attach material, such as a B-stage epoxy, is printed onto the wafer in a predetermined pattern such that the die attach material does not encroaching into the interior keep-out areas. The wafer is singulated to produce singulated microelectronic die each including a layer of die attach material. The singulated microelectronic die are then placed onto leadframes or other package substrates with the die attach material contacting the package substrates. The layer of die attach material is then fully cured to adhere an outer peripheral portion of the singulated microelectronic die to its package substrate.\"}', '{\"abstract\": \"An electronic device package technology is disclosed. An electronic device package may include a substrate. The electronic device package may also include a first electronic component and a second electronic component in a stacked configuration. Each of the first electronic component and the second electronic component can include electrical interconnections exposed toward the substrate. The electronic device package may further include a mold compound encapsulating the first electronic component and the second electronic component. Additionally, the electronic device package can include electrically conductive posts extending through the mold compound between the electrical interconnection of at least one of the first electronic component and the second electronic component and the substrate. Related systems and methods are also disclosed.\"}']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "## Setup the LLM prompt\n", + "\n", + "prompt = f\"\"\"\n", + "You are an expert patent analyst. I will provide you the abstracts of the top 5 patents in json format retrieved by a vector search based on a user's query.\n", + "Your task is to analyze these abstracts and generate a concise, coherent summary that encapsulates the core innovations and concepts shared among them.\n", + "\n", + "In your output, share the original user query.\n", + "Then output the concise, coherent summary that encapsulates the core innovations and concepts shared among the top 5 abstracts. The heading for this section should\n", + "be : Summary of the top 5 abstracts that are semantically closest to the user query.\n", + "\n", + "User Query: {TEXT_SEARCH_STRING}\n", + "Top 5 abstracts: {ALL_ABSTRACTS}\n", + "\n", + "Instructions:\n", + "\n", + "Focus on identifying the common themes and key technological advancements described in the abstracts.\n", + "Synthesize the information into a clear and concise summary, approximately 150-200 words.\n", + "Avoid simply copying phrases from the abstracts. 
Instead, aim to provide a cohesive overview of the shared concepts.\n", + "Highlight the potential applications and benefits of the described inventions.\n", + "Maintain a professional and objective tone.\n", + "Do not mention the individual patents by number, focus on summarizing the shared concepts.\n", + "\"\"\"\n", + "\n", + "print(prompt)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "collapsed": true, + "id": "kSNSi1GV8OAD", + "executionInfo": { + "status": "ok", + "timestamp": 1742195587180, + "user_tz": -480, + "elapsed": 1620, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "37fbc822-1160-4fbd-c7d6-ecb4a16db394" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "You are an expert patent analyst. I will provide you the abstracts of the top 5 patents in json format retrieved by a vector search based on a user's query.\n", + "Your task is to analyze these abstracts and generate a concise, coherent summary that encapsulates the core innovations and concepts shared among them.\n", + "\n", + "In your output, share the original user query.\n", + "Then output the concise, coherent summary that encapsulates the core innovations and concepts shared among the top 5 abstracts. The heading for this section should\n", + "be : Summary of the top 5 abstracts that are semantically closest to the user query.\n", + "\n", + "User Query: Chip assemblies employing solder bonds to back-side lands including an electrolytic nickel layer\n", + "Top 5 abstracts: ['{\"abstract\": \"Substrate for electrical devices and methods of fabricating such substrate are disclosed. An embodiment for an electrical device with substrate comprised of a chip having an active surface; a substrate being coupled with the chip; and a plurality of conductive wires (bumps) electrically connecting the chip to the substrate. In additionally, the present invention of the substrate for electrical devices may be comprised of an adhesive mean or a submember as required, and furthermore, by mean of using substrate, the present invention may be capable of affording a number of advantages, it is possible to include a thinner electrical device thickness, enhanced reliability, and a decreased cost in production.\"}', '{\"abstract\": \"An electronic device is mounted on a wiring board, which includes: a substrate having through holes, and lands extending on surfaces of the substrate and adjacent to openings of the through holes. Further, at least one coating layer is provided, which coats at least one part of an outer peripheral region of the at least one land, in order to cause that the at least one part is separated from a lead-less solder, thereby preventing any peel of the land from the surface of the substrate.\"}', '{\"abstract\": \"An assembly and method suitable for use in packaging integrated circuits including a support substrate for supporting an integrated circuit die embedded in a molded encapsulating cap. The substrate includes a conductive die attach pad adapted to be molded into the encapsulating cap. The pad includes an interior facing support surface and a spaced-apart exterior facing exposed surface defined by a peripheral edge. The support surface is adapted to support the embedded die, while the exposed surface is to be exposed from the encapsulating cap. 
The attach pad further includes a locking ledge portion extending outward peripherally beyond at least a portion of the exposed surface peripheral edge. This ledge is adapted to be subtended in the encapsulating cap in a manner substantially preventing a pull-out of the attach pad in a direction away from the encapsulating cap.\"}', '{\"abstract\": \"Embodiments of microelectronic packages and methods for fabricating microelectronic packages are provided. In one embodiment, the fabrication method includes printing a patterned die attach material onto the backside of a wafer including an array of non-singulated microelectronic die each having an interior keep-out area, such as a central keep-out area. The die attach material, such as a B-stage epoxy, is printed onto the wafer in a predetermined pattern such that the die attach material does not encroaching into the interior keep-out areas. The wafer is singulated to produce singulated microelectronic die each including a layer of die attach material. The singulated microelectronic die are then placed onto leadframes or other package substrates with the die attach material contacting the package substrates. The layer of die attach material is then fully cured to adhere an outer peripheral portion of the singulated microelectronic die to its package substrate.\"}', '{\"abstract\": \"An electronic device package technology is disclosed. An electronic device package may include a substrate. The electronic device package may also include a first electronic component and a second electronic component in a stacked configuration. Each of the first electronic component and the second electronic component can include electrical interconnections exposed toward the substrate. The electronic device package may further include a mold compound encapsulating the first electronic component and the second electronic component. Additionally, the electronic device package can include electrically conductive posts extending through the mold compound between the electrical interconnection of at least one of the first electronic component and the second electronic component and the substrate. Related systems and methods are also disclosed.\"}']\n", + "\n", + "Instructions:\n", + "\n", + "Focus on identifying the common themes and key technological advancements described in the abstracts.\n", + "Synthesize the information into a clear and concise summary, approximately 150-200 words.\n", + "Avoid simply copying phrases from the abstracts. 
Instead, aim to provide a cohesive overview of the shared concepts.\n", + "Highlight the potential applications and benefits of the described inventions.\n", + "Maintain a professional and objective tone.\n", + "Do not mention the individual patents by number, focus on summarizing the shared concepts.\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "## Define a function that will take the input propmpt and run the LLM\n", + "\n", + "def predict(prompt: str, temperature: float = TEMPERATURE) -> str:\n", + " # Create dataframe\n", + " input = bf.DataFrame(\n", + " {\n", + " \"prompt\": [prompt],\n", + " }\n", + " )\n", + "\n", + " # Return response\n", + " return llm_model.predict(input, temperature=temperature).ml_generate_text_llm_result.iloc[0]" + ], + "metadata": { + "id": "njiQdfkT8Y7V", + "executionInfo": { + "status": "ok", + "timestamp": 1742195567707, + "user_tz": -480, + "elapsed": 1, + "user": { + "displayName": "", + "userId": "" + } + } + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Invoke LLM with prompt\n", + "response = predict(prompt, temperature = TEMPERATURE)\n", + "\n", + "# Print results as Markdown\n", + "Markdown(response)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 426 + }, + "id": "OYYkVYbs8Y0P", + "executionInfo": { + "status": "ok", + "timestamp": 1742195608280, + "user_tz": -480, + "elapsed": 14425, + "user": { + "displayName": "", + "userId": "" + } + }, + "outputId": "def839e3-3dee-4320-9cb5-cac855ddea6b" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Load job 53681d07-ddc6-4f62-a170-ac5cafc1c7af is DONE. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job 259907b0-1bae-402f-be4f-d45e478832f1 is DONE. 5.3 kB processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mJSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job f3e6dca3-7674-41f6-a4ba-0daec387e25e is DONE. 2 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job e06bc512-d746-433b-b431-3e7426b6cd9c is DONE. 2 Bytes processed. Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/bigframes/core/array_value.py:238: AmbiguousWindowWarning: \u001b[93mWindow ordering may be ambiguous, this can cause unstable results.\u001b[0m\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "Query job bcf9e83c-a420-4282-86b1-d005244c97f2 is DONE. 1.5 kB processed. 
Open Job" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ], + "text/markdown": "User Query: Chip assemblies employing solder bonds to back-side lands including an electrolytic nickel layer\n\nSummary of the top 5 abstracts that are semantically closest to the user query.\n\nThe top five patent abstracts describe advancements in microelectronic packaging, focusing on improved chip-to-substrate interconnection and enhanced reliability. A common thread is the development of novel substrate designs and assembly methods to facilitate robust electrical connections. Several abstracts highlight techniques for creating reliable connections between chips and substrates, emphasizing the use of conductive materials and adhesives to ensure strong and durable bonds. These methods aim to improve the overall reliability and performance of electronic devices. The innovations include improved techniques for preventing delamination or peeling of conductive lands, leading to more robust assemblies. The use of encapsulating materials and specialized die-attach methods are also prominent, suggesting a focus on protecting the chip and its connections from environmental factors. These advancements collectively contribute to the creation of thinner, more reliable, and cost-effective electronic devices, with applications spanning various consumer electronics and other industries. While the abstracts don't explicitly mention electrolytic nickel layers, the focus on improved solder bond reliability and substrate design suggests that such a layer could be a complementary enhancement to the described technologies.\n" + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sy82XLDfooEb" + }, + "source": [ + "# Summary and next steps\n", + "\n", + "Ready to dive deeper and explore the endless possibilities? Start building your own vector search applications with BigFrames and BigQuery today! Check out our [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.bigquery#bigframes_bigquery_vector_search), explore our sample [notebooks](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks), and unleash the power of vector analytics on your data.\n", + "The BigFrames team would also love to hear from you. If you would like to reach out, please send an email to: bigframes-feedback@google.com or by filing an issue at the [open source BigFrames repository](https://github.com/googleapis/python-bigquery-dataframes/issues). To receive updates about BigFrames, subscribe to the BigFrames email list." + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "name": "bq_dataframes_llm_kmeans", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/noxfile.py b/noxfile.py index a08ef27781..74538f49f0 100644 --- a/noxfile.py +++ b/noxfile.py @@ -118,16 +118,8 @@ # Sessions are executed in the order so putting the smaller sessions # ahead to fail fast at presubmit running. 
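The "next steps" section above links to the `bigframes.bigquery.vector_search` reference. A rough sketch of a call is shown below; the table and column names are placeholders, and the keyword arguments follow the linked documentation, so they should be checked against the installed BigFrames version:

```python
import bigframes.bigquery as bbq
import bigframes.pandas as bpd

# A one-row query frame whose "embedding" column was produced by the same
# embedding model that populated the base table (placeholder values here).
query_df = bpd.DataFrame({"embedding": [[0.1, 0.2, 0.3]]})

nearest = bbq.vector_search(
    base_table="my-project.my_dataset.patent_embeddings",  # placeholder table id
    column_to_search="embedding",
    query=query_df,
    top_k=5,
)
print(nearest.to_pandas())
```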
-# 'docfx' is excluded since it only needs to run in 'docs-presubmit' nox.options.sessions = [ - "lint", - "lint_setup_py", - "mypy", - "format", - "docs", - "docfx", "unit", - "unit_noextras", "system-3.9", "system-3.12", "cover", @@ -145,7 +137,12 @@ def lint(session): Returns a failure if the linters find linting errors or sufficiently serious code quality issues. """ - session.install("flake8", BLACK_VERSION) + session.install("flake8", BLACK_VERSION, ISORT_VERSION) + session.run( + "isort", + "--check", + *LINT_PATHS, + ) session.run( "black", "--check", @@ -766,6 +763,8 @@ def notebook(session: nox.Session): "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow "notebooks/generative_ai/bq_dataframes_llm_gemini_2.ipynb", # Gemini 2.0 backend hasn't ready in prod. + "notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb", # Needs DATASET_ID. + "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", # Needs CONNECTION. # TODO(b/366290533): to protect BQML quota "notebooks/generative_ai/bq_dataframes_llm_claude3_museum_art.ipynb", "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. diff --git a/owlbot.py b/owlbot.py index 0e40cae1ad..159df04abd 100644 --- a/owlbot.py +++ b/owlbot.py @@ -98,33 +98,6 @@ "recursive-include third_party/bigframes_vendored *\nrecursive-include bigframes", ) -# Even though BigQuery DataFrames isn't technically a client library, we are -# opting into Cloud RAD for docs hosting. -assert 1 == s.replace( # common.cfg - [".kokoro/docs/common.cfg"], - re.escape('value: "docs-staging-v2-dev"'), - 'value: "docs-staging-v2"', -) - -# Use a custom table of contents since the default one isn't organized well -# enough for the number of classes we have. -assert 1 == s.replace( # publish-docs.sh - [".kokoro/publish-docs.sh"], - ( - re.escape("# upload docs") - + "\n" - + re.escape( - 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' - ) - ), - ( - "# Replace toc.yml template file\n" - + "mv docs/templates/toc.yml docs/_build/html/docfx_yaml/toc.yml\n\n" - + "# upload docs\n" - + 'python3.10 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"' - ), -) - # Fixup the documentation. 
assert 1 == s.replace( # docs/conf.py ["docs/conf.py"], diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 5b3add053c..815304371d 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -91,9 +91,12 @@ def gcs_folder(gcs_client: storage.Client): prefix = prefixer.create_prefix() path = f"gs://{bucket}/{prefix}/" yield path - for blob in gcs_client.list_blobs(bucket, prefix=prefix): - blob = typing.cast(storage.Blob, blob) - blob.delete() + try: + for blob in gcs_client.list_blobs(bucket, prefix=prefix): + blob = typing.cast(storage.Blob, blob) + blob.delete() + except Exception as exc: + traceback.print_exception(type(exc), exc, None) @pytest.fixture(scope="session") @@ -139,9 +142,7 @@ def resourcemanager_client( @pytest.fixture(scope="session") def session() -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions( - location="US", - ) + context = bigframes.BigQueryOptions(location="US", allow_large_results=False) session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup time @@ -157,7 +158,9 @@ def session_load() -> Generator[bigframes.Session, None, None]: @pytest.fixture(scope="session", params=["strict", "partial"]) def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", ordering_mode=request.param) + context = bigframes.BigQueryOptions( + location="US", ordering_mode=request.param, allow_large_results=False + ) session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type @@ -165,7 +168,9 @@ def maybe_ordered_session(request) -> Generator[bigframes.Session, None, None]: @pytest.fixture(scope="session") def unordered_session() -> Generator[bigframes.Session, None, None]: - context = bigframes.BigQueryOptions(location="US", ordering_mode="partial") + context = bigframes.BigQueryOptions( + location="US", ordering_mode="partial", allow_large_results=False + ) session = bigframes.Session(context=context) yield session session.close() # close generated session at cleanup type @@ -1378,6 +1383,12 @@ def floats_product_bf(session, floats_product_pd): return session.read_pandas(floats_product_pd) +@pytest.fixture(scope="session", autouse=True) +def use_fast_query_path(): + with bpd.option_context("bigquery.allow_large_results", False): + yield + + @pytest.fixture(scope="session", autouse=True) def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent): """Clean up stale cloud functions.""" @@ -1436,3 +1447,36 @@ def cleanup_cloud_functions(session, cloudfunctions_client, dataset_id_permanent # # Let's stop further clean up and leave it to later. 
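The new autouse `use_fast_query_path` fixture above relies on `bpd.option_context`, which scopes an option to the `with` block and restores the previous value on exit. A minimal illustration of that behaviour:

```python
import bigframes.pandas as bpd

print(bpd.options.bigquery.allow_large_results)      # whatever the ambient default is
with bpd.option_context("bigquery.allow_large_results", False):
    # Queries issued here take the small-results path, as in the test fixtures.
    print(bpd.options.bigquery.allow_large_results)  # False
print(bpd.options.bigquery.allow_large_results)      # previous value restored
```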
traceback.print_exception(type(exc), exc, None) + + +@pytest.fixture(scope="session") +def images_gcs_path() -> str: + return "gs://bigframes_blob_test/images/*" + + +@pytest.fixture(scope="session") +def images_uris() -> list[str]: + return [ + "gs://bigframes_blob_test/images/img0.jpg", + "gs://bigframes_blob_test/images/img1.jpg", + ] + + +@pytest.fixture(scope="session") +def images_mm_df( + images_gcs_path, session: bigframes.Session, bq_connection: str +) -> bpd.DataFrame: + bigframes.options.experiments.blob = True + + return session.from_glob_path( + images_gcs_path, name="blob_col", connection=bq_connection + ) + + +@pytest.fixture() +def reset_default_session_and_location(): + bpd.close_session() + with bpd.option_context("bigquery.location", None): + yield + bpd.close_session() + bpd.options.bigquery.location = None diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py new file mode 100644 index 0000000000..dfdbbffede --- /dev/null +++ b/tests/system/large/blob/test_function.py @@ -0,0 +1,281 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import traceback +from typing import Generator +import uuid + +from google.cloud import storage +import pandas as pd +import pytest + +import bigframes +from bigframes import dtypes +import bigframes.pandas as bpd + + +@pytest.fixture(scope="function") +def images_output_folder() -> Generator[str, None, None]: + id = uuid.uuid4().hex + folder = os.path.join("gs://bigframes_blob_test/output/", id) + yield folder + + # clean up + try: + cloud_storage_client = storage.Client() + bucket = cloud_storage_client.bucket("bigframes_blob_test") + blobs = bucket.list_blobs(prefix="output/" + id) + for blob in blobs: + blob.delete() + except Exception as exc: + traceback.print_exception(type(exc), exc, None) + + +@pytest.fixture(scope="function") +def images_output_uris(images_output_folder: str) -> list[str]: + return [ + os.path.join(images_output_folder, "img0.jpg"), + os.path.join(images_output_folder, "img1.jpg"), + ] + + +def test_blob_image_blur_to_series( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_uris: list[str], + session: bigframes.Session, +): + bigframes.options.experiments.blob = True + + series = bpd.Series(images_output_uris, session=session).str.to_blob( + connection=bq_connection + ) + + actual = images_mm_df["blob_col"].blob.image_blur( + (8, 8), dst=series, connection=bq_connection + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_blur_to_folder( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_folder: str, 
+ images_output_uris: list[str], +): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_blur( + (8, 8), dst=images_output_folder, connection=bq_connection + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection) + + assert isinstance(actual, bpd.Series) + assert len(actual) == 2 + assert actual.dtype == dtypes.BYTES_DTYPE + + +def test_blob_image_resize_to_series( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_uris: list[str], + session: bigframes.Session, +): + bigframes.options.experiments.blob = True + + series = bpd.Series(images_output_uris, session=session).str.to_blob( + connection=bq_connection + ) + + actual = images_mm_df["blob_col"].blob.image_resize( + (200, 300), dst=series, connection=bq_connection + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_resize_to_folder( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_folder: str, + images_output_uris: list[str], +): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_resize( + (200, 300), dst=images_output_folder, connection=bq_connection + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_resize( + (200, 300), connection=bq_connection + ) + + assert isinstance(actual, bpd.Series) + assert len(actual) == 2 + assert actual.dtype == dtypes.BYTES_DTYPE + + +def test_blob_image_normalize_to_series( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_uris: list[str], + session: bigframes.Session, +): + bigframes.options.experiments.blob = True + + series = bpd.Series(images_output_uris, session=session).str.to_blob( + connection=bq_connection + ) + + actual = images_mm_df["blob_col"].blob.image_normalize( + alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + 
actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_normalize_to_folder( + images_mm_df: bpd.DataFrame, + bq_connection: str, + images_output_folder: str, + images_output_uris: list[str], +): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_normalize( + alpha=50.0, + beta=150.0, + norm_type="minmax", + dst=images_output_folder, + connection=bq_connection, + ) + expected_df = pd.DataFrame( + { + "uri": images_output_uris, + "version": [None, None], + "authorizer": [bq_connection.casefold(), bq_connection.casefold()], + "details": [None, None], + } + ) + pd.testing.assert_frame_equal( + actual.struct.explode().to_pandas(), + expected_df, + check_dtype=False, + check_index_type=False, + ) + + # verify the files exist + assert not actual.blob.size().isna().any() + + +def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str): + bigframes.options.experiments.blob = True + + actual = images_mm_df["blob_col"].blob.image_normalize( + alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection + ) + + assert isinstance(actual, bpd.Series) + assert len(actual) == 2 + assert actual.dtype == dtypes.BYTES_DTYPE diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index efab338861..47cbf7fb1b 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import google.api_core.exceptions import pandas import pyarrow import pytest import bigframes -from bigframes.functions import _function_session as bff_session -from bigframes.functions._utils import get_python_version import bigframes.pandas as bpd from tests.system.utils import cleanup_function_assets @@ -119,41 +118,7 @@ def stringify(x): ) finally: # clean up the gcp assets created for the managed function. - cleanup_function_assets( - bigquery_client, session.cloudfunctionsclient, stringify - ) - - -def test_managed_function_binop(session, scalars_dfs, dataset_id): - try: - - def func(x, y): - return x * abs(y % 4) - - managed_func = session.udf( - input_types=[str, int], - output_type=str, - dataset=dataset_id, - )(func) - - scalars_df, scalars_pandas_df = scalars_dfs - - scalars_df = scalars_df.dropna() - scalars_pandas_df = scalars_pandas_df.dropna() - pd_result = scalars_pandas_df["string_col"].combine( - scalars_pandas_df["int64_col"], func - ) - bf_result = ( - scalars_df["string_col"] - .combine(scalars_df["int64_col"], managed_func) - .to_pandas() - ) - pandas.testing.assert_series_equal(bf_result, pd_result) - finally: - # clean up the gcp assets created for the managed function. 
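The blob tests above walk through the multimodal image path end to end. Below is a condensed, hedged sketch of that flow; the connection id and output folder are placeholders, while the GCS input path and the API calls mirror the fixtures and tests:

```python
import bigframes
import bigframes.pandas as bpd

bigframes.options.experiments.blob = True  # blob support is gated behind a preview flag

session = bpd.get_global_session()  # the tests use a session fixture instead

# Build a blob column from image files on GCS, authorized via a BigQuery connection.
df = session.from_glob_path(
    "gs://bigframes_blob_test/images/*",
    name="blob_col",
    connection="my-project.us.my-connection",  # placeholder connection id
)

# Transform the images; dst can be an output folder (as here) or a blob Series.
blurred = df["blob_col"].blob.image_blur(
    (8, 8),
    dst="gs://my-bucket/blurred/",             # placeholder output folder
    connection="my-project.us.my-connection",
)

# The result points at the new objects; a non-null size confirms they were written.
print(blurred.blob.size().to_pandas())
```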
- cleanup_function_assets( - session.bqclient, session.cloudfunctionsclient, managed_func - ) + cleanup_function_assets(stringify, bigquery_client) @pytest.mark.parametrize( @@ -165,10 +130,6 @@ def func(x, y): str, ], ) -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) def test_managed_function_array_output(session, scalars_dfs, dataset_id, array_dtype): try: @@ -187,54 +148,362 @@ def featurize(x: int) -> list[array_dtype]: # type: ignore # Ignore any dtype disparity. pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + # Make sure the read_gbq_function path works for this function. + featurize_ref = session.read_gbq_function(featurize.bigframes_bigquery_function) + + assert hasattr(featurize_ref, "bigframes_bigquery_function") + assert not hasattr(featurize_ref, "bigframes_remote_function") + assert ( + featurize_ref.bigframes_bigquery_function + == featurize.bigframes_bigquery_function + ) + + # Test on the function from read_gbq_function. + got = featurize_ref(10) + assert got == [array_dtype(i) for i in [10, 11, 12]] + + bf_result_gbq = bf_int64_col.apply(featurize_ref).to_pandas() + pandas.testing.assert_series_equal(bf_result_gbq, pd_result, check_dtype=False) + finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets( - featurize, session.bqclient, session.cloudfunctionsclient + cleanup_function_assets(featurize, session.bqclient) + + +@pytest.mark.parametrize( + ("typ",), + [ + pytest.param(int), + pytest.param(float), + pytest.param(bool), + pytest.param(str), + pytest.param(bytes), + ], +) +def test_managed_function_series_apply( + session, + typ, + scalars_dfs, +): + try: + + @session.udf() + def foo(x: int) -> typ: # type:ignore + # The bytes() constructor expects a non-negative interger as its arg. + return typ(abs(x)) + + # Function should still work normally. + assert foo(-2) == typ(2) + + assert hasattr(foo, "bigframes_bigquery_function") + assert hasattr(foo, "ibis_node") + assert hasattr(foo, "input_dtypes") + assert hasattr(foo, "output_dtype") + assert hasattr(foo, "bigframes_bigquery_function_output_dtype") + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() ) + pd_result_col = scalars_pandas_df["int64_too"].apply(foo) + pd_result = ( + scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + ) -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + # Make sure the read_gbq_function path works for this function. 
+ foo_ref = session.read_gbq_function( + function_name=foo.bigframes_bigquery_function, # type: ignore + ) + assert hasattr(foo_ref, "bigframes_bigquery_function") + assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo.bigframes_bigquery_function == foo_ref.bigframes_bigquery_function # type: ignore + + bf_result_col_gbq = scalars_df["int64_too"].apply(foo_ref) + bf_result_gbq = ( + scalars_df["int64_too"] + .to_frame() + .assign(result=bf_result_col_gbq) + .to_pandas() + ) + + pandas.testing.assert_frame_equal(bf_result_gbq, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(foo, session.bqclient) + + +@pytest.mark.parametrize( + ("typ",), + [ + pytest.param(int), + pytest.param(float), + pytest.param(bool), + pytest.param(str), + ], ) -def test_managed_function_binop_array_output(session, scalars_dfs, dataset_id): +def test_managed_function_series_apply_array_output( + session, + typ, + scalars_dfs, +): try: - def func(x, y): - return [len(x), abs(y % 4)] + @session.udf() + def foo_list(x: int) -> list[typ]: # type:ignore + # The bytes() constructor expects a non-negative interger as its arg. + return [typ(abs(x)), typ(abs(x) + 1)] - managed_func = session.udf( - input_types=[str, int], - output_type=list[int], - dataset=dataset_id, - )(func) + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result_col = scalars_df["int64_too"].apply(foo_list) + bf_result = ( + scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() + ) + + pd_result_col = scalars_pandas_df["int64_too"].apply(foo_list) + pd_result = ( + scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) + ) + + # Ignore any dtype difference. + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(foo_list, session.bqclient) + + +def test_managed_function_series_combine(session, scalars_dfs): + try: + # This function is deliberately written to not work with NA input. + def add(x: int, y: int) -> int: + return x + y scalars_df, scalars_pandas_df = scalars_dfs + int_col_name_with_nulls = "int64_col" + int_col_name_no_nulls = "int64_too" + bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + + # make sure there are NA values in the test column. + assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) + + add_managed_func = session.udf()(add) - scalars_df = scalars_df.dropna() - scalars_pandas_df = scalars_pandas_df.dropna() + # with nulls in the series the managed function application would fail. + with pytest.raises( + google.api_core.exceptions.BadRequest, match="unsupported operand" + ): + bf_df[int_col_name_with_nulls].combine( + bf_df[int_col_name_no_nulls], add_managed_func + ).to_pandas() + + # after filtering out nulls the managed function application should work + # similar to pandas. 
+ pd_filter = pd_df[int_col_name_with_nulls].notnull() + pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( + pd_df[pd_filter][int_col_name_no_nulls], add + ) + bf_filter = bf_df[int_col_name_with_nulls].notnull() bf_result = ( - scalars_df["string_col"] - .combine(scalars_df["int64_col"], managed_func) + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_managed_func) .to_pandas() ) - pd_result = scalars_pandas_df["string_col"].combine( - scalars_pandas_df["int64_col"], func + + # ignore any dtype difference. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + # Make sure the read_gbq_function path works for this function. + add_managed_func_ref = session.read_gbq_function( + add_managed_func.bigframes_bigquery_function + ) + bf_result = ( + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_managed_func_ref) + .to_pandas() ) pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) finally: # Clean up the gcp assets created for the managed function. - cleanup_function_assets( - managed_func, session.bqclient, session.cloudfunctionsclient + cleanup_function_assets(add_managed_func, session.bqclient) + + +def test_managed_function_series_combine_array_output(session, scalars_dfs): + try: + + def add_list(x: int, y: int) -> list[int]: + return [x, y] + + scalars_df, scalars_pandas_df = scalars_dfs + int_col_name_with_nulls = "int64_col" + int_col_name_no_nulls = "int64_too" + bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] + + # Make sure there are NA values in the test column. + assert any([pandas.isna(val) for val in bf_df[int_col_name_with_nulls]]) + + add_list_managed_func = session.udf()(add_list) + + # After filtering out nulls the managed function application should work + # similar to pandas. + pd_filter = pd_df[int_col_name_with_nulls].notnull() + pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( + pd_df[pd_filter][int_col_name_no_nulls], add_list + ) + bf_filter = bf_df[int_col_name_with_nulls].notnull() + bf_result = ( + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_list_managed_func) + .to_pandas() + ) + + # Ignore any dtype difference. + pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) + + # Make sure the read_gbq_function path works for this function. + add_list_managed_func_ref = session.read_gbq_function( + function_name=add_list_managed_func.bigframes_bigquery_function, # type: ignore + ) + + assert hasattr(add_list_managed_func_ref, "bigframes_bigquery_function") + assert not hasattr(add_list_managed_func_ref, "bigframes_remote_function") + assert ( + add_list_managed_func_ref.bigframes_bigquery_function + == add_list_managed_func.bigframes_bigquery_function + ) + + # Test on the function from read_gbq_function. + got = add_list_managed_func_ref(10, 38) + assert got == [10, 38] + + bf_result_gbq = ( + bf_df[bf_filter][int_col_name_with_nulls] + .combine(bf_df[bf_filter][int_col_name_no_nulls], add_list_managed_func_ref) + .to_pandas() + ) + + pandas.testing.assert_series_equal(bf_result_gbq, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. 
+ cleanup_function_assets(add_list_managed_func, session.bqclient) + + +def test_managed_function_dataframe_map(session, scalars_dfs): + try: + + def add_one(x): + return x + 1 + + mf_add_one = session.udf( + input_types=[int], + output_type=int, + )(add_one) + + scalars_df, scalars_pandas_df = scalars_dfs + int64_cols = ["int64_col", "int64_too"] + + bf_int64_df = scalars_df[int64_cols] + bf_int64_df_filtered = bf_int64_df.dropna() + bf_result = bf_int64_df_filtered.map(mf_add_one).to_pandas() + + pd_int64_df = scalars_pandas_df[int64_cols] + pd_int64_df_filtered = pd_int64_df.dropna() + pd_result = pd_int64_df_filtered.map(add_one) + # TODO(shobs): Figure why pandas .map() changes the dtype, i.e. + # pd_int64_df_filtered.dtype is Int64Dtype() + # pd_int64_df_filtered.map(lambda x: x).dtype is int64. + # For this test let's force the pandas dtype to be same as input. + for col in pd_result: + pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(mf_add_one, session.bqclient) + + +def test_managed_function_dataframe_map_array_output( + session, scalars_dfs, dataset_id_permanent +): + try: + + def add_one_list(x): + return [x + 1] * 3 + + mf_add_one_list = session.udf( + input_types=[int], + output_type=list[int], + )(add_one_list) + + scalars_df, scalars_pandas_df = scalars_dfs + int64_cols = ["int64_col", "int64_too"] + + bf_int64_df = scalars_df[int64_cols] + bf_int64_df_filtered = bf_int64_df.dropna() + bf_result = bf_int64_df_filtered.map(mf_add_one_list).to_pandas() + + pd_int64_df = scalars_pandas_df[int64_cols] + pd_int64_df_filtered = pd_int64_df.dropna() + pd_result = pd_int64_df_filtered.map(add_one_list) + + # Ignore any dtype difference. + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + # Make sure the read_gbq_function path works for this function. + mf_add_one_list_ref = session.read_gbq_function( + function_name=mf_add_one_list.bigframes_bigquery_function, # type: ignore ) + bf_result_gbq = bf_int64_df_filtered.map(mf_add_one_list_ref).to_pandas() + pandas.testing.assert_frame_equal(bf_result_gbq, pd_result, check_dtype=False) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(mf_add_one_list, session.bqclient) + -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_manage_function_df_apply_axis_1_array_output(session): +def test_managed_function_dataframe_apply_axis_1(session, scalars_dfs): + try: + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df["int64_too"] + series_pandas = scalars_pandas_df["int64_too"] + + def add_ints(x, y): + return x + y + + add_ints_mf = session.udf( + input_types=[int, int], + output_type=int, + )(add_ints) + assert add_ints_mf.bigframes_bigquery_function # type: ignore + + with pytest.warns( + bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." 
+ ): + bf_result = ( + bpd.DataFrame({"x": series, "y": series}) + .apply(add_ints_mf, axis=1) + .to_pandas() + ) + + pd_result = pandas.DataFrame({"x": series_pandas, "y": series_pandas}).apply( + lambda row: add_ints(row["x"], row["y"]), axis=1 + ) + + pandas.testing.assert_series_equal( + pd_result, bf_result, check_dtype=False, check_exact=True + ) + finally: + # Clean up the gcp assets created for the managed function. + cleanup_function_assets(add_ints_mf, session.bqclient) + + +def test_managed_function_dataframe_apply_axis_1_array_output(session): bf_df = bigframes.dataframe.DataFrame( { "Id": [1, 2, 3], @@ -293,7 +562,11 @@ def foo(x, y, z): # Successfully applies to dataframe with matching number of columns. # and their datatypes. - bf_result = bf_df.apply(foo, axis=1).to_pandas() + with pytest.warns( + bigframes.exceptions.PreviewWarning, + match="axis=1 scenario is in preview.", + ): + bf_result = bf_df.apply(foo, axis=1).to_pandas() # Since this scenario is not pandas-like, let's handcraft the # expected result. @@ -309,6 +582,27 @@ def foo(x, y, z): expected_result, bf_result, check_dtype=False, check_index_type=False ) + # Make sure the read_gbq_function path works for this function. + foo_ref = session.read_gbq_function(foo.bigframes_bigquery_function) + + assert hasattr(foo_ref, "bigframes_bigquery_function") + assert not hasattr(foo_ref, "bigframes_remote_function") + assert foo_ref.bigframes_bigquery_function == foo.bigframes_bigquery_function + + # Test on the function from read_gbq_function. + got = foo_ref(10, 38, "hello") + assert got == ["10", "38.0", "hello"] + + with pytest.warns( + bigframes.exceptions.PreviewWarning, + match="axis=1 scenario is in preview.", + ): + bf_result_gbq = bf_df.apply(foo_ref, axis=1).to_pandas() + + pandas.testing.assert_series_equal( + bf_result_gbq, expected_result, check_dtype=False, check_index_type=False + ) + finally: # Clean up the gcp assets created for the managed function. 
- cleanup_function_assets(foo, session.bqclient, session.cloudfunctionsclient) + cleanup_function_assets(foo, session.bqclient) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 0d7f888306..1e5e7ede26 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -882,6 +882,7 @@ def square(x): # The remote function should reflect the explicitly provided name assert square_remote.bigframes_remote_function == expected_remote_function + assert square_remote.bigframes_bigquery_function == expected_remote_function # Now the expected BQ remote function should exist session.bqclient.get_routine(expected_remote_function) @@ -1013,6 +1014,7 @@ def test_internal(rf, udf): # The remote function should reflect the explicitly provided name assert square_remote1.bigframes_remote_function == expected_remote_function + assert square_remote1.bigframes_bigquery_function == expected_remote_function # Now the expected BQ remote function should exist routine = session.bqclient.get_routine(expected_remote_function) @@ -1037,6 +1039,7 @@ def test_internal(rf, udf): # The new remote function should still reflect the explicitly provided name assert square_remote2.bigframes_remote_function == expected_remote_function + assert square_remote2.bigframes_bigquery_function == expected_remote_function # The expected BQ remote function should still exist routine = session.bqclient.get_routine(expected_remote_function) @@ -1080,6 +1083,7 @@ def plusone(x): # The new remote function should still reflect the explicitly provided name assert plusone_remote.bigframes_remote_function == expected_remote_function + assert plusone_remote.bigframes_bigquery_function == expected_remote_function # The expected BQ remote function should still exist routine = session.bqclient.get_routine(expected_remote_function) @@ -1234,7 +1238,7 @@ def square(x): return x * x assert ( - bigquery.Routine(square.bigframes_remote_function).dataset_id + bigquery.Routine(square.bigframes_bigquery_function).dataset_id == session._anonymous_dataset.dataset_id ) @@ -1495,7 +1499,7 @@ def square(x): )(square) bq_routine = session.bqclient.get_routine( - square_remote.bigframes_remote_function + square_remote.bigframes_bigquery_function ) assert bq_routine.remote_function_options.max_batching_rows == max_batching_rows @@ -1642,7 +1646,7 @@ def serialize_row(row): # Let's make sure the read_gbq_function path works for this function serialize_row_reuse = session.read_gbq_function( - serialize_row_remote.bigframes_remote_function, is_row_processor=True + serialize_row_remote.bigframes_bigquery_function, is_row_processor=True ) bf_result = scalars_df[columns].apply(serialize_row_reuse, axis=1).to_pandas() pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) @@ -1950,6 +1954,8 @@ def foo(x: int) -> int: # ensure that remote function artifacts are created assert foo.bigframes_remote_function is not None session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_bigquery_function is not None + session.bqclient.get_routine(foo.bigframes_bigquery_function) is not None assert foo.bigframes_cloud_function is not None session.cloudfunctionsclient.get_function( name=foo.bigframes_cloud_function @@ -1960,7 +1966,7 @@ def foo(x: int) -> int: # ensure that the bq remote function is deleted with pytest.raises(google.cloud.exceptions.NotFound): - 
session.bqclient.get_routine(foo.bigframes_remote_function) + session.bqclient.get_routine(foo.bigframes_bigquery_function) # the deletion of cloud function happens in a non-blocking way, ensure that # it either exists in a being-deleted state, or is already deleted @@ -1990,6 +1996,8 @@ def foo(x: int) -> int: # ensure that remote function artifacts are created assert foo.bigframes_remote_function is not None session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_bigquery_function is not None + session.bqclient.get_routine(foo.bigframes_bigquery_function) is not None assert foo.bigframes_cloud_function is not None session.cloudfunctionsclient.get_function( name=foo.bigframes_cloud_function @@ -1999,7 +2007,7 @@ def foo(x: int) -> int: session.close() # ensure that the bq remote function still exists - session.bqclient.get_routine(foo.bigframes_remote_function) is not None + session.bqclient.get_routine(foo.bigframes_bigquery_function) is not None # the deletion of cloud function happens in a non-blocking way, ensure # that it was not deleted and still exists in active state @@ -2038,6 +2046,8 @@ def foo_named(x: int) -> int: for foo in [foo_unnamed, foo_named]: assert foo.bigframes_remote_function is not None session.bqclient.get_routine(foo.bigframes_remote_function) is not None + assert foo.bigframes_bigquery_function is not None + session.bqclient.get_routine(foo.bigframes_bigquery_function) is not None assert foo.bigframes_cloud_function is not None session.cloudfunctionsclient.get_function( name=foo.bigframes_cloud_function @@ -2051,7 +2061,7 @@ def foo_named(x: int) -> int: # ensure that the unnamed bq remote function is deleted along with its # corresponding cloud function with pytest.raises(google.cloud.exceptions.NotFound): - session.bqclient.get_routine(foo_unnamed.bigframes_remote_function) + session.bqclient.get_routine(foo_unnamed.bigframes_bigquery_function) try: gcf = session.cloudfunctionsclient.get_function( name=foo_unnamed.bigframes_cloud_function @@ -2062,7 +2072,7 @@ def foo_named(x: int) -> int: # ensure that the named bq remote function still exists along with its # corresponding cloud function - session.bqclient.get_routine(foo_named.bigframes_remote_function) is not None + session.bqclient.get_routine(foo_named.bigframes_bigquery_function) is not None gcf = session.cloudfunctionsclient.get_function( name=foo_named.bigframes_cloud_function ) @@ -2139,7 +2149,7 @@ def foo(x, y, z): ) # Let's make sure the read_gbq_function path works for this function - foo_reuse = session.read_gbq_function(foo.bigframes_remote_function) + foo_reuse = session.read_gbq_function(foo.bigframes_bigquery_function) bf_result = bf_df.apply(foo_reuse, axis=1).to_pandas() pandas.testing.assert_series_equal( expected_result, bf_result, check_dtype=False, check_index_type=False @@ -2225,7 +2235,7 @@ def foo(x, y, z): ) # Let's make sure the read_gbq_function path works for this function - foo_reuse = session.read_gbq_function(foo.bigframes_remote_function) + foo_reuse = session.read_gbq_function(foo.bigframes_bigquery_function) bf_result = bf_df.apply(foo_reuse, axis=1).to_pandas() pandas.testing.assert_series_equal( expected_result, bf_result, check_dtype=False, check_index_type=False @@ -2325,7 +2335,7 @@ def generate_stats(row: pandas.Series) -> list[int]: # Let's make sure the read_gbq_function path works for this function generate_stats_reuse = session.read_gbq_function( - generate_stats.bigframes_remote_function, + 
generate_stats.bigframes_bigquery_function, is_row_processor=True, ) bf_result = scalars_df[columns].apply(generate_stats_reuse, axis=1).to_pandas() @@ -2468,7 +2478,7 @@ def add_one(x: int) -> int: )(add_one) temporary_bigquery_remote_function = ( - add_one_remote_temp.bigframes_remote_function + add_one_remote_temp.bigframes_bigquery_function ) assert temporary_bigquery_remote_function is not None assert ( @@ -2545,7 +2555,7 @@ def add_one(x: int) -> int: )(add_one) persistent_bigquery_remote_function = ( - add_one_remote_persist.bigframes_remote_function + add_one_remote_persist.bigframes_bigquery_function ) assert persistent_bigquery_remote_function is not None assert ( @@ -2626,7 +2636,7 @@ def featurize(x: int) -> list[array_dtype]: # type: ignore # Let's make sure the read_gbq_function path works for this function featurize_reuse = session.read_gbq_function( - featurize.bigframes_remote_function # type: ignore + featurize.bigframes_bigquery_function # type: ignore ) bf_result = scalars_df["int64_too"].apply(featurize_reuse).to_pandas() pandas.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) @@ -2664,7 +2674,7 @@ def featurize(x: float) -> list[float]: # type: ignore # Let's make sure the read_gbq_function path works for this function featurize_reuse = unordered_session.read_gbq_function( - featurize.bigframes_remote_function # type: ignore + featurize.bigframes_bigquery_function # type: ignore ) bf_int64_col = scalars_df["float64_col"].dropna() bf_result = bf_int64_col.apply(featurize_reuse).to_pandas() diff --git a/tests/system/large/operations/test_semantics.py b/tests/system/large/operations/test_semantics.py index 20219ef46e..c3f08c6204 100644 --- a/tests/system/large/operations/test_semantics.py +++ b/tests/system/large/operations/test_semantics.py @@ -20,9 +20,10 @@ import pytest import bigframes -from bigframes import dataframe, dtypes, exceptions +from bigframes import dataframe, dtypes, exceptions, series -EXPERIMENT_OPTION = "experiments.semantic_operators" +SEM_OP_EXP_OPTION = "experiments.semantic_operators" +BLOB_EXP_OPTION = "experiments.blob" THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" @@ -31,7 +32,7 @@ def test_semantics_experiment_off_raise_error(): {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]} ) - with bigframes.option_context(EXPERIMENT_OPTION, False), pytest.raises( + with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( NotImplementedError ): df.semantics @@ -68,7 +69,7 @@ def test_agg(session, gemini_flash_model, max_agg_rows, cluster_column): instruction = "Find the shared first name of actors in {Movies}. One word answer." with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 50, @@ -114,7 +115,7 @@ def test_agg_with_confirmation(session, gemini_flash_model, reply, monkeypatch): monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -139,7 +140,7 @@ def test_agg_w_int_column(session, gemini_flash_model): instruction = "Find the {Years} Leonardo DiCaprio acted in the most movies. Answer with the year only." 
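The semantic-operator tests in this file all follow the same shape: enable the preview option, raise the confirmation threshold, then call the operator with an instruction template and a model. A compact sketch of that pattern, where `gemini_flash_model` stands in for the model fixture used by the tests:

```python
import bigframes
from bigframes import dataframe

df = dataframe.DataFrame(
    {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}
)

with bigframes.option_context(
    "experiments.semantic_operators", True,
    "compute.semantic_ops_confirmation_threshold", 10,
):
    # filter() keeps rows the model judges to satisfy the instruction.
    capitals = df.semantics.filter(
        "{city} is the capital city of {country}", gemini_flash_model
    )
    # map() writes the model's answer for each row into a new "landmark" column.
    landmarks = df.semantics.map(
        "Name one landmark in {city}. One word only.", "landmark", gemini_flash_model
    )
```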
with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -187,7 +188,7 @@ def test_agg_invalid_instruction_raise_error(instruction, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -222,7 +223,7 @@ def test_agg_invalid_cluster_column_raise_error(gemini_flash_model, cluster_colu instruction = "Find the shared first name of actors in {Movies}. One word answer." with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -257,7 +258,7 @@ def test_cluster_by(session, text_embedding_generator, n_clusters): output_column = "cluster id" with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -305,7 +306,7 @@ def test_cluster_by_with_confirmation( monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -326,7 +327,7 @@ def test_cluster_by_invalid_column(session, text_embedding_generator): output_column = "cluster id" with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -347,7 +348,7 @@ def test_cluster_by_invalid_model(session, gemini_flash_model): output_column = "cluster id" with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -371,7 +372,7 @@ def test_filter(session, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -388,6 +389,29 @@ def test_filter(session, gemini_flash_model): ) +def test_filter_multi_model(session, gemini_flash_model): + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["prey"] = series.Series( + ["building", "cross road", "rock", "squirrel", "rabbit"], session=session + ) + result = df.semantics.filter( + "The object in {image} feeds on {prey}", + gemini_flash_model, + ).to_pandas() + + assert len(result) <= len(df) + + @pytest.mark.parametrize( ("reply"), [ @@ -409,7 +433,7 @@ def test_filter_with_confirmation(session, gemini_flash_model, reply, monkeypatc monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -426,7 +450,7 @@ def test_filter_single_column_reference(session, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -465,7 +489,7 @@ def test_filter_invalid_instruction_raise_error(instruction, gemini_flash_model) df = dataframe.DataFrame({"id": [1, 2], "city": ["Seattle", "Berlin"]}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -479,7 +503,7 @@ def test_filter_invalid_model_raise_error(): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -498,7 +522,7 @@ def test_map(session, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -528,6 +552,30 @@ def test_map(session, gemini_flash_model): ) +def test_map_multimodel(session, gemini_flash_model): + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + BLOB_EXP_OPTION, + True, + 
THRESHOLD_OPTION, + 10, + ): + df = session.from_glob_path( + "gs://bigframes-dev-testing/a_multimodel/images/*", name="image" + ) + df["scenario"] = series.Series( + ["building", "cross road", "tree", "squirrel", "rabbit"], session=session + ) + result = df.semantics.map( + "What is the object in {image} combined with {scenario}? One word only.", + "object", + gemini_flash_model, + ).to_pandas() + + assert len(result) == len(df) + + @pytest.mark.parametrize( ("reply"), [ @@ -549,7 +597,7 @@ def test_map_with_confirmation(session, gemini_flash_model, reply, monkeypatch): monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -591,7 +639,7 @@ def test_map_invalid_instruction_raise_error(instruction, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -608,7 +656,7 @@ def test_map_invalid_model_raise_error(): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -647,7 +695,7 @@ def test_join(instruction, session, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -696,7 +744,7 @@ def test_join_with_confirmation(session, gemini_flash_model, reply, monkeypatch) monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -717,7 +765,7 @@ def test_self_join(session, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -779,7 +827,7 @@ def test_join_invalid_instruction_raise_error( ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -792,7 +840,7 @@ def test_join_invalid_model_raise_error(): countries = dataframe.DataFrame({"country": ["USA", "UK", "Germany"]}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -814,7 +862,7 @@ def test_search(session, text_embedding_generator, score_column): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -862,7 +910,7 @@ def test_search_with_confirmation( monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -882,7 +930,7 @@ def test_search_invalid_column_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -899,7 +947,7 @@ def test_search_invalid_model_raises_error(session): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -914,7 +962,7 @@ def test_search_invalid_top_k_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -942,7 +990,7 @@ def test_sim_join(session, text_embedding_generator, score_column): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -995,7 +1043,7 @@ def test_sim_join_with_confirmation( monkeypatch.setattr("builtins.input", lambda: reply) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 0, @@ -1029,7 +1077,7 @@ def 
test_sim_join_invalid_column_raises_error( ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1050,7 +1098,7 @@ def test_sim_join_invalid_model_raises_error(session): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1071,7 +1119,7 @@ def test_sim_join_invalid_top_k_raises_error(session, text_embedding_generator): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1096,7 +1144,7 @@ def test_sim_join_data_too_large_raises_error(session, text_embedding_generator) ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1145,7 +1193,7 @@ def test_top_k_invalid_instruction_raise_error(instruction, gemini_flash_model): ) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1157,7 +1205,7 @@ def test_top_k_invalid_k_raise_error(gemini_flash_model): df = dataframe.DataFrame({"Animals": ["Dog", "Cat", "Bird", "Horse"]}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 10, @@ -1174,7 +1222,7 @@ def test_confirm_operation__below_threshold_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 3, @@ -1189,7 +1237,7 @@ def test_confirm_operation__threshold_is_none_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, None, @@ -1204,7 +1252,7 @@ def test_confirm_operation__threshold_autofail_do_not_confirm(mock_input): df = dataframe.DataFrame({}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 1, @@ -1231,7 +1279,7 @@ def test_confirm_operation__above_threshold_confirm(reply, expectation, monkeypa df = dataframe.DataFrame({}) with bigframes.option_context( - EXPERIMENT_OPTION, + SEM_OP_EXP_OPTION, True, THRESHOLD_OPTION, 3, diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index bade725733..16f66dae57 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -22,6 +22,13 @@ import bigframes.pandas as bpd +@pytest.fixture(scope="module", autouse=True) +def use_large_query_path(): + # b/401630655 + with bpd.option_context("bigquery.allow_large_results", True): + yield + + @pytest.mark.parametrize( ("json_path", "expected_json"), [ diff --git a/tests/system/small/bigquery/test_struct.py b/tests/system/small/bigquery/test_struct.py index 58c822f642..4970964edd 100644 --- a/tests/system/small/bigquery/test_struct.py +++ b/tests/system/small/bigquery/test_struct.py @@ -53,9 +53,10 @@ def test_struct_from_dataframe(columns_arg): srs = series.Series( columns_arg, ) + # Use allow_large_results=True, due to b/403028465 pd.testing.assert_series_equal( - srs.to_pandas(), - bbq.struct(srs.struct.explode()).to_pandas(), + srs.to_pandas(allow_large_results=True), + bbq.struct(srs.struct.explode()).to_pandas(allow_large_results=True), check_index_type=False, check_dtype=False, ) diff --git a/tests/system/small/blob/conftest.py b/tests/system/small/blob/conftest.py deleted file mode 100644 index 5305acc193..0000000000 --- a/tests/system/small/blob/conftest.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 
(the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -import bigframes -import bigframes.pandas as bpd - - -@pytest.fixture(scope="session") -def images_gcs_path() -> str: - return "gs://bigframes_blob_test/images/*" - - -@pytest.fixture(scope="session") -def images_uris() -> list[str]: - return [ - "gs://bigframes_blob_test/images/img0.jpg", - "gs://bigframes_blob_test/images/img1.jpg", - ] - - -@pytest.fixture(scope="session") -def images_mm_df( - images_gcs_path, session: bigframes.Session, bq_connection: str -) -> bpd.DataFrame: - bigframes.options.experiments.blob = True - - return session.from_glob_path( - images_gcs_path, name="blob_col", connection=bq_connection - ) diff --git a/tests/system/small/blob/test_properties.py b/tests/system/small/blob/test_properties.py index dedd1f916a..c7704ec86d 100644 --- a/tests/system/small/blob/test_properties.py +++ b/tests/system/small/blob/test_properties.py @@ -55,31 +55,33 @@ def test_blob_version(images_mm_df: bpd.DataFrame): def test_blob_metadata(images_mm_df: bpd.DataFrame): - bigframes.options.experiments.blob = True - - actual = images_mm_df["blob_col"].blob.metadata().to_pandas() - expected = pd.Series( - [ - { - "content_type": "image/jpeg", - "md5_hash": "e130ad042261a1883cd2cc06831cf748", - "size": 338390, - "updated": 1739574332000000, - }, - { - "content_type": "image/jpeg", - "md5_hash": "e2ae3191ff2b809fd0935f01a537c650", - "size": 43333, - "updated": 1739574332000000, - }, - ], - name="metadata", - dtype=db_dtypes.JSONDtype(), - ) - - pd.testing.assert_series_equal( - actual, expected, check_dtype=False, check_index_type=False - ) + # allow_large_result=False incompatible with json b/401630655 + with bigframes.option_context( + "bigquery.allow_large_results", True, "experiments.blob", True + ): + actual = images_mm_df["blob_col"].blob.metadata().to_pandas() + expected = pd.Series( + [ + { + "content_type": "image/jpeg", + "md5_hash": "e130ad042261a1883cd2cc06831cf748", + "size": 338390, + "updated": 1739574332000000, + }, + { + "content_type": "image/jpeg", + "md5_hash": "e2ae3191ff2b809fd0935f01a537c650", + "size": 43333, + "updated": 1739574332000000, + }, + ], + name="metadata", + dtype=db_dtypes.JSONDtype(), + ) + + pd.testing.assert_series_equal( + actual, expected, check_dtype=False, check_index_type=False + ) def test_blob_content_type(images_mm_df: bpd.DataFrame): diff --git a/tests/system/small/functions/test_managed_function.py b/tests/system/small/functions/test_managed_function.py deleted file mode 100644 index e1af68512a..0000000000 --- a/tests/system/small/functions/test_managed_function.py +++ /dev/null @@ -1,354 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import google.api_core.exceptions -import pandas as pd -import pytest - -import bigframes.exceptions -from bigframes.functions import _function_session as bff_session -from bigframes.functions._utils import get_python_version -from bigframes.pandas import udf -import bigframes.pandas as bpd -import bigframes.series -from tests.system.utils import assert_pandas_df_equal, get_function_name - -bpd.options.experiments.udf = True - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -@pytest.mark.parametrize( - ("typ",), - [ - pytest.param(int), - pytest.param(float), - pytest.param(bool), - pytest.param(str), - pytest.param(bytes), - ], -) -def test_managed_function_series_apply( - typ, - scalars_dfs, - dataset_id_permanent, -): - def foo(x): - # The bytes() constructor expects a non-negative interger as its arg. - return typ(abs(x)) - - foo = udf( - input_types=int, - output_type=typ, - dataset=dataset_id_permanent, - name=get_function_name(foo), - )(foo) - - # Function should still work normally. - assert foo(-2) == typ(2) - - assert hasattr(foo, "bigframes_bigquery_function") - assert hasattr(foo, "ibis_node") - assert hasattr(foo, "input_dtypes") - assert hasattr(foo, "output_dtype") - assert hasattr(foo, "bigframes_bigquery_function_output_dtype") - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result_col = scalars_df["int64_too"].apply(foo) - bf_result = ( - scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_result_col = scalars_pandas_df["int64_too"].apply(foo) - pd_result = scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) - - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_series_combine(dataset_id_permanent, scalars_dfs): - # This function is deliberately written to not work with NA input. - def add(x: int, y: int) -> int: - return x + y - - scalars_df, scalars_pandas_df = scalars_dfs - int_col_name_with_nulls = "int64_col" - int_col_name_no_nulls = "int64_too" - bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] - pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] - - # make sure there are NA values in the test column. - assert any([pd.isna(val) for val in bf_df[int_col_name_with_nulls]]) - - add_managed_func = udf( - dataset=dataset_id_permanent, - name=get_function_name(add), - )(add) - - # with nulls in the series the managed function application would fail. - with pytest.raises( - google.api_core.exceptions.BadRequest, match="unsupported operand" - ): - bf_df[int_col_name_with_nulls].combine( - bf_df[int_col_name_no_nulls], add_managed_func - ).to_pandas() - - # after filtering out nulls the managed function application should work - # similar to pandas. 
- pd_filter = pd_df[int_col_name_with_nulls].notnull() - pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( - pd_df[pd_filter][int_col_name_no_nulls], add - ) - bf_filter = bf_df[int_col_name_with_nulls].notnull() - bf_result = ( - bf_df[bf_filter][int_col_name_with_nulls] - .combine(bf_df[bf_filter][int_col_name_no_nulls], add_managed_func) - .to_pandas() - ) - - # ignore any dtype difference. - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -@pytest.mark.parametrize( - ("typ",), - [ - pytest.param(int), - pytest.param(float), - pytest.param(bool), - pytest.param(str), - ], -) -def test_managed_function_series_apply_list_output( - typ, - scalars_dfs, - dataset_id_permanent, -): - def foo_list(x): - # The bytes() constructor expects a non-negative interger as its arg. - return [typ(abs(x)), typ(abs(x) + 1)] - - foo_list = udf( - input_types=int, - output_type=list[typ], # type: ignore - dataset=dataset_id_permanent, - name=get_function_name(foo_list), - )(foo_list) - - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result_col = scalars_df["int64_too"].apply(foo_list) - bf_result = ( - scalars_df["int64_too"].to_frame().assign(result=bf_result_col).to_pandas() - ) - - pd_result_col = scalars_pandas_df["int64_too"].apply(foo_list) - pd_result = scalars_pandas_df["int64_too"].to_frame().assign(result=pd_result_col) - - # Ignore any dtype difference. - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_series_combine_list_output(dataset_id_permanent, scalars_dfs): - def add_list(x: int, y: int) -> list[int]: - return [x, y] - - scalars_df, scalars_pandas_df = scalars_dfs - int_col_name_with_nulls = "int64_col" - int_col_name_no_nulls = "int64_too" - bf_df = scalars_df[[int_col_name_with_nulls, int_col_name_no_nulls]] - pd_df = scalars_pandas_df[[int_col_name_with_nulls, int_col_name_no_nulls]] - - # Make sure there are NA values in the test column. - assert any([pd.isna(val) for val in bf_df[int_col_name_with_nulls]]) - - add_list_managed_func = udf( - dataset=dataset_id_permanent, - name=get_function_name(add_list), - )(add_list) - - # After filtering out nulls the managed function application should work - # similar to pandas. - pd_filter = pd_df[int_col_name_with_nulls].notnull() - pd_result = pd_df[pd_filter][int_col_name_with_nulls].combine( - pd_df[pd_filter][int_col_name_no_nulls], add_list - ) - bf_filter = bf_df[int_col_name_with_nulls].notnull() - bf_result = ( - bf_df[bf_filter][int_col_name_with_nulls] - .combine(bf_df[bf_filter][int_col_name_no_nulls], add_list_managed_func) - .to_pandas() - ) - - # Ignore any dtype difference. 
- pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_dataframe_map(scalars_dfs, dataset_id_permanent): - def add_one(x): - return x + 1 - - mf_add_one = udf( - input_types=[int], - output_type=int, - dataset=dataset_id_permanent, - name=get_function_name(add_one), - )(add_one) - - scalars_df, scalars_pandas_df = scalars_dfs - int64_cols = ["int64_col", "int64_too"] - - bf_int64_df = scalars_df[int64_cols] - bf_int64_df_filtered = bf_int64_df.dropna() - bf_result = bf_int64_df_filtered.map(mf_add_one).to_pandas() - - pd_int64_df = scalars_pandas_df[int64_cols] - pd_int64_df_filtered = pd_int64_df.dropna() - pd_result = pd_int64_df_filtered.map(add_one) - # TODO(shobs): Figure why pandas .map() changes the dtype, i.e. - # pd_int64_df_filtered.dtype is Int64Dtype() - # pd_int64_df_filtered.map(lambda x: x).dtype is int64. - # For this test let's force the pandas dtype to be same as input. - for col in pd_result: - pd_result[col] = pd_result[col].astype(pd_int64_df_filtered[col].dtype) - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_dataframe_apply_axis_1( - session, scalars_dfs, dataset_id_permanent -): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df["int64_too"] - series_pandas = scalars_pandas_df["int64_too"] - - def add_ints(x, y): - return x + y - - add_ints_mf = session.udf( - input_types=[int, int], - output_type=int, - dataset=dataset_id_permanent, - name=get_function_name(add_ints, is_row_processor=True), - )(add_ints) - assert add_ints_mf.bigframes_bigquery_function # type: ignore - - with pytest.warns( - bigframes.exceptions.PreviewWarning, match="axis=1 scenario is in preview." - ): - bf_result = ( - bpd.DataFrame({"x": series, "y": series}) - .apply(add_ints_mf, axis=1) - .to_pandas() - ) - - pd_result = pd.DataFrame({"x": series_pandas, "y": series_pandas}).apply( - lambda row: add_ints(row["x"], row["y"]), axis=1 - ) - - pd.testing.assert_series_equal( - pd_result, bf_result, check_dtype=False, check_exact=True - ) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_dataframe_map_list_output(scalars_dfs, dataset_id_permanent): - def add_one_list(x): - return [x + 1] * 3 - - mf_add_one_list = udf( - input_types=[int], - output_type=list[int], - dataset=dataset_id_permanent, - name=get_function_name(add_one_list), - )(add_one_list) - - scalars_df, scalars_pandas_df = scalars_dfs - int64_cols = ["int64_col", "int64_too"] - - bf_int64_df = scalars_df[int64_cols] - bf_int64_df_filtered = bf_int64_df.dropna() - bf_result = bf_int64_df_filtered.map(mf_add_one_list).to_pandas() - - pd_int64_df = scalars_pandas_df[int64_cols] - pd_int64_df_filtered = pd_int64_df.dropna() - pd_result = pd_int64_df_filtered.map(add_one_list) - - # Ignore any dtype difference. 
- assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.skipif( - get_python_version() not in bff_session._MANAGED_FUNC_PYTHON_VERSIONS, - reason=f"Supported version: {bff_session._MANAGED_FUNC_PYTHON_VERSIONS}", -) -def test_managed_function_dataframe_apply_axis_1_list_output( - session, scalars_dfs, dataset_id_permanent -): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df["int64_too"] - series_pandas = scalars_pandas_df["int64_too"] - - def add_ints_list(x, y): - return [x + y] * 2 - - add_ints_list_mf = session.udf( - input_types=[int, int], - output_type=list[int], - dataset=dataset_id_permanent, - name=get_function_name(add_ints_list, is_row_processor=True), - )(add_ints_list) - assert add_ints_list_mf.bigframes_bigquery_function # type: ignore - - with pytest.warns( - bigframes.exceptions.PreviewWarning, - match="axis=1 scenario is in preview.", - ): - bf_result = ( - bpd.DataFrame({"x": series, "y": series}) - .apply(add_ints_list_mf, axis=1) - .to_pandas() - ) - - pd_result = pd.DataFrame({"x": series_pandas, "y": series_pandas}).apply( - lambda row: add_ints_list(row["x"], row["y"]), axis=1 - ) - - # Ignore any dtype difference. - pd.testing.assert_series_equal(pd_result, bf_result, check_dtype=False) diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 075a57f23d..0af7f4e42e 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -16,6 +16,7 @@ import re import textwrap +import bigframes_vendored.constants as constants import google.api_core.exceptions from google.cloud import bigquery import pandas as pd @@ -124,6 +125,7 @@ def square(x): # Function should have extra metadata attached for remote execution. assert hasattr(square, "bigframes_remote_function") + assert hasattr(square, "bigframes_bigquery_function") assert hasattr(square, "bigframes_cloud_function") assert hasattr(square, "ibis_node") @@ -665,7 +667,7 @@ def square1(x): assert square1(2) == 4 square2 = bff.read_gbq_function( - function_name=square1.bigframes_remote_function, # type: ignore + function_name=square1.bigframes_bigquery_function, # type: ignore session=session, ) @@ -673,13 +675,17 @@ def square1(x): # cloud function associated with it, while the read-back version (square2) # should only have a remote function. assert square1.bigframes_remote_function # type: ignore + assert square1.bigframes_bigquery_function # type: ignore assert square1.bigframes_cloud_function # type: ignore assert square2.bigframes_remote_function + assert square2.bigframes_bigquery_function assert not hasattr(square2, "bigframes_cloud_function") # They should point to the same function. assert square1.bigframes_remote_function == square2.bigframes_remote_function # type: ignore + assert square1.bigframes_bigquery_function == square2.bigframes_bigquery_function # type: ignore + assert square2.bigframes_remote_function == square2.bigframes_bigquery_function # type: ignore # The result of applying them should be the same. int64_col = scalars_df_index["int64_col"] @@ -853,7 +859,7 @@ def test_read_gbq_function_reads_udfs(session, bigquery_client, dataset_id): ) # It should point to the named routine and yield the expected results. 
- assert square.bigframes_remote_function == str(routine.reference) + assert square.bigframes_bigquery_function == str(routine.reference) assert square.input_dtypes == (bigframes.dtypes.INT_DTYPE,) assert square.output_dtype == bigframes.dtypes.INT_DTYPE assert ( @@ -1087,10 +1093,11 @@ def test_df_apply_scalar_func(session, scalars_dfs): with pytest.raises(NotImplementedError) as context: bdf.apply(func_ref) assert str(context.value) == ( - "BigFrames DataFrame '.apply()' does not support remote function for " - "column-wise (i.e. with axis=0) operations, please use a regular python " - "function instead. For element-wise operations of the remote function, " - "please use '.map()'." + "BigFrames DataFrame '.apply()' does not support BigFrames BigQuery " + "function for column-wise (i.e. with axis=0) operations, please use a " + "regular python function instead. For element-wise operations of the " + "BigFrames BigQuery function, please use '.map()'. " + f"{constants.FEEDBACK_LINK}" ) @@ -1104,7 +1111,7 @@ def test_read_gbq_function_multiple_inputs_not_a_row_processor(session): ) assert str(context.value) == ( "A multi-input function cannot be a row processor. A row processor function " - "takes in a single input representing the row." + f"takes in a single input representing the row. {constants.FEEDBACK_LINK}" ) @@ -1134,6 +1141,7 @@ def add_ints(row): name=get_function_name(add_ints, is_row_processor=True), )(add_ints) assert add_ints_remote.bigframes_remote_function # type: ignore + assert add_ints_remote.bigframes_bigquery_function # type: ignore assert add_ints_remote.bigframes_cloud_function # type: ignore with pytest.warns( @@ -1155,11 +1163,13 @@ def add_ints(row): # Read back the deployed BQ remote function using read_gbq_function. func_ref = session.read_gbq_function( - function_name=add_ints_remote.bigframes_remote_function, # type: ignore + function_name=add_ints_remote.bigframes_bigquery_function, # type: ignore is_row_processor=True, ) assert func_ref.bigframes_remote_function == add_ints_remote.bigframes_remote_function # type: ignore + assert func_ref.bigframes_bigquery_function == add_ints_remote.bigframes_bigquery_function # type: ignore + assert func_ref.bigframes_remote_function == func_ref.bigframes_bigquery_function # type: ignore bf_result_gbq = scalars_df[columns].apply(func_ref, axis=1).to_pandas() pd.testing.assert_series_equal( @@ -1247,7 +1257,7 @@ def add_ints(row): scalars_pandas_df.apply(add_ints, axis=1) with pytest.raises( - ValueError, match="For axis=1 a bigframes function must be used." + ValueError, match="For axis=1 a BigFrames BigQuery function must be used." 
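The assertions above lean on the new bigframes_bigquery_function attribute, which records the fully qualified BigQuery routine name on both remote functions and managed UDFs, and on read_gbq_function to rehydrate a deployed routine without redeploying it. A rough sketch of that round trip, assuming an existing bigframes Session named session, a DataFrame scalars_df, and a placeholder routine path:

# Read back an already-deployed routine by name.
func = session.read_gbq_function(
    function_name="my-project.my_dataset.square1",  # placeholder path
)

# The read-back wrapper knows which routine it points at ...
assert func.bigframes_bigquery_function == "my-project.my_dataset.square1"
# ... but has no Cloud Functions handle, since this session deployed nothing.
assert not hasattr(func, "bigframes_cloud_function")

# It can then be applied like any other BigFrames function.
result_series = scalars_df["int64_col"].apply(func)
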
): scalars_df[columns].apply(add_ints, axis=1) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 1c2591b90a..1827858353 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -401,9 +401,10 @@ def test_model_generate_text( "top_p": 0.5, "flatten_json_output": True, } + # Until b/401630655 is resolved, json not compatible with allow_large_results=False df = bqml_palm2_text_generator_model.generate_text( llm_text_df, options=options - ).to_pandas() + ).to_pandas(allow_large_results=True) utils.check_pandas_df_schema_and_index( df, columns=utils.ML_GENERATE_TEXT_OUTPUT, index=3, col_exact=False diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 90d5e9f1d7..1bcbd9cd8c 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -17,12 +17,20 @@ import pandas as pd import pytest +import bigframes from bigframes import exceptions from bigframes.ml import core, llm import bigframes.pandas as bpd from tests.system import utils +# Until b/401630655 is resolved, ML apis return json, not compatible with allow_large_results=False +@pytest.fixture(scope="module", autouse=True) +def always_create_table(): + with bigframes.option_context("bigquery.allow_large_results", True): + yield + + def test_create_load_text_generator_model( palm2_text_generator_model, dataset_id, bq_connection ): @@ -260,6 +268,44 @@ def test_text_embedding_generator_multi_cols_predict_success( assert len(pd_df["ml_generate_embedding_result"][0]) == 768 +def test_create_load_multimodal_embedding_generator_model( + dataset_id, session, bq_connection +): + bigframes.options.experiments.blob = True + + mm_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + assert mm_embedding_model is not None + assert mm_embedding_model._bqml_model is not None + + # save, load to ensure configuration was kept + reloaded_model = mm_embedding_model.to_gbq( + f"{dataset_id}.temp_mm_model", replace=True + ) + assert f"{dataset_id}.temp_mm_model" == reloaded_model._bqml_model.model_name + assert reloaded_model.connection_name == bq_connection + + +@pytest.mark.flaky(retries=2) +def test_multimodal_embedding_generator_predict_default_params_success( + images_mm_df, session, bq_connection +): + bigframes.options.experiments.blob = True + + text_embedding_model = llm.MultimodalEmbeddingGenerator( + connection_name=bq_connection, session=session + ) + df = text_embedding_model.predict(images_mm_df).to_pandas() + utils.check_pandas_df_schema_and_index( + df, + columns=utils.ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT, + index=2, + col_exact=False, + ) + assert len(df["ml_generate_embedding_result"][0]) == 1408 + + @pytest.mark.parametrize( "model_name", ( @@ -273,6 +319,9 @@ def test_text_embedding_generator_multi_cols_predict_success( "gemini-2.0-flash-exp", ), ) +@pytest.mark.flaky( + retries=2 +) # usually create model shouldn't be flaky, but this one due to the limited quota of gemini-2.0-flash-exp. 
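The multimodal tests above consume an images_mm_df fixture: a DataFrame with a blob column built from a Cloud Storage wildcard, as the now-deleted tests/system/small/blob/conftest.py did. A hedged sketch of that setup and of the embedding call, assuming an existing Session named session and with the bucket path and connection name as placeholders:

import bigframes
from bigframes.ml import llm

bigframes.options.experiments.blob = True  # blob columns are experimental

images_mm_df = session.from_glob_path(
    "gs://my-bucket/images/*", name="blob_col", connection="my-connection"
)

model = llm.MultimodalEmbeddingGenerator(
    connection_name="my-connection", session=session
)
embeddings = model.predict(images_mm_df).to_pandas()
# Per the assertions above, each image yields a 1408-dimensional embedding in
# the ml_generate_embedding_result column.
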
def test_create_load_gemini_text_generator_model( dataset_id, model_name, session, bq_connection ): @@ -375,6 +424,36 @@ def test_gemini_text_generator_multi_cols_predict_success( ) +@pytest.mark.parametrize( + "model_name", + ( + "gemini-1.5-pro-001", + "gemini-1.5-pro-002", + "gemini-1.5-flash-001", + "gemini-1.5-flash-002", + "gemini-2.0-flash-exp", + ), +) +@pytest.mark.flaky(retries=2) +def test_gemini_text_generator_multimodal_input( + images_mm_df: bpd.DataFrame, model_name, session, bq_connection +): + bigframes.options.experiments.blob = True + + gemini_text_generator_model = llm.GeminiTextGenerator( + model_name=model_name, connection_name=bq_connection, session=session + ) + pd_df = gemini_text_generator_model.predict( + images_mm_df, prompt=["Describe", images_mm_df["blob_col"]] + ).to_pandas() + utils.check_pandas_df_schema_and_index( + pd_df, + columns=utils.ML_GENERATE_TEXT_OUTPUT + ["blob_col"], + index=2, + col_exact=False, + ) + + # Overrides __eq__ function for comparing as mock.call parameter class EqCmpAllDataFrame(bpd.DataFrame): def __eq__(self, other): diff --git a/tests/system/small/operations/test_semantics.py b/tests/system/small/operations/test_semantics.py new file mode 100644 index 0000000000..85777faaf6 --- /dev/null +++ b/tests/system/small/operations/test_semantics.py @@ -0,0 +1,124 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Note that the tests in this files uses fake models for deterministic results. 
+# Tests that use real LLM models are under system/large/test_semantcs.py + +import pandas as pd +import pandas.testing +import pytest + +import bigframes +from bigframes import dataframe, dtypes +from bigframes.ml import llm + +SEM_OP_EXP_OPTION = "experiments.semantic_operators" +THRESHOLD_OPTION = "compute.semantic_ops_confirmation_threshold" + + +class FakeGeminiTextGenerator(llm.GeminiTextGenerator): + def __init__(self, prediction): + self.prediction = prediction + + def predict(self, *args, **kwargs): + return self.prediction + + +def test_semantics_experiment_off_raise_error(session): + df = dataframe.DataFrame( + {"country": ["USA", "Germany"], "city": ["Seattle", "Berlin"]}, session=session + ) + + with bigframes.option_context(SEM_OP_EXP_OPTION, False), pytest.raises( + NotImplementedError + ): + df.semantics + + +def test_filter(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.semantics.filter( + "filter {col}", + model=model, + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col": ["A"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) + + +def test_map(session): + df = dataframe.DataFrame({"col": ["A", "B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame( + {"ml_generate_text_llm_result": ["true", "false"]}, session=session + ), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = df.semantics.map( + "map {col}", model=model, output_column="output" + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame( + {"col": ["A", "B"], "output": ["true", "false"]}, dtype=dtypes.STRING_DTYPE + ), + check_index_type=False, + ) + + +def test_join(session): + left_df = dataframe.DataFrame({"col_A": ["A"]}, session=session) + right_df = dataframe.DataFrame({"col_B": ["B"]}, session=session) + model = FakeGeminiTextGenerator( + dataframe.DataFrame({"ml_generate_text_llm_result": ["true"]}, session=session), + ) + + with bigframes.option_context( + SEM_OP_EXP_OPTION, + True, + THRESHOLD_OPTION, + 50, + ): + result = left_df.semantics.join( + right_df, "join {col_A} and {col_B}", model + ).to_pandas() + + pandas.testing.assert_frame_equal( + result, + pd.DataFrame({"col_A": ["A"], "col_B": ["B"]}, dtype=dtypes.STRING_DTYPE), + check_index_type=False, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9415f9657e..b97846d992 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -163,11 +163,11 @@ def test_df_construct_from_dict(): ) -def test_df_construct_inline_respects_location(): +def test_df_construct_inline_respects_location(reset_default_session_and_location): # Note: This starts a thread-local session. 
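tests/system/small/operations/test_semantics.py stubs the LLM with FakeGeminiTextGenerator so the operators return canned results; the operators themselves stay gated behind an experiment flag plus a confirmation threshold (the confirm-operation test names in the large suite suggest that operations touching more rows than the threshold prompt for confirmation). A minimal sketch of turning both on around a single call, assuming a DataFrame df and a model as in the tests above:

import bigframes

with bigframes.option_context(
    "experiments.semantic_operators", True,
    "compute.semantic_ops_confirmation_threshold", 50,
):
    # Rows for which the model answers "true" are kept.
    filtered = df.semantics.filter("filter {col}", model=model).to_pandas()
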
with bpd.option_context("bigquery.location", "europe-west1"): df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) - repr(df) + df.to_gbq() assert df.query_job is not None table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) @@ -664,11 +664,8 @@ def test_rename(scalars_dfs): def test_df_peek(scalars_dfs_maybe_ordered): scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - session = scalars_df._block.session - slot_millis_sum = session.slot_millis_sum - peek_result = scalars_df.peek(n=3, force=False) + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) - assert session.slot_millis_sum - slot_millis_sum > 1000 pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -676,12 +673,8 @@ def test_df_peek(scalars_dfs_maybe_ordered): def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - session = scalars_df._block.session - slot_millis_sum = session.slot_millis_sum peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) - # The metrics won't be fully updated when we call query_and_wait. - assert session.slot_millis_sum - slot_millis_sum < 500 pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) assert len(peek_result) == 3 @@ -4584,12 +4577,13 @@ def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, sub ], ) def test_df_drop_duplicates_w_json(json_df, keep): - bf_df = json_df.drop_duplicates(keep=keep).to_pandas() + bf_df = json_df.drop_duplicates(keep=keep).to_pandas(allow_large_results=True) # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible # with Arrow string extension types. Temporary conversion to standard Pandas # strings is required. 
- json_pandas_df = json_df.to_pandas() + # allow_large_results=True for b/401630655 + json_pandas_df = json_df.to_pandas(allow_large_results=True) json_pandas_df["json_col"] = json_pandas_df["json_col"].astype( pd.StringDtype(storage="pyarrow") ) @@ -4951,14 +4945,16 @@ def test_df_bool_interpretation_error(scalars_df_index): def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): - job_ids = set() - repr(scalars_df_default_index) - assert scalars_df_default_index.query_job is not None - job_ids.add(scalars_df_default_index.query_job.job_id) - scalars_df_default_index.to_pandas() - job_ids.add(scalars_df_default_index.query_job.job_id) + # if allow_large_results=False, might not create query job + with bigframes.option_context("bigquery.allow_large_results", True): + job_ids = set() + repr(scalars_df_default_index) + assert scalars_df_default_index.query_job is not None + job_ids.add(scalars_df_default_index.query_job.job_id) + scalars_df_default_index.to_pandas(allow_large_results=True) + job_ids.add(scalars_df_default_index.query_job.job_id) - assert len(job_ids) == 2 + assert len(job_ids) == 2 def test_df_cached(scalars_df_index): @@ -5196,7 +5192,12 @@ def test_to_pandas_downsampling_option_override(session): df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") download_size = 1 - df = df.to_pandas(max_download_size=download_size, sampling_method="head") + # limits only apply for allow_large_result=True + df = df.to_pandas( + max_download_size=download_size, + sampling_method="head", + allow_large_results=True, + ) total_memory_bytes = df.memory_usage(deep=True).sum() total_memory_mb = total_memory_bytes / (1024 * 1024) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index e80668939a..30a78b5cdc 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -254,27 +254,31 @@ def test_to_pandas_array_struct_correct_result(session): def test_to_pandas_override_global_option(scalars_df_index): # Direct call to_pandas uses global default setting (allow_large_results=True), # table has 'bqdf' prefix. - scalars_df_index.to_pandas() - table_id = scalars_df_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + with bigframes.option_context("bigquery.allow_large_results", True): - # When allow_large_results=False, a query_job object should not be created. - # Therefore, the table_id should remain unchanged. - scalars_df_index.to_pandas(allow_large_results=False) - assert scalars_df_index._query_job.destination.table_id == table_id + scalars_df_index.to_pandas() + table_id = scalars_df_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + scalars_df_index.to_pandas(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id == table_id def test_to_arrow_override_global_option(scalars_df_index): # Direct call to_arrow uses global default setting (allow_large_results=True), # table has 'bqdf' prefix. - scalars_df_index.to_arrow() - table_id = scalars_df_index._query_job.destination.table_id - assert table_id.startswith("bqdf") - - # When allow_large_results=False, a query_job object should not be created. - # Therefore, the table_id should remain unchanged. 
- scalars_df_index.to_arrow(allow_large_results=False) - assert scalars_df_index._query_job.destination.table_id == table_id + with bigframes.option_context("bigquery.allow_large_results", True): + + scalars_df_index.to_arrow() + table_id = scalars_df_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + scalars_df_index.to_arrow(allow_large_results=False) + assert scalars_df_index._query_job.destination.table_id == table_id def test_load_json_w_json_string_items(session): diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py index 7d684e64b4..72529bc5b0 100644 --- a/tests/system/small/test_encryption.py +++ b/tests/system/small/test_encryption.py @@ -41,6 +41,7 @@ def bq_cmek() -> str: @pytest.fixture(scope="module") def session_with_bq_cmek(bq_cmek) -> bigframes.Session: + # allow_large_results = False might not create table, and therefore no encryption config session = bigframes.Session(bigframes.BigQueryOptions(kms_key_name=bq_cmek)) return session @@ -52,7 +53,7 @@ def _assert_bq_table_is_encrypted( session: bigframes.Session, ): # Materialize the data in BQ - repr(df) + df.to_gbq() # The df should be backed by a query job with intended encryption on the result table assert df.query_job is not None diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index b7101c90f3..2e5cd18158 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -546,8 +546,7 @@ def test_dataframe_groupby_nonnumeric_with_mean(): ) pd_result = df.groupby(["key1", "key2"]).mean() - with bpd.option_context("bigquery.location", "US"): - bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() + bf_result = bpd.DataFrame(df).groupby(["key1", "key2"]).mean().to_pandas() pd.testing.assert_frame_equal( pd_result, bf_result, check_index_type=False, check_dtype=False diff --git a/tests/system/small/test_index_io.py b/tests/system/small/test_index_io.py index a7cd4013b9..85001e4ec5 100644 --- a/tests/system/small/test_index_io.py +++ b/tests/system/small/test_index_io.py @@ -11,33 +11,38 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import bigframes def test_to_pandas_override_global_option(scalars_df_index): - bf_index = scalars_df_index.index + with bigframes.option_context("bigquery.allow_large_results", True): - # Direct call to_pandas uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. - bf_index.to_pandas() - table_id = bf_index._query_job.destination.table_id - assert table_id.startswith("bqdf") + bf_index = scalars_df_index.index - # When allow_large_results=False, a query_job object should not be created. - # Therefore, the table_id should remain unchanged. - bf_index.to_pandas(allow_large_results=False) - assert bf_index._query_job.destination.table_id == table_id + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_pandas() + table_id = bf_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. 
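These *_io tests all encode the same behaviour: with allow_large_results=True the download is served from a session-managed destination table whose id carries a "bqdf" prefix, while allow_large_results=False can return rows directly (per the comments, via query_and_wait) without creating a new query job or destination table. A compact sketch of what the tests check, assuming df is any BigFrames DataFrame and noting that _query_job is a private attribute used here only to mirror them:

import bigframes

with bigframes.option_context("bigquery.allow_large_results", True):
    df.to_pandas()
    table_id = df._query_job.destination.table_id
    assert table_id.startswith("bqdf")

    df.to_pandas(allow_large_results=False)  # no new destination table
    assert df._query_job.destination.table_id == table_id
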
+ bf_index.to_pandas(allow_large_results=False) + assert bf_index._query_job.destination.table_id == table_id def test_to_numpy_override_global_option(scalars_df_index): - bf_index = scalars_df_index.index - - # Direct call to_numpy uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. - bf_index.to_numpy() - table_id = bf_index._query_job.destination.table_id - assert table_id.startswith("bqdf") - - # When allow_large_results=False, a query_job object should not be created. - # Therefore, the table_id should remain unchanged. - bf_index.to_numpy(allow_large_results=False) - assert bf_index._query_job.destination.table_id == table_id + with bigframes.option_context("bigquery.allow_large_results", True): + + bf_index = scalars_df_index.index + + # Direct call to_numpy uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_index.to_numpy() + table_id = bf_index._query_job.destination.table_id + assert table_id.startswith("bqdf") + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + bf_index.to_numpy(allow_large_results=False) + assert bf_index._query_job.destination.table_id == table_id diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index da78432cdb..2b6dfefb12 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -387,11 +387,43 @@ def test_merge_series(scalars_dfs, merge_how): assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) -def test_cut(scalars_dfs): +def _convert_pandas_category(pd_s: pd.Series): + if not isinstance(pd_s.dtype, pd.CategoricalDtype): + raise ValueError("Input must be a pandas Series with categorical data.") + + if len(pd_s.dtype.categories) == 0: + return pd.Series([pd.NA] * len(pd_s), name=pd_s.name) + + pd_interval: pd.IntervalIndex = pd_s.cat.categories[pd_s.cat.codes] # type: ignore + if pd_interval.closed == "left": + left_key = "left_inclusive" + right_key = "right_exclusive" + else: + left_key = "left_exclusive" + right_key = "right_inclusive" + return pd.Series( + [ + {left_key: interval.left, right_key: interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip(pd_s, pd_interval) + ], + name=pd_s.name, + ) + + +@pytest.mark.parametrize( + ("right"), + [ + pytest.param(True), + pytest.param(False), + ], +) +def test_cut(scalars_dfs, right): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False) - bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False) + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, labels=False, right=right) + bf_result = bpd.cut(scalars_df["float64_col"], 5, labels=False, right=right) # make sure the result is a supported dtype assert bf_result.dtype == bpd.Int64Dtype() @@ -399,56 +431,43 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) -def test_cut_default_labels(scalars_dfs): +@pytest.mark.parametrize( + ("right"), + [ + pytest.param(True), + pytest.param(False), + ], +) +def test_cut_default_labels(scalars_dfs, right): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], 5) - bf_result = bpd.cut(scalars_df["float64_col"], 5).to_pandas() + pd_result = pd.cut(scalars_pandas_df["float64_col"], 5, right=right) + bf_result = bpd.cut(scalars_df["float64_col"], 5, right=right).to_pandas() # Convert to match data format - 
pd_result_converted = pd.Series( - [ - {"left_exclusive": interval.left, "right_inclusive": interval.right} - if pd.notna(val) - else pd.NA - for val, interval in zip( - pd_result, pd_result.cat.categories[pd_result.cat.codes] - ) - ], - name=pd_result.name, - ) - + pd_result_converted = _convert_pandas_category(pd_result) pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False ) @pytest.mark.parametrize( - ("breaks",), + ("breaks", "right"), [ - ([0, 5, 10, 15, 20, 100, 1000],), # ints - ([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5],), # floats - ([0, 5, 10.5, 15.5, 20, 100, 1000.5],), # mixed + pytest.param([0, 5, 10, 15, 20, 100, 1000], True, id="int_right"), + pytest.param([0, 5, 10, 15, 20, 100, 1000], False, id="int_left"), + pytest.param([0.5, 10.5, 15.5, 20.5, 100.5, 1000.5], False, id="float_left"), + pytest.param([0, 5, 10.5, 15.5, 20, 100, 1000.5], True, id="mixed_right"), ], ) -def test_cut_numeric_breaks(scalars_dfs, breaks): +def test_cut_numeric_breaks(scalars_dfs, breaks, right): scalars_df, scalars_pandas_df = scalars_dfs - pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks) - bf_result = bpd.cut(scalars_df["float64_col"], breaks).to_pandas() + pd_result = pd.cut(scalars_pandas_df["float64_col"], breaks, right=right) + bf_result = bpd.cut(scalars_df["float64_col"], breaks, right=right).to_pandas() # Convert to match data format - pd_result_converted = pd.Series( - [ - {"left_exclusive": interval.left, "right_inclusive": interval.right} - if pd.notna(val) - else pd.NA - for val, interval in zip( - pd_result, pd_result.cat.categories[pd_result.cat.codes] - ) - ], - name=pd_result.name, - ) + pd_result_converted = _convert_pandas_category(pd_result) pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False @@ -456,52 +475,59 @@ def test_cut_numeric_breaks(scalars_dfs, breaks): @pytest.mark.parametrize( - ("bins",), + "bins", [ - (-1,), # negative integer bins argument - ([],), # empty iterable of bins - (["notabreak"],), # iterable of wrong type - ([1],), # numeric breaks with only one numeric - # this is supported by pandas but not by - # the bigquery operation and a bigframes workaround - # is not yet available. Should return column - # of structs with all NaN values. 
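_convert_pandas_category, added above, bridges the two representations of cut results: pandas returns a Categorical of Interval objects, while BigFrames returns structs whose keys spell out which side of each bin is closed. A small worked example of the same mapping under the default right=True (values chosen only for illustration):

import pandas as pd

s = pd.Series([1, 6, 12])
cat = pd.cut(s, bins=[0, 5, 10, 15])  # right-closed: (0, 5], (5, 10], (10, 15]

intervals = cat.cat.categories[cat.cat.codes]
structs = [
    {"left_exclusive": iv.left, "right_inclusive": iv.right} for iv in intervals
]
# structs == [{'left_exclusive': 0, 'right_inclusive': 5},
#             {'left_exclusive': 5, 'right_inclusive': 10},
#             {'left_exclusive': 10, 'right_inclusive': 15}]
# With right=False the bins close on the left and the keys become
# 'left_inclusive' / 'right_exclusive', which is what the helper switches on
# via pd_interval.closed.
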
+ pytest.param([], id="empty_list"), + pytest.param( + [1], id="single_int_list", marks=pytest.mark.skip(reason="b/404338651") + ), + pytest.param(pd.IntervalIndex.from_tuples([]), id="empty_interval_index"), ], ) -def test_cut_errors(scalars_dfs, bins): - scalars_df, _ = scalars_dfs +def test_cut_w_edge_cases(scalars_dfs, bins): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas() + if isinstance(bins, list): + bins = pd.IntervalIndex.from_tuples(bins) + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) + + # Convert to match data format + pd_result_converted = _convert_pandas_category(pd_result) - with pytest.raises(ValueError): - bpd.cut(scalars_df["float64_col"], bins) + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) @pytest.mark.parametrize( - ("bins",), + ("bins", "right"), [ - ([(-5, 2), (2, 3), (-3000, -10)],), - (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),), + pytest.param([(-5, 2), (2, 3), (-3000, -10)], True, id="tuple_right"), + pytest.param([(-5, 2), (2, 3), (-3000, -10)], False, id="tuple_left"), + pytest.param( + pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]), + True, + id="interval_right", + ), + pytest.param( + pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]), + False, + id="interval_left", + ), ], ) -def test_cut_with_interval(scalars_dfs, bins): +def test_cut_with_interval(scalars_dfs, bins, right): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas() + bf_result = bpd.cut( + scalars_df["int64_too"], bins, labels=False, right=right + ).to_pandas() if isinstance(bins, list): bins = pd.IntervalIndex.from_tuples(bins) - pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False, right=right) # Convert to match data format - pd_result_converted = pd.Series( - [ - {"left_exclusive": interval.left, "right_inclusive": interval.right} - if pd.notna(val) - else pd.NA - for val, interval in zip( - pd_result, pd_result.cat.categories[pd_result.cat.codes] - ) - ], - name=pd_result.name, - ) + pd_result_converted = _convert_pandas_category(pd_result) pd.testing.assert_series_equal( bf_result, pd_result_converted, check_index=False, check_dtype=False diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index c580f926c9..d59b6d66b5 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -26,15 +26,6 @@ import bigframes.pandas as bpd -@pytest.fixture(autouse=True) -def reset_default_session_and_location(): - # Note: This starts a thread-local session and closes it once the test - # finishes. 
- with bpd.option_context("bigquery.location", None): - bpd.options.bigquery.location = None - yield - - @pytest.mark.parametrize( ("read_method", "query_prefix"), [ @@ -58,7 +49,9 @@ def test_read_gbq_start_sets_session_location( dataset_id_permanent, read_method, query_prefix, + reset_default_session_and_location, ): + # Form query as a table name or a SQL depending on the test scenario query_tokyo = test_data_tables_tokyo["scalars"] query = test_data_tables["scalars"] @@ -138,6 +131,7 @@ def test_read_gbq_after_session_start_must_comply_with_default_location( dataset_id_permanent_tokyo, read_method, query_prefix, + reset_default_session_and_location, ): # Form query as a table name or a SQL depending on the test scenario query_tokyo = test_data_tables_tokyo["scalars"] @@ -191,6 +185,7 @@ def test_read_gbq_must_comply_with_set_location_US( dataset_id_permanent_tokyo, read_method, query_prefix, + reset_default_session_and_location, ): # Form query as a table name or a SQL depending on the test scenario query_tokyo = test_data_tables_tokyo["scalars"] @@ -241,6 +236,7 @@ def test_read_gbq_must_comply_with_set_location_non_US( dataset_id_permanent, read_method, query_prefix, + reset_default_session_and_location, ): # Form query as a table name or a SQL depending on the test scenario query_tokyo = test_data_tables_tokyo["scalars"] @@ -269,7 +265,9 @@ def test_read_gbq_must_comply_with_set_location_non_US( assert df is not None -def test_credentials_need_reauthentication(monkeypatch): +def test_credentials_need_reauthentication( + monkeypatch, reset_default_session_and_location +): # Use a simple test query to verify that default session works to interact # with BQ. test_query = "SELECT 1" diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 73a9743e2f..3139ae5225 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -32,7 +32,7 @@ def test_progress_bar_dataframe( capsys.readouterr() # clear output with bf.option_context("display.progress_bar", "terminal"): - penguins_df_default_index.to_pandas() + penguins_df_default_index.to_pandas(allow_large_results=True) assert_loading_msg_exist(capsys.readouterr().out) assert penguins_df_default_index.query_job is not None @@ -43,7 +43,7 @@ def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys.readouterr() # clear output with bf.option_context("display.progress_bar", "terminal"): - series.to_pandas() + series.to_pandas(allow_large_results=True) assert_loading_msg_exist(capsys.readouterr().out) assert series.query_job is not None @@ -103,7 +103,7 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): with bf.option_context("display.progress_bar", "terminal"): - penguins_df_default_index.to_pandas() + penguins_df_default_index.to_pandas(allow_large_results=True) query_job_repr = formatting_helpers.repr_query_job_html( penguins_df_default_index.query_job ).value @@ -120,7 +120,7 @@ def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): - penguins_df_default_index.to_pandas() + penguins_df_default_index.to_pandas(allow_large_results=True) query_job_repr = formatting_helpers.repr_query_job( penguins_df_default_index.query_job ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 
d62af962fc..ef544b0a0b 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -304,22 +304,24 @@ def test_series_construct_w_dtype_for_array_struct(): def test_series_construct_w_dtype_for_json(): - data = [ - "1", - '"str"', - "false", - '["a", {"b": 1}, null]', - None, - '{"a": {"b": [1, 2, 3], "c": true}}', - ] - s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) + # Until b/401630655 is resolved, json, not compatible with allow_large_results=False + with bigframes.option_context("bigquery.allow_large_results", True): + data = [ + "1", + '"str"', + "false", + '["a", {"b": 1}, null]', + None, + '{"a": {"b": [1, 2, 3], "c": true}}', + ] + s = bigframes.pandas.Series(data, dtype=dtypes.JSON_DTYPE) - assert s[0] == "1" - assert s[1] == '"str"' - assert s[2] == "false" - assert s[3] == '["a",{"b":1},null]' - assert pd.isna(s[4]) - assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' + assert s[0] == "1" + assert s[1] == '"str"' + assert s[2] == "false" + assert s[3] == '["a",{"b":1},null]' + assert pd.isna(s[4]) + assert s[5] == '{"a":{"b":[1,2,3],"c":true}}' def test_series_keys(scalars_dfs): @@ -382,7 +384,8 @@ def test_get_column(scalars_dfs, col_name, expected_dtype): def test_get_column_w_json(json_df, json_pandas_df): series = json_df["json_col"] - series_pandas = series.to_pandas() + # Until b/401630655 is resolved, json not compatible with allow_large_results=False + series_pandas = series.to_pandas(allow_large_results=True) assert series.dtype == pd.ArrowDtype(db_dtypes.JSONArrowType()) assert series_pandas.shape[0] == json_pandas_df.shape[0] @@ -2270,11 +2273,8 @@ def test_head_then_series_operation(scalars_dfs): def test_series_peek(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - session = scalars_df._block.session - slot_millis_sum = session.slot_millis_sum peek_result = scalars_df["float64_col"].peek(n=3, force=False) - assert session.slot_millis_sum - slot_millis_sum > 1000 pd.testing.assert_series_equal( peek_result, scalars_pandas_df["float64_col"].reindex_like(peek_result), @@ -3888,15 +3888,17 @@ def test_series_bool_interpretation_error(scalars_df_index): def test_query_job_setters(scalars_dfs): - job_ids = set() - df, _ = scalars_dfs - series = df["int64_col"] - assert series.query_job is not None - repr(series) - job_ids.add(series.query_job.job_id) - series.to_pandas() - job_ids.add(series.query_job.job_id) - assert len(job_ids) == 2 + # if allow_large_results=False, might not create query job + with bigframes.option_context("bigquery.allow_large_results", True): + job_ids = set() + df, _ = scalars_dfs + series = df["int64_col"] + assert series.query_job is not None + repr(series) + job_ids.add(series.query_job.job_id) + series.to_pandas() + job_ids.add(series.query_job.job_id) + assert len(job_ids) == 2 @pytest.mark.parametrize( diff --git a/tests/system/small/test_series_io.py b/tests/system/small/test_series_io.py index d44d1e5b24..ae09a2cf5d 100644 --- a/tests/system/small/test_series_io.py +++ b/tests/system/small/test_series_io.py @@ -11,22 +11,25 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import bigframes def test_to_pandas_override_global_option(scalars_df_index): - bf_series = scalars_df_index["int64_col"] + with bigframes.option_context("bigquery.allow_large_results", True): - # Direct call to_pandas uses global default setting (allow_large_results=True), - # table has 'bqdf' prefix. - bf_series.to_pandas() - table_id = bf_series._query_job.destination.table_id - assert table_id.startswith("bqdf") + bf_series = scalars_df_index["int64_col"] - session = bf_series._block.session - execution_count = session._metrics.execution_count + # Direct call to_pandas uses global default setting (allow_large_results=True), + # table has 'bqdf' prefix. + bf_series.to_pandas() + table_id = bf_series._query_job.destination.table_id + assert table_id.startswith("bqdf") - # When allow_large_results=False, a query_job object should not be created. - # Therefore, the table_id should remain unchanged. - bf_series.to_pandas(allow_large_results=False) - assert bf_series._query_job.destination.table_id == table_id - assert session._metrics.execution_count - execution_count == 1 + session = bf_series._block.session + execution_count = session._metrics.execution_count + + # When allow_large_results=False, a query_job object should not be created. + # Therefore, the table_id should remain unchanged. + bf_series.to_pandas(allow_large_results=False) + assert bf_series._query_job.destination.table_id == table_id + assert session._metrics.execution_count - execution_count == 1 diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 4b7495694b..323d002df4 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -46,7 +46,10 @@ def test_read_gbq_tokyo( result = df.sort_index().to_pandas() expected = scalars_pandas_df_index - result = session_tokyo._executor.execute(df._block.expr) + # use_explicit_destination=True, otherwise might use path with no query_job + result = session_tokyo._executor.execute( + df._block.expr, use_explicit_destination=True + ) assert result.query_job.location == tokyo_location assert len(expected) == result.total_rows @@ -129,9 +132,10 @@ def test_read_gbq_w_unknown_index_col( CONCAT(t.string_col, "_2") AS my_strings, t.int64_col > 0 AS my_bools, FROM `{scalars_table_id}` AS t + ORDER BY my_strings """, ["my_strings"], - id="string_index", + id="string_index_w_order_by", ), pytest.param( "SELECT GENERATE_UUID() AS uuid, 0 AS my_value FROM UNNEST(GENERATE_ARRAY(1, 20))", @@ -640,7 +644,7 @@ def test_read_pandas_inline_respects_location(): session = bigframes.Session(options) df = session.read_pandas(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) - repr(df) + df.to_gbq() assert df.query_job is not None @@ -682,10 +686,12 @@ def test_read_pandas_tokyo( tokyo_location: str, ): df = session_tokyo.read_pandas(scalars_pandas_df_index) - result = df.to_pandas() + df.to_gbq() expected = scalars_pandas_df_index - result = session_tokyo._executor.execute(df._block.expr) + result = session_tokyo._executor.execute( + df._block.expr, use_explicit_destination=True + ) assert result.query_job.location == tokyo_location assert len(expected) == result.total_rows @@ -716,6 +722,7 @@ def test_read_pandas_timedelta_dataframes(session, write_engine): def test_read_pandas_timedelta_series(session, write_engine): expected_series = pd.Series(pd.to_timedelta([1, 2, 3], unit="d")) + # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_series, write_engine=write_engine) 
.to_pandas() @@ -738,9 +745,10 @@ def test_read_pandas_timedelta_index(session, write_engine): [1, 2, 3], unit="d" ) # to_timedelta returns an index + # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = ( session.read_pandas(expected_index, write_engine=write_engine) - .to_pandas() + .to_pandas(allow_large_results=True) .astype("timedelta64[ns]") ) @@ -767,9 +775,10 @@ def test_read_pandas_json_dataframes(session, write_engine): {"my_col": pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE)} ) + # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_df, write_engine=write_engine - ).to_pandas() + ).to_pandas(allow_large_results=True) if write_engine == "bigquery_streaming": expected_df.index = pd.Index([pd.NA] * 4, dtype="Int64") @@ -789,9 +798,10 @@ def test_read_pandas_json_series(session, write_engine): ] expected_series = pd.Series(json_data, dtype=bigframes.dtypes.JSON_DTYPE) + # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_series, write_engine=write_engine - ).to_pandas() + ).to_pandas(allow_large_results=True) pd.testing.assert_series_equal( actual_result, expected_series, check_index_type=False ) @@ -812,9 +822,10 @@ def test_read_pandas_json_index(session, write_engine): '{"a":1,"b":["x","y"],"c":{"x":[],"z":false}}', ] expected_index: pd.Index = pd.Index(json_data, dtype=bigframes.dtypes.JSON_DTYPE) + # Until b/401630655 is resolved, json not compatible with allow_large_results=False actual_result = session.read_pandas( expected_index, write_engine=write_engine - ).to_pandas() + ).to_pandas(allow_large_results=True) pd.testing.assert_index_equal(actual_result, expected_index) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 106997f3e9..f270d1903c 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -38,7 +38,7 @@ def test_unordered_mode_sql_no_hash(unordered_session): def test_unordered_mode_job_label(unordered_session): pd_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, dtype=pd.Int64Dtype()) df = bpd.DataFrame(pd_df, session=unordered_session) - df.to_pandas() + df.to_gbq() job_labels = df.query_job.labels # type:ignore assert "bigframes-mode" in job_labels assert job_labels["bigframes-mode"] == "unordered" diff --git a/tests/system/utils.py b/tests/system/utils.py index fd8feb0eeb..bc1fe6745e 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -56,6 +56,14 @@ "ml_generate_embedding_status", "content", ] +ML_MULTIMODAL_GENERATE_EMBEDDING_OUTPUT = [ + "ml_generate_embedding_result", + "ml_generate_embedding_status", + # start and end sec depend on input format. Images and videos input will contain these 2. + "ml_generate_embedding_start_sec", + "ml_generate_embedding_end_sec", + "content", +] def skip_legacy_pandas(test): @@ -401,15 +409,16 @@ def cleanup_function_assets( if not ignore_failures: raise - # Clean up cloud function - try: - delete_cloud_function( - cloudfunctions_client, bigframes_func.bigframes_cloud_function - ) - except Exception: - # By default don't raise exception in cleanup. - if not ignore_failures: - raise + if cloudfunctions_client: + # Clean up cloud function + try: + delete_cloud_function( + cloudfunctions_client, bigframes_func.bigframes_cloud_function + ) + except Exception: + # By default don't raise exception in cleanup. 
+ if not ignore_failures: + raise def get_function_name(func, package_requirements=None, is_row_processor=False): diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index 0bcfee5c4e..3eceb99331 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import bigframes_vendored.constants as constants import pytest import bigframes.dtypes @@ -73,7 +74,7 @@ def test_get_bigframes_metadata_array_type_not_serializable(output_type): with pytest.raises(ValueError) as context: _utils.get_bigframes_metadata(python_output_type=output_type) assert str(context.value) == ( - f"python_output_type {output_type} is not serializable." + f"python_output_type {output_type} is not serializable. {constants.FEEDBACK_LINK}" ) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 13531acbea..d024d332d4 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -25,10 +25,10 @@ import pytest import bigframes +from bigframes import version import bigframes.enums import bigframes.exceptions - -from .. import resources +from tests.unit import resources TABLE_REFERENCE = { "projectId": "my-project", @@ -443,3 +443,18 @@ def test_session_init_fails_with_no_project(): credentials=mock.Mock(spec=google.auth.credentials.Credentials) ) ) + + +def test_session_init_warns_if_bf_version_is_too_old(monkeypatch): + release_date = datetime.datetime.strptime(version.__release_date__, "%Y-%m-%d") + current_date = release_date + datetime.timedelta(days=366) + + class FakeDatetime(datetime.datetime): + @classmethod + def today(cls): + return current_date + + monkeypatch.setattr(datetime, "datetime", FakeDatetime) + + with pytest.warns(bigframes.exceptions.ObsoleteVersionWarning): + resources.create_bigquery_session() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index 1ee52c08a1..64a287aaca 100644 --- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -101,14 +101,20 @@ def test_cut_raises_with_labels(): @pytest.mark.parametrize( - ("bins",), - ( - (0,), - (-1,), - ), + ("bins", "error_message"), + [ + pytest.param(1.5, "`bins` must be an integer or interable.", id="float"), + pytest.param(0, "`bins` should be a positive integer.", id="zero_int"), + pytest.param(-1, "`bins` should be a positive integer.", id="neg_int"), + pytest.param( + ["notabreak"], + "`bins` iterable should contain tuples or numerics", + id="iterable_w_wrong_type", + ), + ], ) -def test_cut_raises_with_invalid_bins(bins: int): - with pytest.raises(ValueError, match="`bins` should be a positive integer."): +def test_cut_raises_with_invalid_bins(bins: int, error_message: str): + with pytest.raises(ValueError, match=error_message): mock_series = mock.create_autospec(bigframes.pandas.Series, instance=True) bigframes.pandas.cut(mock_series, bins, labels=False) diff --git a/third_party/bigframes_vendored/constants.py b/third_party/bigframes_vendored/constants.py index d1aaa800cc..d11d8ba2cb 100644 --- a/third_party/bigframes_vendored/constants.py +++ b/third_party/bigframes_vendored/constants.py @@ -24,8 +24,8 @@ FEEDBACK_LINK = ( "Share your usecase with the BigQuery DataFrames team at the " - "https://bit.ly/bigframes-feedback survey." 
- f"You are currently running BigFrames version {bigframes_vendored.version.__version__}" + "https://bit.ly/bigframes-feedback survey. " + f"You are currently running BigFrames version {bigframes_vendored.version.__version__}." ) ABSTRACT_METHOD_ERROR_MESSAGE = ( diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 6bda14b025..d911a303eb 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -4,14 +4,23 @@ """ from __future__ import annotations -from bigframes import constants +import typing + +import pandas as pd + +from bigframes import constants, series def cut( - x, - bins, + x: series.Series, + bins: typing.Union[ + int, + pd.IntervalIndex, + typing.Iterable, + ], *, - labels=None, + right: bool = True, + labels: typing.Union[typing.Iterable[str], bool, None] = None, ): """ Bin values into discrete intervals. @@ -87,6 +96,16 @@ def cut( 3 {'left_exclusive': 5, 'right_inclusive': 20} dtype: struct[pyarrow] + Cut with an interable of ints, where intervals are left-inclusive and right-exclusive. + + >>> bins_ints = [0, 1, 5, 20] + >>> bpd.cut(s, bins=bins_ints, right=False) + 0 {'left_inclusive': 0, 'right_exclusive': 1} + 1 {'left_inclusive': 1, 'right_exclusive': 5} + 2 {'left_inclusive': 5, 'right_exclusive': 20} + 3 {'left_inclusive': 5, 'right_exclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. @@ -103,7 +122,12 @@ def cut( Iterable of numerics: Defines the exact bins by using the interval between each item and its following item. The items must be monotonically increasing. - labels (None): + right (bool, default True): + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. + labels (default None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container. diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index e4062aa0c6..4d0f809a6f 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.40.0" +__version__ = "1.41.0" # {x-release-please-start-date} -__release_date__ = "2025-03-11" +__release_date__ = "2025-03-19" # {x-release-please-end}