- "    if(0<n<3):\n",
- "        return prime_numbers[n-1]\n",
- "    elif(n>2):\n",
- "        while (True):\n",
- "            i+=1\n",
- "            status = True\n",
- "            for j in range(2,int(i/2)+1):\n",
- "                if(i%j==0):\n",
- "                    status = False\n",
- "                    break\n",
- "            if(status==True):\n",
- "                prime_numbers.append(i)\n",
- "            if(len(prime_numbers)==n):\n",
- "                break\n",
- "        return prime_numbers[n-1]\n",
- "    else:\n",
- "        return -1\n",
- "\n",
- "#-------------------------------------------------------------------------------------\n",
- "print(f\"\\nWall time: {(perf_counter()-start):.4f} s\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "id": "c1c9355f",
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms\n",
- "Wall time: 22.4 ms\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " text | \n",
- " score | \n",
- " n_prime | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 35153602 | \n",
- " Are you trying to access a nested array? | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 35156124 | \n",
- " Sorry I didn't include my timeout method befor... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 35157401 | \n",
- " As soon as I defined some sort of primary Key ... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 35158649 | \n",
- " @user3355243 I've edited it to give $values an... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 35162039 | \n",
- " I pasted my exes @Matt. | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 35162396 | \n",
- " @Gene - I do have separate fields, but I also ... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 35162907 | \n",
- " could you please provide any kind of sketch ho... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 35166498 | \n",
- " We use PhoneGap. Using Angular. | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 35170138 | \n",
- " `decltype((int(Foo::*)(int))(&Foo::foo))` That... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 35172348 | \n",
- " Yes, I found that one already. And I understan... | \n",
- " 0 | \n",
- " -1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
[10 rows x 4 columns in total]"
- ],
- "text/plain": [
- " id text score n_prime\n",
- "0 35153602 Are you trying to access a nested array? 0 -1\n",
- "1 35156124 Sorry I didn't include my timeout method befor... 0 -1\n",
- "2 35157401 As soon as I defined some sort of primary Key ... 0 -1\n",
- "3 35158649 @user3355243 I've edited it to give $values an... 0 -1\n",
- "4 35162039 I pasted my exes @Matt. 0 -1\n",
- "5 35162396 @Gene - I do have separate fields, but I also ... 0 -1\n",
- "6 35162907 could you please provide any kind of sketch ho... 0 -1\n",
- "7 35166498 We use PhoneGap. Using Angular. 0 -1\n",
- "8 35170138 `decltype((int(Foo::*)(int))(&Foo::foo))` That... 0 -1\n",
- "9 35172348 Yes, I found that one already. And I understan... 0 -1\n",
- "\n",
- "[10 rows x 4 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "%%time\n",
- "\n",
- "df = df.assign(n_prime=df['score'].apply(nth_prime))\n",
- "df.head(10)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.9"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/notebooks/02 - DataFrame.ipynb b/notebooks/dataframes/dataframe.ipynb
similarity index 79%
rename from notebooks/02 - DataFrame.ipynb
rename to notebooks/dataframes/dataframe.ipynb
index 51a3f604f3..85ea61d281 100644
--- a/notebooks/02 - DataFrame.ipynb
+++ b/notebooks/dataframes/dataframe.ipynb
@@ -31,7 +31,22 @@
"execution_count": 2,
"id": "96757c59-fc22-420e-a42f-c6cb956110ec",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "944f0e4417154e81b6496302fe756465",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job ac4d1f2b-e9f3-4d95-b78d-57e40eee93fa is RUNNING. Cubs\n",
" 175 | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " 2016 | \n",
+ " Nationals | \n",
+ " Cubs | \n",
+ " 257 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 171 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 248 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " 2016 | \n",
+ " Astros | \n",
+ " Cubs | \n",
+ " 174 | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 5 columns
\n",
"[2431 rows x 5 columns in total]"
],
"text/plain": [
@@ -419,6 +503,34 @@
"tags": []
},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "276760df4c904ced81cbaff3a65d026e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 1943ae42-bcbd-4c2f-914f-209377b5c4d9 is DONE. 0 Bytes processed. 175\n",
" Padres vs Cubs | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " 2016 | \n",
+ " Nationals | \n",
+ " Cubs | \n",
+ " 257 | \n",
+ " Nationals vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 178 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 171 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 248 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " 2016 | \n",
+ " Astros | \n",
+ " Cubs | \n",
+ " 174 | \n",
+ " Astros vs Cubs | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 6 columns
\n",
"[2431 rows x 6 columns in total]"
],
"text/plain": [
@@ -717,6 +875,34 @@
"id": "8bbe000a-36f0-4b6f-b403-b9ec28dd608b",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c32dfe188b114fff911e370d3824df1e",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job e543bad8-fbdf-479d-8b42-fd970d6434de is DONE. 0 Bytes processed. 175\n",
" Padres vs Cubs | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " 2016 | \n",
+ " Nationals | \n",
+ " Cubs | \n",
+ " 257 | \n",
+ " Nationals vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 178 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 171 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 248 | \n",
+ " Brewers vs Cubs | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " 2016 | \n",
+ " Astros | \n",
+ " Cubs | \n",
+ " 174 | \n",
+ " Astros vs Cubs | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 6 columns
\n",
"[2431 rows x 6 columns in total]"
],
"text/plain": [
@@ -1016,6 +1248,34 @@
"id": "fad6d3da-1f40-4c5f-94ec-0bdfe21ca5b6",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d53f91be2b254d26afe3f122ba0e5094",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job f859b666-ecc4-43fb-8b5f-58f654e8890f is DONE. 0 Bytes processed. Cubs\n",
" 175 | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " 2016 | \n",
+ " Nationals | \n",
+ " Cubs | \n",
+ " 257 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 171 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 248 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " 2016 | \n",
+ " Astros | \n",
+ " Cubs | \n",
+ " 174 | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 5 columns
\n",
"[2431 rows x 5 columns in total]"
],
"text/plain": [
@@ -1292,6 +1593,34 @@
"id": "67a7c35f-80cf-4482-80f9-7f01c7743807",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2c6fb280aabc4667adaea61cdf4045ad",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 5694d442-f886-42f8-bdcd-bad846759654 is DONE. 174.4 kB processed. Cubs\n",
" 175 | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " 2016 | \n",
+ " Nationals | \n",
+ " Cubs | \n",
+ " 257 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 171 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " 2016 | \n",
+ " Brewers | \n",
+ " Cubs | \n",
+ " 248 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " 2016 | \n",
+ " Astros | \n",
+ " Cubs | \n",
+ " 174 | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 5 columns
\n",
"[2431 rows x 5 columns in total]"
],
"text/plain": [
@@ -1569,6 +1939,34 @@
"id": "3f09ff32-ef43-4fab-a86b-8868afc34363",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "480f9b81e9d24dcbb34ac6ae323fbacf",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 45ad4f5b-29f7-4fd2-b124-c56e5e89790a is DONE. 174.4 kB processed. 303703bb-b55f-476d-8faf-bf582169fb1d\n",
" Padres | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " Nationals | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " Brewers | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " Brewers | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " Brewers | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " Astros | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 2 columns
\n",
"[2431 rows x 2 columns in total]"
],
"text/plain": [
@@ -1737,7 +2161,7 @@
}
],
"source": [
- "df1 = df[\"gameId\", \"homeTeamName\"]\n",
+ "df1 = df[[\"gameId\", \"homeTeamName\"]]\n",
"df1"
]
},
@@ -1747,6 +2171,34 @@
"id": "5331d2c8-7912-4d96-8da1-f64b57374df3",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "5fe3e2ded18d4aa694b448296e13a317",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 7cfdb350-72e2-4dac-8b12-6f4bb53fd80c is DONE. 193.8 kB processed. \n",
" \n",
"\n",
+ "2 rows × 2 columns
\n",
"[2 rows x 2 columns in total]"
],
"text/plain": [
@@ -1801,7 +2254,7 @@
}
],
"source": [
- "df2 = df[\"gameId\", \"awayTeamName\"].head(2)\n",
+ "df2 = df[[\"gameId\", \"awayTeamName\"]].head(2)\n",
"df2"
]
},
@@ -1811,6 +2264,34 @@
"id": "a574ad3e-a219-454c-8bb5-c5ed6627f2c6",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "bf9bcc212c794720ac6b12b51dc14d54",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job bdf487ff-e562-44c1-9d7e-59857f18aac4 is DONE. 193.8 kB processed. \n",
" \n",
"\n",
+ "2 rows × 3 columns
\n",
"[2 rows x 3 columns in total]"
],
"text/plain": [
@@ -1877,6 +2359,34 @@
"id": "288e7a95-a077-46c4-8fe6-802474c01f8b",
"metadata": {},
"outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f746c71d757f458f86e40314f466bd9b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 10f28cc0-b952-4472-9359-3318e4c7c56e is DONE. 193.8 kB processed. Padres\n",
" <NA> | \n",
" \n",
+ " \n",
+ " 20 | \n",
+ " 71ab82a4-6e07-430a-b695-1af3bc42ea61 | \n",
+ " Nationals | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c | \n",
+ " Brewers | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 6d111b57-fa0b-4f24-82df-ff33a26f0252 | \n",
+ " Brewers | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " a97e9539-bbbd-4e03-bf15-f25ea2c1d923 | \n",
+ " Brewers | \n",
+ " <NA> | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " dc0c9218-505c-4725-8c0c-40b72cca0956 | \n",
+ " Astros | \n",
+ " <NA> | \n",
+ "
\n",
" \n",
"\n",
+ "25 rows × 3 columns
\n",
"[2431 rows x 3 columns in total]"
],
"text/plain": [
@@ -2077,9 +2618,37 @@
"outputs": [
{
"data": {
- "text/html": [
- "\n",
- "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " It will help if you give some details of which... | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25 | \n",
+ " infact it does. Look a the first lines of your... | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 27 | \n",
+ " \"Currently + is implemented using StringBuffer... | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 41 | \n",
+ " I don't think that's the magic number he was r... | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 59 | \n",
+ " It's still very useful to know that magic numb... | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 96 | \n",
+ " This implementation is also nice if you wish t... | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 108 | \n",
+ " That's not full text searching, it's searching... | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 109 | \n",
+ " That's not full text searching, it's searching... | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 137 | \n",
+ " In vim you can open > 1 buffer. :e filename. T... | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 154 | \n",
+ " Sure, but what about a solution using O(1) mem... | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id text score\n",
+ "0 10 It will help if you give some details of which... 6\n",
+ "1 25 infact it does. Look a the first lines of your... 10\n",
+ "2 27 \"Currently + is implemented using StringBuffer... 7\n",
+ "3 41 I don't think that's the magic number he was r... 18\n",
+ "4 59 It's still very useful to know that magic numb... 12\n",
+ "5 96 This implementation is also nice if you wish t... 9\n",
+ "6 108 That's not full text searching, it's searching... 6\n",
+ "7 109 That's not full text searching, it's searching... 6\n",
+ "8 137 In vim you can open > 1 buffer. :e filename. T... 9\n",
+ "9 154 Sure, but what about a solution using O(1) mem... 8"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "##############################\n",
+ "# Pandas World #\n",
+ "##############################\n",
+ "\n",
+ "import pandas as pd\n",
+ "df = pd.read_gbq(TABLE, max_results=MAX_ROWS)[['id', 'text', 'score']]\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "fd8a04a3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# User defined function\n",
+ "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n",
+ "def nth_prime(n):\n",
+ " prime_numbers = [2,3]\n",
+ " i=3\n",
+ " if(02):\n",
+ " while (True):\n",
+ " i+=1\n",
+ " status = True\n",
+ " for j in range(2,int(i/2)+1):\n",
+ " if(i%j==0):\n",
+ " status = False\n",
+ " break\n",
+ " if(status==True):\n",
+ " prime_numbers.append(i)\n",
+ " if(len(prime_numbers)==n):\n",
+ " break\n",
+ " return prime_numbers[n-1]\n",
+ " else:\n",
+ " return -1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "2b5e4568",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 4.22 s, sys: 18.2 ms, total: 4.24 s\n",
+ "Wall time: 4.26 s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " score | \n",
+ " n_prime | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 10 | \n",
+ " It will help if you give some details of which... | \n",
+ " 6 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 25 | \n",
+ " infact it does. Look a the first lines of your... | \n",
+ " 10 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 27 | \n",
+ " \"Currently + is implemented using StringBuffer... | \n",
+ " 7 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 41 | \n",
+ " I don't think that's the magic number he was r... | \n",
+ " 18 | \n",
+ " 61 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 59 | \n",
+ " It's still very useful to know that magic numb... | \n",
+ " 12 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 96 | \n",
+ " This implementation is also nice if you wish t... | \n",
+ " 9 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 108 | \n",
+ " That's not full text searching, it's searching... | \n",
+ " 6 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 109 | \n",
+ " That's not full text searching, it's searching... | \n",
+ " 6 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 137 | \n",
+ " In vim you can open > 1 buffer. :e filename. T... | \n",
+ " 9 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 154 | \n",
+ " Sure, but what about a solution using O(1) mem... | \n",
+ " 8 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id text score n_prime\n",
+ "0 10 It will help if you give some details of which... 6 13\n",
+ "1 25 infact it does. Look a the first lines of your... 10 29\n",
+ "2 27 \"Currently + is implemented using StringBuffer... 7 17\n",
+ "3 41 I don't think that's the magic number he was r... 18 61\n",
+ "4 59 It's still very useful to know that magic numb... 12 37\n",
+ "5 96 This implementation is also nice if you wish t... 9 23\n",
+ "6 108 That's not full text searching, it's searching... 6 13\n",
+ "7 109 That's not full text searching, it's searching... 6 13\n",
+ "8 137 In vim you can open > 1 buffer. :e filename. T... 9 23\n",
+ "9 154 Sure, but what about a solution using O(1) mem... 8 19"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "df = df.assign(n_prime=df['score'].apply(nth_prime))\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "b81feaef",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n",
+ " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n",
+ "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n",
+ " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2b1c9d671db14d2ca3be6a0b0c698430",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 6b0a39de-40a0-4dd4-be88-248bd8ebcd77 is RUNNING. \n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 11012908 | \n",
+ " you're welcome! according to the docs it shoul... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11013760 | \n",
+ " You *should* be concerned with the disk being ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 11013784 | \n",
+ " have you looked at `Integrate` or `NIntegrate`? | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 11015512 | \n",
+ " sorry, is a typo. The variable name is dist. (... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11016238 | \n",
+ " Pfff, I'm having trouble with that formula too... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 11016276 | \n",
+ " Thanks thinksteep! Does this mean that by usin... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 11016551 | \n",
+ " Jason, thanks for the reply. I've been workin... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11017973 | \n",
+ " I assume an `off` of 0.5 would put be exactly ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 11018225 | \n",
+ " Thank you very much. I do worry too much abou... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 11018370 | \n",
+ " @IanClelland, I edited my question a bit. The ... | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "10 rows × 3 columns
\n",
+ "[10 rows x 3 columns in total]"
+ ],
+ "text/plain": [
+ " id text score\n",
+ "0 11012908 you're welcome! according to the docs it shoul... 0\n",
+ "1 11013760 You *should* be concerned with the disk being ... 0\n",
+ "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0\n",
+ "3 11015512 sorry, is a typo. The variable name is dist. (... 0\n",
+ "4 11016238 Pfff, I'm having trouble with that formula too... 0\n",
+ "5 11016276 Thanks thinksteep! Does this mean that by usin... 0\n",
+ "6 11016551 Jason, thanks for the reply. I've been workin... 0\n",
+ "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0\n",
+ "8 11018225 Thank you very much. I do worry too much abou... 0\n",
+ "9 11018370 @IanClelland, I edited my question a bit. The ... 0\n",
+ "\n",
+ "[10 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "##############################\n",
+ "# BigQuery DataFrames #\n",
+ "##############################\n",
+ "\n",
+ "import bigframes.pandas as pd\n",
+ "\n",
+ "df = pd.read_gbq(TABLE).head(MAX_ROWS)[['id', 'text', 'score']]\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "55ed241e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Help on function remote_function in module bigframes.pandas:\n",
+ "\n",
+ "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n",
+ " Decorator to turn a user defined function into a BigQuery remote function.\n",
+ " \n",
+ " .. note::\n",
+ " Please make sure following is setup before using this API:\n",
+ " \n",
+ " 1. Have the below APIs enabled for your project:\n",
+ " \n",
+ " * BigQuery Connection API\n",
+ " * Cloud Functions API\n",
+ " * Cloud Run API\n",
+ " * Cloud Build API\n",
+ " * Artifact Registry API\n",
+ " * Cloud Resource Manager API\n",
+ " \n",
+ " This can be done from the cloud console (change `PROJECT_ID` to yours):\n",
+ " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n",
+ " \n",
+ " Or from the gcloud CLI:\n",
+ " \n",
+ " `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com`\n",
+ " \n",
+ " 2. Have following IAM roles enabled for you:\n",
+ " \n",
+ " * BigQuery Data Editor (roles/bigquery.dataEditor)\n",
+ " * BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n",
+ " * Cloud Functions Developer (roles/cloudfunctions.developer)\n",
+ " * Service Account User (roles/iam.serviceAccountUser)\n",
+ " * Storage Object Viewer (roles/storage.objectViewer)\n",
+ " * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n",
+ " \n",
+ " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n",
+ " \n",
+ " 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n",
+ " 2. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n",
+ " \n",
+ " Alternatively, the IAM could also be setup via the gcloud CLI:\n",
+ " \n",
+ " `$ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"`.\n",
+ " \n",
+ " Args:\n",
+ " input_types (list(type)):\n",
+ " List of input data types in the user defined function.\n",
+ " output_type (type):\n",
+ " Data type of the output in the user defined function.\n",
+ " dataset (str, Optional):\n",
+ " Dataset in which to create a BigQuery remote function. It should be in\n",
+ " `.` or `` format. If this\n",
+ " parameter is not provided then session dataset id is used.\n",
+ " bigquery_connection (str, Optional):\n",
+ " Name of the BigQuery connection. You should either have the\n",
+ " connection already created in the `location` you have chosen, or\n",
+ " you should have the Project IAM Admin role to enable the service\n",
+ " to create the connection for you if you need it.If this parameter is\n",
+ " not provided then the BigQuery connection from the session is used.\n",
+ " reuse (bool, Optional):\n",
+ " Reuse the remote function if already exists.\n",
+ " `True` by default, which will result in reusing an existing remote\n",
+ " function (if any) that was previously created for the same udf.\n",
+ " Setting it to false would force creating a unique remote function.\n",
+ " If the required remote function does not exist then it would be\n",
+ " created irrespective of this param.\n",
+ " Returns:\n",
+ " callable: A remote function object pointing to the cloud assets created\n",
+ " in the background to support the remote execution. The cloud assets can be\n",
+ " located through the following properties set in the object:\n",
+ " \n",
+ " `bigframes_cloud_function` - The google cloud function deployed for the user defined code.\n",
+ " \n",
+ " `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Tell the user what needs to be done offline before using BigQuery DataFrame\n",
+ "# remote functions\n",
+ "help(pd.remote_function)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "c9a8d03d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BigQuery DataFrames user is a data scientist and may not have privileges to\n",
+ "# create a BQ connector and set it up for invoking a cloud function. They\n",
+ "# should get such a connector created from their cloud admin and use it with\n",
+ "# BigQuery DataFrames remote functions. If the provided connection name does not\n",
+ "# exist, BigQuery DataFrames will try to create it on the fly assuming the user\n",
+ "# has sufficient privileges.\n",
+ "bq_connection_name = 'bigframes-rf-conn'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "fbc27f81",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[INFO][2023-08-18 21:23:29,687][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpl2ewfnue --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n",
+ "[INFO][2023-08-18 21:24:43,689][bigframes.remote_function] Successfully created cloud function bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 with uri (https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app)\n",
+ "[INFO][2023-08-18 21:24:57,348][bigframes.remote_function] Connector bigframes-rf-conn already exists\n",
+ "[INFO][2023-08-18 21:24:57,351][bigframes.remote_function] Creating BQ remote function: \n",
+ " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4(n INT64)\n",
+ " RETURNS INT64\n",
+ " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n",
+ " OPTIONS (\n",
+ " endpoint = \"https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app\"\n",
+ " )\n",
+ "[INFO][2023-08-18 21:24:58,300][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Wall time: 89.0601 s\n"
+ ]
+ }
+ ],
+ "source": [
+ "from time import perf_counter\n",
+ "start = perf_counter()\n",
+ "#-------------------------------------------------------------------------------------\n",
+ "\n",
+ "# User defined function\n",
+ "# https://www.codespeedy.com/find-nth-prime-number-in-python/\n",
+ "@pd.remote_function([int], int, bigquery_connection=bq_connection_name)\n",
+ "def nth_prime(n):\n",
+ " prime_numbers = [2,3]\n",
+ " i=3\n",
+ " if(02):\n",
+ " while (True):\n",
+ " i+=1\n",
+ " status = True\n",
+ " for j in range(2,int(i/2)+1):\n",
+ " if(i%j==0):\n",
+ " status = False\n",
+ " break\n",
+ " if(status==True):\n",
+ " prime_numbers.append(i)\n",
+ " if(len(prime_numbers)==n):\n",
+ " break\n",
+ " return prime_numbers[n-1]\n",
+ " else:\n",
+ " return -1\n",
+ "\n",
+ "#-------------------------------------------------------------------------------------\n",
+ "print(f\"\\nWall time: {(perf_counter()-start):.4f} s\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "c1c9355f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 16.8 ms, sys: 61 µs, total: 16.8 ms\n",
+ "Wall time: 17 ms\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2f840ad27c514ed19c759a004b32de33",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job 0f421233-9d02-4746-bb39-86a3b0880aba is RUNNING. \n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " score | \n",
+ " n_prime | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 11012908 | \n",
+ " you're welcome! according to the docs it shoul... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11013760 | \n",
+ " You *should* be concerned with the disk being ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 11013784 | \n",
+ " have you looked at `Integrate` or `NIntegrate`? | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 11015512 | \n",
+ " sorry, is a typo. The variable name is dist. (... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11016238 | \n",
+ " Pfff, I'm having trouble with that formula too... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 11016276 | \n",
+ " Thanks thinksteep! Does this mean that by usin... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 11016551 | \n",
+ " Jason, thanks for the reply. I've been workin... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11017973 | \n",
+ " I assume an `off` of 0.5 would put be exactly ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 11018225 | \n",
+ " Thank you very much. I do worry too much abou... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 11018370 | \n",
+ " @IanClelland, I edited my question a bit. The ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "10 rows × 4 columns
\n",
+ "[10 rows x 4 columns in total]"
+ ],
+ "text/plain": [
+ " id text score n_prime\n",
+ "0 11012908 you're welcome! according to the docs it shoul... 0 -1\n",
+ "1 11013760 You *should* be concerned with the disk being ... 0 -1\n",
+ "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 -1\n",
+ "3 11015512 sorry, is a typo. The variable name is dist. (... 0 -1\n",
+ "4 11016238 Pfff, I'm having trouble with that formula too... 0 -1\n",
+ "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 -1\n",
+ "6 11016551 Jason, thanks for the reply. I've been workin... 0 -1\n",
+ "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 -1\n",
+ "8 11018225 Thank you very much. I do worry too much abou... 0 -1\n",
+ "9 11018370 @IanClelland, I edited my question a bit. The ... 0 -1\n",
+ "\n",
+ "[10 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "# Let's apply the function to the dataframe\n",
+ "df = df.assign(n_prime=df['score'].apply(nth_prime))\n",
+ "df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "2701cb81",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n",
+ "projects/bigframes-dev/locations/us-central1/functions/bigframes-b0feb1fbaf8188b64d7e70118d93c5d4\n"
+ ]
+ }
+ ],
+ "source": [
+ "# We can see the path to the BQ remote function and the google cloud function\n",
+ "# that was created under the hood\n",
+ "print(nth_prime.bigframes_remote_function)\n",
+ "print(nth_prime.bigframes_cloud_function)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "920fa18e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Help on function read_gbq_function in module bigframes.pandas:\n",
+ "\n",
+ "read_gbq_function(function_name: 'str')\n",
+ " Loads a BigQuery function from BigQuery.\n",
+ " \n",
+ " Then it can be applied to a DataFrame or Series.\n",
+ " \n",
+ " Args:\n",
+ " function_name (str):\n",
+ " the function's name in BigQuery in the format\n",
+ " `project_id.dataset_id.function_name`, or\n",
+ " `dataset_id.function_name` to load from the default project, or\n",
+ " `function_name` to load from the default project and the dataset\n",
+ " associated with the current session.\n",
+ " \n",
+ " Returns:\n",
+ " callable: A function object pointing to the BigQuery function read\n",
+ " from BigQuery.\n",
+ " \n",
+ " The object is similar to the one created by the `remote_function`\n",
+ " decorator, including the `bigframes_remote_function` property, but\n",
+ " not including the `bigframes_cloud_function` property.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Let's try to simulate a scenario in which user shares this remote funciton to\n",
+ "# their colleague who simply wants to reuse it. BigFrames provides an API to do\n",
+ "# so via `read_gbq_function`. Usage details are available via `help` command.\n",
+ "help(pd.read_gbq_function)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "a6c9da0a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "EXISTING_REMOTE_FUNCTION=nth_prime.bigframes_remote_function\n",
+ "\n",
+ "# Let's read the existing remote function in bigframes\n",
+ "nth_prime_existing = pd.read_gbq_function(EXISTING_REMOTE_FUNCTION)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "d7e7de7f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 10.9 ms, sys: 0 ns, total: 10.9 ms\n",
+ "Wall time: 11.4 ms\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "73d1a73593cb4115821ab128c221a48d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HTML(value='Query job bec5f7d1-3df1-4292-8c68-c396bce7dc5d is RUNNING. \n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " text | \n",
+ " score | \n",
+ " n_prime | \n",
+ " n_prime_again | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 11012908 | \n",
+ " you're welcome! according to the docs it shoul... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 11013760 | \n",
+ " You *should* be concerned with the disk being ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 11013784 | \n",
+ " have you looked at `Integrate` or `NIntegrate`? | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 11015512 | \n",
+ " sorry, is a typo. The variable name is dist. (... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11016238 | \n",
+ " Pfff, I'm having trouble with that formula too... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 11016276 | \n",
+ " Thanks thinksteep! Does this mean that by usin... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 11016551 | \n",
+ " Jason, thanks for the reply. I've been workin... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 11017973 | \n",
+ " I assume an `off` of 0.5 would put be exactly ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 11018225 | \n",
+ " Thank you very much. I do worry too much abou... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 11018370 | \n",
+ " @IanClelland, I edited my question a bit. The ... | \n",
+ " 0 | \n",
+ " -1 | \n",
+ " -1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "10 rows × 5 columns
\n",
+ "[10 rows x 5 columns in total]"
+ ],
+ "text/plain": [
+ " id text score \\\n",
+ "0 11012908 you're welcome! according to the docs it shoul... 0 \n",
+ "1 11013760 You *should* be concerned with the disk being ... 0 \n",
+ "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 \n",
+ "3 11015512 sorry, is a typo. The variable name is dist. (... 0 \n",
+ "4 11016238 Pfff, I'm having trouble with that formula too... 0 \n",
+ "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 \n",
+ "6 11016551 Jason, thanks for the reply. I've been workin... 0 \n",
+ "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 \n",
+ "8 11018225 Thank you very much. I do worry too much abou... 0 \n",
+ "9 11018370 @IanClelland, I edited my question a bit. The ... 0 \n",
+ "\n",
+ " n_prime n_prime_again \n",
+ "0 -1 -1 \n",
+ "1 -1 -1 \n",
+ "2 -1 -1 \n",
+ "3 -1 -1 \n",
+ "4 -1 -1 \n",
+ "5 -1 -1 \n",
+ "6 -1 -1 \n",
+ "7 -1 -1 \n",
+ "8 -1 -1 \n",
+ "9 -1 -1 \n",
+ "\n",
+ "[10 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "# Let's apply the existing function to the dataframe\n",
+ "df = df.assign(n_prime_again=df['score'].apply(nth_prime_existing))\n",
+ "df.head(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/noxfile.py b/noxfile.py
index 7d4cb1c61b..1ceca6831b 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -19,6 +19,7 @@
from multiprocessing import Process
import os
import pathlib
+from pathlib import Path
import re
import shutil
from typing import Dict, List
@@ -592,21 +593,35 @@ def notebook(session):
session.install("-e", ".[all]")
session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake")
- notebooks = [
- "00 - Summary.ipynb",
- "01 - Getting Started.ipynb",
- "02 - DataFrame.ipynb",
- "03 - Using ML - ML fundamentals.ipynb",
- "04 - Using ML - SKLearn linear regression.ipynb",
- "05 - Using ML - Easy linear regression.ipynb",
- "06 - Using ML - Large Language Models.ipynb",
- "50 - Remote Function.ipynb",
+ notebooks_list = list(Path("notebooks/").glob("*/*.ipynb"))
+
+ denylist = [
+ # Regionalized testing is manually added later.
+ "notebooks/location/regionalized.ipynb",
+ # These notebooks contain special colab `param {type:"string"}`
+ # comments, which make it easy for customers to fill in their
+ # own information.
+ # TODO(ashleyxu): Test these notebooks by replacing parameters with
+ # appropriate values and omitting cleanup logic that may break
+ # our test infrastructure.
+ "notebooks/getting_started/getting_started_bq_dataframes.ipynb",
+ "notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb",
+ "notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb",
+ "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb",
+ # The experimental notebooks imagine features that don't yet
+ # exist or only exist as temporary prototypes.
+ "notebooks/experimental/longer_ml_demo.ipynb",
]
- notebooks = [os.path.join("notebooks", nb) for nb in notebooks]
+
+ # Convert each Path notebook object to a string using a list comprehension.
+ notebooks = [str(nb) for nb in notebooks_list]
+
+    # Remove notebooks that we have chosen not to test.
+ notebooks = list(filter(lambda nb: nb not in denylist, notebooks))
# Regionalized notebooks
notebooks_reg = {
- "10 - Regionalized.ipynb": [
+ "regionalized.ipynb": [
"asia-southeast1",
"eu",
"europe-west4",
@@ -616,7 +631,8 @@ def notebook(session):
]
}
notebooks_reg = {
- os.path.join("notebooks", nb): regions for nb, regions in notebooks_reg.items()
+ os.path.join("notebooks/location", nb): regions
+ for nb, regions in notebooks_reg.items()
}
# For some reason nbmake exits silently with "no tests ran" message if
diff --git a/owlbot.py b/owlbot.py
new file mode 100644
index 0000000000..4ba7d14eb5
--- /dev/null
+++ b/owlbot.py
@@ -0,0 +1,68 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""This script is used to synthesize generated parts of this library."""
+
+import pathlib
+
+from synthtool import gcp
+import synthtool as s
+from synthtool.languages import python
+
+REPO_ROOT = pathlib.Path(__file__).parent.absolute()
+
+common = gcp.CommonTemplates()
+
+# ----------------------------------------------------------------------------
+# Add templated files
+# ----------------------------------------------------------------------------
+
+templated_files = common.py_library(
+ unit_test_python_versions=["3.9", "3.10", "3.11"],
+ system_test_python_versions=["3.9", "3.11"],
+ cov_level=40,
+ intersphinx_dependencies={
+ "pandas": "https://pandas.pydata.org/pandas-docs/stable/",
+ "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/",
+ },
+)
+s.move(
+ templated_files,
+ excludes=[
+ # Multi-processing note isn't relevant, as pandas_gbq is responsible for
+ # creating clients, not the end user.
+ "docs/multiprocessing.rst",
+ "noxfile.py",
+ "README.rst",
+ ],
+)
+
+# ----------------------------------------------------------------------------
+# Fixup files
+# ----------------------------------------------------------------------------
+
+
+# ----------------------------------------------------------------------------
+# Samples templates
+# ----------------------------------------------------------------------------
+
+python.py_samples(skip_readmes=True)
+
+# ----------------------------------------------------------------------------
+# Final cleanup
+# ----------------------------------------------------------------------------
+
+s.shell.run(["nox", "-s", "blacken"], hide_output=False)
+for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"):
+ s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False)
diff --git a/pytest.ini b/pytest.ini
index 693439f47c..204c743bbf 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,3 +1,4 @@
[pytest]
+doctest_optionflags = NORMALIZE_WHITESPACE
filterwarnings =
ignore::pandas.errors.SettingWithCopyWarning
diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py
new file mode 100644
index 0000000000..37972672c3
--- /dev/null
+++ b/samples/snippets/remote_function.py
@@ -0,0 +1,147 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def run_remote_function_and_read_gbq_function(project_id: str):
+ your_gcp_project_id = project_id
+
+ # [START bigquery_dataframes_remote_function]
+ import bigframes.pandas as bpd
+
+ # Set BigQuery DataFrames options
+ bpd.options.bigquery.project = your_gcp_project_id
+ bpd.options.bigquery.location = "us"
+
+ # BigQuery DataFrames gives you the ability to turn your custom scalar
+ # functions into a BigQuery remote function. It requires the GCP project to
+    # be set up appropriately and the user to have sufficient privileges to use
+    # them. One can find more details about the usage and the requirements via
+    # the `help` command.
+ help(bpd.remote_function)
+
+ # Read a table and inspect the column of interest.
+ df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+ df["body_mass_g"].head(10)
+
+ # Define a custom function, and specify the intent to turn it into a remote
+ # function. It requires a BigQuery connection. If the connection is not
+ # already created, BigQuery DataFrames will attempt to create one assuming
+    # the necessary APIs and IAM permissions are set up in the project. In our
+    # examples we use a pre-created connection named
+ # `bigframes-rf-conn`. Let's try a `pandas`-like use case in which we want
+ # to apply a user defined scalar function to every value in a `Series`, more
+ # specifically bucketize the `body_mass_g` value of the penguins, which is a
+ # real number, into a category, which is a string.
+ @bpd.remote_function([float], str, bigquery_connection="bigframes-rf-conn")
+ def get_bucket(num):
+ if not num:
+ return "NA"
+ boundary = 4000
+ return "at_or_above_4000" if num >= boundary else "below_4000"
+
+    # Then we can apply the remote function on the `Series` of interest via the
+    # `apply` API and store the result in a new column in the DataFrame.
+ df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket))
+
+ # This will add a new column `body_mass_bucket` in the DataFrame. You can
+ # preview the original value and the bucketized value side by side.
+ df[["body_mass_g", "body_mass_bucket"]].head(10)
+
+    # The above operation was possible because all the computation happened in
+    # the cloud. For that, a Google Cloud Function was deployed by serializing
+    # the user code, and a BigQuery remote function was created to call that
+    # cloud function via its HTTP endpoint on the data in the DataFrame.
+
+ # The BigQuery remote function created to support the BigQuery DataFrames
+ # remote function can be located via a property `bigframes_remote_function`
+ # set in the remote function object.
+ print(f"Created BQ remote function: {get_bucket.bigframes_remote_function}")
+
+ # The cloud function can be located via another property
+ # `bigframes_cloud_function` set in the remote function object.
+ print(f"Created cloud function: {get_bucket.bigframes_cloud_function}")
+
+ # Warning: The deployed cloud function may be visible to other users with
+ # sufficient privilege in the project, so the user should be careful about
+ # having any sensitive data in the code that will be deployed as a remote
+ # function.
+
+ # Let's continue trying other potential use cases of remote functions. Let's
+ # say we consider the `species`, `island` and `sex` of the penguins
+    # sensitive information and want to redact them by replacing each value with
+    # its hash code instead. Let's define another scalar custom function and
+    # decorate it as a remote function.
+ @bpd.remote_function([str], str, bigquery_connection="bigframes-rf-conn")
+ def get_hash(input):
+ import hashlib
+
+ # handle missing value
+ if input is None:
+ input = ""
+ encoded_input = input.encode()
+ hash = hashlib.md5(encoded_input)
+ return hash.hexdigest()
+
+    # We can use this remote function with another `pandas`-like API, `map`,
+    # which can be applied to a DataFrame.
+ df_redacted = df[["species", "island", "sex"]].map(get_hash)
+ df_redacted.head(10)
+
+ # [END bigquery_dataframes_remote_function]
+
+ existing_get_bucket_bq_udf = get_bucket.bigframes_remote_function
+
+ # [START bigquery_dataframes_read_gbq_function]
+
+ # If you have already defined a custom function in BigQuery, either via the
+ # BigQuery Google Cloud Console or with the `remote_function` decorator,
+ # or otherwise, you may use it with BigQuery DataFrames with the
+ # `read_gbq_function` method. More details are available via the `help`
+ # command.
+ import bigframes.pandas as pd
+
+ help(pd.read_gbq_function)
+
+ # Here is an example of using `read_gbq_function` to load an existing
+ # BigQuery function.
+ df = pd.read_gbq("bigquery-public-data.ml_datasets.penguins")
+ get_bucket_function = pd.read_gbq_function(existing_get_bucket_bq_udf)
+
+ df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket_function))
+ df.head(10)
+
+ # It should be noted that if a function is created using the
+    # `remote_function` decorator, the BQ remote function it creates is accessible
+    # immediately afterward via the function's `bigframes_remote_function`
+ # attribute. The same string can be passed to `read_gbq_function` later in
+ # another context.
+
+ # [END bigquery_dataframes_read_gbq_function]
+
+ # Clean up cloud artifacts
+ session = bpd.get_global_session()
+ for function in (get_bucket, get_hash):
+ try:
+ session.bqclient.delete_routine(function.bigframes_remote_function)
+ except Exception:
+ # Ignore exception during clean-up
+ pass
+
+ try:
+ session.cloudfunctionsclient.delete_function(
+ name=function.bigframes_cloud_function
+ )
+ except Exception:
+ # Ignore exception during clean-up
+ pass
diff --git a/samples/snippets/remote_function_test.py b/samples/snippets/remote_function_test.py
new file mode 100644
index 0000000000..8b51e46b45
--- /dev/null
+++ b/samples/snippets/remote_function_test.py
@@ -0,0 +1,32 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+import bigframes.pandas
+
+from . import remote_function
+
+
+def test_remote_function_and_read_gbq_function(
+ capsys: pytest.CaptureFixture[str],
+) -> None:
+ # We need a fresh session since we're modifying connection options.
+ bigframes.pandas.reset_session()
+
+ # TODO(swast): Get project from environment so contributors can run tests.
+ remote_function.run_remote_function_and_read_gbq_function("bigframes-dev")
+ out, _ = capsys.readouterr()
+ assert "Created BQ remote function:" in out
+ assert "Created cloud function:" in out
diff --git a/setup.py b/setup.py
index 20f080b166..139873e6fc 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,8 @@
"google-cloud-bigquery[bqstorage,pandas] >=3.10.0",
"google-cloud-functions >=1.10.1",
"google-cloud-bigquery-connection >=1.12.0",
+ "google-cloud-iam >=2.12.1",
+ "google-cloud-resource-manager >=1.10.3",
"google-cloud-storage >=2.0.0",
# TODO: Relax upper bound once we have fixed `system_prerelease` tests.
"ibis-framework[bigquery] >=6.0.0,<=6.1.0",
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
index fe3d49ef20..523256ee83 100644
--- a/testing/constraints-3.9.txt
+++ b/testing/constraints-3.9.txt
@@ -34,6 +34,8 @@ google-cloud-bigquery-connection==1.12.0
google-cloud-bigquery-storage==2.19.1
google-cloud-core==2.3.2
google-cloud-functions==1.10.1
+google-cloud-iam==2.12.1
+google-cloud-resource-manager==1.10.3
google-cloud-storage==2.0.0
google-cloud-testutils==1.3.3
google-crc32c==1.5.0
diff --git a/tests/data/hockey_players.json b/tests/data/hockey_players.json
new file mode 100644
index 0000000000..8a9b252992
--- /dev/null
+++ b/tests/data/hockey_players.json
@@ -0,0 +1,37 @@
+[
+ {
+ "mode": "NULLABLE",
+ "name": "team_name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "position",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "player_name",
+ "type": "STRING"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "goals",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "assists",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "number",
+ "type": "INTEGER"
+ },
+ {
+ "mode": "NULLABLE",
+ "name": "season",
+ "type": "INTEGER"
+ }
+]
diff --git a/tests/data/hockey_players.jsonl b/tests/data/hockey_players.jsonl
new file mode 100644
index 0000000000..d2b26cffdd
--- /dev/null
+++ b/tests/data/hockey_players.jsonl
@@ -0,0 +1,10 @@
+{"team_name":"Canucks", "position":"C", "player_name":"Elias Petterson", "goals":39, "assists":63, "number":40, "season":2023}
+{"team_name":"Canucks", "position":"LW", "player_name":"Ilya Mikheyev", "goals":13, "assists":15, "number":65, "season":2023}
+{"team_name":"Canucks", "position":"RW", "player_name":"Andrei Kuzmenko", "goals":39, "assists":35, "number":40, "season":2023}
+{"team_name":"Kraken", "position":"C", "player_name":"Jared McCann", "goals":40, "assists":30, "number":19, "season":2023}
+{"team_name":"Kraken", "position":"LW", "player_name":"Yanni Gourde", "goals":14, "assists":34, "number":37, "season":2023}
+{"team_name":"Kraken", "position":"RW", "player_name":"Jordan Eberle", "goals":20, "assists":43, "number":7, "season":2023}
+{"team_name":"Canucks", "position":"C", "player_name":"Elias Petterson", "goals":32, "assists":36, "number":40, "season":2022}
+{"team_name":"Kraken", "position":"C", "player_name":"Jared McCann", "goals":27, "assists":23, "number":19, "season":2022}
+{"team_name":"Kraken", "position":"LW", "player_name":"Yanni Gourde", "goals":21, "assists":27, "number":37, "season":2022}
+{"team_name":"Kraken", "position":"RW", "player_name":"Jordan Eberle", "goals":21, "assists":23, "number":7, "season":2022}
diff --git a/tests/system/conftest.py b/tests/system/conftest.py
index 41c8eaffd7..3153bd1559 100644
--- a/tests/system/conftest.py
+++ b/tests/system/conftest.py
@@ -15,6 +15,7 @@
from datetime import datetime
import hashlib
import logging
+import math
import pathlib
import typing
from typing import Dict, Optional
@@ -23,6 +24,7 @@
import google.cloud.bigquery_connection_v1 as bigquery_connection_v1
import google.cloud.exceptions
import google.cloud.functions_v2 as functions_v2
+import google.cloud.resourcemanager_v3 as resourcemanager_v3
import google.cloud.storage as storage # type: ignore
import ibis.backends.base
import pandas as pd
@@ -101,6 +103,13 @@ def cloudfunctions_client(
return session.cloudfunctionsclient
+@pytest.fixture(scope="session")
+def resourcemanager_client(
+ session: bigframes.Session,
+) -> resourcemanager_v3.ProjectsClient:
+ return session.resourcemanagerclient
+
+
@pytest.fixture(scope="session")
def session() -> bigframes.Session:
return bigframes.Session()
@@ -211,6 +220,7 @@ def load_test_data_tables(
("scalars_too", "scalars_schema.json", "scalars.jsonl"),
("penguins", "penguins_schema.json", "penguins.jsonl"),
("time_series", "time_series_schema.json", "time_series.jsonl"),
+ ("hockey_players", "hockey_players.json", "hockey_players.jsonl"),
]:
test_data_hash = hashlib.md5()
_hash_digest_file(test_data_hash, DATA_DIR / schema_filename)
@@ -255,6 +265,11 @@ def scalars_table_id(test_data_tables) -> str:
return test_data_tables["scalars"]
+@pytest.fixture(scope="session")
+def hockey_table_id(test_data_tables) -> str:
+ return test_data_tables["hockey_players"]
+
+
@pytest.fixture(scope="session")
def scalars_table_id_2(test_data_tables) -> str:
return test_data_tables["scalars_too"]
@@ -354,6 +369,34 @@ def scalars_dfs(
return scalars_df_index, scalars_pandas_df_index
+@pytest.fixture(scope="session")
+def hockey_df(
+ hockey_table_id: str, session: bigframes.Session
+) -> bigframes.dataframe.DataFrame:
+ """DataFrame pointing at test data."""
+ return session.read_gbq(hockey_table_id)
+
+
+@pytest.fixture(scope="session")
+def hockey_pandas_df() -> pd.DataFrame:
+ """pd.DataFrame pointing at test data."""
+ df = pd.read_json(
+ DATA_DIR / "hockey_players.jsonl",
+ lines=True,
+ dtype={
+ "team_name": pd.StringDtype(storage="pyarrow"),
+ "position": pd.StringDtype(storage="pyarrow"),
+ "player_name": pd.StringDtype(storage="pyarrow"),
+ "goals": pd.Int64Dtype(),
+ "assists": pd.Int64Dtype(),
+ "number": pd.Int64Dtype(),
+ "season": pd.Int64Dtype(),
+ },
+ )
+ df.index = df.index.astype("Int64")
+ return df
+
+
@pytest.fixture(scope="session")
def penguins_df_default_index(
penguins_table_id: str, session: bigframes.Session
@@ -535,6 +578,38 @@ def penguins_kmeans_model_name(
return model_name
+@pytest.fixture(scope="session")
+def penguins_pca_model_name(
+ session: bigframes.Session, dataset_id_permanent, penguins_table_id
+) -> str:
+ """Provides a pretrained model as a test fixture that is cached across test runs.
+ This lets us run system tests without having to wait for a model.fit(...)"""
+ # TODO(garrettwu): Create a shared method to get different types of pretrained models.
+ sql = f"""
+CREATE OR REPLACE MODEL `$model_name`
+OPTIONS (
+ model_type='pca',
+ num_principal_components=3
+) AS SELECT
+ *
+FROM `{penguins_table_id}`"""
+ # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited
+ model_name = (
+ f"{dataset_id_permanent}.penguins_pca_{hashlib.md5(sql.encode()).hexdigest()}"
+ )
+ sql = sql.replace("$model_name", model_name)
+
+ try:
+        # Probe for the cached model; the finally clause returns the name either way.
+        session.bqclient.get_model(model_name)
+ except google.cloud.exceptions.NotFound:
+ logging.info(
+ "penguins_pca_model fixture was not found in the permanent dataset, regenerating it..."
+ )
+ session.bqclient.query(sql).result()
+ finally:
+ return model_name
+
+
@pytest.fixture(scope="session")
def penguins_xgbregressor_model_name(
session: bigframes.Session, dataset_id_permanent, penguins_table_id
@@ -721,3 +796,100 @@ def restore_sampling_settings():
yield
bigframes.options.sampling.enable_downsampling = enable_downsampling
bigframes.options.sampling.max_download_size = max_download_size
+
+
+@pytest.fixture()
+def weird_strings_pd():
+ df = pd.DataFrame(
+ {
+ "string_col": [
+ "٠١٢٣٤٥٦٧٨٩",
+ "",
+ "0",
+ "字",
+ "五",
+ "0123456789",
+ pd.NA,
+ "abc 123 mixed letters and numbers",
+ "no numbers here",
+ "123a",
+ "23!",
+ " 45",
+ "a45",
+ "Dž",
+ "tT",
+ "-123",
+ "-123.4",
+ "-0",
+ "-.0",
+ ".0",
+ ".1",
+ "⅙",
+ "²",
+ "\t",
+ "a\ta",
+ "p1\np2",
+ " ",
+ ]
+ },
+ dtype=pd.StringDtype(storage="pyarrow"),
+ )
+ df.index = df.index.astype("Int64")
+ return df.string_col
+
+
+@pytest.fixture()
+def weird_strings(session, weird_strings_pd):
+ return session.read_pandas(weird_strings_pd.to_frame()).string_col
+
+
+@pytest.fixture()
+def floats_pd():
+ df = pd.DataFrame(
+ {
+ "float64_col": [
+ float("-inf"),
+ float("inf"),
+ float("nan"),
+ float(-234239487.4),
+ float(-1.0),
+ float(-0.000000001),
+ float(0),
+ float(0.000000001),
+ float(0.9999999999),
+ float(1.0),
+ float(1.0000001),
+ float(math.pi / 2),
+ float(math.e),
+ float(math.pi),
+ float(234239487.4),
+ float(1.23124 * (2**70)),
+ pd.NA,
+ ]
+ },
+ dtype=pd.Float64Dtype(),
+ )
+ # Index helps debug failed cases
+ df.index = df.float64_col
+ # Upload fails if index name same as column name
+ df.index.name = None
+ return df.float64_col
+
+
+@pytest.fixture()
+def floats_product_pd(floats_pd):
+ df = pd.merge(floats_pd, floats_pd, how="cross")
+ # Index helps debug failed cases
+ df = df.set_index([df.float64_col_x, df.float64_col_y])
+ df.index.names = ["left", "right"]
+ return df
+
+
+@pytest.fixture()
+def floats_bf(session, floats_pd):
+ return session.read_pandas(floats_pd.to_frame()).float64_col
+
+
+@pytest.fixture()
+def floats_product_bf(session, floats_product_pd):
+ return session.read_pandas(floats_product_pd)
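The `penguins_pca_model_name` fixture caches a trained model across runs by embedding the MD5 of its CREATE MODEL SQL in the model name: editing the SQL changes the name and forces a rebuild, while unchanged SQL reuses the cached model. The naming scheme in isolation:

```python
import hashlib


def cached_model_name(dataset_id: str, prefix: str, sql: str) -> str:
    # The SQL hash acts as a cache key: any edit yields a fresh model name.
    return f"{dataset_id}.{prefix}_{hashlib.md5(sql.encode()).hexdigest()}"


a = cached_model_name("perm", "penguins_pca", "... num_principal_components=3 ...")
b = cached_model_name("perm", "penguins_pca", "... num_principal_components=4 ...")
assert a != b  # changed SQL regenerates; identical SQL reuses the cached model
```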
diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py
index 0c2744819d..b65baa63eb 100644
--- a/tests/system/large/ml/test_compose.py
+++ b/tests/system/large/ml/test_compose.py
@@ -21,7 +21,7 @@
import bigframes.ml.preprocessing
-def test_columntransformer_standalone_fit_transform(
+def test_columntransformer_standalone_fit_and_transform(
penguins_df_default_index, new_penguins_df
):
transformer = bigframes.ml.compose.ColumnTransformer(
@@ -73,3 +73,54 @@ def test_columntransformer_standalone_fit_transform(
)
pandas.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
+def test_columntransformer_standalone_fit_transform(new_penguins_df):
+ transformer = bigframes.ml.compose.ColumnTransformer(
+ [
+ (
+ "onehot",
+ bigframes.ml.preprocessing.OneHotEncoder(),
+ "species",
+ ),
+ (
+ "scale",
+ bigframes.ml.preprocessing.StandardScaler(),
+ ["culmen_length_mm", "flipper_length_mm"],
+ ),
+ ]
+ )
+
+ result = transformer.fit_transform(
+ new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]]
+ ).to_pandas()
+
+    # TODO: bug? Feature columns seem to come back in nondeterministic order.
+    # Workaround: sort columns by name. Can't repro it in pantheon, so it could
+    # be a bigframes issue.
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pandas.DataFrame(
+ {
+ "onehotencoded_species": [
+ [{"index": 1, "value": 1.0}],
+ [{"index": 1, "value": 1.0}],
+ [{"index": 2, "value": 1.0}],
+ ],
+ "scaled_culmen_length_mm": [
+ 1.313249,
+ -0.20198,
+ -1.111118,
+ ],
+ "scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338],
+ },
+ index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"),
+ )
+ expected.scaled_culmen_length_mm = expected.scaled_culmen_length_mm.astype(
+ "Float64"
+ )
+ expected.scaled_flipper_length_mm = expected.scaled_flipper_length_mm.astype(
+ "Float64"
+ )
+
+ pandas.testing.assert_frame_equal(result, expected, rtol=1e-3)
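The new test exercises `fit_transform` in a single call; under the usual sklearn-style contract it should agree with `fit` followed by `transform`. A hedged helper expressing that contract (pass in any bigframes transformer and DataFrame like those in the tests above):

```python
import pandas as pd


def check_fit_transform_contract(transformer, df) -> None:
    one_shot = transformer.fit_transform(df).to_pandas()
    transformer.fit(df)
    two_step = transformer.transform(df).to_pandas()
    # Column order can vary (see the TODO above), so sort before comparing.
    one_shot = one_shot.reindex(sorted(one_shot.columns), axis=1)
    two_step = two_step.reindex(sorted(two_step.columns), axis=1)
    pd.testing.assert_frame_equal(one_shot, two_step, rtol=1e-3)
```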
diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py
index ab33e5d718..133af2dae4 100644
--- a/tests/system/large/ml/test_core.py
+++ b/tests/system/large/ml/test_core.py
@@ -20,7 +20,7 @@
def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df):
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -30,10 +30,10 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d
"sex",
]
]
- train_y = df[["body_mass_g"]]
+ y_train = df[["body_mass_g"]]
model = bigframes.ml.core.create_bqml_model(
- train_X, train_y, options={"model_type": "linear_reg"}
+ X_train, y_train, options={"model_type": "linear_reg"}
)
# no data - report evaluation from the automatic data split
@@ -85,22 +85,22 @@ def test_bqml_manual_preprocessing_e2e(
session, dataset_id, penguins_df_default_index, new_penguins_df
):
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"culmen_length_mm",
"culmen_depth_mm",
"flipper_length_mm",
]
]
- train_y = df[["body_mass_g"]]
+ y_train = df[["body_mass_g"]]
transforms = [
bigframes.ml.sql.ml_standard_scaler(column, column)
- for column in train_X.columns.tolist()
+ for column in X_train.columns.tolist()
]
- transforms.extend(train_y.columns.tolist())
+ transforms.extend(y_train.columns.tolist())
options = {"model_type": "linear_reg"}
model = bigframes.ml.core.create_bqml_model(
- train_X, train_y, transforms=transforms, options=options
+ X_train, y_train, transforms=transforms, options=options
)
# no data - report evaluation from the automatic data split
diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py
index 88c5ccd2f0..9b2872d673 100644
--- a/tests/system/large/ml/test_ensemble.py
+++ b/tests/system/large/ml/test_ensemble.py
@@ -25,7 +25,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id):
model = bigframes.ml.ensemble.XGBRegressor()
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -35,11 +35,11 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id):
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [97.368139],
@@ -86,7 +86,7 @@ def test_xgbregressor_dart_booster_multiple_params(
)
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -96,11 +96,11 @@ def test_xgbregressor_dart_booster_multiple_params(
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"mean_absolute_error",
@@ -144,7 +144,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id):
model = bigframes.ml.ensemble.XGBClassifier()
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -153,11 +153,11 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id):
"flipper_length_mm",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -201,7 +201,7 @@ def test_xgbclassifier_dart_booster_multiple_params(
)
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -210,11 +210,11 @@ def test_xgbclassifier_dart_booster_multiple_params(
"flipper_length_mm",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -258,7 +258,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset
model = bigframes.ml.ensemble.RandomForestRegressor()
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -268,11 +268,11 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"mean_absolute_error",
@@ -311,7 +311,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase
)
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -321,11 +321,11 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"mean_absolute_error",
@@ -366,7 +366,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase
model = bigframes.ml.ensemble.RandomForestClassifier()
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -375,11 +375,11 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase
"flipper_length_mm",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -418,7 +418,7 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas
)
df = penguins_df_default_index.dropna().sample(n=70)
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -427,11 +427,11 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas
"flipper_length_mm",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py
index d1e2d12296..33b835e852 100644
--- a/tests/system/large/ml/test_forecasting.py
+++ b/tests/system/large/ml/test_forecasting.py
@@ -21,9 +21,9 @@ def test_arima_plus_model_fit_score(
time_series_df_default_index, dataset_id, new_time_series_df
):
model = forecasting.ARIMAPlus()
- train_X = time_series_df_default_index[["parsed_date"]]
- train_y = time_series_df_default_index[["total_visits"]]
- model.fit(train_X, train_y)
+ X_train = time_series_df_default_index[["parsed_date"]]
+ y_train = time_series_df_default_index[["total_visits"]]
+ model.fit(X_train, y_train)
result = model.score(
new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]]
diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py
index 332b460fe5..3b90568450 100644
--- a/tests/system/large/ml/test_linear_model.py
+++ b/tests/system/large/ml/test_linear_model.py
@@ -21,7 +21,7 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase
model = bigframes.ml.linear_model.LinearRegression(fit_intercept=False)
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -31,11 +31,11 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
expected = pd.DataFrame(
{
"mean_absolute_error": [225.735767],
@@ -66,7 +66,7 @@ def test_linear_regression_manual_split_configure_fit_score(
model = bigframes.ml.linear_model.LinearRegression(fit_intercept=True)
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -76,11 +76,11 @@ def test_linear_regression_manual_split_configure_fit_score(
"sex",
]
]
- train_y = df[["body_mass_g"]]
- model.fit(train_X, train_y)
+ y_train = df[["body_mass_g"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
expected = pd.DataFrame(
{
"mean_absolute_error": [225.735767],
@@ -108,7 +108,7 @@ def test_logistic_regression_auto_class_weights_configure_fit_score(
):
model = bigframes.ml.linear_model.LogisticRegression()
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -117,11 +117,11 @@ def test_logistic_regression_auto_class_weights_configure_fit_score(
"flipper_length_mm",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
expected = pd.DataFrame(
{
"precision": [0.58085],
@@ -155,7 +155,7 @@ def test_logistic_regression_manual_split_configure_fit_score(
model = bigframes.ml.linear_model.LogisticRegression(fit_intercept=True)
df = penguins_df_default_index.dropna()
- train_X = df[
+ X_train = df[
[
"species",
"island",
@@ -165,11 +165,11 @@ def test_logistic_regression_manual_split_configure_fit_score(
"body_mass_g",
]
]
- train_y = df[["sex"]]
- model.fit(train_X, train_y)
+ y_train = df[["sex"]]
+ model.fit(X_train, y_train)
# Check score to ensure the model was fitted
- result = model.score(train_X, train_y).to_pandas()
+ result = model.score(X_train, y_train).to_pandas()
expected = pd.DataFrame(
{
"precision": [0.616753],
diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py
index 87ea46f969..9ca5a2fd0e 100644
--- a/tests/system/small/ml/conftest.py
+++ b/tests/system/small/ml/conftest.py
@@ -12,12 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import hashlib
-import logging
from typing import cast
import uuid
-import google.cloud.exceptions
import pandas as pd
import pytest
@@ -35,8 +32,8 @@
@pytest.fixture(scope="session")
-def ml_connection() -> str:
- return "bigframes-dev.us.bigframes-ml"
+def bq_connection() -> str:
+ return "bigframes-dev.us.bigframes-rf-conn"
@pytest.fixture(scope="session")
@@ -56,11 +53,21 @@ def ephemera_penguins_bqml_linear_model(
@pytest.fixture(scope="session")
-def penguins_bqml_kmeans_model(session, penguins_kmeans_model_name) -> core.BqmlModel:
+def penguins_bqml_kmeans_model(
+ session: bigframes.Session, penguins_kmeans_model_name: str
+) -> core.BqmlModel:
model = session.bqclient.get_model(penguins_kmeans_model_name)
return core.BqmlModel(session, model)
+@pytest.fixture(scope="session")
+def penguins_bqml_pca_model(
+ session: bigframes.Session, penguins_pca_model_name: str
+) -> core.BqmlModel:
+ model = session.bqclient.get_model(penguins_pca_model_name)
+ return core.BqmlModel(session, model)
+
+
@pytest.fixture(scope="session")
def penguins_linear_model(
session, penguins_linear_model_name: str
@@ -140,32 +147,12 @@ def penguins_kmeans_model(session, penguins_kmeans_model_name: str) -> cluster.K
@pytest.fixture(scope="session")
def penguins_pca_model(
- session: bigframes.Session, dataset_id_permanent, penguins_table_id
+ session: bigframes.Session, penguins_pca_model_name: str
) -> decomposition.PCA:
-
- # TODO(yunmengxie): Create a shared method to get different types of pretrained models.
- sql = f"""
-CREATE OR REPLACE MODEL `$model_name`
-OPTIONS (
- model_type='pca',
- num_principal_components=3
-) AS SELECT
- *
-FROM `{penguins_table_id}`"""
- # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited
- model_name = (
- f"{dataset_id_permanent}.penguins_pca_{hashlib.md5(sql.encode()).hexdigest()}"
+ return cast(
+ decomposition.PCA,
+ session.read_gbq_model(penguins_pca_model_name),
)
- sql = sql.replace("$model_name", model_name)
-
- try:
- return session.read_gbq_model(model_name)
- except google.cloud.exceptions.NotFound:
- logging.info(
- "penguins_pca_model fixture was not found in the permanent dataset, regenerating it..."
- )
- session.bqclient.query(sql).result()
- return session.read_gbq_model(model_name)
@pytest.fixture(scope="session")
@@ -211,33 +198,33 @@ def llm_text_df(session, llm_text_pandas_df):
@pytest.fixture(scope="session")
-def bqml_palm2_text_generator_model(session, ml_connection) -> core.BqmlModel:
+def bqml_palm2_text_generator_model(session, bq_connection) -> core.BqmlModel:
options = {
"remote_service_type": "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1",
}
return core.create_bqml_remote_model(
- session=session, connection_name=ml_connection, options=options
+ session=session, connection_name=bq_connection, options=options
)
@pytest.fixture(scope="session")
-def palm2_text_generator_model(session, ml_connection) -> llm.PaLM2TextGenerator:
- return llm.PaLM2TextGenerator(session=session, connection_name=ml_connection)
+def palm2_text_generator_model(session, bq_connection) -> llm.PaLM2TextGenerator:
+ return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection)
@pytest.fixture(scope="function")
def ephemera_palm2_text_generator_model(
- session, ml_connection
+ session, bq_connection
) -> llm.PaLM2TextGenerator:
- return llm.PaLM2TextGenerator(session=session, connection_name=ml_connection)
+ return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection)
@pytest.fixture(scope="session")
def palm2_embedding_generator_model(
- session, ml_connection
+ session, bq_connection
) -> llm.PaLM2TextEmbeddingGenerator:
return llm.PaLM2TextEmbeddingGenerator(
- session=session, connection_name=ml_connection
+ session=session, connection_name=bq_connection
)
@@ -260,10 +247,22 @@ def time_series_arima_plus_model(
@pytest.fixture(scope="session")
-def imported_tensorflow_model(session) -> imported.TensorFlowModel:
+def imported_tensorflow_model_path() -> str:
+ return "gs://cloud-training-demos/txtclass/export/exporter/1549825580/*"
+
+
+@pytest.fixture(scope="session")
+def imported_onnx_model_path() -> str:
+ return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx"
+
+
+@pytest.fixture(scope="session")
+def imported_tensorflow_model(
+ session, imported_tensorflow_model_path
+) -> imported.TensorFlowModel:
return imported.TensorFlowModel(
session=session,
- model_path="gs://cloud-training-demos/txtclass/export/exporter/1549825580/*",
+ model_path=imported_tensorflow_model_path,
)
@@ -276,8 +275,8 @@ def ephemera_imported_tensorflow_model(session) -> imported.TensorFlowModel:
@pytest.fixture(scope="session")
-def imported_onnx_model(session) -> imported.ONNXModel:
+def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel:
return imported.ONNXModel(
session=session,
- model_path="gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx",
+ model_path=imported_onnx_model_path,
)
diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py
index 4b184b0d4c..6c3e8e06f5 100644
--- a/tests/system/small/ml/test_core.py
+++ b/tests/system/small/ml/test_core.py
@@ -140,6 +140,100 @@ def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel):
)
+def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel):
+ result = penguins_bqml_pca_model.principal_components().to_pandas()
+ assert result.shape == (21, 4)
+
+    # The full result is long; check only the first principal component here.
+ result = result.head(7)
+ expected = pd.DataFrame(
+ {
+ "principal_component_id": [0] * 7,
+ "feature": [
+ "species",
+ "island",
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "body_mass_g",
+ "sex",
+ ],
+ "numerical_value": [
+ pd.NA,
+ pd.NA,
+ 0.401489,
+ -0.377482,
+ 0.524052,
+ 0.501174,
+ pd.NA,
+ ],
+ "categorical_value": [
+ [
+ {
+ "category": "Gentoo penguin (Pygoscelis papua)",
+ "value": 0.25068877125667804,
+ },
+ {
+ "category": "Adelie Penguin (Pygoscelis adeliae)",
+ "value": -0.20622291900416198,
+ },
+ {
+ "category": "Chinstrap penguin (Pygoscelis antarctica)",
+ "value": -0.030161149275185855,
+ },
+ ],
+ [
+ {"category": "Biscoe", "value": 0.19761120114410635},
+ {"category": "Dream", "value": -0.11264736305259061},
+ {"category": "Torgersen", "value": -0.07065913511418596},
+ ],
+ [],
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.0015916894448071784},
+ {"category": "MALE", "value": 0.06869704739750442},
+ {"category": "FEMALE", "value": -0.052521171596813174},
+ {"category": "_null_filler", "value": -0.0034628622681684906},
+ ],
+ ],
+ },
+ )
+ pd.testing.assert_frame_equal(
+ result,
+ expected,
+ check_exact=False,
+ rtol=0.1,
+ # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame
+ check_index_type=False,
+ check_dtype=False,
+ )
+
+
+def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlModel):
+ result = penguins_bqml_pca_model.principal_component_info().to_pandas()
+ assert result.shape == (3, 4)
+
+ expected = pd.DataFrame(
+ {
+ "principal_component_id": [0, 1, 2],
+ "eigenvalue": [3.278657, 1.270829, 1.125354],
+ "explained_variance_ratio": [0.469357, 0.181926, 0.1611],
+ "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383],
+ },
+ )
+ pd.testing.assert_frame_equal(
+ result,
+ expected,
+ check_exact=False,
+ rtol=0.1,
+ # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame
+ check_index_type=False,
+ check_dtype=False,
+ )
+
+
def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_df):
predictions = penguins_bqml_linear_model.predict(new_penguins_df).to_pandas()
expected = pd.DataFrame(
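The expected `principal_component_info` values are internally consistent under the standard PCA reading: each `explained_variance_ratio` is its eigenvalue divided by a common total variance, and the cumulative column is a running sum. A quick arithmetic check on the numbers above:

```python
import math

eigenvalues = [3.278657, 1.270829, 1.125354]
ratios = [0.469357, 0.181926, 0.1611]
cumulative = [0.469357, 0.651283, 0.812383]

# eigenvalue / ratio recovers (roughly) the same total variance on every row.
totals = [e / r for e, r in zip(eigenvalues, ratios)]
assert all(math.isclose(t, totals[0], rel_tol=1e-3) for t in totals)

# The cumulative column is a running sum of the per-component ratios.
running = 0.0
for r, c in zip(ratios, cumulative):
    running += r
    assert math.isclose(running, c, rel_tol=1e-4)
```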
diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py
index 01d5207750..8df4145fcf 100644
--- a/tests/system/small/ml/test_decomposition.py
+++ b/tests/system/small/ml/test_decomposition.py
@@ -55,7 +55,7 @@ def test_pca_predict(session, penguins_pca_model: decomposition.PCA):
)
-def test_pca_score(session, penguins_pca_model: decomposition.PCA):
+def test_pca_score(penguins_pca_model: decomposition.PCA):
result = penguins_pca_model.score().to_pandas()
expected = pd.DataFrame(
{"total_explained_variance_ratio": [0.812383]},
@@ -68,3 +68,110 @@ def test_pca_score(session, penguins_pca_model: decomposition.PCA):
rtol=0.1,
check_index_type=False,
)
+
+
+def test_pca_components_(penguins_pca_model: decomposition.PCA):
+ result = penguins_pca_model.components_.to_pandas()
+
+    # The full result is long; check only the first principal component here.
+ result = result.head(7)
+ expected = pd.DataFrame(
+ {
+ "principal_component_id": [0] * 7,
+ "feature": [
+ "species",
+ "island",
+ "culmen_length_mm",
+ "culmen_depth_mm",
+ "flipper_length_mm",
+ "body_mass_g",
+ "sex",
+ ],
+ "numerical_value": [
+ pd.NA,
+ pd.NA,
+ 0.401489,
+ -0.377482,
+ 0.524052,
+ 0.501174,
+ pd.NA,
+ ],
+ "categorical_value": [
+ [
+ {
+ "category": "Gentoo penguin (Pygoscelis papua)",
+ "value": 0.25068877125667804,
+ },
+ {
+ "category": "Adelie Penguin (Pygoscelis adeliae)",
+ "value": -0.20622291900416198,
+ },
+ {
+ "category": "Chinstrap penguin (Pygoscelis antarctica)",
+ "value": -0.030161149275185855,
+ },
+ ],
+ [
+ {"category": "Biscoe", "value": 0.19761120114410635},
+ {"category": "Dream", "value": -0.11264736305259061},
+ {"category": "Torgersen", "value": -0.07065913511418596},
+ ],
+ [],
+ [],
+ [],
+ [],
+ [
+ {"category": ".", "value": 0.0015916894448071784},
+ {"category": "MALE", "value": 0.06869704739750442},
+ {"category": "FEMALE", "value": -0.052521171596813174},
+ {"category": "_null_filler", "value": -0.0034628622681684906},
+ ],
+ ],
+ },
+ )
+ pd.testing.assert_frame_equal(
+ result,
+ expected,
+ check_exact=False,
+ rtol=0.1,
+ check_index_type=False,
+ check_dtype=False,
+ )
+
+
+def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA):
+ result = penguins_pca_model.explained_variance_.to_pandas()
+
+ expected = pd.DataFrame(
+ {
+ "principal_component_id": [0, 1, 2],
+ "explained_variance": [3.278657, 1.270829, 1.125354],
+ },
+ )
+ pd.testing.assert_frame_equal(
+ result,
+ expected,
+ check_exact=False,
+ rtol=0.1,
+ check_index_type=False,
+ check_dtype=False,
+ )
+
+
+def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA):
+ result = penguins_pca_model.explained_variance_ratio_.to_pandas()
+
+ expected = pd.DataFrame(
+ {
+ "principal_component_id": [0, 1, 2],
+ "explained_variance_ratio": [0.469357, 0.181926, 0.1611],
+ },
+ )
+ pd.testing.assert_frame_equal(
+ result,
+ expected,
+ check_exact=False,
+ rtol=0.1,
+ check_index_type=False,
+ check_dtype=False,
+ )
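These attribute tests line up with `test_pca_score` above: the `total_explained_variance_ratio` it expects is simply the sum of the per-component ratios.

```python
ratios = [0.469357, 0.181926, 0.1611]  # explained_variance_ratio_ above
assert abs(sum(ratios) - 0.812383) < 1e-6  # total expected by test_pca_score
```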
diff --git a/tests/system/small/ml/test_ensemble.py b/tests/system/small/ml/test_ensemble.py
index fde3cc431e..bba083d98d 100644
--- a/tests/system/small/ml/test_ensemble.py
+++ b/tests/system/small/ml/test_ensemble.py
@@ -25,7 +25,7 @@ def test_xgbregressor_model_score(
penguins_xgbregressor_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -35,8 +35,8 @@ def test_xgbregressor_model_score(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = penguins_xgbregressor_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [108.77582],
@@ -62,7 +62,7 @@ def test_xgbregressor_model_score_series(
penguins_xgbregressor_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -72,8 +72,8 @@ def test_xgbregressor_model_score_series(
"body_mass_g",
]
]
- test_y = df["sex"]
- result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas()
+ y_test = df["sex"]
+ result = penguins_xgbregressor_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [108.77582],
@@ -120,7 +120,7 @@ def test_to_gbq_saved_xgbregressor_model_scores(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -130,8 +130,8 @@ def test_to_gbq_saved_xgbregressor_model_scores(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [109.016973],
@@ -165,7 +165,7 @@ def test_xgbclassifier_model_score(
penguins_xgbclassifier_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -175,8 +175,8 @@ def test_xgbclassifier_model_score(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = penguins_xgbclassifier_model.score(X_test, y_test).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -193,7 +193,7 @@ def test_xgbclassifier_model_score_series(
penguins_xgbclassifier_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -203,8 +203,8 @@ def test_xgbclassifier_model_score_series(
"body_mass_g",
]
]
- test_y = df["sex"]
- result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas()
+ y_test = df["sex"]
+ result = penguins_xgbclassifier_model.score(X_test, y_test).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -242,7 +242,7 @@ def test_to_gbq_saved_xgbclassifier_model_scores(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -252,8 +252,8 @@ def test_to_gbq_saved_xgbclassifier_model_scores(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"precision": [1.0],
@@ -289,7 +289,7 @@ def test_randomforestregressor_model_score(
penguins_randomforest_regressor_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -299,8 +299,8 @@ def test_randomforestregressor_model_score(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = penguins_randomforest_regressor_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [317.031042],
@@ -326,7 +326,7 @@ def test_randomforestregressor_model_score_series(
penguins_randomforest_regressor_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -336,8 +336,8 @@ def test_randomforestregressor_model_score_series(
"body_mass_g",
]
]
- test_y = df["sex"]
- result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas()
+ y_test = df["sex"]
+ result = penguins_randomforest_regressor_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [317.031042],
@@ -385,7 +385,7 @@ def test_to_gbq_saved_randomforestregressor_model_scores(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -395,8 +395,8 @@ def test_to_gbq_saved_randomforestregressor_model_scores(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [319.239235],
@@ -434,7 +434,7 @@ def test_randomforestclassifier_model_score(
penguins_randomforest_classifier_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -444,8 +444,8 @@ def test_randomforestclassifier_model_score(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = penguins_randomforest_classifier_model.score(X_test, y_test).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -462,7 +462,7 @@ def test_randomforestclassifier_model_score_series(
penguins_randomforest_classifier_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -472,8 +472,8 @@ def test_randomforestclassifier_model_score_series(
"body_mass_g",
]
]
- test_y = df["sex"]
- result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas()
+ y_test = df["sex"]
+ result = penguins_randomforest_classifier_model.score(X_test, y_test).to_pandas()
TestCase().assertSequenceEqual(result.shape, (1, 6))
for col_name in [
"precision",
@@ -512,7 +512,7 @@ def test_to_gbq_saved_randomforestclassifier_model_scores(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -522,8 +522,8 @@ def test_to_gbq_saved_randomforestclassifier_model_scores(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"precision": [0.636746],
diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py
index 6274ab1245..d305567066 100644
--- a/tests/system/small/ml/test_imported.py
+++ b/tests/system/small/ml/test_imported.py
@@ -25,6 +25,11 @@ def test_tensorflow_create_model(imported_tensorflow_model):
assert imported_tensorflow_model is not None
+def test_tensorflow_create_model_default_session(imported_tensorflow_model_path):
+ model = imported.TensorFlowModel(model_path=imported_tensorflow_model_path)
+ assert model is not None
+
+
def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df):
df = llm_text_df.rename(columns={"prompt": "input"})
result = imported_tensorflow_model.predict(df).to_pandas()
@@ -61,6 +66,11 @@ def test_onnx_create_model(imported_onnx_model):
assert imported_onnx_model is not None
+def test_onnx_create_model_default_session(imported_onnx_model_path):
+    model = imported.ONNXModel(model_path=imported_onnx_model_path)
+ assert model is not None
+
+
def test_onnx_model_predict(imported_onnx_model, onnx_iris_df):
result = imported_onnx_model.predict(onnx_iris_df).to_pandas()
value1 = np.array([0.9999993443489075, 0.0, 0.0])
diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py
index bbb7e2820c..3a8232ed9e 100644
--- a/tests/system/small/ml/test_linear_model.py
+++ b/tests/system/small/ml/test_linear_model.py
@@ -19,7 +19,7 @@
def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -29,8 +29,8 @@ def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index
"sex",
]
]
- test_y = df[["body_mass_g"]]
- result = penguins_linear_model.score(test_X, test_y).to_pandas()
+ y_test = df[["body_mass_g"]]
+ result = penguins_linear_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [225.817334],
@@ -56,7 +56,7 @@ def test_linear_reg_model_score_series(
penguins_linear_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -66,8 +66,8 @@ def test_linear_reg_model_score_series(
"sex",
]
]
- test_y = df["body_mass_g"]
- result = penguins_linear_model.score(test_X, test_y).to_pandas()
+ y_test = df["body_mass_g"]
+ result = penguins_linear_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [225.817334],
@@ -111,7 +111,7 @@ def test_to_gbq_saved_linear_reg_model_scores(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -121,8 +121,8 @@ def test_to_gbq_saved_linear_reg_model_scores(
"sex",
]
]
- test_y = df[["body_mass_g"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["body_mass_g"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"mean_absolute_error": [227.01223],
@@ -152,7 +152,7 @@ def test_to_gbq_replace(penguins_linear_model, dataset_id):
def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -162,8 +162,8 @@ def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = penguins_logistic_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = penguins_logistic_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"precision": [0.616753],
@@ -189,7 +189,7 @@ def test_logistic_model_score_series(
penguins_logistic_model, penguins_df_default_index
):
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -199,8 +199,8 @@ def test_logistic_model_score_series(
"body_mass_g",
]
]
- test_y = df["sex"]
- result = penguins_logistic_model.score(test_X, test_y).to_pandas()
+ y_test = df["sex"]
+ result = penguins_logistic_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"precision": [0.616753],
@@ -244,7 +244,7 @@ def test_logsitic_model_to_gbq_saved_score(
f"{dataset_id}.test_penguins_model", replace=True
)
df = penguins_df_default_index.dropna()
- test_X = df[
+ X_test = df[
[
"species",
"island",
@@ -254,8 +254,8 @@ def test_logsitic_model_to_gbq_saved_score(
"body_mass_g",
]
]
- test_y = df[["sex"]]
- result = saved_model.score(test_X, test_y).to_pandas()
+ y_test = df[["sex"]]
+ result = saved_model.score(X_test, y_test).to_pandas()
expected = pandas.DataFrame(
{
"precision": [0.616753],
diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py
index 74356c81e1..7486277487 100644
--- a/tests/system/small/ml/test_llm.py
+++ b/tests/system/small/ml/test_llm.py
@@ -15,13 +15,31 @@
from unittest import TestCase
import numpy as np
+import pytest
+
+from bigframes.ml import llm
def test_create_text_generator_model(palm2_text_generator_model):
# Model creation doesn't return error
assert palm2_text_generator_model is not None
+ assert palm2_text_generator_model._bqml_model is not None
+
+
+def test_create_text_generator_model_defaults(bq_connection):
+ import bigframes.pandas as bpd
+
+ bpd.reset_session()
+ bpd.options.bigquery.bq_connection = bq_connection
+ bpd.options.bigquery.location = "us"
+
+ model = llm.PaLM2TextGenerator()
+ assert model is not None
+ assert model._bqml_model is not None
+# Marked as flaky because BQML LLM is in preview: the service has limited capacity and is not yet stable.
+@pytest.mark.flaky(retries=2, delay=120)
def test_text_generator_predict_default_params_success(
palm2_text_generator_model, llm_text_df
):
@@ -32,6 +50,7 @@ def test_text_generator_predict_default_params_success(
assert all(series.str.len() > 20)
+@pytest.mark.flaky(retries=2, delay=120)
def test_text_generator_predict_series_default_params_success(
palm2_text_generator_model, llm_text_df
):
@@ -42,6 +61,7 @@ def test_text_generator_predict_series_default_params_success(
assert all(series.str.len() > 20)
+@pytest.mark.flaky(retries=2, delay=120)
def test_text_generator_predict_arbitrary_col_label_success(
palm2_text_generator_model, llm_text_df
):
@@ -53,6 +73,7 @@ def test_text_generator_predict_arbitrary_col_label_success(
assert all(series.str.len() > 20)
+@pytest.mark.flaky(retries=2, delay=120)
def test_text_generator_predict_with_params_success(
palm2_text_generator_model, llm_text_df
):
@@ -68,8 +89,22 @@ def test_text_generator_predict_with_params_success(
def test_create_embedding_generator_model(palm2_embedding_generator_model):
# Model creation doesn't return error
assert palm2_embedding_generator_model is not None
+ assert palm2_embedding_generator_model._bqml_model is not None
+
+
+def test_create_text_embedding_generator_model_defaults(bq_connection):
+ import bigframes.pandas as bpd
+
+ bpd.reset_session()
+ bpd.options.bigquery.bq_connection = bq_connection
+ bpd.options.bigquery.location = "us"
+
+ model = llm.PaLM2TextEmbeddingGenerator()
+ assert model is not None
+ assert model._bqml_model is not None
+@pytest.mark.flaky(retries=2, delay=120)
def test_embedding_generator_predict_success(
palm2_embedding_generator_model, llm_text_df
):
@@ -82,6 +117,7 @@ def test_embedding_generator_predict_success(
assert value.size == 768
+@pytest.mark.flaky(retries=2, delay=120)
def test_embedding_generator_predict_series_success(
palm2_embedding_generator_model, llm_text_df
):
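The `*_defaults` tests above construct models with no explicit session: resetting the global session and setting `bq_connection`/`location` on the options makes the implicit session supply the connection. The pattern in isolation (the connection string is a placeholder for one that exists in your project):

```python
import bigframes.pandas as bpd
from bigframes.ml import llm

bpd.reset_session()  # start clean so the options below take effect
bpd.options.bigquery.bq_connection = "my-project.us.my-connection"  # placeholder
bpd.options.bigquery.location = "us"

model = llm.PaLM2TextGenerator()  # built on the implicit session and its connection
assert model._bqml_model is not None
```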
diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py
index 420a80754f..57b9900c48 100644
--- a/tests/system/small/ml/test_preprocessing.py
+++ b/tests/system/small/ml/test_preprocessing.py
@@ -24,13 +24,13 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
scaler = bigframes.ml.preprocessing.StandardScaler()
scaler.fit(
penguins_df_default_index[
- "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
]
)
result = scaler.transform(
penguins_df_default_index[
- "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"
+ ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]
]
).to_pandas()
@@ -58,6 +58,35 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df):
pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+def test_standard_scaler_normalizes_fit_transform(new_penguins_df):
+ # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
+ scaler = bigframes.ml.preprocessing.StandardScaler()
+ result = scaler.fit_transform(
+ new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]]
+ ).to_pandas()
+
+ # If standard-scaled correctly, mean should be 0.0
+ for column in result.columns:
+ assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3)
+
+    # TODO: bug? Feature columns seem to come back in nondeterministic order.
+    # Workaround: sort columns by name. Can't repro it in pantheon, so it could
+    # be a bigframes issue.
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848],
+ "scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118],
+ "scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338],
+ },
+ dtype="Float64",
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected, rtol=1e-3)
+
+
def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df):
# TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod.
scaler = bigframes.ml.preprocessing.StandardScaler()
@@ -93,7 +122,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui
def test_one_hot_encoder_default_params(new_penguins_df):
encoder = bigframes.ml.preprocessing.OneHotEncoder()
- encoder.fit(new_penguins_df["species", "sex"])
+ encoder.fit(new_penguins_df[["species", "sex"]])
result = encoder.transform(new_penguins_df).to_pandas()
@@ -121,6 +150,35 @@ def test_one_hot_encoder_default_params(new_penguins_df):
pd.testing.assert_frame_equal(result, expected)
+def test_one_hot_encoder_default_params_fit_transform(new_penguins_df):
+ encoder = bigframes.ml.preprocessing.OneHotEncoder()
+
+ result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas()
+
+    # TODO: bug? Feature columns seem to come back in nondeterministic order.
+    # Workaround: sort columns by name. Can't repro it in pantheon, so it could
+    # be a bigframes issue.
+ result = result.reindex(sorted(result.columns), axis=1)
+
+ expected = pd.DataFrame(
+ {
+ "onehotencoded_sex": [
+ [{"index": 2, "value": 1.0}],
+ [{"index": 1, "value": 1.0}],
+ [{"index": 1, "value": 1.0}],
+ ],
+ "onehotencoded_species": [
+ [{"index": 1, "value": 1.0}],
+ [{"index": 1, "value": 1.0}],
+ [{"index": 2, "value": 1.0}],
+ ],
+ },
+ index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"),
+ )
+
+ pd.testing.assert_frame_equal(result, expected)
+
+
def test_one_hot_encoder_series_default_params(new_penguins_df):
encoder = bigframes.ml.preprocessing.OneHotEncoder()
encoder.fit(new_penguins_df["species"])
@@ -148,7 +206,7 @@ def test_one_hot_encoder_series_default_params(new_penguins_df):
def test_one_hot_encoder_params(new_penguins_df):
encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2)
- encoder.fit(new_penguins_df["species", "sex"])
+ encoder.fit(new_penguins_df[["species", "sex"]])
result = encoder.transform(new_penguins_df).to_pandas()
@@ -178,7 +236,7 @@ def test_one_hot_encoder_params(new_penguins_df):
def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df):
encoder = bigframes.ml.preprocessing.OneHotEncoder()
- encoder.fit(penguins_df_default_index["species", "sex"])
+ encoder.fit(penguins_df_default_index[["species", "sex"]])
result = encoder.transform(new_penguins_df).to_pandas()
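The expected fit_transform output above behaves like z-scores, and judging purely from the expected values in the test, the scaling matches the population (ddof=0) standard deviation: each expected column has mean ≈ 0 and population std ≈ 1. A quick check:

```python
import math

import pandas as pd

# Expected scaled_culmen_length_mm from the fit_transform test above.
z = pd.Series([1.313249, -0.20198, -1.111118])

assert math.isclose(z.mean(), 0.0, abs_tol=1e-3)
# Population std is ~1, suggesting (but not proving) ddof=0 standardization.
assert math.isclose(z.std(ddof=0), 1.0, abs_tol=1e-3)
```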
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
index 31b64f4314..241cbd576b 100644
--- a/tests/system/small/operations/test_strings.py
+++ b/tests/system/small/operations/test_strings.py
@@ -254,31 +254,93 @@ def test_upper(scalars_dfs):
)
-def test_isnumeric(session):
- pandas_df = pd.DataFrame(
- {
- "numeric_string_col": [
- "٠١٢٣٤٥٦٧٨٩",
- "",
- "0",
- "字",
- "五",
- "0123456789",
- pd.NA,
- "abc 123 mixed letters and numbers",
- "no numbers here",
- "123a",
- "23!",
- " 45",
- "a45",
- ]
- }
- )
-
- df = session.read_pandas(pandas_df)
-
- pd_result = pandas_df.numeric_string_col.str.isnumeric()
- bf_result = df.numeric_string_col.str.isnumeric().to_pandas()
+def test_isnumeric(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isnumeric()
+ bf_result = weird_strings.str.isnumeric().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isalpha(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isalpha()
+ bf_result = weird_strings.str.isalpha().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isdigit(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isdigit()
+ bf_result = weird_strings.str.isdigit().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isdecimal(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isdecimal()
+ bf_result = weird_strings.str.isdecimal().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isalnum(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isalnum()
+ bf_result = weird_strings.str.isalnum().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isspace(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isspace()
+ bf_result = weird_strings.str.isspace().to_pandas()
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_islower(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.islower()
+ bf_result = weird_strings.str.islower().to_pandas()
+
+ assert_series_equal_ignoring_order(
+ bf_result,
+ pd_result.astype(pd.BooleanDtype())
+ # the dtype here is a case of intentional diversion from pandas
+ # see go/bigframes-dtypes
+ )
+
+
+def test_isupper(weird_strings, weird_strings_pd):
+ pd_result = weird_strings_pd.str.isupper()
+ bf_result = weird_strings.str.isupper().to_pandas()
assert_series_equal_ignoring_order(
bf_result,
@@ -394,9 +456,6 @@ def test_str_get(scalars_dfs):
bf_result = bf_series.str.get(8).to_pandas()
pd_result = scalars_pandas_df[col_name].str.get(8)
- print(pd_result)
- print(bf_result)
-
assert_series_equal_ignoring_order(
pd_result,
bf_result,
@@ -416,6 +475,16 @@ def test_str_pad(scalars_dfs):
)
+def test_str_zfill(weird_strings, weird_strings_pd):
+ bf_result = weird_strings.str.zfill(5).to_pandas()
+ pd_result = weird_strings_pd.str.zfill(5)
+
+ pd.testing.assert_series_equal(
+ pd_result,
+ bf_result,
+ )
+
+
def test_str_ljust(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "string_col"
diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
index 2c44dd8067..85c3cce1d7 100644
--- a/tests/system/small/test_dataframe.py
+++ b/tests/system/small/test_dataframe.py
@@ -77,6 +77,20 @@ def test_df_construct_from_series(scalars_dfs):
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_df_construct_from_dict():
+ input_dict = {
+ "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        # Column name with a space. Standardized SQL schema ids work around BQ schemas not allowing column names with spaces. b/296751058
+ "Max Speed": [380.0, 370.0, 24.0, 26.0],
+ }
+ bf_result = dataframe.DataFrame(input_dict).to_pandas()
+ pd_result = pd.DataFrame(input_dict)
+
+ pandas.testing.assert_frame_equal(
+ bf_result, pd_result, check_dtype=False, check_index_type=False
+ )
+
+
def test_get_column(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_name = "int64_col"
@@ -356,6 +370,52 @@ def test_assign_new_column_w_setitem(scalars_dfs):
pd.testing.assert_frame_equal(bf_result, pd_result)
+def test_assign_new_column_w_setitem_list(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+ bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+ pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+ bf_result = bf_df.to_pandas()
+ pd_result = pd_df
+
+ # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64")
+
+ pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+
+ # set the custom index
+ pd_df = pd_df.set_index("string_col")
+ bf_df = bf_df.set_index("string_col")
+
+ bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+ pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+ bf_result = bf_df.to_pandas()
+ pd_result = pd_df
+
+ # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64")
+
+ pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_assign_new_column_w_setitem_list_error(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_df = scalars_df.copy()
+ pd_df = scalars_pandas_df.copy()
+
+ with pytest.raises(ValueError):
+ pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3
+ with pytest.raises(ValueError):
+ bf_df["new_col"] = [1, 2, 3]
+
+
def test_assign_existing_column(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
kwargs = {"int64_col": 2}
@@ -491,13 +551,69 @@ def test_assign_callable_lambda(scalars_dfs):
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
-def test_dropna(scalars_dfs):
+@pytest.mark.parametrize(
+ ("axis", "how", "ignore_index"),
+ [
+ (0, "any", False),
+ (0, "any", True),
+ (1, "any", False),
+ (1, "all", False),
+ ],
+)
+def test_df_dropna(scalars_dfs, axis, how, ignore_index):
+ if pd.__version__.startswith("1."):
+ pytest.skip("ignore_index parameter not supported in pandas 1.x.")
scalars_df, scalars_pandas_df = scalars_dfs
- df = scalars_df.dropna()
+ df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
bf_result = df.to_pandas()
- pd_result = scalars_pandas_df.dropna()
+ pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index)
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype())
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_df_fillna(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ df = scalars_df[["int64_col", "float64_col"]].fillna(3)
+ bf_result = df.to_pandas()
+ pd_result = scalars_pandas_df[["int64_col", "float64_col"]].fillna(3)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
- assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+def test_df_isin_list(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ values = ["Hello, World!", 55555, 2.51, pd.NA, True]
+ bf_result = (
+ scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]]
+ .isin(values)
+ .to_pandas()
+ )
+ pd_result = scalars_pandas_df[
+ ["int64_col", "float64_col", "string_col", "bool_col"]
+ ].isin(values)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean"))
+
+
+def test_df_isin_dict(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ values = {
+ "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True],
+ "int64_col": [5555, 2.51],
+ "bool_col": [pd.NA],
+ }
+ bf_result = (
+ scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]]
+ .isin(values)
+ .to_pandas()
+ )
+ pd_result = scalars_pandas_df[
+ ["int64_col", "float64_col", "string_col", "bool_col"]
+ ].isin(values)
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean"))
@pytest.mark.parametrize(
@@ -840,6 +956,14 @@ def test_set_index(scalars_dfs, index_column, drop, append):
pandas.testing.assert_frame_equal(bf_result, pd_result)
+def test_set_index_key_error(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ with pytest.raises(KeyError):
+ scalars_pandas_df.set_index(["not_a_col"])
+ with pytest.raises(KeyError):
+ scalars_df.set_index(["not_a_col"])
+
+
@pytest.mark.parametrize(
("ascending",),
((True,), (False,)),
@@ -1016,50 +1140,43 @@ def test_series_binop_axis_index(
@pytest.mark.parametrize(
- ("op"),
+ ("left_labels", "right_labels"),
[
- (lambda x, y: x.add(y, axis="index")),
- (lambda x, y: x.radd(y, axis="index")),
- (lambda x, y: x.sub(y, axis="index")),
- (lambda x, y: x.rsub(y, axis="index")),
- (lambda x, y: x.mul(y, axis="index")),
- (lambda x, y: x.rmul(y, axis="index")),
- (lambda x, y: x.truediv(y, axis="index")),
- (lambda x, y: x.rtruediv(y, axis="index")),
- (lambda x, y: x.floordiv(y, axis="index")),
- (lambda x, y: x.floordiv(y, axis="index")),
- (lambda x, y: x.gt(y, axis="index")),
- (lambda x, y: x.ge(y, axis="index")),
- (lambda x, y: x.lt(y, axis="index")),
- (lambda x, y: x.le(y, axis="index")),
+ (["a", "a", "b"], ["c", "c", "d"]),
+ (["a", "b", "c"], ["c", "a", "b"]),
+ (["a", "c", "c"], ["c", "a", "c"]),
],
ids=[
- "add",
- "radd",
- "sub",
- "rsub",
- "mul",
- "rmul",
- "truediv",
- "rtruediv",
- "floordiv",
- "rfloordiv",
- "gt",
- "ge",
- "lt",
- "le",
+ "no_overlap",
+ "one_one_match",
+ "multi_match",
],
)
-def test_dataframe_binop_axis_index_throws_not_implemented(
- scalars_dfs,
- op,
+def test_binop_df_df_binary_op(
+ scalars_df_index,
+ scalars_df_2_index,
+ scalars_pandas_df_index,
+ left_labels,
+ right_labels,
):
- scalars_df, scalars_pandas_df = scalars_dfs
- df_columns = ["int64_col", "float64_col"]
- other_df_columns = ["int64_too"]
+ if pd.__version__.startswith("1."):
+ pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+ columns = ["int64_too", "int64_col", "float64_col"]
- with pytest.raises(NotImplementedError):
- op(scalars_df[df_columns], scalars_df[other_df_columns]).to_pandas()
+ bf_df_a = scalars_df_index[columns]
+ bf_df_a.columns = left_labels
+ bf_df_b = scalars_df_2_index[columns]
+ bf_df_b.columns = right_labels
+ bf_result = (bf_df_a - bf_df_b).to_pandas()
+
+ pd_df_a = scalars_pandas_df_index[columns]
+ pd_df_a.columns = left_labels
+ pd_df_b = scalars_pandas_df_index[columns]
+ pd_df_b.columns = right_labels
+ pd_result = pd_df_a - pd_df_b
+
+ # Some dtype inconsistency for all-NULL columns
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
# A different table only works with an explicit index, since default index orders are arbitrary.
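The relaxed check_dtype=False matters most for the no_overlap case: when no column labels align, every result column is entirely missing, and pandas falls back to a NaN-capable dtype while the nullable dtypes used by BigQuery DataFrames stay put. A small pandas-only illustration of the pandas side:

import pandas as pd

left = pd.DataFrame({"a": [1, 2]})
right = pd.DataFrame({"b": [3, 4]})
# No labels overlap, so both result columns are all-NaN and pandas
# infers float64 for what started as int64 data.
print((left - right).dtypes)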
@@ -1321,6 +1438,56 @@ def test_df_describe(scalars_dfs):
).all()
+def test_df_stack(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ # Convert column labels to pyarrow-backed strings to match BigQuery DataFrames.
+ scalars_pandas_df = scalars_pandas_df.copy()
+ scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]")
+ # Can only stack identically-typed columns
+ columns = ["int64_col", "int64_too", "rowindex_2"]
+
+ bf_result = scalars_df[columns].stack().to_pandas()
+ pd_result = scalars_pandas_df[columns].stack()
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+ ("values", "index", "columns"),
+ [
+ ("int64_col", "int64_too", ["string_col"]),
+ (["int64_col"], "int64_too", ["string_col"]),
+ (["int64_col", "float64_col"], "int64_too", ["string_col"]),
+ ],
+)
+def test_df_pivot(scalars_dfs, values, index, columns):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = scalars_df.pivot(
+ values=values, index=index, columns=columns
+ ).to_pandas()
+ pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns)
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+ ("values", "index", "columns"),
+ [
+ (["goals", "assists"], ["team_name", "season"], ["position"]),
+ (["goals", "assists"], ["season"], ["team_name", "position"]),
+ ],
+)
+def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns):
+ bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas()
+ pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns)
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
def test_ipython_key_completions_with_drop(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
col_names = "string_col"
@@ -1621,6 +1788,7 @@ def test_sample_raises_value_error(scalars_dfs):
@pytest.mark.parametrize(
("axis",),
[
+ (None,),
(0,),
(1,),
],
@@ -1690,8 +1858,9 @@ def test_df___array__(scalars_df_index, scalars_pandas_df_index):
)
-def test_getattr_not_implemented(scalars_df_index):
- with pytest.raises(NotImplementedError):
+def test_getattr_attribute_error_when_pandas_has(scalars_df_index):
+ # asof is implemented in pandas but not in bigframes
+ with pytest.raises(AttributeError):
scalars_df_index.asof()
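The renamed test encodes a deliberate contract: an attribute that pandas implements but bigframes lacks should surface as AttributeError rather than NotImplementedError, so standard attribute probes behave as callers expect. A minimal sketch of why this matters (hypothetical class, not bigframes internals):

class Frame:
    def __getattr__(self, name):
        # AttributeError keeps hasattr()/getattr(obj, name, default) working;
        # NotImplementedError would propagate out of hasattr() instead.
        raise AttributeError(name)

assert not hasattr(Frame(), "asof")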
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 1f5aa906c8..3886b85f40 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -32,6 +32,7 @@
import bigframes
import bigframes.dataframe
+import bigframes.pandas as bpd
def test_to_pandas_w_correct_dtypes(scalars_df_default_index):
@@ -339,51 +340,68 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
+def test_to_sql_query_unnamed_index_included(
+ session: bigframes.Session,
+ scalars_df_default_index: bpd.DataFrame,
+ scalars_pandas_df_default_index: pd.DataFrame,
+):
+ bf_df = scalars_df_default_index.reset_index(drop=True)
+ sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True)
+ assert len(idx_labels) == 1
+ assert len(idx_ids) == 1
+ assert idx_labels[0] is None
+ assert idx_ids[0].startswith("bigframes")
+
+ pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
+ roundtrip = session.read_gbq(sql, index_col=idx_ids)
+ roundtrip.index.names = [None]
+ assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
+
+
def test_to_sql_query_named_index_included(
- session, scalars_df_index, scalars_pandas_df_index
+ session: bigframes.Session,
+ scalars_df_default_index: bpd.DataFrame,
+ scalars_pandas_df_default_index: pd.DataFrame,
):
- sql, index_columns = scalars_df_index._to_sql_query(always_include_index=True)
- assert len(index_columns) == 1
- index_column, is_named = index_columns[0]
- assert index_column == "rowindex"
- assert is_named
-
- roundtrip = session.read_gbq(sql, index_col=[index_column])
- assert_pandas_df_equal_ignore_ordering(
- roundtrip.to_pandas(), scalars_pandas_df_index
- )
+ bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True)
+ sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True)
+ assert len(idx_labels) == 1
+ assert len(idx_ids) == 1
+ assert idx_labels[0] == "rowindex_2"
+ assert idx_ids[0] == "rowindex_2"
+
+ pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True)
+ roundtrip = session.read_gbq(sql, index_col=idx_ids)
+ assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
def test_to_sql_query_unnamed_index_excluded(
- session, scalars_df_default_index, scalars_pandas_df_default_index
+ session: bigframes.Session,
+ scalars_df_default_index: bpd.DataFrame,
+ scalars_pandas_df_default_index: pd.DataFrame,
):
- # The .sql property should return SQL without the unnamed indexes
- sql, index_columns = scalars_df_default_index._to_sql_query(
- always_include_index=False
- )
- assert len(index_columns) == 0
+ bf_df = scalars_df_default_index.reset_index(drop=True)
+ sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False)
+ assert len(idx_labels) == 0
+ assert len(idx_ids) == 0
+ pd_df = scalars_pandas_df_default_index.reset_index(drop=True)
roundtrip = session.read_gbq(sql)
- assert_pandas_df_equal_ignore_ordering(
- roundtrip.to_pandas(), scalars_pandas_df_default_index
- )
+ assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
-def test_to_sql_query_unnamed_index_always_include(
- session,
- scalars_df_default_index: bigframes.dataframe.DataFrame,
- scalars_pandas_df_default_index,
+def test_to_sql_query_named_index_excluded(
+ session: bigframes.Session,
+ scalars_df_default_index: bpd.DataFrame,
+ scalars_pandas_df_default_index: pd.DataFrame,
):
- sql, index_columns = scalars_df_default_index._to_sql_query(
- always_include_index=True
- )
- assert len(index_columns) == 1
- index_column, is_named = index_columns[0]
- assert index_column == "bigframes_index_0"
- assert not is_named
-
- roundtrip = session.read_gbq(sql, index_col=[index_column])
- roundtrip.index.name = None
- assert_pandas_df_equal_ignore_ordering(
- roundtrip.to_pandas(), scalars_pandas_df_default_index
- )
+ bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True)
+ sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False)
+ assert len(idx_labels) == 0
+ assert len(idx_ids) == 0
+
+ pd_df = scalars_pandas_df_default_index.set_index(
+ "rowindex_2", drop=True
+ ).reset_index(drop=True)
+ roundtrip = session.read_gbq(sql)
+ assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df)
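Taken together, these rewritten tests document the new private contract: _to_sql_query(include_index=...) now returns a (sql, index_ids, index_labels) triple rather than the old (sql, [(column, is_named)]) pairs. A usage sketch distilled from the assertions above, where bf_df and session stand in for the test fixtures:

sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True)
roundtrip = session.read_gbq(sql, index_col=idx_ids)
# Unnamed indexes round-trip under generated "bigframes..." ids with a
# None label, so the label is restored before comparing frames.
roundtrip.index.names = list(idx_labels)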
diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py
index d5dd4e357b..987368ce77 100644
--- a/tests/system/small/test_groupby.py
+++ b/tests/system/small/test_groupby.py
@@ -109,12 +109,12 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index):
)
bf_result_computed = bf_result.to_pandas()
- # Pandas produces multi-index which isn't supported in bq df yet
- pd_result = pd_result.set_axis(bf_result.columns, axis=1)
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
-def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index):
+def test_dataframe_groupby_agg_dict_with_list(
+ scalars_df_index, scalars_pandas_df_index
+):
col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"]
bf_result = (
scalars_df_index[col_names]
@@ -128,8 +128,23 @@ def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index):
)
bf_result_computed = bf_result.to_pandas()
- # Pandas produces multi-index which isn't supported in bq df yet
- pd_result = pd_result.set_axis(bf_result.columns, axis=1)
+ pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
+
+
+def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df_index):
+ col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"]
+ bf_result = (
+ scalars_df_index[col_names]
+ .groupby("string_col")
+ .agg({"int64_too": "mean", "string_col": "count"})
+ )
+ pd_result = (
+ scalars_pandas_df_index[col_names]
+ .groupby("string_col")
+ .agg({"int64_too": "mean", "string_col": "count"})
+ )
+ bf_result_computed = bf_result.to_pandas()
+
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False)
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py
index ac1f8c7220..558dd12e69 100644
--- a/tests/system/small/test_index.py
+++ b/tests/system/small/test_index.py
@@ -50,3 +50,17 @@ def test_index_getitem_int(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.index[-2]
pd_result = scalars_pandas_df_index.index[-2]
assert bf_result == pd_result
+
+
+def test_is_monotonic_increasing(scalars_df_index, scalars_pandas_df_index):
+ assert (
+ scalars_df_index.index.is_monotonic_increasing
+ == scalars_pandas_df_index.index.is_monotonic_increasing
+ )
+
+
+def test_is_monotonic_decreasing(scalars_df_index, scalars_pandas_df_index):
+ assert (
+ scalars_df_index.index.is_monotonic_decreasing
+ == scalars_pandas_df_index.index.is_monotonic_decreasing
+ )
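Both tests defer to the pandas Index semantics for these properties; a quick self-contained reference:

import pandas as pd

idx = pd.Index([3, 2, 1])
assert idx.is_monotonic_decreasing
assert not idx.is_monotonic_increasing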
diff --git a/tests/system/small/test_ipython.py b/tests/system/small/test_ipython.py
new file mode 100644
index 0000000000..6725805d9a
--- /dev/null
+++ b/tests/system/small/test_ipython.py
@@ -0,0 +1,28 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+IPython = pytest.importorskip("IPython")
+
+
+def test_repr_cache(scalars_df_index):
+ display_formatter = IPython.core.formatters.DisplayFormatter()
+ # Make sure the df has a new block so that the method's return
+ # value is not already cached.
+ test_df = scalars_df_index.head()
+ results = display_formatter.format(test_df)
+ assert results[0].keys() == {"text/plain", "text/html"}
+ assert test_df._block.retrieve_repr_request_results.cache_info().misses == 1
+ assert test_df._block.retrieve_repr_request_results.cache_info().hits == 1
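The expected counts follow from the formatter requesting both text/plain and text/html while both reprs funnel into the same cached retrieve_repr_request_results call: the first mime type computes the result (one miss), the second reuses it (one hit). Assuming the cache is functools.lru_cache-like, the bookkeeping works as in this sketch:

import functools

@functools.lru_cache(maxsize=None)
def render():
    return "results"

render()  # first call: a miss that computes and stores the value
render()  # second call: served from the cache
assert (render.cache_info().misses, render.cache_info().hits) == (1, 1)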
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 914be6dae4..25d1e2ad49 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -19,6 +19,7 @@
from tests.system.utils import assert_pandas_df_equal_ignore_ordering
+# Row Multi-index tests
def test_set_multi_index(scalars_df_index, scalars_pandas_df_index):
bf_result = scalars_df_index.set_index(["bool_col", "int64_too"]).to_pandas()
pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"])
@@ -443,3 +444,281 @@ def test_multi_index_series_rename_dict_same_type(
pandas.testing.assert_series_equal(
bf_result, pd_result, check_dtype=False, check_index_type=False
)
+
+
+# Column Multi-index tests
+
+
+def test_column_multi_index_getitem(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_a = bf_df["a"].to_pandas()
+ pd_a = pd_df["a"]
+ pandas.testing.assert_frame_equal(bf_a, pd_a)
+
+ bf_b = bf_df["b"].to_pandas()
+ pd_b = pd_df["b"]
+ pandas.testing.assert_frame_equal(bf_b, pd_b)
+
+ bf_fullkey = bf_df[("a", "int64_too")].to_pandas()
+ pd_fullkey = pd_df[("a", "int64_too")]
+ pandas.testing.assert_series_equal(bf_fullkey, pd_fullkey)
+
+
+def test_column_multi_index_concat(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col", "int64_col"]
+ multi_columns1 = pandas.MultiIndex.from_tuples(
+ zip(["a", "b", "a", "b"], [1, 1, 2, 2])
+ )
+ multi_columns2 = pandas.MultiIndex.from_tuples(
+ zip(["a", "b", "a", "c"], [3, 1, 2, 1])
+ )
+
+ bf_df1 = scalars_df_index[columns].copy()
+ bf_df1.columns = multi_columns1
+ bf_df2 = scalars_df_index[columns].copy()
+ bf_df2.columns = multi_columns2
+
+ pd_df1 = scalars_pandas_df_index[columns].copy()
+ pd_df1.columns = multi_columns1
+ pd_df2 = scalars_pandas_df_index[columns].copy()
+ pd_df2.columns = multi_columns2
+
+ bf_result = bpd.concat([bf_df1, bf_df2, bf_df1]).to_pandas()
+ pd_result = pandas.concat([pd_df1, pd_df2, pd_df1])
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_drop(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "string_col", "bool_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_a = bf_df.drop(("a", "int64_too"), axis=1).to_pandas()
+ pd_a = pd_df.drop(("a", "int64_too"), axis=1)
+ pandas.testing.assert_frame_equal(bf_a, pd_a)
+
+
+@pytest.mark.parametrize(
+ ("key",),
+ [
+ ("a",),
+ ("b",),
+ ("c",),
+ ],
+)
+def test_column_multi_index_assign(scalars_df_index, scalars_pandas_df_index, key):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ kwargs = {key: 42}
+ bf_result = bf_df.assign(**kwargs).to_pandas()
+ pd_result = pd_df.assign(**kwargs)
+
+ # Pandas assign results in non-nullable dtype
+ pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.rename(columns={"b": "c"}).to_pandas()
+ pd_result = pd_df.rename(columns={"b": "c"})
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.reset_index().to_pandas()
+ pd_result = pd_df.reset_index()
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pandas.Int64Dtype())
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = (bf_df[("a", "a")] + 3).to_pandas()
+ pd_result = pd_df[("a", "a")] + 3
+
+ pandas.testing.assert_series_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.agg(["sum", "mean"]).to_pandas()
+ pd_result = pd_df.agg(["sum", "mean"])
+
+ # Pandas may produce narrower numeric types, but bigframes always produces Float64
+ pd_result = pd_result.astype("Float64")
+ pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_column_multi_index_prefix_suffix(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.add_prefix("prefixed_").add_suffix("_suffixed").to_pandas()
+ pd_result = pd_df.add_prefix("prefixed_").add_suffix("_suffixed")
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index):
+ if pandas.__version__.startswith("1."):
+ pytest.skip("pandas 1.x. does not handle nullable ints properly in cumsum")
+ columns = ["int64_too", "int64_col", "float64_col"]
+ multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"]))
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.cumsum().to_pandas()
+ pd_result = pd_df.cumsum()
+
+ pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "rowindex_2"]
+ level1 = pandas.Index(["b", "a", "b"])
+ # Need resulting column to be pyarrow string rather than object dtype
+ level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]")
+ multi_columns = pandas.MultiIndex.from_arrays([level1, level2])
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.stack().to_pandas()
+ # stack's output ordering shifts across pandas versions, hence the sorts below
+ pd_result = pd_df.stack()
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ # Column ordering seems to depend on pandas version
+ pandas.testing.assert_frame_equal(
+ bf_result.sort_index(axis=1), pd_result.sort_index(axis=1), check_dtype=False
+ )
+
+
+@pytest.mark.skip(reason="Pandas fails in newer versions.")
+def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index):
+ columns = ["int64_too", "int64_col", "rowindex_2"]
+ level1 = pandas.Index(["b", pandas.NA, pandas.NA])
+ # Need resulting column to be pyarrow string rather than object dtype
+ level2 = pandas.Index([pandas.NA, "b", "b"], dtype="string[pyarrow]")
+ multi_columns = pandas.MultiIndex.from_arrays([level1, level2])
+ bf_df = scalars_df_index[columns].copy()
+ bf_df.columns = multi_columns
+ pd_df = scalars_pandas_df_index[columns].copy()
+ pd_df.columns = multi_columns
+
+ bf_result = bf_df.stack().to_pandas()
+ pd_result = pd_df.stack()
+
+ # Pandas produces NaN, where bq dataframes produces pd.NA
+ pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+ ("index_names",),
+ [
+ (["rowindex_2", "int64_too"],),
+ (["int64_too", "rowindex_2"],),
+ ],
+)
+def test_is_monotonic_increasing(
+ scalars_df_index, scalars_pandas_df_index, index_names
+):
+ bf_result = scalars_df_index.set_index(index_names).index
+ pd_result = scalars_pandas_df_index.set_index(index_names).index
+
+ assert bf_result.is_monotonic_increasing == pd_result.is_monotonic_increasing
+
+
+@pytest.mark.parametrize(
+ ("indexes",),
+ [
+ ({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3]},),
+ ({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, None, 3]},),
+ ({"A": [1, 2, 2], "B": [1, 2, 1], "C": [1, 2, 3]},),
+ ({"A": [1, 2, 2], "B": [1, 2, 3], "C": [1, 2, 1]},),
+ ({"A": [1, 2, 1], "B": [1, 2, 3], "C": [1, 2, 1]},),
+ ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [2, 2, 1]},),
+ ],
+)
+def test_is_monotonic_increasing_extra(indexes):
+ bf_result = bpd.DataFrame(indexes)
+ bf_result = bf_result.set_index(["A", "B", "C"])
+ pd_result = pandas.DataFrame(indexes)
+ pd_result = pd_result.set_index(["A", "B", "C"])
+
+ assert (
+ bf_result.index.is_monotonic_increasing
+ == pd_result.index.is_monotonic_increasing
+ )
+
+
+@pytest.mark.parametrize(
+ ("indexes",),
+ [
+ ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [3, 2, 1]},),
+ ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [3, None, 1]},),
+ ({"A": [2, 2, 1], "B": [1, 2, 1], "C": [3, 2, 1]},),
+ ({"A": [2, 2, 1], "B": [3, 2, 1], "C": [1, 2, 1]},),
+ ({"A": [1, 2, 1], "B": [3, 2, 1], "C": [1, 2, 1]},),
+ ],
+)
+def test_is_monotonic_decreasing_extra(indexes):
+ bf_result = bpd.DataFrame(indexes)
+ bf_result = bf_result.set_index(["A", "B", "C"])
+ pd_result = pandas.DataFrame(indexes)
+ pd_result = pd_result.set_index(["A", "B", "C"])
+
+ assert (
+ bf_result.index.is_monotonic_decreasing
+ == pd_result.index.is_monotonic_decreasing
+ )
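For a MultiIndex, monotonicity is evaluated lexicographically over the index tuples, which is what the mixed parametrizations above probe. A pandas-only example mirroring one of the non-monotonic cases:

import pandas as pd

mi = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 2, 1]], names=["A", "B"])
# The tuples run (1, 1), (2, 2), (2, 1); since (2, 2) > (2, 1), the
# index is not monotonically increasing.
assert not mi.is_monotonic_increasing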
diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py
new file mode 100644
index 0000000000..5c2a93ec39
--- /dev/null
+++ b/tests/system/small/test_numpy.py
@@ -0,0 +1,135 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import pandas as pd
+import pytest
+
+
+@pytest.mark.parametrize(
+ ("opname",),
+ [
+ ("sin",),
+ ("cos",),
+ ("tan",),
+ ("arcsin",),
+ ("arccos",),
+ ("arctan",),
+ ("sinh",),
+ ("cosh",),
+ ("tanh",),
+ ("arcsinh",),
+ ("arccosh",),
+ ("arctanh",),
+ ("exp",),
+ ("log",),
+ ("log10",),
+ ("sqrt",),
+ ("abs",),
+ ],
+)
+def test_series_ufuncs(floats_pd, floats_bf, opname):
+ bf_result = getattr(np, opname)(floats_bf).to_pandas()
+ pd_result = getattr(np, opname)(floats_pd)
+
+ pd.testing.assert_series_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("opname",),
+ [
+ ("sin",),
+ ("cos",),
+ ("tan",),
+ ("log",),
+ ("log10",),
+ ("sqrt",),
+ ("abs",),
+ ],
+)
+def test_df_ufuncs(scalars_dfs, opname):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = getattr(np, opname)(
+ scalars_df[["float64_col", "int64_col"]]
+ ).to_pandas()
+ pd_result = getattr(np, opname)(scalars_pandas_df[["float64_col", "int64_col"]])
+
+ pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("opname",),
+ [
+ ("add",),
+ ("subtract",),
+ ("multiply",),
+ ("divide",),
+ ("power",),
+ ],
+)
+def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname):
+ bf_result = getattr(np, opname)(
+ floats_product_bf.float64_col_x, floats_product_bf.float64_col_y
+ ).to_pandas()
+ pd_result = getattr(np, opname)(
+ floats_product_pd.float64_col_x, floats_product_pd.float64_col_y
+ )
+ pd.testing.assert_series_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("opname",),
+ [
+ ("add",),
+ ("subtract",),
+ ("multiply",),
+ ("divide",),
+ ("power",),
+ ],
+)
+def test_df_binary_ufuncs(scalars_dfs, opname):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = getattr(np, opname)(
+ scalars_df[["float64_col", "int64_col"]], 5.1
+ ).to_pandas()
+ pd_result = getattr(np, opname)(
+ scalars_pandas_df[["float64_col", "int64_col"]], 5.1
+ )
+
+ pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_series_binary_ufuncs_reverse(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ # Could be any non-symmetric binary op
+ bf_result = np.subtract(5.1, scalars_df["int64_col"]).to_pandas()
+ pd_result = np.subtract(5.1, scalars_pandas_df["int64_col"])
+
+ pd.testing.assert_series_equal(bf_result, pd_result)
+
+
+def test_df_binary_ufuncs_reverse(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ # Could be any non-symmetric binary op
+ bf_result = np.subtract(5.1, scalars_df[["float64_col", "int64_col"]]).to_pandas()
+ pd_result = np.subtract(
+ 5.1,
+ scalars_pandas_df[["float64_col", "int64_col"]],
+ )
+
+ pd.testing.assert_frame_equal(bf_result, pd_result)
diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py
index 98bafc6392..e451d5c3a2 100644
--- a/tests/system/small/test_pandas.py
+++ b/tests/system/small/test_pandas.py
@@ -16,6 +16,7 @@
import pytest
import bigframes.pandas as bpd
+from tests.system.utils import assert_pandas_df_equal_ignore_ordering
def test_concat_dataframe(scalars_dfs):
@@ -105,3 +106,106 @@ def test_concat_axis_1(scalars_dfs, how):
pd_result = pd.concat([pd_part1, pd_part2, pd_part3], join=how, axis=1)
pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)
+
+
+@pytest.mark.parametrize(
+ ("merge_how",),
+ [
+ ("inner",),
+ ("outer",),
+ ("left",),
+ ("right",),
+ ],
+)
+def test_merge(scalars_dfs, merge_how):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ on = "rowindex_2"
+ left_columns = ["int64_col", "float64_col", "rowindex_2"]
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_columns]
+ # Offset the rows somewhat so that outer join can have an effect.
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2)
+
+ df = bpd.merge(left, right, merge_how, on, sort=True)
+ bf_result = df.to_pandas()
+
+ pd_result = pd.merge(
+ scalars_pandas_df[left_columns],
+ scalars_pandas_df[right_columns].assign(
+ rowindex_2=scalars_pandas_df["rowindex_2"] + 2
+ ),
+ merge_how,
+ on,
+ sort=True,
+ )
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("merge_how",),
+ [
+ ("inner",),
+ ("outer",),
+ ("left",),
+ ("right",),
+ ],
+)
+def test_merge_left_on_right_on(scalars_dfs, merge_how):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ left_columns = ["int64_col", "float64_col", "int64_too"]
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_columns]
+ right = scalars_df[right_columns]
+
+ df = bpd.merge(
+ left, right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True
+ )
+ bf_result = df.to_pandas()
+
+ pd_result = pd.merge(
+ scalars_pandas_df[left_columns],
+ scalars_pandas_df[right_columns],
+ merge_how,
+ left_on="int64_too",
+ right_on="rowindex_2",
+ sort=True,
+ )
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("merge_how",),
+ [
+ ("inner",),
+ ("outer",),
+ ("left",),
+ ("right",),
+ ],
+)
+def test_merge_series(scalars_dfs, merge_how):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ left_column = "int64_too"
+ right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"]
+
+ left = scalars_df[left_column]
+ right = scalars_df[right_columns]
+
+ df = bpd.merge(
+ left, right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True
+ )
+ bf_result = df.to_pandas()
+
+ pd_result = pd.merge(
+ scalars_pandas_df[left_column],
+ scalars_pandas_df[right_columns],
+ merge_how,
+ left_on="int64_too",
+ right_on="rowindex_2",
+ sort=True,
+ )
+
+ assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py
index 96697dbcab..6510c4fa27 100644
--- a/tests/system/small/test_pandas_options.py
+++ b/tests/system/small/test_pandas_options.py
@@ -20,6 +20,7 @@
import google.auth.exceptions
import pytest
+import bigframes.core.global_session
import bigframes.pandas as bpd
@@ -253,49 +254,6 @@ def test_read_gbq_must_comply_with_set_location_non_US(
assert df is not None
-def test_reset_session_after_bq_session_ended():
- # Use a simple test query to verify that default session works to interact
- # with BQ
- test_query = "SELECT 1"
-
- # Confirm that there is a session id in the default session
- session = bpd.get_global_session()
- assert session._session_id
-
- # Confirm that session works as usual
- df = bpd.read_gbq(test_query)
- assert df is not None
-
- # Abort the session to simulate the auto-expiration
- # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session
- abort_session_query = "CALL BQ.ABORT_SESSION()"
- query_job = session.bqclient.query(abort_session_query)
- query_job.result() # blocks until finished
-
- # Confirm that session is unusable to run any jobs
- with pytest.raises(
- google.api_core.exceptions.BadRequest,
- match=f"Session {session._session_id} has expired and is no longer available.",
- ):
- query_job = session.bqclient.query(test_query)
- query_job.result() # blocks until finished
-
- # Confirm that as a result bigframes.pandas interface is unusable
- with pytest.raises(
- google.api_core.exceptions.BadRequest,
- match=f"Session {session._session_id} has expired and is no longer available.",
- ):
- bpd.read_gbq(test_query)
-
- # Now try to reset session and verify that it works
- bpd.reset_session()
- assert bpd._global_session is None
-
- # Now verify that use is able to start over
- df = bpd.read_gbq(test_query)
- assert df is not None
-
-
def test_reset_session_after_credentials_need_reauthentication(monkeypatch):
# Use a simple test query to verify that default session works to interact
# with BQ
@@ -332,7 +290,7 @@ def test_reset_session_after_credentials_need_reauthentication(monkeypatch):
# Now verify that resetting the session works
bpd.reset_session()
- assert bpd._global_session is None
+ assert bigframes.core.global_session._global_session is None
# Now verify that the user is able to start over
df = bpd.read_gbq(test_query)
diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py
index 47d758763b..77fb81d2c9 100644
--- a/tests/system/small/test_remote_function.py
+++ b/tests/system/small/test_remote_function.py
@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from google.cloud import bigquery
+from ibis.backends.bigquery import datatypes as bq_types
+from ibis.expr import datatypes as ibis_types
import pandas as pd
import pytest
import bigframes
-from bigframes.remote_function import read_gbq_function, remote_function
+from bigframes import remote_function as rf
from tests.system.utils import assert_pandas_df_equal_ignore_ordering
@@ -62,9 +65,7 @@ def bq_cf_connection_location_project_mismatched() -> str:
@pytest.fixture(scope="module")
def session_with_bq_connection(bq_cf_connection) -> bigframes.Session:
- return bigframes.Session(
- bigframes.BigQueryOptions(remote_udf_connection=bq_cf_connection)
- )
+ return bigframes.Session(bigframes.BigQueryOptions(bq_connection=bq_cf_connection))
@pytest.fixture(scope="module")
@@ -72,7 +73,7 @@ def session_with_bq_connection_location_specified(
bq_cf_connection_location,
) -> bigframes.Session:
return bigframes.Session(
- bigframes.BigQueryOptions(remote_udf_connection=bq_cf_connection_location)
+ bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location)
)
@@ -81,9 +82,7 @@ def session_with_bq_connection_location_mistached(
bq_cf_connection_location_mistached,
) -> bigframes.Session:
return bigframes.Session(
- bigframes.BigQueryOptions(
- remote_udf_connection=bq_cf_connection_location_mistached
- )
+ bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_mistached)
)
@@ -92,27 +91,37 @@ def session_with_bq_connection_location_project_specified(
bq_cf_connection_location_project,
) -> bigframes.Session:
return bigframes.Session(
- bigframes.BigQueryOptions(
- remote_udf_connection=bq_cf_connection_location_project
- )
+ bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_project)
)
+def test_supported_types_correspond():
+ # The same types should be representable by the supported Python and BigQuery types.
+ ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES}
+ ibis_types_from_bigquery = {
+ bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS
+ }
+
+ assert ibis_types_from_python == ibis_types_from_bigquery
+
+
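The intent of the assertion above: every type a remote function can accept or return must be expressible on both sides of the bridge, with ibis as the meeting point. A hypothetical miniature of the same set comparison (the real sets live in rf.SUPPORTED_IO_PYTHON_TYPES and rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS):

# Hypothetical stand-ins for the supported-type sets, reduced to a common form.
python_side = {int: "int64", float: "float64", bool: "boolean", str: "string"}
bigquery_side = {"INT64": "int64", "FLOAT64": "float64", "BOOL": "boolean", "STRING": "string"}
assert set(python_side.values()) == set(bigquery_side.values())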
@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_direct_no_session_param(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
scalars_dfs,
dataset_id_permanent,
bq_cf_connection,
):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
dataset=dataset_id_permanent,
bigquery_connection=bq_cf_connection,
# See e2e tests for tests that actually deploy the Cloud Function.
@@ -153,16 +162,18 @@ def test_remote_function_direct_no_session_param_location_specified(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
scalars_dfs,
dataset_id_permanent,
bq_cf_connection_location,
):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
dataset=dataset_id_permanent,
bigquery_connection=bq_cf_connection_location,
# See e2e tests for tests that actually deploy the Cloud Function.
@@ -200,17 +211,19 @@ def test_remote_function_direct_no_session_param_location_mismatched(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
dataset_id_permanent,
bq_cf_connection_location_mismatched,
):
with pytest.raises(ValueError):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
dataset=dataset_id_permanent,
bigquery_connection=bq_cf_connection_location_mismatched,
# See e2e tests for tests that actually deploy the Cloud Function.
@@ -225,16 +238,18 @@ def test_remote_function_direct_no_session_param_location_project_specified(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
scalars_dfs,
dataset_id_permanent,
bq_cf_connection_location_project,
):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
dataset=dataset_id_permanent,
bigquery_connection=bq_cf_connection_location_project,
# See e2e tests for tests that actually deploy the Cloud Function.
@@ -272,17 +287,19 @@ def test_remote_function_direct_no_session_param_project_mismatched(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
dataset_id_permanent,
bq_cf_connection_location_project_mismatched,
):
with pytest.raises(ValueError):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
dataset=dataset_id_permanent,
bigquery_connection=bq_cf_connection_location_project_mismatched,
# See e2e tests for tests that actually deploy the Cloud Function.
@@ -294,7 +311,7 @@ def square(x):
@pytest.mark.flaky(retries=2, delay=120)
def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs):
- @remote_function(
+ @rf.remote_function(
[int],
int,
session=session_with_bq_connection,
@@ -409,7 +426,7 @@ def test_remote_function_via_session_context_connection_setter(
# Creating a session scoped only to this test as we would be setting a
# property in it
context = bigframes.BigQueryOptions()
- context.remote_udf_connection = bq_cf_connection
+ context.bq_connection = bq_cf_connection
session = bigframes.connect(context)
# Without an explicit bigquery connection, the one present in Session,
@@ -500,29 +517,62 @@ def add_one(x):
assert_pandas_df_equal_ignore_ordering(bf_result, pd_result)
+@pytest.mark.flaky(retries=2, delay=120)
+def test_series_map(session_with_bq_connection, scalars_dfs):
+ def add_one(x):
+ return x + 1
+
+ remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one)
+
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = scalars_df.int64_too.map(remote_add_one).to_pandas()
+ pd_result = scalars_pandas_df.int64_too.map(add_one)
+ pd_result = pd_result.astype("Int64") # pandas type differences
+
+ pd.testing.assert_series_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+@pytest.mark.flaky(retries=2, delay=120)
+def test_read_gbq_function_detects_invalid_function(bigquery_client, dataset_id):
+ dataset_ref = bigquery.DatasetReference.from_string(dataset_id)
+ with pytest.raises(ValueError) as e:
+ rf.read_gbq_function(
+ str(dataset_ref.routine("not_a_function")),
+ bigquery_client=bigquery_client,
+ )
+
+ assert "Unknown function" in str(e.value)
+
+
@pytest.mark.flaky(retries=2, delay=120)
def test_read_gbq_function_like_original(
bigquery_client,
bigqueryconnection_client,
cloudfunctions_client,
+ resourcemanager_client,
scalars_df_index,
dataset_id_permanent,
bq_cf_connection,
):
- @remote_function(
+ @rf.remote_function(
[int],
int,
bigquery_client=bigquery_client,
bigquery_connection_client=bigqueryconnection_client,
dataset=dataset_id_permanent,
cloud_functions_client=cloudfunctions_client,
+ resource_manager_client=resourcemanager_client,
bigquery_connection=bq_cf_connection,
reuse=True,
)
def square1(x):
return x * x
- square2 = read_gbq_function(
+ square2 = rf.read_gbq_function(
function_name=square1.bigframes_remote_function,
bigquery_client=bigquery_client,
)
@@ -551,3 +601,111 @@ def square1(x):
s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col)
assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas())
+
+
+@pytest.mark.flaky(retries=2, delay=120)
+def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id):
+ dataset_ref = bigquery.DatasetReference.from_string(dataset_id)
+ arg = bigquery.RoutineArgument(
+ name="x",
+ data_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ )
+ sql_routine = bigquery.Routine(
+ dataset_ref.routine("square_sql"),
+ body="x * x",
+ arguments=[arg],
+ return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+ js_routine = bigquery.Routine(
+ dataset_ref.routine("square_js"),
+ body="return x * x",
+ language="JAVASCRIPT",
+ arguments=[arg],
+ return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+
+ for routine in (sql_routine, js_routine):
+ # Create the routine in BigQuery and read it back using read_gbq_function.
+ bigquery_client.create_routine(routine, exists_ok=True)
+ square = rf.read_gbq_function(
+ str(routine.reference), bigquery_client=bigquery_client
+ )
+
+ # It should point to the named routine and yield the expected results.
+ assert square.bigframes_remote_function == str(routine.reference)
+
+ src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]}
+
+ routine_ref_str = rf.routine_ref_to_string_for_query(routine.reference)
+ direct_sql = " UNION ALL ".join(
+ [f"SELECT {x} AS x, {routine_ref_str}({x}) AS y" for x in src["x"]]
+ )
+ direct_df = bigquery_client.query(direct_sql).to_dataframe()
+
+ indirect_df = bigframes.dataframe.DataFrame(src)
+ indirect_df = indirect_df.assign(y=indirect_df.x.apply(square))
+ indirect_df = indirect_df.to_pandas()
+
+ assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df)
+
+
+@pytest.mark.flaky(retries=2, delay=120)
+def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id):
+ dataset_ref = bigquery.DatasetReference.from_string(dataset_id)
+ typed_arg = bigquery.RoutineArgument(
+ name="x",
+ data_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ )
+ untyped_arg = bigquery.RoutineArgument(
+ name="x",
+ kind="ANY_TYPE", # With this kind, data_type not required for SQL functions.
+ )
+
+ both_types_specified = bigquery.Routine(
+ dataset_ref.routine("both_types_specified"),
+ body="x * x",
+ arguments=[typed_arg],
+ return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+ only_return_type_specified = bigquery.Routine(
+ dataset_ref.routine("only_return_type_specified"),
+ body="x * x",
+ arguments=[untyped_arg],
+ return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64),
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+ only_arg_type_specified = bigquery.Routine(
+ dataset_ref.routine("only_arg_type_specified"),
+ body="x * x",
+ arguments=[typed_arg],
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+ neither_type_specified = bigquery.Routine(
+ dataset_ref.routine("neither_type_specified"),
+ body="x * x",
+ arguments=[untyped_arg],
+ type_=bigquery.RoutineType.SCALAR_FUNCTION,
+ )
+
+ bigquery_client.create_routine(both_types_specified, exists_ok=True)
+ bigquery_client.create_routine(only_return_type_specified, exists_ok=True)
+ bigquery_client.create_routine(only_arg_type_specified, exists_ok=True)
+ bigquery_client.create_routine(neither_type_specified, exists_ok=True)
+
+ rf.read_gbq_function(
+ str(both_types_specified.reference), bigquery_client=bigquery_client
+ )
+ rf.read_gbq_function(
+ str(only_return_type_specified.reference), bigquery_client=bigquery_client
+ )
+ with pytest.raises(ValueError):
+ rf.read_gbq_function(
+ str(only_arg_type_specified.reference), bigquery_client=bigquery_client
+ )
+ with pytest.raises(ValueError):
+ rf.read_gbq_function(
+ str(neither_type_specified.reference), bigquery_client=bigquery_client
+ )
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 70c56e5e13..07dc892ddc 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -186,6 +186,23 @@ def test_fillna(scalars_dfs):
)
+@pytest.mark.parametrize(
+ ("ignore_index",),
+ (
+ (True,),
+ (False,),
+ ),
+)
+def test_series_dropna(scalars_dfs, ignore_index):
+ if pd.__version__.startswith("1."):
+ pytest.skip("ignore_index parameter not supported in pandas 1.x.")
+ scalars_df, scalars_pandas_df = scalars_dfs
+ col_name = "string_col"
+ bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas()
+ pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index)
+ pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False)
+
+
def test_series_agg_single_string(scalars_dfs):
scalars_df, scalars_pandas_df = scalars_dfs
bf_result = scalars_df["int64_col"].agg("sum")
@@ -365,6 +382,24 @@ def test_series_int_int_operators_scalar(
assert_series_equal_ignoring_order(pd_result, bf_result)
+def test_series_pow_scalar(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = (scalars_df["int64_col"] ** 2).to_pandas()
+ pd_result = scalars_pandas_df["int64_col"] ** 2
+
+ assert_series_equal_ignoring_order(pd_result, bf_result)
+
+
+def test_series_pow_scalar_reverse(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas()
+ pd_result = 0.8 ** scalars_pandas_df["int64_col"]
+
+ assert_series_equal_ignoring_order(pd_result, bf_result)
+
+
@pytest.mark.parametrize(
("operator"),
[
@@ -459,6 +494,19 @@ def test_mods(scalars_dfs, col_x, col_y, method):
pd.testing.assert_series_equal(pd_result, bf_result)
+# Work around a pandas bug with correlating nullable dtypes: test a simple
+# self-correlation manually instead of parameterizing it like test_mods above.
+def test_corr(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"])
+ pd_result = (
+ scalars_pandas_df["int64_too"]
+ .astype("int64")
+ .corr(scalars_pandas_df["int64_too"].astype("int64"))
+ )
+ assert math.isclose(pd_result, bf_result)
+
+
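The workaround is easy to reproduce outside the fixtures: casting the nullable Int64 down to NumPy int64 before corr() sidesteps the pandas quirk, and any series correlates with itself at 1.0:

import math

import pandas as pd

s = pd.Series([1, 2, 3], dtype="Int64")
# Downcast to NumPy int64 first, then self-correlate.
assert math.isclose(s.astype("int64").corr(s.astype("int64")), 1.0)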
@pytest.mark.parametrize(
("col_x",),
[
@@ -900,7 +948,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs):
pd_result,
)
- bf_sql, _ = bf_series.to_frame()._to_sql_query(always_include_index=True)
+ bf_sql, _, _ = bf_series.to_frame()._to_sql_query(include_index=True)
selects = re.findall("SELECT", bf_sql.upper())
assert 0 < len(selects) < (num_joins // 2)
@@ -2222,8 +2270,9 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index):
assert bf_result == pd_result
-def test_getattr_not_implemented(scalars_df_index):
- with pytest.raises(NotImplementedError):
+def test_getattr_attribute_error_when_pandas_has(scalars_df_index):
+ # asof is implemented in pandas but not in bigframes
+ with pytest.raises(AttributeError):
scalars_df_index.string_col.asof()
@@ -2449,3 +2498,57 @@ def test_is_monotonic_decreasing(series_input):
assert (
scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing
)
+
+
+def test_map_dict_input(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ local_map = dict()
+ # Construct a local map, deliberately incomplete to exercise unmapped values.
+ for s in scalars_pandas_df.string_col[:-3]:
+ if isinstance(s, str):
+ local_map[s] = ord(s[0])
+
+ pd_result = scalars_pandas_df.string_col.map(local_map)
+ pd_result = pd_result.astype("Int64") # pandas type differences
+ bf_result = scalars_df.string_col.map(local_map)
+
+ pd.testing.assert_series_equal(
+ bf_result.to_pandas(),
+ pd_result,
+ )
+
+
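The deliberately incomplete mapping exercises pandas' map() contract: keys absent from the dict produce missing values rather than raising. A minimal pandas example:

import pandas as pd

s = pd.Series(["apple", "banana", "cherry"])
# "banana" and "cherry" are not in the mapping, so they map to NaN/NA.
print(s.map({"apple": ord("a")}))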
+def test_map_series_input(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ new_index = scalars_pandas_df.int64_too.drop_duplicates()
+ pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)]
+ pd_map_series.index = new_index
+ bf_map_series = series.Series(
+ pd_map_series, session=scalars_df._get_block().expr._session
+ )
+
+ pd_result = scalars_pandas_df.int64_too.map(pd_map_series)
+ bf_result = scalars_df.int64_too.map(bf_map_series)
+
+ pd.testing.assert_series_equal(
+ bf_result.to_pandas(),
+ pd_result,
+ )
+
+
+def test_map_series_input_duplicates_error(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ new_index = scalars_pandas_df.int64_too
+ pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)]
+ pd_map_series.index = new_index
+ bf_map_series = series.Series(
+ pd_map_series, session=scalars_df._get_block().expr._session
+ )
+
+ with pytest.raises(pd.errors.InvalidIndexError):
+ scalars_pandas_df.int64_too.map(pd_map_series)
+ with pytest.raises(pd.errors.InvalidIndexError):
+ scalars_df.int64_too.map(bf_map_series, verify_integrity=True)
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
index 2fc34f9bae..b7bee16ffd 100644
--- a/tests/system/small/test_session.py
+++ b/tests/system/small/test_session.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import io
import random
import tempfile
import textwrap
@@ -110,6 +111,54 @@ def test_read_gbq_w_col_order(
["uuid"],
id="unique_uuid_index_query",
),
+ pytest.param(
+ """
+ SELECT my_index, my_value
+ FROM UNNEST(
+ [
+ STRUCT<my_index INT64, my_value INT64>(0, 12),
+ STRUCT<my_index INT64, my_value INT64>(1, 12),
+ STRUCT<my_index INT64, my_value INT64>(2, 24)
+ ]
+ )
+ -- Can't normally cluster tables with ORDER BY clause.
+ ORDER BY my_index DESC
+ """,
+ ["my_index"],
+ id="unique_index_query_has_order_by",
+ ),
+ pytest.param(
+ """
+ WITH my_table AS (
+ SELECT *
+ FROM UNNEST(
+ [
+ STRUCT<my_index INT64, my_value INT64>(0, 12),
+ STRUCT<my_index INT64, my_value INT64>(1, 12),
+ STRUCT<my_index INT64, my_value INT64>(2, 24)
+ ]
+ )
+ )
+ SELECT my_index, my_value FROM my_table
+ """,
+ ["my_index"],
+ id="unique_index_query_with_named_table_expression",
+ ),
+ pytest.param(
+ """
+ CREATE TEMP TABLE test_read_gbq_w_index_col_unique_index_query_with_script
+ AS SELECT * FROM UNNEST(
+ [
+ STRUCT<my_index INT64, my_value INT64>(0, 12),
+ STRUCT<my_index INT64, my_value INT64>(1, 12),
+ STRUCT<my_index INT64, my_value INT64>(2, 24)
+ ]
+ );
+ SELECT my_index, my_value FROM test_read_gbq_w_index_col_unique_index_query_with_script
+ """,
+ ["my_index"],
+ id="unique_index_query_with_script",
+ ),
pytest.param(
"{scalars_table_id}",
["bool_col"],
@@ -220,7 +269,7 @@ def test_read_gbq_w_max_results(
assert bf_result.shape[0] == max_results
-def test_read_gbq_w_script(session, dataset_id: str):
+def test_read_gbq_w_script_no_select(session, dataset_id: str):
ddl = f"""
CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` (
`col_a` INT64,
@@ -251,6 +300,20 @@ def test_read_pandas(session, scalars_dfs):
pd.testing.assert_frame_equal(result, expected)
+def test_read_pandas_col_label_w_space(session: bigframes.Session):
+ expected = pd.DataFrame(
+ {
+ "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+ "Max Speed": [380.0, 370.0, 24.0, 26.0],
+ }
+ )
+ result = session.read_pandas(expected).to_pandas()
+
+ pd.testing.assert_frame_equal(
+ result, expected, check_index_type=False, check_dtype=False
+ )
+
+
def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index):
df = session.read_pandas(scalars_pandas_df_multi_index)
result = df.to_pandas()
@@ -683,6 +746,43 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index, en
assert df.shape[0] == penguins_pandas_df_default_index.shape[0]
+def test_read_pickle_local(session, penguins_pandas_df_default_index, tmp_path):
+ path = tmp_path / "test_read_pickle_local.pkl"
+
+ penguins_pandas_df_default_index.to_pickle(path)
+ df = session.read_pickle(path)
+
+ pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas())
+
+
+def test_read_pickle_buffer(session, penguins_pandas_df_default_index):
+ buffer = io.BytesIO()
+ penguins_pandas_df_default_index.to_pickle(buffer)
+ buffer.seek(0)
+ df = session.read_pickle(buffer)
+
+ pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas())
+
+
+def test_read_pickle_series_buffer(session):
+ pd_series = pd.Series([1, 2, 3, 4, 5], dtype="Int64")
+ buffer = io.BytesIO()
+ pd_series.to_pickle(buffer)
+ buffer.seek(0)
+ bf_series = session.read_pickle(buffer).to_pandas()
+ pd_series.index = pd_series.index.astype("Int64")
+
+ assert (pd_series == bf_series).all()
+
+
+def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder):
+ path = gcs_folder + "test_read_pickle_gcs.pkl"
+ penguins_pandas_df_default_index.to_pickle(path)
+ df = session.read_pickle(path)
+
+ pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas())
+
+
def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder):
scalars_df, _ = scalars_dfs
# Include wildcard so that multiple files can be written/read if > 1 GB.
@@ -717,6 +817,67 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder):
pd.testing.assert_frame_equal(pd_df_in, pd_df_out)
+def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder):
+ scalars_df, _ = scalars_dfs
+ path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json"
+ read_path = path.replace("*", FIRST_FILE)
+ scalars_df.to_json(path, index=False, lines=True, orient="records")
+ df = session.read_json(read_path, lines=True, orient="records", engine="bigquery")
+
+ # BigQuery load job schema auto-detection does not preserve column ordering for JSON.
+ pd.testing.assert_index_equal(
+ df.columns.sort_values(), scalars_df.columns.sort_values()
+ )
+
+ # BigQuery load job auto-detection has trouble detecting the bytes,
+ # datetime, numeric and geography types, so those columns are skipped here.
+ df = df.drop(columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"])
+ scalars_df = scalars_df.drop(
+ columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"]
+ )
+ assert df.shape[0] == scalars_df.shape[0]
+ pd.testing.assert_series_equal(
+ df.dtypes.sort_index(), scalars_df.dtypes.sort_index()
+ )
+
+
+def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder):
+ scalars_df, _ = scalars_dfs
+ path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json"
+ read_path = path.replace("*", FIRST_FILE)
+ scalars_df.to_json(
+ path,
+ index=False,
+ lines=True,
+ orient="records",
+ )
+ dtype = scalars_df.dtypes.to_dict()
+ dtype.pop("geography_col")
+
+ df = session.read_json(
+ read_path,
+ # Convert default pandas dtypes to match BigQuery DataFrames dtypes.
+ dtype=dtype,
+ lines=True,
+ orient="records",
+ )
+
+ assert df._block._expr._ordering is not None
+ pd.testing.assert_index_equal(df.columns, scalars_df.columns)
+
+ # Schema auto-detection cannot reliably handle the bytes, numeric, and
+ # geography types, so those columns are skipped here.
+ df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"])
+ scalars_df = scalars_df.drop(columns=["bytes_col", "numeric_col", "geography_col"])
+
+ # pandas read_json does not respect the dtype overrides for these columns
+ df = df.drop(columns=["date_col", "datetime_col", "time_col"])
+ scalars_df = scalars_df.drop(columns=["date_col", "datetime_col", "time_col"])
+
+ assert df.shape[0] == scalars_df.shape[0]
+ pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes)
+
+
def test_session_id(session):
assert session._session_id is not None
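As the two JSON tests above show, `read_json` accepts either the default (pandas-based) engine or `engine="bigquery"`, which delegates parsing and schema detection to a BigQuery load job. A hedged sketch of the difference; the path and session setup are assumptions:

import bigframes

session = bigframes.Session()
path = "gs://my-bucket/data.json"  # hypothetical newline-delimited JSON file

# BigQuery engine: schema is auto-detected server-side; column order and some
# types (bytes, datetime, numeric, geography) may not round-trip exactly.
df_bq = session.read_json(path, lines=True, orient="records", engine="bigquery")

# Default engine: parsed client-side with pandas, so explicit dtypes can be
# passed to match BigQuery DataFrames' nullable dtypes.
df_default = session.read_json(
    path, lines=True, orient="records", dtype={"a": "Int64"}
)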
diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py
index 43b5663bf7..aeee058319 100644
--- a/tests/unit/_config/test_bigquery_options.py
+++ b/tests/unit/_config/test_bigquery_options.py
@@ -26,7 +26,7 @@
("credentials", object(), object()),
("location", "us-east1", "us-central1"),
("project", "my-project", "my-other-project"),
- ("remote_udf_connection", "path/to/connection/1", "path/to/connection/2"),
+ ("bq_connection", "path/to/connection/1", "path/to/connection/2"),
],
)
def test_setter_raises_if_session_started(attribute, original_value, new_value):
@@ -56,7 +56,7 @@ def test_setter_raises_if_session_started(attribute, original_value, new_value):
"credentials",
"location",
"project",
- "remote_udf_connection",
+ "bq_connection",
]
],
)
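The rename from `remote_udf_connection` to `bq_connection` reflects that the connection is now a general BigQuery connection option rather than one scoped to remote UDFs. A sketch of how the option is set, assuming it is exposed through `bigframes.pandas.options` like the other attributes in this parametrized list:

import bigframes.pandas as bpd

# Must be set before the first query starts the session; afterwards the
# setter raises, as test_setter_raises_if_session_started verifies.
bpd.options.bigquery.project = "my-project"
bpd.options.bigquery.location = "us-central1"
bpd.options.bigquery.bq_connection = "path/to/connection/1"  # hypothetical value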
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
deleted file mode 100644
index dcf2d918a5..0000000000
--- a/tests/unit/conftest.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Callable, Optional, Tuple, Union
-from unittest import mock
-
-import google.api_core.exceptions
-import google.auth
-import google.cloud.bigquery as bigquery
-import google.cloud.bigquery.table
-import google.oauth2.credentials # type: ignore
-import ibis.expr.types as ibis_types
-import pandas
-import pytest
-
-import bigframes
-import bigframes.core
-import bigframes.dataframe
-
-SCALARS_TABLE_ID = "project.dataset.scalars_table"
-
-
-@pytest.fixture
-def scalars_pandas_df_default_index() -> pandas.DataFrame:
- # Note: as of 2023-02-07, using nullable dtypes with the ibis pandas
- # backend requires running ibis at HEAD. See:
- # https://github.com/ibis-project/ibis/pull/5345
- return pandas.DataFrame(
- {
- "rowindex": pandas.Series(
- [
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 6,
- 7,
- 8,
- 9,
- ],
- dtype="Int64",
- ),
- "bool_col": pandas.Series(
- [
- True,
- None,
- False,
- True,
- None,
- False,
- True,
- None,
- False,
- True,
- ],
- dtype="boolean",
- ),
- "int64_col": pandas.Series(
- [
- 1,
- 2,
- 3,
- None,
- 0,
- -1,
- -2,
- 2**63 - 1,
- -(2**63),
- None,
- ],
- dtype="Int64",
- ),
- "float64_col": pandas.Series(
- [
- None,
- 1,
- math.pi,
- math.e * 1e10,
- 0,
- float("nan"),
- float("inf"),
- float("-inf"),
- -2.23e-308,
- 1.8e308,
- ],
- dtype="Float64",
- ),
- "string_col": pandas.Series(
- [
- "abc",
- "XYZ",
- "aBcDeFgHiJkLmNoPqRsTuVwXyZ",
- "1_2-3+4=5~6*7/8&9%10#11@12$" "",
- None,
- "こんにちは",
- "你好",
- "வணக்கம்",
- "שלום",
- ],
- dtype="string[pyarrow]",
- ),
- }
- )
-
-
-# We parameterize the fixtures at this point with the real pandas
-# dataframes and deferred bigframes dataframes as we have the following
-# chain of dependencies:
-# -> index/default_index parameterization
-# -> pandas dataframe
-# -> bqclient mock
-# -> session
-# -> bigframes dataframe
-@pytest.fixture
-def scalars_testdata_setup(
- scalars_pandas_df_default_index,
-) -> Tuple[
- pandas.DataFrame, Callable[[bigframes.Session], bigframes.dataframe.DataFrame]
-]:
- return (
- scalars_pandas_df_default_index.set_index("rowindex"),
- lambda session: session.read_gbq(SCALARS_TABLE_ID, index_col=["rowindex"]),
- )
-
-
-@pytest.fixture(autouse=True)
-def mock_bigquery_client(monkeypatch, scalars_testdata_setup) -> bigquery.Client:
- scalars_pandas_df, _ = scalars_testdata_setup
- mock_client = mock.create_autospec(bigquery.Client)
- # Constructor returns the mock itself, so this mock can be treated as the
- # constructor or the instance.
- mock_client.return_value = mock_client
- mock_client.project = "default-project"
- most_recent_table = None
-
- def mock_bigquery_client_get_table(
- table_ref: Union[google.cloud.bigquery.table.TableReference, str]
- ):
- global most_recent_table
-
- if isinstance(table_ref, google.cloud.bigquery.table.TableReference):
- table_name = table_ref.__str__()
- else:
- table_name = table_ref
-
- schema = [
- {"mode": "NULLABLE", "name": "rowindex", "type": "INTEGER"},
- {
- "mode": "NULLABLE",
- "name": "bigframes_ordering_id",
- "type": "INTEGER",
- },
- ]
-
- if table_name == SCALARS_TABLE_ID:
- schema += [
- {"mode": "NULLABLE", "name": "bool_col", "type": "BOOL"},
- {"mode": "NULLABLE", "name": "int64_col", "type": "INTEGER"},
- {"mode": "NULLABLE", "name": "float64_col", "type": "FLOAT"},
- {"mode": "NULLABLE", "name": "string_col", "type": "STRING"},
- ]
- else:
- raise google.api_core.exceptions.NotFound("Not Found Table")
-
- most_recent_table = bigquery.Table(table_name, schema) # type: ignore
- return most_recent_table # type: ignore
-
- def mock_query(
- sql: str,
- job_config: Optional[bigquery.QueryJobConfig] = None,
- location: str = "US",
- ) -> bigquery.QueryJob:
- global most_recent_table
-
- def mock_result(max_results=None):
- mock_rows = mock.create_autospec(google.cloud.bigquery.table.RowIterator)
- mock_rows.total_rows = len(scalars_pandas_df.index)
- mock_rows.schema = [
- bigquery.SchemaField(name=name, field_type="INT64")
- for name in scalars_pandas_df.columns
- ]
- # Use scalars_pandas_df instead of ibis_expr.execute() to preserve dtypes.
- mock_rows.to_dataframe.return_value = scalars_pandas_df.head(n=max_results)
- return mock_rows
-
- mock_job = mock.create_autospec(bigquery.QueryJob)
- mock_job.result = mock_result
- return mock_job
-
- mock_client.get_table = mock_bigquery_client_get_table
- mock_client.query.side_effect = mock_query
- monkeypatch.setattr(bigquery, "Client", mock_client)
- mock_client.reset_mock()
- return mock_client
-
-
-@pytest.fixture
-def session() -> bigframes.Session:
- return bigframes.Session(
- context=bigframes.BigQueryOptions(
- credentials=mock.create_autospec(google.oauth2.credentials.Credentials),
- project="unit-test-project",
- )
- )
-
-
-@pytest.fixture
-def scalars_ibis_table(session) -> ibis_types.Table:
- return session.ibis_client.table(SCALARS_TABLE_ID)
diff --git a/tests/unit/core/test_bf_utils.py b/tests/unit/core/test_bf_utils.py
new file mode 100644
index 0000000000..fc34f35d9c
--- /dev/null
+++ b/tests/unit/core/test_bf_utils.py
@@ -0,0 +1,56 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from bigframes.core import utils
+
+
+def test_get_standardized_ids_columns():
+ col_labels = ["string", 0, None, "duplicate", "duplicate", "with space"]
+
+ col_ids, idx_ids = utils.get_standardized_ids(col_labels)
+
+ assert col_ids == [
+ "string",
+ "0",
+ utils.UNNAMED_COLUMN_ID,
+ "duplicate",
+ "duplicate.1",
+ "with_space",
+ ]
+ assert idx_ids == []
+
+
+def test_get_standardized_ids_indexes():
+ col_labels = ["duplicate"]
+ idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"]
+
+ col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels)
+
+ assert col_ids == ["duplicate.2"]
+ assert idx_ids == [
+ "string",
+ "0",
+ utils.UNNAMED_INDEX_ID,
+ "duplicate",
+ "duplicate.1",
+ "with_space",
+ ]
+
+
+def test_get_standardized_ids_tuple():
+ col_labels = [("foo", 1), ("foo", 2), ("bar", 1)]
+
+ col_ids, _ = utils.get_standardized_ids(col_labels)
+
+ assert col_ids == ["('foo',_1)", "('foo',_2)", "('bar',_1)"]
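These tests pin down the label-sanitization contract: non-string labels are stringified, `None` becomes a reserved unnamed-column/index id, duplicates get positional `.N` suffixes counted across index and column labels together, and spaces become underscores. A small interactive sketch mirroring the tested behavior:

from bigframes.core import utils

col_ids, idx_ids = utils.get_standardized_ids(
    ["string", 0, None, "duplicate", "duplicate", "with space"]
)
# col_ids == ["string", "0", utils.UNNAMED_COLUMN_ID,
#             "duplicate", "duplicate.1", "with_space"]
# idx_ids == []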
diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py
index d8c8a2d108..c20a17f7d6 100644
--- a/tests/unit/ml/test_sql.py
+++ b/tests/unit/ml/test_sql.py
@@ -162,3 +162,18 @@ def test_ml_generate_text_produces_correct_sql():
== """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_dataset.my_model`,
(SELECT * FROM my_table), STRUCT(value AS item))"""
)
+
+
+def test_ml_principal_components_produces_correct_sql():
+ sql = ml_sql.ml_principal_components(model_name="my_dataset.my_model")
+ assert (
+ sql == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_dataset.my_model`)"""
+ )
+
+
+def test_ml_principal_component_info_produces_correct_sql():
+ sql = ml_sql.ml_principal_component_info(model_name="my_dataset.my_model")
+ assert (
+ sql
+ == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_dataset.my_model`)"""
+ )
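The two new helpers emit plain BigQuery ML statements, so the generated SQL can be run with any BigQuery client. A hedged sketch of wiring it up with `google-cloud-bigquery`; the import alias follows the test file and the model name is illustrative:

from google.cloud import bigquery

import bigframes.ml.sql as ml_sql  # import path assumed from the tests

client = bigquery.Client()
sql = ml_sql.ml_principal_component_info(model_name="my_dataset.my_model")
result = client.query(sql).result().to_dataframe()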
diff --git a/tests/unit/resources.py b/tests/unit/resources.py
new file mode 100644
index 0000000000..c8ed6e86ed
--- /dev/null
+++ b/tests/unit/resources.py
@@ -0,0 +1,73 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for creating test resources."""
+
+from typing import Dict, List, Optional
+import unittest.mock as mock
+
+import google.auth.credentials
+import google.cloud.bigquery
+import ibis
+import pandas
+
+import bigframes
+import bigframes.core as core
+
+"""Utilities for creating test resources."""
+
+
+def create_bigquery_session(
+ bqclient: Optional[google.cloud.bigquery.Client] = None, session_id: str = "abcxyz"
+) -> bigframes.Session:
+ credentials = mock.create_autospec(
+ google.auth.credentials.Credentials, instance=True
+ )
+
+ if bqclient is None:
+ bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
+ bqclient.project = "test-project"
+
+ clients_provider = mock.create_autospec(bigframes.session.ClientsProvider)
+ type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient)
+ clients_provider._credentials = credentials
+
+ bqoptions = bigframes.BigQueryOptions(
+ credentials=credentials, location="test-region"
+ )
+ session = bigframes.Session(context=bqoptions, clients_provider=clients_provider)
+ session._session_id = session_id
+ return session
+
+
+def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session:
+ # TODO(tswast): Refactor to make helper available for all tests. Consider
+ # providing a proper "local Session" for use by downstream developers.
+ session = mock.create_autospec(bigframes.Session, instance=True)
+ ibis_client = ibis.pandas.connect(tables)
+ type(session).ibis_client = mock.PropertyMock(return_value=ibis_client)
+ return session
+
+
+def create_arrayvalue(
+ df: pandas.DataFrame, total_ordering_columns: List[str]
+) -> bigframes.core.ArrayValue:
+ session = create_pandas_session({"test_table": df})
+ ibis_table = session.ibis_client.table("test_table")
+ columns = tuple(ibis_table[key] for key in ibis_table.columns)
+ ordering = core.ExpressionOrdering(
+ [core.OrderingColumnReference(column) for column in total_ordering_columns],
+ total_ordering_columns=frozenset(total_ordering_columns),
+ )
+ return core.ArrayValue(
+ session=session, table=ibis_table, columns=columns, ordering=ordering
+ )
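With `resources.py` in place, unit tests can build fake sessions without the autouse mock machinery that the deleted `conftest.py` provided. A sketch of a new-style test using these helpers (test name and data are illustrative):

import pandas

from . import resources  # as used by tests/unit/test_core.py


def test_arrayvalue_tracks_columns():
    value = resources.create_arrayvalue(
        pandas.DataFrame({"col1": [1, 2], "col2": ["a", "b"]}),
        total_ordering_columns=["col1"],
    )
    assert len(value.columns) == 2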
diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py
index 123dae7939..e01638e22e 100644
--- a/tests/unit/test_core.py
+++ b/tests/unit/test_core.py
@@ -13,39 +13,55 @@
# limitations under the License.
import ibis
-from ibis.expr.types import Table
+import pandas
-from bigframes import core
+import bigframes.core as core
-ORDERING = core.ExpressionOrdering(
- [
- core.OrderingColumnReference("int64_col"),
- core.OrderingColumnReference("string_col"),
- ],
- total_ordering_columns=frozenset(["int64_col", "string_col"]),
-)
+from . import resources
-def test_constructor_from_ibis_table_adds_all_columns(
- session, scalars_ibis_table: Table
-):
- columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns)
+def test_arrayvalue_constructor_from_ibis_table_adds_all_columns():
+ session = resources.create_pandas_session(
+ {
+ "test_table": pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "not_included": [True, False, True],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ )
+ }
+ )
+ ibis_table = session.ibis_client.table("test_table")
+ columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"])
+ ordering = core.ExpressionOrdering(
+ [core.OrderingColumnReference("col1")],
+ total_ordering_columns=frozenset(["col1"]),
+ )
actual = core.ArrayValue(
- session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING
+ session=session, table=ibis_table, columns=columns, ordering=ordering
)
- assert actual._table is scalars_ibis_table
- assert len(actual._columns) == len(scalars_ibis_table.columns)
+ assert actual.table is ibis_table
+ assert len(actual.columns) == 3
-def test_to_ibis_expr_with_projection(session, scalars_ibis_table: Table):
- columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns)
- expr = core.ArrayValue(
- session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING
- ).projection(
+def test_arrayvalue_to_ibis_expr_with_projection():
+ value = resources.create_arrayvalue(
+ pandas.DataFrame(
+ {
+ "col1": [1, 2, 3],
+ "col2": ["a", "b", "c"],
+ "col3": [0.1, 0.2, 0.3],
+ }
+ ),
+ total_ordering_columns=["col1"],
+ )
+ expr = value.projection(
[
- scalars_ibis_table["int64_col"],
+ (value.table["col1"] + ibis.literal(-1)).name("int64_col"),
ibis.literal(123456789).name("literals"),
- scalars_ibis_table["string_col"],
+ value.table["col2"].name("string_col"),
]
)
actual = expr.to_ibis_expr()
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py
index dafed08980..bb8ae570dc 100644
--- a/tests/unit/test_dtypes.py
+++ b/tests/unit/test_dtypes.py
@@ -175,9 +175,12 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal():
def test_remote_function_io_types_are_supported_bigframes_types():
+ from ibis.expr.datatypes.core import dtype as python_type_to_ibis_type
+
from bigframes.remote_function import (
- _supported_io_ibis_types as rf_supported_io_ibis_types,
+ SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types,
)
- for ibis_type in rf_supported_io_ibis_types:
+ for python_type in rf_supported_io_types:
+ ibis_type = python_type_to_ibis_type(python_type)
assert ibis_type in bigframes.dtypes.IBIS_TO_BIGFRAMES
diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py
index a178a45438..2325fc96a0 100644
--- a/tests/unit/test_pandas.py
+++ b/tests/unit/test_pandas.py
@@ -17,12 +17,17 @@
import sys
import unittest.mock as mock
+import google.api_core.exceptions
+import google.cloud.bigquery
import pandas as pd
import pytest
+import bigframes.core.global_session
import bigframes.pandas as bpd
import bigframes.session
+from . import resources
+
leading_whitespace = re.compile(r"^\s+", flags=re.MULTILINE)
@@ -109,3 +114,37 @@ def test_pandas_attribute():
assert bpd.Int64Dtype is pd.Int64Dtype
assert bpd.StringDtype is pd.StringDtype
assert bpd.ArrowDtype is pd.ArrowDtype
+
+
+def test_reset_session_after_bq_session_ended(monkeypatch):
+ bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
+ bqclient.project = "test-project"
+ session = resources.create_bigquery_session(
+ bqclient=bqclient, session_id="JUST_A_TEST"
+ )
+
+ # Simulate that the session has already expired.
+ # Note: this needs to be done after the Session is constructed, as the
+ # initializer sends a query to start the BigQuery Session.
+ query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True)
+ query_job.result.side_effect = google.api_core.exceptions.BadRequest(
+ "Session JUST_A_TEST has expired and is no longer available."
+ )
+ bqclient.query.return_value = query_job
+
+ # Simulate that the session has already started.
+ monkeypatch.setattr(bigframes.core.global_session, "_global_session", session)
+ bpd.options.bigquery._session_started = True
+
+ # Confirm that as a result bigframes.pandas interface is unusable
+ with pytest.raises(
+ google.api_core.exceptions.BadRequest,
+ match="Session JUST_A_TEST has expired and is no longer available.",
+ ):
+ bpd.read_gbq("SELECT 1")
+
+ # Even though the query to stop the session raises an exception, we should
+ # still be able to reset it without raising an error to the user.
+ bpd.reset_session()
+ assert "CALL BQ.ABORT_SESSION('JUST_A_TEST')" in bqclient.query.call_args.args[0]
+ assert bigframes.core.global_session._global_session is None
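The test encodes a recovery path worth spelling out: once the backing BigQuery session expires, every `bigframes.pandas` call fails, but `reset_session()` still succeeds (an error from the ABORT_SESSION call is swallowed) and clears the global session. A user-facing sketch:

import google.api_core.exceptions

import bigframes.pandas as bpd

try:
    df = bpd.read_gbq("SELECT 1 AS x")
except google.api_core.exceptions.BadRequest:
    # e.g. "Session ... has expired and is no longer available."
    bpd.reset_session()  # safe even if aborting the old session fails
    df = bpd.read_gbq("SELECT 1 AS x")  # a fresh session is created on demand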
diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py
index ab573c4c11..e39a316e5b 100644
--- a/tests/unit/test_session.py
+++ b/tests/unit/test_session.py
@@ -20,9 +20,13 @@
import bigframes
+from . import resources
+
@pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")])
-def test_read_gbq_missing_parts(session, missing_parts_table_id):
+def test_read_gbq_missing_parts(missing_parts_table_id):
+ session = resources.create_bigquery_session()
+
with pytest.raises(ValueError):
session.read_gbq(missing_parts_table_id)
@@ -31,7 +35,14 @@ def test_read_gbq_missing_parts(session, missing_parts_table_id):
"not_found_table_id",
[("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")],
)
-def test_read_gdb_not_found_tables(session, not_found_table_id):
+def test_read_gbq_not_found_tables(not_found_table_id):
+ bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)
+ bqclient.project = "test-project"
+ bqclient.get_table.side_effect = google.api_core.exceptions.NotFound(
+ "table not found"
+ )
+ session = resources.create_bigquery_session(bqclient=bqclient)
+
with pytest.raises(google.api_core.exceptions.NotFound):
session.read_gbq(not_found_table_id)
diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 653b65c834..8c81b23b6c 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -15,6 +15,7 @@
import numpy
+from bigframes import constants
from third_party.bigframes_vendored.pandas.core.generic import NDFrame
# -----------------------------------------------------------------------
@@ -33,7 +34,7 @@ class DataFrame(NDFrame):
@property
def shape(self) -> tuple[int, int]:
"""Return a tuple representing the dimensionality of the DataFrame."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def axes(self) -> list:
@@ -44,6 +45,7 @@ def axes(self) -> list:
They are returned in that order.
Examples
+
.. code-block::
df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
@@ -66,7 +68,7 @@ def values(self) -> numpy.ndarray:
na_value (default None):
The value to use for missing values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# IO methods (to / from other formats)
@@ -89,7 +91,7 @@ def to_numpy(
Returns:
numpy.ndarray: The converted NumPy array.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_gbq(
self,
@@ -123,7 +125,7 @@ def to_gbq(
If set, write the ordering of the DataFrame as a column in the
result table with this name.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_parquet(
self,
@@ -150,7 +152,7 @@ def to_parquet(
Returns:
None.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Unsorted
@@ -175,10 +177,10 @@ def assign(self, **kwargs) -> DataFrame:
are simply assigned to the column.
Returns:
- DataFrame: A new DataFrame with the new columns in addition to
- all the existing columns.
+ bigframes.dataframe.DataFrame: A new DataFrame with the new columns
+ in addition to all the existing columns.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Reindexing and alignment
@@ -205,12 +207,12 @@ def drop(
level:
For MultiIndex, level from which the labels will be removed.
Returns:
- DataFrame: DataFrame without the removed column labels.
+ bigframes.dataframe.DataFrame: DataFrame without the removed column labels.
Raises:
KeyError: If any of the labels is not found in the selected axis.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rename(
self,
@@ -227,18 +229,18 @@ def rename(
Dict-like from old column labels to new column labels.
Returns:
- DataFrame: DataFrame with the renamed axis labels.
+ bigframes.dataframe.DataFrame: DataFrame with the renamed axis labels.
Raises:
KeyError: If any of the labels is not found.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame:
"""
Set the name of the axis for the index.
- .. Note::
+ .. note::
Currently only accepts a single string parameter (the new name of the index).
@@ -247,9 +249,9 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame:
Value to set the axis name attribute.
Returns:
- DataFrame: DataFrame with the new index name
+ bigframes.dataframe.DataFrame: DataFrame with the new index name
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def set_index(
self,
@@ -272,7 +274,7 @@ def set_index(
Returns:
DataFrame: Changed row labels.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def reorder_levels(self, order: Sequence[int | str]) -> DataFrame:
"""
@@ -286,7 +288,7 @@ def reorder_levels(self, order: Sequence[int | str]) -> DataFrame:
Returns:
DataFrame: DataFrame of rearranged index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def droplevel(self, level):
"""
@@ -300,7 +302,7 @@ def droplevel(self, level):
Returns:
DataFrame: DataFrame with requested index / column level(s) removed.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def reset_index(
self,
@@ -317,9 +319,9 @@ def reset_index(
the index to the default integer index.
Returns:
- DataFrame: DataFrame with the new index.
+ bigframes.dataframe.DataFrame: DataFrame with the new index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def drop_duplicates(
self,
@@ -344,9 +346,9 @@ def drop_duplicates(
- ``False`` : Drop all duplicates.
Returns:
- DataFrame: DataFrame with duplicates removed
+ bigframes.dataframe.DataFrame: DataFrame with duplicates removed
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def duplicated(self, subset=None, keep="first"):
"""
@@ -366,9 +368,9 @@ def duplicated(self, subset=None, keep="first"):
- False : Mark all duplicates as ``True``.
Returns:
- Boolean series for each duplicated rows.
+ bigframes.series.Series: Boolean series for each duplicated row.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Reindex-based selection methods
@@ -378,10 +380,43 @@ def dropna(
) -> DataFrame:
"""Remove missing values.
+ Args:
+ axis ({0 or 'index', 1 or 'columns'}, default 0):
+ Determine if rows or columns which contain missing values are
+ removed.
+
+ * 0, or 'index' : Drop rows which contain missing values.
+ * 1, or 'columns' : Drop columns which contain missing values.
+ how ({'any', 'all'}, default 'any'):
+ Determine if row or column is removed from DataFrame, when we have
+ at least one NA or all NA.
+
+ * 'any' : If any NA values are present, drop that row or column.
+ * 'all' : If all values are NA, drop that row or column.
+ ignore_index (bool, default ``False``):
+ If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+
+ Returns:
+ bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isin(self, values):
+ """
+ Whether each element in the DataFrame is contained in values.
+
+ Args:
+ values (iterable, or dict):
+ The result will only be true at a location if all the
+ labels match. If `values` is a dict, the keys must be
+ the column names, which must match.
+
Returns:
- DataFrame: DataFrame with NA entries dropped from it.
+ DataFrame: DataFrame of booleans showing whether each element
+ in the DataFrame is contained in values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Sorting
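Since these are abstract stubs, the vendored docstring is the whole contract. The `isin` semantics mirror pandas, so an illustration of the two accepted `values` shapes may help (pandas shown for concreteness; BigQuery DataFrames follows the same contract):

import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]})

df.isin([0, 2])                 # elementwise membership against an iterable
df.isin({"num_wings": [0, 3]})  # dict keys restrict the check to named columns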
@@ -414,7 +449,7 @@ def sort_values(
Returns:
DataFrame with sorted values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sort_index(
self,
@@ -424,7 +459,7 @@ def sort_index(
Returns:
The original DataFrame sorted by the labels.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Arithmetic Methods
@@ -449,7 +484,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
Result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def ne(self, other, axis: str | int = "columns") -> DataFrame:
"""
@@ -470,7 +505,7 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: Result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def le(self, other, axis: str | int = "columns") -> DataFrame:
"""Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`).
@@ -496,7 +531,7 @@ def le(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame of bool. The result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def lt(self, other, axis: str | int = "columns") -> DataFrame:
"""Get 'less than' of DataFrame and other, element-wise (binary operator `<`).
@@ -522,7 +557,7 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame of bool. The result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def ge(self, other, axis: str | int = "columns") -> DataFrame:
"""Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`).
@@ -548,7 +583,7 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame of bool. The result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def gt(self, other, axis: str | int = "columns") -> DataFrame:
"""Get 'greater than' of DataFrame and other, element-wise (binary operator `>`).
@@ -574,7 +609,7 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame of bool: The result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def add(self, other, axis: str | int = "columns") -> DataFrame:
"""Get addition of DataFrame and other, element-wise (binary operator `+`).
@@ -597,7 +632,7 @@ def add(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sub(self, other, axis: str | int = "columns") -> DataFrame:
"""Get subtraction of DataFrame and other, element-wise (binary operator `-`).
@@ -620,7 +655,7 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rsub(self, other, axis: str | int = "columns") -> DataFrame:
"""Get subtraction of DataFrame and other, element-wise (binary operator `-`).
@@ -643,7 +678,7 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mul(self, other, axis: str | int = "columns") -> DataFrame:
"""Get multiplication of DataFrame and other, element-wise (binary operator `*`).
@@ -666,7 +701,7 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def truediv(self, other, axis: str | int = "columns") -> DataFrame:
"""Get floating division of DataFrame and other, element-wise (binary operator `/`).
@@ -689,7 +724,7 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rtruediv(self, other, axis: str | int = "columns") -> DataFrame:
"""Get floating division of DataFrame and other, element-wise (binary operator `/`).
@@ -712,7 +747,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def floordiv(self, other, axis: str | int = "columns") -> DataFrame:
"""Get integer division of DataFrame and other, element-wise (binary operator `//`).
@@ -735,7 +770,7 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame:
"""Get integer division of DataFrame and other, element-wise (binary operator `//`).
@@ -758,7 +793,7 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mod(self, other, axis: str | int = "columns") -> DataFrame:
"""Get modulo of DataFrame and other, element-wise (binary operator `%`).
@@ -781,7 +816,7 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rmod(self, other, axis: str | int = "columns") -> DataFrame:
"""Get modulo of DataFrame and other, element-wise (binary operator `%`).
@@ -804,7 +839,55 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame:
Returns:
DataFrame: DataFrame result of the arithmetic operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def pow(self, other, axis: str | int = "columns") -> DataFrame:
+ """Get Exponential power of dataframe and other, element-wise (binary operator `pow`).
+
+ Equivalent to ``dataframe ** other``, but with support to substitute a fill_value
+ for missing data in one of the inputs. With reverse version, `rpow`.
+
+ Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to
+ arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.
+
+ .. note::
+ Mismatched indices will be unioned together.
+
+ Args:
+ other (float, int, or Series):
+ Any single or multiple element data structure, or list-like object.
+ axis ({0 or 'index', 1 or 'columns'}):
+ Whether to compare by the index (0 or 'index') or columns.
+ (1 or 'columns'). For Series input, axis to match Series index on.
+
+ Returns:
+ DataFrame: DataFrame result of the arithmetic operation.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def rpow(self, other, axis: str | int = "columns") -> DataFrame:
+ """Get Exponential power of dataframe and other, element-wise (binary operator `rpow`).
+
+ Equivalent to ``other ** dataframe``, but with support to substitute a fill_value
+ for missing data in one of the inputs. With reverse version, `pow`.
+
+ Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to
+ arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`.
+
+ .. note::
+ Mismatched indices will be unioned together.
+
+ Args:
+ other (float, int, or Series):
+ Any single or multiple element data structure, or list-like object.
+ axis ({0 or 'index', 1 or 'columns'}):
+ Whether to compare by the index (0 or 'index') or columns.
+ (1 or 'columns'). For Series input, axis to match Series index on.
+
+ Returns:
+ DataFrame: DataFrame result of the arithmetic operation.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Data reshaping
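A quick illustration of the `pow`/`rpow` direction convention documented above (pandas shown; BigQuery DataFrames follows the same contract):

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

df.pow(2)   # dataframe ** other -> [1, 4, 9]
df.rpow(2)  # other ** dataframe -> [2, 4, 8]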
@@ -844,9 +927,9 @@ def groupby(
values will also be treated as the key in groups.
Returns:
- A groupby object that contains information about the groups.
+ bigframes.core.groupby.DataFrameGroupBy: A groupby object that contains information about the groups.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Function application
@@ -871,9 +954,9 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame:
values, without passing them to func.
Returns:
- DataFrame: Transformed DataFrame.
+ bigframes.dataframe.DataFrame: Transformed DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Merging / joining methods
@@ -899,9 +982,9 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame:
index, preserving the order of the calling's one.
Returns:
- DataFrame: A dataframe containing columns from both the caller and `other`.
+ bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def merge(
self,
@@ -969,9 +1052,9 @@ def merge(
no suffix. At least one of the values must not be None.
Returns:
- DataFrame: A DataFrame of the two merged objects.
+ bigframes.dataframe.DataFrame: A DataFrame of the two merged objects.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# ndarray-like stats methods
@@ -991,7 +1074,7 @@ def any(self, *, bool_only: bool = False):
Returns:
Series
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def all(self, *, bool_only: bool = False):
"""
@@ -1006,9 +1089,9 @@ def all(self, *, bool_only: bool = False):
Include only boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series indicating whether all elements are True in each column.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def prod(self, *, numeric_only: bool = False):
"""
@@ -1019,9 +1102,9 @@ def prod(self, *, numeric_only: bool = False):
Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with the product of the values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def min(self, *, numeric_only: bool = False):
"""Return the minimum of the values over the requested axis.
@@ -1034,9 +1117,9 @@ def min(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with the minimum of the values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def max(self, *, numeric_only: bool = False):
"""Return the maximum of the values over the requested axis.
@@ -1049,9 +1132,9 @@ def max(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with the maximum of the values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sum(self, *, numeric_only: bool = False):
"""Return the sum of the values over the requested axis.
@@ -1063,9 +1146,9 @@ def sum(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with the sum of values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mean(self, *, numeric_only: bool = False):
"""Return the mean of the values over the requested axis.
@@ -1075,9 +1158,9 @@ def mean(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with the mean of values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def median(self, *, numeric_only: bool = False, exact: bool = False):
"""Return the median of the values over the requested axis.
@@ -1090,9 +1173,9 @@ def median(self, *, numeric_only: bool = False, exact: bool = False):
one. Note: ``exact=True`` not yet supported.
Returns:
- Series
+ bigframes.series.Series: Series with the median of values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def var(self, *, numeric_only: bool = False):
"""Return unbiased variance over requested axis.
@@ -1104,9 +1187,9 @@ def var(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with unbiased variance over requested axis.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def std(self, *, numeric_only: bool = False):
"""Return sample standard deviation over requested axis.
@@ -1118,9 +1201,9 @@ def std(self, *, numeric_only: bool = False):
Default False. Include only float, int, boolean columns.
Returns:
- Series
+ bigframes.series.Series: Series with sample standard deviation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def count(self, *, numeric_only: bool = False):
"""
@@ -1134,21 +1217,19 @@ def count(self, *, numeric_only: bool = False):
Include only `float`, `int` or `boolean` data.
Returns:
- For each column/row the number of non-NA/null entries.
- If `level` is specified returns a `DataFrame`.
+ bigframes.series.Series: For each column/row the number of
+ non-NA/null entries.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def nunique(self):
"""
Count number of distinct elements in specified axis.
- Return Series with number of distinct elements.
-
Returns:
- Series
+ bigframes.series.Series: Series with number of distinct elements.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummin(self) -> DataFrame:
"""Return cumulative minimum over a DataFrame axis.
@@ -1156,9 +1237,9 @@ def cummin(self) -> DataFrame:
Returns a DataFrame of the same size containing the cumulative minimum.
Returns:
- DataFrame: Return cumulative minimum of DataFrame.
+ bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummax(self) -> DataFrame:
"""Return cumulative maximum over a DataFrame axis.
@@ -1166,9 +1247,9 @@ def cummax(self) -> DataFrame:
Returns a DataFrame of the same size containing the cumulative maximum.
Returns:
- DataFrame: Return cumulative maximum of DataFrame.
+ bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumsum(self) -> DataFrame:
"""Return cumulative sum over a DataFrame axis.
@@ -1176,9 +1257,9 @@ def cumsum(self) -> DataFrame:
Returns a DataFrame of the same size containing the cumulative sum.
Returns:
- DataFrame: Return cumulative sum of DataFrame.
+ bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumprod(self) -> DataFrame:
"""Return cumulative product over a DataFrame axis.
@@ -1186,9 +1267,9 @@ def cumprod(self) -> DataFrame:
Returns a DataFrame of the same size containing the cumulative product.
Returns:
- DataFrame: Return cumulative product of DataFrame.
+ bigframes.dataframe.DataFrame: Return cumulative product of DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def agg(self, func):
"""
@@ -1201,9 +1282,9 @@ def agg(self, func):
function names, e.g. ``['sum', 'mean']``.
Returns:
- Series or DataFrame: Aggregated results
+ DataFrame or bigframes.series.Series: Aggregated results.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def describe(self):
"""
@@ -1218,19 +1299,76 @@ def describe(self):
.. note::
Percentile values are approximates only.
+ .. note::
+ For numeric data, the result's index will include ``count``,
+ ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
+ upper percentiles. By default the lower percentile is ``25`` and the
+ upper percentile is ``75``. The ``50`` percentile is the
+ same as the median.
+
Returns:
- Summary statistics of the Series or Dataframe provided.
+ bigframes.dataframe.DataFrame: Summary statistics of the Series or DataFrame provided.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def pivot(self, *, columns, index=None, values=None):
+ """
+ Return reshaped DataFrame organized by given index / column values.
+
+ Reshape data (produce a "pivot" table) based on column values. Uses
+ unique values from specified `index` / `columns` to form axes of the
+ resulting DataFrame. This function does not support data
+ aggregation; multiple values will result in a MultiIndex in the
+ columns.
+ .. note::
+ BigQuery supports up to 10000 columns. Pivot operations on columns
+ with too many unique values will fail if they would exceed this limit.
- Notes
- -----
- For numeric data, the result's index will include ``count``,
- ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
- upper percentiles. By default the lower percentile is ``25`` and the
- upper percentile is ``75``. The ``50`` percentile is the
- same as the median.
+ .. note::
+ The validity of the pivot operation is not checked. If columns and index
+ do not together uniquely identify input rows, the output will be
+ silently non-deterministic.
+
+ Args:
+ columns (str or object or a list of str):
+ Column to use to make new frame's columns.
+
+ index (str or object or a list of str, optional):
+ Column to use to make new frame's index. If not given, uses existing index.
+
+ values (str, object or a list of the previous, optional):
+ Column(s) to use for populating new frame's values. If not
+ specified, all remaining columns will be used and the result will
+ have hierarchically indexed columns.
+
+ Returns:
+ DataFrame: Reshaped DataFrame organized by given index / column values.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def stack(self):
"""
- raise NotImplementedError("abstract method")
+ Stack the prescribed level(s) from columns to index.
+
+ Return a reshaped DataFrame or Series having a multi-level
+ index with one or more new inner-most levels compared to the current
+ DataFrame. The new inner-most levels are created by pivoting the
+ columns of the current dataframe:
+
+ - if the columns have a single level, the output is a Series;
+ - if the columns have multiple levels, the new index
+ level(s) is (are) taken from the prescribed level(s) and
+ the output is a DataFrame.
+
+ .. note::
+ BigQuery DataFrames does not support stack operations that would
+ combine columns of different dtypes.
+
+ Returns:
+ DataFrame or Series: Stacked dataframe or series.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Add index and columns
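The `pivot` notes above are easy to trip over, so a concrete shape example may help (pandas shown; the vendored signature is keyword-only but the semantics match):

import pandas as pd

long = pd.DataFrame(
    {
        "date": ["d1", "d1", "d2", "d2"],
        "kind": ["x", "y", "x", "y"],
        "value": [1, 2, 3, 4],
    }
)

# One column per unique "kind"; rows keyed by "date". If ("date", "kind")
# did not uniquely identify input rows, the note above warns the output would
# be silently non-deterministic in BigQuery DataFrames (pandas raises instead).
wide = long.pivot(index="date", columns="kind", values="value")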
@@ -1247,12 +1385,12 @@ def index(self):
Returns:
The index labels of the DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def columns(self):
"The column labels of the DataFrame."
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def value_counts(
self,
@@ -1280,4 +1418,21 @@ def value_counts(
Returns:
Series: Series containing counts of unique rows in the DataFrame
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def fillna(self, value):
+ """
+ Fill NA/NaN values using the specified method.
+
+ Args:
+ value (scalar, Series):
+ Value to use to fill holes (e.g. 0), alternately a
+ Series of values specifying which value to use for
+ each index (for a Series) or column (for a DataFrame). Values not
+ in the Series will not be filled. This value cannot
+ be a list.
+
+ Returns:
+ DataFrame: Object with missing values filled
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
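And a short illustration of the `fillna` contract just added (pandas shown; per the docstring, list values are not accepted):

import pandas as pd

df = pd.DataFrame({"a": [1.0, None], "b": [None, 2.0]})

df.fillna(0)                       # scalar fill for every hole
df.fillna(pd.Series({"a": -1.0}))  # per-column fill values; "b" stays NA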
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index 4843c971da..56d3b2434f 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -3,6 +3,7 @@
from typing import Literal, Optional
+from bigframes import constants
from third_party.bigframes_vendored.pandas.core import indexing
@@ -22,7 +23,7 @@ def ndim(self) -> int:
Returns:
int: Return 1 if Series. Otherwise return 2 if DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def size(self) -> int:
@@ -32,7 +33,7 @@ def size(self) -> int:
int: Return the number of rows if Series. Otherwise return the number of
rows times number of columns if DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# -------------------------------------------------------------------------
# Unary Methods
@@ -46,7 +47,7 @@ def abs(self):
Series/DataFrame containing the absolute value of each element.
Returns a Series/DataFrame containing the absolute value of each element.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def astype(self, dtype):
"""
@@ -66,7 +67,7 @@ def astype(self, dtype):
same type as caller
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Iteration
@@ -85,7 +86,7 @@ def empty(self) -> bool:
Returns:
bool: If Series/DataFrame is empty, return True, if not return False.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# I/O Methods
@@ -155,7 +156,7 @@ def to_json(
Returns:
None: String output not yet supported.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None:
"""Write object to a comma-separated values (csv) file on Cloud Storage.
@@ -177,7 +178,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None:
Returns:
None: String output not yet supported.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Unsorted
@@ -215,7 +216,7 @@ def add_prefix(self, prefix: str, axis: int | str | None = None):
Returns:
New Series or DataFrame with updated labels.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def add_suffix(self, suffix: str, axis: int | str | None = None):
"""Suffix labels with string `suffix`.
@@ -233,7 +234,7 @@ def add_suffix(self, suffix: str, axis: int | str | None = None):
Returns:
New Series or DataFrame with updated labels.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def head(self, n: int = 5):
"""Return the first `n` rows.
@@ -254,7 +255,7 @@ def head(self, n: int = 5):
Returns:
The first `n` rows of the caller object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def tail(self, n: int = 5):
"""Return the last `n` rows.
@@ -275,7 +276,7 @@ def tail(self, n: int = 5):
Returns:
The last `n` rows of the caller object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sample(
self,
@@ -301,7 +302,7 @@ def sample(
A new object of same type as caller containing `n` items randomly
sampled from the caller object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Internal Interface Methods
@@ -317,7 +318,7 @@ def dtypes(self):
Returns:
A *pandas* Series with the data type of each column.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def copy(self):
"""Make a copy of this object's indices and data.
@@ -329,7 +330,7 @@ def copy(self):
Returns:
Object type matches caller.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# Action Methods
@@ -346,7 +347,7 @@ def isna(self) -> NDFrame:
Mask of bool values for each element that indicates whether an
element is an NA value.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
isnull = isna
@@ -362,7 +363,7 @@ def notna(self) -> NDFrame:
NDFrame: Mask of bool values for each element that indicates whether an
element is not an NA value.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
notnull = notna
@@ -381,7 +382,7 @@ def shift(
Returns:
NDFrame: Copy of input object, shifted.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rank(
self,
@@ -419,7 +420,7 @@ def rank(
Returns:
same type as caller: Return a Series or DataFrame with data ranks as values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def __nonzero__(self):
raise ValueError(
diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
index 95822718c3..9271da8a5e 100644
--- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
+++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py
@@ -9,6 +9,8 @@ class providing the base-class of operations.
"""
from __future__ import annotations
+from bigframes import constants
+
class GroupBy:
"""
@@ -24,7 +26,7 @@ def any(self):
where a value is True if any element is True within its
respective group, False otherwise.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def all(self):
"""
@@ -35,7 +37,7 @@ def all(self):
where a value is True if all elements are True within its
respective group, False otherwise.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def count(self):
"""
@@ -44,7 +46,7 @@ def count(self):
Returns:
Series or DataFrame: Count of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mean(
self,
@@ -60,7 +62,7 @@ def mean(
Returns:
pandas.Series or pandas.DataFrame: Mean of groups.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def median(
self,
@@ -81,7 +83,7 @@ def median(
Returns:
pandas.Series or pandas.DataFrame: Median of groups.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def std(
self,
@@ -100,7 +102,7 @@ def std(
Returns:
Series or DataFrame: Standard deviation of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def var(
self,
@@ -120,7 +122,7 @@ def var(
Series or DataFrame
Variance of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sum(
self,
@@ -140,7 +142,7 @@ def sum(
Returns:
Series or DataFrame: Computed sum of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def prod(self, numeric_only: bool = False, min_count: int = 0):
"""
@@ -156,7 +158,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0):
Returns:
Series or DataFrame: Computed prod of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def min(
self,
@@ -176,7 +178,7 @@ def min(
Returns:
Series or DataFrame: Computed min of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def max(
self,
@@ -196,7 +198,7 @@ def max(
Returns:
Series or DataFrame: Computed max of values within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumcount(self, ascending: bool = True):
"""
@@ -209,7 +211,7 @@ def cumcount(self, ascending: bool = True):
Returns:
Series: Sequence number of each element within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumprod(self, *args, **kwargs):
"""
@@ -218,7 +220,7 @@ def cumprod(self, *args, **kwargs):
Returns:
Series or DataFrame: Cumulative product for each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumsum(self, *args, **kwargs):
"""
@@ -227,7 +229,7 @@ def cumsum(self, *args, **kwargs):
Returns:
Series or DataFrame: Cumulative sum for each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummin(self, *args, numeric_only: bool = False, **kwargs):
"""
@@ -236,7 +238,7 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs):
Returns:
Series or DataFrame: Cumulative min for each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummax(self, *args, numeric_only: bool = False, **kwargs):
"""
@@ -245,7 +247,7 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs):
Returns:
Series or DataFrame: Cumulative max for each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def diff(self):
"""
@@ -256,7 +258,7 @@ def diff(self):
Returns:
Series or DataFrame: First differences.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def shift(self, periods: int = 1):
"""
@@ -269,7 +271,7 @@ def shift(self, periods: int = 1):
Returns:
Series or DataFrame: Object shifted within each group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rolling(self, *args, **kwargs):
"""
@@ -289,7 +291,7 @@ def rolling(self, *args, **kwargs):
Returns:
Series or DataFrame: Return a new grouper with our rolling appended.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def expanding(self, *args, **kwargs):
"""
@@ -298,7 +300,7 @@ def expanding(self, *args, **kwargs):
Returns:
Series or DataFrame: An expanding grouper, providing expanding functionality per group.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class SeriesGroupBy(GroupBy):
@@ -318,7 +320,7 @@ def agg(self, func):
Returns:
Series or DataFrame
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class DataFrameGroupBy(GroupBy):
@@ -347,4 +349,4 @@ def agg(self, func, **kwargs):
Returns:
DataFrame
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
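
A minimal usage sketch for the GroupBy surface documented above — assuming bigframes is installed, a default BigQuery session can be created, and that the `bigframes.pandas.DataFrame` constructor accepts small in-memory data (none of this is shown in the diff itself):

# Sketch only: assumes a configured BigQuery session; not an example from the PR.
import bigframes.pandas as bpd

df = bpd.DataFrame({"team": ["a", "a", "b"], "score": [1, 2, 3]})
grouped = df.groupby("team")
print(grouped.sum())              # per-group aggregate, as documented above
print(grouped["score"].cummax())  # windowed variant from the same base class
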
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
index d59886e8aa..2b4a326317 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py
@@ -1,3 +1,6 @@
+from bigframes import constants
+
+
class DatetimeProperties:
"""
Accessor object for datetime-like properties of the Series values.
@@ -7,7 +10,7 @@ class DatetimeProperties:
def day(self):
"""The day of the datetime."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def dayofweek(self):
@@ -22,7 +25,7 @@ def dayofweek(self):
Series or Index: Containing integers indicating the day number.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def date(self):
@@ -36,31 +39,31 @@ def date(self):
a numpy array.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def hour(self):
"""The hours of the datetime."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def minute(self):
"""The minutes of the datetime."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def month(self):
"""The month as January=1, December=12."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def second(self):
"""The seconds of the datetime."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def time(self):
@@ -73,7 +76,7 @@ def time(self):
a numpy array.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def quarter(self):
@@ -84,10 +87,10 @@ def quarter(self):
a numpy array.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def year(self):
"""The year of the datetime."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
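
These DatetimeProperties stubs mirror pandas' `.dt` accessor, so plain pandas serves as the behavioural reference for the properties listed above:

# pandas shown as the reference surface, not the bigframes implementation.
import pandas as pd

s = pd.Series(pd.to_datetime(["2023-01-15 08:30:00", "2023-06-30 23:05:10"]))
print(s.dt.year)     # 2023, 2023
print(s.dt.month)    # 1, 6
print(s.dt.day)      # 15, 30
print(s.dt.quarter)  # 1, 2
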
diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py
index ebad5eb918..404a99809c 100644
--- a/third_party/bigframes_vendored/pandas/core/indexes/base.py
+++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py
@@ -1,5 +1,7 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py
+from bigframes import constants
+
class Index:
"""Immutable sequence used for indexing and alignment.
@@ -10,14 +12,14 @@ class Index:
@property
def name(self):
"""Returns Index name."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def shape(self):
"""
Return a tuple of the shape of the underlying data.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_numpy(self, dtype):
"""
@@ -33,4 +35,4 @@ def to_numpy(self, dtype):
Returns:
numpy.ndarray
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
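
Pandas is again the reference for the vendored Index stubs; the three documented members behave like this:

# pandas reference behaviour for the Index members documented above.
import pandas as pd

idx = pd.Index([10, 20, 30], name="id")
print(idx.name)                       # 'id'
print(idx.shape)                      # (3,)
print(idx.to_numpy(dtype="float64"))  # array([10., 20., 30.])
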
diff --git a/third_party/bigframes_vendored/pandas/core/indexing.py b/third_party/bigframes_vendored/pandas/core/indexing.py
index d5b9f3c079..fae5d6261f 100644
--- a/third_party/bigframes_vendored/pandas/core/indexing.py
+++ b/third_party/bigframes_vendored/pandas/core/indexing.py
@@ -1,5 +1,7 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexing.py
+from bigframes import constants
+
class IndexingMixin:
"""
@@ -32,7 +34,7 @@ def iloc(self):
out-of-bounds, except *slice* indexers which allow out-of-bounds
indexing (this conforms with python/numpy *slice* semantics).
"""
- raise NotImplementedError("abstract methdod")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def loc(self):
@@ -63,4 +65,4 @@ def loc(self):
NotImplementedError: If the inputs are not supported.
"""
- raise NotImplementedError("abstract methdod")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
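
The iloc/loc split documented above follows pandas semantics; a short pandas illustration, including the out-of-bounds slice allowance the docstring calls out:

# pandas reference behaviour for the iloc/loc semantics documented above.
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])
print(df.iloc[0])        # purely positional: the first row
print(df.iloc[:10])      # slice indexers may run past the end, per the note above
print(df.loc["b":"c"])   # label-based; both endpoints are included
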
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py
index 6a5a9fdde9..6e6d2d8b5c 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py
@@ -4,6 +4,8 @@
"""
from __future__ import annotations
+from bigframes import constants
+
def concat(
objs,
@@ -135,4 +137,4 @@ def concat(
[4 rows x 2 columns]
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
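
A hedged sketch of the concat entry point, assuming the same `bigframes.pandas` import path and in-memory constructors as above:

# Sketch only: assumes a configured BigQuery session; not an example from the PR.
import bigframes.pandas as bpd

df1 = bpd.DataFrame({"x": [1, 2]})
df2 = bpd.DataFrame({"x": [3, 4]})
print(bpd.concat([df1, df2]))  # rows of df2 appended below df1
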
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
new file mode 100644
index 0000000000..ee02d698da
--- /dev/null
+++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py
@@ -0,0 +1,78 @@
+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/merge.py
+"""
+SQL-style merge routines
+"""
+from __future__ import annotations
+
+from bigframes import constants
+
+
+def merge(
+ left,
+ right,
+ how="inner",
+ on=None,
+ *,
+ left_on=None,
+ right_on=None,
+ sort=False,
+ suffixes=("_x", "_y"),
+):
+
+ """
+ Merge DataFrame objects with a database-style join.
+
+ The join is done on columns or indexes. If joining columns on
+ columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+ on indexes or indexes on a column or columns, the index will be passed on.
+ When performing a cross merge, no column specifications to merge on are
+ allowed.
+
+ .. note::
+ A named Series object is treated as a DataFrame with a single named column.
+
+ .. warning::
+ If both key columns contain rows where the key is a null value, those
+ rows will be matched against each other. This is different from usual SQL
+ join behaviour and can lead to unexpected results.
+
+ Args:
+ left:
+ The primary object to be merged.
+ right:
+ Object to merge with.
+ how:
+ ``{'left', 'right', 'outer', 'inner'}, default 'inner'``
+ Type of merge to be performed.
+ ``left``: use only keys from left frame, similar to a SQL left outer join;
+ preserve key order.
+ ``right``: use only keys from right frame, similar to a SQL right outer join;
+ preserve key order.
+ ``outer``: use union of keys from both frames, similar to a SQL full outer
+ join; sort keys lexicographically.
+ ``inner``: use intersection of keys from both frames, similar to a SQL inner
+ join; preserve the order of the left keys.
+
+ on:
+        Column name(s) to join on. Must be found in both DataFrames. Either ``on`` or
+        ``left_on`` plus ``right_on`` must be passed in.
+    left_on:
+        Column name(s) to join on in the left DataFrame. Either ``on`` or ``left_on``
+        plus ``right_on`` must be passed in.
+    right_on:
+        Column name(s) to join on in the right DataFrame. Either ``on`` or ``left_on``
+        plus ``right_on`` must be passed in.
+ sort:
+ Default False. Sort the join keys lexicographically in the
+ result DataFrame. If False, the order of the join keys depends
+ on the join type (how keyword).
+ suffixes:
+ Default ``("_x", "_y")``. A length-2 sequence where each
+ element is optionally a string indicating the suffix to add to
+ overlapping column names in `left` and `right` respectively.
+ Pass a value of `None` instead of a string to indicate that the
+ column name from `left` or `right` should be left as-is, with
+ no suffix. At least one of the values must not be None.
+
+ Returns:
+ bigframes.dataframe.DataFrame: A DataFrame of the two merged objects.
+ """
+ raise NotImplementedError("abstract method")
diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
index 9381ad4552..4f5f2efef0 100644
--- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py
+++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py
@@ -4,6 +4,8 @@
"""
from __future__ import annotations
+from bigframes import constants
+
def cut(
x,
@@ -62,4 +64,4 @@ def cut(
are whatever the type in the sequence is.
False : returns an ndarray of integers.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
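
The cut stub mirrors pandas.cut, so pandas illustrates the `labels` behaviour documented above:

# pandas reference behaviour for the cut semantics documented above.
import pandas as pd

s = pd.Series([1, 5, 9])
print(pd.cut(s, bins=3))                # three equal-width intervals
print(pd.cut(s, bins=3, labels=False))  # integer bin codes instead of intervals
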
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 039dc1eae0..76fb46a700 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -10,6 +10,7 @@
from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer
import pandas.io.formats.format as fmt
+from bigframes import constants
from third_party.bigframes_vendored.pandas.core.generic import NDFrame
if TYPE_CHECKING:
@@ -23,31 +24,31 @@ def dt(self):
"""
Accessor object for datetime-like properties of the Series values.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def index(self):
"""The index (axis labels) of the Series."""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def shape(self):
"""Return a tuple of the shape of the underlying data."""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def dtype(self):
"""
Return the dtype object of the underlying data.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def dtypes(self):
"""
Return the dtype object of the underlying data.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def name(self) -> Hashable:
@@ -62,7 +63,7 @@ def name(self) -> Hashable:
hashable object: The name of the Series, also the column name
if part of a DataFrame.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def reset_index(
self,
@@ -94,13 +95,13 @@ def reset_index(
In either case, if ``inplace=True``, no value is returned.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def __repr__(self) -> str:
"""
Return a string representation for a particular Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# IO methods (to / from other formats)
@@ -165,7 +166,7 @@ def to_string(
result = formatter.to_string()
# catch contract violations
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_markdown(
self,
@@ -188,7 +189,7 @@ def to_markdown(
Returns:
str: {klass} in Markdown-friendly format.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_dict(self, into: type[dict] = dict) -> Mapping:
"""
@@ -204,16 +205,22 @@ def to_dict(self, into: type[dict] = dict) -> Mapping:
Returns:
collections.abc.Mapping: Key-value representation of Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
- def to_frame(self) -> DataFrame:
+ def to_frame(self, name=None) -> DataFrame:
"""
Convert Series to DataFrame.
+    The column in the new DataFrame will be named after the ``name`` keyword
+    parameter, if it is provided and not None.
+
+ Args:
+        name (Hashable, default None):
+            The passed name should substitute for the Series name (if it has one).
+
Returns:
- DataFrame: DataFrame representation of Series.
+ bigframes.dataframe.DataFrame: DataFrame representation of Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_excel(self, excel_writer, sheet_name):
"""
@@ -235,7 +242,7 @@ def to_excel(self, excel_writer, sheet_name):
sheet_name (str, default 'Sheet1'):
Name of sheet to contain Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs):
"""
@@ -256,7 +263,7 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs):
str or None: If buf is None, returns the result as a string.
Otherwise returns None.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def tolist(self) -> list:
"""
@@ -269,7 +276,7 @@ def tolist(self) -> list:
Returns:
list: list of the values
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
to_list = tolist
@@ -296,7 +303,7 @@ def to_numpy(self, dtype, copy=False, na_value=None):
numpy.ndarray: A NumPy ndarray representing the values in this
Series or Index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_pickle(self, path, **kwargs):
"""
@@ -308,7 +315,7 @@ def to_pickle(self, path, **kwargs):
object implementing a binary ``write()`` function. File path where
the pickled object will be stored.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_xarray(self):
"""
@@ -319,7 +326,7 @@ def to_xarray(self):
converted to Dataset if the object is a DataFrame, or a DataArray if
the object is a Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_json(
self,
@@ -354,7 +361,7 @@ def to_json(
None or str: If path_or_buf is None, returns the resulting json format as a
string. Otherwise returns None.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None:
"""
@@ -372,7 +379,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None:
None or str: If path_or_buf is None, returns the resulting csv format
as a string. Otherwise returns None.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def agg(self, func):
"""
@@ -387,7 +394,7 @@ def agg(self, func):
Returns:
scalar or Series: Aggregated results
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def count(self):
"""
@@ -397,7 +404,7 @@ def count(self):
int or Series (if level specified): Number of non-null values in the
Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def nunique(self) -> int:
"""
@@ -408,7 +415,7 @@ def nunique(self) -> int:
Returns:
int: number of unique elements in the object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mode(self) -> Series:
"""
@@ -419,9 +426,9 @@ def mode(self) -> Series:
Always returns Series even if only one value is returned.
Returns:
- Series: Modes of the Series in sorted order.
+ bigframes.series.Series: Modes of the Series in sorted order.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def drop_duplicates(
self,
@@ -440,9 +447,9 @@ def drop_duplicates(
``False`` : Drop all duplicates.
Returns:
- Series: Series with duplicates dropped or None if ``inplace=True``.
+ bigframes.series.Series: Series with duplicates dropped or None if ``inplace=True``.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def duplicated(self, keep="first") -> Series:
"""
@@ -463,10 +470,10 @@ def duplicated(self, keep="first") -> Series:
``False`` : Mark all duplicates as ``True``.
Returns:
- Series: Series indicating whether each value has occurred in the
- preceding values.
+ bigframes.series.Series: Series indicating whether each value has occurred in the
+ preceding values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def round(self, decimals: int = 0) -> Series:
"""
@@ -478,7 +485,30 @@ def round(self, decimals: int = 0) -> Series:
it specifies the number of positions to the left of the decimal point.
Returns:
- Series: Rounded values of the Series.
+ bigframes.series.Series: Rounded values of the Series.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def corr(self, other, method="pearson", min_periods=None) -> float:
+ """
+ Compute the correlation with the other Series. Non-number values are ignored in the
+ computation.
+
+ Uses the "Pearson" method of correlation. Numbers are converted to float before
+ calculation, so the result may be unstable.
+
+ Args:
+ other (Series):
+ The series with which this is to be correlated.
+ method (string, default "pearson"):
+ Correlation method to use - currently only "pearson" is supported.
+ min_periods (int, default None):
+            The minimum number of observations needed to return a result. Non-default
+            values are not yet supported; a result is returned whenever at least two
+            observations are available.
+
+ Returns:
+        float: Will return NaN if there are fewer than two numeric pairs, either series has a
+ variance or covariance of zero, or any input value is infinite.
"""
raise NotImplementedError("abstract method")
@@ -497,7 +527,7 @@ def diff(self) -> Series:
Returns:
{klass}: First differences of the Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def dot(self, other) -> Series | np.ndarray:
"""
@@ -527,19 +557,19 @@ def dot(self, other) -> Series | np.ndarray:
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def __matmul__(self, other):
"""
Matrix multiplication using binary `@` operator in Python>=3.5.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def __rmatmul__(self, other):
"""
Matrix multiplication using binary `@` operator in Python>=3.5.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sort_values(
self,
@@ -569,9 +599,9 @@ def sort_values(
the end.
Returns:
- Series or None: Series ordered by values or None if ``inplace=True``.
+ bigframes.series.Series: Series ordered by values or None if ``inplace=True``.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sort_index(
self,
@@ -597,12 +627,12 @@ def sort_index(
Not implemented for MultiIndex.
Returns:
- Series or None: The original Series sorted by the labels or None if
+ bigframes.series.Series: The original Series sorted by the labels or None if
``inplace=True``.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def nlargest(
self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
@@ -624,9 +654,9 @@ def nlargest(
size larger than `n`.
Returns:
- Series: The `n` largest values in the Series, sorted in decreasing order.
+ bigframes.series.Series: The `n` largest values in the Series, sorted in decreasing order.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
"""
@@ -647,9 +677,9 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series:
size larger than `n`.
Returns:
- Series: The `n` smallest values in the Series, sorted in increasing order.
+ bigframes.series.Series: The `n` smallest values in the Series, sorted in increasing order.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
# ----------------------------------------------------------------------
# function application
@@ -669,10 +699,10 @@ def apply(
Python function or NumPy ufunc to apply.
Returns:
- Series or DataFrame: If func returns a Series object the result
+            bigframes.series.Series or bigframes.dataframe.DataFrame: If func returns a Series object the result
will be a DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def groupby(
self,
@@ -723,9 +753,10 @@ def groupby(
If False, NA values will also be treated as the key in groups.
Returns:
- SeriesGroupBy: Returns a groupby object that contains information about the groups.
+ bigframes.core.groupby.SeriesGroupBy: Returns a groupby object that contains
+ information about the groups.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def drop(
self, labels=None, *, axis=0, index=None, columns=None, level=None
@@ -750,17 +781,14 @@ def drop(
level:
For MultiIndex, level for which the labels will be removed.
- Returns
- -------
- Series or None
- Series with specified index labels removed or None if ``inplace=True``.
+ Returns:
+ bigframes.series.Series: Series with specified index labels removed
+ or None if ``inplace=True``.
- Raises
- ------
- KeyError
- If none of the labels are found in the index.
+ Raises:
+ KeyError: If none of the labels are found in the index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def reorder_levels(self, order: Sequence) -> Series:
"""
@@ -775,7 +803,7 @@ def reorder_levels(self, order: Sequence) -> Series:
Returns:
type of caller (new object)
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def droplevel(self, level):
"""
@@ -790,7 +818,7 @@ def droplevel(self, level):
Returns:
Series with requested index / column level(s) removed.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def fillna(
self,
@@ -806,7 +834,24 @@ def fillna(
Returns:
Series or None: Object with missing values filled or None.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series:
+ """
+ Return a new Series with missing values removed.
+
+ Args:
+ axis (0 or 'index'):
+ Unused. Parameter needed for compatibility with DataFrame.
+ inplace (bool, default False):
+ Unsupported, do not set.
+ how (str, optional):
+ Not in use. Kept for compatibility.
+
+ Returns:
+ Series: Series with NA entries dropped from it.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def between(
self,
@@ -834,7 +879,7 @@ def between(
right (inclusive).
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumprod(self):
"""
@@ -844,9 +889,9 @@ def cumprod(self):
product.
Returns:
- Return cumulative sum of scalar or Series.
+            bigframes.series.Series: Return cumulative product of scalar or Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cumsum(self):
"""
@@ -863,7 +908,7 @@ def cumsum(self):
Returns:
scalar or Series: Return cumulative sum of scalar or Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummax(self):
"""
@@ -878,9 +923,9 @@ def cummax(self):
For `Series` this parameter is unused and defaults to 0.
Returns:
- scalar or Series: Return cumulative maximum of scalar or Series.
+ bigframes.series.Series: Return cumulative maximum of scalar or Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cummin(self):
"""
@@ -901,9 +946,9 @@ def cummin(self):
compatibility with NumPy.
Returns:
- scalar or Series: Return cumulative minimum of scalar or Series.
+ bigframes.series.Series: Return cumulative minimum of scalar or Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def eq(self, other) -> Series:
"""Return equal of Series and other, element-wise (binary operator eq).
@@ -918,7 +963,7 @@ def eq(self, other) -> Series:
Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def ne(self, other) -> Series:
"""Return not equal of Series and other, element-wise (binary operator ne).
@@ -930,10 +975,10 @@ def ne(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def le(self, other) -> Series:
"""Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`).
@@ -945,10 +990,10 @@ def le(self, other) -> Series:
other: Series, or scalar value
Returns:
- Series. The result of the comparison.
+            bigframes.series.Series: The result of the comparison.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def lt(self, other) -> Series:
"""Get 'less than' of Series and other, element-wise (binary operator `<`).
@@ -960,10 +1005,10 @@ def lt(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def ge(self, other) -> Series:
"""Get 'greater than or equal to' of Series and other, element-wise (binary operator `>=`).
@@ -975,10 +1020,10 @@ def ge(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def gt(self, other) -> Series:
"""Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`).
@@ -990,10 +1035,10 @@ def gt(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def add(self, other) -> Series:
"""Return addition of Series and other, element-wise (binary operator add).
@@ -1005,10 +1050,10 @@ def add(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def radd(self, other) -> Series:
"""Return addition of Series and other, element-wise (binary operator radd).
@@ -1020,10 +1065,10 @@ def radd(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sub(
self,
@@ -1038,10 +1083,10 @@ def sub(
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rsub(self, other) -> Series:
"""Return subtraction of Series and other, element-wise (binary operator rsub).
@@ -1053,10 +1098,10 @@ def rsub(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mul(self, other) -> Series:
"""Return multiplication of Series and other, element-wise (binary operator mul).
@@ -1068,10 +1113,10 @@ def mul(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rmul(self, other) -> Series:
"""Return multiplication of Series and other, element-wise (binary operator mul).
@@ -1085,7 +1130,7 @@ def rmul(self, other) -> Series:
Returns:
Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def truediv(self, other) -> Series:
"""Return floating division of Series and other, element-wise (binary operator truediv).
@@ -1097,10 +1142,10 @@ def truediv(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rtruediv(self, other) -> Series:
"""Return floating division of Series and other, element-wise (binary operator rtruediv).
@@ -1112,10 +1157,10 @@ def rtruediv(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def floordiv(self, other) -> Series:
"""Return integer division of Series and other, element-wise (binary operator floordiv).
@@ -1127,10 +1172,10 @@ def floordiv(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rfloordiv(self, other) -> Series:
"""Return integer division of Series and other, element-wise (binary operator rfloordiv).
@@ -1142,10 +1187,10 @@ def rfloordiv(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mod(self, other) -> Series:
"""Return modulo of Series and other, element-wise (binary operator mod).
@@ -1157,25 +1202,55 @@ def mod(self, other) -> Series:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rmod(self, other) -> Series:
- """Get modulo of Series and other, element-wise (binary operator `rmod`).
+ """Return modulo of Series and other, element-wise (binary operator mod).
- Equivalent to ``other % series``, but with support to substitute a fill_value for
+    Equivalent to ``other % series``, but with support to substitute a fill_value for
missing data in either one of the inputs.
Args:
other (Series, or scalar value):
Returns:
- Series: The result of the operation.
+ bigframes.series.Series: The result of the operation.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def pow(self, other) -> Series:
+ """Return Exponential power of series and other, element-wise (binary operator `pow`).
+
+ Equivalent to ``series ** other``, but with support to substitute a fill_value for
+ missing data in either one of the inputs.
+
+ Args:
+ other (Series, or scalar value):
+
+ Returns:
+ bigframes.series.Series: The result of the operation.
+
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def rpow(self, other) -> Series:
+ """Return Exponential power of series and other, element-wise (binary operator `rpow`).
+
+ Equivalent to ``other ** series``, but with support to substitute a fill_value for
+ missing data in either one of the inputs.
+
+ Args:
+ other (Series, or scalar value):
+
+ Returns:
+ bigframes.series.Series: The result of the operation.
+
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def divmod(self, other) -> Series:
"""Return integer division and modulo of Series and other, element-wise (binary operator divmod).
@@ -1190,7 +1265,7 @@ def divmod(self, other) -> Series:
consistent with (floordiv, mod) (though pandas may not).
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rdivmod(self, other) -> Series:
"""Return integer division and modulo of Series and other, element-wise (binary operator rdivmod).
@@ -1205,7 +1280,7 @@ def rdivmod(self, other) -> Series:
consistent with (rfloordiv, rmod) (though pandas may not).
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def all(
self,
@@ -1220,7 +1295,7 @@ def all(
scalar or Series: If level is specified, then, Series is returned;
otherwise, scalar is returned.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def any(
self,
@@ -1235,7 +1310,7 @@ def any(
scalar or Series: If level is specified, then, Series is returned;
otherwise, scalar is returned.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def max(
self,
@@ -1250,7 +1325,7 @@ def max(
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def min(
self,
@@ -1264,7 +1339,7 @@ def min(
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def std(
self,
@@ -1279,7 +1354,7 @@ def std(
-------
scalar or Series (if level specified)
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def var(
self,
@@ -1292,7 +1367,7 @@ def var(
Returns:
scalar or Series (if level specified)
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sum(self):
"""Return the sum of the values over the requested axis.
@@ -1302,7 +1377,7 @@ def sum(self):
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mean(self):
"""Return the mean of the values over the requested axis.
@@ -1310,7 +1385,7 @@ def mean(self):
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def median(self, *, exact: bool = False):
"""Return the median of the values over the requested axis.
@@ -1323,7 +1398,7 @@ def median(self, *, exact: bool = False):
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def prod(self):
"""Return the product of the values over the requested axis.
@@ -1331,7 +1406,7 @@ def prod(self):
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def skew(self):
"""Return unbiased skew over requested axis.
@@ -1341,7 +1416,7 @@ def skew(self):
Returns:
scalar
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def kurt(self):
"""Return unbiased kurtosis over requested axis.
@@ -1351,7 +1426,7 @@ def kurt(self):
Returns:
scalar: Unbiased kurtosis over requested axis.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def where(self, cond, other):
"""Replace values where the condition is False.
@@ -1373,9 +1448,9 @@ def where(self, cond, other):
extension dtypes).
Returns:
- Series
+ bigframes.series.Series: Series after the replacement.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mask(self, cond, other):
"""Replace values where the condition is True.
@@ -1397,9 +1472,9 @@ def mask(self, cond, other):
extension dtypes).
Returns:
- Series
+ bigframes.series.Series: Series after the replacement.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def clip(self):
"""Trim values at input threshold(s).
@@ -1418,7 +1493,7 @@ def clip(self):
Returns:
Series.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def argmax(self):
"""
@@ -1429,7 +1504,7 @@ def argmax(self):
Returns:
Series: Row position of the maximum value.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def argmin(self):
"""
@@ -1440,7 +1515,7 @@ def argmin(self):
Returns:
Series: Row position of the minimum value.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rename(self, index, **kwargs) -> Series | None:
"""
@@ -1460,10 +1535,10 @@ def rename(self, index, **kwargs) -> Series | None:
attribute.
Returns:
- Series: Series with index labels
+ bigframes.series.Series: Series with index labels.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rename_axis(self, mapper, **kwargs):
"""
@@ -1474,9 +1549,9 @@ def rename_axis(self, mapper, **kwargs):
Value to set the axis name attribute.
Returns:
- Series: Series with the name of the axis set.
+ bigframes.series.Series: Series with the name of the axis set.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rolling(
self,
@@ -1514,9 +1589,10 @@ def rolling(
to the size of the window.
Returns:
- ``Window`` subclass if a ``win_type`` is passed.``Rolling`` subclass if ``win_type`` is not passed
+ bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed.
+ ``Rolling`` subclass if ``win_type`` is not passed.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def expanding(self, min_periods=1):
"""
@@ -1528,9 +1604,9 @@ def expanding(self, min_periods=1):
otherwise, result is ``np.nan``.
Returns:
- ``Expanding`` subclass
+ bigframes.core.window.Window: ``Expanding`` subclass.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def value_counts(
self,
@@ -1561,7 +1637,7 @@ def value_counts(
Returns:
Series: Series containing counts of unique values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def str(self):
@@ -1571,7 +1647,7 @@ def str(self):
NAs stay NA unless handled otherwise by a particular method. Patterned
after Python’s string methods, with some inspiration from R’s stringr package.
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def isin(self, values):
"""
@@ -1591,12 +1667,12 @@ def isin(self, values):
TypeError. Instead, turn a single string into a list of one element.
Returns:
- bigframes.series.Series: Series of booleans indicating if each element is in values.
+ bigframes.series.Series: Series of booleans indicating if each element is in values.
Raises:
TypeError: If input is not list-like.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def is_monotonic_increasing(self) -> bool:
@@ -1606,7 +1682,7 @@ def is_monotonic_increasing(self) -> bool:
Returns:
bool
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def is_monotonic_decreasing(self) -> bool:
@@ -1616,4 +1692,47 @@ def is_monotonic_decreasing(self) -> bool:
Returns:
bool
"""
- raise NotImplementedError("abstract property")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def map(
+ self,
+ arg,
+ na_action=None,
+ *,
+ verify_integrity=False,
+ ) -> Series:
+ """
+ Map values of Series according to an input mapping or function.
+
+    Used for substituting each value in a Series with another value
+    that may be derived from a remote function, ``dict``, or a :class:`Series`.
+
+ If arg is a remote function, the overhead for remote functions
+ applies. If mapping with a dict, fully deferred computation is possible.
+ If mapping with a Series, fully deferred computation is only possible if
+ verify_integrity=False.
+
+ .. note::
+ Bigframes does not yet support ``dict`` subclasses that define
+ ``__missing__`` (i.e. provide a method for default values). These
+ are treated the same as ``dict``.
+
+ Args:
+ arg (function, Mapping, Series):
+ remote function, collections.abc.Mapping subclass or Series
+ Mapping correspondence.
+        na_action (str, default None):
+            Only None is currently supported, indicating that arg may
+            map ``NA`` values to scalars; ``NA`` values won't be ignored.
+            Passing 'ignore' will raise NotImplementedError.
+        verify_integrity (bool, default False):
+ Only applies when arg is a Series. If True, throw if the Series
+ index contains duplicate entries (this matches pandas behavior).
+ If False, skip the expensive computation, and any duplicate
+ index entries will produce duplicate rows in the result for each
+ index entry.
+
+ Returns:
+ Series: Same index as caller.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
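
Two of the Series additions above, map and corr, are easiest to see against the pandas reference behaviour their docstrings describe:

# pandas reference behaviour for the map and corr docstrings added above.
import pandas as pd

s = pd.Series(["cat", "dog", "cat"])
print(s.map({"cat": "kitten", "dog": "puppy"}))  # dict-based substitution

a = pd.Series([1.0, 2.0, 3.0])
b = pd.Series([2.0, 4.0, 6.1])
print(a.corr(b))  # Pearson correlation, just under 1.0
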
diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py
index a27093b552..ecdd9547d5 100644
--- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py
+++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py
@@ -1,6 +1,8 @@
import re
import typing
+from bigframes import constants
+
class StringMethods:
"""
@@ -32,7 +34,7 @@ def extract(self, pat: str, flags: int = 0):
expression pat will be used for column names; otherwise
capture group numbers will be used.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def find(self, sub, start: int = 0, end=None):
"""Return lowest indexes in each strings in the Series/Index.
@@ -52,7 +54,7 @@ def find(self, sub, start: int = 0, end=None):
Returns:
bigframes.series.Series: Series with lowest indexes in each strings.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def len(self):
"""Compute the length of each element in the Series/Index.
@@ -65,7 +67,7 @@ def len(self):
the length of each element in the Series or Index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def lower(self):
"""Convert strings in the Series/Index to lowercase.
@@ -76,7 +78,7 @@ def lower(self):
bigframes.series.Series: Series with lowercase.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def slice(self, start=None, stop=None):
"""Slice substrings from each element in the Series or Index.
@@ -94,7 +96,7 @@ def slice(self, start=None, stop=None):
substring from original string object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def strip(self):
"""Remove leading and trailing characters.
@@ -109,7 +111,7 @@ def strip(self):
and trailing characters.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def upper(self):
"""Convert strings in the Series/Index to uppercase.
@@ -120,7 +122,7 @@ def upper(self):
bigframes.series.Series: Series with uppercase strings.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def isnumeric(self):
"""Check whether all characters in each string are numeric.
@@ -134,7 +136,103 @@ def isnumeric(self):
same length as the original Series/Index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isalpha(self):
+ """Check whether all characters in each string are alphabetic.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isalpha` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+        bigframes.series.Series: Series with the same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isdigit(self):
+ """Check whether all characters in each string are digits.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isdigit` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+        bigframes.series.Series: Series with the same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isalnum(self):
+ """Check whether all characters in each string are alphanumeric.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isalnum` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+ bigframes.series.Series: Series or Index of boolean values with the
+ same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isspace(self):
+ """Check whether all characters in each string are whitespace.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isspace` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+ bigframes.series.Series: Series or Index of boolean values with the
+ same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def islower(self):
+ """Check whether all characters in each string are lowercase.
+
+ This is equivalent to running the Python string method
+ :meth:`str.islower` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+ bigframes.series.Series: Series or Index of boolean values with the
+ same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isupper(self):
+ """Check whether all characters in each string are uppercase.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isupper` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+ bigframes.series.Series: Series or Index of boolean values with the
+ same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def isdecimal(self):
+ """Check whether all characters in each string are decimal.
+
+ This is equivalent to running the Python string method
+ :meth:`str.isdecimal` for each element of the Series/Index. If a string
+ has zero characters, ``False`` is returned for that check.
+
+ Returns:
+ bigframes.series.Series: Series or Index of boolean values with the
+ same length as the original Series/Index.
+ """
+
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rstrip(self):
"""Remove trailing characters.
@@ -148,7 +246,7 @@ def rstrip(self):
bigframes.series.Series: Series without trailing characters.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def lstrip(self):
"""Remove leading characters.
@@ -162,7 +260,7 @@ def lstrip(self):
bigframes.series.Series: Series without leading characters.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def repeat(self, repeats: int):
"""Duplicate each string in the Series or Index.
@@ -176,7 +274,7 @@ def repeat(self, repeats: int):
objects specified by input parameter repeats.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def capitalize(self):
"""Convert strings in the Series/Index to be capitalized.
@@ -187,7 +285,7 @@ def capitalize(self):
bigframes.series.Series: Series with capitalized strings.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def cat(self, others, *, join):
"""Concatenate strings in the Series/Index with given separator.
@@ -208,7 +306,7 @@ def cat(self, others, *, join):
bigframes.series.Series: Series with concatenated strings.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True):
"""
@@ -233,7 +331,7 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True
whether the given pattern is contained within the string of each
element of the Series or Index.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def replace(
self,
@@ -277,7 +375,7 @@ def replace(
of `pat` replaced by `repl`.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def startswith(
self,
@@ -295,7 +393,7 @@ def startswith(
bigframes.series.Series: A Series of booleans indicating whether the given
pattern matches the start of each string element.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def endswith(
self,
@@ -313,7 +411,7 @@ def endswith(
bigframes.series.Series: A Series of booleans indicating whether the given
pattern matches the end of each string element.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def match(self, pat: str, case: bool = True, flags: int = 0):
"""
@@ -330,7 +428,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0):
Returns:
bigframes.series.Series: Series of boolean values
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def fullmatch(self, pat: str, case: bool = True, flags: int = 0):
"""
@@ -347,7 +445,7 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0):
Returns:
bigframes.series.Series: Series of boolean values
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def get(self, i: int):
"""
@@ -363,7 +461,7 @@ def get(self, i: int):
Returns:
bigframes.series.Series: Series
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def pad(
self,
@@ -386,7 +484,7 @@ def pad(
Returns:
bigframes.series.Series: Returns Series or Index with minimum number of char in object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def ljust(
self,
@@ -406,7 +504,7 @@ def ljust(
Returns:
bigframes.series.Series: Returns Series or Index with minimum number of char in object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def rjust(
self,
@@ -426,4 +524,48 @@ def rjust(
Returns:
bigframes.series.Series: Returns Series or Index with minimum number of char in object.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def zfill(
+ self,
+ width: int,
+ ):
+ """
+ Pad strings in the Series/Index by prepending '0' characters.
+
+ Strings in the Series/Index are padded with '0' characters on the
+ left of the string to reach a total string length `width`. Strings
+ in the Series/Index with length greater or equal to `width` are
+ unchanged.
+
+ Args:
+ width (int):
+ Minimum length of resulting string; strings with length less
+ than `width` will be prepended with '0' characters.
+
+ Returns:
+ bigframes.series.Series: Series of objects.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def center(
+ self,
+ width: int,
+ fillchar: str = " ",
+ ):
+ """
+ Pad left and right side of strings in the Series/Index.
+
+ Equivalent to :meth:`str.center`.
+
+ Args:
+ width (int):
+ Minimum width of resulting string; additional characters will be filled
+ with character defined in `fillchar`.
+ fillchar (str, default ' '):
+ Additional character for filling, default is whitespace.
+
+ Returns:
+ bigframes.series.Series: Returns Series or Index with minimum number of char in object.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
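
For orientation, a minimal usage sketch of the string helpers documented above (`isupper`, `isdecimal`, `zfill`, `center`), assuming they are surfaced through the pandas-style `Series.str` accessor; the sample data is made up and a configured bigframes session is required:

```python
import bigframes.pandas as bpd

s = bpd.Series(["7", "42", "WIDGET", "widget"])

print(s.str.isupper().to_pandas())       # True only for "WIDGET"
print(s.str.isdecimal().to_pandas())     # True for "7" and "42"
print(s.str.zfill(4).to_pandas())        # left-pads with '0' to width 4
print(s.str.center(8, "-").to_pandas())  # pads both sides with '-'
```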
diff --git a/third_party/bigframes_vendored/pandas/core/window/rolling.py b/third_party/bigframes_vendored/pandas/core/window/rolling.py
index 7a9239b70c..a869c86e72 100644
--- a/third_party/bigframes_vendored/pandas/core/window/rolling.py
+++ b/third_party/bigframes_vendored/pandas/core/window/rolling.py
@@ -4,34 +4,36 @@
similar to how we have a Groupby object.
"""
+from bigframes import constants
+
class Window:
"""Provide window calculations."""
def count(self):
"""Calculate the window count of non-NULL observations."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def sum(self):
"""Calculate the weighted window sum."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def mean(self):
"""Calculate the weighted window mean."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def var(self):
"""Calculate the weighted window variance."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def std(self):
"""Calculate the weighted window standard deviation."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def max(self):
"""Calculate the weighted window maximum."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def min(self):
"""Calculate the weighted window minimum."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
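
A hedged sketch of how these `Window` reductions are typically reached, assuming bigframes mirrors the pandas `Series.rolling` entry point (sample data is made up):

```python
import bigframes.pandas as bpd

s = bpd.Series([1, 2, 3, 4, 5])

window = s.rolling(window=3)       # each row reduces over the trailing 3 rows
print(window.count().to_pandas())  # non-NULL observations per window
print(window.mean().to_pandas())   # rolling average
print(window.max().to_pandas())    # rolling maximum
```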
diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py
index 9425ead0e3..95531ff5e8 100644
--- a/third_party/bigframes_vendored/pandas/io/gbq.py
+++ b/third_party/bigframes_vendored/pandas/io/gbq.py
@@ -5,6 +5,8 @@
from typing import Iterable, Optional
+from bigframes import constants
+
class GBQIOMixin:
def read_gbq(
@@ -15,7 +17,56 @@ def read_gbq(
col_order: Iterable[str] = (),
max_results: Optional[int] = None,
):
- """Loads DataFrame from BigQuery.
+ """Loads a DataFrame from BigQuery.
+
+ BigQuery tables are an unordered, unindexed data source. By default,
+ the DataFrame will have an arbitrary index and ordering.
+
+ Set the `index_col` argument to one or more columns to choose an
+ index. The resulting DataFrame is sorted by the index columns. For the
+ best performance, ensure the index columns don't contain duplicate
+ values.
+
+ .. note::
+ By default, even SQL query inputs with an ORDER BY clause create a
+ DataFrame with an arbitrary ordering. Use ``ROW_NUMBER() OVER
+ (ORDER BY ...) AS rowindex`` in your SQL query and set
+ ``index_col='rowindex'`` to preserve the desired ordering.
+
+ If your query doesn't have an ordering, select ``GENERATE_UUID() AS
+ rowindex`` in your SQL and set ``index_col='rowindex'`` for the
+ best performance.
+
+ **Examples:**
+
+ >>> import bigframes.pandas as bpd
+ >>> bpd.options.display.progress_bar = None
+
+ Preserve ordering in a query input.
+
+ >>> bpd.read_gbq('''
+ ... SELECT
+ ... -- Instead of an ORDER BY clause on the query, use
+ ... -- ROW_NUMBER() to create an ordered DataFrame.
+ ... ROW_NUMBER() OVER (ORDER BY AVG(pitchSpeed) DESC)
+ ... AS rowindex,
+ ...
+ ... pitcherFirstName,
+ ... pitcherLastName,
+ ... AVG(pitchSpeed) AS averagePitchSpeed
+ ... FROM `bigquery-public-data.baseball.games_wide`
+ ... WHERE year = 2016
+ ... GROUP BY pitcherFirstName, pitcherLastName
+ ... ''', index_col="rowindex").head(n=5)
+ pitcherFirstName pitcherLastName averagePitchSpeed
+ rowindex
+ 1 Albertin Chapman 96.514113
+ 2 Zachary Britton 94.591039
+ 3 Trevor Rosenthal 94.213953
+ 4 Jose Torres 94.103448
+ 5 Tayron Guerrero 93.863636
+
+ [5 rows x 3 columns]
Args:
query (str):
@@ -34,4 +85,4 @@ def read_gbq(
Returns:
bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
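
Complementing the doctest above, a sketch of the ``GENERATE_UUID()`` pattern the note recommends for queries with no natural ordering; the table name is a placeholder:

```python
import bigframes.pandas as bpd

# For a query without an ordering, synthesize a unique index column so
# bigframes can build the DataFrame efficiently.
df = bpd.read_gbq(
    """
    SELECT GENERATE_UUID() AS rowindex, *
    FROM `my-project.my_dataset.my_table`
    """,
    index_col="rowindex",
)
```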
diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py
index 6f0a2b3cb4..9aed9af5a8 100644
--- a/third_party/bigframes_vendored/pandas/io/parquet.py
+++ b/third_party/bigframes_vendored/pandas/io/parquet.py
@@ -2,6 +2,8 @@
""" parquet compat """
from __future__ import annotations
+from bigframes import constants
+
class ParquetIOMixin:
def read_parquet(
@@ -22,4 +24,4 @@ def read_parquet(
Returns:
bigframes.dataframe.DataFrame: A BigQuery DataFrames DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
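
A minimal sketch of the corresponding call; the `gs://` path is a placeholder:

```python
import bigframes.pandas as bpd

# Loads the Parquet file into a BigQuery-backed DataFrame.
df = bpd.read_parquet("gs://my-bucket/data/table.parquet")
```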
diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py
index e01eb734fb..d19a92ecdf 100644
--- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py
+++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py
@@ -6,10 +6,22 @@
"""
from __future__ import annotations
-from typing import Any, Dict, Literal, MutableSequence, Optional, Sequence, Tuple, Union
+from typing import (
+ Any,
+ Dict,
+ IO,
+ Literal,
+ MutableSequence,
+ Optional,
+ Sequence,
+ Tuple,
+ Union,
+)
import numpy as np
+from bigframes import constants
+
class ReaderIOMixin:
def read_csv(
@@ -45,7 +57,8 @@ def read_csv(
Args:
filepath_or_buffer (str):
- a string path including Cloud Storage and local file.
+ A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"`;
+ otherwise passed to `pandas.read_csv`.
sep (Optional[str], default ","):
the separator for fields in a CSV file. For the BigQuery engine, the separator
can be any ISO-8859-1 single-byte character. To use a character in the range
@@ -104,10 +117,71 @@ def read_csv(
https://docs.python.org/3/library/codecs.html#standard-encodings
The BigQuery engine only supports `UTF-8` and `ISO-8859-1`.
**kwargs:
- keyword arguments.
+ keyword arguments for `pandas.read_csv` when not using the BigQuery engine.
Returns:
bigframes.dataframe.DataFrame: A BigQuery DataFrames DataFrame.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ def read_json(
+ self,
+ path_or_buf: str | IO["bytes"],
+ *,
+ orient: Literal[
+ "split", "records", "index", "columns", "values", "table"
+ ] = "columns",
+ dtype: Optional[Dict] = None,
+ encoding: Optional[str] = None,
+ lines: bool = False,
+ engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson",
+ **kwargs,
+ ):
+ """
+ Convert a JSON string to DataFrame object.
+
+ .. note::
+ Using `engine="bigquery"` will not guarantee the same ordering as the
+ file. Instead, set a serialized index column as the index and sort by
+ that in the resulting DataFrame.
+
+ Args:
+ path_or_buf (a valid JSON str, path object or file-like object):
+ A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"`;
+ otherwise passed to `pandas.read_json`.
+ orient (str, optional):
+ Indication of expected JSON string format.
+ If `engine="bigquery"`, `orient` only supports "records".
+ Compatible JSON strings can be produced by ``to_json()`` with a
+ corresponding orient value.
+ The set of possible orients is:
+
+ - ``'split'`` : dict like
+ ``{{index -> [index], columns -> [columns], data -> [values]}}``
+ - ``'records'`` : list like
+ ``[{{column -> value}}, ... , {{column -> value}}]``
+ - ``'index'`` : dict like ``{{index -> {{column -> value}}}}``
+ - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}``
+ - ``'values'`` : just the values array
+
+ dtype (bool or dict, default None):
+ If True, infer dtypes; if a dict of column to dtype, then use those;
+ if False, then don't infer dtypes at all. Applies only to the data.
+
+ For all ``orient`` values except ``'table'``, default is True.
+ encoding (str, default is 'utf-8'):
+ The encoding to use to decode py3 bytes.
+ lines (bool, default False):
+ Read the file as a JSON object per line. If using `engine="bigquery"`, `lines` only supports True.
+ engine ({{"ujson", "pyarrow", "bigquery"}}, default "ujson"):
+ Type of engine to use. If `engine="bigquery"` is specified, then BigQuery's load API will be used.
+ Otherwise, the engine will be passed to `pandas.read_json`.
+ **kwargs:
+ keyword arguments for `pandas.read_json` when not using the BigQuery engine.
+
+ Returns:
+ bigframes.dataframe.DataFrame:
+ The DataFrame representing JSON contents.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
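
A sketch of `read_json` with the BigQuery engine, respecting the constraints stated above (`orient="records"`, `lines=True`); the path is a placeholder:

```python
import bigframes.pandas as bpd

# engine="bigquery" requires newline-delimited JSON records.
df = bpd.read_json(
    "gs://my-bucket/data/records.jsonl",
    orient="records",
    lines=True,
    engine="bigquery",
)
```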
diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py
new file mode 100644
index 0000000000..71b31956a0
--- /dev/null
+++ b/third_party/bigframes_vendored/pandas/io/pickle.py
@@ -0,0 +1,57 @@
+# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/io/pickle.py
+""" pickle compat """
+from __future__ import annotations
+
+from pandas._typing import (
+ CompressionOptions,
+ FilePath,
+ ReadPickleBuffer,
+ StorageOptions,
+)
+
+from bigframes import constants
+
+
+class PickleIOMixin:
+ def read_pickle(
+ self,
+ filepath_or_buffer: FilePath | ReadPickleBuffer,
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
+ ):
+ """Load pickled BigFrames object (or any object) from file.
+
+ .. note::
+ If the content of the pickle file is a Series and its name attribute is None,
+ the name will be set to '0' by default.
+
+ Args:
+ filepath_or_buffer (str, path object, or file-like object):
+ String, path object (implementing os.PathLike[str]), or file-like object
+ implementing a binary readlines() function. Also accepts URL. URL is not
+ limited to S3 and GCS.
+ compression (str or dict, default 'infer'):
+ For on-the-fly decompression of on-disk data. If 'infer' and
+ 'filepath_or_buffer' is path-like, then detect compression from the following
+ extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz'
+ or '.tar.bz2' (otherwise no compression). If using 'zip' or 'tar', the ZIP
+ file must contain only one data file to be read in. Set to None for no
+ decompression. Can also be a dict with key 'method' set to one of {'zip',
+ 'gzip', 'bz2', 'zstd', 'tar'} and other key-value pairs are forwarded to
+ zipfile.ZipFile, gzip.GzipFile, bz2.BZ2File, zstandard.ZstdDecompressor or
+ tarfile.TarFile, respectively. As an example, the following could be passed
+ for Zstandard decompression using a custom compression dictionary
+ compression={'method': 'zstd', 'dict_data': my_compression_dict}.
+ storage_options (dict, default None):
+ Extra options that make sense for a particular storage connection, e.g. host,
+ port, username, password, etc. For HTTP(S) URLs the key-value pairs are
+ forwarded to urllib.request.Request as header options. For other URLs (e.g.
+ starting with "s3://" and "gcs://") the key-value pairs are forwarded to
+ fsspec.open. Please see fsspec and urllib for more details, and for more
+ examples on storage options refer to the pandas documentation.
+
+ Returns:
+ bigframes.dataframe.DataFrame or bigframes.series.Series: same type as object
+ stored in file.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
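
A minimal usage sketch; the path is a placeholder, and compression is inferred from the extension:

```python
import bigframes.pandas as bpd

df = bpd.read_pickle("gs://my-bucket/data/frame.pkl.gz")
```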
diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py
index 03958f7595..847ad06f75 100644
--- a/third_party/bigframes_vendored/sklearn/base.py
+++ b/third_party/bigframes_vendored/sklearn/base.py
@@ -9,6 +9,8 @@
import inspect
from typing import Any, Dict, List
+from bigframes import constants
+
class BaseEstimator:
"""Base class for all estimators.
@@ -94,7 +96,7 @@ def score(self, X, y):
Returns:
bigframes.dataframe.DataFrame: A DataFrame of the evaluation result.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class RegressorMixin:
@@ -113,14 +115,35 @@ def score(self, X, y):
``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``
is the number of samples used in the fitting for the estimator.
- y (bigframes.dataframe.DataFrame or bigframes.series.Series:
+ y (bigframes.dataframe.DataFrame or bigframes.series.Series):
Series or DataFrame of shape (n_samples,) or (n_samples, n_outputs). True
values for `X`.
Returns:
bigframes.dataframe.DataFrame: A DataFrame of the evaluation result.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+
+class TransformerMixin:
+ """Mixin class for all transformers."""
+
+ def fit_transform(self, X, y=None):
+ """Fit to data, then transform it.
+
+ Args:
+ X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+ Series or DataFrame of shape (n_samples, n_features).
+ Input samples.
+
+ y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+ Series or DataFrame of shape (n_samples,) or (n_samples, n_outputs). Default None.
+ Target values (None for unsupervised transformations).
+
+ Returns:
+ bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new)
+ Transformed DataFrame.
+ """
class MetaEstimatorMixin:
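
For context, the conventional shape of a concrete `fit_transform` in a subclass, mirroring scikit-learn's default of chaining `fit` and `transform`; this is a sketch, not the vendored implementation:

```python
class MyTransformer(TransformerMixin):
    def fit(self, X, y=None):
        ...  # learn parameters from X
        return self

    def transform(self, X):
        ...  # apply the learned transformation
        return X

    def fit_transform(self, X, y=None):
        # Fit on X (optionally using y), then transform the same data.
        return self.fit(X, y).transform(X)
```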
diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
index 068aa4d290..ff1c04edbe 100644
--- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
+++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py
@@ -14,6 +14,7 @@
from abc import ABC
from typing import List, Optional
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -32,7 +33,7 @@ def predict(self, X):
bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the
class labels for each sample.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class KMeans(_BaseKMeans):
@@ -65,7 +66,7 @@ def fit(
Returns:
KMeans: Fitted Estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def predict(
self,
@@ -76,13 +77,11 @@ def predict(
Args:
X (bigframes.dataframe.DataFrame or bigframes.series.Series):
DataFrame of shape (n_samples, n_features). New data to predict.
- y: (default None)
- Not used, present here for API consistency by convention.
Returns:
bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def score(
self,
@@ -100,7 +99,7 @@ def score(
Returns:
bigframes.dataframe.DataFrame: DataFrame of the metrics.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
@property
def cluster_centers_(self):
@@ -121,4 +120,4 @@ def cluster_centers_(self):
The output contains one row per feature per centroid.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
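
A hedged end-to-end sketch of the KMeans surface documented above; the table name is a placeholder for numeric feature columns:

```python
import bigframes.pandas as bpd
from bigframes.ml.cluster import KMeans

X = bpd.read_gbq("my-project.my_dataset.features")

model = KMeans(n_clusters=4)
model.fit(X)
labels = model.predict(X)         # cluster assignment per row
centers = model.cluster_centers_  # one row per feature per centroid
```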
diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py
index bc8bc3980a..dead173b2d 100644
--- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py
+++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py
@@ -6,6 +6,7 @@
from abc import ABCMeta
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -43,7 +44,7 @@ def fit(
Returns:
ColumnTransformer: Fitted estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def transform(
self,
@@ -58,4 +59,4 @@ def transform(
Returns:
bigframes.dataframe.DataFrame: Transformed result.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
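
A sketch of composing the transformer, assuming scikit-learn-style `(name, transformer, columns)` tuples; the column names are placeholders:

```python
from bigframes.ml.compose import ColumnTransformer
from bigframes.ml.preprocessing import OneHotEncoder, StandardScaler

preproc = ColumnTransformer(
    [
        ("scale", StandardScaler(), ["age", "income"]),
        ("encode", OneHotEncoder(), ["country"]),
    ]
)
preproc.fit(X)                 # X: placeholder DataFrame with those columns
result = preproc.transform(X)  # transformed DataFrame
```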
diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
index 619c13f35d..85feab0024 100644
--- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
+++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py
@@ -12,6 +12,7 @@
from abc import ABCMeta
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -55,7 +56,7 @@ def fit(self, X, y=None):
Returns:
PCA: Fitted estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def score(self, X=None, y=None):
"""Return the metrics of the model.
@@ -69,7 +70,7 @@ def score(self, X=None, y=None):
Returns:
bigframes.dataframe.DataFrame: DataFrame that represents model metrics.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def predict(self, X):
"""Predict the closest cluster for each sample in X.
@@ -80,4 +81,51 @@ def predict(self, X):
Returns:
bigframes.dataframe.DataFrame: predicted DataFrames."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ @property
+ def components_(self):
+ """Principal axes in feature space, representing the directions of maximum variance in the data.
+
+ Returns:
+ bigframes.dataframe.DataFrame: DataFrame of principal components, containing the following columns:
+ principal_component_id: An integer that identifies the principal component.
+
+ feature: The column name that contains the feature.
+
+ numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL.
+
+ categorical_value: A list of mappings containing information about categorical features. Each mapping contains the following fields:
+ categorical_value.category: The name of each category.
+
+ categorical_value.value: The value of categorical_value.category for the principal component that principal_component_id identifies.
+
+ The output contains one row per feature per component.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ @property
+ def explained_variance_(self):
+ """The amount of variance explained by each of the selected components.
+
+ Returns:
+ bigframes.dataframe.DataFrame: DataFrame containing the following columns:
+ principal_component_id: An integer that identifies the principal component.
+
+ explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+ @property
+ def explained_variance_ratio_(self):
+ """Percentage of variance explained by each of the selected components.
+
+ Returns:
+ bigframes.dataframe.DataFrame: DataFrame containing the following columns:
+ principal_component_id: An integer that identifies the principal component.
+
+ explained_variance_ratio: The ratio between the variance (also known as the eigenvalue) of a
+ principal component and the total variance, where the total variance is the sum of the
+ variances of all the individual principal components.
+ """
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
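
A short sketch tying together `fit` and the three properties documented above; `X` is a placeholder DataFrame of numeric features:

```python
from bigframes.ml.decomposition import PCA

model = PCA(n_components=3)
model.fit(X)

print(model.components_.to_pandas())                # one row per feature per component
print(model.explained_variance_.to_pandas())        # eigenvalue per component
print(model.explained_variance_ratio_.to_pandas())  # share of total variance
```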
diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
index 73f4684dc3..79224a772d 100644
--- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
+++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py
@@ -33,6 +33,8 @@ class calls the ``fit`` method of each sub-estimator on random samples
from abc import ABCMeta
+from bigframes import constants
+
from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, RegressorMixin
@@ -60,7 +62,7 @@ def fit(self, X, y):
Returns:
Fitted Estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta):
@@ -82,7 +84,7 @@ def predict(self, X):
Returns:
The predicted values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class RandomForestRegressor(ForestRegressor):
@@ -148,7 +150,7 @@ def predict(self, X):
Returns:
The predicted values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class RandomForestClassifier(ForestClassifier):
diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py
index 65e895298d..8141da4e3b 100644
--- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py
+++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py
@@ -18,6 +18,7 @@
from abc import ABCMeta
from typing import List, Optional
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import (
BaseEstimator,
ClassifierMixin,
@@ -36,7 +37,7 @@ def predict(self, X):
Returns:
bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class LinearClassifierMixin(ClassifierMixin):
@@ -52,7 +53,7 @@ def predict(self, X):
bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing
the class labels for each sample.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class LinearRegression(RegressorMixin, LinearModel):
@@ -92,4 +93,4 @@ def fit(
Returns:
LinearRegression: Fitted Estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
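
A hedged fit/predict/score sketch for the estimator surface above; the table and column names are placeholders:

```python
import bigframes.pandas as bpd
from bigframes.ml.linear_model import LinearRegression

df = bpd.read_gbq("my-project.my_dataset.training_data")
X = df[["feature1", "feature2"]]
y = df[["label"]]

model = LinearRegression()
model.fit(X, y)
predictions = model.predict(X)  # DataFrame of predicted values
evaluation = model.score(X, y)  # DataFrame of regression metrics
```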
diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py
index 8525e57068..a06035eef6 100644
--- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py
+++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py
@@ -14,6 +14,7 @@
from typing import List, Optional
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.linear_model._base import (
BaseEstimator,
LinearClassifierMixin,
@@ -57,4 +58,4 @@ def fit(
Returns:
LogisticRegression: Fitted Estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py
index 6d9692ac8d..a9d8038e59 100644
--- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py
+++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py
@@ -20,6 +20,8 @@
# Michal Karbownik
# License: BSD 3 clause
+from bigframes import constants
+
def accuracy_score(y_true, y_pred, normalize=True) -> float:
"""Accuracy classification score.
@@ -39,7 +41,7 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float:
classified samples (float), else returns the number of correctly
classified samples (int).
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def confusion_matrix(
@@ -68,7 +70,7 @@ def confusion_matrix(
samples with true label being i-th class and predicted label
being j-th class.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def recall_score(
@@ -99,7 +101,7 @@ def recall_score(
of the positive class in binary classification or weighted
average of the recall of each class for the multiclass task.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def precision_score(
@@ -132,7 +134,7 @@ def precision_score(
Precision of the positive class in binary classification or weighted
average of the precision of each class for the multiclass task.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def f1_score(
@@ -167,4 +169,4 @@ def f1_score(
average of the F1 scores of each class for the multiclass task.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
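
A small, self-contained sketch of the two most common calls above, on made-up labels:

```python
import bigframes.pandas as bpd
from bigframes.ml import metrics

y_true = bpd.Series([0, 1, 1, 0, 1])
y_pred = bpd.Series([0, 1, 0, 0, 1])

print(metrics.accuracy_score(y_true, y_pred))    # 0.8: fraction correctly classified
print(metrics.confusion_matrix(y_true, y_pred))  # counts per (true, predicted) pair
```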
diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py
index 693996070f..ac919edbe3 100644
--- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py
+++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py
@@ -16,6 +16,8 @@
# Michal Karbownik
# License: BSD 3 clause
+from bigframes import constants
+
def auc(x, y) -> float:
"""Compute Area Under the Curve (AUC) using the trapezoidal rule.
@@ -35,7 +37,7 @@ def auc(x, y) -> float:
Returns:
float: Area Under the Curve.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def roc_auc_score(y_true, y_score) -> float:
@@ -60,7 +62,7 @@ def roc_auc_score(y_true, y_score) -> float:
Returns:
float: Area Under the Curve score.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def roc_curve(
@@ -95,4 +97,4 @@ def roc_curve(
fpr and tpr. `thresholds[0]` represents no instances being predicted
and is arbitrarily set to `max(y_score) + 1`.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py
index b90c415887..9740c540e9 100644
--- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py
+++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py
@@ -24,6 +24,8 @@
# Ohad Michel
# License: BSD 3 clause
+from bigframes import constants
+
def r2_score(y_true, y_pred, force_finite=True) -> float:
""":math:`R^2` (coefficient of determination) regression score function.
@@ -49,4 +51,4 @@ def r2_score(y_true, y_pred, force_finite=True) -> float:
Returns:
float: The :math:`R^2` score.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py
index f8bbae86df..4b8eb25a97 100644
--- a/third_party/bigframes_vendored/sklearn/pipeline.py
+++ b/third_party/bigframes_vendored/sklearn/pipeline.py
@@ -11,6 +11,7 @@
from abc import ABCMeta
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -47,7 +48,7 @@ def fit(
Returns:
Pipeline: Pipeline with fitted steps.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def score(self, X, y):
@@ -67,7 +68,7 @@ def score(self, X, y):
DataFrame: A DataFrame representing the result
of calling `score` on the final estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def predict(self, X):
@@ -81,4 +82,4 @@ def predict(self, X):
bigframes.dataframe.DataFrame: A Dataframe representing
predicted result.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
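
A sketch chaining a scaler into a regressor, assuming scikit-learn-style `(name, step)` tuples; `X` and `y` are placeholder feature/label frames:

```python
from bigframes.ml.linear_model import LinearRegression
from bigframes.ml.pipeline import Pipeline
from bigframes.ml.preprocessing import StandardScaler

pipe = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("regressor", LinearRegression()),
    ]
)
pipe.fit(X, y)                 # fit/transform each step, then fit the estimator
predictions = pipe.predict(X)  # transform through the steps, then predict
```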
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py
index c57d1f2230..d013043467 100644
--- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py
+++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py
@@ -7,6 +7,7 @@
# Eric Chang
# License: BSD 3 clause
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -59,7 +60,7 @@ def fit(self, X):
Returns:
StandardScaler: Fitted scaler.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def transform(self, X):
"""Perform standardization by centering and scaling.
@@ -71,4 +72,4 @@ def transform(self, X):
Returns:
bigframes.dataframe.DataFrame: Transformed result.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py
index a6c32d91c1..b1cf17e539 100644
--- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py
+++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py
@@ -2,6 +2,7 @@
# Joris Van den Bossche
# License: BSD 3 clause
+from bigframes import constants
from third_party.bigframes_vendored.sklearn.base import BaseEstimator
@@ -61,7 +62,7 @@ def fit(self, X):
Returns:
OneHotEncoder: Fitted encoder.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def transform(self, X):
"""Transform X using one-hot encoding.
@@ -73,4 +74,4 @@ def transform(self, X):
Returns:
bigframes.dataframe.DataFrame: The result is categorized as index: number, value: number,
where index is the position at which the category was seen and value is 0 or 1."""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
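
A self-contained sketch of the encoder round trip, on made-up data:

```python
import bigframes.pandas as bpd
from bigframes.ml.preprocessing import OneHotEncoder

X = bpd.DataFrame({"color": ["red", "blue", "red"]})

encoder = OneHotEncoder()
encoder.fit(X)
encoded = encoder.transform(X)  # per row: index/value pairs, value is 0 or 1
```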
diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py
index fcb5d2ec59..620c87fa3d 100644
--- a/third_party/bigframes_vendored/xgboost/sklearn.py
+++ b/third_party/bigframes_vendored/xgboost/sklearn.py
@@ -2,6 +2,8 @@
from typing import Any
+from bigframes import constants
+
from ..sklearn.base import BaseEstimator as XGBModelBase
from ..sklearn.base import ClassifierMixin as XGBClassifierBase
from ..sklearn.base import RegressorMixin as XGBRegressorBase
@@ -18,7 +20,7 @@ def predict(self, X):
Returns:
DataFrame of shape (n_samples,): Returns predicted values.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
def fit(self, X, y):
"""Fit gradient boosting model.
@@ -42,7 +44,7 @@ def fit(self, X, y):
Returns:
XGBModel: Fitted Estimator.
"""
- raise NotImplementedError("abstract method")
+ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
class XGBClassifierMixIn:
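
Finally, a hedged sketch of the XGBoost-style surface, assuming it is exposed as `bigframes.ml.ensemble.XGBRegressor`; `X` and `y` are placeholder feature/label frames:

```python
from bigframes.ml.ensemble import XGBRegressor

model = XGBRegressor()
model.fit(X, y)                 # trains a boosted-trees model via BigQuery ML
predictions = model.predict(X)  # DataFrame of shape (n_samples,)
```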