Skip to content

Commit b26e135

Browse files
feat: Support python type as astype arg (#1316)
1 parent deb015d commit b26e135

File tree

17 files changed

+160
-87
lines changed

17 files changed

+160
-87
lines changed

bigframes/core/blocks.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,7 @@ def split(
707707
# Create an ordering col and convert to string
708708
block, ordering_col = block.promote_offsets()
709709
block, string_ordering_col = block.apply_unary_op(
710-
ordering_col, ops.AsTypeOp(to_type="string[pyarrow]")
710+
ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE)
711711
)
712712

713713
# Apply hash method to sum col and order by it.
@@ -1479,7 +1479,9 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block:
14791479
expr, new_col = expr.project_to_id(
14801480
expression=ops.add_op.as_expr(
14811481
ex.const(prefix),
1482-
ops.AsTypeOp(to_type="string").as_expr(index_col),
1482+
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
1483+
index_col
1484+
),
14831485
),
14841486
)
14851487
new_index_cols.append(new_col)
@@ -1502,7 +1504,9 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block:
15021504
for index_col in self._index_columns:
15031505
expr, new_col = expr.project_to_id(
15041506
expression=ops.add_op.as_expr(
1505-
ops.AsTypeOp(to_type="string").as_expr(index_col),
1507+
ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE).as_expr(
1508+
index_col
1509+
),
15061510
ex.const(suffix),
15071511
),
15081512
)

bigframes/core/compile/ibis_types.py

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,8 @@
1313
# limitations under the License.
1414
from __future__ import annotations
1515

16-
import textwrap
1716
import typing
18-
from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
17+
from typing import cast, Dict, Iterable, Optional, Tuple, Union
1918

2019
import bigframes_vendored.constants as constants
2120
import bigframes_vendored.ibis
@@ -28,7 +27,6 @@
2827
import db_dtypes # type: ignore
2928
import geopandas as gpd # type: ignore
3029
import google.cloud.bigquery as bigquery
31-
import numpy as np
3230
import pandas as pd
3331
import pyarrow as pa
3432

@@ -228,9 +226,7 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value:
228226

229227

230228
def bigframes_dtype_to_ibis_dtype(
231-
bigframes_dtype: Union[
232-
bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype, np.dtype[Any]
233-
]
229+
bigframes_dtype: bigframes.dtypes.Dtype,
234230
) -> ibis_dtypes.DataType:
235231
"""Converts a BigQuery DataFrames supported dtype to an Ibis dtype.
236232
@@ -244,36 +240,14 @@ def bigframes_dtype_to_ibis_dtype(
244240
Raises:
245241
ValueError: If passed a dtype not supported by BigQuery DataFrames.
246242
"""
247-
if str(bigframes_dtype) in bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES:
248-
bigframes_dtype = bigframes.dtypes.BIGFRAMES_STRING_TO_BIGFRAMES[
249-
cast(bigframes.dtypes.DtypeString, str(bigframes_dtype))
250-
]
251-
252243
if bigframes_dtype in BIGFRAMES_TO_IBIS.keys():
253244
return BIGFRAMES_TO_IBIS[bigframes_dtype]
254245

255246
elif isinstance(bigframes_dtype, pd.ArrowDtype) and bigframes_dtype.pyarrow_dtype:
256247
return _arrow_dtype_to_ibis_dtype(bigframes_dtype.pyarrow_dtype)
257248

258249
else:
259-
raise ValueError(
260-
textwrap.dedent(
261-
f"""
262-
Unexpected data type {bigframes_dtype}. The following
263-
str dtypes are supppted: 'boolean','Float64','Int64',
264-
'int64[pyarrow]','string','string[pyarrow]',
265-
'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
266-
'date32[day][pyarrow]','time64[us][pyarrow]'.
267-
The following pandas.ExtensionDtype are supported:
268-
pandas.BooleanDtype(), pandas.Float64Dtype(),
269-
pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
270-
pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
271-
pd.ArrowDtype(pa.timestamp("us")),
272-
pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
273-
{constants.FEEDBACK_LINK}
274-
"""
275-
)
276-
)
250+
raise ValueError(f"Datatype has no ibis type mapping: {bigframes_dtype}")
277251

278252

279253
def ibis_dtype_to_bigframes_dtype(

bigframes/core/indexes/base.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ def __new__(
7878
if name is not None:
7979
index.name = name
8080
if dtype is not None:
81-
index = index.astype(dtype)
81+
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
82+
index = index.astype(bf_dtype)
8283
block = index._block
8384
elif isinstance(data, pandas.Index):
8485
pd_df = pandas.DataFrame(index=data)
@@ -310,14 +311,15 @@ def sort_values(self, *, ascending: bool = True, na_position: str = "last"):
310311

311312
def astype(
312313
self,
313-
dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype],
314+
dtype,
314315
*,
315316
errors: Literal["raise", "null"] = "raise",
316317
) -> Index:
317318
if errors not in ["raise", "null"]:
318319
raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
319320
if self.nlevels > 1:
320321
raise TypeError("Multiindex does not support 'astype'")
322+
dtype = bigframes.dtypes.bigframes_type(dtype)
321323
return self._apply_unary_expr(
322324
ops.AsTypeOp(to_type=dtype, safe=(errors == "null")).as_expr(
323325
ex.free_var("arg")

bigframes/core/local_data.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,12 @@ def arrow_type_replacements(type: pa.DataType) -> pa.DataType:
5959
if pa.types.is_time64(type):
6060
# This is potentially lossy, but BigFrames doesn't support ns
6161
return pa.time64("us")
62+
if pa.types.is_decimal128(type):
63+
return pa.decimal128(38, 9)
64+
if pa.types.is_decimal256(type):
65+
return pa.decimal256(76, 38)
66+
if pa.types.is_dictionary(type):
67+
return arrow_type_replacements(type.value_type)
6268
if pa.types.is_large_string(type):
6369
# simple string type can handle the largest strings needed
6470
return pa.string()

bigframes/dataframe.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,8 @@ def __init__(
180180
if columns:
181181
block = block.select_columns(list(columns)) # type:ignore
182182
if dtype:
183-
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=dtype))
183+
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
184+
block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype))
184185
self._block = block
185186

186187
else:
@@ -368,6 +369,7 @@ def astype(
368369
dtype: Union[
369370
bigframes.dtypes.DtypeString,
370371
bigframes.dtypes.Dtype,
372+
type,
371373
dict[str, Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype]],
372374
],
373375
*,
@@ -378,23 +380,15 @@ def astype(
378380

379381
safe_cast = errors == "null"
380382

381-
# Type strings check
382-
if dtype in bigframes.dtypes.DTYPE_STRINGS:
383-
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
384-
385-
# Type instances check
386-
if type(dtype) in bigframes.dtypes.DTYPES:
387-
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
388-
389383
if isinstance(dtype, dict):
390384
result = self.copy()
391385
for col, to_type in dtype.items():
392386
result[col] = result[col].astype(to_type)
393387
return result
394388

395-
raise TypeError(
396-
f"Invalid type {type(dtype)} for dtype input. {constants.FEEDBACK_LINK}"
397-
)
389+
dtype = bigframes.dtypes.bigframes_type(dtype)
390+
391+
return self._apply_unary_op(ops.AsTypeOp(dtype, safe_cast))
398392

399393
def _to_sql_query(
400394
self, include_index: bool, enable_cache: bool = True

bigframes/dtypes.py

Lines changed: 91 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from dataclasses import dataclass
1818
import datetime
1919
import decimal
20+
import textwrap
2021
import typing
2122
from typing import Any, Dict, List, Literal, Union
2223

@@ -422,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype:
422423
return DEFAULT_DTYPE
423424

424425
# No other types matched.
425-
raise ValueError(
426+
raise TypeError(
426427
f"Unexpected Arrow data type {arrow_dtype}. {constants.FEEDBACK_LINK}"
427428
)
428429

@@ -447,7 +448,7 @@ def bigframes_dtype_to_arrow_dtype(
447448
if pa.types.is_struct(bigframes_dtype.pyarrow_dtype):
448449
return bigframes_dtype.pyarrow_dtype
449450
else:
450-
raise ValueError(
451+
raise TypeError(
451452
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
452453
)
453454

@@ -474,7 +475,7 @@ def bigframes_dtype_to_literal(
474475
if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
475476
return shapely.Point((0, 0))
476477

477-
raise ValueError(
478+
raise TypeError(
478479
f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
479480
)
480481

@@ -515,11 +516,91 @@ def arrow_type_to_literal(
515516
if pa.types.is_time(arrow_type):
516517
return datetime.time(1, 1, 1)
517518

518-
raise ValueError(
519+
raise TypeError(
519520
f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
520521
)
521522

522523

524+
def bigframes_type(dtype) -> Dtype:
    """Convert a dtype-like object to the canonical bigframes dtype.

    Accepted inputs, tried in this order:
      * a dtype that is already canonical (returned unchanged),
      * a ``str``, resolved by ``_dtype_from_string``,
      * a class object such as ``int``, resolved by
        ``_infer_dtype_from_python_type``,
      * a ``pyarrow.DataType``, resolved by
        ``arrow_dtype_to_bigframes_dtype``.

    Raises:
        TypeError: If ``dtype`` is none of the kinds above. The individual
            converters also raise ``TypeError`` for values they cannot map.
    """
    # The canonical check runs first so already-normalized dtypes pass
    # straight through without being re-interpreted.
    if _is_bigframes_dtype(dtype):
        return dtype

    # (accepted kind, converter) pairs; order matches the documented dispatch.
    converters = (
        (str, _dtype_from_string),
        (type, _infer_dtype_from_python_type),
        (pa.DataType, arrow_dtype_to_bigframes_dtype),
    )
    for accepted_kind, convert in converters:
        if isinstance(dtype, accepted_kind):
            return convert(dtype)

    raise TypeError(
        f"Cannot infer supported datatype for: {dtype}. {constants.FEEDBACK_LINK}"
    )
538+
539+
540+
def _is_bigframes_dtype(dtype) -> bool:
    """Return True iff ``dtype`` is a canonical bigframes dtype.

    A value is canonical when it is exactly one of the dtypes listed in
    ``SIMPLE_TYPES`` (same class and equal), or when it is a
    ``pd.ArrowDtype`` whose pyarrow type ``arrow_dtype_to_bigframes_dtype``
    accepts.

    Any other object, including unhashable ones, yields ``False``.
    """
    # Have to be quite strict: pyarrow dtypes equal their string form, and
    # the string form is not considered canonical. So require the exact
    # class in addition to equality. The class check also short-circuits
    # before ``==`` is evaluated against unrelated objects, and no hashing
    # of ``dtype`` is needed.
    for item in SIMPLE_TYPES:
        canonical = item.dtype
        if type(dtype) is type(canonical) and dtype == canonical:
            return True

    if isinstance(dtype, pd.ArrowDtype):
        try:
            arrow_dtype_to_bigframes_dtype(dtype.pyarrow_dtype)
        except TypeError:
            # The arrow type has no bigframes equivalent.
            return False
        return True

    return False
554+
555+
556+
def _infer_dtype_from_python_type(type: type) -> Dtype:
    """Map a Python or numpy scalar class to a bigframes dtype.

    Args:
        type: The class to classify (e.g. ``int``, ``np.float64``,
            ``datetime.date``). The name shadows the builtin; it is kept
            so the existing signature is unchanged.

    Returns:
        The bigframes dtype matching the first category ``type`` is a
        subclass of.

    Raises:
        TypeError: If ``type`` matches none of the known categories.
    """
    # Order matters: ``bool`` is a subclass of ``int``, so it is tested first.
    if issubclass(type, (bool, np.bool_)):
        return BOOL_DTYPE
    # NOTE(review): np.timedelta64 derives from np.integer and therefore
    # lands in this branch - confirm that is the intended mapping.
    if issubclass(type, (int, np.integer)):
        return INT_DTYPE
    if issubclass(type, (float, np.floating)):
        return FLOAT_DTYPE
    if issubclass(type, decimal.Decimal):
        return NUMERIC_DTYPE
    if issubclass(type, (str, np.str_)):
        return STRING_DTYPE
    if issubclass(type, (bytes, np.bytes_)):
        return BYTES_DTYPE
    # ``datetime.datetime`` is a subclass of ``datetime.date``; check it
    # first so a datetime class is not silently treated as a date. A bare
    # class carries no tzinfo, so the timezone-naive dtype is used.
    if issubclass(type, datetime.datetime):
        return DATETIME_DTYPE
    if issubclass(type, datetime.date):
        return DATE_DTYPE
    if issubclass(type, datetime.time):
        return TIME_DTYPE
    raise TypeError(
        f"No matching datatype for python type: {type}. {constants.FEEDBACK_LINK}"
    )
577+
578+
579+
def _dtype_from_string(dtype_string: str) -> Dtype:
    """Resolve a dtype string (e.g. ``"Int64"``) to its bigframes dtype.

    Args:
        dtype_string: Name to look up in ``BIGFRAMES_STRING_TO_BIGFRAMES``.

    Returns:
        The dtype registered under that name. This function never returns
        ``None``; an unknown name raises instead.

    Raises:
        TypeError: If the name is not a key of
            ``BIGFRAMES_STRING_TO_BIGFRAMES``.
    """
    # Normalize once and reuse the result for both the membership test and
    # the lookup.
    key = str(dtype_string)
    if key in BIGFRAMES_STRING_TO_BIGFRAMES:
        return BIGFRAMES_STRING_TO_BIGFRAMES[typing.cast(DtypeString, key)]

    raise TypeError(
        textwrap.dedent(
            f"""
            Unexpected data type string {dtype_string}. The following
            dtypes are supppted: 'boolean','Float64','Int64',
            'int64[pyarrow]','string','string[pyarrow]',
            'timestamp[us, tz=UTC][pyarrow]','timestamp[us][pyarrow]',
            'date32[day][pyarrow]','time64[us][pyarrow]'.
            The following pandas.ExtensionDtype are supported:
            pandas.BooleanDtype(), pandas.Float64Dtype(),
            pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"),
            pd.ArrowDtype(pa.date32()), pd.ArrowDtype(pa.time64("us")),
            pd.ArrowDtype(pa.timestamp("us")),
            pd.ArrowDtype(pa.timestamp("us", tz="UTC")).
            {constants.FEEDBACK_LINK}
            """
        )
    )
602+
603+
523604
def infer_literal_type(literal) -> typing.Optional[Dtype]:
524605
# Maybe also normalize literal to canonical python representation to remove this burden from compilers?
525606
if pd.api.types.is_list_like(literal):
@@ -539,30 +620,17 @@ def infer_literal_type(literal) -> typing.Optional[Dtype]:
539620
return pd.ArrowDtype(pa.struct(fields))
540621
if pd.isna(literal):
541622
return None # Null value without a definite type
542-
if isinstance(literal, (bool, np.bool_)):
543-
return BOOL_DTYPE
544-
if isinstance(literal, (int, np.integer)):
545-
return INT_DTYPE
546-
if isinstance(literal, (float, np.floating)):
547-
return FLOAT_DTYPE
548-
if isinstance(literal, decimal.Decimal):
549-
return NUMERIC_DTYPE
550-
if isinstance(literal, (str, np.str_)):
551-
return STRING_DTYPE
552-
if isinstance(literal, (bytes, np.bytes_)):
553-
return BYTES_DTYPE
554623
# Make sure to check datetime before date as datetimes are also dates
555624
if isinstance(literal, (datetime.datetime, pd.Timestamp)):
556625
if literal.tzinfo is not None:
557626
return TIMESTAMP_DTYPE
558627
else:
559628
return DATETIME_DTYPE
560-
if isinstance(literal, datetime.date):
561-
return DATE_DTYPE
562-
if isinstance(literal, datetime.time):
563-
return TIME_DTYPE
629+
from_python_type = _infer_dtype_from_python_type(type(literal))
630+
if from_python_type is not None:
631+
return from_python_type
564632
else:
565-
raise ValueError(f"Unable to infer type for value: {literal}")
633+
raise TypeError(f"Unable to infer type for value: {literal}")
566634

567635

568636
def infer_literal_arrow_type(literal) -> typing.Optional[pa.DataType]:
@@ -602,7 +670,7 @@ def convert_schema_field(
602670
return field.name, pd.ArrowDtype(pa_type)
603671
return field.name, _TK_TO_BIGFRAMES[field.field_type]
604672
else:
605-
raise ValueError(f"Cannot handle type: {field.field_type}")
673+
raise TypeError(f"Cannot handle type: {field.field_type}")
606674

607675

608676
def convert_to_schema_field(
@@ -636,7 +704,7 @@ def convert_to_schema_field(
636704
if bigframes_dtype.pyarrow_dtype == pa.duration("us"):
637705
# Timedeltas are represented as integers in microseconds.
638706
return google.cloud.bigquery.SchemaField(name, "INTEGER")
639-
raise ValueError(
707+
raise TypeError(
640708
f"No arrow conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
641709
)
642710

bigframes/operations/base.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ def __init__(
8787
if name is not None:
8888
data.name = name
8989
if dtype is not None:
90-
data = data.astype(dtype)
90+
bf_dtype = bigframes.dtypes.bigframes_type(dtype)
91+
data = data.astype(bf_dtype)
9192
else: # local dict-like data
9293
data = read_pandas_func(pd.Series(data, name=name, dtype=dtype)) # type: ignore
9394
data_block = data._block

0 commit comments

Comments
 (0)