Commit 2d564a6

feat: Add support for temporal types in dataframe's describe() method (#1189)
* feat: Add support for temporal types in dataframe's describe() method
* add type hint to make mypy happy
* directly use output_type to check agg op support for input type
* format code
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
* perf: update df.corr, df.cov to be used with more than 30 columns case. (#1161)
* perf: update df.corr, df.cov to be used with more than 30 columns case.
* add large test
* remove print
* fix_index
* fix index
* test fix
* fix test
* fix test
* lint code
* fix import path
1 parent 05f8b4d commit 2d564a6

File tree

5 files changed: +100 -119 lines

bigframes/dataframe.py

Lines changed: 50 additions & 13 deletions
@@ -517,6 +517,17 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
         )
         return DataFrame(self._block.select_columns(selected_columns))

+    def _select_exact_dtypes(
+        self, dtypes: Sequence[bigframes.dtypes.Dtype]
+    ) -> DataFrame:
+        """Selects columns without considering inheritance relationships."""
+        columns = [
+            col_id
+            for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
+            if dtype in dtypes
+        ]
+        return DataFrame(self._block.select_columns(columns))
+
     def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
         self._query_job = query_job
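
The new helper matches dtypes by exact equality only. As a rough standalone illustration of the idea (plain pandas, not the bigframes internals; the helper name is reused here just for clarity), compare it with select_dtypes, which also matches compatible subtypes:

import pandas as pd

def select_exact_dtypes(df: pd.DataFrame, dtypes: list) -> pd.DataFrame:
    # Keep only columns whose dtype compares equal to one of the requested
    # dtypes; no subtype/inheritance matching as in DataFrame.select_dtypes.
    columns = [col for col in df.columns if df[col].dtype in dtypes]
    return df[columns]

df = pd.DataFrame({"a": [1], "b": [1.5], "c": ["x"]}).convert_dtypes()
print(select_exact_dtypes(df, [pd.Int64Dtype()]).columns.tolist())  # ['a']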

@@ -2437,13 +2448,9 @@ def agg(
            aggregations = [agg_ops.lookup_agg_func(f) for f in func]

            for dtype, agg in itertools.product(self.dtypes, aggregations):
-                if not bigframes.operations.aggregations.is_agg_op_supported(
-                    dtype, agg
-                ):
-                    raise NotImplementedError(
-                        f"Type {dtype} does not support aggregation {agg}. "
-                        f"Share your usecase with the BigQuery DataFrames team at the {constants.FEEDBACK_LINK}"
-                    )
+                agg.output_type(
+                    dtype
+                )  # Raises exception if the agg does not support the dtype.

            return DataFrame(
                self._block.summarize(
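
The per-dtype check now simply calls each aggregation's output_type, which raises when the input dtype is unsupported, so the separate is_agg_op_supported helper (deleted further below) is no longer needed. A minimal sketch of that validation pattern (MeanOp here is invented for illustration and is not the bigframes implementation):

class MeanOp:
    name = "mean"

    def output_type(self, input_type: str) -> str:
        # Computing the output dtype doubles as the support check: raise for
        # unsupported inputs, otherwise return the result dtype.
        if input_type not in ("Int64", "Float64"):
            raise TypeError(f"{self.name} does not support dtype {input_type}")
        return "Float64"

for dtype in ("Int64", "string"):
    try:
        MeanOp().output_type(dtype)  # raises only for the "string" case
    except TypeError as exc:
        print(exc)
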
@@ -2512,7 +2519,10 @@ def melt(

     def describe(self, include: None | Literal["all"] = None) -> DataFrame:
         if include is None:
-            numeric_df = self._drop_non_numeric(permissive=False)
+            numeric_df = self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+                + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            )
             if len(numeric_df.columns) == 0:
                 # Describe eligible non-numeric columns
                 return self._describe_non_numeric()
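
With include=None, describe() now keeps date, datetime, and timestamp columns instead of silently dropping them. A hedged usage sketch (column names invented; assumes a configured BigQuery session for bigframes):

import pandas as pd
import bigframes.pandas as bpd

pdf = pd.DataFrame(
    {
        "x": [1, 2, 3],
        "ts": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
    }
)
df = bpd.read_pandas(pdf)

# Numeric columns keep the full count/mean/std/min/percentile/max summary;
# temporal columns contribute only a "count" row.
print(df.describe().to_pandas())
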
@@ -2540,9 +2550,11 @@ def describe(self, include: None | Literal["all"] = None) -> DataFrame:
            raise ValueError(f"Unsupported include type: {include}")

     def _describe_numeric(self) -> DataFrame:
-        return typing.cast(
+        number_df_result = typing.cast(
             DataFrame,
-            self._drop_non_numeric(permissive=False).agg(
+            self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+            ).agg(
                 [
                     "count",
                     "mean",
@@ -2555,16 +2567,41 @@ def _describe_numeric(self) -> DataFrame:
                 ]
             ),
         )
+        temporal_df_result = typing.cast(
+            DataFrame,
+            self._select_exact_dtypes(
+                bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            ).agg(["count"]),
+        )
+
+        if len(number_df_result.columns) == 0:
+            return temporal_df_result
+        elif len(temporal_df_result.columns) == 0:
+            return number_df_result
+        else:
+            import bigframes.core.reshape.api as rs
+
+            original_columns = self._select_exact_dtypes(
+                bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
+                + bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
+            ).columns
+
+            # Use reindex after join to preserve the original column order.
+            return rs.concat(
+                [number_df_result, temporal_df_result],
+                axis=1,
+            )._reindex_columns(original_columns)

     def _describe_non_numeric(self) -> DataFrame:
         return typing.cast(
             DataFrame,
-            self.select_dtypes(
-                include={
+            self._select_exact_dtypes(
+                [
                     bigframes.dtypes.STRING_DTYPE,
                     bigframes.dtypes.BOOL_DTYPE,
                     bigframes.dtypes.BYTES_DTYPE,
-                }
+                    bigframes.dtypes.TIME_DTYPE,
+                ]
             ).agg(["count", "nunique"]),
         )
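
The combination step concatenates the numeric and temporal summaries and then restores the caller's original column order. The same trick in plain pandas (standalone sketch with invented data):

import pandas as pd

original_columns = ["a", "ts", "b"]  # order of the columns in the source frame

number_stats = pd.DataFrame({"a": [3, 2.0], "b": [3, 5.0]}, index=["count", "mean"])
temporal_stats = pd.DataFrame({"ts": [3]}, index=["count"])

# concat(axis=1) appends the temporal block after the numeric block, so a
# reindex is needed to bring the columns back into their original order.
combined = pd.concat([number_stats, temporal_stats], axis=1)
combined = combined.reindex(columns=original_columns)
print(list(combined.columns))  # ['a', 'ts', 'b']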

bigframes/dtypes.py

Lines changed: 13 additions & 4 deletions
@@ -18,7 +18,7 @@
 import datetime
 import decimal
 import typing
-from typing import Dict, Literal, Union
+from typing import Dict, List, Literal, Union

 import bigframes_vendored.constants as constants
 import geopandas as gpd  # type: ignore
@@ -211,7 +211,7 @@ class SimpleDtypeInfo:

 # Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation)
 # Pandas is inconsistent, so two definitions are provided, each used in different contexts
-NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [
+NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE: List[Dtype] = [
     FLOAT_DTYPE,
     INT_DTYPE,
 ]
@@ -222,7 +222,16 @@ class SimpleDtypeInfo:
 ]


-## dtype predicates - use these to maintain consistency
+# Temporal types that are considered as "numeric" by Pandas
+TEMPORAL_NUMERIC_BIGFRAMES_TYPES: List[Dtype] = [
+    DATE_DTYPE,
+    TIMESTAMP_DTYPE,
+    DATETIME_DTYPE,
+]
+TEMPORAL_BIGFRAMES_TYPES = TEMPORAL_NUMERIC_BIGFRAMES_TYPES + [TIME_DTYPE]
+
+
+# dtype predicates - use these to maintain consistency
 def is_datetime_like(type_: ExpressionType) -> bool:
     return type_ in (DATETIME_DTYPE, TIMESTAMP_DTYPE)

@@ -630,7 +639,7 @@ def can_coerce(source_type: ExpressionType, target_type: ExpressionType) -> bool
         return True  # None can be coerced to any supported type
     else:
         return (source_type == STRING_DTYPE) and (
-            target_type in (DATETIME_DTYPE, TIMESTAMP_DTYPE, TIME_DTYPE, DATE_DTYPE)
+            target_type in TEMPORAL_BIGFRAMES_TYPES
         )
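
TIME_DTYPE is deliberately included in TEMPORAL_BIGFRAMES_TYPES but not in TEMPORAL_NUMERIC_BIGFRAMES_TYPES, which is why describe() routes time columns through the count/nunique path. A quick check against the constants introduced above (assumes a bigframes build containing this commit):

from bigframes import dtypes

assert dtypes.DATE_DTYPE in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
assert dtypes.TIME_DTYPE in dtypes.TEMPORAL_BIGFRAMES_TYPES
assert dtypes.TIME_DTYPE not in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
# can_coerce() now covers STRING -> any temporal type via the new list.
assert dtypes.can_coerce(dtypes.STRING_DTYPE, dtypes.TIME_DTYPE)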

bigframes/operations/aggregations.py

Lines changed: 0 additions & 11 deletions
@@ -579,14 +579,3 @@ def lookup_agg_func(key: str) -> typing.Union[UnaryAggregateOp, NullaryAggregate
         return _AGGREGATIONS_LOOKUP[key]
     else:
         raise ValueError(f"Unrecognize aggregate function: {key}")
-
-
-def is_agg_op_supported(dtype: dtypes.Dtype, op: AggregateOp) -> bool:
-    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE:
-        return True
-
-    if dtype in (dtypes.STRING_DTYPE, dtypes.BOOL_DTYPE, dtypes.BYTES_DTYPE):
-        return isinstance(op, (CountOp, NuniqueOp))
-
-    # For all other types, support no aggregation
-    return False

tests/system/small/test_dataframe.py

Lines changed: 37 additions & 8 deletions
@@ -2671,11 +2671,11 @@ def test_dataframe_agg_int_multi_string(scalars_dfs):


 @skip_legacy_pandas
-def test_df_describe(scalars_dfs):
+def test_df_describe_non_temporal(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
-    # pyarrows time columns fail in pandas
+    # excluding temporal columns here because BigFrames cannot perform percentiles operations on them
     unsupported_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
-    bf_result = scalars_df.describe().to_pandas()
+    bf_result = scalars_df.drop(columns=unsupported_columns).describe().to_pandas()

     modified_pd_df = scalars_pandas_df.drop(columns=unsupported_columns)
     pd_result = modified_pd_df.describe()
@@ -2709,12 +2709,14 @@ def test_df_describe(scalars_dfs):
 def test_df_describe_non_numeric(scalars_dfs, include):
     scalars_df, scalars_pandas_df = scalars_dfs

-    non_numeric_columns = ["string_col", "bytes_col", "bool_col"]
+    # Excluding "date_col" here because in BigFrames it is used as PyArrow[date32()], which is
+    # considered numerical in Pandas
+    target_columns = ["string_col", "bytes_col", "bool_col", "time_col"]

-    modified_bf = scalars_df[non_numeric_columns]
+    modified_bf = scalars_df[target_columns]
     bf_result = modified_bf.describe(include=include).to_pandas()

-    modified_pd_df = scalars_pandas_df[non_numeric_columns]
+    modified_pd_df = scalars_pandas_df[target_columns]
     pd_result = modified_pd_df.describe(include=include)

     # Reindex results with the specified keys and their order, because
@@ -2726,8 +2728,35 @@ def test_df_describe_non_numeric(scalars_dfs, include):
     ).rename(index={"unique": "nunique"})

     pd.testing.assert_frame_equal(
-        pd_result[non_numeric_columns].astype("Int64"),
-        bf_result[non_numeric_columns],
+        pd_result.astype("Int64"),
+        bf_result,
+        check_index_type=False,
+    )
+
+
+@skip_legacy_pandas
+def test_df_describe_temporal(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    temporal_columns = ["datetime_col", "timestamp_col", "time_col", "date_col"]
+
+    modified_bf = scalars_df[temporal_columns]
+    bf_result = modified_bf.describe(include="all").to_pandas()
+
+    modified_pd_df = scalars_pandas_df[temporal_columns]
+    pd_result = modified_pd_df.describe(include="all")
+
+    # Reindex results with the specified keys and their order, because
+    # the relative order is not important.
+    bf_result = bf_result.reindex(["count", "nunique"])
+    pd_result = pd_result.reindex(
+        ["count", "unique"]
+        # BF counter part of "unique" is called "nunique"
+    ).rename(index={"unique": "nunique"})
+
+    pd.testing.assert_frame_equal(
+        pd_result.astype("Float64"),
+        bf_result.astype("Float64"),
         check_index_type=False,
     )

tests/unit/operations/test_aggregations.py

Lines changed: 0 additions & 83 deletions
This file was deleted.
