From 9798a2b14dffb20432f732343cac92341e42fe09 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Fri, 15 Dec 2023 14:42:07 -0600 Subject: [PATCH 01/27] deps: migrate to `ibis-framework >= "7.1.0"` (#53) * deps: migrate to `ibis-framework >= "7.0.0"` This should unlock some bug fixes as well as potential `UNNEST` support in a future change. * use dtype instead of output_dtype in custom ops * adjust type annotations * Update noxfile.py * update type annotations * fix for deferred values * fix prerelease * minimum 7.1.0 * mypy * revert presubmit changes * fix minimum sqlglot * fix custom op * hack InMemoryTable formatter back in * use ops module to avoid breaking changes if ops move around * workaround nullscalar issue * update usage of percent_rank to explicitly order by the value * disable ibis prerelease tests for now * fix unit_prerelease --- bigframes/core/compile/compiled.py | 25 ++++++- bigframes/core/reshape/__init__.py | 6 +- bigframes/operations/__init__.py | 70 ++++++++++++++++--- bigframes/operations/aggregations.py | 8 +-- bigframes/remote_function.py | 9 +-- bigframes/session/__init__.py | 7 +- mypy.ini | 3 + noxfile.py | 26 +++---- setup.py | 2 +- testing/constraints-3.9.txt | 4 +- tests/system/small/test_ibis.py | 13 ++-- tests/unit/resources.py | 2 +- .../ibis/backends/bigquery/__init__.py | 3 + .../ibis/backends/bigquery/compiler.py | 59 ++++++++++++++++ .../ibis/expr/operations/analytic.py | 14 ++-- .../ibis/expr/operations/json.py | 2 +- .../ibis/expr/operations/reductions.py | 8 +-- 17 files changed, 198 insertions(+), 63 deletions(-) create mode 100644 third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 461c2c005a..537d9c8b52 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -21,6 +21,7 @@ import ibis import ibis.backends.bigquery as ibis_bigquery +import ibis.common.deferred # type: ignore import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types import pandas @@ -62,7 +63,16 @@ def __init__( self._columns = tuple(columns) # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } @property def columns(self) -> typing.Tuple[ibis_types.Value, ...]: @@ -643,7 +653,16 @@ def __init__( # To allow for more efficient lookup by column name, create a # dictionary mapping names to column values. - self._column_names = {column.get_name(): column for column in self._columns} + self._column_names = { + ( + column.resolve(table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. 
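+                # A Deferred has no name of its own; resolving it against the
+                # table yields a concrete column whose name we can read.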
+ if isinstance(column, ibis.common.deferred.Deferred) + else column + ).get_name(): column + for column in self._columns + } self._hidden_ordering_column_names = { column.get_name(): column for column in self._hidden_ordering_columns } @@ -860,7 +879,7 @@ def project_window_op( case_statement = ibis.case() for clause in clauses: case_statement = case_statement.when(clause[0], clause[1]) - case_statement = case_statement.else_(window_op).end() + case_statement = case_statement.else_(window_op).end() # type: ignore window_op = case_statement result = self._set_or_replace_by_id(output_name or column_name, window_op) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index dc61c3baad..24c1bff309 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -18,6 +18,7 @@ import bigframes.constants as constants import bigframes.core as core +import bigframes.core.ordering as order import bigframes.core.utils as utils import bigframes.dataframe import bigframes.operations as ops @@ -145,7 +146,10 @@ def qcut( block, result = block.apply_window_op( x._value_column, agg_ops.QcutOp(q), - window_spec=core.WindowSpec(grouping_keys=(nullity_id,)), + window_spec=core.WindowSpec( + grouping_keys=(nullity_id,), + ordering=(order.OrderingColumnReference(x._value_column),), + ), ) block, result = block.apply_binary_op( result, nullity_id, ops.partial_arg3(ops.where_op, None), result_label=label diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index a29dd36c72..0655aafdb3 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -18,6 +18,7 @@ import typing import ibis +import ibis.common.annotations import ibis.common.exceptions import ibis.expr.datatypes as ibis_dtypes import ibis.expr.operations.generic @@ -352,14 +353,23 @@ def _as_ibis(self, x: ibis_types.Value): str_val = typing.cast(ibis_types.StringValue, x) # SQL pad operations will truncate, we do not want to truncate though. - pad_length = ibis.greatest(str_val.length(), self._length) + pad_length = typing.cast( + ibis_types.IntegerValue, ibis.greatest(str_val.length(), self._length) + ) if self._side == "left": return str_val.lpad(pad_length, self._fillchar) elif self._side == "right": return str_val.rpad(pad_length, self._fillchar) else: # side == both # Pad more on right side if can't pad both sides equally - lpad_amount = ((pad_length - str_val.length()) // 2) + str_val.length() + lpad_amount = typing.cast( + ibis_types.IntegerValue, + ( + (pad_length - str_val.length()) + // typing.cast(ibis_types.NumericValue, ibis.literal(2)) + ) + + str_val.length(), + ) return str_val.lpad(lpad_amount, self._fillchar).rpad( pad_length, self._fillchar ) @@ -722,10 +732,29 @@ def ne_op( return x != y +def _null_or_value(value: ibis_types.Value, where_value: ibis_types.BooleanValue): + return ibis.where( + where_value, + value, + ibis.null(), + ) + + def and_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For AND, when we encounter a + # NULL value, we only know when the result is FALSE, otherwise the result + # is unknown (NULL). 
See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(False)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(False)) + return typing.cast(ibis_types.BooleanValue, x) & typing.cast( ibis_types.BooleanValue, y ) @@ -735,6 +764,17 @@ def or_op( x: ibis_types.Value, y: ibis_types.Value, ): + # Workaround issue https://github.com/ibis-project/ibis/issues/7775 by + # implementing three-valued logic ourselves. For OR, when we encounter a + # NULL value, we only know when the result is TRUE, otherwise the result + # is unknown (NULL). See: truth table at + # https://en.wikibooks.org/wiki/Structured_Query_Language/NULLs_and_the_Three_Valued_Logic#AND,_OR + if isinstance(x, ibis_types.NullScalar): + return _null_or_value(y, y == ibis.literal(True)) + + if isinstance(y, ibis_types.NullScalar): + return _null_or_value(x, x == ibis.literal(True)) + return typing.cast(ibis_types.BooleanValue, x) | typing.cast( ibis_types.BooleanValue, y ) @@ -746,10 +786,16 @@ def add_op( y: ibis_types.Value, ): if isinstance(x, ibis_types.NullScalar) or isinstance(x, ibis_types.NullScalar): - return - return typing.cast(ibis_types.NumericValue, x) + typing.cast( - ibis_types.NumericValue, y - ) + return ibis.null() + try: + # Could be string concatenation or numeric addition. + return x + y # type: ignore + except ibis.common.annotations.SignatureValidationError as exc: + left_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(x.type()) + right_type = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(y.type()) + raise TypeError( + f"Cannot add {repr(left_type)} and {repr(right_type)}. {constants.FEEDBACK_LINK}" + ) from exc @short_circuit_nulls() @@ -1047,7 +1093,7 @@ def where_op( replacement: ibis_types.Value, ) -> ibis_types.Value: """Returns x if y is true, otherwise returns z.""" - return ibis.case().when(condition, original).else_(replacement).end() + return ibis.case().when(condition, original).else_(replacement).end() # type: ignore def clip_op( @@ -1060,7 +1106,7 @@ def clip_op( not isinstance(upper, ibis_types.NullScalar) ): return ( - ibis.case() + ibis.case() # type: ignore .when(upper.isnull() | (original > upper), upper) .else_(original) .end() @@ -1069,7 +1115,7 @@ def clip_op( upper, ibis_types.NullScalar ): return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .else_(original) .end() @@ -1079,9 +1125,11 @@ def clip_op( ): return original else: - # Note: Pandas has unchanged behavior when upper bound and lower bound are flipped. This implementation requires that lower_bound < upper_bound + # Note: Pandas has unchanged behavior when upper bound and lower bound + # are flipped. + # This implementation requires that lower_bound < upper_bound. return ( - ibis.case() + ibis.case() # type: ignore .when(lower.isnull() | (original < lower), lower) .when(upper.isnull() | (original > upper), upper) .else_(original) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 465d188724..363dfe819d 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -74,7 +74,7 @@ def _as_ibis( # Will be null if all inputs are null. Pandas defaults to zero sum though. 
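        # _apply_window_if_present wraps the aggregate in an OVER clause when a
        # window spec is supplied; otherwise it stays a plain aggregate.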
bq_sum = _apply_window_if_present(column.sum(), window) return ( - ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() + ibis.case().when(bq_sum.isnull(), ibis_types.literal(0)).else_(bq_sum).end() # type: ignore ) @@ -167,7 +167,7 @@ def _as_ibis( .else_(magnitude * pow(-1, negative_count_parity)) .end() ) - return float_result.cast(column.type()) + return float_result.cast(column.type()) # type: ignore class MaxOp(AggregateOp): @@ -290,7 +290,7 @@ def _as_ibis( dtypes.literal_to_ibis_scalar(bucket_n, force_dtype=Int64Dtype()), ) out = out.else_(None) - return out.end() + return out.end() # type: ignore @property def skips_nulls(self): @@ -482,7 +482,7 @@ def _map_to_literal( original: ibis_types.Value, literal: ibis_types.Scalar ) -> ibis_types.Column: # Hack required to perform aggregations on literals in ibis, even though bigquery will let you directly aggregate literals (eg. 'SELECT COUNT(1) from table1') - return ibis.ifelse(original.isnull(), literal, literal) + return ibis.ifelse(original.isnull(), literal, literal) # type: ignore sum_op = SumOp() diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index a899ebd371..f54c26fa56 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -535,17 +535,14 @@ def remote_function_node( """Creates an Ibis node representing a remote function call.""" fields = { - name: rlz.value(type_) if type_ else rlz.any + name: rlz.ValueOf(None if type_ == "ANY TYPE" else type_) for name, type_ in zip( ibis_signature.parameter_names, ibis_signature.input_types ) } - try: - fields["output_type"] = rlz.shape_like("args", dtype=ibis_signature.output_type) # type: ignore - except TypeError: - fields["output_dtype"] = property(lambda _: ibis_signature.output_type) - fields["output_shape"] = rlz.shape_like("args") + fields["dtype"] = ibis_signature.output_type # type: ignore + fields["shape"] = rlz.shape_like("args") node = type(routine_ref_to_string_for_query(routine_ref), (ops.ValueOp,), fields) # type: ignore diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 5364060d1c..fb5fab86ce 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -79,9 +79,9 @@ import bigframes.session.clients import bigframes.version -# Even though the ibis.backends.bigquery.registry import is unused, it's needed +# Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. 
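# Importing the package rather than just the registry module also pulls in the
# vendored compiler patches introduced by this change.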
-import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery # noqa import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet @@ -873,8 +873,9 @@ def _read_pandas( total_ordering_columns=frozenset([ordering_col]), integer_encoding=IntegerEncoding(True, is_sequential=True), ) - table_expression = self.ibis_client.table( + table_expression = self.ibis_client.table( # type: ignore load_table_destination.table_id, + # TODO: use "dataset_id" as the "schema" database=f"{load_table_destination.project}.{load_table_destination.dataset_id}", ) diff --git a/mypy.ini b/mypy.ini index 901394813a..3809f8e241 100644 --- a/mypy.ini +++ b/mypy.ini @@ -24,5 +24,8 @@ ignore_missing_imports = True [mypy-pyarrow] ignore_missing_imports = True +[mypy-ibis.*] +ignore_missing_imports = True + [mypy-ipywidgets] ignore_missing_imports = True diff --git a/noxfile.py b/noxfile.py index 2174e27529..c0ec3b0c54 100644 --- a/noxfile.py +++ b/noxfile.py @@ -524,23 +524,19 @@ def prerelease(session: nox.sessions.Session, tests_path): ) already_installed.add("pandas") - # TODO(shobs): - # Commit https://github.com/ibis-project/ibis/commit/c20ba7feab6bdea6c299721310e04dbc10551cc2 - # introduced breaking change that removed the following: - # ibis.expr.rules.column - # ibis.expr.rules.value - # ibis.expr.rules.any - # Let's exclude ibis head from prerelease install list for now. Instead, use - # a working ibis-framework version resolved via setup.by (currently resolves - # to version 6.2.0 due to version requirement "6.2.0,<7.0.0dev"). - # We should enable the head back once bigframes support a version that - # includes the above commit. + # Ibis has introduced breaking changes. Let's exclude ibis head + # from prerelease install list for now. We should enable the head back + # once bigframes supports the version at HEAD. # session.install( - # "--upgrade", - # "-e", # Use -e so that py.typed file is included. - # "git+https://github.com/ibis-project/ibis.git#egg=ibis-framework", + # "--upgrade", + # "-e", # Use -e so that py.typed file is included. + # "git+https://github.com/ibis-project/ibis.git@7.x.x#egg=ibis-framework", # ) - session.install("--no-deps", "ibis-framework==6.2.0") + session.install( + "--upgrade", + # "--pre", + "ibis-framework>=7.1.0,<8.0.0dev", + ) already_installed.add("ibis-framework") # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 diff --git a/setup.py b/setup.py index 3351542985..1ad4bbd3eb 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,8 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", + "ibis-framework[bigquery] >=7.1.0,<8.0.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
- "ibis-framework[bigquery] >=6.2.0,<7.0.0dev", "pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", "requests >=2.27.1", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index f43d3b4ca0..218255c77e 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -45,7 +45,7 @@ greenlet==2.0.2 grpc-google-iam-v1==0.12.6 grpcio==1.53.0 grpcio-status==1.48.2 -ibis-framework==6.2.0 +ibis-framework==7.1.0 humanize==4.6.0 identify==2.5.22 idna==3.4 @@ -107,7 +107,7 @@ scikit-learn==1.2.2 SecretStorage==3.3.3 six==1.16.0 SQLAlchemy==1.4.0 -sqlglot==10.6.4 +sqlglot==18.12.0 tomli==2.0.1 toolz==0.12.0 tqdm==4.65.0 diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 58b78e0048..9fe1176068 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -23,11 +23,16 @@ def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): num_bins = 3 ibis_client = session.ibis_client - _, dataset, table_id = scalars_table_id.split(".") - ibis_table: ibis_types.Table = ibis_client.table(table_id, database=dataset) + project, dataset, table_id = scalars_table_id.split(".") + ibis_table: ibis_types.Table = ibis_client.table( # type: ignore + table_id, + schema=dataset, + database=project, + ) ibis_column: ibis_types.NumericColumn = ibis_table["int64_col"] - quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( # type: ignore - ibis_column, num_bins=num_bins + quantiles: ibis_types.ArrayScalar = vendored_ibis_ops.ApproximateMultiQuantile( + ibis_column, # type: ignore + num_bins=num_bins, # type: ignore ).to_expr() value = quantiles[1] num_edges = quantiles.length() diff --git a/tests/unit/resources.py b/tests/unit/resources.py index 8ba321d122..b239b04671 100644 --- a/tests/unit/resources.py +++ b/tests/unit/resources.py @@ -79,7 +79,7 @@ def create_dataframe( # might not actually be used. Mock out the global session, too. monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) bigframes.options.bigquery._session_started = True - return bigframes.dataframe.DataFrame({}, session=session) + return bigframes.dataframe.DataFrame({"col": []}, session=session) def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index e69de29bb2..43508fab11 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -0,0 +1,3 @@ +# Import all sub-modules to monkeypatch everything. 
+import third_party.bigframes_vendored.ibis.backends.bigquery.compiler # noqa +import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py new file mode 100644 index 0000000000..414f0a7c81 --- /dev/null +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/compiler.py @@ -0,0 +1,59 @@ +# Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/compiler.py +"""Module to convert from Ibis expression to SQL string.""" + +from __future__ import annotations + +import re + +from ibis.backends.base.sql import compiler as sql_compiler +import ibis.backends.bigquery.compiler +from ibis.backends.bigquery.datatypes import BigQueryType +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops + +_NAME_REGEX = re.compile(r'[^!"$()*,./;?@[\\\]^`{}~\n]+') +_EXACT_NAME_REGEX = re.compile(f"^{_NAME_REGEX.pattern}$") + + +class BigQueryTableSetFormatter(sql_compiler.TableSetFormatter): + def _quote_identifier(self, name): + """Restore 6.x version of identifier quoting. + + 7.x uses sqlglot which as of December 2023 doesn't know about the + extended unicode names for BigQuery yet. + """ + if _EXACT_NAME_REGEX.match(name) is not None: + return name + return f"`{name}`" + + def _format_in_memory_table(self, op): + """Restore 6.x version of InMemoryTable. + + BigQuery DataFrames explicitly uses InMemoryTable only when we know + the data is small enough to embed in SQL. + """ + schema = op.schema + names = schema.names + types = schema.types + + raw_rows = [] + for row in op.data.to_frame().itertuples(index=False): + raw_row = ", ".join( + f"{self._translate(lit)} AS {name}" + for lit, name in zip( + map(ops.Literal, row, types), map(self._quote_identifier, names) + ) + ) + raw_rows.append(f"STRUCT({raw_row})") + array_type = BigQueryType.from_ibis(dt.Array(op.schema.as_struct())) + + return f"UNNEST({array_type}[{', '.join(raw_rows)}])" + + +# Override implementation. 
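+# Assigning onto the upstream class, rather than subclassing it, means every
+# existing call site picks up the restored 6.x behavior at method lookup time.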
+ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._quote_identifier = ( + BigQueryTableSetFormatter._quote_identifier +) +ibis.backends.bigquery.compiler.BigQueryTableSetFormatter._format_in_memory_table = ( + BigQueryTableSetFormatter._format_in_memory_table +) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py index 038987cac9..3d6a3b37b1 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/analytic.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/analytic.py @@ -2,22 +2,22 @@ from __future__ import annotations -from ibis.expr.operations.analytic import Analytic +import ibis.expr.operations as ops import ibis.expr.rules as rlz -class FirstNonNullValue(Analytic): +class FirstNonNullValue(ops.Analytic): """Retrieve the first element.""" - arg = rlz.column(rlz.any) - output_dtype = rlz.dtype_like("arg") + arg: ops.Column + dtype = rlz.dtype_like("arg") -class LastNonNullValue(Analytic): +class LastNonNullValue(ops.Analytic): """Retrieve the last element.""" - arg = rlz.column(rlz.any) - output_dtype = rlz.dtype_like("arg") + arg: ops.Column + dtype = rlz.dtype_like("arg") __all__ = [ diff --git a/third_party/bigframes_vendored/ibis/expr/operations/json.py b/third_party/bigframes_vendored/ibis/expr/operations/json.py index dbb3fa3066..772c2e8ff4 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/json.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/json.py @@ -6,4 +6,4 @@ class ToJsonString(Unary): - output_dtype = dt.string + dtype = dt.string diff --git a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py index 5e6ad9ecf2..e6644f477a 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/reductions.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/reductions.py @@ -3,8 +3,8 @@ from __future__ import annotations import ibis.expr.datatypes as dt +import ibis.expr.operations.core as ibis_ops_core from ibis.expr.operations.reductions import Filterable, Reduction -import ibis.expr.rules as rlz class ApproximateMultiQuantile(Filterable, Reduction): @@ -13,9 +13,9 @@ class ApproximateMultiQuantile(Filterable, Reduction): See: https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions#approx_quantiles """ - arg = rlz.any - num_bins = rlz.value(dt.int64) - output_dtype = dt.Array(dt.float64) + arg: ibis_ops_core.Value + num_bins: ibis_ops_core.Value[dt.Int64] + dtype = dt.Array(dt.float64) __all__ = [ From 7cbbb7d4608d8b7d1a360b2fe2d39d89a52f9546 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Fri, 15 Dec 2023 15:24:15 -0800 Subject: [PATCH 02/27] docs: add code snippets for explore query result page (#278) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 316614454 🦕 --- samples/snippets/explore_query_result_test.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 samples/snippets/explore_query_result_test.py diff --git a/samples/snippets/explore_query_result_test.py b/samples/snippets/explore_query_result_test.py new file mode 100644 index 0000000000..5f0ec7d9b6 --- /dev/null +++ b/samples/snippets/explore_query_result_test.py @@ -0,0 +1,70 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def test_bigquery_dataframes_explore_query_result(): + import bigframes.pandas as bpd + + # [START bigquery_dataframes_explore_query_result] + # Load data from BigQuery + query_or_table = "bigquery-public-data.ml_datasets.penguins" + bq_df = bpd.read_gbq(query_or_table) + + # Inspect one of the columns (or series) of the DataFrame: + bq_df["body_mass_g"] + + # Compute the mean of this series: + average_body_mass = bq_df["body_mass_g"].mean() + print(f"average_body_mass: {average_body_mass}") + + # Find the heaviest species using the groupby operation to calculate the + # mean body_mass_g: + ( + bq_df["body_mass_g"] + .groupby(by=bq_df["species"]) + .mean() + .sort_values(ascending=False) + .head(10) + ) + + # Create the Linear Regression model + from bigframes.ml.linear_model import LinearRegression + + # Filter down to the data we want to analyze + adelie_data = bq_df[bq_df.species == "Adelie Penguin (Pygoscelis adeliae)"] + + # Drop the columns we don't care about + adelie_data = adelie_data.drop(columns=["species"]) + + # Drop rows with nulls to get our training data + training_data = adelie_data.dropna() + + # Pick feature columns and label column + X = training_data[ + [ + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y = training_data[["body_mass_g"]] + + model = LinearRegression(fit_intercept=False) + model.fit(X, y) + model.score(X, y) + # [END bigquery_dataframes_explore_query_result] + assert average_body_mass is not None + assert model is not None From 02f7ab64bf520f41a0d9a536d4d4880d3a3d401a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Fri, 15 Dec 2023 16:18:18 -0800 Subject: [PATCH 03/27] refactor: move query execution from ArrayValue to Session (#255) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/__init__.py | 110 ++++------------------------- bigframes/core/blocks.py | 37 +++++++--- bigframes/core/compile/compiled.py | 16 +++++ bigframes/core/compile/compiler.py | 6 ++ bigframes/core/indexes/index.py | 6 +- bigframes/core/nodes.py | 8 ++- bigframes/dataframe.py | 3 +- bigframes/session/__init__.py | 76 ++++++++++++++++++++ tests/system/small/test_session.py | 4 +- 9 files changed, 155 insertions(+), 111 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index e19fec8f3f..e8ac8c1d0f 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -16,10 +16,8 @@ from dataclasses import dataclass import io import typing -from typing import Iterable, Literal, Optional, Sequence, Tuple +from typing import Iterable, Literal, Sequence -from google.cloud import bigquery -import ibis import ibis.expr.types as ibis_types import pandas @@ -86,7 +84,17 @@ def session(self) -> Session: required_session = self.node.session from bigframes import get_global_session - return self.node.session[0] if required_session else get_global_session() + return ( + required_session if (required_session is not None) else get_global_session() + ) + + def _try_evaluate_local(self): + """Use only for unit testing paths - not fully featured. Will throw exception if fails.""" + import ibis + + return ibis.pandas.connect({}).execute( + self._compile_ordered()._to_ibis_expr(ordering_mode="unordered") + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: return self._compile_ordered().get_column_type(key) @@ -97,97 +105,9 @@ def _compile_ordered(self) -> compiled.OrderedIR: def _compile_unordered(self) -> compiled.UnorderedIR: return compiler.compile_unordered(self.node) - def shape(self) -> typing.Tuple[int, int]: - """Returns dimensions as (length, width) tuple.""" - width = len(self._compile_unordered().columns) - count_expr = self._compile_unordered()._to_ibis_expr().count() - - # Support in-memory engines for hermetic unit tests. - if not self.node.session: - try: - length = ibis.pandas.connect({}).execute(count_expr) - return (length, width) - except Exception: - # Not all cases can be handled by pandas engine - pass - - sql = self.session.ibis_client.compile(count_expr) - row_iterator, _ = self.session._start_query( - sql=sql, - max_results=1, - ) - length = next(row_iterator)[0] - return (length, width) - - def to_sql( - self, - offset_column: typing.Optional[str] = None, - col_id_overrides: typing.Mapping[str, str] = {}, - sorted: bool = False, - ) -> str: - array_value = self - if offset_column: - array_value = self.promote_offsets(offset_column) - if sorted: - return array_value._compile_ordered().to_sql( - col_id_overrides=col_id_overrides, - sorted=sorted, - ) - else: - return array_value._compile_unordered().to_sql( - col_id_overrides=col_id_overrides - ) - - def start_query( - self, - job_config: Optional[bigquery.job.QueryJobConfig] = None, - max_results: Optional[int] = None, - *, - sorted: bool = True, - ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - """Execute a query and return metadata about the results.""" - # TODO(swast): Cache the job ID so we can look it up again if they ask - # for the results? 
We'd need a way to invalidate the cache if DataFrame - # becomes mutable, though. Or move this method to the immutable - # expression class. - # TODO(swast): We might want to move this method to Session and/or - # provide our own minimal metadata class. Tight coupling to the - # BigQuery client library isn't ideal, especially if we want to support - # a LocalSession for unit testing. - # TODO(swast): Add a timeout here? If the query is taking a long time, - # maybe we just print the job metadata that we have so far? - sql = self.to_sql(sorted=sorted) # type:ignore - return self.session._start_query( - sql=sql, - job_config=job_config, - max_results=max_results, - ) - - def cached(self, cluster_cols: typing.Sequence[str]) -> ArrayValue: - """Write the ArrayValue to a session table and create a new block object that references it.""" - compiled_value = self._compile_ordered() - ibis_expr = compiled_value._to_ibis_expr( - ordering_mode="unordered", expose_hidden_cols=True - ) - tmp_table = self.session._ibis_to_temp_table( - ibis_expr, cluster_cols=cluster_cols, api_name="cached" - ) - - table_expression = self.session.ibis_client.table( - f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" - ) - new_columns = [table_expression[column] for column in compiled_value.column_ids] - new_hidden_columns = [ - table_expression[column] - for column in compiled_value._hidden_ordering_column_names - ] - return ArrayValue.from_ibis( - self.session, - table_expression, - columns=new_columns, - hidden_ordering_columns=new_hidden_columns, - ordering=compiled_value._ordering, - ) + def row_count(self) -> ArrayValue: + """Get number of rows in ArrayValue as a single-entry ArrayValue.""" + return ArrayValue(nodes.RowCountNode(child=self.node)) # Operations diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 34913872e7..6542b694d2 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -137,8 +137,19 @@ def index(self) -> indexes.IndexValue: @functools.cached_property def shape(self) -> typing.Tuple[int, int]: """Returns dimensions as (length, width) tuple.""" - impl_length, _ = self._expr.shape() - return (impl_length, len(self.value_columns)) + row_count_expr = self.expr.row_count() + + # Support in-memory engines for hermetic unit tests. 
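+        # A node without a session was built purely from local data, so the
+        # row count can be computed locally instead of issuing a BigQuery job.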
+ if self.expr.node.session is None: + try: + row_count = row_count_expr._try_evaluate_local().squeeze() + return (row_count, len(self.value_columns)) + except Exception: + pass + + iter, _ = self.session._execute(row_count_expr, sorted=False) + row_count = next(iter)[0] + return (row_count, len(self.value_columns)) @property def index_columns(self) -> Sequence[str]: @@ -182,6 +193,10 @@ def index_dtypes( """Returns the dtypes of the index columns.""" return [self.expr.get_column_type(col) for col in self.index_columns] + @property + def session(self) -> core.Session: + return self._expr.session + @functools.cached_property def col_id_to_label(self) -> typing.Mapping[str, Label]: """Get column label for value columns, or index name for index columns""" @@ -376,7 +391,7 @@ def _to_dataframe(self, result) -> pd.DataFrame: """Convert BigQuery data to pandas DataFrame with specific dtypes.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - return self._expr.session._rows_to_dataframe(result, dtypes) + return self.session._rows_to_dataframe(result, dtypes) def to_pandas( self, @@ -404,9 +419,9 @@ def to_pandas_batches(self): """Download results one message at a time.""" dtypes = dict(zip(self.index_columns, self.index_dtypes)) dtypes.update(zip(self.value_columns, self.dtypes)) - results_iterator, _ = self._expr.start_query() + results_iterator, _ = self.session._execute(self.expr, sorted=True) for arrow_table in results_iterator.to_arrow_iterable( - bqstorage_client=self._expr.session.bqstoragereadclient + bqstorage_client=self.session.bqstoragereadclient ): df = bigframes.session._io.pandas.arrow_to_pandas(arrow_table, dtypes) self._copy_index_to_pandas(df) @@ -460,12 +475,12 @@ def _compute_and_count( expr = self._apply_value_keys_to_expr(value_keys=value_keys) - results_iterator, query_job = expr.start_query( - max_results=max_results, sorted=ordered + results_iterator, query_job = self.session._execute( + expr, max_results=max_results, sorted=ordered ) table_size = ( - expr.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES + self.session._get_table_size(query_job.destination) / _BYTES_TO_MEGABYTES ) fraction = ( max_download_size / table_size @@ -607,7 +622,7 @@ def _compute_dry_run( ) -> bigquery.QueryJob: expr = self._apply_value_keys_to_expr(value_keys=value_keys) job_config = bigquery.QueryJobConfig(dry_run=True) - _, query_job = expr.start_query(job_config=job_config) + _, query_job = self.session._execute(expr, job_config=job_config, dry_run=True) return query_job def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): @@ -1668,7 +1683,7 @@ def to_sql_query( # the BigQuery unicode column name feature? 
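            # Map each internal column id to the user-visible name it should
            # carry in the generated SQL.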
substitutions[old_id] = new_id - sql = array_value.to_sql(col_id_overrides=substitutions) + sql = self.session._to_sql(array_value, col_id_overrides=substitutions) return ( sql, new_ids[: len(idx_labels)], @@ -1678,7 +1693,7 @@ def to_sql_query( def cached(self) -> Block: """Write the block to a session table and create a new block object that references it.""" return Block( - self.expr.cached(cluster_cols=self.index_columns), + self.session._execute_and_cache(self.expr, cluster_cols=self.index_columns), index_columns=self.index_columns, column_labels=self.column_labels, index_labels=self.index_labels, diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 537d9c8b52..d6183228d1 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -268,6 +268,22 @@ def to_sql( ) return typing.cast(str, sql) + def row_count(self) -> OrderedIR: + original_table = self._to_ibis_expr() + ibis_table = original_table.agg( + [ + original_table.count().name("count"), + ] + ) + return OrderedIR( + ibis_table, + (ibis_table["count"],), + ordering=ExpressionOrdering( + ordering_value_columns=(OrderingColumnReference("count"),), + total_ordering_columns=frozenset(["count"]), + ), + ) + def _to_ibis_expr( self, *, diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index 39892635f1..17dcde638f 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -173,6 +173,12 @@ def compile_concat(node: nodes.ConcatNode, ordered: bool = True): return concat_impl.concat_unordered(compiled_unordered) +@_compile_node.register +def compile_rowcount(node: nodes.RowCountNode, ordered: bool = True): + result = compile_unordered(node.child).row_count() + return result if ordered else result.to_unordered() + + @_compile_node.register def compile_aggregate(node: nodes.AggregateNode, ordered: bool = True): result = compile_unordered(node.child).aggregate( diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index fc7cf167d4..6fc284403d 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -396,6 +396,10 @@ def dtypes( ) -> typing.Sequence[typing.Union[bf_dtypes.Dtype, np.dtype[typing.Any]]]: return self._block.index_dtypes + @property + def session(self) -> core.Session: + return self._expr.session + def __repr__(self) -> str: """Converts an Index to a string.""" # TODO(swast): Add a timeout here? 
If the query is taking a long time, @@ -411,7 +415,7 @@ def to_pandas(self) -> pandas.Index: index_columns = list(self._block.index_columns) dtypes = dict(zip(index_columns, self.dtypes)) expr = self._expr.select_columns(index_columns) - results, _ = expr.start_query() + results, _ = self.session._execute(expr) df = expr.session._rows_to_dataframe(results, dtypes) df = df.set_index(index_columns) index = df.index diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 82a869dac2..30444f5565 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -139,7 +139,7 @@ class ReadGbqNode(BigFrameNode): @property def session(self): - return (self.table_session,) + return self.table_session def __hash__(self): return self._node_hash @@ -229,6 +229,12 @@ def __hash__(self): return self._node_hash +# TODO: Merge RowCount and Corr into Aggregate Node +@dataclass(frozen=True) +class RowCountNode(UnaryNode): + pass + + @dataclass(frozen=True) class AggregateNode(UnaryNode): aggregations: typing.Tuple[typing.Tuple[str, agg_ops.AggregateOp, str], ...] diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 3b0fd7008a..d777784f64 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2701,7 +2701,8 @@ def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: if ordering_id is not None: array_value = array_value.promote_offsets(ordering_id) - return array_value.to_sql( + return self._block.session._to_sql( + array_value=array_value, col_id_overrides=id_overrides, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index fb5fab86ce..a57f7b94c5 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,6 +67,7 @@ from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks +import bigframes.core.compile import bigframes.core.guid as guid from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference import bigframes.core.ordering as orderings @@ -1437,6 +1438,81 @@ def _start_query( results_iterator = query_job.result(max_results=max_results) return results_iterator, query_job + def _execute_and_cache( + self, array_value: core.ArrayValue, cluster_cols: typing.Sequence[str] + ) -> core.ArrayValue: + """Executes the query and uses the resulting table to rewrite future executions.""" + # TODO: Use this for all executions? 
Problem is that caching materializes extra + # ordering columns + compiled_value = self._compile_ordered(array_value) + + ibis_expr = compiled_value._to_ibis_expr( + ordering_mode="unordered", expose_hidden_cols=True + ) + tmp_table = self._ibis_to_temp_table( + ibis_expr, cluster_cols=cluster_cols, api_name="cached" + ) + table_expression = self.ibis_client.table( + f"{tmp_table.project}.{tmp_table.dataset_id}.{tmp_table.table_id}" + ) + new_columns = [table_expression[column] for column in compiled_value.column_ids] + new_hidden_columns = [ + table_expression[column] + for column in compiled_value._hidden_ordering_column_names + ] + # TODO: Instead, keep session-wide map of cached results and automatically reuse + return core.ArrayValue.from_ibis( + self, + table_expression, + columns=new_columns, + hidden_ordering_columns=new_hidden_columns, + ordering=compiled_value._ordering, + ) + + def _execute( + self, + array_value: core.ArrayValue, + job_config: Optional[bigquery.job.QueryJobConfig] = None, + max_results: Optional[int] = None, + *, + sorted: bool = True, + dry_run=False, + ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + sql = self._to_sql(array_value, sorted=sorted) # type:ignore + job_config = bigquery.QueryJobConfig(dry_run=dry_run) + return self._start_query( + sql=sql, + job_config=job_config, + max_results=max_results, + ) + + def _to_sql( + self, + array_value: core.ArrayValue, + offset_column: typing.Optional[str] = None, + col_id_overrides: typing.Mapping[str, str] = {}, + sorted: bool = False, + ) -> str: + if offset_column: + array_value = array_value.promote_offsets(offset_column) + if sorted: + return self._compile_ordered(array_value).to_sql( + col_id_overrides=col_id_overrides, sorted=True + ) + return self._compile_unordered(array_value).to_sql( + col_id_overrides=col_id_overrides + ) + + def _compile_ordered( + self, array_value: core.ArrayValue + ) -> bigframes.core.compile.OrderedIR: + return bigframes.core.compile.compile_ordered(array_value.node) + + def _compile_unordered( + self, array_value: core.ArrayValue + ) -> bigframes.core.compile.UnorderedIR: + return bigframes.core.compile.compile_unordered(array_value.node) + def _get_table_size(self, destination_table): table = self.bqclient.get_table(destination_table) return table.num_bytes diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 26c5093b35..e6eb40a5fa 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -44,7 +44,7 @@ def test_read_gbq_tokyo( result = df.sort_index().to_pandas() expected = scalars_pandas_df_index - _, query_job = df._block.expr.start_query() + _, query_job = session_tokyo._execute(df._block.expr) assert query_job.location == tokyo_location pd.testing.assert_frame_equal(result, expected) @@ -379,7 +379,7 @@ def test_read_pandas_tokyo( result = df.to_pandas() expected = scalars_pandas_df_index - _, query_job = df._block.expr.start_query() + _, query_job = session_tokyo._execute(df._block.expr) assert query_job.location == tokyo_location pd.testing.assert_frame_equal(result, expected) From 6c1969a35fe720cf3a804006bcc9046ba554fcc3 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:34:37 -0800 Subject: [PATCH 04/27] feat: Add IntervalIndex support to bigframes.pandas.cut (#254) * feature: Add IntervalIndex support to bigframes.pandas.cut * add bins <= 0 error in CutOp * add type ignore * add type ignore to session --------- 
Co-authored-by: Shobhit Singh --- bigframes/core/reshape/__init__.py | 20 +++++-- bigframes/operations/aggregations.py | 41 ++++++++++----- bigframes/series.py | 2 +- bigframes/session/__init__.py | 4 +- tests/system/small/test_pandas.py | 34 ++++++++++++ .../pandas/core/reshape/tile.py | 52 +++++++++++++++---- 6 files changed, 124 insertions(+), 29 deletions(-) diff --git a/bigframes/core/reshape/__init__.py b/bigframes/core/reshape/__init__.py index 24c1bff309..d9cc99a036 100644 --- a/bigframes/core/reshape/__init__.py +++ b/bigframes/core/reshape/__init__.py @@ -14,7 +14,9 @@ from __future__ import annotations import typing -from typing import Iterable, Literal, Optional, Union +from typing import Iterable, Literal, Optional, Tuple, Union + +import pandas as pd import bigframes.constants as constants import bigframes.core as core @@ -108,17 +110,29 @@ def concat( def cut( x: bigframes.series.Series, - bins: int, + bins: Union[ + int, + pd.IntervalIndex, + Iterable[Tuple[Union[int, float], Union[int, float]]], + ], *, labels: Optional[bool] = None, ) -> bigframes.series.Series: - if bins <= 0: + if isinstance(bins, int) and bins <= 0: raise ValueError("`bins` should be a positive integer.") + if isinstance(bins, Iterable): + if not isinstance(bins, pd.IntervalIndex): + bins = pd.IntervalIndex.from_tuples(list(bins)) + + if bins.is_overlapping: + raise ValueError("Overlapping IntervalIndex is not accepted.") + if labels is not False: raise NotImplementedError( f"Only labels=False is supported in BigQuery DataFrames so far. {constants.FEEDBACK_LINK}" ) + return x._apply_window_op(agg_ops.CutOp(bins), window_spec=core.WindowSpec()) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 363dfe819d..8178ebfaea 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -20,6 +20,7 @@ import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types from pandas import Int64Dtype +import pandas as pd import bigframes.constants as constants import bigframes.dtypes as dtypes @@ -228,21 +229,37 @@ def skips_nulls(self): class CutOp(WindowOp): - def __init__(self, bins: int): - self._bins_ibis = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype()) - self._bins_int = bins + def __init__(self, bins: typing.Union[int, pd.IntervalIndex]): + if isinstance(bins, int): + if not bins > 0: + raise ValueError("`bins` should be a positive integer.") + self._bins_int = bins + self._bins = dtypes.literal_to_ibis_scalar(bins, force_dtype=Int64Dtype()) + else: + self._bins_int = 0 + self._bins = bins def _as_ibis(self, x: ibis_types.Column, window=None): - col_min = _apply_window_if_present(x.min(), window) - col_max = _apply_window_if_present(x.max(), window) - bin_width = (col_max - col_min) / self._bins_ibis out = ibis.case() - for this_bin in range(self._bins_int - 1): - out = out.when( - x <= (col_min + (this_bin + 1) * bin_width), - dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), - ) - out = out.when(x.notnull(), self._bins_ibis - 1) + + if self._bins_int > 0: + col_min = _apply_window_if_present(x.min(), window) + col_max = _apply_window_if_present(x.max(), window) + bin_width = (col_max - col_min) / self._bins + + for this_bin in range(self._bins_int - 1): + out = out.when( + x <= (col_min + (this_bin + 1) * bin_width), + dtypes.literal_to_ibis_scalar(this_bin, force_dtype=Int64Dtype()), + ) + out = out.when(x.notnull(), self._bins - 1) + else: + for interval in self._bins: + condition = (x 
> interval.left) & (x <= interval.right) + interval_struct = ibis.struct( + {"left_exclusive": interval.left, "right_inclusive": interval.right} + ) + out = out.when(condition, interval_struct) return out.end() @property diff --git a/bigframes/series.py b/bigframes/series.py index c929775a00..8d8c711c92 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1324,7 +1324,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]: return self.to_pandas().to_csv(path_or_buf, **kwargs) def to_dict(self, into: type[dict] = dict) -> typing.Mapping: - return typing.cast(dict, self.to_pandas().to_dict(into)) + return typing.cast(dict, self.to_pandas().to_dict(into)) # type: ignore def to_excel(self, excel_writer, sheet_name="Sheet1", **kwargs) -> None: return self.to_pandas().to_excel(excel_writer, sheet_name, **kwargs) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index a57f7b94c5..fbe900106a 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1017,13 +1017,13 @@ def read_csv( header=header, names=names, index_col=index_col, - usecols=usecols, + usecols=usecols, # type: ignore dtype=dtype, engine=engine, encoding=encoding, **kwargs, ) - return self.read_pandas(pandas_df) + return self.read_pandas(pandas_df) # type: ignore def read_pickle( self, diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index a1079288cf..282c0d68eb 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -365,6 +365,40 @@ def test_cut(scalars_dfs): pd.testing.assert_series_equal(bf_result, pd_result) +@pytest.mark.parametrize( + ("bins",), + [ + ([(-5, 2), (2, 3), (-3000, -10)],), + (pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (4, 5)]),), + ], +) +def test_cut_with_interval(scalars_dfs, bins): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = bpd.cut(scalars_df["int64_too"], bins, labels=False).to_pandas() + + if isinstance(bins, list): + bins = pd.IntervalIndex.from_tuples(bins) + pd_result = pd.cut(scalars_pandas_df["int64_too"], bins, labels=False) + + # Convert to match data format + pd_result_converted = pd.Series( + [ + {"left_exclusive": interval.left, "right_inclusive": interval.right} + if pd.notna(val) + else pd.NA + for val, interval in zip( + pd_result, pd_result.cat.categories[pd_result.cat.codes] + ) + ], + name=pd_result.name, + ) + pd_result.index = pd_result.index.astype("Int64") + + pd.testing.assert_series_equal( + bf_result, pd_result_converted, check_index=False, check_dtype=False + ) + + @pytest.mark.parametrize( ("q",), [ diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index d4471ed68e..55975c3fc1 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -24,31 +24,61 @@ def cut( ``labels=False`` implies you just want the bins back. - Examples: - - .. 
code-block:: - - import bigframes.pandas as pd + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + >>> s = bpd.Series([0, 1, 5, 10]) + >>> s + 0 0 + 1 1 + 2 5 + 3 10 + dtype: Int64 - pd.options.display.progress_bar = None - s = pd.Series([0, 1, 1, 2]) - pd.cut(s, bins=4, labels=False) + Cut with an integer (equal-width bins): + >>> bpd.cut(s, bins=4, labels=False) 0 0 - 1 1 + 1 0 2 1 3 3 dtype: Int64 + Cut with pd.IntervalIndex, requires importing pandas for IntervalIndex: + + >>> import pandas as pd + + >>> interval_index = pd.IntervalIndex.from_tuples([(0, 1), (1, 5), (5, 20)]) + >>> bpd.cut(s, bins=interval_index, labels=False) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 {'left_exclusive': 1, 'right_inclusive': 5} + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + + Cut with an iterable of tuples: + + >>> bins_tuples = [(0, 1), (1, 4), (5, 20)] + >>> bpd.cut(s, bins=bins_tuples, labels=False) + 0 + 1 {'left_exclusive': 0, 'right_inclusive': 1} + 2 + 3 {'left_exclusive': 5, 'right_inclusive': 20} + dtype: struct[pyarrow] + Args: x (Series): The input Series to be binned. Must be 1-dimensional. - bins (int): + bins (int, pd.IntervalIndex, Iterable[Tuple[Union[int, float], Union[int, float]]]): The criteria to bin by. - int : Defines the number of equal-width bins in the range of `x`. The + int: Defines the number of equal-width bins in the range of `x`. The range of `x` is extended by .1% on each side to include the minimum and maximum values of `x`. + + pd.IntervalIndex or Iterable of tuples: Defines the exact bins to be used. + It's important to ensure that these bins are non-overlapping. labels (None): Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the From bac62f76af1af6ca8834c3690c7c79aeb12dd331 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Mon, 18 Dec 2023 14:47:39 -0800 Subject: [PATCH 05/27] fix: dataframes to_gbq now creates dataset if it doesn't exist (#222) * "fix: dataframes to_gbq now creates dataset if it doesn't exist * fix: dataframes to_gbq now creates dataset if it doesn't exist * fix: dataframes to_gbq now creates dataset if it doesn't exist * update test * update create dataset method. * fix --------- Co-authored-by: Shobhit Singh --- bigframes/dataframe.py | 17 +++++++++++++++-- tests/system/small/test_dataframe.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d777784f64..1251e64fb0 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -34,6 +34,7 @@ Union, ) +import google.api_core.exceptions import google.cloud.bigquery as bigquery import numpy import pandas @@ -2508,7 +2509,14 @@ def to_gbq( ) if_exists = "replace" - if "." not in destination_table: + table_parts = destination_table.split(".") + default_project = self._block.expr.session.bqclient.project + + if len(table_parts) == 2: + destination_dataset = f"{default_project}.{table_parts[0]}" + elif len(table_parts) == 3: + destination_dataset = f"{table_parts[0]}.{table_parts[1]}" + else: raise ValueError( f"Got invalid value for destination_table {repr(destination_table)}. " "Should be of the form 'datasetId.tableId' or 'projectId.datasetId.tableId'." @@ -2523,11 +2531,16 @@ def to_gbq( f"Valid options include None or one of {dispositions.keys()}." 
) + try: + self._session.bqclient.get_dataset(destination_dataset) + except google.api_core.exceptions.NotFound: + self._session.bqclient.create_dataset(destination_dataset, exists_ok=True) + job_config = bigquery.QueryJobConfig( write_disposition=dispositions[if_exists], destination=bigquery.table.TableReference.from_string( destination_table, - default_project=self._block.expr.session.bqclient.project, + default_project=default_project, ), ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 663a7ceb49..ab68543d91 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3683,3 +3683,18 @@ def test_to_pandas_downsampling_option_override(session): total_memory_bytes = df.memory_usage(deep=True).sum() total_memory_mb = total_memory_bytes / (1024 * 1024) assert total_memory_mb == pytest.approx(download_size, rel=0.3) + + +def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): + dataset_id = dataset_id_not_created + destination_table = f"{dataset_id}.scalars_df" + + result_table = scalars_df_index.to_gbq(destination_table) + assert ( + result_table == destination_table + if destination_table + else result_table is not None + ) + + loaded_scalars_df_index = session.read_gbq(result_table) + assert not loaded_scalars_df_index.empty From dab2f2cb8720ec68413a71bddaeadc1ec40b6541 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 19 Dec 2023 08:27:16 -0600 Subject: [PATCH 06/27] chore: use latest pip in nightly builds (#281) * chore: use latest pip in nightly builds * restrict ibis version * update prerelease tests too --- .kokoro/release-nightly.sh | 3 +++ noxfile.py | 2 +- setup.py | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index 0751cf2502..5624df3b8d 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -55,6 +55,9 @@ rm -rf build dist # internal issue b/261050975. git config --global --add safe.directory "${PROJECT_ROOT}" +# Workaround for older pip not able to resolve dependencies. See internal +# issue 316909553. +python3.10 -m pip install pip==23.3.2 python3.10 -m pip install --require-hashes -r .kokoro/requirements.txt # Disable buffering, so that the logs stream through. diff --git a/noxfile.py b/noxfile.py index c0ec3b0c54..c4bbd7a65a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -535,7 +535,7 @@ def prerelease(session: nox.sessions.Session, tests_path): session.install( "--upgrade", # "--pre", - "ibis-framework>=7.1.0,<8.0.0dev", + "ibis-framework>=7.1.0,<7.2.0dev", ) already_installed.add("ibis-framework") diff --git a/setup.py b/setup.py index 1ad4bbd3eb..9aaaaae04f 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,8 @@ "google-cloud-iam >=2.12.1", "google-cloud-resource-manager >=1.10.3", "google-cloud-storage >=2.0.0", - "ibis-framework[bigquery] >=7.1.0,<8.0.0dev", + # TODO: Relax upper bound once we have fixed unit tests with 7.2.0. + "ibis-framework[bigquery] >=7.1.0,<7.2.0dev", # TODO: Relax upper bound once we have fixed `system_prerelease` tests. 
"pandas >=1.5.0,<2.1.4", "pydata-google-auth >=1.8.2", From 5092215767d77c90b132e9cd6b3e3749827ebe09 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 19 Dec 2023 11:47:25 -0800 Subject: [PATCH 07/27] feat: add replace method to DataFrame (#261) * feat: add replace method to DataFrame * remove unwanted change to describe method * better docs * is_patype docstring * docstring fix * mypy fix --- bigframes/dataframe.py | 15 ++++ bigframes/dtypes.py | 48 ++++++++++ bigframes/operations/__init__.py | 14 +++ bigframes/series.py | 87 +++++++++++------- tests/system/small/test_dataframe.py | 44 ++++++++++ .../bigframes_vendored/pandas/core/frame.py | 88 +++++++++++++++++++ 6 files changed, 265 insertions(+), 31 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1251e64fb0..1d8169960b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1562,6 +1562,21 @@ def interpolate(self, method: str = "linear") -> DataFrame: def fillna(self, value=None) -> DataFrame: return self._apply_binop(value, ops.fillna_op, how="left") + def replace( + self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False + ): + if utils.is_dict_like(value): + return self.apply( + lambda x: x.replace( + to_replace=to_replace, value=value[x.name], regex=regex + ) + if (x.name in value) + else x + ) + return self.apply( + lambda x: x.replace(to_replace=to_replace, value=value, regex=regex) + ) + def ffill(self, *, limit: typing.Optional[int] = None) -> DataFrame: window = bigframes.core.WindowSpec(preceding=limit, following=0) return self._apply_window_op(agg_ops.LastNonNullOp(), window) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 774eb74d06..6dfcc17f37 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -14,6 +14,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" +import datetime import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -437,3 +438,50 @@ def to_pandas_dtypes_overrides(schema: Iterable[bigquery.SchemaField]) -> Dict: gcb3p_pandas_helpers.bq_to_arrow_data_type(field) ) return dtypes + + +def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: + """Captures whether a scalar can be losslessly represented by a dtype.""" + if scalar is None: + return True + if pd.api.types.is_bool_dtype(dtype): + return pd.api.types.is_bool(scalar) + if pd.api.types.is_float_dtype(dtype): + return pd.api.types.is_float(scalar) + if pd.api.types.is_integer_dtype(dtype): + return pd.api.types.is_integer(scalar) + if isinstance(dtype, pd.StringDtype): + return isinstance(scalar, str) + if isinstance(dtype, pd.ArrowDtype): + pa_type = dtype.pyarrow_dtype + return is_patype(scalar, pa_type) + return False + + +def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool: + """Determine whether a scalar's type matches a given pyarrow type.""" + if pa_type == pa.time64("us"): + return isinstance(scalar, datetime.time) + if pa_type == pa.timestamp("us"): + if isinstance(scalar, datetime.datetime): + return not scalar.tzinfo + if isinstance(scalar, pd.Timestamp): + return not scalar.tzinfo + if pa_type == pa.timestamp("us", tz="UTC"): + if isinstance(scalar, datetime.datetime): + return scalar.tzinfo == datetime.timezone.utc + if isinstance(scalar, pd.Timestamp): + return scalar.tzinfo == datetime.timezone.utc + if pa_type == pa.date32(): + return isinstance(scalar, datetime.date) + return False + + +def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool: + """Whether scalar can 
be compare to items of dtype (though maybe requiring coercion)""" + if is_dtype(scalar, dtype): + return True + elif pd.api.types.is_numeric_dtype(dtype): + return pd.api.types.is_number(scalar) + else: + return False diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 0655aafdb3..753870a42d 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -523,6 +523,20 @@ def _as_ibis(self, x: ibis_types.Value): return bigframes.dtypes.cast_ibis_value(x, self.to_type) +class MapOp(UnaryOp): + def __init__( + self, + mappings: typing.Tuple[typing.Tuple[typing.Hashable, typing.Hashable], ...], + ): + self._mappings = mappings + + def _as_ibis(self, x: ibis_types.Value): + case = ibis.case() + for mapping in self._mappings: + case = case.when(x == mapping[0], mapping[1]) + return case.else_(x).end() + + class FindOp(UnaryOp): def __init__(self, sub, start, end): self._sub = sub diff --git a/bigframes/series.py b/bigframes/series.py index 8d8c711c92..1b9982877a 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -442,42 +442,67 @@ def replace( self, to_replace: typing.Any, value: typing.Any = None, *, regex: bool = False ): if regex: - if not (isinstance(to_replace, str) and isinstance(value, str)): - raise NotImplementedError( - f"replace regex mode only supports strings for 'to_replace' and 'value'. {constants.FEEDBACK_LINK}" - ) - block, result_col = self._block.apply_unary_op( - self._value_column, - ops.ReplaceRegexOp(to_replace, value), - result_label=self.name, - ) - return Series(block.select_column(result_col)) + # No-op unless to_replace and series dtype are both string type + if not isinstance(to_replace, str) or not isinstance( + self.dtype, pandas.StringDtype + ): + return self + return self._regex_replace(to_replace, value) elif utils.is_dict_like(to_replace): - raise NotImplementedError( - f"Dict 'to_replace' not supported. {constants.FEEDBACK_LINK}" - ) + return self._mapping_replace(to_replace) # type: ignore elif utils.is_list_like(to_replace): - block, cond = self._block.apply_unary_op( - self._value_column, ops.IsInOp(to_replace) - ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, - ) - return Series(block.select_column(result_col)) + replace_list = to_replace else: # Scalar - block, cond = self._block.apply_unary_op( - self._value_column, ops.BinopPartialLeft(ops.eq_op, to_replace) + replace_list = [to_replace] + replace_list = [ + i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype) + ] + return self._simple_replace(replace_list, value) if replace_list else self + + def _regex_replace(self, to_replace: str, value: str): + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. 
{constants.FEEDBACK_LINK}" ) - block, result_col = block.apply_binary_op( - cond, - self._value_column, - ops.partial_arg1(ops.where_op, value), - result_label=self.name, + block, result_col = self._block.apply_unary_op( + self._value_column, + ops.ReplaceRegexOp(to_replace, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + + def _simple_replace(self, to_replace_list: typing.Sequence, value): + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" ) - return Series(block.select_column(result_col)) + + block, cond = self._block.apply_unary_op( + self._value_column, ops.IsInOp(to_replace_list) + ) + block, result_col = block.apply_binary_op( + cond, + self._value_column, + ops.partial_arg1(ops.where_op, value), + result_label=self.name, + ) + return Series(block.select_column(result_col)) + + def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]): + tuples = [] + for key, value in mapping.items(): + if not bigframes.dtypes.is_comparable(key, self.dtype): + continue + if not bigframes.dtypes.is_dtype(value, self.dtype): + raise NotImplementedError( + f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}" + ) + tuples.append((key, value)) + + block, result = self._block.apply_unary_op( + self._value_column, ops.MapOp(tuple(tuples)) + ) + return Series(block.select_column(result)) def interpolate(self, method: str = "linear") -> Series: if method == "pad": diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ab68543d91..ed78e73e5d 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -839,6 +839,50 @@ def test_df_fillna(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() + pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_regex_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() + pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() + pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_value_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() + pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + def test_df_ffill(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py 
b/third_party/bigframes_vendored/pandas/core/frame.py index c082b87336..00be9e5e9e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4356,6 +4356,94 @@ def fillna(self, value): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def replace( + self, + to_replace, + value=None, + *, + regex=False, + ): + """ + Replace values given in `to_replace` with `value`. + + Values of the Series/DataFrame are replaced with other values dynamically. + This differs from updating with ``.loc`` or ``.iloc``, which require + you to specify a location to update with some value. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({ + ... 'int_col': [1, 1, 2, 3], + ... 'string_col': ["a", "b", "c", "b"], + ... }) + + Using scalar `to_replace` and `value`: + + >>> df.replace("b", "e") + int_col string_col + 0 1 a + 1 1 e + 2 2 c + 3 3 e + + [4 rows x 2 columns] + + Using dictionary: + + >>> df.replace({"a": "e", 2: 5}) + int_col string_col + 0 1 e + 1 1 b + 2 5 c + 3 3 b + + [4 rows x 2 columns] + + Using regex: + + >>> df.replace("[ab]", "e", regex=True) + int_col string_col + 0 1 e + 1 1 e + 2 2 c + 3 3 e + + [4 rows x 2 columns] + + + Args: + to_replace (str, regex, list, int, float or None): + How to find the values that will be replaced. + numeric: numeric values equal to `to_replace` will be replaced with `value` + str: string exactly matching `to_replace` will be replaced with `value` + regex: regexs matching `to_replace` will be replaced with`value` + list of str, regex, or numeric: + First, if `to_replace` and `value` are both lists, they **must** be the same length. + Second, if ``regex=True`` then all of the strings in **both** + lists will be interpreted as regexs otherwise they will match + directly. This doesn't matter much for `value` since there + are only a few possible substitution regexes you can use. + str, regex and numeric rules apply as above. + + value (scalar, default None): + Value to replace any values matching `to_replace` with. + For a DataFrame a dict of values can be used to specify which + value to use for each column (columns not in the dict will not be + filled). Regular expressions, strings and lists or dicts of such + objects are also allowed. + regex (bool, default False): + Whether to interpret `to_replace` and/or `value` as regular + expressions. If this is ``True`` then `to_replace` *must* be a + string. + + Returns: + Series/DataFrame: Object after replacement. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property def iloc(self): """Purely integer-location based indexing for selection by position.""" From ab493506e71ed8970a11fe2f88b2145150e09291 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 19 Dec 2023 12:32:15 -0800 Subject: [PATCH 08/27] fix: fix DataFrameGroupby.agg() issue with as_index=False (#273) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #271 🦕 --- bigframes/core/block_transforms.py | 1 - bigframes/core/blocks.py | 53 ++++++++------------------- bigframes/core/groupby/__init__.py | 30 ++++++++++----- bigframes/dataframe.py | 8 ++-- bigframes/series.py | 16 ++++---- tests/system/small/test_groupby.py | 17 +++++++-- tests/system/small/test_multiindex.py | 11 +++++- 7 files changed, 70 insertions(+), 66 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index df84f70859..6654892287 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -332,7 +332,6 @@ def value_counts( by_column_ids=columns, aggregations=[(dummy, agg_ops.count_op)], dropna=dropna, - as_index=True, ) count_id = agg_ids[0] if normalize: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 6542b694d2..3163aa5b09 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -66,7 +66,7 @@ _MONOTONIC_DECREASING = "monotonic_decreasing" -LevelType = typing.Union[str, int] +LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -941,7 +941,6 @@ def aggregate( by_column_ids: typing.Sequence[str] = (), aggregations: typing.Sequence[typing.Tuple[str, agg_ops.AggregateOp]] = (), *, - as_index: bool = True, dropna: bool = True, ) -> typing.Tuple[Block, typing.Sequence[str]]: """ @@ -962,40 +961,21 @@ def aggregate( aggregate_labels = self._get_labels_for_columns( [agg[0] for agg in aggregations] ) - if as_index: - names: typing.List[Label] = [] - for by_col_id in by_column_ids: - if by_col_id in self.value_columns: - names.append(self.col_id_to_label[by_col_id]) - else: - names.append(self.col_id_to_index_name[by_col_id]) - return ( - Block( - result_expr, - index_columns=by_column_ids, - column_labels=aggregate_labels, - index_labels=names, - ), - output_col_ids, - ) - else: # as_index = False - # If as_index=False, drop grouping levels, but keep grouping value columns - by_value_columns = [ - col for col in by_column_ids if col in self.value_columns - ] - by_column_labels = self._get_labels_for_columns(by_value_columns) - labels = (*by_column_labels, *aggregate_labels) - offsets_id = guid.generate_guid() - result_expr_pruned = result_expr.select_columns( - [*by_value_columns, *output_col_ids] - ).promote_offsets(offsets_id) - - return ( - Block( - result_expr_pruned, index_columns=[offsets_id], column_labels=labels - ), - output_col_ids, - ) + names: typing.List[Label] = [] + for by_col_id in by_column_ids: + if by_col_id in self.value_columns: + names.append(self.col_id_to_label[by_col_id]) + else: + names.append(self.col_id_to_index_name[by_col_id]) + return ( + Block( + result_expr, + index_columns=by_column_ids, + column_labels=aggregate_labels, + index_labels=names, + ), + output_col_ids, + ) def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): """Gets aggregates immediately, and caches it""" @@ -1324,7 +1304,6 @@ def pivot( result_block, _ = block.aggregate( by_column_ids=self.index_columns, aggregations=aggregations, - as_index=True, dropna=True, ) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index a8b8afdae7..3ee46ef675 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ 
-263,10 +263,10 @@ def _agg_string(self, func: str) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: aggregations: typing.List[typing.Tuple[str, agg_ops.AggregateOp]] = [] @@ -285,7 +285,6 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) if want_aggfunc_level: @@ -297,7 +296,8 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: ) else: agg_block = agg_block.with_column_labels(pd.Index(column_labels)) - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_list(self, func: typing.Sequence) -> df.DataFrame: aggregations = [ @@ -311,7 +311,6 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) agg_block = agg_block.with_column_labels( @@ -319,7 +318,8 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: column_labels, names=[*self._block.column_labels.names, None] ) ) - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _agg_named(self, **kwargs) -> df.DataFrame: aggregations = [] @@ -339,11 +339,21 @@ def _agg_named(self, **kwargs) -> df.DataFrame: agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) agg_block = agg_block.with_column_labels(column_labels) - return df.DataFrame(agg_block) + dataframe = df.DataFrame(agg_block) + return dataframe if self._as_index else self._convert_index(dataframe) + + def _convert_index(self, dataframe: df.DataFrame): + """Convert index levels to columns except where names conflict.""" + levels_to_drop = [ + level for level in dataframe.index.names if level in dataframe.columns + ] + + if len(levels_to_drop) == dataframe.index.nlevels: + return dataframe.reset_index(drop=True) + return dataframe.droplevel(levels_to_drop).reset_index(drop=False) aggregate = agg @@ -379,10 +389,10 @@ def _aggregate_all( result_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, - as_index=self._as_index, dropna=self._dropna, ) - return df.DataFrame(result_block) + dataframe = df.DataFrame(result_block) + return dataframe if self._as_index else self._convert_index(dataframe) def _apply_window_op( self, diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 1d8169960b..98aa8f1185 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -72,7 +72,7 @@ # TODO(tbergeron): Convert to bytes-based limit MAX_INLINE_DF_SIZE = 5000 -LevelType = typing.Union[str, int] +LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] SingleItemValue = Union[bigframes.series.Series, int, float, Callable] @@ -1956,7 +1956,7 @@ def _stack_mono(self): def _stack_multi(self, level: LevelsType = -1): n_levels = self.columns.nlevels - if isinstance(level, int) or isinstance(level, 
str): + if not utils.is_list_like(level): level = [level] level_indices = [] for level_ref in level: @@ -1966,7 +1966,7 @@ def _stack_multi(self, level: LevelsType = -1): else: level_indices.append(level_ref) else: # str - level_indices.append(self.columns.names.index(level_ref)) + level_indices.append(self.columns.names.index(level_ref)) # type: ignore new_order = [ *[i for i in range(n_levels) if i not in level_indices], @@ -1982,7 +1982,7 @@ def _stack_multi(self, level: LevelsType = -1): return DataFrame(block) def unstack(self, level: LevelsType = -1): - if isinstance(level, int) or isinstance(level, str): + if not utils.is_list_like(level): level = [level] block = self._block diff --git a/bigframes/series.py b/bigframes/series.py index 1b9982877a..6837c1c7f8 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -841,7 +841,6 @@ def mode(self) -> Series: block, agg_ids = block.aggregate( by_column_ids=[self._value_column], aggregations=((self._value_column, agg_ops.count_op),), - as_index=False, ) value_count_col_id = agg_ids[0] block, max_value_count_col_id = block.apply_window_op( @@ -855,14 +854,15 @@ def mode(self) -> Series: ops.eq_op, ) block = block.filter(is_mode_col_id) - mode_values_series = Series( - block.select_column(self._value_column).assign_label( - self._value_column, self.name - ) - ) - return typing.cast( - Series, mode_values_series.sort_values().reset_index(drop=True) + # use temporary name for reset_index to avoid collision, restore after dropping extra columns + block = ( + block.with_index_labels(["mode_temp_internal"]) + .order_by([OrderingColumnReference(self._value_column)]) + .reset_index(drop=False) ) + block = block.select_column(self._value_column).with_column_labels([self.name]) + mode_values_series = Series(block.select_column(self._value_column)) + return typing.cast(Series, mode_values_series) def mean(self) -> float: return typing.cast(float, self._apply_aggregation(agg_ops.mean_op)) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 5214905186..2919c167ef 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -122,23 +122,32 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +@pytest.mark.parametrize( + ("as_index"), + [ + (True), + (False), + ], +) def test_dataframe_groupby_agg_dict_with_list( - scalars_df_index, scalars_pandas_df_index + scalars_df_index, scalars_pandas_df_index, as_index ): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( scalars_df_index[col_names] - .groupby("string_col") + .groupby("string_col", as_index=as_index) .agg({"int64_too": ["mean", "max"], "string_col": "count"}) ) pd_result = ( scalars_pandas_df_index[col_names] - .groupby("string_col") + .groupby("string_col", as_index=as_index) .agg({"int64_too": ["mean", "max"], "string_col": "count"}) ) bf_result_computed = bf_result.to_pandas() - pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + pd.testing.assert_frame_equal( + pd_result, bf_result_computed, check_dtype=False, check_index_type=False + ) def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df_index): diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index e7e93849c6..1708735f4c 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ 
-356,17 +356,24 @@ def test_multi_index_dataframe_groupby(scalars_df_index, scalars_pandas_df_index def test_multi_index_dataframe_groupby_level_aggregate( scalars_df_index, scalars_pandas_df_index, level, as_index ): + index_cols = ["int64_too", "bool_col"] bf_result = ( - scalars_df_index.set_index(["int64_too", "bool_col"]) + scalars_df_index.set_index(index_cols) .groupby(level=level, as_index=as_index) .mean(numeric_only=True) .to_pandas() ) pd_result = ( - scalars_pandas_df_index.set_index(["int64_too", "bool_col"]) + scalars_pandas_df_index.set_index(index_cols) .groupby(level=level, as_index=as_index) .mean(numeric_only=True) ) + # For as_index=False, pandas will drop index levels used as groupings + # In the future, it will include this in the result, bigframes already does this behavior + if not as_index: + for col in index_cols: + if col in bf_result.columns: + bf_result = bf_result.drop(col, axis=1) # Pandas will have int64 index, while bigquery will have Int64 when resetting pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) From b36ce472678d2680bc8468d1e589b8bde3fde1fe Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 19 Dec 2023 13:22:15 -0800 Subject: [PATCH 09/27] chore: Remove symlink that breaks local testing (#274) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- docs/samples | 1 - 1 file changed, 1 deletion(-) delete mode 120000 docs/samples diff --git a/docs/samples b/docs/samples deleted file mode 120000 index e804737ed3..0000000000 --- a/docs/samples +++ /dev/null @@ -1 +0,0 @@ -../samples \ No newline at end of file From e8da3a1ca1adff59b687725d9f367e7b3bdd60cf Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Tue, 19 Dec 2023 22:10:15 +0000 Subject: [PATCH 10/27] build: update actions/upload-artifact and actions/download-artifact (#276) Source-Link: https://togithub.com/googleapis/synthtool/commit/280ddaed417057dfe5b1395731de07b7d09f5058 Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:346ab2efb51649c5dde7756cbbdc60dd394852ba83b9bbffc292a63549f33c17 --- .github/.OwlBot.lock.yaml | 4 +- .github/workflows/docs.yml | 8 ++-- .github/workflows/lint.yml | 4 +- .github/workflows/unittest.yml | 18 ++++---- .kokoro/requirements.txt | 48 ++++++++++---------- .kokoro/samples/python3.12/common.cfg | 40 ++++++++++++++++ .kokoro/samples/python3.12/continuous.cfg | 6 +++ .kokoro/samples/python3.12/periodic-head.cfg | 11 +++++ .kokoro/samples/python3.12/periodic.cfg | 6 +++ .kokoro/samples/python3.12/presubmit.cfg | 6 +++ 10 files changed, 110 insertions(+), 41 deletions(-) create mode 100644 .kokoro/samples/python3.12/common.cfg create mode 100644 .kokoro/samples/python3.12/continuous.cfg create mode 100644 .kokoro/samples/python3.12/periodic-head.cfg create mode 100644 .kokoro/samples/python3.12/periodic.cfg create mode 100644 
.kokoro/samples/python3.12/presubmit.cfg diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index 453b540c1e..9bee240971 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:caffe0a9277daeccc4d1de5c9b55ebba0901b57c2f713ec9c876b0d4ec064f61 -# created: 2023-11-08T19:46:45.022803742Z + digest: sha256:346ab2efb51649c5dde7756cbbdc60dd394852ba83b9bbffc292a63549f33c17 +# created: 2023-12-14T22:17:57.611773021Z diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 221806cedf..698fbc5c94 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -8,9 +8,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.9" - name: Install nox @@ -24,9 +24,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.10" - name: Install nox diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 16d5a9e90f..4866193af2 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,9 +8,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.8" - name: Install nox diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 465199fc9a..f059b5548a 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -11,9 +11,9 @@ jobs: python: ['3.9', '3.10', '3.11'] steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - name: Install nox @@ -26,9 +26,9 @@ jobs: run: | nox -s unit-${{ matrix.python }} - name: Upload coverage results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: coverage-artifacts + name: coverage-artifact-${{ matrix.python }} path: .coverage-${{ matrix.python }} cover: @@ -37,9 +37,9 @@ jobs: - unit steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Setup Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.8" - name: Install coverage @@ -47,11 +47,11 @@ jobs: python -m pip install --upgrade setuptools pip wheel python -m pip install coverage - name: Download coverage results - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 with: - name: coverage-artifacts path: .coverage-results/ - name: Report coverage results run: | - coverage combine .coverage-results/.coverage* + find .coverage-results -type f -name '*.zip' -exec unzip {} \; + coverage combine .coverage-results/**/.coverage* coverage report --show-missing --fail-under=35 diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 8957e21104..e5c1ffca94 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -93,30 +93,30 @@ colorlog==6.7.0 \ # via # gcp-docuploader # nox -cryptography==41.0.5 \ - 
--hash=sha256:0c327cac00f082013c7c9fb6c46b7cc9fa3c288ca702c74773968173bda421bf \ - --hash=sha256:0d2a6a598847c46e3e321a7aef8af1436f11c27f1254933746304ff014664d84 \ - --hash=sha256:227ec057cd32a41c6651701abc0328135e472ed450f47c2766f23267b792a88e \ - --hash=sha256:22892cc830d8b2c89ea60148227631bb96a7da0c1b722f2aac8824b1b7c0b6b8 \ - --hash=sha256:392cb88b597247177172e02da6b7a63deeff1937fa6fec3bbf902ebd75d97ec7 \ - --hash=sha256:3be3ca726e1572517d2bef99a818378bbcf7d7799d5372a46c79c29eb8d166c1 \ - --hash=sha256:573eb7128cbca75f9157dcde974781209463ce56b5804983e11a1c462f0f4e88 \ - --hash=sha256:580afc7b7216deeb87a098ef0674d6ee34ab55993140838b14c9b83312b37b86 \ - --hash=sha256:5a70187954ba7292c7876734183e810b728b4f3965fbe571421cb2434d279179 \ - --hash=sha256:73801ac9736741f220e20435f84ecec75ed70eda90f781a148f1bad546963d81 \ - --hash=sha256:7d208c21e47940369accfc9e85f0de7693d9a5d843c2509b3846b2db170dfd20 \ - --hash=sha256:8254962e6ba1f4d2090c44daf50a547cd5f0bf446dc658a8e5f8156cae0d8548 \ - --hash=sha256:88417bff20162f635f24f849ab182b092697922088b477a7abd6664ddd82291d \ - --hash=sha256:a48e74dad1fb349f3dc1d449ed88e0017d792997a7ad2ec9587ed17405667e6d \ - --hash=sha256:b948e09fe5fb18517d99994184854ebd50b57248736fd4c720ad540560174ec5 \ - --hash=sha256:c707f7afd813478e2019ae32a7c49cd932dd60ab2d2a93e796f68236b7e1fbf1 \ - --hash=sha256:d38e6031e113b7421db1de0c1b1f7739564a88f1684c6b89234fbf6c11b75147 \ - --hash=sha256:d3977f0e276f6f5bf245c403156673db103283266601405376f075c849a0b936 \ - --hash=sha256:da6a0ff8f1016ccc7477e6339e1d50ce5f59b88905585f77193ebd5068f1e797 \ - --hash=sha256:e270c04f4d9b5671ebcc792b3ba5d4488bf7c42c3c241a3748e2599776f29696 \ - --hash=sha256:e886098619d3815e0ad5790c973afeee2c0e6e04b4da90b88e6bd06e2a0b1b72 \ - --hash=sha256:ec3b055ff8f1dce8e6ef28f626e0972981475173d7973d63f271b29c8a2897da \ - --hash=sha256:fba1e91467c65fe64a82c689dc6cf58151158993b13eb7a7f3f4b7f395636723 +cryptography==41.0.6 \ + --hash=sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596 \ + --hash=sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c \ + --hash=sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660 \ + --hash=sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4 \ + --hash=sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead \ + --hash=sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed \ + --hash=sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3 \ + --hash=sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7 \ + --hash=sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09 \ + --hash=sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c \ + --hash=sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43 \ + --hash=sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65 \ + --hash=sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6 \ + --hash=sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da \ + --hash=sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c \ + --hash=sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b \ + --hash=sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8 \ + --hash=sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c \ + --hash=sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d \ + 
--hash=sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9 \ + --hash=sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86 \ + --hash=sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36 \ + --hash=sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae # via # gcp-releasetool # secretstorage diff --git a/.kokoro/samples/python3.12/common.cfg b/.kokoro/samples/python3.12/common.cfg new file mode 100644 index 0000000000..abf83e196d --- /dev/null +++ b/.kokoro/samples/python3.12/common.cfg @@ -0,0 +1,40 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +# Specify which tests to run +env_vars: { + key: "RUN_TESTS_SESSION" + value: "py-3.12" +} + +# Declare build specific Cloud project. +env_vars: { + key: "BUILD_SPECIFIC_GCLOUD_PROJECT" + value: "python-docs-samples-tests-312" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery-dataframes/.kokoro/test-samples.sh" +} + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker" +} + +# Download secrets for samples +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples" + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + +# Use the trampoline script to run in docker. +build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" \ No newline at end of file diff --git a/.kokoro/samples/python3.12/continuous.cfg b/.kokoro/samples/python3.12/continuous.cfg new file mode 100644 index 0000000000..a1c8d9759c --- /dev/null +++ b/.kokoro/samples/python3.12/continuous.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file diff --git a/.kokoro/samples/python3.12/periodic-head.cfg b/.kokoro/samples/python3.12/periodic-head.cfg new file mode 100644 index 0000000000..123a35fbd3 --- /dev/null +++ b/.kokoro/samples/python3.12/periodic-head.cfg @@ -0,0 +1,11 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} + +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery-dataframes/.kokoro/test-samples-against-head.sh" +} diff --git a/.kokoro/samples/python3.12/periodic.cfg b/.kokoro/samples/python3.12/periodic.cfg new file mode 100644 index 0000000000..71cd1e597e --- /dev/null +++ b/.kokoro/samples/python3.12/periodic.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "False" +} diff --git a/.kokoro/samples/python3.12/presubmit.cfg b/.kokoro/samples/python3.12/presubmit.cfg new file mode 100644 index 0000000000..a1c8d9759c --- /dev/null +++ b/.kokoro/samples/python3.12/presubmit.cfg @@ -0,0 +1,6 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +env_vars: { + key: "INSTALL_LIBRARY_FROM_SOURCE" + value: "True" +} \ No newline at end of file From 9ec352a338f11d82aee9cd665ffb0e6e97cb391b Mon Sep 17 00:00:00 2001 From: Anthonios Partheniou Date: Tue, 19 Dec 2023 18:04:15 -0500 Subject: [PATCH 11/27] fix: use setuptools.find_namespace_packages (#246) Similar to googleapis/google-auth-library-python#1205 
https://packaging.python.org/en/latest/guides/packaging-namespace-packages/#native-namespace-packages --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9aaaaae04f..345d1ea752 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ # benchmarks, etc. packages = [ package - for package in setuptools.PEP420PackageFinder.find() + for package in setuptools.find_namespace_packages() if package.startswith("bigframes") or package.startswith("third_party") ] From 95b673aeb1545744e4b1a353cf1f4d0202d8a1b2 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 20 Dec 2023 07:13:35 +0000 Subject: [PATCH 12/27] docs: code samples for `Series.{sum, mean, min, max}`, `astype` (#280) * docs: code samples for `Series.{sum, mean, min, max}`, `astype` * insert newlines in code samples to improve readability --------- Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- bigframes/dtypes.py | 2 +- .../bigframes_vendored/pandas/core/generic.py | 38 +++- .../bigframes_vendored/pandas/core/series.py | 192 ++++++++++++++++++ 3 files changed, 230 insertions(+), 2 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6dfcc17f37..891c372a10 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -288,7 +288,7 @@ def bigframes_dtype_to_ibis_dtype( f""" Unexpected data type {bigframes_dtype}. The following str dtypes are supppted: 'boolean','Float64','Int64', 'string', - 'tring[pyarrow]','timestamp[us, tz=UTC][pyarrow]', + 'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]', 'timestamp[us][pyarrow]','date32[day][pyarrow]', 'time64[us][pyarrow]'. The following pandas.ExtensionDtype are supported: pandas.BooleanDtype(), pandas.Float64Dtype(), diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 607243f844..ca5c6344ce 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -82,10 +82,46 @@ def astype(self, dtype): """ Cast a pandas object to a specified dtype ``dtype``. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Create a DataFrame: + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = bpd.DataFrame(data=d) + >>> df.dtypes + col1 Int64 + col2 Int64 + dtype: object + + Cast all columns to ``Float64``: + + >>> df.astype('Float64').dtypes + col1 Float64 + col2 Float64 + dtype: object + + Create a series of type ``Int64``: + + >>> ser = bpd.Series([1, 2], dtype='Int64') + >>> ser + 0 1 + 1 2 + dtype: Int64 + + Convert to ``Float64`` type: + + >>> ser.astype('Float64') + 0 1.0 + 1 2.0 + dtype: Float64 + Args: dtype (str or pandas.ExtensionDtype): A dtype supported by BigQuery DataFrame include 'boolean','Float64','Int64', - 'string', 'tring[pyarrow]','timestamp[us, tz=UTC][pyarrow]', + 'string', 'string[pyarrow]','timestamp[us, tz=UTC][pyarrow]', 'timestamp[us][pyarrow]','date32[day][pyarrow]','time64[us][pyarrow]' A pandas.ExtensionDtype include pandas.BooleanDtype(), pandas.Float64Dtype(), pandas.Int64Dtype(), pandas.StringDtype(storage="pyarrow"), diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8303df5ef4..d054684598 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -890,6 +890,95 @@ def groupby( used to group large amounts of data and compute operations on these groups. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + You can group by a named index level. + + >>> s = bpd.Series([380, 370., 24., 26.], + ... index=["Falcon", "Falcon", "Parrot", "Parrot"], + ... name="Max Speed") + >>> s.index.name="Animal" + >>> s + Animal + Falcon 380.0 + Falcon 370.0 + Parrot 24.0 + Parrot 26.0 + Name: Max Speed, dtype: Float64 + >>> s.groupby("Animal").mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + + You can also group by more than one index levels. + + >>> import pandas as pd + >>> s = bpd.Series([380, 370., 24., 26.], + ... index=pd.MultiIndex.from_tuples( + ... [("Falcon", "Clear"), + ... ("Falcon", "Cloudy"), + ... ("Parrot", "Clear"), + ... ("Parrot", "Clear")], + ... names=["Animal", "Sky"]), + ... name="Max Speed") + >>> s + Animal Sky + Falcon Clear 380.0 + Cloudy 370.0 + Parrot Clear 24.0 + Clear 26.0 + Name: Max Speed, dtype: Float64 + + >>> s.groupby("Animal").mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + + >>> s.groupby("Sky").mean() + Sky + Clear 143.333333 + Cloudy 370.0 + Name: Max Speed, dtype: Float64 + + >>> s.groupby(["Animal", "Sky"]).mean() + Animal Sky + Falcon Clear 380.0 + Cloudy 370.0 + Parrot Clear 25.0 + Name: Max Speed, dtype: Float64 + + You can also group by values in a Series provided the index matches with + the original series. + + >>> df = bpd.DataFrame({'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.], + ... 'Age': [10., 20., 4., 6.]}) + >>> df + Animal Max Speed Age + 0 Falcon 380.0 10.0 + 1 Falcon 370.0 20.0 + 2 Parrot 24.0 4.0 + 3 Parrot 26.0 6.0 + + [4 rows x 3 columns] + + >>> df['Max Speed'].groupby(df['Animal']).mean() + Animal + Falcon 375.0 + Parrot 25.0 + Name: Max Speed, dtype: Float64 + + >>> df['Age'].groupby(df['Animal']).max() + Animal + Falcon 20.0 + Parrot 6.0 + Name: Age, dtype: Float64 + Args: by (mapping, function, label, pd.Grouper or list of such, default None): Used to determine the groups for the groupby. 
@@ -1661,6 +1750,31 @@ def max( If you want the index of the maximum, use ``idxmax``. This is the equivalent of the ``numpy.ndarray`` method ``argmax``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Calculating the max of a Series: + + >>> s = bpd.Series([1, 3]) + >>> s + 0 1 + 1 3 + dtype: Int64 + >>> s.max() + 3 + + Calculating the max of a Series containing ``NA`` values: + + >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s + 0 1.0 + 1 3.0 + 2 + dtype: Float64 + >>> s.max() + 3.0 Returns: scalar: Scalar. @@ -1676,6 +1790,32 @@ def min( If you want the index of the minimum, use ``idxmin``. This is the equivalent of the ``numpy.ndarray`` method ``argmin``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Calculating the min of a Series: + + >>> s = bpd.Series([1, 3]) + >>> s + 0 1 + 1 3 + dtype: Int64 + >>> s.min() + 1 + + Calculating the min of a Series containing ``NA`` values: + + >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s + 0 1.0 + 1 3.0 + 2 + dtype: Float64 + >>> s.min() + 1.0 + Returns: scalar: Scalar. """ @@ -1714,6 +1854,32 @@ def sum(self): This is equivalent to the method ``numpy.sum``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Calculating the sum of a Series: + + >>> s = bpd.Series([1, 3]) + >>> s + 0 1 + 1 3 + dtype: Int64 + >>> s.sum() + 4 + + Calculating the sum of a Series containing ``NA`` values: + + >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s + 0 1.0 + 1 3.0 + 2 + dtype: Float64 + >>> s.sum() + 4.0 + Returns: scalar: Scalar. """ @@ -1722,6 +1888,32 @@ def sum(self): def mean(self): """Return the mean of the values over the requested axis. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Calculating the mean of a Series: + + >>> s = bpd.Series([1, 3]) + >>> s + 0 1 + 1 3 + dtype: Int64 + >>> s.mean() + 2.0 + + Calculating the mean of a Series containing ``NA`` values: + + >>> s = bpd.Series([1, 3, bpd.NA]) + >>> s + 0 1.0 + 1 3.0 + 2 + dtype: Float64 + >>> s.mean() + 2.0 + Returns: scalar: Scalar. 
""" From ad6746569b3af11be9d40805a1449ee1e89288dc Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 00:33:37 +0000 Subject: [PATCH 13/27] fix: make `Series.str.replace` work for simple strings (#285) --- bigframes/operations/__init__.py | 2 +- tests/system/small/operations/test_strings.py | 2 ++ .../bigframes_vendored/pandas/core/series.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 753870a42d..678774978a 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -385,7 +385,7 @@ def _as_ibis(self, x: ibis_types.Value): ibis_types.StringValue, ibis_types.literal(self._pat) ) repl_str_value = typing.cast( - ibis_types.StringValue, ibis_types.literal(self._pat) + ibis_types.StringValue, ibis_types.literal(self._repl) ) return typing.cast(ibis_types.StringValue, x).replace( diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 27a35134d4..79f92c94b4 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -94,6 +94,8 @@ def test_str_extract(scalars_dfs, pat): (".*", "blah", True, 0, True), ("h.l", "blah", False, 0, True), (re.compile("(?i).e.."), "blah", None, 0, True), + ("H", "h", True, 0, False), + (", ", "__", True, 0, False), ], ) def test_str_replace(scalars_dfs, pat, repl, case, flags, regex): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index d054684598..366f32c77e 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -2304,6 +2304,24 @@ def str(self): NAs stay NA unless handled otherwise by a particular method. Patterned after Python’s string methods, with some inspiration from R’s stringr package. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: string + + >>> s.str.lower() + 0 a_str_series + dtype: string + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: string + Returns: bigframes.operations.strings.StringMethods: An accessor containing string methods. 
From a1c06319ab0e3697c3175112490488002bb344c0 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 20 Dec 2023 18:35:18 -0800 Subject: [PATCH 14/27] feat: specific pyarrow mappings for decimal, bytes types (#283) * feat: new bytes, json, decimal type mappings * amend tests to reflect new types * add implicit type conversion for df.replace * more type casting tests * skip pandas 1.x for more tests --------- Co-authored-by: Tim Swast --- bigframes/core/block_transforms.py | 2 +- bigframes/core/blocks.py | 2 +- bigframes/core/compile/compiled.py | 14 +- bigframes/core/groupby/__init__.py | 7 +- bigframes/dataframe.py | 20 +-- bigframes/dtypes.py | 139 ++++++++++++++------ bigframes/series.py | 24 +++- tests/system/large/ml/test_compose.py | 4 +- tests/system/large/ml/test_core.py | 1 + tests/system/small/ml/test_core.py | 3 +- tests/system/small/ml/test_imported.py | 2 + tests/system/small/ml/test_llm.py | 10 +- tests/system/small/ml/test_preprocessing.py | 16 +++ tests/system/small/ml/test_remote.py | 1 + tests/system/small/test_dataframe.py | 37 +++--- tests/system/small/test_dataframe_io.py | 18 ++- tests/system/small/test_multiindex.py | 7 +- tests/system/small/test_series.py | 15 ++- tests/system/small/test_session.py | 3 + tests/system/utils.py | 36 ++++- tests/unit/test_dtypes.py | 7 +- 21 files changed, 267 insertions(+), 101 deletions(-) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index 6654892287..c6867c1a33 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -131,7 +131,7 @@ def interpolate(block: blocks.Block, method: str = "linear") -> blocks.Block: if len(index_columns) != 1: raise ValueError("only method 'linear' supports multi-index") xvalues = block.index_columns[0] - if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES: + if block.index_dtypes[0] not in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise ValueError("Can only interpolate on numeric index.") for column in original_columns: diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 3163aa5b09..779d11b371 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1063,7 +1063,7 @@ def _standard_stats(self, column_id) -> typing.Sequence[agg_ops.AggregateOp]: stats: list[agg_ops.AggregateOp] = [agg_ops.count_op] if dtype not in bigframes.dtypes.UNORDERED_DTYPES: stats += [agg_ops.min_op, agg_ops.max_op] - if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: # Notable exclusions: # prod op tends to cause overflows # Also, var_op is redundant as can be derived from std diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index d6183228d1..199c8db785 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -220,7 +220,10 @@ def _get_ibis_column(self, key: str) -> ibis_types.Value: raise ValueError( "Column name {} not in set of values: {}".format(key, self.column_ids) ) - return typing.cast(ibis_types.Value, self._column_names[key]) + return typing.cast( + ibis_types.Value, + bigframes.dtypes.ibis_value_to_canonical_type(self._column_names[key]), + ) def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: ibis_type = typing.cast( @@ -1177,7 +1180,14 @@ def _to_ibis_expr( # Make sure all dtypes are the "canonical" ones for BigFrames. This is # important for operations like UNION where the schema must match. 
table = self._table.select( - bigframes.dtypes.ibis_value_to_canonical_type(column) for column in columns + bigframes.dtypes.ibis_value_to_canonical_type( + column.resolve(self._table) + # TODO(https://github.com/ibis-project/ibis/issues/7613): use + # public API to refer to Deferred type. + if isinstance(column, ibis.common.deferred.Deferred) + else column + ) + for column in columns ) base_table = table if self._reduced_predicate is not None: diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 3ee46ef675..66ba901649 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -359,7 +359,8 @@ def _convert_index(self, dataframe: df.DataFrame): def _raise_on_non_numeric(self, op: str): if not all( - dtype in dtypes.NUMERIC_BIGFRAMES_TYPES for dtype in self._block.dtypes + dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + for dtype in self._block.dtypes ): raise NotImplementedError( f"'{op}' does not support non-numeric columns. " @@ -371,7 +372,9 @@ def _raise_on_non_numeric(self, op: str): def _aggregated_columns(self, numeric_only: bool = False) -> typing.Sequence[str]: valid_agg_cols: list[str] = [] for col_id in self._selected_cols: - is_numeric = self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES + is_numeric = ( + self._column_type(col_id) in dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE + ) if is_numeric or not numeric_only: valid_agg_cols.append(col_id) return valid_agg_cols diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 98aa8f1185..423c2bcaac 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1800,7 +1800,7 @@ def agg( ) -> DataFrame | bigframes.series.Series: if utils.is_list_like(func): if any( - dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self.dtypes ): raise NotImplementedError( @@ -1867,7 +1867,7 @@ def melt( ) def describe(self) -> DataFrame: - df_numeric = self._drop_non_numeric(keep_bool=False) + df_numeric = self._drop_non_numeric(permissive=False) if len(df_numeric.columns) == 0: raise NotImplementedError( f"df.describe() currently only supports numeric values. 
{constants.FEEDBACK_LINK}" @@ -2005,10 +2005,12 @@ def unstack(self, level: LevelsType = -1): ) return DataFrame(pivot_block) - def _drop_non_numeric(self, keep_bool=True) -> DataFrame: - types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) - if not keep_bool: - types_to_keep -= set(bigframes.dtypes.BOOL_BIGFRAMES_TYPES) + def _drop_non_numeric(self, permissive=True) -> DataFrame: + types_to_keep = ( + set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) + if permissive + else set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE) + ) non_numeric_cols = [ col_id for col_id, dtype in zip(self._block.value_columns, self._block.dtypes) @@ -2026,7 +2028,7 @@ def _drop_non_bool(self) -> DataFrame: def _raise_on_non_numeric(self, op: str): if not all( - dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES + dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE for dtype in self._block.dtypes ): raise NotImplementedError( @@ -2301,7 +2303,7 @@ def notna(self) -> DataFrame: def cumsum(self): is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): @@ -2313,7 +2315,7 @@ def cumsum(self): def cumprod(self) -> DataFrame: is_numeric_types = [ - (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) + (dtype in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE) for _, dtype in self.dtypes.items() ] if not all(is_numeric_types): diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 891c372a10..b754acea2e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -15,6 +15,7 @@ """Mappings for Pandas dtypes supported by BigQuery DataFrames package""" import datetime +import decimal import textwrap import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union @@ -30,6 +31,7 @@ import bigframes.constants as constants import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ @@ -40,9 +42,6 @@ pd.ArrowDtype, ] -# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) -NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] - # On BQ side, ARRAY, STRUCT, GEOGRAPHY, JSON are not orderable UNORDERED_DTYPES = [gpd.array.GeometryDtype()] @@ -57,6 +56,9 @@ "timestamp[us][pyarrow]", "date32[day][pyarrow]", "time64[us][pyarrow]", + "decimal128(38, 9)[pyarrow]", + "decimal256(38, 9)[pyarrow]", + "binary[pyarrow]", ] # Type hints for Ibis data types supported by BigQuery DataFrame @@ -72,8 +74,17 @@ BOOL_BIGFRAMES_TYPES = [pd.BooleanDtype()] -# Several operations are restricted to these types. 
-NUMERIC_BIGFRAMES_TYPES = [pd.BooleanDtype(), pd.Float64Dtype(), pd.Int64Dtype()] +# Corresponds to the pandas concept of numeric type (such as when 'numeric_only' is specified in an operation) +# Pandas is inconsistent, so two definitions are provided, each used in different contexts +NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE = [ + pd.Float64Dtype(), + pd.Int64Dtype(), +] +NUMERIC_BIGFRAMES_TYPES_PERMISSIVE = NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + [ + pd.BooleanDtype(), + pd.ArrowDtype(pa.decimal128(38, 9)), + pd.ArrowDtype(pa.decimal256(76, 38)), +] # Type hints for Ibis data types that can be read to Python objects by BigQuery DataFrame ReadOnlyIbisDtype = Union[ @@ -97,6 +108,15 @@ ibis_dtypes.Timestamp(timezone="UTC"), pd.ArrowDtype(pa.timestamp("us", tz="UTC")), ), + (ibis_dtypes.binary, pd.ArrowDtype(pa.binary())), + ( + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), + pd.ArrowDtype(pa.decimal128(38, 9)), + ), + ( + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), + pd.ArrowDtype(pa.decimal256(76, 38)), + ), ) BIGFRAMES_TO_IBIS: Dict[Dtype, ibis_dtypes.DataType] = { @@ -112,6 +132,9 @@ ibis_dtypes.time: pa.time64("us"), ibis_dtypes.Timestamp(timezone=None): pa.timestamp("us"), ibis_dtypes.Timestamp(timezone="UTC"): pa.timestamp("us", tz="UTC"), + ibis_dtypes.binary: pa.binary(), + ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): pa.decimal128(38, 9), + ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): pa.decimal256(76, 38), } ARROW_TO_IBIS = {arrow: ibis for ibis, arrow in IBIS_TO_ARROW.items()} @@ -125,10 +148,6 @@ ) IBIS_TO_BIGFRAMES.update( { - ibis_dtypes.binary: np.dtype("O"), - ibis_dtypes.json: np.dtype("O"), - ibis_dtypes.Decimal(precision=38, scale=9, nullable=True): np.dtype("O"), - ibis_dtypes.Decimal(precision=76, scale=38, nullable=True): np.dtype("O"), ibis_dtypes.GeoSpatial( geotype="geography", srid=4326, nullable=True ): gpd.array.GeometryDtype(), @@ -178,7 +197,7 @@ def ibis_dtype_to_bigframes_dtype( # our IO returns them as objects. Eventually, we should support them as # ArrowDType (and update the IO accordingly) if isinstance(ibis_dtype, ibis_dtypes.Array): - return np.dtype("O") + return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pd.ArrowDtype(ibis_dtype_to_arrow_dtype(ibis_dtype)) @@ -200,7 +219,9 @@ def ibis_dtype_to_bigframes_dtype( def ibis_dtype_to_arrow_dtype(ibis_dtype: ibis_dtypes.DataType) -> pa.DataType: if isinstance(ibis_dtype, ibis_dtypes.Array): - return pa.list_(ibis_dtype_to_arrow_dtype(ibis_dtype.value_type)) + return pa.list_( + ibis_dtype_to_arrow_dtype(ibis_dtype.value_type.copy(nullable=True)) + ) if isinstance(ibis_dtype, ibis_dtypes.Struct): return pa.struct( @@ -224,21 +245,13 @@ def ibis_value_to_canonical_type(value: ibis_types.Value) -> ibis_types.Value: This is useful in cases where multiple types correspond to the same BigFrames dtype. """ ibis_type = value.type() + name = value.get_name() + if ibis_type.is_json(): + value = vendored_ibis_ops.ToJsonString(value).to_expr() + return value.name(name) # Allow REQUIRED fields to be joined with NULLABLE fields. nullable_type = ibis_type.copy(nullable=True) - return value.cast(nullable_type).name(value.get_name()) - - -def ibis_table_to_canonical_types(table: ibis_types.Table) -> ibis_types.Table: - """Converts an Ibis table expression to canonical types. - - This is useful in cases where multiple types correspond to the same BigFrames dtype. 
- """ - casted_columns = [] - for column_name in table.columns: - column = typing.cast(ibis_types.Value, table[column_name]) - casted_columns.append(ibis_value_to_canonical_type(column)) - return table.select(*casted_columns) + return value.cast(nullable_type).name(name) def arrow_dtype_to_ibis_dtype(arrow_dtype: pa.DataType) -> ibis_dtypes.DataType: @@ -386,15 +399,35 @@ def cast_ibis_value( ibis_dtypes.bool, ibis_dtypes.float64, ibis_dtypes.string, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ), + ibis_dtypes.float64: ( + ibis_dtypes.string, + ibis_dtypes.int64, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ), + ibis_dtypes.string: ( + ibis_dtypes.int64, + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=38, scale=9), + ibis_dtypes.Decimal(precision=76, scale=38), + ibis_dtypes.binary, ), - ibis_dtypes.float64: (ibis_dtypes.string, ibis_dtypes.int64), - ibis_dtypes.string: (ibis_dtypes.int64, ibis_dtypes.float64), ibis_dtypes.date: (ibis_dtypes.string,), - ibis_dtypes.Decimal(precision=38, scale=9): (ibis_dtypes.float64,), - ibis_dtypes.Decimal(precision=76, scale=38): (ibis_dtypes.float64,), + ibis_dtypes.Decimal(precision=38, scale=9): ( + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=76, scale=38), + ), + ibis_dtypes.Decimal(precision=76, scale=38): ( + ibis_dtypes.float64, + ibis_dtypes.Decimal(precision=38, scale=9), + ), ibis_dtypes.time: (), ibis_dtypes.timestamp: (ibis_dtypes.Timestamp(timezone="UTC"),), ibis_dtypes.Timestamp(timezone="UTC"): (ibis_dtypes.timestamp,), + ibis_dtypes.binary: (ibis_dtypes.string,), } value = ibis_value_to_canonical_type(value) @@ -458,30 +491,62 @@ def is_dtype(scalar: typing.Any, dtype: Dtype) -> bool: return False +# string is binary def is_patype(scalar: typing.Any, pa_type: pa.DataType) -> bool: """Determine whether a scalar's type matches a given pyarrow type.""" if pa_type == pa.time64("us"): return isinstance(scalar, datetime.time) - if pa_type == pa.timestamp("us"): + elif pa_type == pa.timestamp("us"): if isinstance(scalar, datetime.datetime): return not scalar.tzinfo if isinstance(scalar, pd.Timestamp): return not scalar.tzinfo - if pa_type == pa.timestamp("us", tz="UTC"): + elif pa_type == pa.timestamp("us", tz="UTC"): if isinstance(scalar, datetime.datetime): return scalar.tzinfo == datetime.timezone.utc if isinstance(scalar, pd.Timestamp): return scalar.tzinfo == datetime.timezone.utc - if pa_type == pa.date32(): + elif pa_type == pa.date32(): return isinstance(scalar, datetime.date) + elif pa_type == pa.binary(): + return isinstance(scalar, bytes) + elif pa_type == pa.decimal128(38, 9): + # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks + return isinstance(scalar, decimal.Decimal) + elif pa_type == pa.decimal256(76, 38): + # decimal.Decimal is a superset, but ibis performs out-of-bounds and loss-of-precision checks + return isinstance(scalar, decimal.Decimal) return False -def is_comparable(scalar: typing.Any, dtype: Dtype) -> bool: - """Whether scalar can be compare to items of dtype (though maybe requiring coercion)""" +def is_compatible(scalar: typing.Any, dtype: Dtype) -> typing.Optional[Dtype]: + """Whether scalar can be compare to items of dtype (though maybe requiring coercion). 
    Returns the datatype that must be used for the comparison."""
     if is_dtype(scalar, dtype):
-        return True
+        return dtype
     elif pd.api.types.is_numeric_dtype(dtype):
-        return pd.api.types.is_number(scalar)
-    else:
-        return False
+        # Implicit conversion currently only supported for numeric types
+        if pd.api.types.is_bool(scalar):
+            return lcd_type(pd.BooleanDtype(), dtype)
+        if pd.api.types.is_float(scalar):
+            return lcd_type(pd.Float64Dtype(), dtype)
+        if pd.api.types.is_integer(scalar):
+            return lcd_type(pd.Int64Dtype(), dtype)
+        if isinstance(scalar, decimal.Decimal):
+            # TODO: Check context to see if can use NUMERIC instead of BIGNUMERIC
+            return lcd_type(pd.ArrowDtype(pa.decimal256(76, 38)), dtype)
+    return None
+
+
+def lcd_type(dtype1: Dtype, dtype2: Dtype) -> typing.Optional[Dtype]:
+    # Implicit conversion currently only supported for numeric types
+    hierarchy: list[Dtype] = [
+        pd.BooleanDtype(),
+        pd.Int64Dtype(),
+        pd.Float64Dtype(),
+        pd.ArrowDtype(pa.decimal128(38, 9)),
+        pd.ArrowDtype(pa.decimal256(76, 38)),
+    ]
+    if (dtype1 not in hierarchy) or (dtype2 not in hierarchy):
+        return None
+    lcd_index = max(hierarchy.index(dtype1), hierarchy.index(dtype2))
+    return hierarchy[lcd_index]
diff --git a/bigframes/series.py b/bigframes/series.py
index 6837c1c7f8..eefd2b755d 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import functools
 import itertools
 import numbers
 import textwrap
@@ -455,7 +456,7 @@ def replace(
         else:  # Scalar
             replace_list = [to_replace]
         replace_list = [
-            i for i in replace_list if bigframes.dtypes.is_comparable(i, self.dtype)
+            i for i in replace_list if bigframes.dtypes.is_compatible(i, self.dtype)
         ]
         return self._simple_replace(replace_list, value) if replace_list else self
 
@@ -472,11 +473,15 @@ def _regex_replace(self, to_replace: str, value: str):
         return Series(block.select_column(result_col))
 
     def _simple_replace(self, to_replace_list: typing.Sequence, value):
-        if not bigframes.dtypes.is_dtype(value, self.dtype):
+        result_type = bigframes.dtypes.is_compatible(value, self.dtype)
+        if not result_type:
             raise NotImplementedError(
                 f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
             )
 
+        if result_type != self.dtype:
+            return self.astype(result_type)._simple_replace(to_replace_list, value)
+
         block, cond = self._block.apply_unary_op(
             self._value_column, ops.IsInOp(to_replace_list)
         )
@@ -490,15 +495,26 @@ def _simple_replace(self, to_replace_list: typing.Sequence, value):
 
     def _mapping_replace(self, mapping: dict[typing.Hashable, typing.Hashable]):
         tuples = []
+        lcd_types: list[typing.Optional[bigframes.dtypes.Dtype]] = []
         for key, value in mapping.items():
-            if not bigframes.dtypes.is_comparable(key, self.dtype):
+            lcd_type = bigframes.dtypes.is_compatible(key, self.dtype)
+            if not lcd_type:
                 continue
             if not bigframes.dtypes.is_dtype(value, self.dtype):
                 raise NotImplementedError(
                     f"Cannot replace {self.dtype} elements with incompatible item {value} as mixed-type columns not supported. {constants.FEEDBACK_LINK}"
                 )
             tuples.append((key, value))
+            lcd_types.append(lcd_type)
 
+        result_dtype = functools.reduce(
+            lambda t1, t2: bigframes.dtypes.lcd_type(t1, t2) if (t1 and t2) else None,
+            lcd_types,
+        )
+        if not result_dtype:
+            raise NotImplementedError(
+                f"Cannot replace {self.dtype} elements with incompatible mapping {mapping} as mixed-type columns not supported.
{constants.FEEDBACK_LINK}" + ) block, result = self._block.apply_unary_op( self._value_column, ops.MapOp(tuple(tuples)) ) @@ -782,7 +798,7 @@ def _central_moment(self, n: int) -> float: def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: if _is_list_like(func): - if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES: + if self.dtype not in bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_PERMISSIVE: raise NotImplementedError( f"Multiple aggregations only supported on numeric series. {constants.FEEDBACK_LINK}" ) diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0c280e5d02..6ea4f72489 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -72,7 +72,7 @@ def test_columntransformer_standalone_fit_and_transform( expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) def test_columntransformer_standalone_fit_transform(new_penguins_df): @@ -123,4 +123,4 @@ def test_columntransformer_standalone_fit_transform(new_penguins_df): expected.standard_scaled_flipper_length_mm.astype("Float64") ) - pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3, check_dtype=False) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index 3b30d7eb1d..df387e6ee1 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -184,4 +184,5 @@ def test_bqml_standalone_transform(penguins_df_default_index, new_penguins_df): expected, check_exact=False, rtol=0.1, + check_dtype=False, ) diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index eece5ef21d..f39815aec2 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -292,11 +292,12 @@ def test_model_predict_with_unnamed_index( def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): - predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( {"predicted_body_mass_g": [[3739.54], [3675.79], [3619.54]]}, index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + dtype=pd.ArrowDtype(pa.list_(pa.float64())), ) + predictions = bqml_linear_remote_model.predict(new_penguins_df).to_pandas() pd.testing.assert_frame_equal( predictions[["predicted_body_mass_g"]].sort_index(), expected, diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 9008e85a0b..8ffd9924e9 100644 --- a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -51,6 +51,7 @@ def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) @@ -90,6 +91,7 @@ def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): result, expected, check_exact=False, + check_dtype=False, atol=0.1, ) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 267a2ed9c1..fd1b803eea 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np import pytest from bigframes.ml import llm @@ -202,8 +201,7 @@ def test_embedding_generator_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -215,8 +213,7 @@ def test_embedding_generator_multilingual_predict_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 @pytest.mark.flaky(retries=2, delay=120) @@ -228,5 +225,4 @@ def test_embedding_generator_predict_series_success( assert "text_embedding" in df.columns series = df["text_embedding"] value = series[0] - assert isinstance(value, np.ndarray) - assert value.size == 768 + assert len(value) == 768 diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 45548acca3..c3bd7f3b87 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -15,6 +15,7 @@ import math import pandas as pd +import pyarrow as pa import bigframes.ml.preprocessing @@ -453,6 +454,9 @@ def test_one_hot_encoder_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -482,6 +486,9 @@ def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -507,6 +514,9 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -537,6 +547,9 @@ def test_one_hot_encoder_params(new_penguins_df): [{"index": 0, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) @@ -567,6 +580,9 @@ def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_ [{"index": 2, "value": 1.0}], ], }, + dtype=pd.ArrowDtype( + pa.list_(pa.struct([("index", pa.int64()), ("value", pa.float64())])) + ), index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), ) diff --git a/tests/system/small/ml/test_remote.py b/tests/system/small/ml/test_remote.py index e8eb1c85e8..5036cdadfc 100644 --- a/tests/system/small/ml/test_remote.py +++ b/tests/system/small/ml/test_remote.py @@ -29,5 +29,6 @@ def test_remote_linear_vertex_model_predict( predictions[["predicted_body_mass_g"]].sort_index(), expected, check_exact=False, + check_dtype=False, rtol=0.1, ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ed78e73e5d..86b8cfbe66 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -19,7 +19,6 @@ from typing import Tuple import geopandas as gpd # type: ignore -import numpy as np import pandas as pd import pandas.testing import pyarrow as pa # type: ignore @@ -29,7 +28,11 @@ import bigframes._config.display_options as 
display_options import bigframes.dataframe as dataframe import bigframes.series as series -from tests.system.utils import assert_pandas_df_equal, assert_series_equal +from tests.system.utils import ( + assert_pandas_df_equal, + assert_series_equal, + skip_legacy_pandas, +) def test_df_construct_copy(scalars_dfs): @@ -273,19 +276,19 @@ def test_df_info(scalars_dfs): " # Column Non-Null Count Dtype\n" "--- ------------- ---------------- ------------------------------\n" " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null object\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" " 2 date_col 7 non-null date32[day][pyarrow]\n" " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" " 4 geography_col 4 non-null geometry\n" " 5 int64_col 8 non-null Int64\n" " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null object\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" " 8 float64_col 7 non-null Float64\n" " 9 rowindex_2 9 non-null Int64\n" " 10 string_col 8 non-null string\n" " 11 time_col 6 non-null time64[us][pyarrow]\n" " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - "dtypes: Float64(1), Int64(3), boolean(1), date32[day][pyarrow](1), geometry(1), object(2), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" "memory usage: 945 bytes\n" ) @@ -362,6 +365,7 @@ def test_drop_bigframes_index_with_na(scalars_dfs): pd.testing.assert_frame_equal(pd_result, bf_result) +@skip_legacy_pandas def test_drop_bigframes_multiindex(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs scalars_df = scalars_df.copy() @@ -841,13 +845,11 @@ def test_df_fillna(scalars_dfs): def test_df_replace_scalar_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("Hello, World!", "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace("Hello, World!", "Howdy, Planet!") + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) def test_df_replace_regex_scalar(scalars_dfs): @@ -863,12 +865,14 @@ def test_df_replace_regex_scalar(scalars_dfs): def test_df_replace_list_scalar(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(["Hello, World!", "T"], "Howdy, Planet!").to_pandas() - pd_result = scalars_pandas_df.replace(["Hello, World!", "T"], "Howdy, Planet!") + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + # pandas has narrower result types as they are determined dynamically pd.testing.assert_frame_equal( pd_result, bf_result, + check_dtype=False, ) @@ -1198,13 +1202,13 @@ def test_get_dtypes(scalars_df_default_index): pd.Series( { "bool_col": pd.BooleanDtype(), - "bytes_col": np.dtype("O"), + "bytes_col": pd.ArrowDtype(pa.binary()), "date_col": pd.ArrowDtype(pa.date32()), "datetime_col": pd.ArrowDtype(pa.timestamp("us")), "geography_col": gpd.array.GeometryDtype(), "int64_col": pd.Int64Dtype(), "int64_too": pd.Int64Dtype(), - "numeric_col": np.dtype("O"), + "numeric_col": 
pd.ArrowDtype(pa.decimal128(38, 9)),
                 "float64_col": pd.Float64Dtype(),
                 "rowindex": pd.Int64Dtype(),
                 "rowindex_2": pd.Int64Dtype(),
@@ -1232,7 +1236,7 @@ def test_get_dtypes_array_struct(session):
         dtypes,
         pd.Series(
             {
-                "array_column": np.dtype("O"),
+                "array_column": pd.ArrowDtype(pa.list_(pa.int64())),
                 "struct_column": pd.ArrowDtype(
                     pa.struct(
                         [
@@ -2138,6 +2142,7 @@ def test_dataframe_agg_multi_string(scalars_dfs):
     ).all()
 
 
+@skip_legacy_pandas
 def test_df_describe(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     # pyarrows time columns fail in pandas
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index fb9fb7bb89..59864e483e 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -56,7 +56,9 @@ def test_to_pandas_array_struct_correct_result(session):
     result = df.to_pandas()
     expected = pd.DataFrame(
         {
-            "array_column": [[1, 3, 2]],
+            "array_column": pd.Series(
+                [[1, 3, 2]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))
+            ),
             "struct_column": pd.Series(
                 [{"string_field": "a", "float_field": 1.2}],
                 dtype=pd.ArrowDtype(
@@ -91,7 +93,8 @@ def test_load_json(session):
     expected = pd.DataFrame(
         {
             "json_column": ['{"bar":true,"foo":10}'],
-        }
+        },
+        dtype=pd.StringDtype(storage="pyarrow"),
     )
     expected.index = expected.index.astype("Int64")
     pd.testing.assert_series_equal(result.dtypes, expected.dtypes)
@@ -137,6 +140,8 @@ def test_to_csv_index(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         dtype=dtype,
@@ -148,7 +153,6 @@ def test_to_csv_index(
     scalars_pandas_df = scalars_pandas_df.copy()
     scalars_pandas_df.index = scalars_pandas_df.index.astype("int64")
 
-    # Ordering should be maintained for tables smaller than 1 GB.
     pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
 
@@ -174,6 +178,8 @@ def test_to_csv_tabs(
     dtype = scalars_df.reset_index().dtypes.to_dict()
     dtype.pop("geography_col")
     dtype.pop("rowindex")
+    # read_csv will decode into bytes improperly, convert_pandas_dtypes will encode properly from string
+    dtype.pop("bytes_col")
     gcs_df = pd.read_csv(
         path,
         sep="\t",
@@ -216,6 +222,8 @@ def test_to_gbq_index(scalars_dfs, dataset_id, index):
         df_out = df_out.sort_values("rowindex_2").reset_index(drop=True)
 
     convert_pandas_dtypes(df_out, bytes_col=False)
+    # pd.read_gbq interprets bytes_col as object, reconvert to pyarrow binary
+    df_out["bytes_col"] = df_out["bytes_col"].astype(pd.ArrowDtype(pa.binary()))
     expected = scalars_pandas_df.copy()
     expected.index.name = index_col
     pd.testing.assert_frame_equal(df_out, expected, check_index_type=False)
@@ -377,7 +385,9 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index):
         scalars_pandas_df.index = scalars_pandas_df.index.astype("Int64")
 
     # Ordering should be maintained for tables smaller than 1 GB.
-    pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df)
+    pd.testing.assert_frame_equal(
+        gcs_df.drop("bytes_col", axis=1), scalars_pandas_df.drop("bytes_col", axis=1)
+    )
 
 
 def test_to_sql_query_unnamed_index_included(
diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py
index 1708735f4c..2d4e1f0204 100644
--- a/tests/system/small/test_multiindex.py
+++ b/tests/system/small/test_multiindex.py
@@ -394,14 +394,17 @@ def test_multi_index_dataframe_groupby_level_aggregate(
 def test_multi_index_dataframe_groupby_level_analytic(
     scalars_df_index, scalars_pandas_df_index, level, as_index
 ):
+    # Drop "numeric_col" as pandas doesn't support numerics for grouped window functions
     bf_result = (
-        scalars_df_index.set_index(["int64_too", "bool_col"])
+        scalars_df_index.drop("numeric_col", axis=1)
+        .set_index(["int64_too", "bool_col"])
         .groupby(level=level, as_index=as_index, dropna=False)
         .cumsum(numeric_only=True)
         .to_pandas()
     )
     pd_result = (
-        scalars_pandas_df_index.set_index(["int64_too", "bool_col"])
+        scalars_pandas_df_index.drop("numeric_col", axis=1)
+        .set_index(["int64_too", "bool_col"])
         .groupby(level=level, as_index=as_index, dropna=False)
         .cumsum(numeric_only=True)
     )
diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 623da74aa4..6f919f740f 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -24,7 +24,11 @@
 
 import bigframes.pandas
 import bigframes.series as series
-from tests.system.utils import assert_pandas_df_equal, assert_series_equal
+from tests.system.utils import (
+    assert_pandas_df_equal,
+    assert_series_equal,
+    skip_legacy_pandas,
+)
 
 
 def test_series_construct_copy(scalars_dfs):
@@ -81,14 +85,14 @@ def test_series_construct_from_list_escaped_strings():
     [
         ("bool_col", pd.BooleanDtype()),
         # TODO(swast): Use a more efficient type.
-        ("bytes_col", numpy.dtype("object")),
+        ("bytes_col", pd.ArrowDtype(pa.binary())),
         ("date_col", pd.ArrowDtype(pa.date32())),
         ("datetime_col", pd.ArrowDtype(pa.timestamp("us"))),
         ("float64_col", pd.Float64Dtype()),
         ("geography_col", gpd.array.GeometryDtype()),
         ("int64_col", pd.Int64Dtype()),
         # TODO(swast): Use a more efficient type.
- ("numeric_col", numpy.dtype("object")), + ("numeric_col", pd.ArrowDtype(pa.decimal128(38, 9))), ("int64_too", pd.Int64Dtype()), ("string_col", pd.StringDtype(storage="pyarrow")), ("time_col", pd.ArrowDtype(pa.time64("us"))), @@ -2519,8 +2523,12 @@ def test_mask_custom_value(scalars_dfs): ("int64_col", pd.Float64Dtype()), ("int64_col", "string[pyarrow]"), ("int64_col", "boolean"), + ("int64_col", pd.ArrowDtype(pa.decimal128(38, 9))), + ("int64_col", pd.ArrowDtype(pa.decimal256(76, 38))), ("bool_col", "Int64"), ("bool_col", "string[pyarrow]"), + ("string_col", "binary[pyarrow]"), + ("bytes_col", "string[pyarrow]"), # pandas actually doesn't let folks convert to/from naive timestamp and # raises a deprecation warning to use tz_localize/tz_convert instead, # but BigQuery always stores values as UTC and doesn't have to deal @@ -2538,6 +2546,7 @@ def test_mask_custom_value(scalars_dfs): # https://cloud.google.com/bigquery/docs/reference/standard-sql/conversion_functions ], ) +@skip_legacy_pandas def test_astype(scalars_df_index, scalars_pandas_df_index, column, to_type): bf_result = scalars_df_index[column].astype(to_type).to_pandas() pd_result = scalars_pandas_df_index[column].astype(to_type) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index e6eb40a5fa..8ce442376a 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -30,6 +30,7 @@ import bigframes.dataframe import bigframes.dtypes import bigframes.ml.linear_model +from tests.system.utils import skip_legacy_pandas FIRST_FILE = "000000000000" @@ -385,6 +386,7 @@ def test_read_pandas_tokyo( pd.testing.assert_frame_equal(result, expected) +@skip_legacy_pandas def test_read_csv_gcs_default_engine(session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs if scalars_df.index.name is not None: @@ -441,6 +443,7 @@ def test_read_csv_gcs_bq_engine(session, scalars_dfs, gcs_folder): pytest.param("\t", id="custom_sep"), ], ) +@skip_legacy_pandas def test_read_csv_local_default_engine(session, scalars_dfs, sep): scalars_df, scalars_pandas_df = scalars_dfs with tempfile.TemporaryDirectory() as dir: diff --git a/tests/system/utils.py b/tests/system/utils.py index f49b5ece31..a4647b4f51 100644 --- a/tests/system/utils.py +++ b/tests/system/utils.py @@ -14,11 +14,23 @@ import base64 import decimal +import functools import geopandas as gpd # type: ignore import numpy as np import pandas as pd import pyarrow as pa # type: ignore +import pytest + + +def skip_legacy_pandas(test): + @functools.wraps(test) + def wrapper(*args, **kwds): + if pd.__version__.startswith("1."): + pytest.skip("Skips pandas 1.x as not compatible with 2.x behavior.") + return test(*args, **kwds) + + return wrapper def assert_pandas_df_equal(df0, df1, ignore_order: bool = False, **kwargs): @@ -133,16 +145,28 @@ def convert_pandas_dtypes(df: pd.DataFrame, bytes_col: bool): df["geography_col"].replace({np.nan: None}) ) - # Convert bytes types column. - if bytes_col: + if bytes_col and not isinstance(df["bytes_col"].dtype, pd.ArrowDtype): df["bytes_col"] = df["bytes_col"].apply( lambda value: base64.b64decode(value) if not pd.isnull(value) else value ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["bytes_col"]), + schema=pa.schema([("bytes_col", pa.binary())]), + ) + df["bytes_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)["bytes_col"] - # Convert numeric types column. 
- df["numeric_col"] = df["numeric_col"].apply( - lambda value: decimal.Decimal(str(value)) if value else None # type: ignore - ) + if not isinstance(df["numeric_col"].dtype, pd.ArrowDtype): + # Convert numeric types column. + df["numeric_col"] = df["numeric_col"].apply( + lambda value: decimal.Decimal(str(value)) if value else None # type: ignore + ) + arrow_table = pa.Table.from_pandas( + pd.DataFrame(df, columns=["numeric_col"]), + schema=pa.schema([("numeric_col", pa.decimal128(38, 9))]), + ) + df["numeric_col"] = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)[ + "numeric_col" + ] def assert_pandas_df_equal_pca_components(actual, expected, **kwargs): diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index 6ceaaf911b..e648fd28cc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -31,11 +31,11 @@ # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types pytest.param( ibis_dtypes.Decimal(precision=76, scale=38, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal256(76, 38)), id="bignumeric", ), pytest.param(ibis_dtypes.boolean, pd.BooleanDtype(), id="bool"), - pytest.param(ibis_dtypes.binary, np.dtype("O"), id="bytes"), + pytest.param(ibis_dtypes.binary, pd.ArrowDtype(pa.binary()), id="bytes"), pytest.param(ibis_dtypes.date, pd.ArrowDtype(pa.date32()), id="date"), pytest.param( ibis_dtypes.Timestamp(), pd.ArrowDtype(pa.timestamp("us")), id="datetime" @@ -49,10 +49,9 @@ pytest.param(ibis_dtypes.int8, pd.Int64Dtype(), id="int8-as-int64"), pytest.param(ibis_dtypes.int64, pd.Int64Dtype(), id="int64"), # TODO(tswast): custom dtype (or at least string dtype) for JSON objects - pytest.param(ibis_dtypes.json, np.dtype("O"), id="json"), pytest.param( ibis_dtypes.Decimal(precision=38, scale=9, nullable=True), - np.dtype("O"), + pd.ArrowDtype(pa.decimal128(38, 9)), id="numeric", ), pytest.param( From 9c5012ec68275db83d1f6f7e743f5edaaaacd8cb Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 21 Dec 2023 23:41:45 +0000 Subject: [PATCH 15/27] docs: code samples for `drop` and `fillna` (#284) --- .../bigframes_vendored/pandas/core/frame.py | 141 ++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 81 ++++++++++ 2 files changed, 222 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 00be9e5e9e..427e586c52 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -869,6 +869,97 @@ def drop( Remove columns by directly specifying column names. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame(np.arange(12).reshape(3, 4), + ... columns=['A', 'B', 'C', 'D']) + >>> df + A B C D + 0 0 1 2 3 + 1 4 5 6 7 + 2 8 9 10 11 + + [3 rows x 4 columns] + + Drop columns: + + >>> df.drop(['B', 'C'], axis=1) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + >>> df.drop(columns=['B', 'C']) + A D + 0 0 3 + 1 4 7 + 2 8 11 + + [3 rows x 2 columns] + + Drop a row by index: + + >>> df.drop([0, 1]) + A B C D + 2 8 9 10 11 + + [1 rows x 4 columns] + + Drop columns and/or rows of MultiIndex DataFrame: + + >>> import pandas as pd + >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], + ... ['speed', 'weight', 'length']], + ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], + ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) + >>> df = bpd.DataFrame(index=midx, columns=['big', 'small'], + ... 
+            ...                    data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
+            ...                          [250, 150], [1.5, 0.8], [320, 250],
+            ...                          [1, 0.8], [0.3, 0.2]])
+            >>> df
+                             big  small
+            llama  speed    45.0   30.0
+                   weight  200.0  100.0
+                   length    1.5    1.0
+            cow    speed    30.0   20.0
+                   weight  250.0  150.0
+                   length    1.5    0.8
+            falcon speed   320.0  250.0
+                   weight    1.0    0.8
+                   length    0.3    0.2
+
+            [9 rows x 2 columns]
+
+            Drop a specific index and column combination from the MultiIndex
+            DataFrame, i.e., drop the index ``'cow'`` and column ``'small'``:
+
+            >>> df.drop(index='cow', columns='small')
+                              big
+            llama  speed    45.0
+                   weight  200.0
+                   length    1.5
+            falcon speed   320.0
+                   weight    1.0
+                   length    0.3
+
+            [6 rows x 1 columns]
+
+            >>> df.drop(index='length', level=1)
+                             big  small
+            llama  speed    45.0   30.0
+                   weight  200.0  100.0
+            cow    speed    30.0   20.0
+                   weight  250.0  150.0
+            falcon speed   320.0  250.0
+                   weight    1.0    0.8
+
+            [6 rows x 2 columns]
+
         Args:
             labels:
                 Index or column labels to drop.
@@ -4343,6 +4434,56 @@ def fillna(self, value):
         """
         Fill NA/NaN values using the specified method.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> df = bpd.DataFrame([[np.nan, 2, np.nan, 0],
+            ...                     [3, 4, np.nan, 1],
+            ...                     [np.nan, np.nan, np.nan, np.nan],
+            ...                     [np.nan, 3, np.nan, 4]],
+            ...                    columns=list("ABCD")).astype("Float64")
+            >>> df
+                  A     B     C     D
+            0  <NA>   2.0  <NA>   0.0
+            1   3.0   4.0  <NA>   1.0
+            2  <NA>  <NA>  <NA>  <NA>
+            3  <NA>   3.0  <NA>   4.0
+
+            [4 rows x 4 columns]
+
+            Replace all NA elements with 0s.
+
+            >>> df.fillna(0)
+                 A    B    C    D
+            0  0.0  2.0  0.0  0.0
+            1  3.0  4.0  0.0  1.0
+            2  0.0  0.0  0.0  0.0
+            3  0.0  3.0  0.0  4.0
+
+            [4 rows x 4 columns]
+
+            You can use fill values from another DataFrame:
+
+            >>> df_fill = bpd.DataFrame(np.arange(12).reshape(3, 4),
+            ...                         columns=['A', 'B', 'C', 'D'])
+            >>> df_fill
+               A  B   C   D
+            0  0  1   2   3
+            1  4  5   6   7
+            2  8  9  10  11
+
+            [3 rows x 4 columns]
+
+            >>> df.fillna(df_fill)
+                  A    B     C     D
+            0   0.0  2.0   2.0   0.0
+            1   3.0  4.0   6.0   1.0
+            2   8.0  9.0  10.0  11.0
+            3  <NA>  3.0  <NA>   4.0
+
+            [4 rows x 4 columns]
+
         Args:
             value (scalar, Series):
                 Value to use to fill holes (e.g. 0), alternately a
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 366f32c77e..01cc3a0500 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1062,6 +1062,55 @@ def drop(
 
         When using a multi-index, labels on different levels can be removed
         by specifying the level.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> s = bpd.Series(data=np.arange(3), index=['A', 'B', 'C'])
+            >>> s
+            A    0
+            B    1
+            C    2
+            dtype: Int64
+
+            Drop labels B and C:
+
+            >>> s.drop(labels=['B', 'C'])
+            A    0
+            dtype: Int64
+
+            Drop 2nd level label in MultiIndex Series:
+
+            >>> import pandas as pd
+            >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
+            ...                              ['speed', 'weight', 'length']],
+            ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
+            ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
+
+            >>> s = bpd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
+            ...                index=midx)
+            >>> s
+            llama   speed      45.0
+                    weight    200.0
+                    length      1.2
+            cow     speed      30.0
+                    weight    250.0
+                    length      1.5
+            falcon  speed     320.0
+                    weight      1.0
+                    length      0.3
+            dtype: Float64
+
+            >>> s.drop(labels='weight', level=1)
+            llama   speed      45.0
+                    length      1.2
+            cow     speed      30.0
+                    length      1.5
+            falcon  speed     320.0
+                    length      0.3
+            dtype: Float64
+
         Args:
             labels (single label or list-like):
                 Index labels to drop.
@@ -1193,6 +1242,38 @@ def fillna(
         """
         Fill NA/NaN values using the specified method.
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> s = bpd.Series([np.nan, 2, np.nan, -1])
+            >>> s
+            0    <NA>
+            1     2.0
+            2    <NA>
+            3    -1.0
+            dtype: Float64
+
+            Replace all NA elements with 0s.
+
+            >>> s.fillna(0)
+            0     0.0
+            1     2.0
+            2     0.0
+            3    -1.0
+            dtype: Float64
+
+            You can use fill values from another Series:
+
+            >>> s_fill = bpd.Series([11, 22, 33])
+            >>> s.fillna(s_fill)
+            0    11.0
+            1     2.0
+            2    33.0
+            3    -1.0
+            dtype: Float64
+
         Args:
             value (scalar, dict, Series, or DataFrame, default None):
                 Value to use to fill holes (e.g. 0).
From acc0eb7010951c8cfb91aecc45268b041217dd09 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Fri, 22 Dec 2023 02:45:36 +0000
Subject: [PATCH 16/27] docs: code samples for `reset_index` and `sort_values`
 (#282)

* docs: code samples for `reset_index` and `sort_values`

* fix alignment in dataframe api code samples
---
 .../bigframes_vendored/pandas/core/frame.py   | 161 ++++++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 110 ++++++++++++
 2 files changed, 271 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index 427e586c52..fb34193710 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1138,6 +1138,93 @@ def reset_index(
 
         Reset the index of the DataFrame, and use the default one instead.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> df = bpd.DataFrame([('bird', 389.0),
+            ...                     ('bird', 24.0),
+            ...                     ('mammal', 80.5),
+            ...                     ('mammal', np.nan)],
+            ...                    index=['falcon', 'parrot', 'lion', 'monkey'],
+            ...                    columns=('class', 'max_speed'))
+            >>> df
+                     class  max_speed
+            falcon    bird      389.0
+            parrot    bird       24.0
+            lion    mammal       80.5
+            monkey  mammal       <NA>
+
+            [4 rows x 2 columns]
+
+            When we reset the index, the old index is added as a column, and a new sequential index is used:
+
+            >>> df.reset_index()
+                index   class  max_speed
+            0  falcon    bird      389.0
+            1  parrot    bird       24.0
+            2    lion  mammal       80.5
+            3  monkey  mammal       <NA>
+
+            [4 rows x 3 columns]
+
+            We can use the ``drop`` parameter to avoid the old index being added as a column:
+
+            >>> df.reset_index(drop=True)
+                class  max_speed
+            0    bird      389.0
+            1    bird       24.0
+            2  mammal       80.5
+            3  mammal       <NA>
+
+            [4 rows x 2 columns]
+
+            You can also use ``reset_index`` with ``MultiIndex``.
+
+            >>> import pandas as pd
+            >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
+            ...                                    ('bird', 'parrot'),
+            ...                                    ('mammal', 'lion'),
+            ...                                    ('mammal', 'monkey')],
+            ...                                   names=['class', 'name'])
+            >>> columns = ['speed', 'max']
+            >>> df = bpd.DataFrame([(389.0, 'fly'),
+            ...                     (24.0, 'fly'),
+            ...                     (80.5, 'run'),
+            ...                     (np.nan, 'jump')],
+            ...                    index=index,
+            ...                    columns=columns)
+            >>> df
+                           speed   max
+            class  name
+            bird   falcon  389.0   fly
+                   parrot   24.0   fly
+            mammal lion     80.5   run
+                   monkey   <NA>  jump
+
+            [4 rows x 2 columns]
+
+            >>> df.reset_index()
+                class    name  speed   max
+            0    bird  falcon  389.0   fly
+            1    bird  parrot   24.0   fly
+            2  mammal    lion   80.5   run
+            3  mammal  monkey   <NA>  jump
+
+            [4 rows x 4 columns]
+
+            >>> df.reset_index(drop=True)
+               speed   max
+            0  389.0   fly
+            1   24.0   fly
+            2   80.5   run
+            3   <NA>  jump
+
+            [4 rows x 2 columns]
+
+
         Args:
             drop (bool, default False):
                 Do not try to insert index into dataframe columns. This resets
@@ -1347,6 +1434,80 @@ def sort_values(
     ) -> DataFrame:
         """Sort by the values along row axis.
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({
+            ...     'col1': ['A', 'A', 'B', bpd.NA, 'D', 'C'],
+            ...     'col2': [2, 1, 9, 8, 7, 4],
+            ...     'col3': [0, 1, 9, 4, 2, 3],
+            ...     'col4': ['a', 'B', 'c', 'D', 'e', 'F']
+            ... })
+            >>> df
+               col1  col2  col3 col4
+            0     A     2     0    a
+            1     A     1     1    B
+            2     B     9     9    c
+            3  <NA>     8     4    D
+            4     D     7     2    e
+            5     C     4     3    F
+
+            [6 rows x 4 columns]
+
+            Sort by col1:
+
+            >>> df.sort_values(by=['col1'])
+               col1  col2  col3 col4
+            0     A     2     0    a
+            1     A     1     1    B
+            2     B     9     9    c
+            5     C     4     3    F
+            4     D     7     2    e
+            3  <NA>     8     4    D
+
+            [6 rows x 4 columns]
+
+            Sort by multiple columns:
+
+            >>> df.sort_values(by=['col1', 'col2'])
+               col1  col2  col3 col4
+            1     A     1     1    B
+            0     A     2     0    a
+            2     B     9     9    c
+            5     C     4     3    F
+            4     D     7     2    e
+            3  <NA>     8     4    D
+
+            [6 rows x 4 columns]
+
+            Sort Descending:
+
+            >>> df.sort_values(by='col1', ascending=False)
+               col1  col2  col3 col4
+            4     D     7     2    e
+            5     C     4     3    F
+            2     B     9     9    c
+            0     A     2     0    a
+            1     A     1     1    B
+            3  <NA>     8     4    D
+
+            [6 rows x 4 columns]
+
+            Putting NAs first:
+
+            >>> df.sort_values(by='col1', ascending=False, na_position='first')
+               col1  col2  col3 col4
+            3  <NA>     8     4    D
+            4     D     7     2    e
+            5     C     4     3    F
+            2     B     9     9    c
+            0     A     2     0    a
+            1     A     1     1    B
+
+            [6 rows x 4 columns]
+
         Args:
             by (str or Sequence[str]):
                 Name or list of names to sort by.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 01cc3a0500..778ad68e0e 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -168,6 +168,53 @@ def reset_index(
         when the index is meaningless and needs to be reset to the default
         before another operation.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> s = bpd.Series([1, 2, 3, 4], name='foo',
+            ...                index=['a', 'b', 'c', 'd'])
+            >>> s.index.name = "idx"
+            >>> s
+            idx
+            a    1
+            b    2
+            c    3
+            d    4
+            Name: foo, dtype: Int64
+
+            Generate a DataFrame with default index.
+
+            >>> s.reset_index()
+              idx  foo
+            0   a    1
+            1   b    2
+            2   c    3
+            3   d    4
+
+            [4 rows x 2 columns]
+
+            To specify the name of the new column use ``name`` param.
+
+            >>> s.reset_index(name="bar")
+              idx  bar
+            0   a    1
+            1   b    2
+            2   c    3
+            3   d    4
+
+            [4 rows x 2 columns]
+
+            To generate a new Series with the default index set param ``drop=True``.
+
+            >>> s.reset_index(drop=True)
+            0    1
+            1    2
+            2    3
+            3    4
+            Name: foo, dtype: Int64
+
         Args:
             drop (bool, default False):
                 Just reset the index, without inserting it as a column in
@@ -699,6 +746,69 @@ def sort_values(
 
         Sort a Series in ascending or descending order by some criterion.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> s = bpd.Series([np.nan, 1, 3, 10, 5])
+            >>> s
+            0    <NA>
+            1     1.0
+            2     3.0
+            3    10.0
+            4     5.0
+            dtype: Float64
+
+            Sort values ascending order (default behaviour):
+
+            >>> s.sort_values(ascending=True)
+            1     1.0
+            2     3.0
+            4     5.0
+            3    10.0
+            0    <NA>
+            dtype: Float64
+
+            Sort values descending order:
+
+            >>> s.sort_values(ascending=False)
+            3    10.0
+            4     5.0
+            2     3.0
+            1     1.0
+            0    <NA>
+            dtype: Float64
+
+            Sort values putting NAs first:
+
+            >>> s.sort_values(na_position='first')
+            0    <NA>
+            1     1.0
+            2     3.0
+            4     5.0
+            3    10.0
+            dtype: Float64
+
+            Sort a series of strings:
+
+            >>> s = bpd.Series(['z', 'b', 'd', 'a', 'c'])
+            >>> s
+            0    z
+            1    b
+            2    d
+            3    a
+            4    c
+            dtype: string
+
+            >>> s.sort_values()
+            3    a
+            1    b
+            4    c
+            2    d
+            0    z
+            dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
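The NA ordering shown in the samples above can be spot-checked without a BigQuery session, since BigQuery DataFrames is expected to mirror pandas semantics here and roughly compiles `na_position` down to `NULLS FIRST`/`NULLS LAST` in the generated SQL. A minimal local sketch, assuming only pandas is installed:

```python
import pandas as pd

# Same data as the Series.sort_values sample above; the Float64 extension
# dtype prints missing values as <NA>, matching the doctest output.
s = pd.Series([None, 1, 3, 10, 5], dtype="Float64")

# Ascending sort places NAs last by default (SQL: ORDER BY ... NULLS LAST).
print(s.sort_values(ascending=True))

# na_position="first" moves NAs to the front (SQL: ORDER BY ... NULLS FIRST).
print(s.sort_values(na_position="first"))
```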
From ad51035bcf80d6a49f134df26624b578010b5b12 Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 26 Dec 2023 18:29:00 +0000
Subject: [PATCH 17/27] docs: code samples for `isna`, `isnull`, `dropna`,
 `isin` (#289)

* docs: code samples for `isna`, `isnull`, `dropna`, `isin`

* fix header alignment in rendering
---
 .../bigframes_vendored/pandas/core/frame.py   | 81 +++++++++++++++++++
 .../bigframes_vendored/pandas/core/generic.py | 65 +++++++++++++++
 .../bigframes_vendored/pandas/core/series.py  | 70 ++++++++++++++++
 3 files changed, 216 insertions(+)

diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py
index fb34193710..2de63b9103 100644
--- a/third_party/bigframes_vendored/pandas/core/frame.py
+++ b/third_party/bigframes_vendored/pandas/core/frame.py
@@ -1289,9 +1289,57 @@ def duplicated(self, subset=None, keep="first"):
 
     def dropna(
         self,
+        *,
+        axis: int | str = 0,
+        how: str = "any",
+        ignore_index=False,
     ) -> DataFrame:
         """Remove missing values.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> import numpy as np
+            >>> df = bpd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
+            ...                     "toy": [np.nan, 'Batmobile', 'Bullwhip'],
+            ...                     "born": [bpd.NA, "1940-04-25", bpd.NA]})
+            >>> df
+                   name        toy        born
+            0    Alfred       <NA>        <NA>
+            1    Batman  Batmobile  1940-04-25
+            2  Catwoman   Bullwhip        <NA>
+
+            [3 rows x 3 columns]
+
+            Drop the rows where at least one element is missing:
+
+            >>> df.dropna()
+                 name        toy        born
+            1  Batman  Batmobile  1940-04-25
+
+            [1 rows x 3 columns]
+
+            Drop the columns where at least one element is missing.
+
+            >>> df.dropna(axis='columns')
+                   name
+            0    Alfred
+            1    Batman
+            2  Catwoman
+
+            [3 rows x 1 columns]
+
+            Drop the rows where all elements are missing:
+
+            >>> df.dropna(how='all')
+                   name        toy        born
+            0    Alfred       <NA>        <NA>
+            1    Batman  Batmobile  1940-04-25
+            2  Catwoman   Bullwhip        <NA>
+
+            [3 rows x 3 columns]
+
         Args:
             axis ({0 or 'index', 1 or 'columns'}, default 'columns'):
                 Determine if rows or columns which contain missing values are
@@ -1318,6 +1366,39 @@ def isin(self, values):
         """
         Whether each element in the DataFrame is contained in values.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> df = bpd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
+            ...                    index=['falcon', 'dog'])
+            >>> df
+                    num_legs  num_wings
+            falcon         2          2
+            dog            4          0
+
+            [2 rows x 2 columns]
+
+            When ``values`` is a list check whether every value in the DataFrame is
+            present in the list (which animals have 0 or 2 legs or wings).
+
+            >>> df.isin([0, 2])
+                    num_legs  num_wings
+            falcon      True       True
+            dog        False       True
+
+            [2 rows x 2 columns]
+
+            When ``values`` is a dict, we can pass it to check for each column separately:
+
+            >>> df.isin({'num_wings': [0, 3]})
+                    num_legs  num_wings
+            falcon     False      False
+            dog        False       True
+
+            [2 rows x 2 columns]
+
         Args:
             values (iterable, or dict):
                 The result will only be true at a location if all the
diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py
index ca5c6344ce..2885162fd6 100644
--- a/third_party/bigframes_vendored/pandas/core/generic.py
+++ b/third_party/bigframes_vendored/pandas/core/generic.py
@@ -499,6 +499,71 @@ def isna(self) -> NDFrame:
         False values. Characters such as empty strings ``''`` or
         :attr:`numpy.inf` are not considered NA values.
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+            >>> import numpy as np
+
+            >>> df = bpd.DataFrame(dict(
+            ...         age=[5, 6, np.nan],
+            ...         born=[bpd.NA, "1940-04-25", "1940-04-25"],
+            ...         name=['Alfred', 'Batman', ''],
+            ...         toy=[None, 'Batmobile', 'Joker'],
+            ... ))
+            >>> df
+                age        born    name        toy
+            0   5.0        <NA>  Alfred       <NA>
+            1   6.0  1940-04-25  Batman  Batmobile
+            2  <NA>  1940-04-25              Joker
+
+            [3 rows x 4 columns]
+
+            Show which entries in a DataFrame are NA:
+
+            >>> df.isna()
+                 age   born   name    toy
+            0  False   True  False   True
+            1  False  False  False  False
+            2   True  False  False  False
+
+            [3 rows x 4 columns]
+
+            >>> df.isnull()
+                 age   born   name    toy
+            0  False   True  False   True
+            1  False  False  False  False
+            2   True  False  False  False
+
+            [3 rows x 4 columns]
+
+            Show which entries in a Series are NA:
+
+            >>> ser = bpd.Series([5, None, 6, np.nan, bpd.NA])
+            >>> ser
+            0     5.0
+            1    <NA>
+            2     6.0
+            3    <NA>
+            4    <NA>
+            dtype: Float64
+
+            >>> ser.isna()
+            0    False
+            1     True
+            2    False
+            3     True
+            4     True
+            dtype: boolean
+
+            >>> ser.isnull()
+            0    False
+            1     True
+            2    False
+            3     True
+            4     True
+            dtype: boolean
+
         Returns:
             Mask of bool values for each element that
             indicates whether an element is an NA value.
diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py
index 778ad68e0e..cbe0963051 100644
--- a/third_party/bigframes_vendored/pandas/core/series.py
+++ b/third_party/bigframes_vendored/pandas/core/series.py
@@ -1460,6 +1460,42 @@ def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series:
         """
         Return a new Series with missing values removed.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            Drop NA values from a Series:
+
+            >>> import numpy as np
+            >>> ser = bpd.Series([1., 2., np.nan])
+            >>> ser
+            0     1.0
+            1     2.0
+            2    <NA>
+            dtype: Float64
+
+            >>> ser.dropna()
+            0    1.0
+            1    2.0
+            dtype: Float64
+
+            Empty strings are not considered NA values. ``None`` is considered an NA value.
+
+            >>> ser = bpd.Series(['2', bpd.NA, '', None, 'I stay'], dtype='object')
+            >>> ser
+            0         2
+            1      <NA>
+            2
+            3      <NA>
+            4    I stay
+            dtype: string
+
+            >>> ser.dropna()
+            0         2
+            2
+            4    I stay
+            dtype: string
+
         Args:
             axis (0 or 'index'):
                 Unused. Parameter needed for compatibility with DataFrame.
@@ -2531,6 +2567,40 @@ def isin(self, values):
         the same. That is, if any form of NaN is present in values, all forms
         of NaN in the series will be considered a match. (though pandas may not)
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> s = bpd.Series(['llama', 'cow', 'llama', 'beetle', 'llama',
+            ...                 'hippo'], name='animal')
+            >>> s
+            0     llama
+            1       cow
+            2     llama
+            3    beetle
+            4     llama
+            5     hippo
+            Name: animal, dtype: string
+
+            >>> s.isin(['cow', 'llama'])
+            0     True
+            1     True
+            2     True
+            3    False
+            4     True
+            5    False
+            Name: animal, dtype: boolean
+
+            Strings and integers are distinct and are therefore not comparable:
+
+            >>> bpd.Series([1]).isin(['1'])
+            0    False
+            dtype: boolean
+            >>> bpd.Series([1.1]).isin(['1.1'])
+            0    False
+            dtype: boolean
+
         Args:
             values (list-like):
                 The sequence of values to test. Passing in a single string will raise a
From 0e1bbfc1055aff9757b5138907c11caab2f3965a Mon Sep 17 00:00:00 2001
From: Shobhit Singh
Date: Tue, 26 Dec 2023 19:10:15 +0000
Subject: [PATCH 18/27] docs: code samples for `Series.{add, replace, unique,
 T, transpose}` (#287)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Thank you for opening a Pull Request!
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `Series.add`: https://screenshot.googleplex.com/763p36yJKKvM5DY - `Series.replace`: https://screenshot.googleplex.com/9MHuQZnKakqjjJn - `Series.unique`: https://screenshot.googleplex.com/7BfuQE3bFcYASqu - `Series.T`: https://screenshot.googleplex.com/8cSYpwKXrYetsEg - `Series.transpose`: https://screenshot.googleplex.com/7mM2zBwxRiqfDUV Fixes internal issue 317297573 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 190 +++++++++++++++++- 2 files changed, 191 insertions(+), 3 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 2de63b9103..d7ecae102b 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -4371,7 +4371,7 @@ def stack(self, level=-1): BigQuery DataFrames does not support stack operations that would combine columns of different dtypes. - **Example:** + **Examples:** >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None @@ -4410,7 +4410,7 @@ def unstack(self, level=-1): If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). - **Example:** + **Examples:** >>> import bigframes.pandas as bpd >>> bpd.options.display.progress_bar = None diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index cbe0963051..b0a4cb8193 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -143,13 +143,51 @@ def name(self) -> Hashable: @property def T(self) -> Series: - """Return the transpose, which is by definition self.""" + """Return the transpose, which is by definition self. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: string + + >>> s.T + 0 Ant + 1 Bear + 2 Cow + dtype: string + + """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transpose(self) -> Series: """ Return the transpose, which is by definition self. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['Ant', 'Bear', 'Cow']) + >>> s + 0 Ant + 1 Bear + 2 Cow + dtype: string + + >>> s.transpose() + 0 Ant + 1 Bear + 2 Cow + dtype: string + Returns: Series: Series. """ @@ -539,6 +577,36 @@ def nunique(self) -> int: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def unique(self) -> Series: + """ + Return unique values of Series object. + + Uniques are returned in order of appearance. Hash table-based unique, + therefore does NOT sort. 
+
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> s = bpd.Series([2, 1, 3, 3], name='A')
+            >>> s
+            0    2
+            1    1
+            2    3
+            3    3
+            Name: A, dtype: Int64
+            >>> s.unique()
+            0    2
+            1    1
+            2    3
+            Name: A, dtype: Int64
+
+        Returns:
+            Series: The unique values returned as a Series.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
     def mode(self) -> Series:
         """
         Return the mode(s) of the Series.
@@ -1405,6 +1473,77 @@ def replace(
         This differs from updating with ``.loc`` or ``.iloc``, which require
         you to specify a location to update with some value.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> s = bpd.Series([1, 2, 3, 4, 5])
+            >>> s
+            0    1
+            1    2
+            2    3
+            3    4
+            4    5
+            dtype: Int64
+
+            >>> s.replace(1, 5)
+            0    5
+            1    2
+            2    3
+            3    4
+            4    5
+            dtype: Int64
+
+            You can replace a list of values:
+
+            >>> s.replace([1, 3, 5], -1)
+            0    -1
+            1     2
+            2    -1
+            3     4
+            4    -1
+            dtype: Int64
+
+            You can use a replacement mapping:
+
+            >>> s.replace({1: 5, 3: 10})
+            0     5
+            1     2
+            2    10
+            3     4
+            4     5
+            dtype: Int64
+
+            With a string Series you can use a simple string replacement or a regex
+            replacement:
+
+            >>> s = bpd.Series(["Hello", "Another Hello"])
+            >>> s.replace("Hello", "Hi")
+            0               Hi
+            1    Another Hello
+            dtype: string
+
+            >>> s.replace("Hello", "Hi", regex=True)
+            0            Hi
+            1    Another Hi
+            dtype: string
+
+            >>> s.replace("^Hello", "Hi", regex=True)
+            0               Hi
+            1    Another Hello
+            dtype: string
+
+            >>> s.replace("Hello$", "Hi", regex=True)
+            0            Hi
+            1    Another Hi
+            dtype: string
+
+            >>> s.replace("[Hh]e", "__", regex=True)
+            0            __llo
+            1    Anot__r __llo
+            dtype: string
+
         Args:
             to_replace (str, regex, list, int, float or None):
                 How to find the values that will be replaced.
@@ -1702,6 +1841,55 @@ def add(self, other) -> Series:
         Equivalent to ``series + other``, but with support to substitute a fill_value for
         missing data in either one of the inputs.
 
+        **Examples:**
+
+            >>> import bigframes.pandas as bpd
+            >>> bpd.options.display.progress_bar = None
+
+            >>> a = bpd.Series([1, 2, 3, bpd.NA])
+            >>> a
+            0     1.0
+            1     2.0
+            2     3.0
+            3    <NA>
+            dtype: Float64
+
+            >>> b = bpd.Series([10, 20, 30, 40])
+            >>> b
+            0    10
+            1    20
+            2    30
+            3    40
+            dtype: Int64
+
+            >>> a.add(b)
+            0    11.0
+            1    22.0
+            2    33.0
+            3    <NA>
+            dtype: Float64
+
+            You can also use the mathematical operator ``+``:
+
+            >>> a + b
+            0    11.0
+            1    22.0
+            2    33.0
+            3    <NA>
+            dtype: Float64
+
+            Adding two Series with explicit indexes:
+
+            >>> a = bpd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
+            >>> b = bpd.Series([10, 20, 30, 40], index=['a', 'b', 'd', 'e'])
+            >>> a.add(b)
+            a      11
+            b      22
+            c    <NA>
+            d      34
+            e    <NA>
+            dtype: Int64
+
         Args:
 
             other (Series, or scalar value):
From 2cd64891170dcd4f2a709024a2993e36db210976 Mon Sep 17 00:00:00 2001
From: Huan Chen <142538604+Genesis929@users.noreply.github.com>
Date: Wed, 27 Dec 2023 16:36:15 -0800
Subject: [PATCH 19/27] feat: Add dataframe.to_html (#259)

Thank you for opening a Pull Request!

Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a
[bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose)
before writing your code!
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/296945119 --- bigframes/dataframe.py | 52 ++++++++ tests/system/small/test_dataframe.py | 9 ++ .../bigframes_vendored/pandas/core/frame.py | 124 ++++++++++++++++++ 3 files changed, 185 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 423c2bcaac..ab0006ea20 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2682,6 +2682,58 @@ def to_string( encoding, ) + def to_html( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | None = None, + table_id: str | None = None, + render_links: bool = False, + encoding: str | None = None, + ) -> str: + return self.to_pandas().to_html( + buf, + columns, # type: ignore + col_space, + header, + index, + na_rep, + formatters, + float_format, + sparsify, + index_names, + justify, # type: ignore + max_rows, + max_cols, + show_dimensions, + decimal, + bold_rows, + classes, + escape, + notebook, + border, + table_id, + render_links, + encoding, + ) + def to_markdown( self, buf=None, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 86b8cfbe66..cb2e4f94fa 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3463,6 +3463,15 @@ def test_df_to_string(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result +def test_df_to_html(scalars_df_index, scalars_pandas_df_index): + unsupported = ["numeric_col"] # formatted differently + + bf_result = scalars_df_index.drop(columns=unsupported).to_html() + pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() + + assert bf_result == pd_result + + def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 bf_result = scalars_df_index.dropna().to_markdown() diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index d7ecae102b..f2de8fcb6a 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -685,6 +685,130 @@ def to_string( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_html( + self, + buf=None, + columns: Sequence[str] | None = None, + col_space=None, + header: bool = True, + index: bool = True, + na_rep: str = "NaN", + formatters=None, + float_format=None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + bold_rows: bool = True, + classes: str | list | tuple | None = None, + escape: bool = True, + notebook: bool = False, + border: int | None = None, + table_id: str | None = None, + render_links: bool = 
False, + encoding: str | None = None, + ): + """Render a DataFrame as an HTML table. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> print(df.to_html()) + <table border="1" class="dataframe"> + <thead> + <tr style="text-align: right;"> + <th></th> + <th>col1</th> + <th>col2</th> + </tr> + </thead> + <tbody> + <tr> + <th>0</th> + <td>1</td> + <td>3</td> + </tr> + <tr> + <th>1</th> + <td>2</td> + <td>4</td> + </tr> + </tbody> + </table>
+ + Args: + buf (str, Path or StringIO-like, optional, default None): + Buffer to write to. If None, the output is returned as a string. + columns (sequence, optional, default None): + The subset of columns to write. Writes all columns by default. + col_space (str or int, list or dict of int or str, optional): + The minimum width of each column in CSS length units. An int is + assumed to be px units. + header (bool, optional): + Whether to print column labels, default True. + index (bool, optional, default True): + Whether to print index (row) labels. + na_rep (str, optional, default 'NaN'): + String representation of NaN to use. + formatters (list, tuple or dict of one-param. functions, optional): + Formatter functions to apply to columns' elements by position or + name. + The result of each function must be a unicode string. + List/tuple must be of length equal to the number of columns. + float_format (one-parameter function, optional, default None): + Formatter function to apply to columns' elements if they are + floats. This function must return a unicode string and will + be applied only to the non-NaN elements, with NaN being + handled by na_rep. + sparsify (bool, optional, default True): + Set to False for a DataFrame with a hierarchical index to print + every multiindex key at each row. + index_names (bool, optional, default True): + Prints the names of the indexes. + justify (str, default None): + How to justify the column labels. If None uses the option from + the print configuration (controlled by set_option), 'right' out + of the box. Valid values are 'left', 'right', 'center', 'justify', + 'justify-all', 'start', 'end', 'inherit', 'match-parent', 'initial', + 'unset'. + max_rows (int, optional): + Maximum number of rows to display in the console. + max_cols (int, optional): + Maximum number of columns to display in the console. + show_dimensions (bool, default False): + Display DataFrame dimensions (number of rows by number of columns). + decimal (str, default '.'): + Character recognized as decimal separator, e.g. ',' in Europe. + bold_rows (bool, default True): + Make the row labels bold in the output. + classes (str or list or tuple, default None): + CSS class(es) to apply to the resulting html table. + escape (bool, default True): + Convert the characters <, >, and & to HTML-safe sequences. + notebook (bool, default False): + Whether the generated HTML is for IPython Notebook. + border (int): + A border=border attribute is included in the opening + <table> tag. Default pd.options.display.html.border. + table_id (str, optional): + A css id is included in the opening <table>
tag if specified. + render_links (bool, default False): + Convert URLs to HTML links. + encoding (str, default "utf-8"): + Set character encoding. + + Returns: + str or None: If buf is None, returns the result as a string. Otherwise + returns None. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def to_markdown( self, buf=None, From ac1a745ddce9865f4585777b43c2234b9bf2841d Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 28 Dec 2023 18:00:22 +0000 Subject: [PATCH 20/27] fix: exclude pandas 2.2.0rc0 to unblock prerelease tests (#292) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 317908521 🦕 --- noxfile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/noxfile.py b/noxfile.py index c4bbd7a65a..1d3624005a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -518,9 +518,13 @@ def prerelease(session: nox.sessions.Session, tests_path): "--prefer-binary", "--pre", "--upgrade", - # TODO(shobs): Remove tying to version 2.1.3 after - # https://github.com/pandas-dev/pandas/issues/56463 is resolved - "pandas!=2.1.4", + # TODO(shobs): Remove excluding version 2.1.4 after + # https://github.com/pandas-dev/pandas/issues/56463 is resolved. + # + # TODO(shobs): Remove excluding version 2.2.0rc0 after + # https://github.com/pandas-dev/pandas/issues/56646 and + # https://github.com/pandas-dev/pandas/issues/56651 are resolved. + "pandas!=2.1.4,!=2.2.0rc0", ) already_installed.add("pandas") From 252f3a2a0e1296c7d786acdc0bdebe9e4a9ae1be Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 28 Dec 2023 10:54:15 -0800 Subject: [PATCH 21/27] docs: fix the rendering for `get_dummies` (#291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - docs: https://screenshot.googleplex.com/8X53mhLdQb2dQsd Fixes internal issue 317915956 🦕 --- third_party/bigframes_vendored/pandas/core/reshape/encoding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py index da92b58f50..b7f67473ea 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/encoding.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/encoding.py @@ -25,6 +25,7 @@ def get_dummies( prepended to the value. **Examples:** + >>> import bigframes.pandas as pd >>> pd.options.display.progress_bar = None >>> s = pd.Series(list('abca')) From 746115d5564c95bc3c4a5309c99e7a29e535e6fe Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Thu, 28 Dec 2023 12:42:15 -0800 Subject: [PATCH 22/27] Fix: Update dataframe.to_gbq to dedup column names. (#286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 14 ++++---- tests/system/small/test_dataframe_io.py | 44 +++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 6 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ab0006ea20..595670b0b6 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2759,26 +2759,28 @@ def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: """Create query text representing this dataframe for I/O.""" array_value = self._block.expr + + new_col_labels, new_idx_labels = utils.get_standardized_ids( + self._block.column_labels, self.index.names + ) + columns = list(self._block.value_columns) - column_labels = list(self._block.column_labels) + column_labels = new_col_labels # This code drops unnamed indexes to keep consistent with the behavior of # most pandas write APIs. The exception is `pandas.to_csv`, which keeps # unnamed indexes as `Unnamed: 0`. # TODO(chelsealin): check if works for multiple indexes. if index and self.index.name is not None: columns.extend(self._block.index_columns) - column_labels.extend(self.index.names) + column_labels.extend(new_idx_labels) else: array_value = array_value.drop_columns(self._block.index_columns) # Make columns in SQL reflect _labels_ not _ids_. Note: This may use # the arbitrary unicode column labels feature in BigQuery, which is # currently (June 2023) in preview. - # TODO(swast): Handle duplicate and NULL labels. 
id_overrides = { - col_id: col_label - for col_id, col_label in zip(columns, column_labels) - if col_label and isinstance(col_label, str) + col_id: col_label for col_id, col_label in zip(columns, column_labels) } if ordering_id is not None: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 59864e483e..6f1b31b48e 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -273,6 +273,50 @@ def test_to_gbq_if_exists( ) +def test_to_gbq_w_duplicate_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API when dealing with duplicate column names.""" + destination_table = f"{dataset_id}.test_to_gbq_w_duplicate_column_names" + + # Renaming 'int64_too' to 'int64_col', which will result in 'int64_too' + # becoming 'int64_col_1' after deduplication. + scalars_df_index = scalars_df_index.rename(columns={"int64_too": "int64_col"}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_too"], + bf_result["int64_col_1"], + check_names=False, + ) + + +def test_to_gbq_w_None_column_names( + scalars_df_index, scalars_pandas_df_index, dataset_id +): + """Test the `to_gbq` API with None as a column name.""" + destination_table = f"{dataset_id}.test_to_gbq_w_none_column_names" + + scalars_df_index = scalars_df_index.rename(columns={"int64_too": None}) + scalars_df_index.to_gbq(destination_table, if_exists="replace") + + bf_result = bpd.read_gbq(destination_table, index_col="rowindex").to_pandas() + + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_col"], bf_result["int64_col"] + ) + pd.testing.assert_series_equal( + scalars_pandas_df_index["int64_too"], + bf_result["bigframes_unnamed_column"], + check_names=False, + ) + + def test_to_gbq_w_invalid_destination_table(scalars_df_index): with pytest.raises(ValueError): scalars_df_index.to_gbq("table_id") From 7cbc2b0ba572d11778ba7caf7c95b7fb8f3a31a7 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 29 Dec 2023 00:06:19 +0000 Subject: [PATCH 23/27] docs: code samples for `Series.{map, to_list, count}` (#290) docs: code samples for `DataFrame.copy` and `Series.copy` --- .../bigframes_vendored/pandas/core/generic.py | 61 ++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 70 +++++++++++++++++++ 2 files changed, 131 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 2885162fd6..c079cbff7f 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -448,6 +448,67 @@ def copy(self): and indices. Modifications to the data or indices of the copy will not be reflected in the original object. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Modification in the original Series will not affect the copy Series: + + >>> s = bpd.Series([1, 2], index=["a", "b"]) + >>> s + a 1 + b 2 + dtype: Int64 + + >>> s_copy = s.copy() + >>> s_copy + a 1 + b 2 + dtype: Int64 + + >>> s.loc['b'] = 22 + >>> s + a 1 + b 22 + dtype: Int64 + >>> s_copy + a 1 + b 2 + dtype: Int64 + + Modification in the original DataFrame will not affect the copy DataFrame: + + >>> df = bpd.DataFrame({'a': [1, 3], 'b': [2, 4]}) + >>> df + a b + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + >>> df_copy = df.copy() + >>> df_copy + a b + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + + >>> df.loc[df["b"] == 2, "b"] = 22 + >>> df + a b + 0 1 22.0 + 1 3 4.0 + + [2 rows x 2 columns] + >>> df_copy + a b + 0 1 2 + 1 3 4 + + [2 rows x 2 columns] + Returns: Object type matches caller. """ diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b0a4cb8193..b97f9018dd 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -433,6 +433,21 @@ def tolist(self) -> list: (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period). + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: Int64 + + >>> s.to_list() + [1, 2, 3] + Returns: list: list of the values """ @@ -560,6 +575,20 @@ def count(self): """ Return number of non-NA/null observations in the Series. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.0, 1.0, bpd.NA]) + >>> s + 0 0.0 + 1 1.0 + 2 + dtype: Float64 + >>> s.count() + 2 + Returns: int or Series (if level specified): Number of non-null values in the Series. @@ -2845,6 +2874,47 @@ def map( ``__missing__`` (i.e. provide a method for default values). These are treated the same as ``dict``. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['cat', 'dog', bpd.NA, 'rabbit']) + >>> s + 0 cat + 1 dog + 2 + 3 rabbit + dtype: string + + `map` accepts a `dict`. Values that are not found in the `dict` are + converted to `NA`: + + >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) + 0 kitten + 1 puppy + 2 + 3 + dtype: string + + It also accepts a remote function: + + >>> @bpd.remote_function([str], str) + ... def my_mapper(val): + ... vowels = ["a", "e", "i", "o", "u"] + ... if val: + ... return "".join([ + ... ch.upper() if ch in vowels else ch for ch in val + ... ]) + ... return "N/A" + + >>> s.map(my_mapper) + 0 cAt + 1 dOg + 2 N/A + 3 rAbbIt + dtype: string + Args: arg (function, Mapping, Series): remote function, collections.abc.Mapping subclass or Series From 64bdf7622f3b5a5b5ec9176b1558a9cd4b7a756a Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Tue, 2 Jan 2024 11:20:16 -0800 Subject: [PATCH 24/27] chore: stop using deprecated ibis relabel method (#297) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code!
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/core/compile/compiled.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 199c8db785..524699290b 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -346,7 +346,9 @@ def _to_ibis_expr( table = table.filter(base_table[PREDICATE_COLUMN]) table = table.drop(*columns_to_drop) if col_id_overrides: - table = table.relabel(col_id_overrides) + table = table.rename( + {value: key for key, value in col_id_overrides.items()} + ) if fraction is not None: table = table.filter(ibis.random() < ibis.literal(fraction)) return table @@ -1194,7 +1196,9 @@ def _to_ibis_expr( table = table.filter(base_table[PREDICATE_COLUMN]) table = table.drop(*columns_to_drop) if col_id_overrides: - table = table.relabel(col_id_overrides) + table = table.rename( + {value: key for key, value in col_id_overrides.items()} + ) if fraction is not None: table = table.filter(ibis.random() < ibis.literal(fraction)) return table From c2b1892825545a34ce4ed5b0ef99e99348466108 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 2 Jan 2024 20:28:16 +0000 Subject: [PATCH 25/27] docs: code samples for `sample`, `get`, `Series.round` (#295) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEGIN_COMMIT_OVERRIDE docs: code samples for `sample`, `get`, `Series.round` (#295) docs: code samples for DataFrame `set_index`, `items` (#295) END_COMMIT_OVERRIDE Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `DataFrame.sample`, `Series.sample`: https://screenshot.googleplex.com/kPy5swVACMeBhSo - `DataFrame.get`, `Series.get`: https://screenshot.googleplex.com/7hirn5oz2b4L6B3 - `DataFrame.set_index`: https://screenshot.googleplex.com/3CXARrp5hwV6gau - `DataFrame.items`: https://screenshot.googleplex.com/bk3HAiXZQq3TYD9 - `Series.round`: https://screenshot.googleplex.com/C9c4m84NWNMnAwS Fixes internal issues 318011542 and 318011745 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 78 ++++++++++++++- .../bigframes_vendored/pandas/core/generic.py | 94 +++++++++++++++++++ .../bigframes_vendored/pandas/core/series.py | 19 ++++ 3 files changed, 189 insertions(+), 2 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index f2de8fcb6a..9259d14bab 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1187,6 +1187,47 @@ def set_index( Set the DataFrame index (row labels) using one existing column. The index can replace the existing index. 
+ **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'month': [1, 4, 7, 10], + ... 'year': [2012, 2014, 2013, 2014], + ... 'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 + + [4 rows x 3 columns] + + Set the 'month' column to become the index: + + >>> df.set_index('month') + year sale + month + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 + + [4 rows x 2 columns] + + Create a MultiIndex using columns 'year' and 'month': + + >>> df.set_index(['year', 'month']) + sale + year month + 2012 1 55 + 2014 4 40 + 2013 7 84 + 2014 10 31 + + [4 rows x 1 columns] + Args: keys: A label. This parameter can be a single column key. @@ -1621,6 +1662,39 @@ def items(self): Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + + [3 rows x 2 columns] + + >>> for label, content in df.items(): + ... print(f'--> label: {label}') + ... print(f'--> content:\\n{content}') + ... + --> label: species + --> content: + panda bear + polar bear + koala marsupial + Name: species, dtype: string + --> label: population + --> content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: Int64 + Returns: Iterator: Iterator of label, Series for each column. """ @@ -4587,7 +4661,7 @@ def index(self): ... 'Location': ['Seattle', 'New York', 'Kona']}, ... index=([10, 20, 30])) >>> df - Name Age Location + Name Age Location 10 Alice 25 Seattle 20 Bob 30 New York 30 Aritra 35 Kona @@ -4603,7 +4677,7 @@ def index(self): >>> df1 = df.set_index(["Name", "Location"]) >>> df1 - Age + Age Name Location Alice Seattle 25 Bob New York 30 diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index c079cbff7f..bc31e02263 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -254,6 +254,55 @@ def get(self, key, default=None): Returns default value if not found. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame( + ... [ + ... [24.3, 75.7, "high"], + ... [31, 87.8, "high"], + ... [22, 71.6, "medium"], + ... [35, 95, "medium"], + ... ], + ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"], + ... index=["2014-02-12", "2014-02-13", "2014-02-14", "2014-02-15"], + ... ) + >>> df + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + [4 rows x 3 columns] + + >>> df.get(["temp_celsius", "windspeed"]) + temp_celsius windspeed + 2014-02-12 24.3 high + 2014-02-13 31.0 high + 2014-02-14 22.0 medium + 2014-02-15 35.0 medium + + [4 rows x 2 columns] + + >>> ser = df['windspeed'] + >>> ser + 2014-02-12 high + 2014-02-13 high + 2014-02-14 medium + 2014-02-15 medium + Name: windspeed, dtype: string + >>> ser.get('2014-02-13') + 'high' + + If the key is not found, the default value will be used. 
+ + >>> df.get(["temp_celsius", "temp_kelvin"]) + >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value") + 'default_value' + Args: key: object @@ -410,6 +459,51 @@ def sample( You can use `random_state` for reproducibility. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'num_legs': [2, 4, 8, 0], + ... 'num_wings': [2, 0, 0, 0], + ... 'num_specimen_seen': [10, 2, 1, 8]}, + ... index=['falcon', 'dog', 'spider', 'fish']) + >>> df + num_legs num_wings num_specimen_seen + falcon 2 2 10 + dog 4 0 2 + spider 8 0 1 + fish 0 0 8 + + [4 rows x 3 columns] + + Fetch one random row from the DataFrame (Note that we use `random_state` + to ensure reproducibility of the examples): + + >>> df.sample(random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + + [1 rows x 3 columns] + + A random 50% sample of the DataFrame: + + >>> df.sample(frac=0.5, random_state=1) + num_legs num_wings num_specimen_seen + dog 4 0 2 + fish 0 0 8 + + [2 rows x 3 columns] + + Extract 3 random elements from the Series `df['num_legs']`: + + >>> s = df['num_legs'] + >>> s.sample(n=3, random_state=1) + dog 4 + fish 0 + spider 8 + Name: num_legs, dtype: Int64 + Args: n (Optional[int], default None): Number of items from axis to return. Cannot be used with `frac`. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index b97f9018dd..1ee1a8d5b5 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -722,6 +722,25 @@ def round(self, decimals: int = 0) -> Series: """ Round each value in a Series to the given number of decimals. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([0.1, 1.3, 2.7]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: Float64 + + >>> s = bpd.Series([0.123, 1.345, 2.789]) + >>> s.round(decimals=2) + 0 0.12 + 1 1.34 + 2 2.79 + dtype: Float64 + Args: decimals (int, default 0): Number of decimal places to round to. If decimals is negative, From eb69f60db52544882fb06c2d5fa0e41226dfe93f Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 2 Jan 2024 23:44:39 +0000 Subject: [PATCH 26/27] docs: code samples for `DataFrame.rename`, `Series.rename` (#293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BEGIN_COMMIT_OVERRIDE docs: code samples for `rename` , `size` (#293) docs: code samples for `Series.{name, std, agg}` (#293) END_COMMIT_OVERRIDE Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - `DataFrame.size`: https://screenshot.googleplex.com/55MHXNuAamdfbud - `Series.size`: https://screenshot.googleplex.com/5ve4T8UJq2TUiWb - `DataFrame.rename`: https://screenshot.googleplex.com/7eWsfcz8tmLx4pY - `Series.rename`: https://screenshot.googleplex.com/3HbXTxCaJVsbEzs - `Series.name`: https://screenshot.googleplex.com/7FpNDWJEyiqGLpN - `Series.std`: https://screenshot.googleplex.com/4RSTC8s2tYYK5cW - `Series.agg`: https://screenshot.googleplex.com/63TmACx23TPJu2K Fixes internal issues 317997641 and 317998300 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 24 ++++ .../bigframes_vendored/pandas/core/generic.py | 13 ++ .../bigframes_vendored/pandas/core/series.py | 127 ++++++++++++++++++ 3 files changed, 164 insertions(+) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 9259d14bab..c3794c550e 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1146,6 +1146,30 @@ def rename( Dict values must be unique (1-to-1). Labels not contained in a dict will be left as-is. Extra labels listed don't throw an error. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + + Rename columns using a mapping: + + >>> df.rename(columns={"A": "col1", "B": "col2"}) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + + [3 rows x 2 columns] + Args: columns (Mapping): Dict-like from old column labels to new column labels. diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index bc31e02263..72b947f96c 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -29,6 +29,19 @@ def ndim(self) -> int: def size(self) -> int: """Return an int representing the number of elements in this object. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.size + 3 + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df.size + 4 + Returns: int: Return the number of rows if Series. Otherwise return the number of rows times number of columns if DataFrame. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 1ee1a8d5b5..98c4fcdd44 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -135,6 +135,35 @@ def name(self) -> Hashable: to form a DataFrame. It is also used whenever displaying the Series using the interpreter. 
+ + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + For a Series: + + >>> s = bpd.Series([1, 2, 3], dtype="Int64", name='Numbers') + >>> s + 0 1 + 1 2 + 2 3 + Name: Numbers, dtype: Int64 + >>> s.name + 'Numbers' + + If the Series is part of a DataFrame: + + >>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + [2 rows x 2 columns] + >>> s = df["col1"] + >>> s.name + 'col1' + Returns: hashable object: The name of the Series, also the column name if part of a DataFrame. @@ -560,6 +589,27 @@ def agg(self, func): """ Aggregate using one or more operations over the specified axis. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: Int64 + + >>> s.agg('min') + 1 + + >>> s.agg(['min', 'max']) + min 1.0 + max 4.0 + dtype: Float64 + Args: func (function): Function to use for aggregating the data. @@ -2292,6 +2342,29 @@ def std( Normalized by N-1 by default. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> df = bpd.DataFrame({'person_id': [0, 1, 2, 3], + ... 'age': [21, 25, 62, 43], + ... 'height': [1.61, 1.87, 1.49, 2.01]} + ... ).set_index('person_id') + >>> df + age height + person_id + 0 21 1.61 + 1 25 1.87 + 2 62 1.49 + 3 43 2.01 + + [4 rows x 2 columns] + + >>> df.std() + age 18.786076 + height 0.237417 + dtype: Float64 Returns ------- @@ -2649,6 +2722,34 @@ def rename(self, index, **kwargs) -> Series | None: Alternatively, change ``Series.name`` with a scalar value. + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: Int64 + + You can change the Series name by specifying a string scalar: + + >>> s.rename("my_name") + 0 1 + 1 2 + 2 3 + Name: my_name, dtype: Int64 + + You can change the labels by specifying a mapping: + + >>> s.rename({1: 3, 2: 5}) + 0 1 + 3 2 + 5 3 + dtype: Int64 + Args: index (scalar, hashable sequence, dict-like or function optional): Functions or dict-like are transformations to apply to @@ -2990,3 +3091,29 @@ def values(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def size(self) -> int: + """Return the number of elements in the underlying data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + For Series: + + >>> s = bpd.Series({'a': 1, 'b': 2, 'c': 3}) + >>> s.size + 3 + + For Index: + + >>> idx = bpd.Index(bpd.Series([1, 2, 3])) + >>> idx.size + 3 + + Returns: + int: Return the number of elements in the underlying data.
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 655178ad71c2b64f720d0d195813a97889c38f5a Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Wed, 3 Jan 2024 11:29:36 -0800 Subject: [PATCH 27/27] chore(main): release 0.18.0 (#279) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c49c5b63b..77a6576ee0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,49 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.18.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.17.0...v0.18.0) (2024-01-02) + + +### Features + +* Add dataframe.to_html ([#259](https://github.com/googleapis/python-bigquery-dataframes/issues/259)) ([2cd6489](https://github.com/googleapis/python-bigquery-dataframes/commit/2cd64891170dcd4f2a709024a2993e36db210976)) +* Add IntervalIndex support to bigframes.pandas.cut ([#254](https://github.com/googleapis/python-bigquery-dataframes/issues/254)) ([6c1969a](https://github.com/googleapis/python-bigquery-dataframes/commit/6c1969a35fe720cf3a804006bcc9046ba554fcc3)) +* Add replace method to DataFrame ([#261](https://github.com/googleapis/python-bigquery-dataframes/issues/261)) ([5092215](https://github.com/googleapis/python-bigquery-dataframes/commit/5092215767d77c90b132e9cd6b3e3749827ebe09)) +* Specific pyarrow mappings for decimal, bytes types ([#283](https://github.com/googleapis/python-bigquery-dataframes/issues/283)) ([a1c0631](https://github.com/googleapis/python-bigquery-dataframes/commit/a1c06319ab0e3697c3175112490488002bb344c0)) + + +### Bug Fixes + +* Dataframes to_gbq now creates dataset if it doesn't exist ([#222](https://github.com/googleapis/python-bigquery-dataframes/issues/222)) ([bac62f7](https://github.com/googleapis/python-bigquery-dataframes/commit/bac62f76af1af6ca8834c3690c7c79aeb12dd331)) +* Exclude pandas 2.2.0rc0 to unblock prerelease tests ([#292](https://github.com/googleapis/python-bigquery-dataframes/issues/292)) ([ac1a745](https://github.com/googleapis/python-bigquery-dataframes/commit/ac1a745ddce9865f4585777b43c2234b9bf2841d)) +* Fix DataFrameGroupby.agg() issue with as_index=False ([#273](https://github.com/googleapis/python-bigquery-dataframes/issues/273)) ([ab49350](https://github.com/googleapis/python-bigquery-dataframes/commit/ab493506e71ed8970a11fe2f88b2145150e09291)) +* Make `Series.str.replace` work for simple strings ([#285](https://github.com/googleapis/python-bigquery-dataframes/issues/285)) ([ad67465](https://github.com/googleapis/python-bigquery-dataframes/commit/ad6746569b3af11be9d40805a1449ee1e89288dc)) +* Update dataframe.to_gbq to dedup column names. 
([#286](https://github.com/googleapis/python-bigquery-dataframes/issues/286)) ([746115d](https://github.com/googleapis/python-bigquery-dataframes/commit/746115d5564c95bc3c4a5309c99e7a29e535e6fe)) +* Use setuptools.find_namespace_packages ([#246](https://github.com/googleapis/python-bigquery-dataframes/issues/246)) ([9ec352a](https://github.com/googleapis/python-bigquery-dataframes/commit/9ec352a338f11d82aee9cd665ffb0e6e97cb391b)) + + +### Dependencies + +* Migrate to `ibis-framework >= "7.1.0"` ([#53](https://github.com/googleapis/python-bigquery-dataframes/issues/53)) ([9798a2b](https://github.com/googleapis/python-bigquery-dataframes/commit/9798a2b14dffb20432f732343cac92341e42fe09)) + + +### Documentation + +* Add code snippets for explore query result page ([#278](https://github.com/googleapis/python-bigquery-dataframes/issues/278)) ([7cbbb7d](https://github.com/googleapis/python-bigquery-dataframes/commit/7cbbb7d4608d8b7d1a360b2fe2d39d89a52f9546)) +* Code samples for `astype` common to DataFrame and Series ([#280](https://github.com/googleapis/python-bigquery-dataframes/issues/280)) ([95b673a](https://github.com/googleapis/python-bigquery-dataframes/commit/95b673aeb1545744e4b1a353cf1f4d0202d8a1b2)) +* Code samples for `DataFrame.copy` and `Series.copy` ([#290](https://github.com/googleapis/python-bigquery-dataframes/issues/290)) ([7cbc2b0](https://github.com/googleapis/python-bigquery-dataframes/commit/7cbc2b0ba572d11778ba7caf7c95b7fb8f3a31a7)) +* Code samples for `drop` and `fillna` ([#284](https://github.com/googleapis/python-bigquery-dataframes/issues/284)) ([9c5012e](https://github.com/googleapis/python-bigquery-dataframes/commit/9c5012ec68275db83d1f6f7e743f5edaaaacd8cb)) +* Code samples for `isna`, `isnull`, `dropna`, `isin` ([#289](https://github.com/googleapis/python-bigquery-dataframes/issues/289)) ([ad51035](https://github.com/googleapis/python-bigquery-dataframes/commit/ad51035bcf80d6a49f134df26624b578010b5b12)) +* Code samples for `rename` , `size` ([#293](https://github.com/googleapis/python-bigquery-dataframes/issues/293)) ([eb69f60](https://github.com/googleapis/python-bigquery-dataframes/commit/eb69f60db52544882fb06c2d5fa0e41226dfe93f)) +* Code samples for `reset_index` and `sort_values` ([#282](https://github.com/googleapis/python-bigquery-dataframes/issues/282)) ([acc0eb7](https://github.com/googleapis/python-bigquery-dataframes/commit/acc0eb7010951c8cfb91aecc45268b041217dd09)) +* Code samples for `sample`, `get`, `Series.round` ([#295](https://github.com/googleapis/python-bigquery-dataframes/issues/295)) ([c2b1892](https://github.com/googleapis/python-bigquery-dataframes/commit/c2b1892825545a34ce4ed5b0ef99e99348466108)) +* Code samples for `Series.{add, replace, unique, T, transpose}` ([#287](https://github.com/googleapis/python-bigquery-dataframes/issues/287)) ([0e1bbfc](https://github.com/googleapis/python-bigquery-dataframes/commit/0e1bbfc1055aff9757b5138907c11caab2f3965a)) +* Code samples for `Series.{map, to_list, count}` 
([#290](https://github.com/googleapis/python-bigquery-dataframes/issues/290)) ([7cbc2b0](https://github.com/googleapis/python-bigquery-dataframes/commit/7cbc2b0ba572d11778ba7caf7c95b7fb8f3a31a7)) +* Code samples for `Series.{name, std, agg}` ([#293](https://github.com/googleapis/python-bigquery-dataframes/issues/293)) ([eb69f60](https://github.com/googleapis/python-bigquery-dataframes/commit/eb69f60db52544882fb06c2d5fa0e41226dfe93f)) +* Code samples for `Series.groupby` and `Series.{sum,mean,min,max}` ([#280](https://github.com/googleapis/python-bigquery-dataframes/issues/280)) ([95b673a](https://github.com/googleapis/python-bigquery-dataframes/commit/95b673aeb1545744e4b1a353cf1f4d0202d8a1b2)) +* Code samples for DataFrame `set_index`, `items` ([#295](https://github.com/googleapis/python-bigquery-dataframes/issues/295)) ([c2b1892](https://github.com/googleapis/python-bigquery-dataframes/commit/c2b1892825545a34ce4ed5b0ef99e99348466108)) +* Fix the rendering for `get_dummies` ([#291](https://github.com/googleapis/python-bigquery-dataframes/issues/291)) ([252f3a2](https://github.com/googleapis/python-bigquery-dataframes/commit/252f3a2a0e1296c7d786acdc0bdebe9e4a9ae1be)) + ## [0.17.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.16.0...v0.17.0) (2023-12-14) diff --git a/bigframes/version.py b/bigframes/version.py index 04eac385f6..494335acd7 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.17.0" +__version__ = "0.18.0"
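For reference, the ibis API change behind PATCH 24/27 ("chore: stop using deprecated ibis relabel method") can be sketched in isolation. This is a minimal sketch, assuming ibis >= 7.x is installed; the in-memory table and the id/label names below are illustrative only and are not taken from the bigframes codebase:

    import ibis

    # An in-memory stand-in for the compiled BigQuery table (illustrative data).
    table = ibis.memtable({"col_0": [1, 2], "col_1": [3, 4]})

    # bigframes passes around {column_id: output_label} mappings (col_id_overrides).
    col_id_overrides = {"col_0": "rowindex", "col_1": "int64_col"}

    # Deprecated spelling: table.relabel(col_id_overrides), which took {old: new}.
    # The replacement, Table.rename, takes the inverse mapping, {new: old}, so the
    # dictionary is inverted at the call site, exactly as in the patch above.
    table = table.rename({value: key for key, value in col_id_overrides.items()})
    print(table.columns)  # ['rowindex', 'int64_col']

Because Table.rename expects {new_name: old_name}, the patch inverts col_id_overrides at each call site rather than changing how the mapping is built elsewhere in compiled.py.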