Skip to content

Commit 40d6960

Browse files
authored
feat: detect duplicate column/index names in read_gbq before sending the query. (#1615)
1 parent 6ad38e8 commit 40d6960

File tree

2 files changed

+80
-0
lines changed

2 files changed

+80
-0
lines changed

bigframes/session/loader.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,31 @@ def _to_index_cols(
8989
return index_cols
9090

9191

92+
def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
93+
index_cols_list = list(index_cols) if index_cols is not None else []
94+
columns_list = list(columns) if columns is not None else []
95+
set_index = set(index_cols_list)
96+
set_columns = set(columns_list)
97+
98+
if len(index_cols_list) > len(set_index):
99+
raise ValueError(
100+
"The 'index_col' argument contains duplicate names. "
101+
"All column names specified in 'index_col' must be unique."
102+
)
103+
104+
if len(columns_list) > len(set_columns):
105+
raise ValueError(
106+
"The 'columns' argument contains duplicate names. "
107+
"All column names specified in 'columns' must be unique."
108+
)
109+
110+
if not set_index.isdisjoint(set_columns):
111+
raise ValueError(
112+
"Found column names that exist in both 'index_col' and 'columns' arguments. "
113+
"These arguments must specify distinct sets of columns."
114+
)
115+
116+
92117
@dataclasses.dataclass
93118
class GbqDataLoader:
94119
"""
@@ -328,6 +353,7 @@ def read_gbq_table(
328353
table=table,
329354
index_col=index_col,
330355
)
356+
_check_column_duplicates(index_cols, columns)
331357

332358
for key in index_cols:
333359
if key not in table_column_names:
@@ -569,6 +595,7 @@ def read_gbq_query(
569595
)
570596

571597
index_cols = _to_index_cols(index_col)
598+
_check_column_duplicates(index_cols, columns)
572599

573600
filters_copy1, filters_copy2 = itertools.tee(filters)
574601
has_filters = len(list(filters_copy1)) != 0

tests/system/small/test_session.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1627,3 +1627,56 @@ def test_read_gbq_test(test_session: bigframes.Session):
16271627
actual = test_session.read_gbq(table_id).to_pandas()
16281628

16291629
assert actual.shape == (1, 1)
1630+
1631+
1632+
@pytest.mark.parametrize(
    ("query_or_table", "index_col", "columns", "expected_message"),
    [
        pytest.param(
            "{scalars_table_id}",
            ("int64_col", "string_col", "int64_col"),
            ("float64_col", "bool_col"),
            "'index_col' argument contains duplicate names",
            id="table_input_index_col_dup",
        ),
        pytest.param(
            """SELECT int64_col, string_col, float64_col, bool_col
                FROM `{scalars_table_id}`""",
            ("int64_col",),
            ("string_col", "float64_col", "string_col"),
            "'columns' argument contains duplicate names",
            id="query_input_columns_dup",
        ),
        pytest.param(
            "{scalars_table_id}",
            ("int64_col", "string_col"),
            ("float64_col", "string_col", "bool_col"),
            "exist in both 'index_col' and 'columns' arguments",
            id="table_input_cross_dup",
        ),
    ],
)
def test_read_gbq_duplicate_columns_xfail(
    session: bigframes.Session,
    scalars_table_id: str,
    query_or_table: str,
    index_col: tuple,
    columns: tuple,
    expected_message: str,
):
    """read_gbq rejects repeated names within or across index_col and columns.

    Each case passes a table ID or a query plus an invalid index_col/columns
    combination, and expects a ValueError whose message identifies which
    argument is at fault.
    """
    # pytest.raises (rather than an xfail marker) makes these cases report as
    # passing and pins the error to the specific validation message, so an
    # unrelated ValueError cannot satisfy the test.
    with pytest.raises(ValueError, match=expected_message):
        session.read_gbq(
            query_or_table.format(scalars_table_id=scalars_table_id),
            index_col=index_col,
            columns=columns,
        )

0 commit comments

Comments
 (0)