feat: detect duplicate column/index names in read_gbq before sending the query. (#1615)

Genesis929 · web-flow · commit 40d696088114 · 2025-04-15T12:43:09.000-07:00
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
@@ -89,6 +89,31 @@ def _to_index_cols(
     return index_cols
 
 
+def _check_column_duplicates(index_cols: Iterable[str], columns: Iterable[str]):
+    index_cols_list = list(index_cols) if index_cols is not None else []
+    columns_list = list(columns) if columns is not None else []  # None counts as empty
+    set_index = set(index_cols_list)  # a set keeps one copy of each name
+    set_columns = set(columns_list)  # likewise for the requested columns
+
+    if len(index_cols_list) > len(set_index):  # a name was repeated in index_col
+        raise ValueError(
+            "The 'index_col' argument contains duplicate names. "
+            "All column names specified in 'index_col' must be unique."
+        )
+
+    if len(columns_list) > len(set_columns):  # a name was repeated in columns
+        raise ValueError(
+            "The 'columns' argument contains duplicate names. "
+            "All column names specified in 'columns' must be unique."
+        )
+
+    if not set_index.isdisjoint(set_columns):  # same name used as index and column
+        raise ValueError(
+            "Found column names that exist in both 'index_col' and 'columns' arguments. "
+            "These arguments must specify distinct sets of columns."
+        )
+
+
 @dataclasses.dataclass
 class GbqDataLoader:
     """
@@ -328,6 +353,7 @@ def read_gbq_table(
             table=table,
             index_col=index_col,
         )
+        _check_column_duplicates(index_cols, columns)
 
         for key in index_cols:
             if key not in table_column_names:
@@ -569,6 +595,7 @@ def read_gbq_query(
             )
 
         index_cols = _to_index_cols(index_col)
+        _check_column_duplicates(index_cols, columns)
 
         filters_copy1, filters_copy2 = itertools.tee(filters)
         has_filters = len(list(filters_copy1)) != 0
diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py
@@ -1627,3 +1627,56 @@ def test_read_gbq_test(test_session: bigframes.Session):
     actual = test_session.read_gbq(table_id).to_pandas()
 
     assert actual.shape == (1, 1)
+
+
+@pytest.mark.parametrize(
+    ("query_or_table", "index_col", "columns"),
+    [
+        pytest.param(
+            "{scalars_table_id}",
+            ("int64_col", "string_col", "int64_col"),  # int64_col listed twice
+            ("float64_col", "bool_col"),
+            id="table_input_index_col_dup",
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason="ValueError: Duplicate names within 'index_col'.",
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            """SELECT int64_col, string_col, float64_col, bool_col
+               FROM `{scalars_table_id}`""",
+            ("int64_col",),
+            ("string_col", "float64_col", "string_col"),  # string_col listed twice
+            id="query_input_columns_dup",
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason="ValueError: Duplicate names within 'columns'.",
+                strict=True,
+            ),
+        ),
+        pytest.param(
+            "{scalars_table_id}",
+            ("int64_col", "string_col"),
+            ("float64_col", "string_col", "bool_col"),  # string_col is also an index
+            id="table_input_cross_dup",
+            marks=pytest.mark.xfail(
+                raises=ValueError,
+                reason="ValueError: Overlap between 'index_col' and 'columns'.",
+                strict=True,
+            ),
+        ),
+    ],
+)
+def test_read_gbq_duplicate_columns_xfail(
+    session: bigframes.Session,
+    scalars_table_id: str,
+    query_or_table: str,
+    index_col: tuple,
+    columns: tuple,
+):
+    session.read_gbq(  # each case is marked xfail(raises=ValueError, strict=True)
+        query_or_table.format(scalars_table_id=scalars_table_id),
+        index_col=index_col,
+        columns=columns,
+    )