feat: column specs for tabular transformation (#466)

sirtorry · web-flow · commit 71d0bd4615b4 · 2021-07-15T19:42:25.000Z
- adds column_specs as an alternative to column_transformation
- adds training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs
- adds aiplatform.column
diff --git a/google/cloud/aiplatform/datasets/time_series_dataset.py b/google/cloud/aiplatform/datasets/time_series_dataset.py
@@ -46,7 +46,7 @@ def create(
         encryption_spec_key_name: Optional[str] = None,
         sync: bool = True,
     ) -> "TimeSeriesDataset":
-        """Creates a new tabular dataset.
+        """Creates a new time series dataset.
 
         Args:
             display_name (str):
diff --git a/google/cloud/aiplatform/training_jobs.py b/google/cloud/aiplatform/training_jobs.py
@@ -18,6 +18,7 @@
 import datetime
 import time
 from typing import Dict, List, Optional, Sequence, Tuple, Union
+import warnings
 
 import abc
 
@@ -2525,6 +2526,7 @@ def __init__(
         display_name: str,
         optimization_prediction_type: str,
         optimization_objective: Optional[str] = None,
+        column_specs: Optional[Dict[str, str]] = None,
         column_transformations: Optional[Union[Dict, List[Dict]]] = None,
         optimization_objective_recall_value: Optional[float] = None,
         optimization_objective_precision_value: Optional[float] = None,
@@ -2536,6 +2538,15 @@ def __init__(
     ):
         """Constructs a AutoML Tabular Training Job.
 
+        Example usage:
+
+        job = training_jobs.AutoMLTabularTrainingJob(
+            display_name="my_display_name",
+            optimization_prediction_type="classification",
+            optimization_objective="minimize-log-loss",
+            column_specs={"column_1": "auto", "column_2": "numeric"},
+        )
+
         Args:
             display_name (str):
                 Required. The user-defined name of this TrainingPipeline.
@@ -2576,15 +2587,29 @@ def __init__(
                 "minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
                 "minimize-mae" - Minimize mean-absolute error (MAE).
                 "minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
-            column_transformations (Optional[Union[Dict, List[Dict]]]):
+            column_specs (Dict[str, str]):
+                Optional. Alternative to column_transformations where the keys of the dict
+                are column names and their respective values are one of
+                AutoMLTabularTrainingJob.column_data_types.
+                When creating transformation for BigQuery Struct column, the column
+                should be flattened using "." as the delimiter. Only columns with no child
+                should have a transformation.
+                If an input column has no transformations on it, such a column is
+                ignored by the training, except for the targetColumn, which should have
+                no transformations defined on.
+                Only one of column_transformations or column_specs should be passed.
+            column_transformations (Union[Dict, List[Dict]]):
                 Optional. Transformations to apply to the input columns (i.e. columns other
                 than the targetColumn). Each transformation may produce multiple
                 result values from the column's value, and all are used for training.
                 When creating transformation for BigQuery Struct column, the column
-                should be flattened using "." as the delimiter.
+                should be flattened using "." as the delimiter. Only columns with no child
+                should have a transformation.
                 If an input column has no transformations on it, such a column is
                 ignored by the training, except for the targetColumn, which should have
                 no transformations defined on.
+                Only one of column_transformations or column_specs should be passed.
+                Consider using column_specs as column_transformations will be deprecated eventually.
             optimization_objective_recall_value (float):
                 Optional. Required when maximize-precision-at-recall optimizationObjective was
                 picked, represents the recall value at which the optimization is done.
@@ -2628,6 +2653,9 @@ def __init__(
                 If set, the trained Model will be secured by this key.
 
                 Overrides encryption_spec_key_name set in aiplatform.init.
+
+            Raises:
+                ValueError: When both column_transforations and column_specs were passed
         """
         super().__init__(
             display_name=display_name,
@@ -2637,7 +2665,26 @@ def __init__(
             training_encryption_spec_key_name=training_encryption_spec_key_name,
             model_encryption_spec_key_name=model_encryption_spec_key_name,
         )
-        self._column_transformations = column_transformations
+        # user populated transformations
+        if column_transformations is not None and column_specs is not None:
+            raise ValueError(
+                "Both column_transformations and column_specs were passed. Only one is allowed."
+            )
+        if column_transformations is not None:
+            self._column_transformations = column_transformations
+            warnings.simplefilter("always", DeprecationWarning)
+            warnings.warn(
+                "consider using column_specs instead. column_transformations will be deprecated in the future.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        elif column_specs is not None:
+            self._column_transformations = [
+                {transformation: {"column_name": column_name}}
+                for column_name, transformation in column_specs.items()
+            ]
+        else:
+            self._column_transformations = None
         self._optimization_objective = optimization_objective
         self._optimization_prediction_type = optimization_prediction_type
         self._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2860,6 +2907,7 @@ def _run(
 
         training_task_definition = schema.training_job.definition.automl_tabular
 
+        # auto-populate transformations
         if self._column_transformations is None:
             _LOGGER.info(
                 "No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
@@ -2870,21 +2918,19 @@ def _run(
                 for column_name in dataset.column_names
                 if column_name != target_column
             ]
-            column_transformations = [
+            self._column_transformations = [
                 {"auto": {"column_name": column_name}} for column_name in column_names
             ]
 
             _LOGGER.info(
                 "The column transformation of type 'auto' was set for the following columns: %s."
                 % column_names
             )
-        else:
-            column_transformations = self._column_transformations
 
         training_task_inputs_dict = {
             # required inputs
             "targetColumn": target_column,
-            "transformations": column_transformations,
+            "transformations": self._column_transformations,
             "trainBudgetMilliNodeHours": budget_milli_node_hours,
             # optional inputs
             "weightColumnName": weight_column,
@@ -2935,6 +2981,44 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
         """
         self._additional_experiments.extend(additional_experiments)
 
+    @staticmethod
+    def get_auto_column_specs(
+        dataset: datasets.TabularDataset, target_column: str,
+    ) -> Dict[str, str]:
+        """Returns a dict with all non-target columns as keys and 'auto' as values.
+
+        Example usage:
+
+        column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
+            dataset=my_dataset,
+            target_column="my_target_column",
+        )
+
+        Args:
+            dataset (datasets.TabularDataset):
+                Required. Intended dataset.
+            target_column(str):
+                Required. Intended target column.
+        Returns:
+            Dict[str, str]
+                Column names as keys and 'auto' as values
+        """
+        column_names = [
+            column for column in dataset.column_names if column != target_column
+        ]
+        column_specs = {column: "auto" for column in column_names}
+        return column_specs
+
+    class column_data_types:
+        AUTO = "auto"
+        NUMERIC = "numeric"
+        CATEGORICAL = "categorical"
+        TIMESTAMP = "timestamp"
+        TEXT = "text"
+        REPEATED_NUMERIC = "repeated_numeric"
+        REPEATED_CATEGORICAL = "repeated_categorical"
+        REPEATED_TEXT = "repeated_text"
+
 
 class AutoMLForecastingTrainingJob(_TrainingJob):
     _supported_training_schemas = (schema.training_job.definition.automl_forecasting,)
diff --git a/tests/unit/aiplatform/test_automl_tabular_training_jobs.py b/tests/unit/aiplatform/test_automl_tabular_training_jobs.py