18
18
import datetime
19
19
import time
20
20
from typing import Dict , List , Optional , Sequence , Tuple , Union
21
+ import warnings
21
22
22
23
import abc
23
24
@@ -2525,6 +2526,7 @@ def __init__(
2525
2526
display_name : str ,
2526
2527
optimization_prediction_type : str ,
2527
2528
optimization_objective : Optional [str ] = None ,
2529
+ column_specs : Optional [Dict [str , str ]] = None ,
2528
2530
column_transformations : Optional [Union [Dict , List [Dict ]]] = None ,
2529
2531
optimization_objective_recall_value : Optional [float ] = None ,
2530
2532
optimization_objective_precision_value : Optional [float ] = None ,
@@ -2536,6 +2538,15 @@ def __init__(
2536
2538
):
2537
2539
"""Constructs a AutoML Tabular Training Job.
2538
2540
2541
+ Example usage:
2542
+
2543
+ job = training_jobs.AutoMLTabularTrainingJob(
2544
+ display_name="my_display_name",
2545
+ optimization_prediction_type="classification",
2546
+ optimization_objective="minimize-log-loss",
2547
+ column_specs={"column_1": "auto", "column_2": "numeric"},
2548
+ )
2549
+
2539
2550
Args:
2540
2551
display_name (str):
2541
2552
Required. The user-defined name of this TrainingPipeline.
@@ -2576,15 +2587,29 @@ def __init__(
2576
2587
"minimize-rmse" (default) - Minimize root-mean-squared error (RMSE).
2577
2588
"minimize-mae" - Minimize mean-absolute error (MAE).
2578
2589
"minimize-rmsle" - Minimize root-mean-squared log error (RMSLE).
2579
- column_transformations (Optional[Union[Dict, List[Dict]]]):
2590
+ column_specs (Dict[str, str]):
2591
+ Optional. Alternative to column_transformations where the keys of the dict
2592
+ are column names and their respective values are one of
2593
+ AutoMLTabularTrainingJob.column_data_types.
2594
+ When creating transformation for BigQuery Struct column, the column
2595
+ should be flattened using "." as the delimiter. Only columns with no child
2596
+ should have a transformation.
2597
+ If an input column has no transformations on it, such a column is
2598
+ ignored by the training, except for the targetColumn, which should have
2599
+ no transformations defined on.
2600
+ Only one of column_transformations or column_specs should be passed.
2601
+ column_transformations (Union[Dict, List[Dict]]):
2580
2602
Optional. Transformations to apply to the input columns (i.e. columns other
2581
2603
than the targetColumn). Each transformation may produce multiple
2582
2604
result values from the column's value, and all are used for training.
2583
2605
When creating transformation for BigQuery Struct column, the column
2584
- should be flattened using "." as the delimiter.
2606
+ should be flattened using "." as the delimiter. Only columns with no child
2607
+ should have a transformation.
2585
2608
If an input column has no transformations on it, such a column is
2586
2609
ignored by the training, except for the targetColumn, which should have
2587
2610
no transformations defined on.
2611
+ Only one of column_transformations or column_specs should be passed.
2612
+ Consider using column_specs as column_transformations will be deprecated eventually.
2588
2613
optimization_objective_recall_value (float):
2589
2614
Optional. Required when maximize-precision-at-recall optimizationObjective was
2590
2615
picked, represents the recall value at which the optimization is done.
@@ -2628,6 +2653,9 @@ def __init__(
2628
2653
If set, the trained Model will be secured by this key.
2629
2654
2630
2655
Overrides encryption_spec_key_name set in aiplatform.init.
2656
+
2657
+ Raises:
2658
+ ValueError: When both column_transformations and column_specs were passed.
2631
2659
"""
2632
2660
super ().__init__ (
2633
2661
display_name = display_name ,
@@ -2637,7 +2665,26 @@ def __init__(
2637
2665
training_encryption_spec_key_name = training_encryption_spec_key_name ,
2638
2666
model_encryption_spec_key_name = model_encryption_spec_key_name ,
2639
2667
)
2640
- self ._column_transformations = column_transformations
2668
+ # user populated transformations
2669
+ if column_transformations is not None and column_specs is not None :
2670
+ raise ValueError (
2671
+ "Both column_transformations and column_specs were passed. Only one is allowed."
2672
+ )
2673
+ if column_transformations is not None :
2674
+ self ._column_transformations = column_transformations
2675
+ warnings .simplefilter ("always" , DeprecationWarning )
2676
+ warnings .warn (
2677
+ "consider using column_specs instead. column_transformations will be deprecated in the future." ,
2678
+ DeprecationWarning ,
2679
+ stacklevel = 2 ,
2680
+ )
2681
+ elif column_specs is not None :
2682
+ self ._column_transformations = [
2683
+ {transformation : {"column_name" : column_name }}
2684
+ for column_name , transformation in column_specs .items ()
2685
+ ]
2686
+ else :
2687
+ self ._column_transformations = None
2641
2688
self ._optimization_objective = optimization_objective
2642
2689
self ._optimization_prediction_type = optimization_prediction_type
2643
2690
self ._optimization_objective_recall_value = optimization_objective_recall_value
@@ -2860,6 +2907,7 @@ def _run(
2860
2907
2861
2908
training_task_definition = schema .training_job .definition .automl_tabular
2862
2909
2910
+ # auto-populate transformations
2863
2911
if self ._column_transformations is None :
2864
2912
_LOGGER .info (
2865
2913
"No column transformations provided, so now retrieving columns from dataset in order to set default column transformations."
@@ -2870,21 +2918,19 @@ def _run(
2870
2918
for column_name in dataset .column_names
2871
2919
if column_name != target_column
2872
2920
]
2873
- column_transformations = [
2921
+ self . _column_transformations = [
2874
2922
{"auto" : {"column_name" : column_name }} for column_name in column_names
2875
2923
]
2876
2924
2877
2925
_LOGGER .info (
2878
2926
"The column transformation of type 'auto' was set for the following columns: %s."
2879
2927
% column_names
2880
2928
)
2881
- else :
2882
- column_transformations = self ._column_transformations
2883
2929
2884
2930
training_task_inputs_dict = {
2885
2931
# required inputs
2886
2932
"targetColumn" : target_column ,
2887
- "transformations" : column_transformations ,
2933
+ "transformations" : self . _column_transformations ,
2888
2934
"trainBudgetMilliNodeHours" : budget_milli_node_hours ,
2889
2935
# optional inputs
2890
2936
"weightColumnName" : weight_column ,
@@ -2935,6 +2981,44 @@ def _add_additional_experiments(self, additional_experiments: List[str]):
2935
2981
"""
2936
2982
self ._additional_experiments .extend (additional_experiments )
2937
2983
2984
@staticmethod
def get_auto_column_specs(
    dataset: "datasets.TabularDataset",
    target_column: str,
) -> Dict[str, str]:
    """Returns a dict with all non-target columns as keys and 'auto' as values.

    Example usage:

    column_specs = training_jobs.AutoMLTabularTrainingJob.get_auto_column_specs(
        dataset=my_dataset,
        target_column="my_target_column",
    )

    Args:
        dataset (datasets.TabularDataset):
            Required. Intended dataset. Only its ``column_names`` attribute
            is read here.
        target_column (str):
            Required. Intended target column. It is left out of the result.
    Returns:
        Dict[str, str]
            Column names as keys and 'auto' as values
    """
    # One pass over the dataset's columns; the target column is skipped so it
    # never receives a transformation entry.
    return {
        column: "auto"
        for column in dataset.column_names
        if column != target_column
    }
3011
+
3012
class column_data_types:
    """String constants for the column types accepted as ``column_specs`` values.

    Each attribute is a plain ``str``, so it can be used directly as a value
    in a ``column_specs`` dict.
    """

    AUTO = "auto"
    NUMERIC = "numeric"
    CATEGORICAL = "categorical"
    TIMESTAMP = "timestamp"
    TEXT = "text"
    REPEATED_NUMERIC = "repeated_numeric"
    REPEATED_CATEGORICAL = "repeated_categorical"
    REPEATED_TEXT = "repeated_text"
3021
+
2938
3022
2939
3023
class AutoMLForecastingTrainingJob (_TrainingJob ):
2940
3024
_supported_training_schemas = (schema .training_job .definition .automl_forecasting ,)
0 commit comments