
Commit fb2936d

chore: e2e tabular system test improvement (#1627)
* chore: refactor tabular system test to reduce time
* update e2e tabular test to remove automl model training
* update to deploy perm automl model to temp endpoint
* remove extra lines after comments
1 parent cc7c968 commit fb2936d

File tree

2 files changed: +32 -23 lines changed


tests/system/aiplatform/e2e_base.py

Lines changed: 7 additions & 7 deletions
@@ -35,6 +35,12 @@
 _VPC_NETWORK_URI = os.getenv("_VPC_NETWORK_URI")
 _LOCATION = "us-central1"
 
+_PROJECT_NUMBER = (
+    resourcemanager.ProjectsClient()
+    .get_project(name=f"projects/{_PROJECT}")
+    .name.split("/", 1)[1]
+)
+
 
 class TestEndToEnd(metaclass=abc.ABCMeta):
     @property
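For reference, the new module-level _PROJECT_NUMBER resolves the test project's ID to its numeric project number once at import time, so individual fixtures no longer repeat the lookup. A minimal standalone sketch of the same lookup, reusing the resourcemanager client that e2e_base.py already imports; the project ID below is a placeholder, not a value from this commit:

from google.cloud import resourcemanager

project_id = "my-test-project"  # placeholder project ID

# get_project() accepts the "projects/{project_id}" form; the returned
# Project's .name field is "projects/{project_number}", so splitting once
# on "/" yields the numeric project number as a string.
project = resourcemanager.ProjectsClient().get_project(name=f"projects/{project_id}")
project_number = project.name.split("/", 1)[1]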
@@ -86,13 +92,7 @@ def prepare_staging_bucket(
 
         # TODO(#1415) Once PR Is merged, use the added utilities to
        # provide create/view access to Pipeline's default service account (compute)
-        project_number = (
-            resourcemanager.ProjectsClient()
-            .get_project(name=f"projects/{_PROJECT}")
-            .name.split("/", 1)[1]
-        )
-
-        service_account = f"{project_number}[email protected]"
+        service_account = f"{_PROJECT_NUMBER}[email protected]"
         bucket_iam_policy = bucket.get_iam_policy()
         bucket_iam_policy.setdefault("roles/storage.objectCreator", set()).add(
             f"serviceAccount:{service_account}"

tests/system/aiplatform/test_e2e_tabular.py

Lines changed: 25 additions & 16 deletions
@@ -16,6 +16,7 @@
 #
 
 import os
+import time
 from urllib import request
 
 import pytest
@@ -45,6 +46,8 @@
     "median_income": 3.014700,
 }
 
+_PERMANENT_AUTOML_MODEL_RESOURCE_NAME = f"projects/{e2e_base._PROJECT_NUMBER}/locations/us-central1/models/6591277539400876032"
+
 
 @pytest.mark.usefixtures(
     "prepare_staging_bucket", "delete_staging_bucket", "tear_down_resources"
@@ -78,7 +81,6 @@ def test_end_to_end_tabular(self, shared_state):
         )
 
         # Create and import to single managed dataset for both training jobs
-
         dataset_gcs_source = f'gs://{shared_state["staging_bucket_name"]}/{_BLOB_PATH}'
 
         ds = aiplatform.TabularDataset.create(
@@ -91,7 +93,6 @@ def test_end_to_end_tabular(self, shared_state):
         shared_state["resources"].extend([ds])
 
         # Define both training jobs
-
         custom_job = aiplatform.CustomTrainingJob(
             display_name=self._make_display_name("train-housing-custom"),
             script_path=_LOCAL_TRAINING_SCRIPT_PATH,
@@ -106,8 +107,7 @@ def test_end_to_end_tabular(self, shared_state):
             optimization_objective="minimize-rmse",
         )
 
-        # Kick off both training jobs, AutoML job will take approx one hour to run
-
+        # Kick off both training jobs to check they are started correctly, then cancel the AutoML job
         custom_model = custom_job.run(
             ds,
             replica_count=1,
@@ -119,21 +119,32 @@
             create_request_timeout=None,
         )
 
-        automl_model = automl_job.run(
+        automl_job.run(
             dataset=ds,
             target_column="median_house_value",
             model_display_name=self._make_display_name("automl-housing-model"),
             sync=False,
         )
 
-        shared_state["resources"].extend(
-            [automl_job, automl_model, custom_job, custom_model]
-        )
+        while (
+            automl_job.state != gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING
+        ):
+            time.sleep(5)
+
+        # Cancel the AutoML job once it's successfully been created, this is async
+        automl_job.cancel()
 
-        # Deploy both models after training completes
+        shared_state["resources"].extend([custom_job, custom_model])
+
+        # Deploy the custom model after training completes
         custom_endpoint = custom_model.deploy(machine_type="n1-standard-4", sync=False)
+
+        # Create a reference to the permanent AutoML model and deloy it to a temporary endpoint
+        automl_model = aiplatform.Model(
+            model_name=_PERMANENT_AUTOML_MODEL_RESOURCE_NAME
+        )
         automl_endpoint = automl_model.deploy(machine_type="n1-standard-4", sync=False)
-        shared_state["resources"].extend([automl_endpoint, custom_endpoint])
+        shared_state["resources"].extend([custom_endpoint, automl_endpoint])
 
         custom_batch_prediction_job = custom_model.batch_predict(
             job_display_name=self._make_display_name("automl-housing-model"),
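The rewritten flow starts the AutoML job asynchronously only to confirm it launches, then cancels it rather than waiting roughly an hour for training to finish. A minimal sketch of that poll-then-cancel pattern with a timeout guard added on top (the guard and helper name are assumptions, not part of the commit; the gca_pipeline_state alias mirrors the one the test module already uses):

import time

from google.cloud.aiplatform.compat.types import pipeline_state as gca_pipeline_state


def start_check_and_cancel(automl_job, timeout_secs: float = 600.0) -> None:
    # Poll until the asynchronously started training pipeline reports RUNNING,
    # then cancel it instead of waiting for full AutoML training to complete.
    deadline = time.time() + timeout_secs  # assumed upper bound for job start-up
    while automl_job.state != gca_pipeline_state.PipelineState.PIPELINE_STATE_RUNNING:
        if time.time() > deadline:
            raise TimeoutError("AutoML training pipeline never reached RUNNING")
        time.sleep(5)
    automl_job.cancel()  # asynchronous; the test later asserts PIPELINE_STATE_CANCELLED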
@@ -149,7 +160,6 @@ def test_end_to_end_tabular(self, shared_state):
         in_progress_done_check = custom_job.done()
         custom_job.wait_for_resource_creation()
 
-        automl_job.wait_for_resource_creation()
         custom_batch_prediction_job.wait_for_resource_creation()
 
         # Send online prediction with same instance to both deployed models
@@ -172,7 +182,6 @@ def test_end_to_end_tabular(self, shared_state):
 
         custom_batch_prediction_job.wait()
 
-        automl_endpoint.wait()
         automl_prediction = automl_endpoint.predict(
             [{k: str(v) for k, v in _INSTANCE.items()}],  # Cast int values to strings
             timeout=180.0,
@@ -189,14 +198,14 @@ def test_end_to_end_tabular(self, shared_state):
             custom_job.state
             == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
         )
-        assert (
-            automl_job.state
-            == gca_pipeline_state.PipelineState.PIPELINE_STATE_SUCCEEDED
-        )
         assert (
             custom_batch_prediction_job.state
             == gca_job_state.JobState.JOB_STATE_SUCCEEDED
         )
+        assert (
+            automl_job.state
+            == gca_pipeline_state.PipelineState.PIPELINE_STATE_CANCELLED
+        )
 
         # Ensure a single prediction was returned
         assert len(custom_prediction.predictions) == 1
