28 | 28 | )
29 | 29 | from vertexai import generative_models
30 | 30 | from vertexai.preview import evaluation
   | 31 | +from vertexai.preview.evaluation import _base as eval_base
   | 32 | +from vertexai.preview.evaluation import _evaluation
31 | 33 | from vertexai.preview.evaluation import utils
   | 34 | +import numpy as np
32 | 35 | import pandas as pd
33 | 36 | import pytest
34 | 37 |
35 |    | -
36 | 38 | _TEST_PROJECT = "test-project"
37 | 39 | _TEST_LOCATION = "us-central1"
38 | 40 | _TEST_METRICS = (

78 | 80 | text,text,text\n
79 | 81 | """
80 | 82 |
   | 83 | +_TEST_EXPERIMENT = "test-experiment"
81 | 84 |
82 | 85 | _MOCK_EXACT_MATCH_RESULT = (
83 | 86 |     gapic_evaluation_service_types.EvaluateInstancesResponse(

135 | 138 |         ]
136 | 139 |     }
137 | 140 | )
    | 141 | +MOCK_EVAL_RESULT = eval_base.EvalResult(
    | 142 | +    summary_metrics={
    | 143 | +        "row_count": 1,
    | 144 | +        "mock_metric/mean": 1.0,
    | 145 | +        "mock_metric/std": np.nan,
    | 146 | +    },
    | 147 | +    metrics_table=pd.DataFrame(
    | 148 | +        {
    | 149 | +            "response": ["test"],
    | 150 | +            "mock_metric": [1.0],
    | 151 | +        }
    | 152 | +    ),
    | 153 | +)
138 | 154 |
139 | 155 |
140 | 156 | @pytest.fixture
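Note the `np.nan` seeded for `mock_metric/std` above: the new test further down asserts that it surfaces as the string `"NaN"`, both in the returned `summary_metrics` and in the payload handed to `log_metrics`. NaN is not JSON-serializable, so the SDK evidently coerces it before logging metrics to the experiment run. A minimal sketch of that coercion, assuming a hypothetical helper (`_replace_nan` is not the SDK's actual function name):

import math

def _replace_nan(summary_metrics: dict) -> dict:
    # Hypothetical sketch: swap NaN floats for the string "NaN" so the
    # metrics dict stays JSON-serializable when logged to an experiment run.
    return {
        key: "NaN" if isinstance(value, float) and math.isnan(value) else value
        for key, value in summary_metrics.items()
    }

Under that assumption, `{"mock_metric/std": np.nan}` becomes `{"mock_metric/std": "NaN"}`, which is exactly what the assertions in the new test check.
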
@@ -163,23 +179,22 @@ def teardown_method(self):
163 | 179 |         initializer.global_pool.shutdown(wait=True)
164 | 180 |
165 | 181 |     def test_create_eval_task(self):
166 |     | -        test_experiment = "test_experiment_name"
167 | 182 |         test_content_column_name = "test_content_column_name"
168 | 183 |         test_reference_column_name = "test_reference_column_name"
169 | 184 |         test_response_column_name = "test_response_column_name"
170 | 185 |
171 | 186 |         test_eval_task = evaluation.EvalTask(
172 | 187 |             dataset=_TEST_EVAL_DATASET,
173 | 188 |             metrics=_TEST_METRICS,
174 |     | -            experiment=test_experiment,
    | 189 | +            experiment=_TEST_EXPERIMENT,
175 | 190 |             content_column_name=test_content_column_name,
176 | 191 |             reference_column_name=test_reference_column_name,
177 | 192 |             response_column_name=test_response_column_name,
178 | 193 |         )
179 | 194 |
180 | 195 |         assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET)
181 | 196 |         assert test_eval_task.metrics == _TEST_METRICS
182 |     | -        assert test_eval_task.experiment == test_experiment
    | 197 | +        assert test_eval_task.experiment == _TEST_EXPERIMENT
183 | 198 |         assert test_eval_task.content_column_name == test_content_column_name
184 | 199 |         assert test_eval_task.reference_column_name == test_reference_column_name
185 | 200 |         assert test_eval_task.response_column_name == test_response_column_name

@@ -470,6 +485,44 @@ def test_compute_pairwise_metrics_without_inference(self, api_transport):
470 | 485 |             == 0.5
471 | 486 |         )
472 | 487 |
    | 488 | +    def test_eval_result_experiment_run_logging(self):
    | 489 | +        test_eval_task = evaluation.EvalTask(
    | 490 | +            dataset=_TEST_EVAL_DATASET,
    | 491 | +            metrics=_TEST_METRICS,
    | 492 | +            experiment=_TEST_EXPERIMENT,
    | 493 | +        )
    | 494 | +
    | 495 | +        with mock.patch.multiple(
    | 496 | +            metadata._experiment_tracker,
    | 497 | +            _experiment=mock.MagicMock(name=_TEST_EXPERIMENT),
    | 498 | +            _experiment_run=None,
    | 499 | +            set_experiment=mock.DEFAULT,
    | 500 | +            reset=mock.DEFAULT,
    | 501 | +        ):
    | 502 | +            with mock.patch.multiple(
    | 503 | +                vertexai.preview,
    | 504 | +                start_run=mock.MagicMock(),
    | 505 | +                log_params=mock.DEFAULT,
    | 506 | +                log_metrics=mock.DEFAULT,
    | 507 | +            ) as mock_metadata:
    | 508 | +                with mock.patch.object(
    | 509 | +                    target=_evaluation,
    | 510 | +                    attribute="evaluate",
    | 511 | +                    side_effect=[MOCK_EVAL_RESULT],
    | 512 | +                ):
    | 513 | +                    test_result = test_eval_task.evaluate()
    | 514 | +
    | 515 | +        assert test_result.summary_metrics["row_count"] == 1
    | 516 | +        assert test_result.summary_metrics["mock_metric/mean"] == 1.0
    | 517 | +        assert test_result.summary_metrics["mock_metric/std"] == "NaN"
    | 518 | +        mock_metadata["log_metrics"].assert_called_once_with(
    | 519 | +            {
    | 520 | +                "row_count": 1,
    | 521 | +                "mock_metric/mean": 1.0,
    | 522 | +                "mock_metric/std": "NaN",
    | 523 | +            }
    | 524 | +        )
    | 525 | +
473 | 526 |
474 | 527 | @pytest.mark.usefixtures("google_auth_mock")
475 | 528 | class TestEvaluationErrors:
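
For context on what the patches above stand in for: when an `EvalTask` is constructed with an `experiment` name, `evaluate()` starts an experiment run (`vertexai.preview.start_run`) and forwards the summary metrics to `log_metrics`. A rough end-to-end sketch against the live service; the project, location, experiment name, and dataset values are illustrative, and the call requires real credentials:

import pandas as pd

import vertexai
from vertexai.preview import evaluation

# Illustrative values; substitute your own project and experiment.
vertexai.init(project="my-project", location="us-central1")

eval_dataset = pd.DataFrame(
    {
        "response": ["Paris"],
        "reference": ["Paris"],
    }
)

eval_task = evaluation.EvalTask(
    dataset=eval_dataset,
    metrics=["exact_match"],  # computation-based metric, as mocked in this file
    experiment="my-experiment",  # enables experiment-run logging
)

# Because an experiment is set, evaluate() is expected to start a run and
# log the summary metrics to it, the same path the patched test walks offline.
eval_result = eval_task.evaluate()
print(eval_result.summary_metrics)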