
Commit 641faec

jsondai authored and copybara-github committed

fix: fix numerical NaN experiment run logging error in EvalTask.

PiperOrigin-RevId: 641981976

1 parent 4e2d87f commit 641faec

2 files changed (+65 -7 lines)
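For background on the commit message: the error being fixed occurs when a summary metric in the evaluation result is a numeric NaN, which experiment-run metric logging cannot accept. The sketch below is a rough, standalone illustration of that failure mode only; it uses json.dumps with allow_nan=False as a stand-in for whatever serialization the metrics backend performs, which is an assumption rather than the SDK's actual code path (the real call is vertexai.preview.log_metrics, as shown in the second diff below).

# Rough standalone sketch of the failure mode, NOT the SDK's code path:
# json.dumps(..., allow_nan=False) stands in for a backend that rejects
# numeric NaN values in a metrics payload.
import json

summary_metrics = {"row_count": 1, "mock_metric/std": float("nan")}

try:
    json.dumps(summary_metrics, allow_nan=False)
except ValueError as e:
    # Prints something like: "Out of range float values are not JSON compliant"
    print(f"Experiment metrics logging failed: {e}")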

tests/unit/vertexai/test_evaluation.py (+57 -4)
@@ -28,11 +28,13 @@
 )
 from vertexai import generative_models
 from vertexai.preview import evaluation
+from vertexai.preview.evaluation import _base as eval_base
+from vertexai.preview.evaluation import _evaluation
 from vertexai.preview.evaluation import utils
+import numpy as np
 import pandas as pd
 import pytest

-
 _TEST_PROJECT = "test-project"
 _TEST_LOCATION = "us-central1"
 _TEST_METRICS = (
@@ -78,6 +80,7 @@
 text,text,text\n
 """

+_TEST_EXPERIMENT = "test-experiment"

 _MOCK_EXACT_MATCH_RESULT = (
     gapic_evaluation_service_types.EvaluateInstancesResponse(
@@ -135,6 +138,19 @@
         ]
     }
 )
+MOCK_EVAL_RESULT = eval_base.EvalResult(
+    summary_metrics={
+        "row_count": 1,
+        "mock_metric/mean": 1.0,
+        "mock_metric/std": np.nan,
+    },
+    metrics_table=pd.DataFrame(
+        {
+            "response": ["test"],
+            "mock_metric": [1.0],
+        }
+    ),
+)


 @pytest.fixture
@@ -163,23 +179,22 @@ def teardown_method(self):
         initializer.global_pool.shutdown(wait=True)

     def test_create_eval_task(self):
-        test_experiment = "test_experiment_name"
         test_content_column_name = "test_content_column_name"
         test_reference_column_name = "test_reference_column_name"
         test_response_column_name = "test_response_column_name"

         test_eval_task = evaluation.EvalTask(
             dataset=_TEST_EVAL_DATASET,
             metrics=_TEST_METRICS,
-            experiment=test_experiment,
+            experiment=_TEST_EXPERIMENT,
             content_column_name=test_content_column_name,
             reference_column_name=test_reference_column_name,
             response_column_name=test_response_column_name,
         )

         assert test_eval_task.dataset.equals(_TEST_EVAL_DATASET)
         assert test_eval_task.metrics == _TEST_METRICS
-        assert test_eval_task.experiment == test_experiment
+        assert test_eval_task.experiment == _TEST_EXPERIMENT
         assert test_eval_task.content_column_name == test_content_column_name
         assert test_eval_task.reference_column_name == test_reference_column_name
         assert test_eval_task.response_column_name == test_response_column_name
@@ -470,6 +485,44 @@ def test_compute_pairwise_metrics_without_inference(self, api_transport):
             == 0.5
         )

+    def test_eval_result_experiment_run_logging(self):
+        test_eval_task = evaluation.EvalTask(
+            dataset=_TEST_EVAL_DATASET,
+            metrics=_TEST_METRICS,
+            experiment=_TEST_EXPERIMENT,
+        )
+
+        with mock.patch.multiple(
+            metadata._experiment_tracker,
+            _experiment=mock.MagicMock(name=_TEST_EXPERIMENT),
+            _experiment_run=None,
+            set_experiment=mock.DEFAULT,
+            reset=mock.DEFAULT,
+        ):
+            with mock.patch.multiple(
+                vertexai.preview,
+                start_run=mock.MagicMock(),
+                log_params=mock.DEFAULT,
+                log_metrics=mock.DEFAULT,
+            ) as mock_metadata:
+                with mock.patch.object(
+                    target=_evaluation,
+                    attribute="evaluate",
+                    side_effect=[MOCK_EVAL_RESULT],
+                ):
+                    test_result = test_eval_task.evaluate()
+
+        assert test_result.summary_metrics["row_count"] == 1
+        assert test_result.summary_metrics["mock_metric/mean"] == 1.0
+        assert test_result.summary_metrics["mock_metric/std"] == "NaN"
+        mock_metadata["log_metrics"].assert_called_once_with(
+            {
+                "row_count": 1,
+                "mock_metric/mean": 1.0,
+                "mock_metric/std": "NaN",
+            }
+        )
+

 @pytest.mark.usefixtures("google_auth_mock")
 class TestEvaluationErrors:
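A note on the mocking in the new test: when mock.patch.multiple() patches several attributes with mock.DEFAULT, using it as a context manager yields a dictionary mapping each patched attribute name to the MagicMock that replaced it, which is why the test can index mock_metadata["log_metrics"]. The following standalone sketch (FakeSdk is a hypothetical class, not part of the SDK) shows that behavior in isolation.

# Standalone sketch of mock.patch.multiple() with mock.DEFAULT values.
# FakeSdk is a hypothetical stand-in, not part of vertexai.
from unittest import mock


class FakeSdk:
    @staticmethod
    def log_params(params):
        raise RuntimeError("should not be called for real")

    @staticmethod
    def log_metrics(metrics):
        raise RuntimeError("should not be called for real")


with mock.patch.multiple(
    FakeSdk, log_params=mock.DEFAULT, log_metrics=mock.DEFAULT
) as mocks:
    FakeSdk.log_metrics({"mock_metric/std": "NaN"})

# The recorded call survives exiting the context manager, so assertions
# can be made afterwards, as the new test does.
mocks["log_metrics"].assert_called_once_with({"mock_metric/std": "NaN"})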

vertexai/preview/evaluation/_eval_tasks.py (+8 -3)
@@ -28,6 +28,7 @@
 from vertexai.preview.evaluation.metrics import (
     _base as metrics_base,
 )
+import numpy as np

 if TYPE_CHECKING:
     import pandas as pd
@@ -284,9 +285,14 @@ def _evaluate_with_experiment(
             reference_column_name=self.reference_column_name,
             response_column_name=response_column_name,
         )
+
+        eval_result.summary_metrics = {
+            k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
+            for k, v in eval_result.summary_metrics.items()
+        }
         try:
             vertexai.preview.log_metrics(eval_result.summary_metrics)
-        except (ValueError, TypeError, exceptions.InvalidArgument) as e:
+        except (TypeError, exceptions.InvalidArgument) as e:
             _LOGGER.warning(f"Experiment metrics logging failed: {str(e)}")
         return eval_result

@@ -366,8 +372,7 @@ def _validate_experiment_run(self) -> None:
         if metadata._experiment_tracker.experiment_run:
             raise ValueError(
                 "Experiment run already exists. Please specify the name of the"
-                " experiment run to assign current session with in this evaluate"
-                " method."
+                " experiment run to assign current session within this evaluation."
             )

     def _log_eval_experiment_param(
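As a standalone illustration of the sanitization added to _evaluate_with_experiment above (only numpy is assumed as a dependency), the new dict comprehension replaces any float NaN in summary_metrics with the string "NaN" before the metrics are passed to vertexai.preview.log_metrics, which is presumably why ValueError was dropped from the except clause.

# Minimal sketch of the NaN sanitization added above; only numpy is assumed.
import numpy as np

summary_metrics = {
    "row_count": 1,
    "mock_metric/mean": 1.0,
    "mock_metric/std": np.nan,
}

# Replace any float NaN with the string "NaN" so it can be logged safely.
summary_metrics = {
    k: ("NaN" if isinstance(v, float) and np.isnan(v) else v)
    for k, v in summary_metrics.items()
}

print(summary_metrics)
# {'row_count': 1, 'mock_metric/mean': 1.0, 'mock_metric/std': 'NaN'}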
