
Commit 1ab4344

jsondai authored and copybara-github committed
feat: Add notebook helper functions to preview eval SDK to display and visualize evaluation results in an IPython environment
PiperOrigin-RevId: 725404155
1 parent: 0abe0b7

3 files changed: 259 additions and 0 deletions

vertexai/evaluation/eval_task.py

Lines changed: 4 additions & 0 deletions
@@ -14,8 +14,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import logging
 from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
 import uuid
+import warnings

 from google.api_core import exceptions
 import vertexai
@@ -47,6 +49,8 @@
 IPython_display = None

 _LOGGER = base.Logger(__name__)
+logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
+warnings.filterwarnings("ignore")

 EvalResult = eval_base.EvalResult
 GenerativeModel = generative_models.GenerativeModel

vertexai/preview/evaluation/eval_task.py

Lines changed: 4 additions & 0 deletions
@@ -16,6 +16,8 @@
 #
 """Evaluation Task class."""

+import logging
+import warnings
 from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
 import uuid

@@ -48,6 +50,8 @@
 IPython_display = None

 _LOGGER = base.Logger(__name__)
+logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
+warnings.filterwarnings("ignore")

 AutoraterConfig = eval_base.AutoraterConfig
 EvalResult = eval_base.EvalResult
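
Both eval_task.py modules receive the same two module-level statements. As a minimal sketch of their effect (a hypothetical standalone snippet, not taken from the SDK): once they run, urllib3.connectionpool log records below ERROR and all Python warnings are suppressed for the process, which keeps notebook output free of retry and deprecation noise while results are displayed.

# Hypothetical standalone repro of the module-level side effect; assumes a
# fresh interpreter and is not part of the commit itself.
import logging
import warnings

logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

warnings.warn("deprecation notice")  # suppressed by the warnings filter
logging.getLogger("urllib3.connectionpool").warning("retrying request")  # below ERROR, not emitted
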
New file · Lines changed: 251 additions & 0 deletions
@@ -0,0 +1,251 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Python functions which run only within a Jupyter or Colab notebook."""
+
+import random
+import string
+import sys
+from typing import List, Optional, Tuple
+
+from vertexai.preview.evaluation import _base as eval_base
+from vertexai.preview.evaluation import constants
+
+# pylint: disable=g-import-not-at-top
+try:
+    import pandas as pd
+except ImportError:
+    pandas = None
+
+_MARKDOWN_H2 = "##"
+_MARKDOWN_H3 = "###"
+_DEFAULT_COLUMNS_TO_DISPLAY = [
+    constants.Dataset.MODEL_RESPONSE_COLUMN,
+    constants.Dataset.BASELINE_MODEL_RESPONSE_COLUMN,
+    constants.Dataset.PROMPT_COLUMN,
+    constants.MetricResult.ROW_COUNT_KEY,
+]
+_DEFAULT_RADAR_RANGE = (0, 5)
+
+
+def _get_ipython_shell_name() -> str:
+    if "IPython" in sys.modules:
+        # pylint: disable=g-import-not-at-top, g-importing-member
+        from IPython import get_ipython
+
+        return get_ipython().__class__.__name__
+    return ""
+
+
+def is_ipython_available() -> bool:
+    return _get_ipython_shell_name()
+
+
+def _filter_df(
+    df: pd.DataFrame, substrings: Optional[List[str]] = None
+) -> pd.DataFrame:
+    """Filters a DataFrame to include only columns containing the given substrings."""
+    if substrings is None:
+        return df
+
+    return df.copy().filter(
+        [
+            column_name
+            for column_name in df.columns
+            if any(substring in column_name for substring in substrings)
+        ]
+    )
+
+
+def display_eval_result(
+    eval_result: "eval_base.EvalResult",
+    title: Optional[str] = None,
+    metrics: Optional[List[str]] = None,
+) -> None:
+    """Displays evaluation results in a notebook using IPython.display.
+
+    Args:
+      eval_result: An object containing evaluation results with
+        `summary_metrics` and `metrics_table` attributes.
+      title: A string title to display above the results.
+      metrics: A list of metric name substrings to filter displayed columns. If
+        provided, only metrics whose names contain any of these strings will be
+        displayed.
+    """
+    if not is_ipython_available():
+        return
+    # pylint: disable=g-import-not-at-top, g-importing-member
+    from IPython.display import display
+    from IPython.display import Markdown
+
+    summary_metrics, metrics_table = (
+        eval_result.summary_metrics,
+        eval_result.metrics_table,
+    )
+
+    summary_metrics_df = pd.DataFrame.from_dict(summary_metrics, orient="index").T
+
+    if metrics:
+        columns_to_keep = metrics + _DEFAULT_COLUMNS_TO_DISPLAY
+        summary_metrics_df = _filter_df(summary_metrics_df, columns_to_keep)
+        metrics_table = _filter_df(metrics_table, columns_to_keep)
+
+    # Display the title in Markdown.
+    if title:
+        display(Markdown(f"{_MARKDOWN_H2} {title}"))
+
+    # Display the summary metrics.
+    display(Markdown(f"{_MARKDOWN_H3} Summary Metrics"))
+    display(summary_metrics_df)
+
+    # Display the metrics table.
+    display(Markdown(f"{_MARKDOWN_H3} Row-based Metrics"))
+    display(metrics_table)
+
+
+def display_explanations(
+    eval_result: "eval_base.EvalResult",
+    num: int = 1,
+    metrics: Optional[List[str]] = None,
+) -> None:
+    """Displays the explanations in a notebook using IPython.display.
+
+    Args:
+      eval_result: An object containing evaluation results. It is expected to
+        have attributes `summary_metrics` and `metrics_table`.
+      num: The number of row samples to display. Defaults to 1. If the number of
+        rows is less than `num`, all rows will be displayed.
+      metrics: A list of metric name substrings to filter displayed columns. If
+        provided, only metrics whose names contain any of these strings will be
+        displayed.
+    """
+    if not is_ipython_available():
+        return
+    # pylint: disable=g-import-not-at-top, g-importing-member
+    from IPython.display import display
+    from IPython.display import HTML
+
+    style = "white-space: pre-wrap; width: 1500px; overflow-x: auto;"
+    metrics_table = eval_result.metrics_table
+
+    if num < 1:
+        raise ValueError("Num must be greater than 0.")
+    num = min(num, len(metrics_table))
+
+    df = metrics_table.sample(n=num)
+
+    if metrics:
+        columns_to_keep = metrics + _DEFAULT_COLUMNS_TO_DISPLAY
+        df = _filter_df(df, columns_to_keep)
+
+    for _, row in df.iterrows():
+        for col in df.columns:
+            display(HTML(f"<div style='{style}'><h4>{col}:</h4>{row[col]}</div>"))
+        display(HTML("<hr>"))
+
+
+def display_radar_plot(
+    eval_results_with_title: List[Tuple[str, "eval_base.EvalResult"]],
+    metrics: List[str],
+    radar_range: Tuple[float, float] = _DEFAULT_RADAR_RANGE,
+) -> None:
+    """Plots a radar plot comparing evaluation results.
+
+    Args:
+      eval_results_with_title: List of (title, eval_result) tuples.
+      metrics: A list of metrics whose mean values will be plotted.
+      radar_range: Range of the radar plot axes.
+    """
+    # pylint: disable=g-import-not-at-top
+    try:
+        import plotly.graph_objects as go
+    except ImportError as exc:
+        raise ImportError(
+            '`plotly` is not installed. Please install using "!pip install plotly"'
+        ) from exc
+
+    fig = go.Figure()
+    for title, eval_result in eval_results_with_title:
+        summary_metrics = eval_result.summary_metrics
+        if metrics:
+            summary_metrics = {
+                key.replace("/mean", ""): summary_metrics[key]
+                for key in summary_metrics
+                if any(selected_metric + "/mean" in key for selected_metric in metrics)
+            }
+        fig.add_trace(
+            go.Scatterpolar(
+                r=list(summary_metrics.values()),
+                theta=list(summary_metrics.keys()),
+                fill="toself",
+                name=title,
+            )
+        )
+    fig.update_layout(
+        polar=dict(radialaxis=dict(visible=True, range=radar_range)),
+        showlegend=True,
+    )
+    fig.show()
+
+
+def display_bar_plot(
+    eval_results_with_title: List[Tuple[str, "eval_base.EvalResult"]],
+    metrics: List[str],
+) -> None:
+    """Plots a bar plot comparing evaluation results.
+
+    Args:
+      eval_results_with_title: List of (title, eval_result) tuples.
+      metrics: A list of metrics whose mean values will be plotted.
+    """
+
+    # pylint: disable=g-import-not-at-top
+    try:
+        import plotly.graph_objects as go
+    except ImportError as exc:
+        raise ImportError(
+            '`plotly` is not installed. Please install using "!pip install plotly"'
+        ) from exc
+
+    data = []
+
+    for title, eval_result in eval_results_with_title:
+        summary_metrics = eval_result.summary_metrics
+        mean_summary_metrics = [f"{metric}/mean" for metric in metrics]
+        updated_summary_metrics = []
+        if metrics:
+            for k, v in summary_metrics.items():
+                if k in mean_summary_metrics:
+                    updated_summary_metrics.append((k, v))
+            summary_metrics = dict(updated_summary_metrics)
+
+        data.append(
+            go.Bar(
+                x=list(summary_metrics.keys()),
+                y=list(summary_metrics.values()),
+                name=title,
+            )
+        )
+
+    fig = go.Figure(data=data)
+
+    fig.update_layout(barmode="group", showlegend=True)
+    fig.show()
+
+
+def generate_uuid(length: int = 8) -> str:
+    """Generates a uuid of a specified length (default=8)."""
+    return "".join(random.choices(string.ascii_lowercase + string.digits, k=length))
