
Commit 1ac0e1e

deps: remove jellyfish dependency (#1604)
* deps: remove jellyfish dependency
* add unit tests
1 parent f1cff16 commit 1ac0e1e

File tree

9 files changed: +265 -10 lines changed


bigframes/_config/bigquery_options.py

+2 -2

@@ -21,7 +21,6 @@
 
 import google.api_core.exceptions
 import google.auth.credentials
-import jellyfish
 
 import bigframes.constants
 import bigframes.enums
@@ -37,6 +36,7 @@
 
 
 def _get_validated_location(value: Optional[str]) -> Optional[str]:
+    import bigframes._tools.strings
 
     if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS:
        return value
@@ -53,7 +53,7 @@ def _get_validated_location(value: Optional[str]) -> Optional[str]:
 
     possibility = min(
         bigframes.constants.ALL_BIGQUERY_LOCATIONS,
-        key=lambda item: jellyfish.levenshtein_distance(location, item),
+        key=lambda item: bigframes._tools.strings.levenshtein_distance(location, item),
     )
     # There are many layers before we get to (possibly) the user's code:
     # -> bpd.options.bigquery.location = "us-central-1"
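
The hunk above is the heart of this commit: when a location is not recognized, the closest known value by edit distance powers the "Did you mean ...?" hint, now computed by the new in-repo helper instead of jellyfish. A minimal sketch of the same pattern, using a made-up subset of locations for illustration:

import bigframes._tools.strings

# Hypothetical subset of locations, for illustration only; the real code uses
# bigframes.constants.ALL_BIGQUERY_LOCATIONS.
known_locations = {"us-central1", "us-east1", "europe-west1", "asia-northeast1"}

typo = "us-central-1"  # the example typo from the comment in the diff
suggestion = min(
    known_locations,
    key=lambda item: bigframes._tools.strings.levenshtein_distance(typo, item),
)
print(f"Unknown location {typo!r}. Did you mean {suggestion!r}?")  # suggests 'us-central1'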

bigframes/_tools/__init__.py

+19

@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""_tools is a collection of helper functions with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""

bigframes/_tools/strings.py

+66

@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper methods for processing strings with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
+
+import numpy
+
+
+def levenshtein_distance(left: str, right: str) -> int:
+    """Compute the edit distance between two strings.
+
+    This is the minimum number of substitutions, insertions, deletions
+    to get from left string to right string. See:
+    https://en.wikipedia.org/wiki/Levenshtein_distance
+    """
+    # TODO(tswast): accelerate with numba (if available) if we end up using this
+    # function in contexts other than when raising an exception or there are too
+    # many values to compare even in that context.
+
+    distances0 = numpy.zeros(len(right) + 1)
+    distances1 = numpy.zeros(len(right) + 1)
+
+    # Maximum distance is to drop all characters and then add the other string.
+    distances0[:] = range(len(right) + 1)
+
+    for left_index in range(len(left)):
+        # Calculate distance from distances0 to distances1.
+
+        # Edit distance is to delete (i + 1) chars from left to match empty right
+        distances1[0] = left_index + 1
+        # "ab"
+        for right_index in range(len(right)):
+            left_char = left[left_index]
+            right_char = right[right_index]
+
+            deletion_cost = distances0[right_index + 1] + 1
+            insertion_cost = distances1[right_index] + 1
+            if left_char == right_char:
+                substitution_cost = distances0[right_index]
+            else:
+                substitution_cost = distances0[right_index] + 1
+
+            distances1[right_index + 1] = min(
+                deletion_cost, insertion_cost, substitution_cost
+            )
+
+        temp = distances0
+        distances0 = distances1
+        distances1 = temp
+
+    return distances0[len(right)]
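
levenshtein_distance is a standard two-row dynamic-programming implementation: only the previous row (distances0) and the current row (distances1) of the edit-distance matrix are kept. A quick sanity check, mirroring a few of the expected values from the unit tests added below (the result is a numpy scalar, since the rows are numpy arrays, but it compares equal to the expected int):

from bigframes._tools import strings

assert strings.levenshtein_distance("", "") == 0
assert strings.levenshtein_distance("abc", "aBc") == 1      # one substitution
assert strings.levenshtein_distance("abcxyz", "abc") == 3   # three deletions
assert strings.levenshtein_distance("abc", "xyzabc") == 3   # three insertions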

bigframes/core/groupby/dataframe_group_by.py

+3 -2

@@ -19,7 +19,6 @@
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
-import jellyfish
 import pandas as pd
 
 from bigframes import session
@@ -87,6 +86,8 @@ def __getitem__(
             typing.Sequence[blocks.Label],
         ],
    ):
+        import bigframes._tools.strings
+
        if utils.is_list_like(key):
            keys = list(key)
        else:
@@ -101,7 +102,7 @@
             possible_key.append(
                 min(
                     self._block.column_labels,
-                    key=lambda item: jellyfish.damerau_levenshtein_distance(
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
                         bad_key, item
                     ),
                 )
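
Worth noting: this call site previously used jellyfish.damerau_levenshtein_distance, while the in-repo helper implements plain Levenshtein distance, so a transposition of two adjacent characters now counts as two edits rather than one. In principle this can change which column label gets suggested. For example:

from bigframes._tools import strings

# Damerau-Levenshtein would score a swapped pair as 1 (one transposition);
# plain Levenshtein scores it as 2 (e.g. one deletion plus one insertion).
assert strings.levenshtein_distance("ab", "ba") == 2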

bigframes/session/loader.py

+7 -3

@@ -33,7 +33,6 @@
 import google.cloud.bigquery_storage_v1
 import google.cloud.functions_v2
 import google.cloud.resourcemanager_v3
-import jellyfish
 import pandas
 import pandas_gbq.schema.pandas_to_bigquery  # type: ignore
 
@@ -296,6 +295,7 @@ def read_gbq_table(
         filters: third_party_pandas_gbq.FiltersType = (),
         enable_snapshot: bool = True,
     ) -> dataframe.DataFrame:
+        import bigframes._tools.strings
         import bigframes.dataframe as dataframe
 
         # ---------------------------------
@@ -336,7 +336,9 @@
             if key not in table_column_names:
                 possibility = min(
                     table_column_names,
-                    key=lambda item: jellyfish.levenshtein_distance(key, item),
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
+                        key, item
+                    ),
                 )
                 raise ValueError(
                     f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?"
@@ -354,7 +356,9 @@
             if key not in table_column_names:
                 possibility = min(
                     table_column_names,
-                    key=lambda item: jellyfish.levenshtein_distance(key, item),
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
+                        key, item
+                    ),
                 )
                 raise ValueError(
                     f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?"

setup.py

-2

@@ -47,8 +47,6 @@
     "google-cloud-iam >=2.12.1",
     "google-cloud-resource-manager >=1.10.3",
     "google-cloud-storage >=2.0.0",
-    # Upper bound due to no windows build for 1.1.2
-    "jellyfish >=0.8.9,<1.1.2",
     "numpy >=1.24.0",
     "pandas >=1.5.3",
     "pandas-gbq >=0.26.1",

testing/constraints-3.9.txt

-1

@@ -12,7 +12,6 @@ google-cloud-bigquery-connection==1.12.0
 google-cloud-iam==2.12.1
 google-cloud-resource-manager==1.10.3
 google-cloud-storage==2.0.0
-jellyfish==0.8.9
 numpy==1.24.0
 pandas==1.5.3
 pandas-gbq==0.26.1

tests/unit/_tools/__init__.py

+19

@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for helper methods for processing Python objects with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""

tests/unit/_tools/test_strings.py

+149

@@ -0,0 +1,149 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for helper methods for processing strings with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
+
+import base64
+import random
+import sys
+import uuid
+
+import pytest
+
+from bigframes._tools import strings
+
+# To stress test some unicode comparisons.
+# https://stackoverflow.com/a/39682429/101923
+ALL_UNICODE_CHARS = "".join(chr(i) for i in range(32, 0x110000) if chr(i).isprintable())
+RANDOM_STRINGS = (
+    pytest.param(str(uuid.uuid4()), id="uuid4"),
+    pytest.param(hex(random.randint(0, sys.maxsize)), id="hex"),
+    pytest.param(
+        base64.b64encode(
+            "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(100)).encode(
+                "utf-8"
+            )
+        ).decode("utf-8"),
+        id="base64",
+    ),
+    pytest.param(
+        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(8)), id="unicode8"
+    ),
+    pytest.param(
+        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(64)), id="unicode64"
+    ),
+)
+
+
+def random_char_not_equal(avoid: str):
+    random_char = avoid
+    while random_char == avoid:
+        random_char = random.choice(ALL_UNICODE_CHARS)
+    return random_char
+
+
+def random_deletion(original: str):
+    """original string with one character removed"""
+    char_index = random.randrange(len(original))
+    return original[:char_index] + original[char_index + 1 :]
+
+
+def random_insertion(original: str):
+    char_index = random.randrange(len(original))
+    random_char = random.choice(ALL_UNICODE_CHARS)
+    return original[: char_index + 1] + random_char + original[char_index + 1 :]
+
+
+@pytest.mark.parametrize(
+    ("left", "right", "expected"),
+    (
+        ("", "", 0),
+        ("abc", "abc", 0),
+        # Deletions
+        ("abcxyz", "abc", 3),
+        ("xyzabc", "abc", 3),
+        ("AXYZBC", "ABC", 3),
+        ("AXYZBC", "XYZ", 3),
+        # Insertions
+        ("abc", "abcxyz", 3),
+        ("abc", "xyzabc", 3),
+        # Substitutions
+        ("abc", "aBc", 1),
+        ("abcxyz", "aBcXyZ", 3),
+        # Combinations
+        ("abcdefxyz", "abcExyzα", 4),
+    ),
+)
+def test_levenshtein_distance(left: str, right: str, expected: int):
+    assert strings.levenshtein_distance(left, right) == expected
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_equal_strings(random_string: str):
+    """Mini fuzz test with different strings."""
+    assert strings.levenshtein_distance(random_string, random_string) == 0
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_deletion(random_string: str):
+    """Mini fuzz test with different strings."""
+
+    num_deleted = random.randrange(1, min(10, len(random_string)))
+    assert 1 <= num_deleted < len(random_string)
+
+    deleted = random_string
+    for _ in range(num_deleted):
+        deleted = random_deletion(deleted)
+
+    assert deleted != random_string
+    assert len(deleted) == len(random_string) - num_deleted
+    assert strings.levenshtein_distance(random_string, deleted) == num_deleted
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_insertion(random_string: str):
+    """Mini fuzz test with different strings."""
+
+    num_inserted = random.randrange(1, min(10, len(random_string)))
+    assert 1 <= num_inserted < len(random_string)
+
+    inserted = random_string
+    for _ in range(num_inserted):
+        inserted = random_insertion(inserted)
+
+    assert inserted != random_string
+    assert len(inserted) == len(random_string) + num_inserted
+    assert strings.levenshtein_distance(random_string, inserted) == num_inserted
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_substitution(random_string: str):
+    """Mini fuzz test with different strings.
+
+    Note: we don't do multiple substitutions here to avoid accidentally
+    substituting the same character twice.
+    """
+    char_index = random.randrange(len(random_string))
+    replaced_char = random_string[char_index]
+    random_char = random_char_not_equal(replaced_char)
+    substituted = (
+        random_string[:char_index] + random_char + random_string[char_index + 1 :]
+    )
+    assert substituted != random_string
+    assert len(substituted) == len(random_string)
+    assert strings.levenshtein_distance(random_string, substituted) == 1
