
Commit 1ac0e1e

deps: remove jellyfish dependency (#1604)
* deps: remove jellyfish dependency
* add unit tests
1 parent f1cff16 commit 1ac0e1e

File tree

9 files changed: +265 -10 lines changed


bigframes/_config/bigquery_options.py

+2 -2

@@ -21,7 +21,6 @@
 
 import google.api_core.exceptions
 import google.auth.credentials
-import jellyfish
 
 import bigframes.constants
 import bigframes.enums
@@ -37,6 +36,7 @@
 
 
 def _get_validated_location(value: Optional[str]) -> Optional[str]:
+    import bigframes._tools.strings
 
     if value is None or value in bigframes.constants.ALL_BIGQUERY_LOCATIONS:
        return value
@@ -53,7 +53,7 @@ def _get_validated_location(value: Optional[str]) -> Optional[str]:
 
     possibility = min(
         bigframes.constants.ALL_BIGQUERY_LOCATIONS,
-        key=lambda item: jellyfish.levenshtein_distance(location, item),
+        key=lambda item: bigframes._tools.strings.levenshtein_distance(location, item),
     )
     # There are many layers before we get to (possibly) the user's code:
     # -> bpd.options.bigquery.location = "us-central-1"
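
The hunk above is the heart of this commit: when a location is not recognized, the closest known value by edit distance powers the "Did you mean ...?" hint, now computed by the new in-repo helper instead of jellyfish. A minimal sketch of the same pattern, using a made-up subset of locations for illustration:

import bigframes._tools.strings

# Hypothetical subset of locations, for illustration only; the real code uses
# bigframes.constants.ALL_BIGQUERY_LOCATIONS.
known_locations = {"us-central1", "us-east1", "europe-west1", "asia-northeast1"}

typo = "us-central-1"  # the example typo from the comment in the diff
suggestion = min(
    known_locations,
    key=lambda item: bigframes._tools.strings.levenshtein_distance(typo, item),
)
print(f"Unknown location {typo!r}. Did you mean {suggestion!r}?")  # suggests 'us-central1'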

bigframes/_tools/__init__.py

+19

@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""_tools is a collection of helper functions with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""

bigframes/_tools/strings.py

+66

@@ -0,0 +1,66 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper methods for processing strings with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
+
+import numpy
+
+
+def levenshtein_distance(left: str, right: str) -> int:
+    """Compute the edit distance between two strings.
+
+    This is the minimum number of substitutions, insertions, deletions
+    to get from left string to right string. See:
+    https://en.wikipedia.org/wiki/Levenshtein_distance
+    """
+    # TODO(tswast): accelerate with numba (if available) if we end up using this
+    # function in contexts other than when raising an exception or there are too
+    # many values to compare even in that context.
+
+    distances0 = numpy.zeros(len(right) + 1)
+    distances1 = numpy.zeros(len(right) + 1)
+
+    # Maximum distance is to drop all characters and then add the other string.
+    distances0[:] = range(len(right) + 1)
+
+    for left_index in range(len(left)):
+        # Calculate distance from distances0 to distances1.
+
+        # Edit distance is to delete (i + 1) chars from left to match empty right
+        distances1[0] = left_index + 1
+        # "ab"
+        for right_index in range(len(right)):
+            left_char = left[left_index]
+            right_char = right[right_index]
+
+            deletion_cost = distances0[right_index + 1] + 1
+            insertion_cost = distances1[right_index] + 1
+            if left_char == right_char:
+                substitution_cost = distances0[right_index]
+            else:
+                substitution_cost = distances0[right_index] + 1
+
+            distances1[right_index + 1] = min(
+                deletion_cost, insertion_cost, substitution_cost
+            )
+
+        temp = distances0
+        distances0 = distances1
+        distances1 = temp
+
+    return distances0[len(right)]
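
levenshtein_distance is a standard two-row dynamic-programming implementation: only the previous row (distances0) and the current row (distances1) of the edit-distance matrix are kept. A quick sanity check, mirroring a few of the expected values from the unit tests added below (the result is a numpy scalar, since the rows are numpy arrays, but it compares equal to the expected int):

from bigframes._tools import strings

assert strings.levenshtein_distance("", "") == 0
assert strings.levenshtein_distance("abc", "aBc") == 1      # one substitution
assert strings.levenshtein_distance("abcxyz", "abc") == 3   # three deletions
assert strings.levenshtein_distance("abc", "xyzabc") == 3   # three insertions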

bigframes/core/groupby/dataframe_group_by.py

+3 -2

@@ -19,7 +19,6 @@
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby
-import jellyfish
 import pandas as pd
 
 from bigframes import session
@@ -87,6 +86,8 @@ def __getitem__(
             typing.Sequence[blocks.Label],
         ],
    ):
+        import bigframes._tools.strings
+
        if utils.is_list_like(key):
            keys = list(key)
        else:
@@ -101,7 +102,7 @@
             possible_key.append(
                 min(
                     self._block.column_labels,
-                    key=lambda item: jellyfish.damerau_levenshtein_distance(
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
                         bad_key, item
                     ),
                 )
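
Worth noting: this call site previously used jellyfish.damerau_levenshtein_distance, while the in-repo helper implements plain Levenshtein distance, so a transposition of two adjacent characters now counts as two edits rather than one. In principle this can change which column label gets suggested. For example:

from bigframes._tools import strings

# Damerau-Levenshtein would score a swapped pair as 1 (one transposition);
# plain Levenshtein scores it as 2 (e.g. one deletion plus one insertion).
assert strings.levenshtein_distance("ab", "ba") == 2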

bigframes/session/loader.py

+7 -3

@@ -33,7 +33,6 @@
 import google.cloud.bigquery_storage_v1
 import google.cloud.functions_v2
 import google.cloud.resourcemanager_v3
-import jellyfish
 import pandas
 import pandas_gbq.schema.pandas_to_bigquery  # type: ignore
 
@@ -296,6 +295,7 @@ def read_gbq_table(
         filters: third_party_pandas_gbq.FiltersType = (),
         enable_snapshot: bool = True,
     ) -> dataframe.DataFrame:
+        import bigframes._tools.strings
         import bigframes.dataframe as dataframe
 
         # ---------------------------------
@@ -336,7 +336,9 @@
             if key not in table_column_names:
                 possibility = min(
                     table_column_names,
-                    key=lambda item: jellyfish.levenshtein_distance(key, item),
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
+                        key, item
+                    ),
                 )
                 raise ValueError(
                     f"Column '{key}' of `columns` not found in this table. Did you mean '{possibility}'?"
@@ -354,7 +356,9 @@
             if key not in table_column_names:
                 possibility = min(
                     table_column_names,
-                    key=lambda item: jellyfish.levenshtein_distance(key, item),
+                    key=lambda item: bigframes._tools.strings.levenshtein_distance(
+                        key, item
+                    ),
                 )
                 raise ValueError(
                     f"Column '{key}' of `index_col` not found in this table. Did you mean '{possibility}'?"

setup.py

-2

@@ -47,8 +47,6 @@
     "google-cloud-iam >=2.12.1",
     "google-cloud-resource-manager >=1.10.3",
     "google-cloud-storage >=2.0.0",
-    # Upper bound due to no windows build for 1.1.2
-    "jellyfish >=0.8.9,<1.1.2",
     "numpy >=1.24.0",
     "pandas >=1.5.3",
     "pandas-gbq >=0.26.1",

testing/constraints-3.9.txt

-1

@@ -12,7 +12,6 @@ google-cloud-bigquery-connection==1.12.0
 google-cloud-iam==2.12.1
 google-cloud-resource-manager==1.10.3
 google-cloud-storage==2.0.0
-jellyfish==0.8.9
 numpy==1.24.0
 pandas==1.5.3
 pandas-gbq==0.26.1

tests/unit/_tools/__init__.py

+19

@@ -0,0 +1,19 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for helper methods for processing Python objects with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""

tests/unit/_tools/test_strings.py

+149

@@ -0,0 +1,149 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for helper methods for processing strings with minimal dependencies.
+
+Please keep the dependencies used in this subpackage to a minimum to avoid the
+risk of circular dependencies.
+"""
+
+import base64
+import random
+import sys
+import uuid
+
+import pytest
+
+from bigframes._tools import strings
+
+# To stress test some unicode comparisons.
+# https://stackoverflow.com/a/39682429/101923
+ALL_UNICODE_CHARS = "".join(chr(i) for i in range(32, 0x110000) if chr(i).isprintable())
+RANDOM_STRINGS = (
+    pytest.param(str(uuid.uuid4()), id="uuid4"),
+    pytest.param(hex(random.randint(0, sys.maxsize)), id="hex"),
+    pytest.param(
+        base64.b64encode(
+            "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(100)).encode(
+                "utf-8"
+            )
+        ).decode("utf-8"),
+        id="base64",
+    ),
+    pytest.param(
+        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(8)), id="unicode8"
+    ),
+    pytest.param(
+        "".join(random.choice(ALL_UNICODE_CHARS) for _ in range(64)), id="unicode64"
+    ),
+)
+
+
+def random_char_not_equal(avoid: str):
+    random_char = avoid
+    while random_char == avoid:
+        random_char = random.choice(ALL_UNICODE_CHARS)
+    return random_char
+
+
+def random_deletion(original: str):
+    """original string with one character removed"""
+    char_index = random.randrange(len(original))
+    return original[:char_index] + original[char_index + 1 :]
+
+
+def random_insertion(original: str):
+    char_index = random.randrange(len(original))
+    random_char = random.choice(ALL_UNICODE_CHARS)
+    return original[: char_index + 1] + random_char + original[char_index + 1 :]
+
+
+@pytest.mark.parametrize(
+    ("left", "right", "expected"),
+    (
+        ("", "", 0),
+        ("abc", "abc", 0),
+        # Deletions
+        ("abcxyz", "abc", 3),
+        ("xyzabc", "abc", 3),
+        ("AXYZBC", "ABC", 3),
+        ("AXYZBC", "XYZ", 3),
+        # Insertions
+        ("abc", "abcxyz", 3),
+        ("abc", "xyzabc", 3),
+        # Substitutions
+        ("abc", "aBc", 1),
+        ("abcxyz", "aBcXyZ", 3),
+        # Combinations
+        ("abcdefxyz", "abcExyzα", 4),
+    ),
+)
+def test_levenshtein_distance(left: str, right: str, expected: int):
+    assert strings.levenshtein_distance(left, right) == expected
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_equal_strings(random_string: str):
+    """Mini fuzz test with different strings."""
+    assert strings.levenshtein_distance(random_string, random_string) == 0
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_deletion(random_string: str):
+    """Mini fuzz test with different strings."""
+
+    num_deleted = random.randrange(1, min(10, len(random_string)))
+    assert 1 <= num_deleted < len(random_string)
+
+    deleted = random_string
+    for _ in range(num_deleted):
+        deleted = random_deletion(deleted)
+
+    assert deleted != random_string
+    assert len(deleted) == len(random_string) - num_deleted
+    assert strings.levenshtein_distance(random_string, deleted) == num_deleted
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_insertion(random_string: str):
+    """Mini fuzz test with different strings."""
+
+    num_inserted = random.randrange(1, min(10, len(random_string)))
+    assert 1 <= num_inserted < len(random_string)
+
+    inserted = random_string
+    for _ in range(num_inserted):
+        inserted = random_insertion(inserted)
+
+    assert inserted != random_string
+    assert len(inserted) == len(random_string) + num_inserted
+    assert strings.levenshtein_distance(random_string, inserted) == num_inserted
+
+
+@pytest.mark.parametrize(("random_string",), RANDOM_STRINGS)
+def test_levenshtein_distance_random_substitution(random_string: str):
+    """Mini fuzz test with different strings.
+
+    Note: we don't do multiple substitutions here to avoid accidentally
+    substituting the same character twice.
+    """
+    char_index = random.randrange(len(random_string))
+    replaced_char = random_string[char_index]
+    random_char = random_char_not_equal(replaced_char)
+    substituted = (
+        random_string[:char_index] + random_char + random_string[char_index + 1 :]
+    )
+    assert substituted != random_string
+    assert len(substituted) == len(random_string)
+    assert strings.levenshtein_distance(random_string, substituted) == 1
