Skip to content

Commit 9ad8150

Browse files
authored
ENH: add validate as a param to join (#46622) (#46740)
1 parent 8f00f59 commit 9ad8150

File tree

3 files changed

+170
-4
lines changed

3 files changed

+170
-4
lines changed

doc/source/whatsnew/v1.5.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ Other enhancements
9595
- :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
9696
- :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
9797
- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
98+
- Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
9899
- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
99100
- Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`)
100101

pandas/core/frame.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9385,6 +9385,7 @@ def join(
93859385
lsuffix: str = "",
93869386
rsuffix: str = "",
93879387
sort: bool = False,
9388+
validate: str | None = None,
93889389
) -> DataFrame:
93899390
"""
93909391
Join columns of another DataFrame.
@@ -9428,6 +9429,14 @@ def join(
94289429
sort : bool, default False
94299430
Order result DataFrame lexicographically by the join key. If False,
94309431
the order of the join key depends on the join type (how keyword).
9432+
validate : str, optional
9433+
If specified, checks if join is of specified type.
9434+
* "one_to_one" or "1:1": check if join keys are unique in both left
9435+
and right datasets.
9436+
* "one_to_many" or "1:m": check if join keys are unique in left dataset.
9437+
* "many_to_one" or "m:1": check if join keys are unique in right dataset.
9438+
* "many_to_many" or "m:m": allowed, but does not result in checks.
9439+
.. versionadded:: 1.5.0
94319440
94329441
Returns
94339442
-------
@@ -9522,7 +9531,7 @@ def join(
95229531
4 K0 A4
95239532
5 K1 A5
95249533
9525-
>>> df.join(other.set_index('key'), on='key')
9534+
>>> df.join(other.set_index('key'), on='key', validate='m:1')
95269535
key A B
95279536
0 K0 A0 B0
95289537
1 K1 A1 B1
@@ -9532,7 +9541,13 @@ def join(
95329541
5 K1 A5 B1
95339542
"""
95349543
return self._join_compat(
9535-
other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
9544+
other,
9545+
on=on,
9546+
how=how,
9547+
lsuffix=lsuffix,
9548+
rsuffix=rsuffix,
9549+
sort=sort,
9550+
validate=validate,
95369551
)
95379552

95389553
def _join_compat(
@@ -9543,6 +9558,7 @@ def _join_compat(
95439558
lsuffix: str = "",
95449559
rsuffix: str = "",
95459560
sort: bool = False,
9561+
validate: str | None = None,
95469562
):
95479563
from pandas.core.reshape.concat import concat
95489564
from pandas.core.reshape.merge import merge
@@ -9561,6 +9577,7 @@ def _join_compat(
95619577
on=on,
95629578
suffixes=(lsuffix, rsuffix),
95639579
sort=sort,
9580+
validate=validate,
95649581
)
95659582
return merge(
95669583
self,
@@ -9571,6 +9588,7 @@ def _join_compat(
95719588
right_index=True,
95729589
suffixes=(lsuffix, rsuffix),
95739590
sort=sort,
9591+
validate=validate,
95749592
)
95759593
else:
95769594
if on is not None:
@@ -9603,7 +9621,12 @@ def _join_compat(
96039621

96049622
for frame in frames[1:]:
96059623
joined = merge(
9606-
joined, frame, how=how, left_index=True, right_index=True
9624+
joined,
9625+
frame,
9626+
how=how,
9627+
left_index=True,
9628+
right_index=True,
9629+
validate=validate,
96079630
)
96089631

96099632
return joined

pandas/tests/frame/methods/test_join.py

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import numpy as np
44
import pytest
55

6+
from pandas.errors import MergeError
7+
68
import pandas as pd
79
from pandas import (
810
DataFrame,
@@ -12,6 +14,7 @@
1214
period_range,
1315
)
1416
import pandas._testing as tm
17+
from pandas.core.reshape.concat import concat
1518

1619

1720
@pytest.fixture
@@ -33,6 +36,39 @@ def right():
3336
return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
3437

3538

39+
@pytest.fixture
def left_no_dup():
    """Left frame whose join column ``a`` holds four distinct keys."""
    data = {
        "a": ["a", "b", "c", "d"],
        "b": ["cat", "dog", "weasel", "horse"],
    }
    return DataFrame(data, index=range(4))
45+
46+
47+
@pytest.fixture
def right_no_dup():
    """Right frame indexed by ``a``; every key appears exactly once."""
    frame = DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
        },
        index=range(5),
    )
    return frame.set_index("a")
56+
57+
58+
@pytest.fixture
def left_w_dups(left_no_dup):
    """``left_no_dup`` plus one extra row that repeats the key ``"a"``."""
    extra_row = DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])
    return concat([left_no_dup, extra_row], sort=True)
63+
64+
65+
@pytest.fixture
def right_w_dups(right_no_dup):
    """Right frame indexed by ``a`` in which the key ``"e"`` occurs twice.

    ``right_no_dup`` already carries ``a`` as its index, so it is moved back
    into a column before concatenating. Concatenating the indexed frame
    directly would leave ``a`` as NaN for every original row, and the
    subsequent ``set_index("a")`` would then yield duplicated NaN keys
    rather than the intended duplicate ``"e"``.
    """
    extra_row = DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])
    return concat([right_no_dup.reset_index(), extra_row]).set_index("a")
70+
71+
3672
@pytest.mark.parametrize(
3773
"how, sort, expected",
3874
[
@@ -78,7 +114,7 @@ def right():
78114
)
79115
def test_join(left, right, how, sort, expected):
80116

81-
result = left.join(right, how=how, sort=sort)
117+
result = left.join(right, how=how, sort=sort, validate="1:1")
82118
tm.assert_frame_equal(result, expected)
83119

84120

@@ -104,6 +140,112 @@ def test_suffix_on_list_join():
104140
tm.assert_frame_equal(arr_joined, norm_joined)
105141

106142

143+
def test_join_invalid_validate(left_no_dup, right_no_dup):
    # GH 46622
    # An unrecognised ``validate`` value must be rejected by ``join`` itself;
    # calling ``merge`` here would leave the new ``join`` keyword untested.
    msg = "Not a valid argument for validate"
    with pytest.raises(ValueError, match=msg):
        left_no_dup.join(right_no_dup, on="a", validate="invalid")
149+
150+
151+
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups):
    # GH 46622
    # Left keys are unique, so one_to_many accepts the repeated right keys
    left_no_dup.join(right_w_dups, on="a", validate="one_to_many")

    # one_to_one also requires unique right keys, so the same join must fail
    expected_msg = (
        "Merge keys are not unique in right dataset; not a one-to-one merge"
    )
    with pytest.raises(MergeError, match=expected_msg):
        left_no_dup.join(right_w_dups, on="a", validate="one_to_one")
168+
169+
170+
def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup):
    # GH 46622
    # Right keys are unique, so many_to_one accepts the repeated left keys
    left_w_dups.join(right_no_dup, on="a", validate="many_to_one")

    # one_to_one also requires unique left keys, so the same join must fail
    expected_msg = (
        "Merge keys are not unique in left dataset; not a one-to-one merge"
    )
    with pytest.raises(MergeError, match=expected_msg):
        left_w_dups.join(right_no_dup, on="a", validate="one_to_one")
187+
188+
189+
def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups):
    # GH 46622
    # many_to_many triggers no uniqueness check, so duplicates on both sides pass
    left_w_dups.join(right_w_dups, on="a", validate="many_to_many")

    # Each stricter constraint must fail, naming the side it requires to be
    # unique: many_to_one checks the right keys, one_to_many the left keys.
    failing_cases = [
        (
            "many_to_one",
            "Merge keys are not unique in right dataset; not a many-to-one merge",
        ),
        (
            "one_to_many",
            "Merge keys are not unique in left dataset; not a one-to-many merge",
        ),
    ]
    for constraint, msg in failing_cases:
        with pytest.raises(MergeError, match=msg):
            left_w_dups.join(right_w_dups, on="a", validate=constraint)
211+
212+
213+
def test_join_on_multi_col_check_dup():
    # GH 46622
    # Neither "a" nor "b" is unique on its own, but every (a, b) pair is
    key_cols = ["a", "b"]

    left_data = {
        "a": ["a", "a", "b", "b"],
        "b": [0, 1, 0, 1],
        "c": ["cat", "dog", "weasel", "horse"],
    }
    left = DataFrame(left_data, index=range(4)).set_index(key_cols)

    right_data = {
        "a": ["a", "a", "b"],
        "b": [0, 1, 0],
        "d": ["meow", "bark", "um... weasel noise?"],
    }
    right = DataFrame(right_data, index=range(3)).set_index(key_cols)

    expected_data = {
        "a": ["a", "a", "b"],
        "b": [0, 1, 0],
        "c": ["cat", "dog", "weasel"],
        "d": ["meow", "bark", "um... weasel noise?"],
    }
    expected = DataFrame(expected_data, index=range(3)).set_index(key_cols)

    # The composite key is unique on both sides, so the 1:1 check must pass
    result = left.join(right, how="inner", validate="1:1")
    tm.assert_frame_equal(result, expected)
247+
248+
107249
def test_join_index(float_frame):
108250
# left / right
109251

0 commit comments

Comments
 (0)