16
16
import functools
17
17
import itertools
18
18
import typing
19
- from typing import Optional , Sequence
19
+ from typing import Literal , Optional , Sequence
20
20
21
21
import bigframes_vendored .ibis
22
22
import bigframes_vendored .ibis .backends .bigquery .backend as ibis_bigquery
@@ -94,7 +94,7 @@ def to_sql(
94
94
return typing .cast (str , sql )
95
95
96
96
@property
97
- def columns (self ) -> typing . Tuple [ibis_types .Value , ...]:
97
+ def columns (self ) -> tuple [ibis_types .Value , ...]:
98
98
return self ._columns
99
99
100
100
@property
@@ -107,7 +107,7 @@ def _ibis_bindings(self) -> dict[str, ibis_types.Value]:
107
107
108
108
def projection (
109
109
self ,
110
- expression_id_pairs : typing . Tuple [ typing . Tuple [ex .Expression , str ], ...],
110
+ expression_id_pairs : tuple [ tuple [ex .Expression , str ], ...],
111
111
) -> UnorderedIR :
112
112
"""Apply an expression to the ArrayValue and assign the output to a column."""
113
113
cannot_inline = any (expr .expensive for expr , _ in expression_id_pairs )
@@ -126,7 +126,7 @@ def projection(
126
126
127
127
def selection (
128
128
self ,
129
- input_output_pairs : typing . Tuple [ typing . Tuple [ex .DerefOp , str ], ...],
129
+ input_output_pairs : tuple [ tuple [ex .DerefOp , str ], ...],
130
130
) -> UnorderedIR :
131
131
"""Apply an expression to the ArrayValue and assign the output to a column."""
132
132
bindings = {col : self ._get_ibis_column (col ) for col in self .column_ids }
@@ -203,7 +203,7 @@ def filter(self, predicate: ex.Expression) -> UnorderedIR:
203
203
204
204
def aggregate (
205
205
self ,
206
- aggregations : typing .Sequence [typing . Tuple [ex .Aggregation , str ]],
206
+ aggregations : typing .Sequence [tuple [ex .Aggregation , str ]],
207
207
by_column_ids : typing .Sequence [ex .DerefOp ] = (),
208
208
dropna : bool = True ,
209
209
order_by : typing .Sequence [OrderingExpression ] = (),
@@ -323,7 +323,105 @@ def from_pandas(
323
323
columns = columns ,
324
324
)
325
325
326
- ## Methods that only work with ordering
326
def join(
    self: UnorderedIR,
    right: UnorderedIR,
    conditions: tuple[tuple[str, str], ...],
    type: Literal["inner", "outer", "left", "right", "cross"],
    *,
    join_nulls: bool = True,
) -> UnorderedIR:
    """Join this expression (the left side) with another by column equality.

    Arguments:
        right: Expression for the right table to join.
        conditions: Pairs of (left column ID, right column ID) to compare
            for equality. These are column IDs, not labels.
        type: The type of join to perform. Note that this name shadows the
            builtin inside the method; it is kept for caller compatibility.
        join_nulls (bool):
            If True, NULL keys are joined to each other.
    Returns:
        The joined expression. The resulting columns are, in order, all
        the columns of this (left) expression followed by all the columns
        of ``right``.
    """
    # Shouldn't need to select the column ids explicitly, but it seems that ibis has some
    # bug resolving column ids otherwise, potentially because of the "JoinChain" op
    left_table = self._to_ibis_expr().select(self.column_ids)
    right_table = right._to_ibis_expr().select(right.column_ids)

    # One predicate per (left id, right id) pair.
    join_conditions = [
        _join_condition(
            left_table[left_index], right_table[right_index], nullsafe=join_nulls
        )
        for left_index, right_index in conditions
    ]

    combined_table = bigframes_vendored.ibis.join(
        left_table,
        right_table,
        predicates=join_conditions,
        how=type,  # type: ignore
    )
    # Re-bind every output column against the joined table: left columns
    # first, then right columns.
    columns = [
        combined_table[col.get_name()]
        for col in itertools.chain(self.columns, right.columns)
    ]
    return UnorderedIR(
        combined_table,
        columns=columns,
    )
374
+
375
def isin_join(
    self: UnorderedIR,
    right: UnorderedIR,
    indicator_col: str,
    conditions: tuple[str, str],
    *,
    join_nulls: bool = True,
) -> UnorderedIR:
    """Append a boolean column telling whether a left key appears in ``right``.

    Arguments:
        right: Expression whose key column is searched.
        indicator_col: Name given to the new boolean column.
        conditions: A single (left column ID, right column ID) pair to compare.
        join_nulls (bool):
            If True, the membership test is built from the null-safe join
            condition; otherwise a plain ``isin`` is used.
    Returns:
        An expression over the left table holding all of this expression's
        columns followed by the indicator column.
    """
    left_table = self._to_ibis_expr()
    right_table = right._to_ibis_expr()

    left_key = left_table[conditions[0]]
    right_key = right_table[conditions[1]]

    if not join_nulls:
        # Can do simpler "in" subquery
        membership = left_key.isin(right_key)
    else:
        # nullsafe isin join must actually use "exists" subquery
        membership = _join_condition(left_key, right_key, nullsafe=True).any()
    indicator = membership.name(indicator_col)

    columns = (
        *(left_table[col.get_name()] for col in self.columns),
        indicator,
    )
    return UnorderedIR(left_table, columns=columns)
424
+
327
425
def project_window_op (
328
426
self ,
329
427
expression : ex .Aggregation ,
@@ -429,7 +527,7 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec):
429
527
group_by : typing .List [ibis_types .Value ] = (
430
528
[
431
529
typing .cast (
432
- ibis_types .Column , _as_identity (self ._compile_expression (column ))
530
+ ibis_types .Column , _as_groupable (self ._compile_expression (column ))
433
531
)
434
532
for column in window_spec .grouping_keys
435
533
]
@@ -514,7 +612,68 @@ def _convert_ordering_to_table_values(
514
612
return ordering_values
515
613
516
614
517
- def _as_identity (value : ibis_types .Value ):
615
def _string_cast_join_cond(
    lvalue: ibis_types.Column, rvalue: ibis_types.Column
) -> ibis_types.BooleanColumn:
    """Build a null-safe equality predicate by comparing both keys as strings.

    Each side is cast to string and compared twice, once with NULL replaced
    by "0" and once with NULL replaced by "1". A non-null value can equal at
    most one of the two sentinels, so requiring both comparisons to hold
    means a NULL only matches another NULL.
    """
    left_text = lvalue.cast(ibis_dtypes.str)
    right_text = rvalue.cast(ibis_dtypes.str)

    zero_match, one_match = (
        left_text.fill_null(ibis_types.literal(sentinel))
        == right_text.fill_null(ibis_types.literal(sentinel))
        for sentinel in ("0", "1")
    )
    return typing.cast(ibis_types.BooleanColumn, zero_match & one_match)
626
+
627
+
628
def _numeric_join_cond(
    lvalue: ibis_types.Column, rvalue: ibis_types.Column
) -> ibis_types.BooleanColumn:
    """Build a null-safe (and, for floats, NaN-safe) numeric equality predicate.

    Both sides are compared twice under different sentinel substitutions:
    NULL becomes 0 in the first comparison and 1 in the second. When both
    columns are floating point, NaN additionally becomes 2 and 3
    respectively. Both comparisons must hold for the keys to match.
    """
    both_floating = lvalue.type().is_floating() and rvalue.type().is_floating()

    def _substitute(column: ibis_types.Column, null_sentinel: int, nan_sentinel: int):
        # Replace NULL first; the NaN check below is made on the raw column.
        filled = column.fill_null(ibis_types.literal(null_sentinel))
        if not both_floating:
            return filled
        # NaN aren't equal so need to coalesce as well with diff constants
        return (
            typing.cast(ibis_types.FloatingColumn, column)
            .isnan()
            .ifelse(ibis_types.literal(nan_sentinel), filled)
        )

    first_match = _substitute(lvalue, 0, 2) == _substitute(rvalue, 0, 2)
    second_match = _substitute(lvalue, 1, 3) == _substitute(rvalue, 1, 3)
    return typing.cast(ibis_types.BooleanColumn, first_match & second_match)
659
+
660
+
661
def _join_condition(
    lvalue: ibis_types.Column, rvalue: ibis_types.Column, nullsafe: bool
) -> ibis_types.BooleanColumn:
    """Pick the equality predicate used to match two join key columns.

    Arguments:
        lvalue: Key column from the left side.
        rvalue: Key column from the right side.
        nullsafe: If True, build a condition under which NULL keys match
            each other.
    Returns:
        A boolean column expression comparing ``lvalue`` with ``rvalue``.
    """
    left_type = lvalue.type()
    right_type = rvalue.type()
    # Both sides must be inspected: checking only the left type would route
    # mixed-type key pairs to the wrong condition builder.
    if left_type.is_floating() and right_type.is_floating():
        # Need to always make safe join condition to handle nan, even if no nulls
        return _numeric_join_cond(lvalue, rvalue)
    if nullsafe:
        # TODO: Define more coalesce constants for non-numeric types to avoid cast
        if left_type.is_numeric() and right_type.is_numeric():
            return _numeric_join_cond(lvalue, rvalue)
        return _string_cast_join_cond(lvalue, rvalue)
    return typing.cast(ibis_types.BooleanColumn, lvalue == rvalue)
674
+
675
+
676
+ def _as_groupable (value : ibis_types .Value ):
518
677
# Some types need to be converted to string to enable groupby
519
678
if value .type ().is_float64 () or value .type ().is_geospatial ():
520
679
return value .cast (ibis_dtypes .str )
0 commit comments