20
20
import functools
21
21
import itertools
22
22
import typing
23
- from typing import Callable , Iterable , Optional , Sequence , Tuple
23
+ from typing import Callable , cast , Iterable , Optional , Sequence , Tuple
24
24
25
25
import google .cloud .bigquery as bq
26
26
30
30
import bigframes .core .identifiers as bfet_ids
31
31
from bigframes .core .ordering import OrderingExpression
32
32
import bigframes .core .schema as schemata
33
+ import bigframes .core .slices as slices
33
34
import bigframes .core .window_spec as window
34
35
import bigframes .dtypes
35
36
import bigframes .operations .aggregations as agg_ops
@@ -82,6 +83,11 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]:
82
83
"""Direct children of this node"""
83
84
return tuple ([])
84
85
86
+ @property
87
+ @abc .abstractmethod
88
+ def row_count (self ) -> typing .Optional [int ]:
89
+ return None
90
+
85
91
@functools .cached_property
86
92
def session (self ):
87
93
sessions = []
@@ -304,6 +310,26 @@ def variables_introduced(self) -> int:
304
310
def relation_ops_created (self ) -> int :
305
311
return 2
306
312
313
+ @property
314
+ def is_limit (self ) -> bool :
315
+ """Returns whether this is equivalent to a ORDER BY ... LIMIT N."""
316
+ # TODO: Handle tail case.
317
+ return (
318
+ (not self .start )
319
+ and (self .step == 1 )
320
+ and (self .stop is not None )
321
+ and (self .stop > 0 )
322
+ )
323
+
324
+ @property
325
+ def row_count (self ) -> typing .Optional [int ]:
326
+ child_length = self .child .row_count
327
+ if child_length is None :
328
+ return None
329
+ return slices .slice_output_rows (
330
+ (self .start , self .stop , self .step ), child_length
331
+ )
332
+
307
333
308
334
@dataclass (frozen = True , eq = False )
309
335
class JoinNode (BigFrameNode ):
@@ -351,6 +377,15 @@ def variables_introduced(self) -> int:
351
377
def joins (self ) -> bool :
352
378
return True
353
379
380
+ @property
381
+ def row_count (self ) -> Optional [int ]:
382
+ if self .type == "cross" :
383
+ if self .left_child .row_count is None or self .right_child .row_count is None :
384
+ return None
385
+ return self .left_child .row_count * self .right_child .row_count
386
+
387
+ return None
388
+
354
389
def transform_children (
355
390
self , t : Callable [[BigFrameNode ], BigFrameNode ]
356
391
) -> BigFrameNode :
@@ -412,6 +447,16 @@ def variables_introduced(self) -> int:
412
447
"""Defines the number of variables generated by the current node. Used to estimate query planning complexity."""
413
448
return len (self .schema .items ) + OVERHEAD_VARIABLES
414
449
450
+ @property
451
+ def row_count (self ) -> Optional [int ]:
452
+ sub_counts = [node .row_count for node in self .child_nodes ]
453
+ total = 0
454
+ for count in sub_counts :
455
+ if count is None :
456
+ return None
457
+ total += count
458
+ return total
459
+
415
460
def transform_children (
416
461
self , t : Callable [[BigFrameNode ], BigFrameNode ]
417
462
) -> BigFrameNode :
@@ -460,6 +505,10 @@ def variables_introduced(self) -> int:
460
505
"""Defines the number of variables generated by the current node. Used to estimate query planning complexity."""
461
506
return len (self .schema .items ) + OVERHEAD_VARIABLES
462
507
508
+ @property
509
+ def row_count (self ) -> Optional [int ]:
510
+ return None
511
+
463
512
def transform_children (
464
513
self , t : Callable [[BigFrameNode ], BigFrameNode ]
465
514
) -> BigFrameNode :
@@ -484,19 +533,18 @@ def roots(self) -> typing.Set[BigFrameNode]:
484
533
return {self }
485
534
486
535
@property
487
- def supports_fast_head (self ) -> bool :
536
+ def fast_offsets (self ) -> bool :
537
+ return False
538
+
539
+ @property
540
+ def fast_ordered_limit (self ) -> bool :
488
541
return False
489
542
490
543
def transform_children (
491
544
self , t : Callable [[BigFrameNode ], BigFrameNode ]
492
545
) -> BigFrameNode :
493
546
return self
494
547
495
- @property
496
- def row_count (self ) -> typing .Optional [int ]:
497
- """How many rows are in the data source. None means unknown."""
498
- return None
499
-
500
548
501
549
class ScanItem (typing .NamedTuple ):
502
550
id : bfet_ids .ColumnId
@@ -528,7 +576,11 @@ def variables_introduced(self) -> int:
528
576
return len (self .scan_list .items ) + 1
529
577
530
578
@property
531
- def supports_fast_head (self ) -> bool :
579
+ def fast_offsets (self ) -> bool :
580
+ return True
581
+
582
+ @property
583
+ def fast_ordered_limit (self ) -> bool :
532
584
return True
533
585
534
586
@property
@@ -635,12 +687,27 @@ def relation_ops_created(self) -> int:
635
687
return 3
636
688
637
689
@property
638
- def supports_fast_head (self ) -> bool :
639
- # Fast head is only supported when row offsets are available.
640
- # In the future, ORDER BY+LIMIT optimizations may allow fast head when
641
- # clustered and/or partitioned on ordering key
690
+ def fast_offsets (self ) -> bool :
691
+ # Fast head is only supported when row offsets are available or data is clustered over ordering key.
642
692
return (self .source .ordering is not None ) and self .source .ordering .is_sequential
643
693
694
+ @property
695
+ def fast_ordered_limit (self ) -> bool :
696
+ if self .source .ordering is None :
697
+ return False
698
+ order_cols = self .source .ordering .all_ordering_columns
699
+ # monotonicity would probably be fine
700
+ if not all (col .scalar_expression .is_identity for col in order_cols ):
701
+ return False
702
+ order_col_ids = tuple (
703
+ cast (ex .DerefOp , col .scalar_expression ).id .name for col in order_cols
704
+ )
705
+ cluster_col_ids = self .source .table .cluster_cols
706
+ if cluster_col_ids is None :
707
+ return False
708
+
709
+ return order_col_ids == cluster_col_ids [: len (order_col_ids )]
710
+
644
711
@property
645
712
def order_ambiguous (self ) -> bool :
646
713
return (
@@ -706,6 +773,10 @@ def relation_ops_created(self) -> int:
706
773
def variables_introduced (self ) -> int :
707
774
return 1
708
775
776
+ @property
777
+ def row_count (self ) -> Optional [int ]:
778
+ return self .child .row_count
779
+
709
780
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
710
781
if self .col_id not in used_cols :
711
782
return self .child .prune (used_cols )
@@ -726,6 +797,10 @@ def row_preserving(self) -> bool:
726
797
def variables_introduced (self ) -> int :
727
798
return 1
728
799
800
+ @property
801
+ def row_count (self ) -> Optional [int ]:
802
+ return None
803
+
729
804
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
730
805
consumed_ids = used_cols .union (self .predicate .column_references )
731
806
pruned_child = self .child .prune (consumed_ids )
@@ -749,6 +824,10 @@ def relation_ops_created(self) -> int:
749
824
def explicitly_ordered (self ) -> bool :
750
825
return True
751
826
827
+ @property
828
+ def row_count (self ) -> Optional [int ]:
829
+ return self .child .row_count
830
+
752
831
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
753
832
ordering_cols = itertools .chain .from_iterable (
754
833
map (lambda x : x .referenced_columns , self .by )
@@ -772,6 +851,10 @@ def relation_ops_created(self) -> int:
772
851
# Doesnt directly create any relational operations
773
852
return 0
774
853
854
+ @property
855
+ def row_count (self ) -> Optional [int ]:
856
+ return self .child .row_count
857
+
775
858
776
859
@dataclass (frozen = True , eq = False )
777
860
class SelectionNode (UnaryNode ):
@@ -798,6 +881,10 @@ def variables_introduced(self) -> int:
798
881
def defines_namespace (self ) -> bool :
799
882
return True
800
883
884
+ @property
885
+ def row_count (self ) -> Optional [int ]:
886
+ return self .child .row_count
887
+
801
888
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
802
889
pruned_selections = tuple (
803
890
select for select in self .input_output_pairs if select [1 ] in used_cols
@@ -842,6 +929,10 @@ def variables_introduced(self) -> int:
842
929
new_vars = sum (1 for i in self .assignments if not i [0 ].is_identity )
843
930
return new_vars
844
931
932
+ @property
933
+ def row_count (self ) -> Optional [int ]:
934
+ return self .child .row_count
935
+
845
936
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
846
937
pruned_assignments = tuple (i for i in self .assignments if i [1 ] in used_cols )
847
938
if len (pruned_assignments ) == 0 :
@@ -877,6 +968,10 @@ def variables_introduced(self) -> int:
877
968
def defines_namespace (self ) -> bool :
878
969
return True
879
970
971
+ @property
972
+ def row_count (self ) -> Optional [int ]:
973
+ return 1
974
+
880
975
881
976
@dataclass (frozen = True , eq = False )
882
977
class AggregateNode (UnaryNode ):
@@ -926,6 +1021,12 @@ def explicitly_ordered(self) -> bool:
926
1021
def defines_namespace (self ) -> bool :
927
1022
return True
928
1023
1024
+ @property
1025
+ def row_count (self ) -> Optional [int ]:
1026
+ if not self .by_column_ids :
1027
+ return 1
1028
+ return None
1029
+
929
1030
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
930
1031
by_ids = (ref .id for ref in self .by_column_ids )
931
1032
pruned_aggs = tuple (agg for agg in self .aggregations if agg [1 ] in used_cols )
@@ -963,6 +1064,10 @@ def relation_ops_created(self) -> int:
963
1064
# Assume that if not reprojecting, that there is a sequence of window operations sharing the same window
964
1065
return 0 if self .skip_reproject_unsafe else 4
965
1066
1067
+ @property
1068
+ def row_count (self ) -> Optional [int ]:
1069
+ return self .child .row_count
1070
+
966
1071
@functools .cached_property
967
1072
def added_field (self ) -> Field :
968
1073
input_type = self .child .get_type (self .column_name .id )
@@ -994,6 +1099,10 @@ def row_preserving(self) -> bool:
994
1099
def variables_introduced (self ) -> int :
995
1100
return 1
996
1101
1102
+ @property
1103
+ def row_count (self ) -> Optional [int ]:
1104
+ return None
1105
+
997
1106
998
1107
# TODO: Explode should create a new column instead of overriding the existing one
999
1108
@dataclass (frozen = True , eq = False )
@@ -1030,6 +1139,10 @@ def variables_introduced(self) -> int:
1030
1139
def defines_namespace (self ) -> bool :
1031
1140
return True
1032
1141
1142
+ @property
1143
+ def row_count (self ) -> Optional [int ]:
1144
+ return None
1145
+
1033
1146
def prune (self , used_cols : COLUMN_SET ) -> BigFrameNode :
1034
1147
# Cannot prune explode op
1035
1148
return self .transform_children (
0 commit comments