Skip to content

Commit a70a607

Browse files
authored
fix: change the default value for pdf extract/chunk (#1517)
1 parent dcfbb63 commit a70a607

File tree

1 file changed

+14
-14
lines changed

1 file changed

+14
-14
lines changed

bigframes/operations/blob.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -560,9 +560,9 @@ def pdf_extract(
560560
self,
561561
*,
562562
connection: Optional[str] = None,
563-
max_batching_rows: int = 8192,
564-
container_cpu: Union[float, int] = 0.33,
565-
container_memory: str = "512Mi",
563+
max_batching_rows: int = 1,
564+
container_cpu: Union[float, int] = 2,
565+
container_memory: str = "1Gi",
566566
) -> bigframes.series.Series:
567567
"""Extracts text from PDF URLs and saves the text as string.
568568
@@ -574,10 +574,10 @@ def pdf_extract(
574574
connection (str or None, default None): BQ connection used for
575575
function internet transactions, and the output blob if "dst"
576576
is str. If None, uses default connection of the session.
577-
max_batching_rows (int, default 8,192): Max number of rows per batch
577+
max_batching_rows (int, default 1): Max number of rows per batch
578578
sent to Cloud Run to execute the function.
579-
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers.
580-
container_memory (str, default "512Mi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
579+
container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
580+
container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
581581
582582
Returns:
583583
bigframes.series.Series: contains all text from a PDF file
@@ -604,11 +604,11 @@ def pdf_chunk(
604604
self,
605605
*,
606606
connection: Optional[str] = None,
607-
chunk_size: int = 1000,
607+
chunk_size: int = 2000,
608608
overlap_size: int = 200,
609-
max_batching_rows: int = 8192,
610-
container_cpu: Union[float, int] = 0.33,
611-
container_memory: str = "512Mi",
609+
max_batching_rows: int = 1,
610+
container_cpu: Union[float, int] = 2,
611+
container_memory: str = "1Gi",
612612
) -> bigframes.series.Series:
613613
"""Extracts and chunks text from PDF URLs and saves the text as
614614
arrays of strings.
@@ -620,15 +620,15 @@ def pdf_chunk(
620620
connection (str or None, default None): BQ connection used for
621621
function internet transactions, and the output blob if "dst"
622622
is str. If None, uses default connection of the session.
623-
chunk_size (int, default 1000): the desired size of each text chunk
623+
chunk_size (int, default 2000): the desired size of each text chunk
624624
(number of characters).
625625
overlap_size (int, default 200): the number of overlapping characters
626626
between consecutive chunks. This helps to ensure context is
627627
preserved across chunk boundaries.
628-
max_batching_rows (int, default 8,192): Max number of rows per batch
628+
max_batching_rows (int, default 1): Max number of rows per batch
629629
sent to Cloud Run to execute the function.
630-
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to intergers.
631-
container_memory (str, default "512Mi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
630+
container_cpu (int or float, default 2): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
631+
container_memory (str, default "1Gi"): container memory size. String of the format <number><unit>. Possible values are from 512Mi to 32Gi.
632632
633633
Returns:
634634
bigframes.series.Series: Series of array[str], where each string is a

0 commit comments

Comments
 (0)