3 namespace BookStack\Search\Vectors;
5 use InvalidArgumentException;
8 * Splits a given string into smaller chunks based on specified delimiters
9 * and a predefined maximum chunk size. This will work through the given delimiters
10 * to break down text further and further to fit into the chunk size.
12 * The last delimiter is always an empty string to ensure text can always be broken down.
/**
 * Create a chunker for a maximum chunk size and an ordered set of delimiters.
 *
 * The delimiter order is re-indexed into a zero-based list so that the
 * positional lookups performed by this class ([0], [1] and the last entry)
 * stay valid even when the caller passes an array with gaps or string keys,
 * such as the result of array_filter(). An empty-string delimiter is then
 * appended unless it is already the final entry.
 *
 * @param int   $chunkSize      Maximum chunk size; must be at least 1.
 * @param array $delimiterOrder Delimiters, in the order they should be tried.
 *
 * @throws InvalidArgumentException If the chunk size is less than 1.
 */
public function __construct(
    protected int $chunkSize,
    protected array $delimiterOrder,
) {
    // Fail fast on an unusable size before touching any other state.
    if ($this->chunkSize < 1) {
        throw new InvalidArgumentException('Chunk size must be greater than 0');
    }

    // Re-index so positional access never hits a missing key.
    $this->delimiterOrder = array_values($this->delimiterOrder);

    // array_key_last() returns null for an empty list, which also needs the fallback.
    $lastKey = array_key_last($this->delimiterOrder);
    if ($lastKey === null || $this->delimiterOrder[$lastKey] !== '') {
        $this->delimiterOrder[] = '';
    }
}
29 public function chunk(string $text): array
31 $delimiter = $this->delimiterOrder[0];
32 $delimiterLength = strlen($delimiter);
33 $lines = ($delimiter === '') ? str_split($text, $this->chunkSize) : explode($delimiter, $text);
35 $cChunk = ''; // Current chunk
36 $cLength = 0; // Current chunk length
37 $chunks = []; // Chunks to return
38 $lDelim = ''; // Last delimiter
40 foreach ($lines as $index => $line) {
41 $lineLength = strlen($line);
42 if ($cLength + $lineLength + $delimiterLength <= $this->chunkSize) {
43 $cChunk .= $line . $delimiter;
44 $cLength += $lineLength + $delimiterLength;
46 } else if ($lineLength <= $this->chunkSize) {
47 $chunks[] = trim($cChunk, $delimiter);
48 $cChunk = $line . $delimiter;
49 $cLength = $lineLength + $delimiterLength;
52 $subChunks = new static($this->chunkSize, array_slice($this->delimiterOrder, 1));
53 $subDelimiter = $this->delimiterOrder[1] ?? '';
54 $subDelimiterLength = strlen($subDelimiter);
55 foreach ($subChunks->chunk($line) as $subChunk) {
56 $chunkLength = strlen($subChunk);
57 if ($cLength + $chunkLength + $subDelimiterLength <= $this->chunkSize) {
58 $cChunk .= $subChunk . $subDelimiter;
59 $cLength += $chunkLength + $subDelimiterLength;
60 $lDelim = $subDelimiter;
62 $chunks[] = trim($cChunk, $lDelim);
63 $cChunk = $subChunk . $subDelimiter;
64 $cLength = $chunkLength + $subDelimiterLength;
65 $lDelim = $subDelimiter;
72 $chunks[] = trim($cChunk, $lDelim);