namespace BookStack\Search;
-use BookStack\Actions\Tag;
+use BookStack\Activity\Models\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
use BookStack\Entities\Models\Page;
-use DOMDocument;
+use BookStack\Util\HtmlDocument;
use DOMNode;
use Illuminate\Database\Eloquent\Builder;
use Illuminate\Support\Collection;
{
/**
* A list of delimiter characters used to break-up parsed content into terms for indexing.
- *
- * @var string
*/
- public static $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+ public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
/**
- * @var EntityProvider
+ * A list of delimiter which could be commonly used within a single term and also indicate a break between terms.
+ * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
*/
- protected $entityProvider;
+ public static string $softDelimiters = ".-";
- public function __construct(EntityProvider $entityProvider)
- {
- $this->entityProvider = $entityProvider;
+ public function __construct(
+ protected EntityProvider $entityProvider
+ ) {
}
/**
* Index the given entity.
*/
- public function indexEntity(Entity $entity)
+ public function indexEntity(Entity $entity): void
{
$this->deleteEntityTerms($entity);
$terms = $this->entityToTermDataArray($entity);
- SearchTerm::query()->insert($terms);
+ $this->insertTerms($terms);
}
/**
*
* @param Entity[] $entities
*/
- public function indexEntities(array $entities)
+ public function indexEntities(array $entities): void
{
$terms = [];
foreach ($entities as $entity) {
array_push($terms, ...$entityTerms);
}
- $chunkedTerms = array_chunk($terms, 500);
- foreach ($chunkedTerms as $termChunk) {
- SearchTerm::query()->insert($termChunk);
- }
+ $this->insertTerms($terms);
}
/**
*
* @param callable(Entity, int, int):void|null $progressCallback
*/
- public function indexAllEntities(?callable $progressCallback = null)
+ public function indexAllEntities(?callable $progressCallback = null): void
{
SearchTerm::query()->truncate();
/**
* Delete related Entity search terms.
*/
- public function deleteEntityTerms(Entity $entity)
+ public function deleteEntityTerms(Entity $entity): void
{
$entity->searchTerms()->delete();
}
+ /**
+ * Insert the given terms into the database.
+ * Chunks through the given terms to remain within database limits.
+ * @param array[] $terms
+ */
+ protected function insertTerms(array $terms): void
+ {
+ $chunkedTerms = array_chunk($terms, 500);
+ foreach ($chunkedTerms as $termChunk) {
+ SearchTerm::query()->insert($termChunk);
+ }
+ }
+
/**
* Create a scored term array from the given text, where the keys are the terms
* and the values are their scores.
*
* @returns array<string, int>
*/
- protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
+ protected function generateTermScoreMapFromText(string $text, float $scoreAdjustment = 1): array
{
$termMap = $this->textToTermCountMap($text);
foreach ($termMap as $term => $count) {
- $termMap[$term] = $count * $scoreAdjustment;
+ $termMap[$term] = floor($count * $scoreAdjustment);
}
return $termMap;
'h6' => 1.5,
];
- $html = '<body>' . $html . '</body>';
$html = str_ireplace(['<br>', '<br />', '<br/>'], "\n", $html);
+ $doc = new HtmlDocument($html);
- libxml_use_internal_errors(true);
- $doc = new DOMDocument();
- $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
-
- $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
/** @var DOMNode $child */
- foreach ($topElems as $child) {
+ foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName;
$termCounts = $this->textToTermCountMap(trim($child->textContent));
foreach ($termCounts as $term => $count) {
*/
protected function generateTermScoreMapFromTags(array $tags): array
{
- $scoreMap = [];
$names = [];
$values = [];
protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
- $splitChars = static::$delimiters;
- $token = strtok($text, $splitChars);
+ $softDelims = static::$softDelimiters;
+ $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+ $extendedToken = '';
+ $extendedLen = 0;
+
+ $token = $tokenizer->next();
while ($token !== false) {
- if (!isset($tokenMap[$token])) {
- $tokenMap[$token] = 0;
+ $delim = $tokenizer->previousDelimiter();
+
+ if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+ $extendedToken .= $delim . $token;
+ $extendedLen++;
+ } else {
+ if ($extendedLen > 1) {
+ $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+ }
+ $extendedToken = $token;
+ $extendedLen = 1;
}
- $tokenMap[$token]++;
- $token = strtok($splitChars);
+
+ if ($token) {
+ $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+ }
+
+ $token = $tokenizer->next();
+ }
+
+ if ($extendedLen > 1) {
+ $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
}
return $tokenMap;