-<?php namespace BookStack\Entities\Tools;
+<?php
+namespace BookStack\Entities\Tools;
+
+use BookStack\Actions\Tag;
use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
+use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
+use DOMDocument;
+use DOMNode;
use Illuminate\Support\Collection;
class SearchIndex
{
/**
- * @var SearchTerm
+ * A list of delimiter characters used to break-up parsed content into terms for indexing.
+ * Within this class it is read by textToTermCountMap() and handed to strtok() as the
+ * set of single-character token separators.
+ *
+ * @var string
*/
- protected $searchTerm;
+ public static $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
/**
+ * Provider injected through the constructor; indexAllEntities() iterates the
+ * result of its all() method to obtain each entity model to re-index.
+ *
* @var EntityProvider
*/
protected $entityProvider;
-
- public function __construct(SearchTerm $searchTerm, EntityProvider $entityProvider)
+ public function __construct(EntityProvider $entityProvider)
{
- $this->searchTerm = $searchTerm;
$this->entityProvider = $entityProvider;
}
-
/**
* Index the given entity.
+ * deleteEntityTerms() is called for the entity first (presumably clearing its
+ * existing terms - that method is not visible here), then the term rows built by
+ * entityToTermDataArray() are written with a single insert.
*/
public function indexEntity(Entity $entity)
{
$this->deleteEntityTerms($entity);
- $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor);
- $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
- $terms = array_merge($nameTerms, $bodyTerms);
- foreach ($terms as $index => $term) {
- $terms[$index]['entity_type'] = $entity->getMorphClass();
- $terms[$index]['entity_id'] = $entity->id;
- }
- $this->searchTerm->newQuery()->insert($terms);
+ $terms = $this->entityToTermDataArray($entity);
+ SearchTerm::query()->insert($terms);
}
/**
- * Index multiple Entities at once
+ * Index multiple Entities at once.
+ * The term rows of every given entity are collected into one list, which is then
+ * inserted in batches of 500 rows. Unlike indexEntity(), this method does not
+ * call deleteEntityTerms() before inserting.
+ *
* @param Entity[] $entities
*/
- protected function indexEntities(array $entities)
+ public function indexEntities(array $entities)
{
$terms = [];
foreach ($entities as $entity) {
- $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor);
- $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
- foreach (array_merge($nameTerms, $bodyTerms) as $term) {
- $term['entity_id'] = $entity->id;
- $term['entity_type'] = $entity->getMorphClass();
- $terms[] = $term;
- }
+ $entityTerms = $this->entityToTermDataArray($entity);
+ array_push($terms, ...$entityTerms);
}
$chunkedTerms = array_chunk($terms, 500);
foreach ($chunkedTerms as $termChunk) {
- $this->searchTerm->newQuery()->insert($termChunk);
+ SearchTerm::query()->insert($termChunk);
}
}
/**
* Delete and re-index the terms for all entities in the system.
+ * Can take a callback which is used for reporting progress.
+ * Callback receives three arguments:
+ * - An instance of the model being processed
+ * - The number that have been processed so far.
+ * - The total number of that model to be processed.
+ *
+ * Soft-deleted entities are re-indexed too, so the total passed to the callback
+ * and the set of rows actually processed cover the same records.
+ *
+ * @param callable(Entity, int, int)|null $progressCallback
*/
- public function indexAllEntities()
+ public function indexAllEntities(?callable $progressCallback = null)
{
- $this->searchTerm->newQuery()->truncate();
+ SearchTerm::query()->truncate();
foreach ($this->entityProvider->all() as $entityModel) {
- $selectFields = ['id', 'name', $entityModel->textField];
+ $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
+ $selectFields = ['id', 'name', $indexContentField];
+ $total = $entityModel->newQuery()->withTrashed()->count();
+ $chunkSize = 250;
+ $processed = 0;
+
+ $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
+ $this->indexEntities($entities->all());
+ $processed = min($processed + $chunkSize, $total);
+
+ if (is_callable($progressCallback)) {
+ $progressCallback($entityModel, $processed, $total);
+ }
+ };
+
+ // withTrashed() is kept on this query so it matches the count above: every term
+ // was truncated at the start, so skipping trashed rows here would leave them
+ // without any terms and stop $processed from ever reaching $total.
$entityModel->newQuery()
->withTrashed()
->select($selectFields)
- ->chunk(1000, function (Collection $entities) {
- $this->indexEntities($entities->all());
- });
+ ->with(['tags:id,name,value,entity_id,entity_type'])
+ ->chunk($chunkSize, $chunkCallback);
}
}
}
/**
- * Create a scored term array from the given text.
+ * Build a term => score map for a piece of plain text.
+ * The occurrence count of each distinct term, as reported by textToTermCountMap(),
+ * is multiplied by the given score adjustment.
+ *
+ * @return array<string, int>
+ */
+ protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
+ {
+     $occurrences = $this->textToTermCountMap($text);
+
+     // array_map() keeps the keys when given a single array, so the terms stay as keys.
+     return array_map(function ($occurrenceCount) use ($scoreAdjustment) {
+         return $occurrenceCount * $scoreAdjustment;
+     }, $occurrences);
+ }
+
+ /**
+ * Create a scored term map from the given HTML, where the keys are the terms
+ * and the values are their scores.
+ *
+ * The HTML is wrapped in a body element and parsed; only the direct children of
+ * that element are visited. The trimmed text content of each child is split into
+ * terms and every occurrence is weighted by the child's tag name: h1 to h6 use the
+ * multipliers listed below, any other node counts 1 per occurrence.
+ *
+ * @return array<string, int|float>
+ */
+ protected function generateTermScoreMapFromHtml(string $html): array
+ {
+     if (empty($html)) {
+         return [];
+     }
+
+     $scoresByTerm = [];
+     $elementScoreAdjustmentMap = [
+         'h1' => 10,
+         'h2' => 5,
+         'h3' => 4,
+         'h4' => 3,
+         'h5' => 2,
+         'h6' => 1.5,
+     ];
+
+     $html = '<body>' . $html . '</body>';
+     $doc = new DOMDocument();
+
+     // Buffer libxml parse errors instead of emitting PHP warnings for malformed markup.
+     // The buffer is emptied and the previous mode restored afterwards, so parsing many
+     // documents in one process neither accumulates errors nor leaves the global libxml
+     // error mode changed for other code.
+     $previousErrorMode = libxml_use_internal_errors(true);
+     try {
+         // NOTE(review): the 'HTML-ENTITIES' encoding name is deprecated as of PHP 8.2.
+         $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
+     } finally {
+         libxml_clear_errors();
+         libxml_use_internal_errors($previousErrorMode);
+     }
+
+     // The wrapper added above is expected to be the first child of the document element;
+     // bail out with no terms if the parser produced no such node.
+     $root = $doc->documentElement;
+     $body = $root === null ? null : $root->childNodes->item(0);
+     if ($body === null) {
+         return $scoresByTerm;
+     }
+
+     /** @var DOMNode $child */
+     foreach ($body->childNodes as $child) {
+         $nodeName = $child->nodeName;
+         $termCounts = $this->textToTermCountMap(trim($child->textContent));
+         foreach ($termCounts as $term => $count) {
+             $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
+             $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
+         }
+     }
+
+     return $scoresByTerm;
+ }
+
+ /**
+ * Create a scored term map from the given set of entity tags.
+ * All tag names are joined into one text and scored with an adjustment of 3, all tag
+ * values are joined and scored with an adjustment of 5, and the two maps are then
+ * merged so a term found in both has its scores added together.
+ *
+ * @param Tag[] $tags
+ *
+ * @return array<string, int>
*/
- protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
+ protected function generateTermScoreMapFromTags(array $tags): array
+ {
+     $names = [];
+     $values = [];
+
+     foreach ($tags as $tag) {
+         $names[] = $tag->name;
+         $values[] = $tag->value;
+     }
+
+     $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
+     $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);
+
+     return $this->mergeTermScoreMaps($nameMap, $valueMap);
+ }
+
+ /**
+ * For the given text, return an array where the keys are the unique term words
+ * and the values are the frequency of that term.
+ * The text is split with strtok() using the characters in static::$delimiters.
+ *
+ * NOTE(review): as shown here the loop only advances to the next token and never
+ * writes to $tokenMap, so an empty array would be returned - the counting lines
+ * look to be missing from this view; confirm against the full file.
+ *
+ * @return array<string, int>
+ */
+ protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
- $splitChars = " \n\t.,!?:;()[]{}<>`'\"";
+ $splitChars = static::$delimiters;
$token = strtok($text, $splitChars);
while ($token !== false) {
$token = strtok($splitChars);
}
- $terms = [];
- foreach ($tokenMap as $token => $count) {
- $terms[] = [
- 'term' => $token,
- 'score' => $count * $scoreAdjustment
+ return $tokenMap;
+ }
+
+ /**
+ * For the given entity, Generate an array of term data details.
+ * Is the raw term data, not instances of SearchTerm models.
+ *
+ * Three term => score maps are built and merged by summing scores per term:
+ * - name terms, scored with an adjustment of 40 x the entity's searchFactor;
+ * - tag terms, via generateTermScoreMapFromTags();
+ * - body terms: a Page is scored from its html, any other entity from its
+ *   description (empty string when null) with the searchFactor as adjustment.
+ * Each merged term becomes one row carrying the entity's id and morph class.
+ *
+ * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
+ */
+ protected function entityToTermDataArray(Entity $entity): array
+ {
+ $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
+ $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());
+
+ if ($entity instanceof Page) {
+ $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
+ } else {
+ $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor);
+ }
+
+ $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);
+
+ $dataArray = [];
+ $entityId = $entity->id;
+ $entityType = $entity->getMorphClass();
+ foreach ($mergedScoreMap as $term => $score) {
+ $dataArray[] = [
+ 'term' => $term,
+ 'score' => $score,
+ 'entity_type' => $entityType,
+ 'entity_id' => $entityId,
];
}
- return $terms;
+ return $dataArray;
+ }
+
+ /**
+ * Combine any number of term => score maps into a single map.
+ * A term that appears in more than one map ends up with the sum of its scores;
+ * terms keep the order in which they were first encountered.
+ *
+ * @param array<string, int>[] ...$scoreMaps
+ *
+ * @return array<string, int>
+ */
+ protected function mergeTermScoreMaps(...$scoreMaps): array
+ {
+     $combined = [];
+
+     foreach ($scoreMaps as $map) {
+         foreach ($map as $word => $points) {
+             // Start a running total the first time a term is seen.
+             if (!isset($combined[$word])) {
+                 $combined[$word] = 0;
+             }
+
+             $combined[$word] += $points;
+         }
+     }
+
+     return $combined;
}
}