]> BookStack Code Mirror - bookstack/blobdiff - app/Entities/Tools/SearchIndex.php
Fixed occurrences of altered titles in search results
[bookstack] / app / Entities / Tools / SearchIndex.php
index 81a5022ce33c4f5ccc87ac6902305bcac0c65fac..d748c1695d46369c314eb6f6f121b0bc9dce3649 100644 (file)
@@ -1,84 +1,98 @@
-<?php namespace BookStack\Entities\Tools;
+<?php
 
+namespace BookStack\Entities\Tools;
+
+use BookStack\Actions\Tag;
 use BookStack\Entities\EntityProvider;
 use BookStack\Entities\Models\Entity;
+use BookStack\Entities\Models\Page;
 use BookStack\Entities\Models\SearchTerm;
+use DOMDocument;
+use DOMNode;
 use Illuminate\Support\Collection;
 
 class SearchIndex
 {
     /**
-     * @var SearchTerm
+     * A list of delimiter characters used to break-up parsed content into terms for indexing.
+     *
+     * @var string
      */
-    protected $searchTerm;
+    public static $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
 
     /**
      * @var EntityProvider
      */
     protected $entityProvider;
 
-
-    public function __construct(SearchTerm $searchTerm, EntityProvider $entityProvider)
+    public function __construct(EntityProvider $entityProvider)
     {
-        $this->searchTerm = $searchTerm;
         $this->entityProvider = $entityProvider;
     }
 
-
     /**
      * Index the given entity.
      */
     public function indexEntity(Entity $entity)
     {
         $this->deleteEntityTerms($entity);
-        $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor);
-        $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
-        $terms = array_merge($nameTerms, $bodyTerms);
-        foreach ($terms as $index => $term) {
-            $terms[$index]['entity_type'] = $entity->getMorphClass();
-            $terms[$index]['entity_id'] = $entity->id;
-        }
-        $this->searchTerm->newQuery()->insert($terms);
+        $terms = $this->entityToTermDataArray($entity);
+        SearchTerm::query()->insert($terms);
     }
 
     /**
-     * Index multiple Entities at once
+     * Index multiple Entities at once.
+     *
      * @param Entity[] $entities
      */
-    protected function indexEntities(array $entities)
+    public function indexEntities(array $entities)
     {
         $terms = [];
         foreach ($entities as $entity) {
-            $nameTerms = $this->generateTermArrayFromText($entity->name, 5 * $entity->searchFactor);
-            $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
-            foreach (array_merge($nameTerms, $bodyTerms) as $term) {
-                $term['entity_id'] = $entity->id;
-                $term['entity_type'] = $entity->getMorphClass();
-                $terms[] = $term;
-            }
+            $entityTerms = $this->entityToTermDataArray($entity);
+            array_push($terms, ...$entityTerms);
         }
 
         $chunkedTerms = array_chunk($terms, 500);
         foreach ($chunkedTerms as $termChunk) {
-            $this->searchTerm->newQuery()->insert($termChunk);
+            SearchTerm::query()->insert($termChunk);
         }
     }
 
     /**
      * Delete and re-index the terms for all entities in the system.
+     * Can take a callback which is used for reporting progress.
+     * Callback receives three arguments:
+     * - An instance of the model being processed
+     * - The number that have been processed so far.
+     * - The total number of that model to be processed.
+     *
+     * @param callable(Entity, int, int)|null $progressCallback
      */
-    public function indexAllEntities()
+    public function indexAllEntities(?callable $progressCallback = null)
     {
-        $this->searchTerm->newQuery()->truncate();
+        SearchTerm::query()->truncate();
 
         foreach ($this->entityProvider->all() as $entityModel) {
-            $selectFields = ['id', 'name', $entityModel->textField];
+            $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
+            $selectFields = ['id', 'name', $indexContentField];
+            $total = $entityModel->newQuery()->withTrashed()->count();
+            $chunkSize = 250;
+            $processed = 0;
+
+            $chunkCallback = function (Collection $entities) use ($progressCallback, &$processed, $total, $chunkSize, $entityModel) {
+                $this->indexEntities($entities->all());
+                $processed = min($processed + $chunkSize, $total);
+
+                if (is_callable($progressCallback)) {
+                    $progressCallback($entityModel, $processed, $total);
+                }
+            };
+
             $entityModel->newQuery()
-                ->withTrashed()
                 ->select($selectFields)
-                ->chunk(1000, function (Collection $entities) {
-                    $this->indexEntities($entities->all());
-                });
+                ->with(['tags:id,name,value,entity_id,entity_type'])
+                ->chunk($chunkSize, $chunkCallback);
         }
     }
 
@@ -91,12 +105,97 @@ class SearchIndex
     }
 
     /**
-     * Create a scored term array from the given text.
+     * Create a scored term array from the given text, where the keys are the terms
+     * and the values are their scores.
+     *
+     * @return array<string, int>
+     */
+    protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
+    {
+        $termMap = $this->textToTermCountMap($text);
+
+        foreach ($termMap as $term => $count) {
+            $termMap[$term] = $count * $scoreAdjustment;
+        }
+
+        return $termMap;
+    }
+
+    /**
+     * Create a scored term array from the given HTML, where the keys are the terms
+     * and the values are their scores.
+     *
+     * @return array<string, int>
+     */
+    protected function generateTermScoreMapFromHtml(string $html): array
+    {
+        if (empty($html)) {
+            return [];
+        }
+
+        $scoresByTerm = [];
+        $elementScoreAdjustmentMap = [
+            'h1' => 10,
+            'h2' => 5,
+            'h3' => 4,
+            'h4' => 3,
+            'h5' => 2,
+            'h6' => 1.5,
+        ];
+
+        $html = '<body>' . $html . '</body>';
+        libxml_use_internal_errors(true);
+        $doc = new DOMDocument();
+        $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
+
+        $topElems = $doc->documentElement->childNodes->item(0)->childNodes;
+        /** @var DOMNode $child */
+        foreach ($topElems as $child) {
+            $nodeName = $child->nodeName;
+            $termCounts = $this->textToTermCountMap(trim($child->textContent));
+            foreach ($termCounts as $term => $count) {
+                $scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
+                $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;
+            }
+        }
+
+        return $scoresByTerm;
+    }
+
+    /**
+     * Create a scored term map from the given set of entity tags.
+     *
+     * @param Tag[] $tags
+     *
+     * @return array<string, int>
      */
-    protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
+    protected function generateTermScoreMapFromTags(array $tags): array
+    {
+        $scoreMap = [];
+        $names = [];
+        $values = [];
+
+        foreach ($tags as $tag) {
+            $names[] = $tag->name;
+            $values[] = $tag->value;
+        }
+
+        $nameMap = $this->generateTermScoreMapFromText(implode(' ', $names), 3);
+        $valueMap = $this->generateTermScoreMapFromText(implode(' ', $values), 5);
+
+        return $this->mergeTermScoreMaps($nameMap, $valueMap);
+    }
+
+    /**
+     * For the given text, return an array where the keys are the unique term words
+     * and the values are the frequency of that term.
+     *
+     * @return array<string, int>
+     */
+    protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = " \n\t.,!?:;()[]{}<>`'\"";
+        $splitChars = static::$delimiters;
         $token = strtok($text, $splitChars);
 
         while ($token !== false) {
@@ -107,14 +206,61 @@ class SearchIndex
             $token = strtok($splitChars);
         }
 
-        $terms = [];
-        foreach ($tokenMap as $token => $count) {
-            $terms[] = [
-                'term' => $token,
-                'score' => $count * $scoreAdjustment
+        return $tokenMap;
+    }
+
+    /**
+     * For the given entity, generate an array of term data details.
+     * The result is raw term data, not instances of SearchTerm models.
+     *
+     * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
+     */
+    protected function entityToTermDataArray(Entity $entity): array
+    {
+        $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor);
+        $tagTermsMap = $this->generateTermScoreMapFromTags($entity->tags->all());
+
+        if ($entity instanceof Page) {
+            $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html);
+        } else {
+            $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description ?? '', $entity->searchFactor);
+        }
+
+        $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap, $tagTermsMap);
+
+        $dataArray = [];
+        $entityId = $entity->id;
+        $entityType = $entity->getMorphClass();
+        foreach ($mergedScoreMap as $term => $score) {
+            $dataArray[] = [
+                'term'        => $term,
+                'score'       => $score,
+                'entity_type' => $entityType,
+                'entity_id'   => $entityId,
             ];
         }
 
-        return $terms;
+        return $dataArray;
+    }
+
+    /**
+     * For the given term score maps, merge their contents by term,
+     * summing the scores of any terms that appear in more than one map.
+     *
+     * @param array<string, int>[] ...$scoreMaps
+     *
+     * @return array<string, int>
+     */
+    protected function mergeTermScoreMaps(...$scoreMaps): array
+    {
+        $mergedMap = [];
+
+        foreach ($scoreMaps as $scoreMap) {
+            foreach ($scoreMap as $term => $score) {
+                $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score;
+            }
+        }
+
+        return $mergedMap;
     }
 }