use BookStack\Entities\EntityProvider;
use BookStack\Entities\Models\Entity;
+use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
+use DOMDocument;
+use DOMNode;
use Illuminate\Support\Collection;
class SearchIndex
SearchTerm::query()->truncate();
foreach ($this->entityProvider->all() as $entityModel) {
- $selectFields = ['id', 'name', $entityModel->textField];
+ $indexContentField = $entityModel instanceof Page ? 'html' : 'description';
+ $selectFields = ['id', 'name', $indexContentField];
$total = $entityModel->newQuery()->withTrashed()->count();
$chunkSize = 250;
$processed = 0;
}
/**
- * Create a scored term array from the given text.
+ * Create a scored term array from the given text, where the keys are the terms
+ * and the values are their scores.
*
- * @returns array{term: string, score: float}
+ * @return array<string, int>
*/
- protected function generateTermArrayFromText(string $text, int $scoreAdjustment = 1): array
+ protected function generateTermScoreMapFromText(string $text, int $scoreAdjustment = 1): array
+ {
+     // Scale every term's occurrence count by the given adjustment.
+     // array_map() keeps the term keys intact when it is handed a single array.
+     return array_map(function ($occurrences) use ($scoreAdjustment) {
+         return $occurrences * $scoreAdjustment;
+     }, $this->textToTermCountMap($text));
+ }
+
+ /**
+  * Create a scored term map from the given HTML, where the keys are the terms
+  * and the values are their scores.
+  *
+  * Only the direct children of the wrapped HTML are walked. Each child's full text
+  * content (nested elements included) is scored using a multiplier chosen from that
+  * child's tag name: h1-h6 use the weights listed below, anything else counts as 1.
+  *
+  * @param string $html HTML content to build term scores from.
+  *
+  * @return array<string, int|float> Scores may be fractional since h6 is weighted 1.5.
+  */
+ protected function generateTermScoreMapFromHtml(string $html): array
+ {
+     // Strict comparison so that content such as "0" is still indexed.
+     if ($html === '') {
+         return [];
+     }
+
+     $scoresByTerm = [];
+     $elementScoreAdjustmentMap = [
+         'h1' => 10,
+         'h2' => 5,
+         'h3' => 4,
+         'h4' => 3,
+         'h5' => 2,
+         'h6' => 1.5,
+     ];
+
+     // Wrap in a body tag so the fragment's top-level nodes share one known parent.
+     $html = '<body>' . $html . '</body>';
+     libxml_use_internal_errors(true);
+     $doc = new DOMDocument();
+     $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
+     // With internal errors enabled libxml buffers every parse warning; drop them
+     // so they do not pile up while many pages are indexed within one process.
+     libxml_clear_errors();
+
+     // Guard against a document lacking the expected html > body structure.
+     $root = $doc->documentElement;
+     $body = $root !== null ? $root->childNodes->item(0) : null;
+     if ($body === null) {
+         return [];
+     }
+
+     /** @var DOMNode $child */
+     foreach ($body->childNodes as $child) {
+         // The multiplier only depends on the top-level node, so look it up once.
+         $multiplier = $elementScoreAdjustmentMap[$child->nodeName] ?? 1;
+         $termCounts = $this->textToTermCountMap(trim($child->textContent));
+         foreach ($termCounts as $term => $count) {
+             $scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + ($count * $multiplier);
+         }
+     }
+
+     return $scoresByTerm;
+ }
+
+ /**
+ * For the given text, return an array where the keys are the unique term words
+ * and the values are the frequency of that term.
+ *
+ * @param string $text Text to break into terms.
+ *
+ * @return array<string, int>
+ */
+ protected function textToTermCountMap(string $text): array
{
$tokenMap = []; // {TextToken => OccurrenceCount}
$splitChars = " \n\t.,!?:;()[]{}<>`'\""; // Whitespace and punctuation characters used as token delimiters
$token = strtok($splitChars); // Single-argument strtok() fetches the next token of the string given to an earlier strtok() call
}
- $terms = [];
- foreach ($tokenMap as $token => $count) {
- $terms[] = [
- 'term' => $token,
- 'score' => $count * $scoreAdjustment,
- ];
- }
-
- return $terms;
+ return $tokenMap;
}
/**
* For the given entity, Generate an array of term data details.
* Is the raw term data, not instances of SearchTerm models.
+ * Terms found in both the name and the body content are merged into a single
+ * row whose score is the sum of both.
*
- * @returns array{term: string, score: float}[]
+ * @return array{term: string, score: float, entity_id: int, entity_type: string}[]
*/
protected function entityToTermDataArray(Entity $entity): array
{
- $nameTerms = $this->generateTermArrayFromText($entity->name, 40 * $entity->searchFactor);
- $bodyTerms = $this->generateTermArrayFromText($entity->getText(), 1 * $entity->searchFactor);
- $termData = array_merge($nameTerms, $bodyTerms);
+ $nameTermsMap = $this->generateTermScoreMapFromText($entity->name, 40 * $entity->searchFactor); // Name terms are weighted 40x, scaled by the entity's search factor
+
+ if ($entity instanceof Page) {
+ $bodyTermsMap = $this->generateTermScoreMapFromHtml($entity->html); // NOTE(review): unlike the description path below, searchFactor is not applied here - confirm this is intended
+ } else {
+ $bodyTermsMap = $this->generateTermScoreMapFromText($entity->description, $entity->searchFactor);
+ }
- foreach ($termData as $index => $term) {
- $termData[$index]['entity_type'] = $entity->getMorphClass();
- $termData[$index]['entity_id'] = $entity->id;
+ $mergedScoreMap = $this->mergeTermScoreMaps($nameTermsMap, $bodyTermsMap); // One combined score per unique term
+
+ $dataArray = [];
+ $entityId = $entity->id;
+ $entityType = $entity->getMorphClass();
+ foreach ($mergedScoreMap as $term => $score) { // Expand the term => score map into one data row per term
+ $dataArray[] = [
+ 'term' => $term,
+ 'score' => $score,
+ 'entity_type' => $entityType,
+ 'entity_id' => $entityId,
+ ];
+ }
+
+ return $dataArray;
+ }
+
+
+ /**
+ * For the given term score maps, merge their contents by term,
+ * summing the scores of any term that appears in more than one map.
+ *
+ * @param array<string, int> ...$scoreMaps
+ *
+ * @return array<string, int>
+ */
+ protected function mergeTermScoreMaps(...$scoreMaps): array
+ {
+ $mergedMap = [];
+
+ foreach ($scoreMaps as $scoreMap) {
+ foreach ($scoreMap as $term => $score) {
+ $mergedMap[$term] = ($mergedMap[$term] ?? 0) + $score; // Start from 0 the first time a term is seen
+ }
}
- return $termData;
+ return $mergedMap;
}
}
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use BookStack\Entities\Models\Page;
+use BookStack\Entities\Models\SearchTerm;
use Tests\TestCase;
class EntitySearchTest extends TestCase
$search->assertElementContains('.entity-list > .page', 'Test page B', 1);
$search->assertElementContains('.entity-list > .page', 'Test page A', 2);
}
+
+ public function test_terms_in_headers_have_an_adjusted_index_score()
+ {
+     // Create a page holding one plain paragraph plus a heading of every level.
+     $headingPage = $this->newPage(['name' => 'Test page A', 'html' => '
+ <p>TermA</p>
+ <h1>TermB <strong>TermNested</strong></h1>
+ <h2>TermC</h2>
+ <h3>TermD</h3>
+ <h4>TermE</h4>
+ <h5>TermF</h5>
+ <h6>TermG</h6>
+ ']);
+
+     $indexedScores = SearchTerm::query()
+         ->where(['entity_id' => $headingPage->id, 'entity_type' => 'BookStack\\Page'])
+         ->pluck('score', 'term');
+
+     // Expected stored score for each term, checked in document order.
+     $expectedScores = [
+         'TermA'      => 1,
+         'TermB'      => 10,
+         // Nested elements take on the weighting of their top-level heading.
+         'TermNested' => 10,
+         'TermC'      => 5,
+         'TermD'      => 4,
+         'TermE'      => 3,
+         'TermF'      => 2,
+         // Weighted 1.5 but stored as an integer, so it rounds up to 2.
+         'TermG'      => 2,
+     ];
+
+     foreach ($expectedScores as $term => $expectedScore) {
+         $this->assertEquals($expectedScore, $indexedScores->get($term));
+     }
+ }
+
+ public function test_name_and_content_terms_are_merged_to_single_score()
+ {
+     // The same term is used for both the page name and its content.
+     $sharedTermPage = $this->newPage(['name' => 'TermA', 'html' => '
+ <p>TermA</p>
+ ']);
+
+     $indexedScores = SearchTerm::query()
+         ->where(['entity_id' => $sharedTermPage->id, 'entity_type' => 'BookStack\\Page'])
+         ->pluck('score', 'term');
+
+     // 40 for appearing in the name plus 1 for appearing in the content.
+     $this->assertEquals(41, $indexedScores->get('TermA'));
+ }
}