/**
* A list of delimiter characters used to break up parsed content into terms for indexing.
*/
- public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"";
+ public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
/**
* A list of delimiters which could be commonly used within a single term and also indicate a break between terms.
/** @var DOMNode $child */
foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName;
- $termCounts = $this->textToTermCountMap(trim($child->textContent));
+ $text = trim($child->textContent);
+ $text = str_replace("\u{00A0}", ' ', $text);
+ $termCounts = $this->textToTermCountMap($text);
foreach ($termCounts as $term => $count) {
$scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
$scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;