Search: Updated indexer to handle non-breaking spaces

Related to #5640
This commit is contained in:
Dan Brown
2025-06-17 13:59:28 +01:00
parent 0208f066c5
commit f518a3be37
2 changed files with 13 additions and 1 deletions

View File

@ -160,7 +160,9 @@ class SearchIndex
/** @var DOMNode $child */
foreach ($doc->getBodyChildren() as $child) {
$nodeName = $child->nodeName;
$termCounts = $this->textToTermCountMap(trim($child->textContent));
$text = trim($child->textContent);
$text = str_replace("\u{00A0}", ' ', $text);
$termCounts = $this->textToTermCountMap($text);
foreach ($termCounts as $term => $count) {
$scoreChange = $count * ($elementScoreAdjustmentMap[$nodeName] ?? 1);
$scoresByTerm[$term] = ($scoresByTerm[$term] ?? 0) + $scoreChange;

View File

@ -106,4 +106,14 @@ class SearchIndexingTest extends TestCase
$this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed");
}
}
/**
 * Words separated only by non-breaking spaces (&nbsp;) should each be found
 * as their own term in the page's search terms, just as if they had been
 * separated by regular spaces.
 */
public function test_non_breaking_spaces_handled_as_spaces()
{
    $page = $this->entities->newPage(['html' => '<p>a&nbsp;tigerbadger is a dangerous&nbsp;animal</p>']);
    $scoreByTerm = $page->searchTerms()->pluck('score', 'term');

    // Each of these words sits directly next to a &nbsp; in the HTML above.
    // The message names the term so a failure identifies which word is missing.
    foreach (['tigerbadger', 'dangerous', 'animal'] as $term) {
        $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed");
    }
}
}