Added page content parsing to up-rank header text in search

This adds parsing of page content so that headers apply a boost to
scores in the search term index.
Additionally, this merges title and content terms to reduce the amount
of stored terms a little.
Includes testing to cover.
This commit is contained in:
Dan Brown
2021-11-12 13:47:23 +00:00
parent 820be162f5
commit f28daa01d9
8 changed files with 158 additions and 38 deletions

View File

@ -7,6 +7,7 @@ use BookStack\Entities\Models\Book;
use BookStack\Entities\Models\Bookshelf;
use BookStack\Entities\Models\Chapter;
use BookStack\Entities\Models\Page;
use BookStack\Entities\Models\SearchTerm;
use Tests\TestCase;
class EntitySearchTest extends TestCase
@ -320,4 +321,43 @@ class EntitySearchTest extends TestCase
$search->assertElementContains('.entity-list > .page', 'Test page B', 1);
$search->assertElementContains('.entity-list > .page', 'Test page A', 2);
}
public function test_terms_in_headers_have_an_adjusted_index_score()
{
$page = $this->newPage(['name' => 'Test page A', 'html' => '
<p>TermA</p>
<h1>TermB <strong>TermNested</strong></h1>
<h2>TermC</h2>
<h3>TermD</h3>
<h4>TermE</h4>
<h5>TermF</h5>
<h6>TermG</h6>
']);
$entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'];
$scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term');
$this->assertEquals(1, $scoreByTerm->get('TermA'));
$this->assertEquals(10, $scoreByTerm->get('TermB'));
$this->assertEquals(10, $scoreByTerm->get('TermNested'));
$this->assertEquals(5, $scoreByTerm->get('TermC'));
$this->assertEquals(4, $scoreByTerm->get('TermD'));
$this->assertEquals(3, $scoreByTerm->get('TermE'));
$this->assertEquals(2, $scoreByTerm->get('TermF'));
// Is 1.5 but stored as integer, rounding up
$this->assertEquals(2, $scoreByTerm->get('TermG'));
}
public function test_name_and_content_terms_are_merged_to_single_score()
{
$page = $this->newPage(['name' => 'TermA', 'html' => '
<p>TermA</p>
']);
$entityRelationCols = ['entity_id' => $page->id, 'entity_type' => 'BookStack\\Page'];
$scoreByTerm = SearchTerm::query()->where($entityRelationCols)->pluck('score', 'term');
// Scores 40 for being in the name then 1 for being in the content
$this->assertEquals(41, $scoreByTerm->get('TermA'));
}
}