FIX: Ensure that aggregating search shows the post with the higest rank.

Previously, we would only take either the `MIN` or `MAX` for
`post_number` during aggregation meaning that the ranking is not
considered.

```
require 'benchmark/ips'

Benchmark.ips do |x|
  x.config(time: 10, warmup: 2)

  x.report("current aggregate search query") do
    DB.exec <<~SQL
    SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT topics.id, min(posts.post_number) post_number FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN (
      SELECT categories.id WHERE categories.search_priority = 1
    )
    ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted)) GROUP BY topics.id ORDER BY MAX((
      TS_RANK_CD(
        post_search_data.search_data,
        TO_TSQUERY('english', '''postgres'':*ABCD'),
        1|32
      ) *
      (
        CASE categories.search_priority
        WHEN 2
        THEN 0.6
        WHEN 3
        THEN 0.8
        WHEN 4
        THEN 1.2
        WHEN 5
        THEN 1.4
        ELSE
          CASE WHEN topics.closed
          THEN 0.9
          ELSE 1
          END
        END
      )
    )
    ) DESC, topics.bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number;
    SQL
  end

  x.report("current aggregate search query with proper ranking") do
    DB.exec <<~SQL
    SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id" FROM "posts" JOIN (SELECT *, row_number() over() row_number FROM (SELECT subquery.topic_id id, (ARRAY_AGG(subquery.post_number))[1] post_number, MAX(subquery.rank) rank, MAX(subquery.bumped_at) bumped_at FROM (SELECT "posts"."id", "posts"."user_id", "posts"."topic_id", "posts"."post_number", "posts"."raw", "posts"."cooked", "posts"."created_at", "posts"."updated_at", "posts"."reply_to_post_number", "posts"."reply_count", "posts"."quote_count", "posts"."deleted_at", "posts"."off_topic_count", "posts"."like_count", "posts"."incoming_link_count", "posts"."bookmark_count", "posts"."score", "posts"."reads", "posts"."post_type", "posts"."sort_order", "posts"."last_editor_id", "posts"."hidden", "posts"."hidden_reason_id", "posts"."notify_moderators_count", "posts"."spam_count", "posts"."illegal_count", "posts"."inappropriate_count", "posts"."last_version_at", "posts"."user_deleted", "posts"."reply_to_user_id", "posts"."percent_rank", "posts"."notify_user_count", "posts"."like_score", "posts"."deleted_by_id", "posts"."edit_reason", "posts"."word_count", "posts"."version", "posts"."cook_method", "posts"."wiki", "posts"."baked_at", "posts"."baked_version", "posts"."hidden_at", "posts"."self_edits", "posts"."reply_quoted", "posts"."via_email", "posts"."raw_email", "posts"."public_version", "posts"."action_code", "posts"."locked_by_id", "posts"."image_upload_id", (
      TS_RANK_CD(
        post_search_data.search_data,
        TO_TSQUERY('english', '''postgres'':*ABCD'),
        1|32
      ) *
      (
        CASE categories.search_priority
        WHEN 2
        THEN 0.6
        WHEN 3
        THEN 0.8
        WHEN 4
        THEN 1.2
        WHEN 5
        THEN 1.4
        ELSE
          CASE WHEN topics.closed
          THEN 0.9
          ELSE 1
          END
        END
      )
    )
     rank, topics.bumped_at bumped_at FROM "posts" INNER JOIN "post_search_data" ON "post_search_data"."post_id" = "posts"."id" INNER JOIN "topics" ON "topics"."id" = "posts"."topic_id" AND ("topics"."deleted_at" IS NULL) LEFT JOIN categories ON categories.id = topics.category_id WHERE ("posts"."deleted_at" IS NULL) AND "posts"."post_type" IN (1, 2, 3, 4) AND (topics.visible) AND (topics.archetype <> 'private_message') AND (post_search_data.search_data @@ TO_TSQUERY('english', '''postgres'':*ABCD')) AND (categories.id NOT IN (
      SELECT categories.id WHERE categories.search_priority = 1
    )
    ) AND ((categories.id IS NULL) OR (NOT categories.read_restricted))) subquery GROUP BY subquery.topic_id ORDER BY rank DESC, bumped_at DESC LIMIT 51 OFFSET 0) xxx) x ON x.id = posts.topic_id AND x.post_number = posts.post_number WHERE ("posts"."deleted_at" IS NULL) ORDER BY row_number;
    SQL
  end

  x.compare!
end
```

```
Warming up --------------------------------------
current aggregate search query
                         1.000  i/100ms
current aggregate search query with proper ranking
                         1.000  i/100ms
Calculating -------------------------------------
current aggregate search query
                         17.726  (± 0.0%) i/s -    178.000  in  10.045107s
current aggregate search query with proper ranking
                         17.802  (± 0.0%) i/s -    178.000  in  10.002230s

Comparison:
current aggregate search query with proper ranking:       17.8 i/s
current aggregate search query:       17.7 i/s - 1.00x  (± 0.00) slower
```
This commit is contained in:
Guo Xiang Tan
2020-07-07 15:36:57 +08:00
parent 4009c9f711
commit d8c796bc44
2 changed files with 241 additions and 166 deletions

View File

@ -175,7 +175,6 @@ describe Search do
raw: 'hello from mars, we just landed') }
it 'searches correctly' do
expect do
Search.execute('mars', type_filter: 'private_messages')
end.to raise_error(Discourse::InvalidAccess)
@ -368,6 +367,121 @@ describe Search do
end
end
context 'posts' do
let(:post) { Fabricate(:post) }
let(:topic) { post.topic }
let!(:reply) do
Fabricate(:post_with_long_raw_content,
topic: topic,
user: topic.user,
).tap { |post| post.update!(raw: "#{post.raw} elephant") }
end
let(:expected_blurb) do
"...quire content longer than the typical test post raw content. It really is some long content, folks. elephant"
end
it 'returns the post' do
result = Search.execute('elephant',
type_filter: 'topic',
include_blurbs: true
)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end
it 'returns the right post and blurb for searches with phrase' do
result = Search.execute('"elephant"',
type_filter: 'topic',
include_blurbs: true
)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end
it 'does not allow a post with repeated words to dominate the ranking' do
category = Fabricate(:category_with_definition, name: "winter is coming")
post = Fabricate(:post,
raw: "I think winter will end soon",
topic: Fabricate(:topic,
title: "dragon john snow winter",
category: category
)
)
post2 = Fabricate(:post,
raw: "I think #{'winter' * 20} will end soon",
topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
)
result = Search.execute('winter')
expect(result.posts.pluck(:id)).to eq([
post.id, category.topic.first_post.id, post2.id
])
end
it 'applies a small penalty to closed topic when ranking' do
post = Fabricate(:post,
raw: "My weekly update",
topic: Fabricate(:topic,
title: "A topic that will be closed",
closed: true
)
)
post2 = Fabricate(:post,
raw: "My weekly update",
topic: Fabricate(:topic,
title: "A topic that will be open"
)
)
result = Search.execute('weekly update')
expect(result.posts.pluck(:id)).to eq([post2.id, post.id])
end
it 'aggregates searches in a topic by returning the post with the highest rank' do
post = Fabricate(:post, topic: topic, raw: "this is a play post")
post2 = Fabricate(:post, topic: topic, raw: "play playing played")
post3 = Fabricate(:post, raw: "this is a play post")
results = Search.execute('play')
expect(results.posts.map(&:id)).to eq([
post2.id,
post3.id
])
end
it "allows the configuration of search to prefer recent posts" do
SiteSetting.search_prefer_recent_posts = true
SiteSetting.search_recent_posts_size = 1
post = Fabricate(:post, topic: topic, raw: "this is a play post")
results = Search.execute('play post')
expect(results.posts.map(&:id)).to eq([
post.id
])
post2 = Fabricate(:post, raw: "this is a play post")
results = Search.execute('play post')
expect(results.posts.map(&:id)).to eq([
post2.id,
post.id
])
ensure
Discourse.cache.clear
end
end
context 'topics' do
let(:post) { Fabricate(:post) }
let(:topic) { post.topic }
@ -435,80 +549,6 @@ describe Search do
end
end
context 'searching for a post' do
let!(:reply) do
Fabricate(:post_with_long_raw_content,
topic: topic,
user: topic.user,
).tap { |post| post.update!(raw: "#{post.raw} elephant") }
end
let(:expected_blurb) do
"...quire content longer than the typical test post raw content. It really is some long content, folks. elephant"
end
it 'returns the post' do
result = Search.execute('elephant',
type_filter: 'topic'
)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end
it 'returns the right post and blurb for searches with phrase' do
result = Search.execute('"elephant"',
type_filter: 'topic'
)
expect(result.posts).to contain_exactly(reply)
expect(result.blurb(reply)).to eq(expected_blurb)
end
it 'does not allow a post with repeated words to dominate the ranking' do
category = Fabricate(:category_with_definition, name: "winter is coming")
post = Fabricate(:post,
raw: "I think winter will end soon",
topic: Fabricate(:topic,
title: "dragon john snow winter",
category: category
)
)
post2 = Fabricate(:post,
raw: "I think #{'winter' * 20} will end soon",
topic: Fabricate(:topic, title: "dragon john snow summer", category: category)
)
result = Search.execute('winter')
expect(result.posts.pluck(:id)).to eq([
post.id, category.topic.first_post.id, post2.id
])
end
it 'applies a small penalty to closed topic when ranking' do
post = Fabricate(:post,
raw: "My weekly update",
topic: Fabricate(:topic,
title: "A topic that will be closed",
closed: true
)
)
post2 = Fabricate(:post,
raw: "My weekly update",
topic: Fabricate(:topic,
title: "A topic that will be open"
)
)
result = Search.execute('weekly update')
expect(result.posts.pluck(:id)).to eq([post2.id, post.id])
end
end
context 'searching for quoted title' do
it "can find quoted title" do
create_post(raw: "this is the raw body", title: "I am a title yeah")
@ -681,18 +721,19 @@ describe Search do
it "should return posts in the right order" do
raw = "The pure genuine evian"
post = freeze_time(10.seconds.from_now) { Fabricate(:post, topic: category.topic, raw: raw) }
post2 = freeze_time(20.seconds.from_now) { Fabricate(:post, topic: category2.topic, raw: raw) }
post = Fabricate(:post, topic: category.topic, raw: raw)
post2 = Fabricate(:post, topic: category2.topic, raw: raw)
post2.topic.update!(bumped_at: 10.seconds.from_now)
search = Search.execute(raw)
expect(search.posts).to eq([post2, post])
expect(search.posts.map(&:id)).to eq([post2.id, post.id])
category.update!(search_priority: Searchable::PRIORITIES[:high])
search = Search.execute(raw)
expect(search.posts).to eq([post, post2])
expect(search.posts.map(&:id)).to eq([post.id, post2.id])
end
end
@ -1192,6 +1233,18 @@ describe Search do
])
end
it 'can order by topic views' do
topic = Fabricate(:topic, views: 1)
topic2 = Fabricate(:topic, views: 2)
post = Fabricate(:post, raw: 'Topic', topic: topic)
post2 = Fabricate(:post, raw: 'Topic', topic: topic2)
expect(Search.execute('Topic order:views').posts.map(&:id)).to eq([
post2.id,
post.id
])
end
it 'can tokenize dots' do
post = Fabricate(:post, raw: 'Will.2000 Will.Bob.Bill...')
expect(Search.execute('bill').posts.map(&:id)).to eq([post.id])
@ -1343,8 +1396,10 @@ describe Search do
expect(Search.execute('bakey tags:lunch order:latest').posts.map(&:id))
.to eq([post8.id, post7.id])
expect(Search.execute('#food tags:lunch order:latest').posts.map(&:id))
.to eq([post8.id, post7.id])
expect(Search.execute('#food tags:lunch order:likes').posts.map(&:id))
.to eq([post7.id, post8.id])
end