PERF: Avoid parsing Post#cooked with Nokogiri for every search.

This commit is contained in:
Guo Xiang Tan
2020-07-17 16:27:30 +08:00
committed by Alan Guo Xiang Tan
parent b979579c1b
commit 181c4eb760
7 changed files with 64 additions and 33 deletions

View File

@ -10,7 +10,7 @@ class SimilarTopicsController < ApplicationController
attr_reader :topic attr_reader :topic
def blurb def blurb
Search::GroupedSearchResults.blurb_for(@topic.try(:blurb)) Search::GroupedSearchResults.blurb_for(cooked: @topic.try(:blurb))
end end
end end

View File

@ -1,7 +1,7 @@
# frozen_string_literal: true # frozen_string_literal: true
class SearchIndexer class SearchIndexer
POST_INDEX_VERSION = 3 POST_INDEX_VERSION = 4
MIN_POST_REINDEX_VERSION = 3 MIN_POST_REINDEX_VERSION = 3
TOPIC_INDEX_VERSION = 3 TOPIC_INDEX_VERSION = 3
CATEGORY_INDEX_VERSION = 3 CATEGORY_INDEX_VERSION = 3
@ -39,8 +39,6 @@ class SearchIndexer
setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D') setweight(to_tsvector('#{stemmer}', coalesce(:d,'')), 'D')
SQL SQL
indexed_data = search_data.select { |d| d.length > 0 }.join(' ')
ranked_params = { ranked_params = {
a: search_data[0], a: search_data[0],
b: search_data[1], b: search_data[1],
@ -48,6 +46,13 @@ class SearchIndexer
d: search_data[3], d: search_data[3],
} }
indexed_data =
if table.to_s == "post"
ranked_params[:d]
else
search_data.select { |d| d.length > 0 }.join(' ')
end
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0] tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
additional_lexemes = [] additional_lexemes = []
@ -105,7 +110,7 @@ class SearchIndexer
scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH] scrubbed_cooked = scrub_html_for_search(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]
# a bit inconsistent that we use title as A and body as B when in # a bit inconsistent that we use title as A and body as B when in
# the post index body is C # the post index body is D
update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked]) update_index(table: 'topic', id: topic_id, raw_data: [title, scrubbed_cooked])
end end
@ -165,9 +170,11 @@ class SearchIndexer
end end
category_name = topic.category&.name if topic category_name = topic.category&.name if topic
if topic if topic
tags = topic.tags.select(:id, :name) tags = topic.tags.select(:id, :name).to_a
unless tags.empty?
if tags.present?
tag_names = (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(' ') tag_names = (tags.map(&:name) + Tag.where(target_tag_id: tags.map(&:id)).pluck(:name)).join(' ')
end end
end end

View File

@ -1128,7 +1128,7 @@ class Search
end end
def posts_eager_loads(query) def posts_eager_loads(query)
query = query.includes(:user) query = query.includes(:user, :post_search_data)
topic_eager_loads = [:category] topic_eager_loads = [:category]
if SiteSetting.tagging_enabled if SiteSetting.tagging_enabled

View File

@ -58,7 +58,19 @@ class Search
end end
def blurb(post) def blurb(post)
GroupedSearchResults.blurb_for(post.cooked, @blurb_term, @blurb_length) opts = {
term: @blurb_term,
blurb_length: @blurb_length
}
if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION
opts[:cooked] = post.post_search_data.raw_data
opts[:scrub] = false
else
opts[:cooked] = post.cooked
end
GroupedSearchResults.blurb_for(**opts)
end end
def add(object) def add(object)
@ -73,9 +85,9 @@ class Search
end end
end end
def self.blurb_for(cooked, term = nil, blurb_length = BLURB_LENGTH) def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
blurb = nil blurb = nil
cooked = SearchIndexer.scrub_html_for_search(cooked) cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
urls = Set.new urls = Set.new
cooked.scan(URI.regexp(%w{http https})) { urls << $& } cooked.scan(URI.regexp(%w{http https})) { urls << $& }

View File

@ -38,7 +38,7 @@ describe Search do
link to a video file: https://somesite.com/content/somethingelse.MOV link to a video file: https://somesite.com/content/somethingelse.MOV
RAW RAW
result = Search::GroupedSearchResults.blurb_for(cooked) result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("link to an external page: https://google.com/?u=bar link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")}") expect(result).to eq("link to an external page: https://google.com/?u=bar link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")}")
end end
@ -51,7 +51,7 @@ describe Search do
http://localhost/uploads/default/original/1X/90adc0092b30c04b761541bc0322d0dce3d896e7.m4a http://localhost/uploads/default/original/1X/90adc0092b30c04b761541bc0322d0dce3d896e7.m4a
RAW RAW
result = Search::GroupedSearchResults.blurb_for(cooked) result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("Here goes a test cooked with enough characters to hit the blurb limit. Something is very interesting about this audio file. #{I18n.t("search.audio")}") expect(result).to eq("Here goes a test cooked with enough characters to hit the blurb limit. Something is very interesting about this audio file. #{I18n.t("search.audio")}")
end end
@ -59,7 +59,7 @@ describe Search do
cooked = <<~RAW cooked = <<~RAW
invalid URL: http:error] should not trip up blurb generation. invalid URL: http:error] should not trip up blurb generation.
RAW RAW
result = Search::GroupedSearchResults.blurb_for(cooked) result = Search::GroupedSearchResults.blurb_for(cooked: cooked)
expect(result).to eq("invalid URL: http:error] should not trip up blurb generation.") expect(result).to eq("invalid URL: http:error] should not trip up blurb generation.")
end end
end end

View File

@ -3,10 +3,22 @@
require 'rails_helper' require 'rails_helper'
describe SearchController do describe SearchController do
fab!(:awesome_topic) do
topic = Fabricate(:topic)
tag = Fabricate(:tag)
topic.tags << tag
Fabricate(:tag, target_tag_id: tag.id)
topic
end
fab!(:awesome_post) do fab!(:awesome_post) do
SearchIndexer.enable SearchIndexer.enable
Fabricate(:post, raw: 'this is my really awesome post') Fabricate(:post, topic: awesome_topic, raw: 'this is my really awesome post')
end
fab!(:awesome_post_2) do
SearchIndexer.enable
Fabricate(:post, raw: 'this is my really awesome post 2')
end end
fab!(:user) do fab!(:user) do
@ -95,10 +107,14 @@ describe SearchController do
data = response.parsed_body data = response.parsed_body
expect(data['posts'].length).to eq(1) expect(data['posts'].length).to eq(2)
expect(data['posts'][0]['id']).to eq(awesome_post.id) expect(data['posts'][0]['id']).to eq(awesome_post_2.id)
expect(data['posts'][0]['blurb']).to eq(awesome_post.raw) expect(data['posts'][0]['blurb']).to eq(awesome_post_2.raw)
expect(data['topics'][0]['id']).to eq(awesome_post.topic_id) expect(data['topics'][0]['id']).to eq(awesome_post_2.topic_id)
expect(data['posts'][1]['id']).to eq(awesome_post.id)
expect(data['posts'][1]['blurb']).to eq(awesome_post.raw)
expect(data['topics'][1]['id']).to eq(awesome_post.topic_id)
end end
it "can search correctly with advanced search filters" do it "can search correctly with advanced search filters" do

View File

@ -20,12 +20,13 @@ describe SearchIndexer do
it 'correctly indexes chinese' do it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN' SiteSetting.default_locale = 'zh_CN'
data = "你好世界" data = "你好世界"
expect(data.split(" ").length).to eq(1)
SearchIndexer.update_posts_index(post_id, "你好世界", "", "", nil) SearchIndexer.update_posts_index(post_id, "", "", "", data)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0] post_search_data = PostSearchData.find_by(post_id: post_id)
expect(raw_data.split(' ').length).to eq(2)
expect(post_search_data.raw_data).to eq("你好 世界")
expect(post_search_data.search_data).to eq("'世界':2 '你好':1")
end end
it 'extract youtube title' do it 'extract youtube title' do
@ -104,11 +105,6 @@ describe SearchIndexer do
expect(raw_data).to eq("This is a test") expect(raw_data).to eq("This is a test")
expect(locale).to eq(SiteSetting.default_locale) expect(locale).to eq(SiteSetting.default_locale)
expect(version).to eq(SearchIndexer::POST_INDEX_VERSION) expect(version).to eq(SearchIndexer::POST_INDEX_VERSION)
SearchIndexer.update_posts_index(post_id, "tester", "", nil, nil)
raw_data = PostSearchData.where(post_id: post_id).pluck(:raw_data)[0]
expect(raw_data).to eq("tester")
end end
describe '.index' do describe '.index' do
@ -118,10 +114,10 @@ describe SearchIndexer do
expect { post }.to change { PostSearchData.count }.by(1) expect { post }.to change { PostSearchData.count }.by(1)
expect { post.update!(raw: "this is new content") } expect { post.update!(raw: "this is new content") }
.to change { post.reload.post_search_data.raw_data } .to change { post.reload.post_search_data.search_data }
expect { post.update!(topic_id: Fabricate(:topic).id) } expect { post.update!(topic_id: Fabricate(:topic).id) }
.to change { post.reload.post_search_data.raw_data } .to change { post.reload.post_search_data.search_data }
end end
it 'should not index posts with empty raw' do it 'should not index posts with empty raw' do
@ -141,7 +137,7 @@ describe SearchIndexer do
topic = post.topic topic = post.topic
expect(post.post_search_data.raw_data).to eq( expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{topic.category.name} https://meta.discourse.org/some.png" "https://meta.discourse.org/some.png"
) )
end end
@ -158,7 +154,7 @@ describe SearchIndexer do
topic = post.topic topic = post.topic
expect(post.post_search_data.raw_data).to eq( expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{category.name} a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1" "a https://cnn.com , http://stuff.com.au b http://abc.net/xyz=1 abc.net/xyz=1"
) )
expect(post.post_search_data.search_data).to eq( expect(post.post_search_data.search_data).to eq(
@ -190,7 +186,7 @@ describe SearchIndexer do
) )
expect(post.post_search_data.raw_data).to eq( expect(post.post_search_data.raw_data).to eq(
"#{topic.title} #{topic.category.name} Let me see how I can fix this image white walkers GOT" "Let me see how I can fix this image white walkers GOT"
) )
end end
end end