diff --git a/app/assets/javascripts/discourse/app/components/topic-summary.js b/app/assets/javascripts/discourse/app/components/topic-summary.js index 902344fc5bd..c86b03777ec 100644 --- a/app/assets/javascripts/discourse/app/components/topic-summary.js +++ b/app/assets/javascripts/discourse/app/components/topic-summary.js @@ -4,6 +4,7 @@ import { ajax } from "discourse/lib/ajax"; import { popupAjaxError } from "discourse/lib/ajax-error"; import { action } from "@ember/object"; import { schedule } from "@ember/runloop"; +import { cookAsync } from "discourse/lib/text"; export default class TopicSummary extends Component { @tracked loading = false; @@ -16,7 +17,9 @@ export default class TopicSummary extends Component { ajax(`/t/${this.args.topicId}/strategy-summary`) .then((data) => { - this.summary = data.summary; + cookAsync(data.summary).then((cooked) => { + this.summary = cooked; + }); }) .catch(popupAjaxError) .finally(() => (this.loading = false)); diff --git a/app/controllers/topics_controller.rb b/app/controllers/topics_controller.rb index eb4ff526652..3bd7beb72c6 100644 --- a/app/controllers/topics_controller.rb +++ b/app/controllers/topics_controller.rb @@ -1178,18 +1178,7 @@ class TopicsController < ApplicationController RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! - hijack do - summary_opts = { - filter: "summary", - exclude_deleted_users: true, - exclude_hidden: true, - show_deleted: false, - } - - content = TopicView.new(topic, current_user, summary_opts).posts.pluck(:raw).join("\n") - - render json: { summary: strategy.summarize(content) } - end + hijack { render json: { summary: TopicSummarization.new(strategy).summarize(topic) } } end private diff --git a/app/models/post.rb b/app/models/post.rb index 014c72999eb..adccaf0d003 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -437,7 +437,7 @@ class Post < ActiveRecord::Base # percent rank has tons of ties where(topic_id: topic_id).where( [ - "id = ANY( + "posts.id = ANY( ( SELECT posts.id FROM posts diff --git a/app/models/summary_section.rb b/app/models/summary_section.rb new file mode 100644 index 00000000000..613c59b54e6 --- /dev/null +++ b/app/models/summary_section.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +class SummarySection < ActiveRecord::Base + belongs_to :target, polymorphic: true +end + +# == Schema Information +# +# Table name: summary_sections +# +# id :bigint not null, primary key +# target_id :integer not null +# target_type :string not null +# content_range :int4range +# summarized_text :string not null +# meta_section_id :integer +# original_content_sha :string not null +# algorithm :string not null +# created_at :datetime not null +# updated_at :datetime not null +# diff --git a/app/services/topic_summarization.rb b/app/services/topic_summarization.rb new file mode 100644 index 00000000000..b13452a739b --- /dev/null +++ b/app/services/topic_summarization.rb @@ -0,0 +1,90 @@ +# frozen_string_literal: true + +class TopicSummarization + def initialize(strategy) + @strategy = strategy + end + + def summarize(topic) + DistributedMutex.synchronize("toppic_summarization_#{topic.id}") do + existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil) + return existing_summary.summarized_text if existing_summary + + content = { + resource_path: "#{Discourse.base_path}/t/-/#{topic.id}", + content_title: topic.title, + contents: [], + } + + targets_data = summary_targets(topic).pluck(:post_number, :raw, :username) + + targets_data.map do |(pn, raw, username)| + content[:contents] << { poster: username, id: pn, text: raw } + end + + summarization_result = strategy.summarize(content) + + cached_summary(summarization_result, targets_data.map(&:first), topic) + + summarization_result[:summary] + end + end + + def summary_targets(topic) + topic.has_summary? ? best_replies(topic) : pick_selection(topic) + end + + private + + attr_reader :strategy + + def best_replies(topic) + Post + .summary(topic.id) + .where("post_type = ?", Post.types[:regular]) + .where("NOT hidden") + .joins(:user) + .order(:post_number) + end + + def pick_selection(topic) + posts = + Post + .where(topic_id: topic.id) + .where("post_type = ?", Post.types[:regular]) + .where("NOT hidden") + .order(:post_number) + + post_numbers = posts.limit(5).pluck(:post_number) + post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number) + post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number) + + Post + .where(topic_id: topic.id) + .joins(:user) + .where("post_number in (?)", post_numbers) + .order(:post_number) + end + + def cached_summary(result, post_numbers, topic) + main_summary = + SummarySection.create!( + target: topic, + algorithm: strategy.model, + content_range: (post_numbers.first..post_numbers.last), + summarized_text: result[:summary], + original_content_sha: Digest::SHA256.hexdigest(post_numbers.join), + ) + + result[:chunks].each do |chunk| + SummarySection.create!( + target: topic, + algorithm: strategy.model, + content_range: chunk[:ids].min..chunk[:ids].max, + summarized_text: chunk[:summary], + original_content_sha: Digest::SHA256.hexdigest(chunk[:ids].join), + meta_section_id: main_summary.id, + ) + end + end +end diff --git a/db/migrate/20230608163854_create_summary_sections_table.rb b/db/migrate/20230608163854_create_summary_sections_table.rb new file mode 100644 index 00000000000..b325f5f14a1 --- /dev/null +++ b/db/migrate/20230608163854_create_summary_sections_table.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +class CreateSummarySectionsTable < ActiveRecord::Migration[7.0] + def change + create_table :summary_sections do |t| + t.integer :target_id, null: false + t.string :target_type, null: false + t.int4range :content_range + t.string :summarized_text, null: false + t.integer :meta_section_id + t.string :original_content_sha, null: false + t.string :algorithm, null: false + t.timestamps + end + end +end diff --git a/lib/summarization/base.rb b/lib/summarization/base.rb index d4886476bbc..e70ac1c2a57 100644 --- a/lib/summarization/base.rb +++ b/lib/summarization/base.rb @@ -1,27 +1,28 @@ # frozen_string_literal: true +# Base class that defines the interface that every summarization +# strategy must implement. +# Above each method, you'll find an explanation of what +# it does and what it should return. + module Summarization class Base - def self.available_strategies - DiscoursePluginRegistry.summarization_strategies + class << self + def available_strategies + DiscoursePluginRegistry.summarization_strategies + end + + def find_strategy(strategy_model) + available_strategies.detect { |s| s.model == strategy_model } + end + + def selected_strategy + return if SiteSetting.summarization_strategy.blank? + + find_strategy(SiteSetting.summarization_strategy) + end end - def self.find_strategy(strategy_model) - available_strategies.detect { |s| s.model == strategy_model } - end - - def self.selected_strategy - return if SiteSetting.summarization_strategy.blank? - - find_strategy(SiteSetting.summarization_strategy) - end - - def initialize(model) - @model = model - end - - attr_reader :model - def can_request_summaries?(user) user_group_ids = user.group_ids @@ -30,20 +31,52 @@ module Summarization end end + # Some strategies could require other conditions to work correctly, + # like site settings. + # This method gets called when admins attempt to select it, + # checking if we met those conditions. def correctly_configured? raise NotImplemented end + # Strategy name to display to admins in the available strategies dropdown. def display_name raise NotImplemented end + # If we don't meet the conditions to enable this strategy, + # we'll display this hint as an error to admins. def configuration_hint raise NotImplemented end + # The idea behind this method is "give me a collection of texts, + # and I'll handle the summarization to the best of my capabilities.". + # It's important to emphasize the "collection of texts" part, which implies + # it's not tied to any model and expects the "content" to be a hash instead. + # + # @param content { Hash } - Includes the content to summarize, plus additional + # context to help the strategy produce a better result. Keys present in the content hash: + # - resource_path (optional): Helps the strategy build links to the content in the summary (e.g. "/t/-/:topic_id/POST_NUMBER") + # - content_title (optional): Provides guidance about what the content is about. + # - contents (required): Array of hashes with content to summarize (e.g. [{ poster: "asd", id: 1, text: "This is a text" }]) + # All keys are required. + # + # @returns { Hash } - The summarized content, plus chunks if the content couldn't be summarized in one pass. Example: + # { + # summary: "This is the final summary", + # chunks: [ + # { ids: [topic.first_post.post_number], summary: "this is the first chunk" }, + # { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" }, + # ], + # } def summarize(content) raise NotImplemented end + + # Returns the string we'll store in the selected strategy site setting. + def model + raise NotImplemented + end end end diff --git a/plugins/chat/app/controllers/chat/api/summaries_controller.rb b/plugins/chat/app/controllers/chat/api/summaries_controller.rb index 209aac46032..2070b6128bc 100644 --- a/plugins/chat/app/controllers/chat/api/summaries_controller.rb +++ b/plugins/chat/app/controllers/chat/api/summaries_controller.rb @@ -27,7 +27,7 @@ class Chat::Api::SummariesController < Chat::ApiController .map { "#{_1}: #{_2}" } .join("\n") - render json: { summary: strategy.summarize(content) } + render json: { summary: strategy.summarize(content).dig(:summary) } end end end diff --git a/plugins/chat/spec/system/chat_summarization_spec.rb b/plugins/chat/spec/system/chat_summarization_spec.rb index 38af780b02b..df504058e7d 100644 --- a/plugins/chat/spec/system/chat_summarization_spec.rb +++ b/plugins/chat/spec/system/chat_summarization_spec.rb @@ -11,10 +11,12 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t let(:chat) { PageObjects::Pages::Chat.new } + let(:summarization_result) { { summary: "This is a summary", chunks: [] } } + before do group.add(current_user) - strategy = DummyCustomSummarization.new("dummy") + strategy = DummyCustomSummarization.new(summarization_result) plugin.register_summarization_strategy(strategy) SiteSetting.summarization_strategy = strategy.model SiteSetting.custom_summarization_allowed_groups = group.id.to_s @@ -36,6 +38,6 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t find(".summarization-since").click find(".select-kit-row[data-value=\"3\"]").click - expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) + expect(find(".summary-area").text).to eq(summarization_result[:summary]) end end diff --git a/spec/lib/summarization/base_spec.rb b/spec/lib/summarization/base_spec.rb index 6b211d6f95a..9e3809eb37e 100644 --- a/spec/lib/summarization/base_spec.rb +++ b/spec/lib/summarization/base_spec.rb @@ -10,13 +10,13 @@ describe Summarization::Base do it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do SiteSetting.custom_summarization_allowed_groups = group.id - expect(described_class.new(nil).can_request_summaries?(user)).to eq(true) + expect(subject.can_request_summaries?(user)).to eq(true) end it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do SiteSetting.custom_summarization_allowed_groups = "" - expect(described_class.new(nil).can_request_summaries?(user)).to eq(false) + expect(subject.can_request_summaries?(user)).to eq(false) end end end diff --git a/spec/services/topic_summarization_spec.rb b/spec/services/topic_summarization_spec.rb new file mode 100644 index 00000000000..7e5f553a94c --- /dev/null +++ b/spec/services/topic_summarization_spec.rb @@ -0,0 +1,129 @@ +# frozen_string_literal: true + +describe TopicSummarization do + fab!(:topic) { Fabricate(:topic) } + fab!(:post_1) { Fabricate(:post, topic: topic) } + fab!(:post_2) { Fabricate(:post, topic: topic) } + + shared_examples "includes only public-visible topics" do + subject { described_class.new(DummyCustomSummarization.new({})) } + + it "only includes visible posts" do + topic.first_post.update!(hidden: true) + + posts = subject.summary_targets(topic) + + expect(posts.none?(&:hidden?)).to eq(true) + end + + it "doesn't include posts without users" do + topic.first_post.user.destroy! + + posts = subject.summary_targets(topic) + + expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil + end + + it "doesn't include deleted posts" do + topic.first_post.update!(user_id: nil) + + posts = subject.summary_targets(topic) + + expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil + end + end + + describe "#summary_targets" do + context "when the topic has a best replies summary" do + before { topic.has_summary = true } + + it_behaves_like "includes only public-visible topics" + end + + context "when the topic doesn't have a best replies summary" do + before { topic.has_summary = false } + + it_behaves_like "includes only public-visible topics" + end + end + + describe "#summarize" do + let(:strategy) { DummyCustomSummarization.new(summary) } + + subject { described_class.new(strategy) } + + def assert_summary_is_cached(topic, summary_response) + cached_summary = SummarySection.find_by(target: topic, meta_section_id: nil) + + expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number)) + expect(cached_summary.summarized_text).to eq(summary_response[:summary]) + expect(cached_summary.original_content_sha).to eq( + Digest::SHA256.hexdigest(topic.posts.map(&:post_number).join), + ) + expect(cached_summary.algorithm).to eq(strategy.model) + end + + def assert_chunk_is_cached(topic, chunk_response) + cached_chunk = + SummarySection + .where.not(meta_section_id: nil) + .find_by( + target: topic, + content_range: (chunk_response[:ids].min..chunk_response[:ids].max), + ) + + expect(cached_chunk.summarized_text).to eq(chunk_response[:summary]) + expect(cached_chunk.original_content_sha).to eq( + Digest::SHA256.hexdigest(chunk_response[:ids].join), + ) + expect(cached_chunk.algorithm).to eq(strategy.model) + end + + context "when the content was summarized in a single chunk" do + let(:summary) { { summary: "This is the final summary", chunks: [] } } + + it "caches the summary" do + summarized_text = subject.summarize(topic) + + expect(summarized_text).to eq(summary[:summary]) + + assert_summary_is_cached(topic, summary) + end + + it "returns the cached version in subsequent calls" do + subject.summarize(topic) + + cached_summary_text = "This is a cached summary" + cached_summary = + SummarySection.find_by(target: topic, meta_section_id: nil).update!( + summarized_text: cached_summary_text, + ) + + summarized_text = subject.summarize(topic) + expect(summarized_text).to eq(cached_summary_text) + end + end + + context "when the content was summarized in multiple chunks" do + let(:summary) do + { + summary: "This is the final summary", + chunks: [ + { ids: [topic.first_post.post_number], summary: "this is the first chunk" }, + { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" }, + ], + } + end + + it "caches the summary and each chunk" do + summarized_text = subject.summarize(topic) + + expect(summarized_text).to eq(summary[:summary]) + + assert_summary_is_cached(topic, summary) + + summary[:chunks].each { |c| assert_chunk_is_cached(topic, c) } + end + end + end +end diff --git a/spec/support/dummy_custom_summarization.rb b/spec/support/dummy_custom_summarization.rb index 05d4593113d..1724a0274f6 100644 --- a/spec/support/dummy_custom_summarization.rb +++ b/spec/support/dummy_custom_summarization.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true class DummyCustomSummarization < Summarization::Base - RESPONSE = "This is a summary of the content you gave me" + def initialize(summarization_result) + @summarization_result = summarization_result + end def display_name "dummy" @@ -15,7 +17,11 @@ class DummyCustomSummarization < Summarization::Base "hint" end + def model + "dummy" + end + def summarize(_content) - RESPONSE + @summarization_result end end diff --git a/spec/system/topic_summarization_spec.rb b/spec/system/topic_summarization_spec.rb index d54bc8444d6..e3fd5c54b41 100644 --- a/spec/system/topic_summarization_spec.rb +++ b/spec/system/topic_summarization_spec.rb @@ -10,9 +10,11 @@ RSpec.describe "Topic summarization", type: :system, js: true do let(:plugin) { Plugin::Instance.new } + let(:summarization_result) { { summary: "This is a summary", chunks: [] } } + before do sign_in(user) - strategy = DummyCustomSummarization.new("dummy") + strategy = DummyCustomSummarization.new(summarization_result) plugin.register_summarization_strategy(strategy) SiteSetting.summarization_strategy = strategy.model end @@ -24,6 +26,6 @@ RSpec.describe "Topic summarization", type: :system, js: true do expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true) - expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) + expect(find(".summary-area").text).to eq(summarization_result[:summary]) end end