DEV: Add script to generate schema for intermediate DB (#24484)

This will be used by migration scripts.
2025-05-22 07:53:49 +08:00 · 2024-01-22 17:45:52 +00:00
parent 4f901cae8f
commit 93264da206
5 changed files with 1127 additions and 0 deletions
--- a/migrations/scripts/schema.yml
+++ b/migrations/scripts/schema.yml
@ -0,0 +1,481 @@
+## Configuration options for the base intermediate schema generator
+##
+## After modifying this file, regenerate the base intermediate schema
+## by running the `generate_schema` script.
+
+# Default relative path for generated base schema file.
+# An absolute path can also be provided to the script as the first CLI argument.
+# If the CLI argument is present, it takes precedence over the value specified here.
+output_file_path: ../common/intermediate_db_schema/000_base_schema.sql
+
+## Tables to include in the generated base intermediate schema.
+##
+## Available table options:
+## virtual:     Boolean. Enables the inclusion of a table in the schema solely based
+##              on the provided configuration. A virtual table does not need to be available in the core schema.
+## ignore:      List of columns to ignore. Convenient if most of the table's columns are needed.
+##              Usage is mutually exclusive with the `include` option. Only one should be used at a time.
+## include:     List of columns to include. Convenient if only a few columns are needed.
+##              Usage is mutually exclusive with the `ignore` option. Only one should be used at a time.
+## primary_key: Literal or list of columns to use as primary key.
+## extend:      List of objects describing columns to be added/extended.
+##              The following options are available for an "extend" object:
+##                name:    Required. The name of the column being extended.
+##                is_null: Specifies if the column can be null.
+##                type:    Column type. Defaults to TEXT.
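+## indexes:     List of indexes to create. The following options are available for an "index" object:
+##                name:      Index name.
+##                columns:   List of column(s) to index.
+##                unique:    Boolean. Creates a UNIQUE index when true.
+##                condition: SQL clause appended verbatim after the column list,
+##                           e.g. a `WHERE ...` filter for a partial index.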
+tables:
+  schema_migrations:
+    virtual: true
+    primary_key: path
+    extend:
+      - name: path
+        is_null: false
+      - name: created_at
+        type: datetime
+  config:
+    virtual: true
+    primary_key: name
+    extend:
+      - name: name
+        is_null: false
+      - name: value
+        is_null: false
+  log_entries:
+    virtual: true
+    extend:
+      - name: created_at
+        type: datetime
+        is_null: false
+      - name: type
+        is_null: false
+      - name: message
+        is_null: false
+      - name: exception
+      - name: details
+  users:
+    ignore:
+      - seen_notification_id
+      - last_posted_at
+      - password_hash
+      - salt
+      - active
+      - last_emailed_at
+      - approved_by_id
+      - previous_visit_at
+      - suspended_at
+      - suspended_till
+      - views
+      - flag_level
+      - ip_address
+      - title
+      - uploaded_avatar_id
+      - locale
+      - primary_group_id
+      - first_seen_at
+      - silenced_till
+      - group_locked_trust_level
+      - manual_locked_trust_level
+      - secure_identifier
+      - flair_group_id
+      - last_seen_reviewable_id
+      - password_algorithm
+      - username_lower
+    extend:
+      - name: email
+      - name: created_at
+        is_null: true
+      - name: staged
+        is_null: true
+      - name: avatar_path
+      - name: avatar_url
+      - name: avatar_upload_id
+      - name: bio
+      - name: password
+        is_null: true
+      - name: trust_level
+        is_null: true
+      - name: suspension
+      - name: location
+      - name: website
+      - name: old_relative_url
+      - name: sso_record
+      - name: anonymized
+        type: boolean
+      - name: original_username
+      - name: timezone
+      - name: email_level
+        type: integer
+      - name: email_messages_level
+        type: integer
+      - name: email_digests
+        type: boolean
+  categories:
+    ignore:
+      - topic_id
+      - topic_count
+      - user_id
+      - topics_year
+      - topics_month
+      - topics_week
+      - auto_close_hours
+      - post_count
+      - latest_post_id
+      - latest_topic_id
+      - posts_year
+      - posts_month
+      - posts_week
+      - email_in
+      - email_in_allow_strangers
+      - topics_day
+      - posts_day
+      - allow_badges
+      - name_lower
+      - auto_close_based_on_last_post
+      - topic_template
+      - contains_messages
+      - sort_order
+      - sort_ascending
+      - uploaded_logo_id
+      - uploaded_background_id
+      - topic_featured_link_allowed
+      - all_topics_wiki
+      - show_subcategory_list
+      - num_featured_topics
+      - default_view
+      - subcategory_list_style
+      - default_top_period
+      - mailinglist_mirror
+      - minimum_required_tags
+      - navigate_to_first_post_after_read
+      - search_priority
+      - allow_global_tags
+      - reviewable_by_group_id
+      - read_only_banner
+      - default_list_filter
+      - allow_unlimited_owner_edits_on_first_post
+      - default_slow_mode_seconds
+      - uploaded_logo_dark_id
+      - uploaded_background_dark_id
+    extend:
+      - name: about_topic_title
+      - name: old_relative_url
+      - name: existing_id
+        type: integer
+      - name: permissions
+        type: json_text # JSON_TEXT ???
+      - name: logo_upload_id
+      - name: tag_group_ids
+        type: json_text # JSON_TEXT ???
+  topics:
+    ignore:
+      - last_posted_at
+      - posts_count
+      - last_post_user_id
+      - reply_count
+      - featured_user1_id
+      - featured_user2_id
+      - featured_user3_id
+      - featured_user4_id
+      - deleted_at
+      - highest_post_number
+      - like_count
+      - incoming_link_count
+      - moderator_posts_count
+      - bumped_at
+      - has_summary
+      - archetype
+      - notify_moderators_count
+      - spam_count
+      - score
+      - percent_rank
+      - slug
+      - deleted_by_id
+      - participant_count
+      - word_count
+      - excerpt
+      - fancy_title
+      - highest_staff_post_number
+      - featured_link
+      - reviewable_score
+      - image_upload_id
+      - slow_mode_seconds
+      - bannered_until
+      - external_id
+    extend:
+      - name: old_relative_url
+      - name: private_message
+  posts:
+    ignore:
+      - cooked
+      - reply_to_post_number
+      - reply_count
+      - quote_count
+      - deleted_at
+      - off_topic_count
+      - incoming_link_count
+      - bookmark_count
+      - score
+      - reads
+      - post_type
+      - sort_order
+      - last_editor_id
+      - hidden
+      - hidden_reason_id
+      - notify_moderators_count
+      - spam_count
+      - illegal_count
+      - inappropriate_count
+      - last_version_at
+      - user_deleted
+      - reply_to_user_id
+      - percent_rank
+      - notify_user_count
+      - like_score
+      - deleted_by_id
+      - edit_reason
+      - word_count
+      - version
+      - cook_method
+      - wiki
+      - baked_at
+      - baked_version
+      - hidden_at
+      - self_edits
+      - reply_quoted
+      - via_email
+      - raw_email
+      - public_version
+      - action_code
+      - locked_by_id
+      - image_upload_id
+      - outbound_message_id
+      - qa_vote_count   # TODO: added from plugin, maybe skip these automatically for core schema?
+    extend:
+      - name: reply_to_post_id # NOTE: should this be text??
+      - name: original_raw
+      - name: upload_ids
+        type: json_text
+      - name: post_number
+        type: integer
+      - name: old_relative_url
+      - name: accepted_answer
+        type: boolean
+      - name: small_action
+      - name: whisper
+        type: boolean
+      - name: placeholders
+        type: json_text
+    indexes:
+      - name: posts_by_topic_post_number
+        columns: [topic_id, post_number]
+  uploads:
+    ignore:
+      - original_filename
+      - filesize
+      - width
+      - height
+      - url
+      - created_at
+      - sha1
+      - origin
+      - retain_hours
+      - extension
+      - thumbnail_width
+      - thumbnail_height
+      - etag
+      - secure
+      - access_control_post_id
+      - original_sha1
+      - animated
+      - verification_status
+      - security_last_changed_at
+      - security_last_changed_reason
+      - dominant_color
+    extend:
+      - name: filename
+        is_null: false
+      - name: relative_path
+      - name: type
+      - name: data
+        type: blob
+  groups:
+    include:
+      - id
+      - name
+      - full_name
+      - visibility_level
+      - members_visibility_level
+      - mentionable_level
+      - messageable_level
+    extend:
+      - name: description
+  group_members:
+    virtual: true
+    primary_key: [group_id, user_id]
+    extend:
+      - name: group_id
+        type: integer
+      - name: user_id
+        type: integer
+      - name: owner
+        type: boolean
+  likes:
+    virtual: true
+    primary_key: [user_id, post_id]
+    extend:
+      - name: post_id
+        type: integer
+        is_null: false
+      - name: user_id
+        type: integer
+        is_null: false
+      - name: created_at
+        type: datetime
+        is_null: false
+  # TODO: Pending default values & auto incrementing id column
+  user_fields:
+    ignore:
+      - created_at
+      - external_name
+      - external_type
+    extend:
+      - name: options
+        type: json_text
+  muted_users:
+    primary_key: [user_id, muted_user_id]
+    ignore:
+      - id
+      - created_at
+  # NOTE: Perhaps use core's user_field_options instead?
+  user_field_values:
+    virtual: true
+    extend:
+      - name: user_id
+        type: integer
+        is_null: false
+      - name: field_id
+        type: integer
+        is_null: false
+      - name: is_multiselect_field
+        type: boolean
+        is_null: false
+      - name: value
+    indexes:
+      - name: user_field_values_multiselect
+        columns: [user_id, field_id, value]
+        unique: true
+        condition: WHERE is_multiselect_field = TRUE
+      - name: user_field_values_not_multiselect
+        columns: [user_id, field_id]
+        unique: true
+        condition: WHERE is_multiselect_field = FALSE
+  tags:
+    include:
+      - id
+      - name
+    extend:
+      - name: tag_group_id
+        type: integer
+  tag_groups:
+    include:
+      - id
+      - name
+  topic_tags:
+    primary_key: [topic_id, tag_id]
+    ignore:
+      - id
+      - created_at
+  tag_users:
+    primary_key: [tag_id, user_id]
+    ignore:
+      - id
+      - created_at
+  badges:
+    ignore:
+      - grant_count
+      - allow_title
+      - icon
+      - listable
+      - target_posts
+      - enabled
+      - auto_revoke
+      - trigger
+      - show_posts
+      - system
+      - image
+      - badge_grouping_id
+    extend:
+      - name: bage_group # NOTE(review): possibly a typo for `badge_group` -- confirm before renaming
+  user_badges:
+    include:
+      - user_id
+      - badge_id
+      - granted_at
+  topic_users:
+    primary_key: [user_id, topic_id]
+    ignore:
+      - id
+      - posted
+      - cleared_pinned_at
+      - last_emailed_post_number
+      - liked
+      - bookmarked
+      - last_posted_at
+  permalink_normalizations:
+    virtual: true
+    primary_key: normalization
+    extend:
+      - name: normalization
+        is_null: false
+  site_settings:
+    include:
+      - name
+      - value
+    extend:
+      - name: action
+  category_custom_fields:
+    primary_key: [category_id, name]
+    ignore:
+      - id
+      - created_at
+  post_custom_fields:
+    primary_key: [post_id, name]
+    ignore:
+      - id
+      - created_at
+  polls: {}
+  poll_options:
+    ignore:
+      - digest
+      - html
+      - anonymous_votes
+    extend:
+      - name: poll_id
+        is_null: false
+      - name: text
+        is_null: false
+      - name: position
+        type: integer
+      - name: created_at
+        is_null: true
+  poll_votes:
+    primary_key: [poll_option_id, user_id]
+    ignore: [poll_id]
+    extend:
+      - name: created_at
+        is_null: true
+      - name: poll_option_id
+        is_null: false
+      - name: user_id
+        is_null: false
+## Schema-wide column configuration options. These options apply to all tables.
+## See table specific column configuration options above.
+##
+## Available Options:
+## ignore:  List of core/plugin table columns to ignore and exclude from intermediate schema.
+columns:
+  ignore:
+    - updated_at
--- a/migrations/scripts/schema_generator
+++ b/migrations/scripts/schema_generator
@ -0,0 +1,263 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Generate the converter's base intermediate database migration file from
+# the core database state and YAML configuration in schema.yml
+# Invoke from core root directory as `./migrations/scripts/generate_schema`
+# It accepts an optional command line argument for the output file path which
+# overrides the path configured in schema.yml
+
+puts "Loading application..."
+require_relative "../../config/environment"
+
+module Migrations
+  class SchemaGenerator
+    def initialize(opts = {})
+      config = YAML.load_file(File.join(__dir__, "schema.yml"), symbolize_names: true)
+
+      @core_db_connection = ActiveRecord::Base.connection
+      @output_stream = StringIO.new
+      @indirectly_ignored_columns = Hash.new { |h, k| h[k] = [] }
+
+      @output_file_path = opts[:output_file_path] || config[:output_file_path]
+
+      @table_configs = config[:tables]
+      @column_configs = config[:columns]
+
+      @configured_table_names = @table_configs&.keys&.sort || []
+      @global_column_ignore_list = @column_configs[:ignore] || []
+    end
+
+    def run
+      puts "Generating base converter migration file for Discourse #{Discourse::VERSION::STRING}"
+
+      generate_header
+      generate_tables
+      generate_indirectly_ignored_columns_log
+      generate_migration_file
+
+      puts "", "Done"
+    end
+
+    private
+
+    def generate_header
+      @output_stream.puts <<~HEADER
+        /*
+         This file is auto-generated from the Discourse core database schema. Instead of editing it directly,
+         please update the `schema.yml` configuration file and re-run the `generate_schema` script to update it.
+        */
+      HEADER
+    end
+
+    def generate_tables
+      puts "Generating tables..."
+
+      @configured_table_names.each do |name|
+        raise "Core table named '#{name}' not found" unless valid_table?(name)
+
+        generate_table(name)
+      end
+    end
+
+    def generate_indirectly_ignored_columns_log
+      puts "Generating indirectly ignored column list..."
+
+      @output_stream.puts "\n\n/*"
+
+      @output_stream.puts <<~NOTE
+        Core table columns implicitly excluded from the generated schema above via the `include` configuration option
+        in `schema.yml`. This serves as an inventory of these columns, allowing new core additions to be tracked and,
+        if necessary, synchronized with the intermediate database schema.\n
+      NOTE
+
+      @indirectly_ignored_columns.each_with_index do |(table_name, columns), index|
+        next if virtual_table?(table_name) || columns.blank?
+
+        @output_stream.puts "" if index.positive?
+        @output_stream.puts "Table: #{table_name}"
+        @output_stream.puts "--------#{"-" * table_name.length}"
+
+        columns.each do |column|
+          @output_stream.puts " #{column.name} #{column.type} #{column.null}"
+        end
+      end
+
+      @output_stream.puts "*/"
+    end
+
+    def generate_migration_file
+      file_path = File.expand_path(@output_file_path, __dir__)
+
+      puts "Generating base migration file '#{file_path}'..."
+
+      File.open(file_path, "w") { |f| f << @output_stream.string.chomp }
+    end
+
+    def generate_column_definition(column)
+      definition = "  #{column.name} #{type(column)}"
+      definition << " NOT NULL" unless column.null
+
+      definition
+    end
+
+    def generate_index(table_name, index)
+      @output_stream.print "CREATE "
+      @output_stream.print "UNIQUE " if index[:unique]
+      @output_stream.print "INDEX #{index[:name]} ON #{table_name} (#{index[:columns].join(", ")})"
+      @output_stream.print " #{index[:condition]}" if index[:condition].present?
+      @output_stream.puts ";"
+    end
+
+    def column_list_for(table_name)
+      ignore_columns = @table_configs.dig(table_name, :ignore) || []
+      include_columns = @table_configs.dig(table_name, :include) || []
+
+      include_columns.present? ? [:include, include_columns] : [:ignore, ignore_columns]
+    end
+
+    def generate_table(name)
+      puts "Generating #{name}..."
+
+      column_definitions = []
+      column_records = columns(name)
+      mode, column_list = column_list_for(name)
+      indexes = indexes(name)
+      configured_primary_key = primary_key(name)
+
+      primary_key, composite_key =
+        if configured_primary_key.present?
+          [configured_primary_key].flatten.each do |pk|
+            if column_records.map(&:name).exclude?(pk)
+              raise "Column named '#{pk}' does not exist in table '#{name}'"
+            end
+          end
+
+          [
+            configured_primary_key,
+            configured_primary_key.is_a?(Array) && configured_primary_key.length > 1,
+          ]
+        else
+          virtual_table?(name) ? [] : [@core_db_connection.primary_key(name), false]
+        end
+
+      @output_stream.puts ""
+      @output_stream.puts "CREATE TABLE #{name} ("
+
+      if !composite_key && primary_key.present?
+        primary_key_column = column_records.find { |c| c.name == primary_key }
+
+        if (mode == :include && column_list.include?(primary_key_column.name)) ||
+             (mode == :ignore && column_list.exclude?(primary_key_column.name))
+          column_definitions << "  #{primary_key_column.name} #{type(primary_key_column)} NOT NULL PRIMARY KEY"
+        end
+      end
+
+      column_records.each do |column|
+        next if @global_column_ignore_list.include?(column.name)
+        next if (mode == :ignore) && column_list.include?(column.name)
+        if !column.is_a?(CustomColumn) && (mode == :include) && column_list.exclude?(column.name)
+          @indirectly_ignored_columns[name] << column
+
+          next
+        end
+        next if !composite_key && (column.name == primary_key)
+
+        column_definitions << generate_column_definition(column)
+      end
+
+      column_definitions << "  PRIMARY KEY (#{primary_key.join(", ")})" if composite_key
+
+      @output_stream.puts column_definitions.join(",\n")
+      @output_stream.puts ");"
+      @output_stream.puts "" if indexes.present?
+
+      indexes.each { |index| generate_index(name, index) }
+    end
+
+    class CustomColumn
+      attr_reader :name
+
+      def initialize(name, type, null)
+        @name = name
+        @raw_type = type
+        @raw_null = null
+      end
+
+      def type
+        @raw_type&.to_sym || :text
+      end
+
+      def null
+        @raw_null.nil? ? true : @raw_null
+      end
+
+      def merge!(other_column)
+        @raw_null = other_column.null if @raw_null.nil?
+        @raw_type ||= other_column.type
+
+        self
+      end
+    end
+
+    def columns(name)
+      extensions = column_extensions(name)
+
+      return extensions if virtual_table?(name)
+
+      default_columns = @core_db_connection.columns(name)
+
+      return default_columns if extensions.blank?
+
+      extended_columns =
+        default_columns.map do |default_column|
+          extension = extensions.find { |ext| ext.name == default_column.name }
+
+          if extension
+            extensions.delete(extension)
+
+            extension.merge!(default_column)
+          else
+            default_column
+          end
+        end
+
+      extended_columns + extensions
+    end
+
+    def column_extensions(name)
+      extensions = @table_configs.dig(name, :extend)
+
+      return [] if extensions.nil?
+
+      extensions.map { |column| CustomColumn.new(column[:name], column[:type], column[:is_null]) }
+    end
+
+    def type(column)
+      case column.type
+      when :string, :inet
+        "TEXT"
+      else
+        column.type.to_s.upcase
+      end
+    end
+
+    def valid_table?(name)
+      @core_db_connection.tables.include?(name.to_s) || virtual_table?(name)
+    end
+
+    def virtual_table?(name)
+      !!@table_configs.dig(name, :virtual)
+    end
+
+    def indexes(table_name)
+      @table_configs.dig(table_name, :indexes) || []
+    end
+
+    def primary_key(table_name)
+      @table_configs.dig(table_name, :primary_key)
+    end
+  end
+end
+
+Migrations::SchemaGenerator.new(output_file_path: ARGV.first).run