Class: SearchIndexer

Inherits:

Object

Object
SearchIndexer

show all

Defined in: app/services/search_indexer.rb

Defined Under Namespace

Classes: HtmlScrubber

Constant Summary collapse

MIN_POST_BLURB_INDEX_VERSION =

POST_INDEX_VERSION =

TOPIC_INDEX_VERSION =

CATEGORY_INDEX_VERSION =

USER_INDEX_VERSION =

TAG_INDEX_VERSION =

REINDEX_VERSION = (the version to apply when issuing a background reindex)

TS_VECTOR_PARSE_REGEX =

/('([^']*|'')*'\:)(([0-9]+[A-D]?,?)+)/

Class Method Summary collapse

Class Method Details

.disable ⇒ `Object`



16
17
18

# File 'app/services/search_indexer.rb', line 16

# Switches indexing off by raising the class-level @disabled flag.
# `index` and the `queue_*_reindex` methods return early while it is set.
#
# @return [true]
def self.disable
  @disabled = true
end

.enable ⇒ `Object`



20
21
22

# File 'app/services/search_indexer.rb', line 20

# Switches indexing back on by clearing the class-level @disabled flag
# that `disable` sets.
#
# @return [false]
def self.enable
  @disabled = false
end

.index(obj, force: false) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 253

# Refreshes the search index rows that depend on +obj+.
#
# Dispatches on the class of +obj+ (Post, User, Topic, Category or Tag) and
# only does work when the relevant attribute has a saved change, or when
# +force+ is true. Does nothing while indexing is disabled.
#
# @param obj [Object] the record that was saved
# @param force [Boolean] reindex even when no tracked attribute changed
# @return [Object, nil] nil when disabled; otherwise the value of the final
#   tag-indexing step (nil when that step did not run)
def self.index(obj, force: false)
  return if @disabled

  # Posts and topics both contribute topic-level data (category, tags).
  topic =
    case obj
    when Topic
      obj
    when Post
      obj.topic
    end

  category_name = nil
  tag_names = nil

  if topic
    category_name = topic.category&.name

    topic_tags = topic.tags.select(:id, :name).to_a

    if topic_tags.present?
      own_names = topic_tags.map(&:name)
      # Names of tags whose target_tag_id points at one of the topic's tags.
      linked_names = Tag.where(target_tag_id: topic_tags.map(&:id)).pluck(:name)
      tag_names = (own_names + linked_names).join(" ")
    end
  end

  post_needs_index =
    Post === obj && obj.raw.present? &&
      (force || obj.saved_change_to_cooked? || obj.saved_change_to_topic_id?)

  if post_needs_index && topic
    SearchIndexer.update_posts_index(
      post_id: obj.id,
      topic_title: topic.title,
      category_name: category_name,
      topic_tags: tag_names,
      cooked: obj.cooked,
      private_message: topic.private_message?,
    )

    # The first post's body is also what the topic index stores.
    SearchIndexer.update_topics_index(topic.id, topic.title, obj.cooked) if obj.is_first_post?
  end

  if User === obj && (obj.saved_change_to_username? || obj.saved_change_to_name? || force)
    SearchIndexer.update_users_index(
      obj.id,
      obj.username_lower || "",
      obj.name ? obj.name.downcase : "",
      obj.user_custom_fields.searchable.map(&:value).join(" "),
    )
  end

  if Topic === obj && (obj.saved_change_to_title? || force)
    # A title change affects the first post's index row and the topic row.
    first_post = obj.posts && obj.posts.find_by(post_number: 1)

    if first_post
      SearchIndexer.update_posts_index(
        post_id: first_post.id,
        topic_title: obj.title,
        category_name: category_name,
        topic_tags: tag_names,
        cooked: first_post.cooked,
        private_message: obj.private_message?,
      )

      SearchIndexer.update_topics_index(obj.id, obj.title, first_post.cooked)
    end
  end

  if Category === obj && (obj.saved_change_to_name? || force)
    # Posts in the category index the category name, so flag them for reindex too.
    SearchIndexer.queue_category_posts_reindex(obj.id)
    SearchIndexer.update_categories_index(obj.id, obj.name)
  end

  SearchIndexer.update_tags_index(obj.id, obj.name) if Tag === obj && (obj.saved_change_to_name? || force)
end

.queue_category_posts_reindex(category_id) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 217

# Flags the search data of every post in a category for reindexing by
# setting its version to REINDEX_VERSION. No-op while indexing is disabled.
#
# @param category_id [Integer] id of the category whose posts are flagged
# @return [Object, nil] whatever DB.exec returns, or nil when disabled
def self.queue_category_posts_reindex(category_id)
  return if @disabled

  flag_posts_sql = <<~SQL
    UPDATE post_search_data
    SET version = :version
    FROM posts
    INNER JOIN topics ON posts.topic_id = topics.id
    INNER JOIN categories ON topics.category_id = categories.id
    WHERE post_search_data.post_id = posts.id
    AND categories.id = :category_id
  SQL

  DB.exec(flag_posts_sql, category_id: category_id, version: REINDEX_VERSION)
end

.queue_post_reindex(topic_id) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 241

# Flags the search data of every post in a topic for reindexing by setting
# its version to REINDEX_VERSION. No-op while indexing is disabled.
#
# @param topic_id [Integer] id of the topic whose posts are flagged
# @return [Object, nil] whatever DB.exec returns, or nil when disabled
def self.queue_post_reindex(topic_id)
  return if @disabled

  flag_posts_sql = <<~SQL
    UPDATE post_search_data
    SET version = :version
    FROM posts
    WHERE post_search_data.post_id = posts.id
    AND posts.topic_id = :topic_id
  SQL

  DB.exec(flag_posts_sql, topic_id: topic_id, version: REINDEX_VERSION)
end

.queue_users_reindex(user_ids) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 231

# Flags the search data of the given users for reindexing by setting its
# version to REINDEX_VERSION. No-op while indexing is disabled.
#
# @param user_ids [Array<Integer>] ids bound to the :user_ids placeholder
# @return [Object, nil] whatever DB.exec returns, or nil when disabled
def self.queue_users_reindex(user_ids)
  return if @disabled

  flag_users_sql = <<~SQL
    UPDATE user_search_data
    SET version = :version
    WHERE user_search_data.user_id IN (:user_ids)
  SQL

  DB.exec(flag_users_sql, user_ids: user_ids, version: REINDEX_VERSION)
end

.update_categories_index(category_id, name) ⇒ `Object`



209
210
211

# File 'app/services/search_indexer.rb', line 209

# Indexes a category, using its name as the only (A-weight) text.
#
# @param category_id [Integer] id of the category row
# @param name [String] category name
# @return [Object] whatever update_index returns
def self.update_categories_index(category_id, name)
  update_index(
    table: "category",
    id: category_id,
    a_weight: name,
  )
end

.update_index(table:, id:, a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 24

# Builds the weighted tsvector for one record and upserts it, together with
# the raw text, locale and index version, through the "<table>_search_data"
# model.
#
# @param table [String] base name; used to derive the "<table>_search_data"
#   model, the "<table>_id" key and the "<TABLE>_INDEX_VERSION" constant
# @param id [Object] value stored under the "<table>_id" key
# @param a_weight [String, nil] text indexed with weight 'A' (nil is treated as "")
# @param b_weight [String, nil] text indexed with weight 'B'
# @param c_weight [String, nil] text indexed with weight 'C'
# @param d_weight [String, nil] text indexed with weight 'D'
# @yield [params] the Hash about to be upserted, so the caller can add columns
#
# Any StandardError raised along the way is re-raised in the test environment
# and otherwise reported through Discourse.warn_exception (not propagated).
def self.update_index(table:, id:, a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil)
  raw_data = { a: a_weight, b: b_weight, c: c_weight, d: d_weight }

  # The version used in excerpts
  search_data = raw_data.transform_values { |data| Search.prepare_data(data || "", :index) }

  # The version used to build the index: every whitespace-delimited word is
  # truncated to SiteSetting.search_max_indexed_word_length characters.
  indexed_data =
    search_data.transform_values do |data|
      data.gsub(/\S+/) { |word| word[0...SiteSetting.search_max_indexed_word_length] }
    end

  table_name = "#{table}_search_data"
  foreign_key = "#{table}_id"

  # for user login and name use "simple" lowercase stemmer
  # NOTE(review): this compares against the String "user" only; a Symbol
  # table would fall through to Search.ts_config, while the raw_data branch
  # further down normalizes with table.to_s — confirm callers always pass Strings.
  stemmer = table == "user" ? "simple" : Search.ts_config

  # SQL expression that concatenates one weighted tsvector per field; the
  # :a..:d placeholders are bound from indexed_data below.
  ranked_index = <<~SQL
    setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:a,''))")}, 'A') ||
    setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:b,''))")}, 'B') ||
    setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:c,''))")}, 'C') ||
    setweight(to_tsvector('#{stemmer}', #{Search.wrap_unaccent("coalesce(:d,''))")}, 'D')
  SQL

  # Textual form of the tsvector; everything below manipulates it as a string.
  tsvector = DB.query_single("SELECT #{ranked_index}", indexed_data)[0]
  additional_lexemes = []

  # we also want to index parts of a domain name
  # that way stemmed single word searches will match
  additional_words = []

  # Find lexemes made of dot-separated alphanumeric segments, with their
  # position list. reduce only threads additional_lexemes through; the array
  # is mutated in place.
  tsvector
    .scan(/'(([a-zA-Z0-9]+\.)+[a-zA-Z0-9]+)'\:([\w+,]+)/)
    .reduce(additional_lexemes) do |array, (lexeme, _, positions)|
      count = 0

      # Skip lexemes consisting only of dotted digit groups (optionally ending in "*").
      if lexeme !~ /\A(\d+\.)?(\d+\.)*(\*|\d+)\z/
        loop do
          count += 1
          break if count >= 10 # Safeguard here to prevent infinite loop when a term has many dots
          term, _, remaining = lexeme.partition(".")
          break if remaining.blank?

          # The leading segment is collected so it can be stemmed later...
          additional_words << [term, positions]

          # ...and the remainder becomes an extra lexeme at the original positions.
          array << "'#{remaining}':#{positions}"
          lexeme = remaining
        end
      end

      array
    end

  # Run the collected leading segments through to_tsvector and rewrite the
  # positions it reports back to the positions of the lexeme each segment
  # came from. nil when nothing was collected.
  extra_domain_word_terms =
    if additional_words.length > 0
      DB
        .query_single(
          "SELECT to_tsvector(?, ?)",
          stemmer,
          additional_words.map { |term, _| term }.join(" "),
        )
        .first
        .scan(TS_VECTOR_PARSE_REGEX)
        .map do |term, _, indexes|
          # term is the quoted lexeme including its trailing ":" (first capture
          # of TS_VECTOR_PARSE_REGEX); indexes is the comma-separated position list.
          new_indexes =
            indexes
              .split(",")
              .map do |index|
                # Positions are treated as 1-based offsets into additional_words.
                existing_positions = additional_words[index.to_i - 1]
                if existing_positions
                  existing_positions[1]
                else
                  index
                end
              end
              .join(",")
          "#{term}#{new_indexes}"
        end
        .join(" ")
    end

  tsvector = "#{tsvector} #{additional_lexemes.join(" ")} #{extra_domain_word_terms}"

  # Optionally cap how many positions are kept per term and per weight family.
  if (max_dupes = SiteSetting.max_duplicate_search_index_terms) > 0
    reduced = []
    tsvector
      .scan(TS_VECTOR_PARSE_REGEX)
      .each do |term, _, indexes|
        family_counts = Hash.new(0)
        new_index_array = []

        indexes
          .split(",")
          .each do |index|
            # family is the trailing A-D weight letter, or nil when the
            # position carries no letter.
            family = nil
            family = index[-1] if index[-1].match?(/[A-D]/)
            # title dupes can completely dominate the index
            # so we limit them to 1
            if (family_counts[family] += 1) <= (family == "A" ? 1 : max_dupes)
              new_index_array << index
            end
          end
        reduced << "#{term.strip}#{new_index_array.join(",")}"
      end
    tsvector = reduced.join(" ")
  end

  # From here on indexed_data is the text stored in the raw_data column:
  # for posts only the cleaned D-weight text (clean_post_raw_data! is defined
  # outside this block), otherwise all non-empty fields joined by a space.
  indexed_data =
    if table.to_s == "post"
      clean_post_raw_data!(search_data[:d])
    else
      search_data.values.select { |d| d.length > 0 }.join(" ")
    end

  params = {
    "raw_data" => indexed_data,
    "#{foreign_key}" => id,
    "locale" => SiteSetting.default_locale,
    "version" => const_get("#{table.upcase}_INDEX_VERSION"),
    "search_data" => tsvector,
  }

  # Let the caller add or override columns before the row is written.
  yield params if block_given?
  # Resolve the model class from table_name and upsert the row.
  table_name.camelize.constantize.upsert(params)
rescue => e
  if Rails.env.test?
    raise
  else
    # TODO is there any way we can safely avoid this?
    # best way is probably pushing search indexer into a dedicated process so it no longer happens on save
    # instead in the post processor
    Discourse.warn_exception(
      e,
      message: "Unexpected error while indexing #{table} for search",
      env: {
        id: id,
      },
    )
  end
end

.update_posts_index(post_id:, topic_title:, category_name:, topic_tags:, cooked:, private_message:) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 177

# Indexes a post: topic title (A), category name (B), topic tags (C) and the
# scrubbed post body (D). The private_message flag is added to the upserted
# params through the block update_index yields to.
#
# @param post_id [Integer] id of the post row
# @param topic_title [String] title of the post's topic
# @param category_name [String, nil] name of the topic's category
# @param topic_tags [String, nil] space-separated tag names
# @param cooked [String] post HTML, passed through HtmlScrubber.scrub
# @param private_message [Boolean] stored as params["private_message"]
# @return [Object] whatever update_index returns
def self.update_posts_index(
  post_id:,
  topic_title:,
  category_name:,
  topic_tags:,
  cooked:,
  private_message:
)
  # The tsvector resulted from parsing a string can be double the size of
  # the original string. Since there is no way to estimate the length of
  # the expected tsvector, we limit the input to ~50% of the maximum
  # length of a tsvector (1_048_576 bytes).
  body_text = HtmlScrubber.scrub(cooked)[0..600_000]

  update_index(
    table: "post",
    id: post_id,
    a_weight: topic_title,
    b_weight: category_name,
    c_weight: topic_tags,
    d_weight: body_text,
  ) do |params|
    params["private_message"] = private_message
  end
end

.update_tags_index(tag_id, name) ⇒ `Object`



213
214
215

# File 'app/services/search_indexer.rb', line 213

# Indexes a tag, using its downcased name as the only (A-weight) text.
#
# @param tag_id [Integer] id of the tag row
# @param name [String] tag name; downcased before indexing
# @return [Object] whatever update_index returns
def self.update_tags_index(tag_id, name)
  lowercase_name = name.downcase

  update_index(table: "tag", id: tag_id, a_weight: lowercase_name)
end

.update_topics_index(topic_id, title, cooked) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 166

# Indexes a topic: title as A-weight text and the scrubbed body, cut to
# Topic::MAX_SIMILAR_BODY_LENGTH characters, as B-weight text.
#
# @param topic_id [Integer] id of the topic row
# @param title [String] topic title
# @param cooked [String] HTML of the body, passed through HtmlScrubber.scrub
# @return [Object] whatever update_index returns
def self.update_topics_index(topic_id, title, cooked)
  body_text = HtmlScrubber.scrub(cooked)[0...Topic::MAX_SIMILAR_BODY_LENGTH]

  # a bit inconsistent that we use title as A and body as B when in
  # the post index body is D
  update_index(table: "topic", id: topic_id, a_weight: title, b_weight: body_text)
end

.update_users_index(user_id, username, name, custom_fields) ⇒ `Object`

# File 'app/services/search_indexer.rb', line 199

# Indexes a user: username (A), name (B) and custom field text (C).
#
# @param user_id [Integer] id of the user row
# @param username [String] A-weight text
# @param name [String] B-weight text
# @param custom_fields [String] C-weight text
# @return [Object] whatever update_index returns
def self.update_users_index(user_id, username, name, custom_fields)
  weighted_text = { a_weight: username, b_weight: name, c_weight: custom_fields }

  update_index(table: "user", id: user_id, **weighted_text)
end

Class: SearchIndexer

Defined Under Namespace

Constant Summary collapse

Class Method Summary collapse

Class Method Details

.disable ⇒ Object

.enable ⇒ Object

.index(obj, force: false) ⇒ Object

.queue_category_posts_reindex(category_id) ⇒ Object

.queue_post_reindex(topic_id) ⇒ Object

.queue_users_reindex(user_ids) ⇒ Object

.update_categories_index(category_id, name) ⇒ Object

.update_index(table:, id:, a_weight: nil, b_weight: nil, c_weight: nil, d_weight: nil) ⇒ Object

.update_posts_index(post_id:, topic_title:, category_name:, topic_tags:, cooked:, private_message:) ⇒ Object

.update_tags_index(tag_id, name) ⇒ Object

.update_topics_index(topic_id, title, cooked) ⇒ Object

.update_users_index(user_id, username, name, custom_fields) ⇒ Object