Class: TfidfConverter

Inherits: Jekyll::Generator
Defined in:
lib/jekyll_ranked_search.rb

Overview

Jekyll plugin to generate a TF-IDF search index for posts.
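To use the generator, the gem must be registered as a Jekyll plugin. A minimal setup sketch (the gem name jekyll-ranked-search is inferred from the file path above and may differ):

  # Gemfile
  group :jekyll_plugins do
    gem "jekyll-ranked-search"
  end

Jekyll then instantiates the class and calls #generate on every build.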

Instance Method Summary

  • #generate(site) ⇒ Object
  • #generate_index(site, docs) ⇒ Object
    Generate the search index and calculate TF-IDF values.
  • #load_stopwords ⇒ Set<String>
    Load English stopwords from file.
  • #search_js(site) ⇒ Object
    Create search.js from its template and return it as a Jekyll Page object.
  • #search_json(site) ⇒ Object
    Create search.json from its template and return it as a Jekyll Page object.
  • #tokenize_words(doc) ⇒ Array<String>
    Tokenize a document by stripping special characters, splitting it into words, and removing stopwords.

Instance Method Details

#generate(site) ⇒ Object



# File 'lib/jekyll_ranked_search.rb', line 19

def generate(site)
  Jekyll.logger.info "Jekyll Ranked Search: Generating search index"

  self.generate_index(site, site.posts.docs)
  site.pages << self.search_json(site)
  site.pages << self.search_js(site)

  Jekyll.logger.info "Jekyll Ranked Search: Done"
end

#generate_index(site, docs) ⇒ Object

Generate the search index and calculate TF-IDF values



# File 'lib/jekyll_ranked_search.rb', line 30

def generate_index(site, docs)
  # All docs
  processed_docs = []
  # Map of word to document
  word2doc = {}
  # Bag of words, assigns word to index
  bow = {}
  # Term frequency per document in the format term_id,doc_id = freq
  # This is a sparse matrix to save disk space and memory on the receiving end
  tf = {}
  # Document frequency: the number of documents each term appears in
  df = {}
  # Total number of documents
  total_docs = docs.length

  # Markdown parser
  markdown = Redcarpet::Markdown.new(MarkdownRenderer)

  # Create vocabulary
  docs.each_with_index do |post, idx|
    content = markdown.render(post.content)

    # Tokenize content before applying any other transformations
    tokenized = self.tokenize_words "#{post.data['title']} #{content}"

    # Replace newlines with wide spaces and bullet points
    divider = "\u2003•\u2003"  # em space, bullet, em space (3 characters)
    content.gsub!(/\n/, divider)

    # Remove trailing divider
    if content.end_with?(divider)
      content = content[0..-4]
    end
    
    # Take the first n words of the post
    n_words = 40
    splitted_content = content.split(" ")
    word_count = splitted_content.length
    content = splitted_content.take(n_words).join(" ")  # The first n words of the post
    if word_count > n_words
      content += "..."
    end

    processed_docs.push({
      title: post.data['title'],
      url: post.url,
      date: post.data['date'].strftime("%FT%T%z"),
      text: content,
    })

    # Terms already counted toward this document's document frequency
    seen_terms = Set.new
    tokenized.each do |word|
      if !bow.include?(word)
        bow[word] = bow.length
      end

      # The key is the term_id which is calculated in the step before.
      word2doc[bow[word]] ||= Set.new
      word2doc[bow[word]] << idx

      tf["#{bow[word]},#{idx}"] ||= 0
      tf["#{bow[word]},#{idx}"] += 1

      # Count each term at most once per document
      unless seen_terms.include?(bow[word])
        df[bow[word]] ||= 0
        df[bow[word]] += 1
        seen_terms << bow[word]
      end
    end
  end

  # Convert word2doc set to array
  word2doc.each_key do |key|
    word2doc[key] = word2doc[key].to_a
  end

  # Save in site data object for access in templates
  site.data['docs'] = processed_docs.to_json
  site.data['word2doc'] = word2doc.to_json
  site.data['bow'] = bow.to_json

  # Calculate tf-idf for each document in the shape term_id,doc_id = tfidf
  tfidf = {}
  tf.each do |idx, freq|
    token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
    # Use float division so the ratio is not truncated; the epsilon keeps
    # the score nonzero for terms that occur in every document
    _idf = Math.log(total_docs.to_f / df[token_idx] + 0.00001)

    # Exponential growth with recency: posts are sorted oldest-first, so the
    # exponent rises from 0 towards 1 and the boost from 1.0 towards 1.2
    boost = 1.2 ** (doc_idx.to_f / total_docs)

    # Calculate TF-IDF and boost newer posts by up to 20%
    tfidf[idx] = (freq * _idf * boost).round(4)
  end
  
  site.data['tfidf'] = tfidf.to_json
end
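
To make the sparse layout concrete, here is a toy walk-through (illustration only, not part of the plugin) that mirrors the term_id,doc_id key format built above:

  docs = [%w[ruby search], %w[ruby ruby index]]

  bow, tf, df = {}, {}, {}
  docs.each_with_index do |tokens, doc_id|
    tokens.each do |w|
      bow[w] ||= bow.length          # assign term ids in encounter order
      key = "#{bow[w]},#{doc_id}"    # sparse matrix key: term_id,doc_id
      tf[key] = tf.fetch(key, 0) + 1
    end
    tokens.uniq.each { |w| df[bow[w]] = df.fetch(bow[w], 0) + 1 }
  end

  bow  # => {"ruby"=>0, "search"=>1, "index"=>2}
  tf   # => {"0,0"=>1, "1,0"=>1, "0,1"=>2, "2,1"=>1}
  df   # => {0=>2, 1=>1, 2=>1}

With N = 2 documents, the score for "ruby" in document 1 is tf * log(N/df) = 2 * Math.log(2.0/2) ≈ 0: terms that occur in every document carry almost no weight, which is exactly what the idf factor is for.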

#load_stopwords ⇒ Set<String>

Load English stopwords from file

Returns:

  • (Set<String>)

    the stopwords



# File 'lib/jekyll_ranked_search.rb', line 149

def load_stopwords
  filename = File.join(File.dirname(__FILE__), "stopwords/en.txt")
  Jekyll.logger.info "Loading stopwords: ", filename
  stopwords = Set.new
  File.open(filename, "r") do |f|
    f.each_line do |line|
      stopwords.add line.strip
    end
  end
  Jekyll.logger.info "Loaded #{stopwords.length} stopwords"
  stopwords
end
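
The expected file format follows from the loop above: one stopword per line, with surrounding whitespace stripped. A sketch of what stopwords/en.txt plausibly contains (the actual list ships with the gem and is not reproduced here):

  a
  an
  the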

#search_js(site) ⇒ Object

Create search.js from its template and return it as a Jekyll Page object



# File 'lib/jekyll_ranked_search.rb', line 172

def search_js(site)
  search_js = File.read(File.join(File.dirname(__FILE__), "search.js"))
  page = Jekyll::PageWithoutAFile.new(site, __dir__, "", "js/search.js").tap do |p|
    p.content = search_js
  end
  page
end

#search_json(site) ⇒ Object

Create search.json from its template and return it as a Jekyll Page object



# File 'lib/jekyll_ranked_search.rb', line 163

def search_json(site)
  template = File.read(File.join(File.dirname(__FILE__), "search.json"))
  page = Jekyll::PageWithoutAFile.new(site, __dir__, "", "search.json").tap do |p|
    p.content = template
  end
  page
end
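
Both #search_json and #search_js lean on Jekyll::PageWithoutAFile, a Jekyll page class whose content is assigned programmatically rather than read from a source file on disk. The same pattern works for any generated artifact; a minimal sketch with a made-up output path:

  # Emit _site/hello/index.json without a corresponding source file
  page = Jekyll::PageWithoutAFile.new(site, __dir__, "hello", "index.json").tap do |p|
    p.content = '{"greeting": "hello"}'
  end
  site.pages << page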

#tokenize_words(doc) ⇒ Array<String>

Tokenize a document by stripping special characters, splitting it into words, and removing stopwords.

Parameters:

  • doc (String)

    The document to tokenize

Returns:

  • (Array<String>)

    individual tokens/words



# File 'lib/jekyll_ranked_search.rb', line 129

def tokenize_words(doc)
  # Remove stopwords from document
  @stopwords ||= self.load_stopwords

  # TODO: Remove Liquid tags via regex

  # Split document into tokens
  splitted_doc = doc.strip.downcase.split

  # Remove special characters
  splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }

  # Remove stopwords in place
  splitted_doc.delete_if { |t| @stopwords.include? t }

  splitted_doc
end
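
For illustration, the expected behavior on a small input (assuming common words such as "the" appear in stopwords/en.txt):

  converter = TfidfConverter.new
  converter.tokenize_words "The plugin's TF-IDF search!"
  # => ["plugins", "tf-idf", "search"]

"The" is dropped as a stopword, the apostrophe and the "!" are stripped by the character filter, and the hyphen in "tf-idf" survives because it is whitelisted.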