Class: TfidfConverter
- Inherits: Jekyll::Generator (Object → Jekyll::Generator → TfidfConverter)
- Defined in: lib/jekyll_ranked_search.rb
Overview
Jekyll plugin to generate a TF-IDF search index for posts.
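To use the plugin in a Jekyll site, the generator is loaded like any other plugin gem. The snippet below is a minimal sketch; the gem name jekyll_ranked_search is assumed from the file path above and may differ:

# Gemfile
source "https://rubygems.org"

gem "jekyll"
gem "jekyll_ranked_search"  # assumed gem name

# _config.yml would then activate it, e.g.:
#   plugins:
#     - jekyll_ranked_search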
Instance Method Summary

- #generate(site) ⇒ Object
  Generate the search index and add search.json and search.js to the site's pages.
- #generate_index(site, docs) ⇒ Object
  Generate the search index and calculate TF-IDF values.
- #load_stopwords ⇒ Set<String>
  Load English stopwords from file.
- #search_js(site) ⇒ Object
  Create search.js from a template and return it as a Jekyll Page object.
- #search_json(site) ⇒ Object
  Create search.json from a template and return it as a Jekyll Page object.
- #tokenize_words(doc) ⇒ Array<String>
  Tokenize a document by removing special characters and splitting it into tokens.
Instance Method Details
#generate(site) ⇒ Object
# File 'lib/jekyll_ranked_search.rb', line 19

def generate(site)
  Jekyll.logger.info "Jekyll Ranked Search: Generating search index"

  self.generate_index(site, site.posts.docs)
  site.pages << self.search_json(site)
  site.pages << self.search_js(site)

  Jekyll.logger.info "Jekyll Ranked Search: Done"
end
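Jekyll instantiates every registered Jekyll::Generator subclass and calls #generate once per build, after the site has been read. The following is a hedged sketch of driving the generator manually, e.g. from a test; it assumes a valid site configuration and is not part of this gem:

require "jekyll"

site = Jekyll::Site.new(Jekyll.configuration({}))
site.read  # populate site.posts.docs

converter = TfidfConverter.new
converter.generate(site)

# site.pages now contains the generated search.json and js/search.js pages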
#generate_index(site, docs) ⇒ Object
Generate the search index and calculate TF-IDF values.
# File 'lib/jekyll_ranked_search.rb', line 30

def generate_index(site, docs)
  # All docs
  processed_docs = []

  # Map of word to document
  word2doc = {}

  # Bag of words, assigns word to index
  bow = {}

  # Term frequency per document in the format term_id,doc_id = freq
  # This is a sparse matrix to save disk space and memory on the receiving end
  tf = {}

  # Frequency of words in documents as sparse matrix
  df = {}

  # Total number of documents
  total_docs = docs.length

  # Markdown parser
  markdown = Redcarpet::Markdown.new(MarkdownRenderer)

  # Create vocabulary
  docs.each_with_index do |post, idx|
    content = markdown.render(post.content)

    # Tokenize content before applying any other transformations
    tokenized = self.tokenize_words "#{post.data['title']} #{content}"

    # Replace newlines with wide spaces and bullet points
    divider = " • "
    content.gsub!(/\n/, divider)

    # Remove trailing divider
    if content.end_with?(divider)
      content = content[0..-4]
    end

    # Take first n words of post
    n_words = 40
    splitted_content = content.split(" ")
    word_count = splitted_content.length
    content = splitted_content[..n_words].join(" ")  # The first n words of the post
    if word_count > n_words
      content += "..."
    end

    processed_docs.push({
      title: post.data['title'],
      url: post.url,
      date: post.data['date'].strftime("%FT%T%z"),
      text: content,
    })

    token_seen = false
    tokenized.each do |word|
      if !bow.include?(word)
        bow[word] = bow.length
      end

      # The key is the term_id which is calculated in the step before.
      word2doc[bow[word]] ||= Set.new
      word2doc[bow[word]] << idx

      tf["#{bow[word]},#{idx}"] ||= 0
      tf["#{bow[word]},#{idx}"] += 1

      if !token_seen
        df[bow[word]] ||= 0
        df[bow[word]] += 1
      end
    end
  end

  # Convert word2doc set to array
  word2doc.each_key do |key|
    word2doc[key] = word2doc[key].to_a
  end

  # Save in site data object for access in templates
  site.data['docs'] = processed_docs.to_json
  site.data['word2doc'] = word2doc.to_json
  site.data['bow'] = bow.to_json

  # Calculate tf-idf for each document in the shape term_id,doc_id = tfidf
  tfidf = {}
  tf.each do |idx, freq|
    token_idx, doc_idx = idx.split(',').map { |i| i.to_i }
    _idf = Math.log(total_docs / df[token_idx] + 0.00001)

    # Exponential decay over time (boost newer posts)
    boost = 1.2**doc_idx/(total_docs/2)

    # Calculate TF-IDF and boost newer posts by up to 20%
    tfidf[idx] = (freq * _idf * boost).round(4)
  end

  site.data['tfidf'] = tfidf.to_json
end
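To make the sparse "term_id,doc_id" layout concrete, here is a small self-contained sketch of the same bookkeeping on a toy corpus. For simplicity it uses a plain floating-point IDF and omits the recency boost, so its numbers will not match the plugin's output exactly:

require "json"

docs = [%w[ruby search index], %w[ruby plugin]]

bow = {}           # word => term_id
tf  = Hash.new(0)  # "term_id,doc_id" => term frequency
df  = Hash.new(0)  # term_id => number of documents containing the term

docs.each_with_index do |tokens, doc_id|
  tokens.each do |word|
    bow[word] ||= bow.length
    tf["#{bow[word]},#{doc_id}"] += 1
  end
  tokens.uniq.each { |word| df[bow[word]] += 1 }
end

tfidf = tf.to_h do |key, freq|
  term_id, _doc_id = key.split(",").map(&:to_i)
  [key, (freq * Math.log(docs.length.to_f / df[term_id])).round(4)]
end

puts tfidf.to_json
# "ruby" (term_id 0) occurs in both documents, so its score is 0.0;
# each of the rarer terms scores log(2/1).round(4) => 0.6931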
#load_stopwords ⇒ Set<String>
Load English stopwords from the bundled stopwords/en.txt file, one word per line.
# File 'lib/jekyll_ranked_search.rb', line 149

def load_stopwords
  filename = File.join(File.dirname(__FILE__), "stopwords/en.txt")
  Jekyll.logger.info "Loading stopwords: ", filename
  stopwords = Set.new
  File.open(filename, "r") do |f|
    f.each_line do |line|
      stopwords.add line.strip
    end
  end
  Jekyll.logger.info "Loaded #{stopwords.length} stopwords"
  stopwords
end
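A hedged usage example; the exact contents of stopwords/en.txt are not shown on this page, so the results below assume common English stopwords are listed:

converter = TfidfConverter.new
stopwords = converter.load_stopwords

stopwords.include?("the")    # => true, assuming "the" is in stopwords/en.txt
stopwords.include?("tfidf")  # => false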
#search_js(site) ⇒ Object
Create search.js from a template and return it as a Jekyll Page object.
# File 'lib/jekyll_ranked_search.rb', line 172

def search_js(site)
  search_js = File.read(File.join(File.dirname(__FILE__), "search.js"))
  page = Jekyll::PageWithoutAFile.new(site, __dir__, "", "js/search.js").tap do |p|
    p.content = search_js
  end
  page
end
#search_json(site) ⇒ Object
Create search.json from a template and return it as a Jekyll Page object.
# File 'lib/jekyll_ranked_search.rb', line 163

def search_json(site)
  template = File.read(File.join(File.dirname(__FILE__), "search.json"))
  page = Jekyll::PageWithoutAFile.new(site, __dir__, "", "search.json").tap do |p|
    p.content = template
  end
  page
end
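Both #search_json and #search_js return Jekyll::PageWithoutAFile objects, i.e. pages that exist only in memory until the site is written. A hedged sketch of the effect on the build output, assuming a site object as in the sketch under #generate above:

converter = TfidfConverter.new
site.pages << converter.search_json(site)
site.pages << converter.search_js(site)

site.render  # search.json can read the site.data values stored by #generate_index
site.write   # emits _site/search.json and _site/js/search.js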
#tokenize_words(doc) ⇒ Array<String>
Tokenize a document by removing special characters and splitting it into tokens.
# File 'lib/jekyll_ranked_search.rb', line 129

def tokenize_words(doc)
  # Remove stopwords from document
  @stopwords ||= self.load_stopwords

  # TODO: Remove Liquid tags via regex

  # Split document into tokens
  splitted_doc = doc.strip.downcase.split

  # Remove special characters
  splitted_doc.map! { |word| word.gsub(/[^a-z0-9_\/\-\s]/i, '') }

  # Remove stopwords in place
  splitted_doc.delete_if { |t| @stopwords.include? t }

  splitted_doc
end
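For illustration, a hedged example of what the tokenizer returns. The exact output depends on the bundled stopword list, which is assumed here to contain "a" and "for":

converter = TfidfConverter.new
converter.tokenize_words("A TF-IDF search index for Jekyll!")
# => ["tf-idf", "search", "index", "jekyll"]
# "a" and "for" are dropped as stopwords; "!" is stripped by the character
# filter, while "-" survives because it is in the allowed character class.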