Class: Makasi::SearchIndex
- Inherits:
-
Object
- Object
- Makasi::SearchIndex
- Defined in:
- lib/makasi/search_index.rb
Constant Summary collapse
- MAX_LITERAL_SIZE =
4095
- MAX_TEXT_SIZE =
262144
Instance Method Summary collapse
- #add_item_to_cloudsearch(cloudsearch_doc, html_doc) ⇒ Object
- #asari ⇒ Object
- #content_of(doc) ⇒ Object
- #extract_text(nodes) ⇒ Object
- #language_of(doc) ⇒ Object
- #load_page(url, limit = 10) ⇒ Object
- #meta_tag_for(doc, name) ⇒ Object
- #read_sitemap ⇒ Object
- #reindex ⇒ Object
- #resource_name_of(doc) ⇒ Object
- #sync_db_with_sitemap ⇒ Object
- #title_of(doc) ⇒ Object
Instance Method Details
#add_item_to_cloudsearch(cloudsearch_doc, html_doc) ⇒ Object
33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/makasi/search_index.rb', line 33
#
# Sends one page to the search index through +asari.add_item+, using the
# record's URL as the item id.
#
# String fields are cut to at most MAX_TEXT_SIZE characters
# (MAX_LITERAL_SIZE for +content_language+) before upload. +url+ is sent
# untruncated and +keywords+ is sent as an array of stripped strings.
#
# @param cloudsearch_doc [#url] record whose URL identifies the item
# @param html_doc parsed page the field values are extracted from
# @return whatever +asari.add_item+ returns
def add_item_to_cloudsearch(cloudsearch_doc, html_doc)
  # Exclusive ranges (`0...MAX`) keep every value at or below the named
  # maximum; an inclusive `0..MAX` lets one extra character through.
  asari.add_item(cloudsearch_doc.url, {
    url: cloudsearch_doc.url,
    title: title_of(html_doc)[0...MAX_TEXT_SIZE],
    content: content_of(html_doc)[0...MAX_TEXT_SIZE],
    author: meta_tag_for(html_doc, "author")[0...MAX_TEXT_SIZE],
    content_language: language_of(html_doc)[0...MAX_LITERAL_SIZE],
    description: meta_tag_for(html_doc, "description")[0...MAX_TEXT_SIZE],
    # Comma-separated meta value becomes a list of trimmed keywords.
    keywords: meta_tag_for(html_doc, "keywords").split(",").map(&:strip),
    resource_type: meta_tag_for(html_doc, "resource_type")[0...MAX_TEXT_SIZE],
    resource_name: resource_name_of(html_doc)[0...MAX_TEXT_SIZE],
    resource_id: meta_tag_for(html_doc, "resource_id")[0...MAX_TEXT_SIZE]
  })
end
#asari ⇒ Object
89 90 91 |
# File 'lib/makasi/search_index.rb', line 89
#
# Returns the Makasi::AsariClient used by this object, building it on the
# first call and reusing the same instance afterwards.
def asari
  @asari = Makasi::AsariClient.new unless @asari
  @asari
end
#content_of(doc) ⇒ Object
108 109 110 111 112 113 114 115 |
# File 'lib/makasi/search_index.rb', line 108
#
# Extracts the searchable text of a page.
#
# Text is taken from the nodes matching Makasi::Config.content_selector;
# when nothing matches, the whole document is used instead.
#
# @param doc parsed page responding to +css+
# @return the result of +extract_text+ for the chosen nodes
def content_of(doc)
  selected = doc.css(Makasi::Config.content_selector)
  text_sources = selected.present? ? selected : [doc]
  extract_text(text_sources)
end
#extract_text(nodes) ⇒ Object
122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
# File 'lib/makasi/search_index.rb', line 122
#
# Flattens a list of nodes into one plain-text string.
#
# Each node is traversed; text nodes contribute their text and +img+
# nodes contribute their +alt+ attribute. A space is appended after every
# visited node, then whitespace runs are collapsed to single spaces, the
# result is stripped, and HTML entities are decoded.
#
# @param nodes enumerable of nodes responding to +traverse+
# @return the decoded, whitespace-normalised text
def extract_text(nodes)
  buffer = StringIO.new
  nodes.each do |root|
    root.traverse do |descendant|
      fragment =
        if descendant.text?
          descendant.text
        elsif descendant.name == "img"
          descendant["alt"]
        end
      # A missing fragment (other node kinds, or an img without alt)
      # writes nothing, but the separating space is always added.
      buffer << fragment.to_s << " "
    end
  end
  normalised = buffer.string.gsub(/\s+/, " ").strip
  HTMLEntities.new.decode(normalised)
end
#language_of(doc) ⇒ Object
117 118 119 120 |
# File 'lib/makasi/search_index.rb', line 117
#
# Reads the +lang+ attribute of the page's first +html+ element.
#
# @param doc parsed page responding to +xpath+
# @return [String] the attribute value, or "" when there is no +html+
#   element or no +lang+ attribute
def language_of(doc)
  html_nodes = doc.xpath("//html")
  return "" unless html_nodes.present?

  html_nodes[0]["lang"].to_s
end
#load_page(url, limit = 10) ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/makasi/search_index.rb', line 65
#
# Downloads the body of +url+ with an HTTP GET, following redirects.
#
# @param url [String] absolute URL of the page; never modified in place
# @param limit [Integer] how many more requests (the initial one plus
#   redirects) may be made before giving up
# @return [String] the response body, or "" when the redirect budget is
#   used up, the server answers with a non-success status, or a redirect
#   carries no Location header (an error is logged in those cases)
# @raise [URI::InvalidURIError] when +url+ cannot be parsed
def load_page(url, limit = 10)
  if limit == 0
    Rails.logger.error "ERROR: Faild load sitemap's url #{url}"
    return ""
  end

  ## Patch for indexing from localhost
  if Rails.env.development?
    url += "/" unless url.end_with?("/")
    # Non-destructive gsub: the argument may be a frozen string or a model
    # attribute owned by the caller, so it must not be rewritten in place.
    url = url.gsub(Makasi::Config.website_url, "http://localhost:3000/")
  end

  parsed_url = URI.parse(url)
  request = Net::HTTP::Get.new(parsed_url.request_uri)
  # Without use_ssl an https URL would be spoken to in plain HTTP.
  response = Net::HTTP.start(parsed_url.host, parsed_url.port,
                             use_ssl: parsed_url.scheme == "https") do |http|
    http.request(request)
  end

  case response
  when Net::HTTPSuccess
    return response.body
  when Net::HTTPRedirection
    location = response["location"]
    # Location may be relative, so resolve it against the requested URL.
    return load_page(URI.join(url, location).to_s, limit - 1) if location
  end

  Rails.logger.error "Makasi::SearchIndex ERROR: Faild load sitemap's url #{url}"
  ""
end
#meta_tag_for(doc, name) ⇒ Object
98 99 100 101 |
# File 'lib/makasi/search_index.rb', line 98
#
# Returns the +content+ of the first +<meta name="...">+ tag with the
# given name, stripped of surrounding whitespace and with HTML entities
# decoded.
#
# @param doc parsed page responding to +css+
# @param name [String] value of the meta tag's +name+ attribute
# @return [String] the decoded content, or "" when no such tag exists
def meta_tag_for(doc, name)
  tag = doc.css("meta[name='#{name}']").first
  return "" unless tag

  # to_s covers a tag that has no content attribute at all.
  HTMLEntities.new.decode(tag["content"].to_s.strip)
end
#read_sitemap ⇒ Object
93 94 95 96 |
# File 'lib/makasi/search_index.rb', line 93
#
# Opens the gzip-compressed sitemap at Makasi::Config.sitemap_url and
# returns its decompressed contents.
#
# Both the gzip reader and the underlying stream are closed before
# returning, including when decompression raises.
#
# NOTE(review): Kernel#open only fetches http(s) URLs when open-uri is
# loaded, and on Ruby 3.0+ it no longer does so at all (URI.open is
# needed) — confirm how sitemap_url is resolved in this project.
#
# @return [String] the decompressed sitemap
def read_sitemap
  open(Makasi::Config.sitemap_url) do |sitemap_file|
    Zlib::GzipReader.wrap(sitemap_file, &:read)
  end
end
#reindex ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/makasi/search_index.rb', line 6
#
# Rebuilds the search index.
#
# First reconciles the stored documents with the sitemap
# (+sync_db_with_sitemap+), then walks CloudSearchDocument records ordered
# by +reindexed_at+ descending. For each one it downloads and parses the
# page, pushes it to the index, and stamps the record with the current
# time.
#
# NOTE(review): load_page returns "" on failure, so a page that could not
# be fetched is still indexed (with empty fields) — confirm this is
# intended.
def reindex
  sync_db_with_sitemap

  CloudSearchDocument.desc(:reindexed_at).each do |cloudsearch_doc|
    html_content = load_page(cloudsearch_doc.url)
    html_doc = Nokogiri::HTML(html_content)

    # The summary is only assembled when debug logging is on, because it
    # re-runs every extractor.
    if Rails.logger.debug?
      Rails.logger.debug(
        ">>> URL: " + cloudsearch_doc.url +
        "\n\tTITLE: " + title_of(html_doc) +
        "\n\tCONTENT: " + content_of(html_doc)[0..300] +
        "\n\tAUTHOR: " + meta_tag_for(html_doc, "author") +
        "\n\tCONTENT_LANGUAGE: " + language_of(html_doc) +
        "\n\tDESCRIPTION: " + meta_tag_for(html_doc, "description")[0..300] +
        "\n\tKEYWORDS: " + meta_tag_for(html_doc, "keywords") +
        "\n\tRESOURCE_TYPE: " + meta_tag_for(html_doc, "resource_type") +
        "\n\tRESOURCE_NAME: " + resource_name_of(html_doc) +
        "\n\tRESOURCE_ID: " + meta_tag_for(html_doc, "resource_id") +
        "\n"
      )
    end

    add_item_to_cloudsearch(cloudsearch_doc, html_doc)
    cloudsearch_doc.update_attributes(reindexed_at: DateTime.now)
  end
end
#resource_name_of(doc) ⇒ Object
137 138 139 140 141 142 143 144 |
# File 'lib/makasi/search_index.rb', line 137
#
# Determines the resource name of a page.
#
# The texts of all nodes matching Makasi::Config.resource_name_selector
# are joined with single spaces and entity-decoded; when nothing matches,
# the page title is used instead.
#
# @param doc parsed page responding to +css+
# @return the decoded resource name, or the result of +title_of+
def resource_name_of(doc)
  name_nodes = doc.css(Makasi::Config.resource_name_selector)
  return title_of(doc) unless name_nodes.present?

  joined_text = name_nodes.map(&:text).join(" ")
  HTMLEntities.new.decode(joined_text)
end
#sync_db_with_sitemap ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/makasi/search_index.rb', line 48
#
# Brings the CloudSearchDocument records in line with the sitemap.
#
# Every record is first flagged as absent from the sitemap. Each
# +url loc+ entry of the sitemap then finds or initialises its record and
# flags it as present. Records still flagged absent are destroyed at the
# end. When debug logging is on, the present and absent counts are logged
# before the removal.
def sync_db_with_sitemap
  # Reset the flag everywhere so only URLs seen below end up marked.
  CloudSearchDocument.update_all(present_in_sitemap: false)

  sitemap_xml = Nokogiri::XML(read_sitemap)
  sitemap_xml.css('url loc').each do |loc_node|
    page_url = loc_node.text.strip
    record = CloudSearchDocument.find_or_initialize_by(url: page_url)
    record.update_attributes(present_in_sitemap: true)
  end

  if Rails.logger.debug?
    listed_count = CloudSearchDocument.where(present_in_sitemap: true).count
    Rails.logger.debug "SEARCH_INDEX: Updated #{listed_count} documents"
    stale_count = CloudSearchDocument.where(present_in_sitemap: false).count
    Rails.logger.debug "SEARCH_INDEX: Removed #{stale_count} documents"
  end

  CloudSearchDocument.where(present_in_sitemap: false).destroy_all
end
#title_of(doc) ⇒ Object
103 104 105 106 |
# File 'lib/makasi/search_index.rb', line 103
#
# Returns the text of the page's first +title+ element with HTML entities
# decoded.
#
# @param doc parsed page responding to +xpath+
# @return the decoded title, or "" when the page has no +title+ element
def title_of(doc)
  title_nodes = doc.xpath("//title")
  return "" unless title_nodes.present?

  HTMLEntities.new.decode(title_nodes[0].text)
end