Class: Factbook::Sanitizer
- Inherits:
-
Object
- Object
- Factbook::Sanitizer
- Includes:
- Utils, LogUtils::Logging
- Defined in:
- lib/factbook/sanitizer.rb
Constant Summary collapse
- ARIA_ATTR_REGEX =
Example: <span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>
Matches aria-label attributes so that they can be removed.
/\s* aria-label=('|").+?\1 ## note: use non-greedy match e.g. .+? /xim
Constants included from Utils
Utils::COUNTRY_CODE_REGEX, Utils::MONTH_EN_TO_S, Utils::PAGE_INFO_REGEX, Utils::PAGE_LAST_UPDATED_REGEX
Instance Method Summary collapse
Methods included from Utils
#data_to_csv, #encode_utf8, #find_country_code, #find_page_info, #find_page_last_updated, #values_to_csv
Instance Method Details
#find_country_profile(html) ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/factbook/sanitizer.rb', line 59

##
# Extracts the country profile from a factbook page and rebuilds it as
# simplified html: one <h2> per section, one <h3> per category, followed by
# the category's "category_data" divs (attachments are dropped). Finally all
# aria-label attributes matching ARIA_ATTR_REGEX get removed.
#
# @param html [String] html source of the page
# @return [String] the sanitized profile html; an empty string if the page
#   has no <ul class="expandcollapse"> or still uses the old (h2-based) format
#
# note: prints progress and warnings to stdout and calls exit 1
#   if a section header <li> has no div[sectiontitle]
def find_country_profile( html )
  ####
  ## remove header (everything before)
  ##   <ul class="expandcollapse">
  doc = Nokogiri::HTML( html )
  ul  = doc.at_css( 'ul.expandcollapse' )

  ## page structure changed (or not a profile page) - nothing to extract;
  ##   handled like the old format case below instead of failing on nil
  if ul.nil?
    puts " !! WARN: no ul.expandcollapse found - return empty profile"
    return ''
  end

  puts ul.to_html[0..100]

  ## note: special case cc uses h2 instead of div block
  ##   <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
  ##       style="border-bottom: 2px solid white; cursor: pointer;">
  ##     Introduction :: <span class="region">CURACAO </span>
  ##   </h2>
  ##   is old format !!!!
  ##   cc - CURACAO
  ##   http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
  ##   page says - PAGE LAST UPDATED ON MARCH 14, 2018
  ##   wait for new version to be generated / pushed!!!

  ## check for old format if h2 are present
  h2s = ul.css( 'h2' )
  unless h2s.empty?
    puts " !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
    ## return empty html string - why? why not?
    return ''
  end

  ###
  ## sanitize
  ##   remove link items
  ##   assume two <li>s are a section (1st: title, 2nd: content)

  ## note: String.new('') (not String.new) to keep the encoding of the literal
  html = String.new('')

  ## filter all li's
  lis = ul.children.select { |el| el.name == 'li' }
  puts " #{lis.size} li(s):"

  lis.each_slice(2) do |head_li, body_li|
    title_div = head_li.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts head_li.to_html
      exit 1
    end

    html << "<h2>#{title_div['sectiontitle']}</h2>\n"

    ## odd number of li's - last section has a title but no content
    if body_li.nil?
      puts "!! WARN: no content li found for section:"
      puts head_li.to_html
      next
    end

    ## filter all div's; assume two <div>s are a category (1st: title, 2nd: data)
    divs = body_li.children.select { |el| el.name == 'div' }
    puts " #{divs.size} div(s):"

    divs.each_slice(2) do |head_div, body_div|
      a = head_div.at_css( 'a' )
      if a
        html << "\n<h3>#{a.text}:</h3>\n"
      else
        puts "!! WARN: no anchor found:"
        puts head_div.to_html
      end

      ## odd number of div's - last category has a title but no data
      if body_div.nil?
        puts "!! WARN: no data div found for category:"
        puts head_div.to_html
        next
      end

      body_div.children.each do |catdiv|
        next unless catdiv.name == 'div'

        css_class = catdiv['class']
        if css_class && css_class.include?( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.include?( 'attachment' )

          html << catdiv.to_html
          html << "\n"
        else
          puts "!! WARN: skipping div (W/O category_data class):"
          puts catdiv.to_html
        end
      end
    end
  end

  ## remove aria-label attributes (logging every match)
  html.gsub( ARIA_ATTR_REGEX ) do |m|
    puts "remove aria-label attr:"
    puts m
    ''
  end
end
#sanitize(html_ascii) ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
# File 'lib/factbook/sanitizer.rb', line 9

##
# Sanitizes a raw factbook page.
#
# note: returns an array with three entries:
#   1) html profile without headers, footers, scripts, etc.
#        (as returned by find_country_profile)
#   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
#   3) errors e.g. list of encoding errors (invalid byte sequence etc.)
#        - always empty for now (the encode_utf8 step below is commented out)
def sanitize( html_ascii )
  ## todo: add option for (html source) encoding - why?? why not??

  meta = PageInfo.new

  ## todo:
  ##   make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  info = find_page_info( html_ascii )
  if info
    ## copy all fields one-by-one (same order as listed)
    %i[country_code
       country_name
       country_affiliation
       region_code
       region_name].each do |field|
      meta.public_send( "#{field}=", info[field] )
    end
  else
    ## print/warn: no page info found
    ##   fall back to the country code only
    meta.country_code = find_country_code( html_ascii )
  end

  meta.last_updated = find_page_last_updated( html_ascii )

  ## cut-off headers, footers, scripts, etc.
  profile = find_country_profile( html_ascii )

  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
  # html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8 (from binary/ascii8bit)
  # html = sanitize_profile( html )

  [profile, meta, []]
end