Class: Factbook::Sanitizer

Inherits:
Object
  • Object
show all
Includes:
Utils, LogUtils::Logging
Defined in:
lib/factbook/sanitizer.rb

Constant Summary collapse

ARIA_ATTR_REGEX =

<span class="subfield-date" aria-label="Date of information: 2018">(2018)</span>

remove aria labels
/\s*
  aria-label=('|").+?\1     ## note: use non-greedy match e.g. .+?
/xim

Constants included from Utils

Utils::COUNTRY_CODE_REGEX, Utils::MONTH_EN_TO_S, Utils::PAGE_INFO_REGEX, Utils::PAGE_LAST_UPDATED_REGEX

Instance Method Summary collapse

Methods included from Utils

#data_to_csv, #encode_utf8, #find_country_code, #find_page_info, #find_page_last_updated, #values_to_csv

Instance Method Details

#find_country_profile(html) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'lib/factbook/sanitizer.rb', line 59

# Cuts the country profile out of a factbook page and rebuilds it as
# simplified html.
#
# Only the content of the first <ul class="expandcollapse"> element is used
# (header, footer, scripts, etc. are dropped). Its <li> children are read in
# pairs (section title li + section content li); the <div> children of a
# content li are read in pairs too (category name div + category data div).
# Progress and warnings are printed to stdout.
#
# Note: the process is terminated with exit status 1 if a section title li
# has no div[sectiontitle].
#
# @param html [String] html source of the page
# @return [String] <h2>/<h3> headings followed by the category_data divs
#   (aria-label attributes removed); the empty string if the page has no
#   ul.expandcollapse or still uses the old h2-based format
def find_country_profile( html )
  doc = Nokogiri::HTML( html )

  ####
  ## remove header (everything before)
  ##   <ul class="expandcollapse">
  ul = doc.css( 'ul.expandcollapse' )[0]
  if ul.nil?
    ## nothing to extract - treat like the old format case below (empty profile)
    puts "  !! WARN: no ul.expandcollapse found - returning empty profile"
    return ''
  end

  puts ul.to_html[0..100]

  ## note: special case cc uses h2 instead of div block
  ##  <h2 class="question cam_med" sectiontitle="Introduction" ccode="cc"
  ##         style="border-bottom: 2px solid white; cursor: pointer;">
  ##         Introduction ::  <span class="region">CURACAO </span>
  ##   </h2>
  ##   is old format !!!!
  ##   cc - CURACAO
  ##  http headers says - last-modified: Wed, 14 Nov 2018 14:09:28 GMT
  ##   page says - PAGE LAST UPDATED ON MARCH 14, 2018
  ##    wait for new version to be generated / pushed!!!

  ## check for old format if h2 are present
  h2s = ul.css( 'h2' )
  if h2s.size > 0
    puts "  !! WARN: found #{h2s.size} h2(s) - assume old format - sorry - must wait for update!!!"
    ## return empty html string - why? why not?
    return ''
  end

  ###
  ## sanitize
  ##   assume two <li>s are a section

  profile = String.new('')

  ## keep the li's only (skips text nodes and other elements in between)
  section_lis = ul.children.select { |el| el.name == 'li' }
  puts "  #{section_lis.size} li(s):"

  section_lis.each_slice(2) do |title_li, content_li|
    title_div = title_li.at( 'div[sectiontitle]' )
    if title_div.nil?
      puts "!! ERROR: no section title found in div:"
      puts title_li.to_html
      exit 1
    end

    profile << "<h2>#{title_div['sectiontitle']}</h2>\n"

    ## odd number of li's - last title has no content li to go with
    if content_li.nil?
      puts "!! WARN: no content li found for section:"
      puts title_li.to_html
      next
    end

    ## keep the div's only
    category_divs = content_li.children.select { |el| el.name == 'div' }
    puts " #{category_divs.size} div(s):"

    category_divs.each_slice(2) do |name_div, data_div|
      a = name_div.css( 'a' )[0]
      if a
        profile << "\n<h3>#{a.text}:</h3>\n"
      else
        puts "!! WARN: no anchor found:"
        puts name_div.to_html
      end

      ## odd number of div's - last category name has no data div to go with
      if data_div.nil?
        puts "!! WARN: no data div found for category:"
        puts name_div.to_html
        next
      end

      data_div.children.each do |catdiv|
        next unless catdiv.name == 'div'

        css_class = catdiv['class']
        if css_class && css_class.index( 'category_data' )
          ## skip attachments e.g. maps, pop pyramids, etc.
          next if css_class.index( 'attachment' )

          profile << catdiv.to_html
          profile << "\n"
        else
          puts "!! WARN: skipping div (W/O category_data class):"
          puts catdiv.to_html
        end
      end
    end
  end

  ## strip all aria-label attributes (and log every removal)
  profile.gsub( ARIA_ATTR_REGEX ) do |m|
    puts "remove aria-label attr:"
    puts m
    ''
  end
end

#sanitize(html_ascii) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/factbook/sanitizer.rb', line 9

# Sanitizes a factbook page.
#
# Returns an array with three entries:
#   1) html profile without headers, footers, scripts, etc.
#   2) page (meta) info e.g. country_name, country_code, last_updated, etc.
#   3) errors e.g. list of encoding errors (invalid byte sequence etc.);
#      always an empty array for now
#
## todo: add option for (html source) encoding - why?? why not??
def sanitize( html_ascii )
  info = PageInfo.new

  ## todo:
  ##   make page info optional? why? why not?
  ##   not always available (if page structure changes) - check
  ##   what page info is required??
  found = find_page_info( html_ascii )
  if found
    ## copy over all fields one-by-one
    %i[country_code
       country_name
       country_affiliation
       region_code
       region_name].each do |field|
      info.public_send( "#{field}=", found[field] )
    end
  else
    ## no page info found - fall back to the country code only
    ##   print/warn: no page info found
    info.country_code = find_country_code( html_ascii )
  end

  info.last_updated = find_page_last_updated( html_ascii )

  ## cut-off headers, footers, scripts, etc.
  profile = find_country_profile( html_ascii )

  ## todo/fix: assume windows 12xx encoding!!!! for factbook - try
  # html, errors = encode_utf8( html_profile_ascii )  ## change encoding to utf-8  (from binary/ascii8bit)

  # html = sanitize_profile( html )

  [profile, info, []]
end