Class: Factbook::Page

Inherits:
Object
  • Object
show all
Includes:
LogUtils::Logging
Defined in:
lib/factbook/page.rb

Constant Summary collapse

# URL template for a country page on the CIA World Factbook site.
# NOTE(review): {code} looks like a placeholder for the factbook country code
#   (e.g. au, gm) - the substitution itself is not visible here; confirm in fetch.
SITE_BASE =
'https://www.cia.gov/library/publications/the-world-factbook/geos/countrytemplate_{code}.html'

Instance Method Summary collapse

Constructor Details

#initialize(code, opts = {}) ⇒ Page

Returns a new instance of Page.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/factbook/page.rb', line 16

# Sets up a page for one country; only stores its arguments and resets the caches.
#
# code - factbook country code, e.g. 'au' for austria, 'gm' for germany
# opts - options hash (kept as is in @opts)
#
# todo: find a better name for the option that keeps field names as is
#   (full|long|keep|std|??) - maybe rename to a format option,
#   e.g. :format => 'long' ?? why? why not?
def initialize( code, opts={} )
  @code, @opts = code, opts

  ## caches - all start out empty (nil)
  @html = @doc = @sects = @data = nil
end

Instance Method Details

#[](key) ⇒ Object

convenience shortcut



45
46
47
48
49
50
51
52
53
54
# File 'lib/factbook/page.rb', line 45

# Convenience shortcut: hash-style access to an entry of the page data,
# so that
#   page['geo']
# reads the same value as
#   page.data['geo']
#
# key - key to look up in the hash returned by data
#
# Returns whatever data[key] returns.
def [](key)
  ## fix: delegate [] to data via the forwardable lib instead - why?? why not??
  data[key]
end

#data ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/factbook/page.rb', line 57

# Builds (once) and returns the page data as a hash keyed by section title.
#
# When @opts[:header] is set, a leading header entry holding the country
# code, the generator version and the build time is added first; its key
# names follow @opts[:fields] ('Header'/'last built' vs 'header'/'last_built').
# After that every section from sects is stored under its title.
#
# note: the hash gets memoized in @data only once it is complete -
#   if sects (or a section) raises, nothing half-built is cached
#   and the next call starts over.
#
# Returns the Hash of page data.
def data
  return @data unless @data.nil?

  page_data = {}

  if @opts[:header]   ## include (leading) header section ??
    header_key     = @opts[:fields] ? 'Header'     : 'header'
    last_built_key = @opts[:fields] ? 'last built' : 'last_built'

    page_data[header_key] = {
      'code'         => @code,
      'generator'    => "factbook/#{VERSION}",
      last_built_key => Time.now.to_s
    }
  end

  sects.each_with_index do |sect,i|
    logger.debug "############################"
    logger.debug "###  [#{i}] stats sect >#{sect.title}<: "

    page_data[ sect.title ] = sect.data
  end

  ## cache only now - everything above ran without raising
  @data = page_data
end

#doc ⇒ Object



31
32
33
# File 'lib/factbook/page.rb', line 31

# Returns the page html parsed with Nokogiri::HTML.
# The parsed document is kept in @doc and reused on later calls.
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML( html )
end

#html ⇒ Object



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
# File 'lib/factbook/page.rb', line 152

# Returns the cleaned-up html source of the page (memoized in @html).
#
# On first use the raw source is read via fetch() and stripped down, in order:
#   1. inline <script> and <style> blocks and <link> tags get removed
#   2. everything before <div id="countryInfo" ...> gets cut off (if present)
#   3. "country comparison to the world" rank spans get removed
#   4. style="..." attributes get removed
#   5. table rows holding nothing but one empty cell get removed
#   6. the "(For more information ...)" promo span gets removed
#
# note: every removed snippet is reported via logger.debug - nothing gets
#   printed to stdout (keeps stdout clean for real output such as json)
# note: @html is assigned only after the whole clean-up ran through, thus,
#   an error along the way never leaves half-cleaned html cached
# note: if @html is already set (e.g. via html=) it is returned as is
#
# todo: remove everything starting w/ footer; remove head;
#         in body remove header n footer
#
# Returns the cleaned html String.
def html
  return @html unless @html.nil?

  ## helper: remove all matches of regex from text; log what gets removed
  strip = lambda do |text, regex, label|
    text.gsub( regex ) do |m|
      logger.debug "remove #{label}:"
      logger.debug m.to_s
      ''
    end
  end

  page = fetch()

  ## remove inline script
  page = strip.call( page, /<script[^>]*>.*?<\/script>/m, 'script' )

  ## remove inline style
  page = strip.call( page, /<style[^>]*>.*?<\/style>/m, 'style' )

  ## remove link
  link_regex = /<link[^>]+>/
  page = strip.call( page, link_regex, 'link' )

  ## remove everything before <div id="countryInfo" >
  ##   note: allow (any) attributes after the id, e.g.
  ##     <div id="countryInfo" style="display: none;">
  ##   style attributes get removed only further down
  div_country_info_regex = /<div id="countryInfo"[^>]*>/
  pos = page.index( div_country_info_regex )
  page = page[pos..-1]  if pos  # not nil, false

  ## remove country comparison
  ## e.g.  <span class="category" >country comparison to the world:</span>
  ##       <span class="category_data">
  ##  <a href="../rankorder/2147rank.html?countryname=Brazil&countrycode=br&regionCode=soa&rank=5#br" onMouseDown=""  title="Country comparison to the world" alt="Country comparison to the world">
  ##    5
  ##  </a>
  ##  </span>
  ##
  ## or all on one line w/ a style attribute, e.g.
  ##   <span class="category" style="padding-left:7px;">country comparison to the world:</span> <span class="category_data">
  ##     <a href="..." ...> 5 </a> </span>
  country_comparison_regex = /
   <span \s class="category"[^>]*>
     country \s comparison \s to \s the \s world:
   <\/span>
    \s*
   <span \s class="category_data"[^>]*>
    \s*
      <a \s [^>]+>
       .+?
      <\/a>
    \s*
   <\/span>
  /xm
  page = strip.call( page, country_comparison_regex, 'country comparison' )

  style_attr_regex = /\s*style="[^"]+"/
  page = strip.call( page, style_attr_regex, 'style attr' )

  ## remove empty table rows, e.g.
  ## <tr height="22">
  ##   <td class="category_data"></td>
  ##   </tr>
  tr_empty_regex = /
     <tr[^>]*>
       \s*
        <td[^>]*> \s* <\/td>
       \s*
     <\/tr>
  /xm
  page = strip.call( page, tr_empty_regex, 'tr empty' )

  ##  remove world leader website promo
  ##  <span class="category">(For more information visit the
  ##     <a href="/library/publications/world-leaders-1/index.html" target="_blank">World Leaders website</a>&nbsp;
  ##       <img src="../graphics/soa_newwindow.gif" alt="Opens in New Window" title="Opens in New Window" border="0"/>)
  ##  </span>
  world_leaders_website_regex = /
   <span \s class="category"[^>]*>
     \(
     For \s more \s information \s
      .+?       ## non-greedy (smallest possible match
     \)
   <\/span>
  /xm
  page = strip.call( page, world_leaders_website_regex, 'world leader website promo' )

  ## cache only now - everything above ran without raising
  @html = page
end

#html=(html) ⇒ Object



146
147
148
149
150
# File 'lib/factbook/page.rb', line 146

# Sets the html source by hand (for debugging n testing) -
# stores the given html in @html, thus, no need to fetch it via the net.
#
# html - html source to use for this page
def html=(html)
  @html = html
end

#sects ⇒ Object



84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# File 'lib/factbook/page.rb', line 84

# Splits the page html into its sections and returns them as an array of
# Sect objects (memoized in @sects).
#
# Why split on the raw html string? It lets us avoid errors
# w/ (wrongly) nested tags.
#
# Every known section is located by the start of its collapsible panel div;
# a section runs from its own start up to one char before the start of the
# next section found (the last one runs to the end of the html).
# Section titles are the long names if @opts[:fields] is set
# and the short names otherwise.
#
# Returns an Array of Sect.
def sects
  return @sects unless @sects.nil?

  ## known sections: [long title, short title, start marker]
  panels = [
    [ 'Introduction',         'intro',    '<div id="CollapsiblePanel1_Intro"'   ],
    [ 'Geography',            'geo',      '<div id="CollapsiblePanel1_Geo"'     ],
    [ 'People and Society',   'people',   '<div id="CollapsiblePanel1_People"'  ],
    [ 'Government',           'govt',     '<div id="CollapsiblePanel1_Govt"'    ],
    [ 'Economy',              'econ',     '<div id="CollapsiblePanel1_Econ"'    ],
    [ 'Energy',               'energy',   '<div id="CollapsiblePanel1_Energy"'  ],
    [ 'Communications',       'comm',     '<div id="CollapsiblePanel1_Comm"'    ],
    [ 'Transportation',       'trans',    '<div id="CollapsiblePanel1_Trans"'   ],
    [ 'Military',             'military', '<div id="CollapsiblePanel1_Military"'],
    [ 'Transnational Issues', 'issues',   '<div id="CollapsiblePanel1_Issues"'  ]
  ]

  ## pass 1: find the start position of every section
  ##  note: missing sections get skipped (w/ warning),
  ##   e.g. Vatican (Holy See), Liechtenstein etc. have no Energy section
  found = []
  panels.each_with_index do |(long_title, short_title, marker), idx|
    offset = html.index( marker )
    if offset
      logger.debug "  found section #{idx} @ #{offset}"
      found << [ (@opts[:fields] ? long_title : short_title), offset ]
    else
      logger.warn "***!!! section not found -- #{marker} --; skipping"
    end
  end

  ## pass 2: cut the html from one section start to the next
  @sects = []
  found.each_with_index do |(title, from), idx|
    following = found[idx+1]

    ## last entry? if yes, use -1 (end-of-string)
    ##   otherwise stop one char before the next section starts
    to = following ? following[1]-1 : -1

    ## todo: check that from is smaller than to
    logger.debug "   cut section #{idx} [#{from}..#{to}]"
    @sects << Sect.new( title, html[ from..to ], @opts )
  end

  @sects
end

#to_json(opts = {}) ⇒ Object



35
36
37
38
39
40
41
42
# File 'lib/factbook/page.rb', line 35

# Convenience helper for data.to_json
#
# opts - options hash; if :pretty or :pp is set,
#        the json gets generated via JSON.pretty_generate
#
# Returns the page data as a json String.
def to_json( opts={} )
  pretty = opts[:pretty] || opts[:pp]
  return JSON.pretty_generate( data )  if pretty

  data.to_json
end