Class: Worldfootball::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/webget-football/worldfootball/page.rb,
lib/webget-football/worldfootball/page_report.rb,
lib/webget-football/worldfootball/page_schedule.rb

Direct Known Subclasses

Report, Schedule

Defined Under Namespace

Classes: Report, Schedule

Constant Summary collapse

GENERATED_RE =

Matches the timestamp comment embedded in a page, for example:

<!-- [generated 2020-06-30 22:30:19] -->
# Pattern for the timestamp comment found in the page markup, e.g.
#   <!-- [generated 2020-06-30 22:30:19] -->
# Named captures: date (digits-digits-digits) and time (digits:digits:digits).
# The x (extended) flag makes the regexp ignore whitespace in the pattern,
# which is why literal spaces are spelled as [ ]+ (one or more spaces).
%r{
  <!--
 [ ]+
 \[generated
     [ ]+
   (?<date>\d+-\d+-\d+)
     [ ]+
   (?<time>\d+:\d+:\d+)
 \]
 [ ]+
 -->
}x

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ Page

Returns a new instance of Page.



10
11
12
# File 'lib/webget-football/worldfootball/page.rb', line 10

# Keeps the raw page markup for later use; nothing is parsed here.
#
# @param html [String] page markup (from_file passes the file contents)
def initialize(html)
  @html = html
end

Class Method Details

.from_file(path) ⇒ Object



5
6
7
8
# File 'lib/webget-football/worldfootball/page.rb', line 5

# Builds an instance from an HTML file stored on disk.
#
# @param path [String] location of the saved page; the content is read as UTF-8
# @return [Object] whatever +new+ returns for the receiver this is called on
def self.from_file( path )
  new( File.read( path, encoding: 'utf-8' ) )
end

Instance Method Details

#assert(cond, msg) ⇒ Object



96
97
98
99
100
101
102
103
# File 'lib/webget-football/worldfootball/page.rb', line 96

# Aborts the program when a parsing expectation does not hold.
#
# @param cond [Object] any truthy value passes
# @param msg [String] detail appended to the failure line
# @return [nil] when cond is truthy; otherwise the process exits with status 1
def assert( cond, msg )
  return if cond

  puts "!!! assert failed (in parse page) - #{msg}"
  exit 1
end

#doc ⇒ Object



14
15
16
17
# File 'lib/webget-football/worldfootball/page.rb', line 14

# Parsed form of the page, built on first use and cached in @doc.
#
# note: the markup is parsed as a whole document and NOT as a fragment -
#   with a fragment there is no access to the page head (meta elements and such)
#
# @return [Object] the result of Nokogiri::HTML for @html
def doc
  return @doc if @doc

  @doc = Nokogiri::HTML( @html )
end

#generated ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
# File 'lib/webget-football/worldfootball/page.rb', line 64

# Timestamp taken from the "[generated ...]" comment in the page markup.
#
# The lookup runs once per page: a hit is cached as a DateTime and a miss is
# cached as false, so a page without the comment is scanned (and warned
# about) only on the first call instead of on every call.
#
# @return [DateTime, nil] parsed timestamp, or nil when the page has none
def generated
  if @generated.nil?
    m = GENERATED_RE.match( @html )
    @generated =
      if m
        DateTime.strptime( "#{m[:date]} #{m[:time]}", '%Y-%m-%d %H:%M:%S' )
      else
        puts "!! WARN - no generated timestamp found in page"
        false   ## remember the miss; callers still get nil (see below)
      end
  end

  @generated || nil
end

#generated_in_days_ago ⇒ Object

convenience helper / formatter



77
78
79
80
81
82
83
84
# File 'lib/webget-football/worldfootball/page.rb', line 77

# Convenience formatter: age of the page in whole days, based on #generated.
#
# @return [String] e.g. "3d" (today's julian day minus the timestamp's),
#   or "?" when no timestamp is available
def generated_in_days_ago
  stamp = generated
  return '?' unless stamp

  "#{Date.today.jd - stamp.jd}d"
end

#keywords ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/webget-football/worldfootball/page.rb', line 25

# Content of the page's keywords meta element, e.g.
#   <meta name="keywords"
#     content="Bundesliga, 2010/2011, Spielplan, KSV Superfund, SC Magna Wiener Neustadt, ..." />
#
# The first matching element is cached in @keywords; its content attribute
# is looked up on every call.
# NOTE(review): there is no nil check - a page without the element
#   presumably fails on the attribute lookup; confirm that is intended.
#
# @return [Object] value of the element's content attribute
def keywords
  selector = 'meta[name="keywords"]'
  @keywords = doc.css( selector ).first unless @keywords
  @keywords[:content]
end

#squish(str) ⇒ Object

helper methods



89
90
91
92
93
94
# File 'lib/webget-football/worldfootball/page.rb', line 89

# Normalizes whitespace in a text snippet.
#
# No-break spaces (U+00A0) become plain spaces, every run of spaces, tabs,
# carriage returns and newlines is folded into a single space, and only then
# is the result stripped. Stripping last matters: a leading or trailing
# no-break space would otherwise survive as a leading or trailing space.
#
# @param str [String] text to clean up (not modified)
# @return [String] new string with folded and trimmed whitespace
def squish( str )
  str.tr( "\u{00A0}", ' ' )          # Unicode Character 'NO-BREAK SPACE' (U+00A0)
     .gsub( /[ \t\r\n]+/, ' ' )      ## fold whitespace to one max.
     .strip
end

#title ⇒ Object



19
20
21
22
23
# File 'lib/webget-football/worldfootball/page.rb', line 19

# Text of the page's title element, e.g.
#   <title>Bundesliga 2010/2011 &raquo; Spielplan</title>
#
# The first title element is cached in @title; its text is read on every call.
# NOTE(review): there is no nil check - a page without a title element
#   presumably fails on the text lookup; confirm that is intended.
#
# @return [Object] the element's text content
def title
  @title = doc.css( 'title' ).first unless @title
  @title.text
end

#url ⇒ Object

<meta property="og:url"

content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />


41
42
43
44
# File 'lib/webget-football/worldfootball/page.rb', line 41

# Content of the page's og:url meta element, e.g.
#   <meta property="og:url"
#     content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />
#
# The first matching element is cached in @url; its content attribute is
# looked up on every call.
# NOTE(review): there is no nil check - a page without the element
#   presumably fails on the attribute lookup; confirm that is intended.
#
# @return [Object] value of the element's content attribute
def url
  selector = 'meta[property="og:url"]'
  @url = doc.css( selector ).first unless @url
  @url[:content]
end