Class: Worldfootball::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/worldfootball/page.rb,
lib/worldfootball/page_team.rb,
lib/worldfootball/page_report.rb,
lib/worldfootball/page_schedule.rb

Direct Known Subclasses

Report, Schedule, Team

Defined Under Namespace

Classes: Report, Schedule, Team

Constant Summary collapse

GENERATED_RE =

<!-- [generated 2020-06-30 22:30:19] -->

<!-- [generated 2020-06-30 22:30:19] -->
%r{
  <!--
 [ ]+
 \[generated
     [ ]+
   (?<date>\d+-\d+-\d+)
     [ ]+
   (?<time>\d+:\d+:\d+)
 \]
 [ ]+
 -->
}x

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ Page

Returns a new instance of Page.



10
11
12
13
14
# File 'lib/worldfootball/page.rb', line 10

# Wraps the markup of one page.
#
# @param html [String] page source; kept in @html after unicode normalization
def initialize( html )
  ## todo/fix - fix upstream in wget!!!! why? why not?
  ##    :nfc is ruby's default normalization form - spelled out here
  nfc_html = html.unicode_normalize( :nfc )
  @html    = nfc_html
end

Class Method Details

.from_file(path) ⇒ Object



5
6
7
8
# File 'lib/worldfootball/page.rb', line 5

# Reads the file at path as utf-8 text and passes its content on to new.
#
# @param path [String] location of the saved page
# @return [Object] whatever new( html ) returns
def self.from_file( path )
  new( File.read( path, mode: 'r:utf-8' ) )
end

Instance Method Details

#assert(cond, msg) ⇒ Object



102
103
104
105
106
107
108
109
# File 'lib/worldfootball/page.rb', line 102

# Stops the whole process when an expectation made while parsing does not hold.
#
# @param cond [Object] checked for truthiness
# @param msg [String] detail appended to the printed failure line
# @return [nil] when cond is truthy; otherwise prints the failure and exits with status 1
def assert( cond, msg )
  return if cond

  puts "!!! assert failed (in parse page) - #{msg}"
  exit 1
end

#debug? ⇒ Boolean

helper methods

Returns:

  • (Boolean)


92
# File 'lib/worldfootball/page.rb', line 92

# Shortcut that forwards to the module-level Worldfootball.debug? switch.
#
# @return [Object] whatever Worldfootball.debug? answers
def debug?
  Worldfootball.debug?
end

#doc ⇒ Object



16
17
18
19
# File 'lib/worldfootball/page.rb', line 16

# Parsed form of @html - built on the first call and reused afterwards.
#
# @return [Object] result of Nokogiri::HTML( @html )
def doc
  return @doc if @doc

  ## note: parse a full document and NOT a fragment -
  ##   a fragment gives no access to the page head (and meta elements and such)
  @doc = Nokogiri::HTML( @html )
end

#generated ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
# File 'lib/worldfootball/page.rb', line 66

# Timestamp taken from the "[generated <date> <time>]" marker that
# GENERATED_RE looks for in @html.
#
# The lookup runs once only - a miss is remembered too (a plain ||= would
# NOT keep the nil), so the warning gets printed a single time and
# not again on every call.
#
# @return [DateTime, nil] parsed date and time, or nil when no marker is found
def generated
  return @generated if defined?( @generated )

  m = GENERATED_RE.match( @html )
  @generated = if m
                 DateTime.strptime( "#{m[:date]} #{m[:time]}", '%Y-%m-%d %H:%M:%S' )
               else
                 puts "!! WARN - no generated timestamp found in page"
                 nil
               end
end

#generated_in_days_ago ⇒ Object

convenience helper / formatter



79
80
81
82
83
84
85
86
# File 'lib/worldfootball/page.rb', line 79

# convenience helper / formatter
#
# @return [String] julian-day difference between today and generated,
#   followed by "d" - or "?" when generated is nil
def generated_in_days_ago
  return '?' unless generated

  days_since = Date.today.jd - generated.jd
  days_since.to_s + 'd'
end

#keywords ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/worldfootball/page.rb', line 27

# Content attribute of the page's keywords meta element, e.g.
#
#   <meta name="keywords"
#     content="Bundesliga, 2010/2011, Spielplan, KSV Superfund, SC Magna Wiener Neustadt, SV Ried, FC Wacker Innsbruck, Austria Wien, Sturm Graz, SV Mattersburg, LASK Linz, Rapid Wien, RB Salzburg" />
#
# The first matching element is kept in @keywords once found.
#
# @return [Object, nil] value of the content attribute, or nil when the
#   page has no keywords meta element (used to fail with NoMethodError on nil)
def keywords
  ## or      doc.xpath( '//meta[@name="keywords"]' ).first
  @keywords ||= doc.css( 'meta[name="keywords"]' ).first
  @keywords ? @keywords[:content] : nil  ## get content attribute
end

#log(msg) ⇒ Object

append to log



112
113
114
115
116
117
# File 'lib/worldfootball/page.rb', line 112

### append to log
#
# Adds msg followed by a newline to the end of ./logs.txt
# (the path is relative to the current working directory).
#
# @param msg [String] text of the log line
def log( msg )
  File.open( './logs.txt', mode: 'a:utf-8' ) do |logfile|
    logfile.write( msg )
    logfile.write( "\n" )
  end
end

#squish(str) ⇒ Object



95
96
97
98
99
100
# File 'lib/worldfootball/page.rb', line 95

# Tidies up whitespace: no-break spaces turn into plain spaces, every run of
# spaces, tabs and newlines folds into one space and both ends get trimmed.
#
# note: the no-break spaces get replaced BEFORE trimming - String#strip
#   leaves U+00A0 alone, so trimming first let a leading / trailing
#   no-break space survive as a blank in the result.
#
# @param str [String] text to clean up (not modified)
# @return [String] cleaned-up copy
def squish( str )
  str = str.gsub( "\u{00A0}", ' ' )  # Unicode Character 'NO-BREAK SPACE' (U+00A0)
  str = str.gsub( /[ \t\n]+/, ' ' )  ## fold whitespace to one max.
  str.strip
end

#title ⇒ Object



21
22
23
24
25
# File 'lib/worldfootball/page.rb', line 21

# Text of the page's title element, e.g.
#
#   <title>Bundesliga 2010/2011 &raquo; Spielplan</title>
#
# The first matching element is kept in @title once found.
#
# @return [Object, nil] the element's text content, or nil when the page
#   has no title element (used to fail with NoMethodError on nil)
def title
  @title ||= doc.css( 'title' ).first
  @title ? @title.text : nil  ## get element's text content
end

#url ⇒ Object

<meta property="og:url"

content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />


43
44
45
46
# File 'lib/worldfootball/page.rb', line 43

# Content attribute of the page's og:url meta element, e.g.
#
#   <meta property="og:url"
#     content="//www.weltfussball.de/alle_spiele/aut-bundesliga-2010-2011/" />
#
# The first matching element is kept in @url once found.
#
# @return [Object, nil] value of the content attribute, or nil when the
#   page has no og:url meta element (used to fail with NoMethodError on nil)
def url
  @url ||= doc.css( 'meta[property="og:url"]' ).first
  @url ? @url[:content] : nil
end