Class: Chomchom::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/chomchom/extractor.rb

Constant Summary collapse

WPM =

average reading speed

250
MAX_MONOS =

parameters for max number of topics to retrieve

5
MAX_MULTIS =
3
MONTHS_RE =

match and select publish date. Strategy:

  1. scan for the most used patterns

  2. take the one at the very top (usually the one near the title) - this fails for pages that display today’s date

  3. parse to date object (ruby amazingly handles all the different formats)

Note: this won’t work for pages that use JavaScript to write the date. agent.page.response doesn’t work either, because most pages are now dynamically generated.

"(?:#{(Date::MONTHNAMES + Date::ABBR_MONTHNAMES).compact.join("|")})"

Instance Method Summary collapse

Constructor Details

#initialize(html_txt) ⇒ Extractor

TODO: the current ruby-readability doesn’t pull next pages’ text



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/chomchom/extractor.rb', line 17

# Builds the extractor state from a raw HTML string:
#   @fulltext - Readability article text, tags stripped, entities decoded
#   @title    - text of the <title> element, entities decoded
#   @body     - <body> markup without scripts, styles, frames and comments
#   @body_dom - @body parsed with Nokogiri::XML
def initialize(html_txt)
  # Drop invalid UTF-8 byte sequences by converting UTF-8 to UTF-8 with
  # //IGNORE. A space is appended before converting and cut off afterwards,
  # following http://po-ru.com/diary/fixing-invalid-utf-8-in-ruby-revisited/
  converter = Iconv.new('UTF-8//IGNORE', 'UTF-8')
  html = converter.iconv(html_txt + ' ')[0..-2]

  # Article text via Readability; any StandardError leaves it empty.
  article = begin
    Readability::Document.new(html).content
  rescue
    ''
  end

  # Collapse whitespace, turn each closing tag into a newline, strip the
  # remaining tags, then decode HTML entities.
  article = article.gsub(/\s+/," ")
  article = article.gsub(/<\/.*?>/, "\n")
  article = article.gsub(/<.*?>/,'')
  @fulltext = HTMLEntities.new.decode(article)

  # Parse the first <title>...</title> occurrence and normalise its text.
  title_doc = Nokogiri::XML(html.scan(/<title.*>(?:\n|.)*?<\/title>/i)[0])
  raw_title =
    if title_doc
      title_doc.inner_text.gsub(/^\s+/,'').gsub(/\s+$/,'').gsub(/\n+/,' ')
    else
      ''
    end
  @title = HTMLEntities.new.decode(raw_title)

  # Greedy match up to the last </body> so embedded frames are covered.
  body_match = html.match(/<body.*?>(?:\n|.)*<\/body>/i)
  markup = body_match ? body_match[0] : '<body></body>'

  # Remove scripts, styles, frames and iframes (all non-greedy).
  [
    /<script.*?>(?:\n|.)*?<\/script>/i,
    /<style.*?>(?:\n|.)*?<\/style>/i,
    /<frame.*?>(?:\n|.)*?<\/frame>/i,
    /<iframe.*?>(?:\n|.)*?<\/iframe>/i
  ].each { |noise| markup = markup.gsub(noise,'') }

  # Remove HTML comments, then normalise whitespace.
  markup = markup.gsub(/<!\-\-(?:\n|.)*?\-\->/,'')
  @body = markup.gsub(/\s+/,' ').gsub(/\n+/,"\n")

  @body_dom = Nokogiri::XML(@body)
end

Instance Method Details

#authorObject



108
109
110
111
112
# File 'lib/chomchom/extractor.rb', line 108

# Returns the text of the first author-like node in the body, with leading
# whitespace removed, or '' when there is none.
#
# Nodes are selected with the custom `regex` XPath function supplied by
# Chomchom::RegexPath (presumably it tests the pattern against the id, class
# and href attributes -- confirm in RegexPath).
def author
  nodes = @body_dom.xpath(".//*[regex(.,'.*author|byline|auth.*','id|class|href')]", Chomchom::RegexPath.new)
  first_hit = nodes.map { |node| node.inner_text }.flatten.compact.first
  return '' unless first_hit

  first_hit.gsub(/^\s+/,'')
end

#consume_durationObject

Returns the reading time in minutes. TODO: factor in the duration of other embedded media.



120
121
122
# File 'lib/chomchom/extractor.rb', line 120

# Estimated time needed to read the extracted article text, in whole minutes.
#
# Any tags still present in @fulltext are stripped, the words are counted and
# the count is divided by WPM (words per minute), rounding up so that any
# non-empty text takes at least one minute. Embedded media is not accounted
# for.
#
# Returns an Integer; 0 when @fulltext is nil or contains no words.
def consume_duration
  return 0 unless @fulltext

  # String#split without arguments splits on runs of whitespace and drops
  # empty fields, so consecutive spaces/newlines are not counted as words.
  word_count = @fulltext.gsub(/<.*?>/,'').split.size

  # Divide as floats: integer division would truncate before #ceil could
  # round up (e.g. 100 words would yield 0 minutes).
  (word_count.to_f / WPM).ceil
end

#fulltextObject



114
115
116
# File 'lib/chomchom/extractor.rb', line 114

# Returns @fulltext, which the constructor sets to the Readability-extracted
# article text with tags stripped and HTML entities decoded.
def fulltext
  @fulltext
end

#publish_dateObject



93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/chomchom/extractor.rb', line 93

# Guesses the publish date from the cleaned body markup.
#
# Scans @body for date-looking text in these shapes (case-insensitive):
#   * month name, day, optional year   (e.g. "March 5, 2011")
#   * day, month name, optional year   (e.g. "5th March 2011")
#   * d-m-yyyy, d.m.yyyy, d/m/yyyy
#   * yyyy-m-d, yyyy.m.d, yyyy/m/d
# Matches rejected by is_not_date (defined elsewhere in this class) are
# dropped, and the first remaining match is parsed with Date.parse.
#
# Returns a Date; falls back to Date.today when nothing was found or the
# first match cannot be parsed.
def publish_date
  found = @body.scan(/(?:(#{MONTHS_RE}[^\w]+\d{1,2}(?:th|st|nd|rd)?[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
                      (?:(\d{1,2}(?:th|st|nd|rd)?\s#{MONTHS_RE}[^\w]+(?:\d{4}|\d{2})?)[^\w]) |
                      (?:(\d{1,2}\-\d{1,2}\-\d{4})[^\w]) | (?:(\d{1,2}\.\d{1,2}\.\d{4})[^\w]) | (?:(\d{1,2}\/\d{1,2}\/\d{4})[^\w]) |
                      (?:(\d{4}\-\d{1,2}\-\d{1,2})[^\w]) | (?:(\d{4}\.\d{1,2}\.\d{1,2})[^\w]) | (?:(\d{4}\/\d{1,2}\/\d{1,2})[^\w])
                     /ix).flatten.compact

  plausible = found.reject { |text| is_not_date(text) }

  # Only the parse is guarded: an empty list (nil argument) or an
  # unparseable string both end up at today's date.
  begin
    Date.parse(plausible[0])
  rescue
    Date.today
  end
end

#readability_titleObject

readability getArticleTitle



52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/chomchom/extractor.rb', line 52

# Derives an article title from the page title, in the spirit of
# Readability's getArticleTitle:
#   * title contains "|" or "-": take the part before the last separator,
#     or the part after the first separator when that has fewer than 3 words
#   * title contains ": ": take the part after the last colon
#   * title longer than 150 or shorter than 15 characters: take the text of
#     the first <h1> in the body, if any
# When the chosen text has 4 words or fewer the full page title is used
# instead. Leading whitespace is removed from the result.
def readability_title
  candidate =
    if @title.match(/[\|\-]/)
      head = @title.scan(/(.*)[\|\-].*/).flatten[0]
      if head.split(' ').size < 3
        @title.scan(/[^\|\-]*[\|\-](.*)/).flatten[0]
      else
        head
      end
    elsif @title.index(': ')
      @title.scan(/.*:(.*)/).flatten[0]
    elsif @title.length > 150 || @title.length < 15
      headings = @body_dom.xpath(".//h1")
      (headings && headings.size > 0) ? headings[0].inner_text : ''
    else
      ''
    end

  candidate = @title unless candidate.split(' ').size > 4
  candidate.gsub(/^\s+/,'')
end

#titleObject

retrieving title strategy:

  1. get all the elements with class/id=“…title|head…” and h1-h2

  2. match them against the page title to get a bunch of candidates

  3. take the longest candidate, take original title if no candidate avail



71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/chomchom/extractor.rb', line 71

def title
  titles1 = @body_dom.xpath(".//*[regex(.,'.*title|head.*','id|class')]", Chomchom::RegexPath.new).map { |n| n.inner_text }
  titles2 = @body_dom.search('//h1','//h2').map {|n| n.inner_text }
  titles = (titles1 + titles2).flatten.compact
  candidates = titles.select { |t| @title.downcase.include?(t.downcase) }
  #select the longest candidate as title
  if candidates.size > 0
    title = ''
    candidates.each { |c| title = c if c.length > title.length }
    title.gsub(/\s+/,' ').gsub(/\n+/,'')
  else
    @title
  end
end