Class: WebMiner

Inherits:

Object

Object
WebMiner

show all

Defined in: lib/webminer.rb

Overview

Main class; currently just a placeholder that groups the class-level mining methods.

Defined Under Namespace

Classes: Util

Class Method Summary collapse

Class Method Details

.create_story(item, topic) ⇒ `Object`

# File 'lib/webminer.rb', line 15

def self.create_story(item, topic)
  params=CGI::parse(item.link)

  link = params["url"][0]
  puts Story.class
  if Story.where(:link => link).exists?
    return nil
  end

  story = Story.new
  story.title=item.title
  story.link = link

  if item.source != nil
    story.source = item.source
  else if story.link != nil
         
         matchdata = story.link.match'https?://(?:w{3}\.)?(.*?)/.*' #Since <source> tag is rarely set in Google News, we use domain name to identify source, e.g. www.nytimes.com, but still, only http(s) is going to work      
         story.source=matchdata[1]
       else 
         return nil
       end


  end #if

  story.topic=topic
  story.date_time=item.date.rfc2822

  puts "Title: "+story.title
  puts "Link: "+story.link
  puts "Source: "+story.source
  puts "Date/time:"+story.date_time.to_s
  
  #    raw_text = self.read_story(story.link)
  if !(self.get_parser_dictionary.keys.include? story.source)
    puts "Skipping text extraction from non-trusted source "+story.source+":"+story.title
    #      story.raw_content = open(story.link) do |s| Iconv.conv('UTF-8','ISO_8859-1',s.read) end
  else
    story.content = Iconv.conv('UTF-8','ISO_8859-1',self.extract_text(story.link, self.get_parser_dictionary[story.source]))
    if story.content == nil
      puts "Text extraction failed"
    end
  end

  begin
    story.save
    puts "Saved #{story.title}"
  rescue ex
    puts "Exception saving #{story.title}"
    pp ex
  end
  
  #doc.css 'div[class="storyText"]' cbsnews.com
  #'span[id="articleText"]' reuters
  return story

end

.extract_text(link, css_path) ⇒ `Object`

def

# File 'lib/webminer.rb', line 88

# Fetches a page and returns the stripped markup of the nodes matching a
# CSS selector.
#
# @param link [String] URL of the page to fetch
# @param css_path [String] selector handed to the parsed document's css method
# @return [Object, nil] whatever Util.strip_story returns for the matched
#   nodes, or nil when any StandardError occurs while fetching or parsing
def self.extract_text(link, css_path)
  # Go through URI instead of Kernel#open: links come from a feed, and
  # Kernel#open would run "|command" strings or read local paths. Only URI
  # classes that open-uri extends expose a public open, so anything else
  # ends up in the rescue below. The block form closes the handle even if
  # parsing raises.
  URI.parse(link).open do |page|
    doc = Nokogiri::HTML(page)
    Util.strip_story(doc.css(css_path).to_s)
  end
rescue => ex
  puts ex.message
  # Interpolated so a nil link cannot raise from inside the handler.
  puts "Error fetching link:#{link}"
  nil
end

.get_parser_dictionary ⇒ `Object`

# File 'lib/webminer.rb', line 103

# Maps a news source host name to the CSS selector used to pull the article
# body out of that site's pages. Sources missing from this table are treated
# as non-trusted by create_story and get no text extraction.
#
# A new Hash is built on every call.
#
# @return [Hash{String => String}] host name => CSS selector
def self.get_parser_dictionary
  {
    "google.com"                   => 'div[id="hostednews-article"]',
    "cbsnews.com"                  => 'div[id="contentBody"]',
    "reuters.com"                  => 'span[id="articleText"]',
    "latimes.com"                  => 'div[id="story-body-text"]',
    "csmonitor.com"                => 'div[id="mainColumn"]',
    "npr.org"                      => 'div[id="storytext"]',
    "usatoday.com"                 => 'div[id="mainstory"]',
    "content.usatoday.com"         => 'div[id="mainstory"]',
    "guardian.co.uk"               => 'div[id="article-body-blocks"]',
    "nytimes.com"                  => 'div[id="article"]',
    "bloomberg.com"                => 'div[id="story_content"]',
    "online.wsj.com"               => 'div[id="article_story_body"]',
    "asia.wsj.com"                 => 'div[id="article_story_body"]',
    # Listed once; a second identical entry only produced a duplicate-key warning.
    "businessweek.com"             => 'div[id="story-body"]',
    "cnn.com"                      => 'div[id="cnnContentContainer"]',
    # NOTE(review): element name "cnn_storyarea" looks unusual - confirm selector.
    "edition.cnn.com"              => 'cnn_storyarea[id="cnnContentContainer"]',
    "money.cnn.com"                => 'div[id="storytext"]',
    "abcnews.go.com"               => 'div[id="innerbody"]',
    "foxnews.com"                  => 'div[id="introduction"]',
    "entertainment.msnbc.msn.com"  => 'div[id="vine-t"] article',
    "washingtonpost.com"           => 'div[id="article_body"]',
    #    "bbc.co.uk" => 'div[id="main-content"]',
    "huffingtonpost.com"           => 'div[id="entry_12345"]',
    "telegraph.co.uk"              => 'div[id="mainBodyArea"]',
    "chicagotribune.com"           => 'div[id="story-body-text"]',
    "foxbusiness.com"              => 'div[id="introduction"]',
    "thedailybeast.com"            => 'div[id="main"] article',
    "economictimes.indiatimes.com" => 'div[id="storydiv"]',
    "forbes.com"                   => 'div[id="leftRail"]',
    "arstechnica.com"              => 'div[id="story"]',
    "theregister.co.uk"            => 'div[id="body"]',
    "ingame.msnbc.msn.com"         => 'div[id="vine-t"] article',
    "informationweek.com"          => 'span[id="articleBody"]',
    "newyorker.com"                => 'div[id="articletext"]',
    "kotaku.com"                   => 'div[id="page"]',
    "slashgear.com"                => 'span[id="intelliTxt"]',
    "pcworld.com"                  => 'div[id="articleText"]',
    "news.cnet.com"                => 'div[id="article"]',
    "english.aljazeera.net"        => 'td[id="tdTextContent"]',
    "dailymail.co.uk"              => 'div[id="js-article-text"]',
    # NOTE(review): empty id looks like an unfinished placeholder - confirm.
    "rttnews.com"                  => 'div[id=""]',
    "ft.com"                       => 'div[id="storyContent"]',
    "politico.com"                 => 'div[id="mainContent"]',
    "boston.com"                   => 'div[id="page1"]',
    "sfgate.com"                   => 'div[id="fontprefs_bottom"]',
    "oregonlive.com"               => 'div[id="article"]'
    #""=> 'div[id=""]',

    #    "wired.com"=> 'div[id=""]'?
    #http://latimesblogs.latimes.com ?
  }
end

.main ⇒ `Object`

# File 'lib/webminer.rb', line 160

# Polls a fixed set of Google News RSS feeds, one thread per topic, and
# passes feed content to parse whenever it differs from the last content
# that was parsed.
#
# Side effects: assigns the request headers to the global $options, prints
# progress to standard output, and sleeps between thread starts and between
# polls. Each polling thread loops forever and this method joins them, so it
# does not return unless the threads terminate.
def self.main
  prng = Random.new
  topics = {
    'Top Stories' => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&output=rss',
    'U.S.'        => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=n&output=rss',
    'Health'      => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=m&output=rss',
    'Business'    => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=b&output=rss',
    'Technology'  => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=tc&output=rss',
    'World'       => 'http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&topic=w&output=rss'
  }

  # Request headers sent with every feed fetch. Still published through the
  # global, as before, in case other code reads it.
  $options = {
    'User-Agent' => 'Mozilla/5.0 (X11; Linux x86_64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1',
    'Accept'     => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'From'       => '[email protected]'
  }

  topic_threads = []
  topics.each do |topic, url|
    # Stagger thread start-up so the feeds are not all requested at once.
    sleep 20+prng.rand*2
    topic_threads << Thread.new(topic, url) do |feed_topic, feed_url|
      print "I'm working on #{feed_url}\n"

      # Hash of the content parsed most recently; 0 until the first parse.
      hash = 0

      loop do
        content = ""
        begin
          # URI#open (from open-uri) instead of Kernel#open, which only
          # handles URLs on older Ruby versions.
          URI.parse(feed_url).open($options) { |s| content = s.read }
        rescue OpenURI::HTTPError
          puts "Google caught us again! "+feed_url
        rescue SocketError => ex
          puts ex.message
        rescue Timeout::Error => ex
          puts ex.message
        end
        new_hash = content.hash

        print "Got content: #{content.length}, hash: #{new_hash}\n"

        if content.empty?
          # Fetch failed or returned nothing: there is nothing to parse, and
          # the hash of the last parsed content is kept for the next poll.
          print "Skipping\n"
        elsif new_hash != hash
          print "Parsing\n"
          self.parse(content, feed_topic)
          hash = new_hash
        else
          print "Skipping\n"
        end
        sleep 180+prng.rand*10
      end
    end
  end

  topic_threads.each do |thr|
    thr.join
  end
end

.parse(content, topic) ⇒ `Object`

def

# File 'lib/webminer.rb', line 74

# Parses feed content (with the parser's second argument set to false) and
# calls create_story for every item in the feed.
#
# @param content [String] raw feed document
# @param topic [Object] topic passed through to create_story for each item
# @return [Object, nil] the feed's item collection after iterating it, or
#   nil when the parser returned nil or the feed has no item collection
def self.parse(content, topic)
  feed = RSS::Parser::parse(content, false)
  return nil unless feed != nil

  entries = feed.items
  return nil unless entries != nil

  entries.each do |entry|
    self.create_story(entry, topic)
  end
end

Class: WebMiner

Overview

Defined Under Namespace

Class Method Summary collapse

Class Method Details

.create_story(item, topic) ⇒ Object

.extract_text(link, css_path) ⇒ Object

.get_parser_dictionary ⇒ Object

.main ⇒ Object

.parse(content, topic) ⇒ Object

.create_story(item, topic) ⇒ `Object`

.extract_text(link, css_path) ⇒ `Object`

.get_parser_dictionary ⇒ `Object`

.main ⇒ `Object`

.parse(content, topic) ⇒ `Object`