Class: GoogleWebSearch

Inherits:
Object
  • Object
show all
Defined in:
lib/gsearch-parser.rb

Overview

Google Web Search class

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(arg1, flag) ⇒ GoogleWebSearch

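Class initializer. When flag is 'QUERY', arg1 is a search query string; when flag is 'URI', arg1 is a complete search URI. The first results page is fetched and parsed, then the next-page URI is recorded.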



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/gsearch-parser.rb', line 29

# Class initializer.
#
# Fetches and parses the first results page through updateResults, then
# records the address of the following page through updateNextURI.
#
# @param arg1 [String] a search query when +flag+ is 'QUERY', or a complete
#   search URI when +flag+ is 'URI'
# @param flag [String] 'QUERY' or 'URI'; selects how +arg1+ is interpreted
# @raise [ArgumentError] if +flag+ is neither 'QUERY' nor 'URI'
def initialize(arg1, flag)
  # URI is part of the standard library; this is a no-op when it is
  # already loaded.
  require 'uri'

  # Initialize variables
  @results = []

  case flag
  when 'QUERY'
    # Form-encode the whole query: spaces still become '+', and reserved
    # characters such as '&', '#' or '+' can no longer corrupt the URL.
    query = URI.encode_www_form_component(arg1)
    updateResults("http://google.com/search?q=#{query}")
  when 'URI'
    updateResults(arg1)
  else
    # No page has been fetched at this point, so fail here with a clear
    # message rather than later with a NoMethodError on nil.
    raise ArgumentError, "unknown flag #{flag.inspect} (expected 'QUERY' or 'URI')"
  end

  # Update next URI
  updateNextURI
end

Instance Attribute Details

#nextURI ⇒ Object

Returns the value of attribute nextURI.



25
26
27
# File 'lib/gsearch-parser.rb', line 25

# Reader for the @nextURI instance variable, which updateNextURI sets to the
# address of the results page following the current one.
#
# @return [Object] the current value of @nextURI
def nextURI
  @nextURI
end

#results ⇒ Object

Returns the value of attribute results.



25
26
27
# File 'lib/gsearch-parser.rb', line 25

# Reader for the @results instance variable: the array created by the
# initializer and extended by parseCurrentPage with each parsed page's
# Result objects.
#
# @return [Array] the accumulated results
def results
  @results
end

Instance Method Details

#each(&blk) ⇒ Object

Iterator over results



115
116
117
# File 'lib/gsearch-parser.rb', line 115

# Iterator over results.
#
# Hands the given block straight to @results.each, so the return value is
# whatever that call returns.
#
# @param blk [Proc] block forwarded unchanged to @results.each
def each(&blk)
  @results.each(&blk)
end

#fetchPage(url) ⇒ Object

Fetch the page from a URL



68
69
70
# File 'lib/gsearch-parser.rb', line 68

# Fetch the page from a URL.
#
# The request is sent with a desktop-browser User-Agent header and the
# response body is handed to Nokogiri::HTML.
#
# @param url [String] address of the page to download
# @return [Object] the document returned by Nokogiri::HTML
def fetchPage(url)
  headers = {
    'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.152 Safari/535.19'
  }

  # Parsing inside the block lets open-uri close the underlying handle as
  # soon as the document has been built.
  parse = lambda { |io| Nokogiri::HTML(io) }

  if URI.respond_to?(:open)
    # URI.open only accepts URLs. Kernel#open stopped opening URLs in
    # Ruby 3.0 and treats a string starting with '|' as a shell command,
    # which is unsafe for caller-supplied addresses.
    URI.open(url, headers, &parse)
  else
    # Older Rubies whose open-uri does not provide URI.open.
    open(url, headers, &parse)
  end
end

#nextResults ⇒ Object

Parse the results from the next page and append to results list



106
107
108
109
110
111
112
# File 'lib/gsearch-parser.rb', line 106

# Parse the results from the next page and append them to the results list.
#
# Does nothing when no next-page URI is stored, so no fetch is attempted
# with a nil address.
#
# @return [Object, nil] whatever updateNextURI returns, or nil when there is
#   no next page to fetch
def nextResults
  return nil unless @nextURI

  # Update results
  updateResults(@nextURI)

  # Update nextURI
  updateNextURI
end

#parseCurrentPage ⇒ Object

Parse the current page and populate results



73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# File 'lib/gsearch-parser.rb', line 73

# Parse the current page and populate results.
#
# Walks every 'li.g' element of @currentPage and builds a Result from its
# title link ('h3 a'), snippet ('span.st') and displayed address ('cite').
# Entries without a title link or a cite element are skipped, as are entries
# whose address contains 'www.youtube.com'. A missing snippet becomes ''.
#
# @return [Array] the Result objects parsed from this page only; they are
#   also appended to @results
def parseCurrentPage
  currentResults = []

  # Iterate over each Google result list element
  @currentPage.css('li.g').each do |result|
    titleNode = result.at_css('h3 a')
    uriNode = result.at_css('cite')

    # Skip incomplete entries explicitly; rescuing NoMethodError here would
    # also hide unrelated programming errors inside the loop.
    next if titleNode.nil? || uriNode.nil?

    uri = uriNode.inner_html

    # Ignore YouTube videos for websearch
    next if uri.include?('www.youtube.com')

    # The snippet is optional, so fall back to an empty string
    contentNode = result.at_css('span.st')
    content = contentNode.nil? ? '' : contentNode.inner_html

    currentResults << Result.new(titleNode.inner_html, content, uri)
  end

  @results += currentResults
  currentResults
end

#updateNextURI ⇒ Object

Update the nextURI attribute



47
48
49
50
51
52
53
# File 'lib/gsearch-parser.rb', line 47

# Update the nextURI attribute.
#
# Finds the pagination cell marked as current ('table#nav tr td.cur') and
# reads the href of the link inside the node that follows it. If the
# navigation cell, its following node, the link or the href is missing
# (for example on the last results page), @nextURI is set to nil instead of
# raising NoMethodError.
#
# @return [String, nil] the new value of @nextURI
def updateNextURI
  # Parse next result page link from the currently marked one
  currentCell = @currentPage.at_css("table#nav tr td.cur")
  followingNode = currentCell && currentCell.next_sibling
  nextLink = followingNode && followingNode.at_css("a")
  nextPagePath = nextLink && nextLink['href']

  # Construct the URI
  @nextURI = nextPagePath ? "http://www.google.com#{nextPagePath}" : nil
end

#updateResults(url) ⇒ Object

Update the WebSearch results array by performing a Fetch, Store, Parse routine



56
57
58
59
60
61
62
63
64
65
# File 'lib/gsearch-parser.rb', line 56

# Update the WebSearch results array by performing a Fetch, Store, Parse
# routine.
#
# @param url [String] address handed to fetchPage
# @return [Object] whatever parseCurrentPage returns
def updateResults(url)
  # Fetch the page and keep it as the current one in a single step
  @currentPage = fetchPage(url)

  # Parse what was just stored
  parseCurrentPage
end