Class: SpeakerdeckScraper
- Inherits:
-
Object
- Object
- SpeakerdeckScraper
- Defined in:
- lib/spdeck-scrape/spdeck-scraper-class.rb
Constant Summary collapse
- SD_QUERY_FIRST_PAGE =
"https://speakerdeck.com/search?q=ruby"
- SD_DOMAIN =
"https://speakerdeck.com"
Instance Attribute Summary collapse
-
#display ⇒ Object
Returns the value of attribute display.
-
#end_time ⇒ Object
Returns the value of attribute end_time.
-
#opts ⇒ Object
Returns the value of attribute opts.
-
#page_object ⇒ Object
readonly
Returns the value of attribute page_object.
-
#presentations ⇒ Object
readonly
Returns the value of attribute presentations.
-
#query ⇒ Object
Returns the value of attribute query.
-
#start_time ⇒ Object
Returns the value of attribute start_time.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #concise_display ⇒ Object
- #html_gen ⇒ Object
-
#initialize(query, range = 5, display = '-v') ⇒ SpeakerdeckScraper
constructor
A new instance of SpeakerdeckScraper.
- #pres_author(pres_page) ⇒ Object
- #pres_author_link(pres_page) ⇒ Object
- #pres_category(pres_page) ⇒ Object
- #pres_date(pres_page) ⇒ Object
-
#pres_page_scrape(id, pres_link) ⇒ Object
Grabs the data from one presentation page. Note: this is a time-consuming process, because each page has to be opened individually – but it is necessary, because the views data isn’t stored on the query pages.
- #pres_title(pres_page) ⇒ Object
- #pres_views(pres_page) ⇒ Object
- #query_results_scrape(range) ⇒ Object
-
#scrape_all ⇒ Object
wrapper to run the single page scraper for all links.
-
#single_results_page_scrape(i) ⇒ Object
Dumps the query results into a hash, presentations = { ‘pres id’ => ‘pres_link.html’ }. Not called explicitly; lives in the query scrape wrapper.
-
#verbose_display(pres_title, author_name) ⇒ Object
Display options: verbose progress output (one line per grabbed presentation).
Constructor Details
#initialize(query, range = 5, display = '-v') ⇒ SpeakerdeckScraper
Returns a new instance of SpeakerdeckScraper.
13 14 15 16 17 18 19 20 21 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 13
# Sets up a scraper for one search query. Nothing is fetched here.
#
# @param query [String] search term; later interpolated into the search URL and
#   into the name of the generated HTML file
# @param range [Integer] stored in @range (no visible method reads it back;
#   query_results_scrape takes its own `range` argument)
# @param display ['-v', '-c'] progress style: '-v' prints one line per
#   presentation, '-c' prints a single '#' per presentation
def initialize(query, range = 5, display = '-v')
  @url = "https://speakerdeck.com/"
  @query = query
  @page_object = ''
  @presentations = {}
  # wall-clock start; html_gen reports end_time - start_time
  @start_time = Time.now
  @range = range
  @display = display
end
Instance Attribute Details
#display ⇒ Object
Returns the value of attribute display.
8 9 10 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8
# Progress display mode given to the constructor ('-v' by default; '-c' selects
# the concise '#' output).
def display
  @display
end
#end_time ⇒ Object
Returns the value of attribute end_time.
8 9 10 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8
# Time at which scrape_all finished; nil until scrape_all has run.
def end_time
  @end_time
end
#opts ⇒ Object
Returns the value of attribute opts.
8 9 10 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8
# Returns @opts. NOTE(review): none of the methods shown on this page assign
# it, so it is nil unless a caller sets it - confirm whether it is still needed.
def opts
  @opts
end
#page_object ⇒ Object (readonly)
Returns the value of attribute page_object.
7 8 9 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7
# Returns @page_object, which the constructor initialises to an empty string.
def page_object
  @page_object
end
#presentations ⇒ Object (readonly)
Returns the value of attribute presentations.
7 8 9 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7
# Hash of scraped presentations keyed by the talk's data-id. After the results
# pages are scraped each value is the presentation's link; pres_page_scrape then
# replaces it with a hash of details (:title, :link, :date, :author,
# :author_link, :category, :views).
def presentations
  @presentations
end
#query ⇒ Object
Returns the value of attribute query.
8 9 10 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8
# Search term given to the constructor.
def query
  @query
end
#start_time ⇒ Object
Returns the value of attribute start_time.
8 9 10 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8
# Time at which the scraper object was constructed.
def start_time
  @start_time
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
7 8 9 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7
# Base URL of the site ("https://speakerdeck.com/", set by the constructor);
# search URLs are built by appending to it.
def url
  @url
end
Instance Method Details
#concise_display ⇒ Object
62 63 64 65 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 62
# Concise progress output: prints a single '#' (no newline) and pauses briefly.
def concise_display
  print "#"
  sleep(0.02)
end
#html_gen ⇒ Object
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 124
require 'erb' # for ERB::Util.html_escape, used to escape scraped text below

# Writes an HTML report of the scraped presentations to "spd-<query>.html" in
# the current directory: one table row per presentation, most viewed first.
#
# Expects every value in `presentations` to be the detail hash built by
# pres_page_scrape (:title, :link, :date, :category, :author, :author_link,
# :views), i.e. scrape_all should have run first.
#
# Scraped text and links are HTML-escaped and every href is quoted, so markup
# inside a title or author name cannot break (or inject into) the page.
def html_gen
  escape = lambda { |value| ERB::Util.html_escape(value) }

  # take data and sort it by views descending
  sorted_array = presentations.values.sort_by { |pres_hash| pres_hash[:views] }.reverse

  # end_time is only set by scrape_all; fall back to "now" so the report can
  # still be written if it has not been set
  elapsed = (end_time || Time.now) - start_time

  File.open("spd-#{query}.html", "w") do |file|
    file.write(<<-HTML)
<html>
<head>
</head>
<body>
<h1>speakerdeck presentations - #{escape.call(query)}</h1>
<h4>this site was generated in #{elapsed} seconds (last queried at #{start_time})</h4>
<table class="tablesorter" border="1">
<tr>
  <th>title</th>
  <th>date</th>
  <th>category</th>
  <th>author</th>
  <th>views</th>
</tr>
    HTML

    sorted_array.each do |content_hash|
      link = "#{SD_DOMAIN}#{content_hash[:link]}"
      author_link = "#{SD_DOMAIN}#{content_hash[:author_link]}"
      category = content_hash[:category].to_s
      category_link = "https://speakerdeck.com/c/#{category.downcase}"

      file.write(<<-HTML)
<tr>
  <td><a href="#{escape.call(link)}">#{escape.call(content_hash[:title])}</a></td>
  <td>#{escape.call(content_hash[:date])}</td>
  <td><a href="#{escape.call(category_link)}">#{escape.call(category)}</a></td>
  <td><a href="#{escape.call(author_link)}">#{escape.call(content_hash[:author])}</a></td>
  <td>#{escape.call(content_hash[:views])}</td>
</tr>
      HTML
    end

    file.write(<<-HTML)
</table>
</body>
</html>
    HTML
  end
end
#pres_author(pres_page) ⇒ Object
108 109 110 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 108
# Author name shown on a presentation page.
#
# @param pres_page parsed presentation page (anything responding to #css the
#   way a Nokogiri document does)
# @return [String] text of the link(s) matched by 'div#content header h2 a'
def pres_author(pres_page)
  pres_page.css('div#content header h2 a').text
end
#pres_author_link(pres_page) ⇒ Object
112 113 114 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 112
# Link (href) to the author's profile, taken from the first element matching
# 'div#content header h2 a' on a presentation page.
#
# @param pres_page parsed presentation page (anything responding to #at_css
#   the way a Nokogiri document does)
# @return [String] the href value, or '' when the page has no author link or
#   the link has no href (previously this raised NoMethodError on nil)
def pres_author_link(pres_page)
  author_anchor = pres_page.at_css('div#content header h2 a')
  author_anchor ? author_anchor['href'].to_s : ''
end
#pres_category(pres_page) ⇒ Object
120 121 122 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 120
# Category name shown on a presentation page.
#
# @param pres_page parsed presentation page (anything responding to #css the
#   way a Nokogiri document does)
# @return [String] text of the link(s) matched by 'div#talk-details mark a'
def pres_category(pres_page)
  pres_page.css('div#talk-details mark a').text
end
#pres_date(pres_page) ⇒ Object
116 117 118 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 116
# Publication date shown on a presentation page: the whitespace-stripped text
# of the first element matching 'div#talk-details mark'.
#
# @param pres_page parsed presentation page (anything responding to #at_css
#   the way a Nokogiri document does)
# @return [String] the date text, or '' when the page has no such element
#   (previously this raised NoMethodError on nil)
def pres_date(pres_page)
  date_mark = pres_page.at_css('div#talk-details mark')
  date_mark ? date_mark.text.strip : ''
end
#pres_page_scrape(id, pres_link) ⇒ Object
Grabs the data from one presentation page. Note: this is a time-consuming process, because each page has to be opened individually – but it is necessary, because the views data isn’t stored on the query pages.
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 79
# Grabs the data from one presentation page and stores it in
# `presentations[id]`, replacing the bare link stored there with a hash of
# details. Slow: every presentation page has to be fetched individually,
# because the view count is not available on the search result pages.
#
# @param id hash key of the presentation (the talk's data-id)
# @param pres_link [String] site-relative link to the presentation page
def pres_page_scrape(id, pres_link)
  # URI.open is the open-uri entry point; Kernel#open no longer opens URLs on
  # Ruby 3.0+. NOTE(review): assumes the file already requires 'open-uri',
  # which the original open(url) call needed as well - confirm.
  pres_page = Nokogiri::HTML(URI.open("#{SD_DOMAIN}#{pres_link}"))

  presentations[id] = {
    :title => pres_title(pres_page),
    :link => pres_link,
    :date => pres_date(pres_page),
    :author => pres_author(pres_page),
    :author_link => pres_author_link(pres_page),
    :category => pres_category(pres_page),
    :views => pres_views(pres_page)
  }

  # any display mode other than '-c' gets the one-line-per-presentation output
  if self.display == '-c'
    concise_display
  else
    puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
  end
end
#pres_title(pres_page) ⇒ Object
103 104 105 106 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 103
# Title shown on a presentation page.
#
# @param pres_page parsed presentation page (anything responding to #css the
#   way a Nokogiri document does)
# @return [String] text of the element(s) matched by 'div#content header h1'
def pres_title(pres_page)
  pres_page.css('div#content header h1').text
end
#pres_views(pres_page) ⇒ Object
99 100 101 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 99
# View count shown on a presentation page.
#
# @param pres_page parsed presentation page (anything responding to #css the
#   way a Nokogiri document does)
# @return [Integer] all digit runs in the 'li.views' text joined into one
#   number (so "1,234 Views" gives 1234); 0 when the text has no digits
def pres_views(pres_page)
  views_text = pres_page.css('li.views').text
  views_text.scan(/\d+/).join.to_i
end
#query_results_scrape(range) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 23
# Collects presentation links from search result pages 1..range into
# `presentations` by running single_results_page_scrape on each page.
#
# Best-effort: if a page fails, scraping stops, the error is reported and
# whatever was collected so far is kept.
#
# @param range [Integer] number of result pages to read
def query_results_scrape(range)
  puts "grabbing presentations"
  begin
    # single_results_page_scrape expects a page NUMBER. The first page used to
    # be requested by passing the SD_QUERY_FIRST_PAGE URL as that number, which
    # built a malformed URL with a hard-coded "q=ruby" in it.
    (1..range).each { |page_number| single_results_page_scrape(page_number) }
  rescue StandardError => e
    puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
  end
  puts "\ncool! we got #{presentations.length} presentations"
end
#scrape_all ⇒ Object
wrapper to run the single page scraper for all links
69 70 71 72 73 74 75 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 69
# Wrapper that runs the single page scraper (pres_page_scrape) for every
# collected link, then records the finish time in end_time.
def scrape_all
  puts "reading presentation data"
  # pres_page_scrape overwrites the value stored under each id; it only
  # assigns to keys that already exist, which is allowed while iterating
  presentations.each { |id, link| pres_page_scrape(id, link) }
  self.end_time = Time.now
end
#single_results_page_scrape(i) ⇒ Object
Dumps the query results into a hash, presentations = { ‘pres id’ => ‘pres_link.html’ }. Not called explicitly; lives in the query scrape wrapper.
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 38
# Reads one page of search results and records every talk on it as
# presentations[data-id] = link. Not called explicitly; it is driven by
# query_results_scrape.
#
# @param i [Integer] number of the results page to fetch
def single_results_page_scrape(i)
  # encode the parameters so a query containing spaces, '&', etc. still gives
  # a valid URL
  params = URI.encode_www_form(:page => i, :q => query)
  # URI.open is the open-uri entry point; Kernel#open no longer opens URLs on
  # Ruby 3.0+. NOTE(review): assumes the file already requires 'open-uri',
  # which the original open(url) call needed as well - confirm.
  doc = Nokogiri::HTML(URI.open("#{self.url}search?#{params}"))

  doc.css('div.talk').each do |presentation|
    # the data-id ensures a unique key in the hash
    pres_id = presentation.attr('data-id')
    pres_link = presentation.css('h3.title a').attr('href').text
    pres_title = presentation.css('h3.title').text.strip
    # NOTE(review): this takes the last 'h3.title a' under the talk's PARENT
    # element, not under the talk itself - confirm that is the author link
    author_name = presentation.parent.css('h3.title a').last.text

    verbose_display(pres_title, author_name) if self.display == "-v"
    concise_display if self.display == "-c"

    self.presentations[pres_id] = pres_link
  end
end
#verbose_display(pres_title, author_name) ⇒ Object
Display options: verbose progress output (one line per grabbed presentation).
56 57 58 59 60 |
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 56
# Verbose progress output: prints one line announcing a grabbed presentation,
# decorated with a randomly chosen adjective, then pauses briefly.
#
# @param pres_title [String] title of the presentation
# @param author_name [String] name of its author
def verbose_display(pres_title, author_name)
  good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy", "really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughful", "intelligent", "clever", "genius", "incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
  puts "grabbed a #{good_words.sample} presentation #{pres_title} by #{author_name}"
  sleep(0.02)
end