Class: SpeakerdeckScraper

Inherits:
Object
  • Object
show all
Defined in:
lib/spdeck-scrape/spdeck-scraper-class.rb

Constant Summary collapse

# Search URL for the first page of "ruby" results (frozen so the shared
# constant cannot be mutated in place).
SD_QUERY_FIRST_PAGE =
"https://speakerdeck.com/search?q=ruby".freeze
# Scheme and host that scraped relative links are appended to when
# building absolute URLs.
SD_DOMAIN =
"https://speakerdeck.com".freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(query, range = 5, display = '-v') ⇒ SpeakerdeckScraper

Returns a new instance of SpeakerdeckScraper.



13
14
15
16
17
18
19
20
21
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 13

# Sets up a scraper for one search term.
#
# query   - search term, later interpolated into the search URL and the
#           generated HTML file name
# range   - stored as @range (defaults to 5)
# display - progress-output mode, '-v' by default; other methods compare it
#           with '-v' and '-c'
def initialize(query, range = 5, display = '-v')
    # caller-supplied settings
    @query, @range, @display = query, range, display

    # fixed starting state
    @url           = "https://speakerdeck.com/"
    @page_object   = ''
    @presentations = {}
    @start_time    = Time.now
end

Instance Attribute Details

#displayObject

Returns the value of attribute display.



8
9
10
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8

# Reader for the progress-output mode stored by the constructor ('-v' by
# default). Other methods in this class compare it with '-v' and '-c'.
def display
  @display
end

#end_timeObject

Returns the value of attribute end_time.



8
9
10
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8

# Reader for @end_time. scrape_all assigns Time.now to it after its loop
# finishes; the constructor does not assign it.
def end_time
  @end_time
end

#optsObject

Returns the value of attribute opts.



8
9
10
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8

# Reader for @opts.
# NOTE(review): nothing in the visible source assigns @opts, so this
# presumably returns nil unless the writer is used elsewhere - confirm.
def opts
  @opts
end

#page_objectObject (readonly)

Returns the value of attribute page_object.



7
8
9
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7

# Reader for @page_object, which the constructor sets to an empty string.
# No other assignment appears in the visible source.
def page_object
  @page_object
end

#presentationsObject (readonly)

Returns the value of attribute presentations.



7
8
9
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7

# Reader for the presentations hash (empty after construction).
# single_results_page_scrape stores each talk's link string under its
# data-id; pres_page_scrape then replaces that value with a hash of details
# (:title, :link, :date, :author, :author_link, :category, :views).
def presentations
  @presentations
end

#queryObject

Returns the value of attribute query.



8
9
10
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8

# Reader for the search term passed to the constructor. It is interpolated
# into the search URL and into the generated HTML file name and heading.
def query
  @query
end

#start_timeObject

Returns the value of attribute start_time.



8
9
10
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 8

# Reader for the Time captured in the constructor. html_gen subtracts it
# from end_time to report how long the run took.
def start_time
  @start_time
end

#urlObject (readonly)

Returns the value of attribute url.



7
8
9
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 7

# Reader for the base site URL set in the constructor
# ("https://speakerdeck.com/"); used as the prefix of search-page URLs.
def url
  @url
end

Instance Method Details

#concise_displayObject



62
63
64
65
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 62

# Concise progress indicator: writes a single "#" to standard output
# (no newline) and then pauses for 0.02 seconds.
def concise_display
    $stdout.print("#")
    sleep 0.02
end

#html_genObject



124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 124

# Writes the scraped presentations to "spd-<query>.html" in the current
# directory as a single HTML table, most-viewed first.
#
# Every value in `presentations` is expected to be a detail hash with the
# keys :title, :link, :date, :author, :author_link, :category and :views
# (the shape pres_page_scrape stores).
#
# All scraped text is HTML-escaped and every href is quoted, because the
# values come from remote pages and must not be able to break the markup.
def html_gen
    # stdlib; required here so the method carries its own dependency
    require 'cgi'

    # take data and sort it by views descending
    sorted_array = presentations.values.sort_by do |pres_hash|
        pres_hash[:views]
    end.reverse

    # end_time is only assigned by scrape_all; fall back to "now" so the
    # report can still be generated if that step was skipped
    elapsed = (end_time || Time.now) - start_time
    h = lambda { |value| CGI.escapeHTML(value.to_s) }

    File.open("spd-#{query}.html", "w") do |file|
        file.write(<<-HTML)
            <html>
            <head>
            </head>
            <body>
            <h1>speakerdeck presentations - #{h.call(query)}</h1>
            <h4>this site was generated in #{elapsed} seconds (last queried at #{start_time})</h4>
                <table class="tablesorter" border="1">
                <tr>
                    <th>title</th>
                    <th>date</th>
                    <th>category</th>
                    <th>author</th>
                    <th>views</th>
                </tr>
        HTML
        sorted_array.each do |content_hash|
            category = content_hash[:category].to_s
            link = h.call("#{SD_DOMAIN}#{content_hash[:link]}")
            author_link = h.call("#{SD_DOMAIN}#{content_hash[:author_link]}")
            category_link = h.call("#{SD_DOMAIN}/c/#{category.downcase}")
            file.write(<<-HTML)
                <tr>
                    <td><a href="#{link}">#{h.call(content_hash[:title])}</a></td>
                    <td>#{h.call(content_hash[:date])}</td>
                    <td><a href="#{category_link}">#{h.call(category)}</a></td>
                    <td><a href="#{author_link}">#{h.call(content_hash[:author])}</a></td>
                    <td>#{h.call(content_hash[:views])}</td>
                </tr>
            HTML
        end
        file.write(<<-HTML)
            </table>
            </body>
            </html>
        HTML
    end
end

#pres_author(pres_page) ⇒ Object



108
109
110
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 108

# Pulls the author's name out of a presentation page.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns the text of whatever matches the header's h2 anchor selector.
def pres_author(pres_page)
    author_anchor = pres_page.css('div#content header h2 a')
    author_anchor.text
end


112
113
114
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 112

# Pulls the author's profile link (the href of the header's h2 anchor) out
# of a presentation page.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns the href text, or an empty string when the page has no such
# anchor (the attribute lookup yields nil in that case, which previously
# raised NoMethodError on the chained .text call).
def pres_author_link(pres_page)
    href = pres_page.css('div#content header h2 a').attr('href')
    href ? href.text : ''
end

#pres_category(pres_page) ⇒ Object



120
121
122
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 120

# Pulls the category name out of a presentation page.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns the text of the anchor(s) inside the talk-details <mark>.
def pres_category(pres_page)
    category_anchor = pres_page.css('div#talk-details mark a')
    category_anchor.text
end

#pres_date(pres_page) ⇒ Object



116
117
118
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 116

# Pulls the date out of a presentation page: the stripped text of the first
# <mark> inside the talk-details block.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns the date text, or an empty string when no <mark> is found
# (previously that case raised NoMethodError on nil).
def pres_date(pres_page)
    first_mark = pres_page.css('div#talk-details mark').first
    first_mark ? first_mark.text.strip : ''
end

#pres_page_scrape(id, pres_link) ⇒ Object

Grabs the data for one presentation from its own page. Note: this is a time-consuming process, because every presentation page has to be opened – but it is necessary, since the view counts aren’t stored on the query result pages.



79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 79

# Fetches one presentation page and replaces presentations[id] with a hash
# of its details, then reports progress.
#
# id        - key of the presentation in the presentations hash
# pres_link - site-relative link to the presentation page; it is appended
#             to SD_DOMAIN to form the URL that is fetched
#
# Progress output: a "#" via concise_display when display is '-c',
# otherwise a line with the title and view count.
def pres_page_scrape(id, pres_link)
    # URI.open comes from open-uri; plain Kernel#open stopped fetching URLs
    # in Ruby 3.0, so the explicit form is required on current Rubies.
    pres_page = Nokogiri::HTML(URI.open("#{SD_DOMAIN}#{pres_link}"))

    presentations[id] = {
        :title => pres_title(pres_page),
        :link => pres_link,
        :date => pres_date(pres_page),
        :author => pres_author(pres_page),
        :author_link => pres_author_link(pres_page),
        :category => pres_category(pres_page),
        :views => pres_views(pres_page)
    }

    if self.display == '-c'
        concise_display
    else
        puts "#{presentations[id][:title]} has #{presentations[id][:views]} views!"
    end
end

#pres_title(pres_page) ⇒ Object



103
104
105
106
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 103

# Pulls the presentation title out of a presentation page.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns the text of the header's h1.
def pres_title(pres_page)
    heading = pres_page.css('div#content header h1')
    heading.text
end

#pres_views(pres_page) ⇒ Object



99
100
101
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 99

# Reads the view count from a presentation page.
#
# pres_page - parsed page that responds to #css (pres_page_scrape passes a
#             Nokogiri::HTML document)
#
# Returns an Integer built from every ASCII digit in the "li.views" text,
# in order (so "1,234 views" gives 1234); 0 when the text has no digits.
def pres_views(pres_page)
    views_text = pres_page.css('li.views').text
    # keep only the digits, dropping separators and labels
    views_text.delete('^0-9').to_i
end

#query_results_scrape(range) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 23

# Collects presentation links from the search result pages 1..range.
#
# range - number of the last results page to read; page 1 is always read,
#         even when range is below 1
#
# Paging is best effort: the first error stops the loop, is reported on
# standard output with its class and message, and whatever was collected so
# far is kept. A summary line with the number of presentations is printed
# at the end either way.
def query_results_scrape(range)
    puts "grabbing presentations"
    begin
        # single_results_page_scrape takes a page NUMBER. The first page used
        # to be requested by passing the SD_QUERY_FIRST_PAGE URL instead,
        # which produced a malformed "page=" parameter.
        last_page = [range, 1].max
        (1..last_page).each do |page_number|
            single_results_page_scrape(page_number)
        end
    rescue StandardError => e
        puts "error! prob nothing to worry about (#{e.class}: #{e.message})"
    end
    puts "\ncool! we got #{presentations.length} presentations"
end

#scrape_allObject

wrapper to run the single page scraper for all links



69
70
71
72
73
74
75
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 69

# Runs pres_page_scrape for every (id, link) pair currently held in
# presentations, then records the finishing time in end_time.
def scrape_all
    puts "reading presentation data"
    presentations.each_pair { |id, link| pres_page_scrape(id, link) }
    self.end_time = Time.now
end

#single_results_page_scrape(i) ⇒ Object

Dumps one page of query results into the presentations hash, presentations = { ‘pres id’ => ‘pres_link’ }, keyed by each talk’s data-id. Not called explicitly; it is invoked by the query_results_scrape wrapper.



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 38

# Reads one page of search results and records each talk's link in the
# presentations hash under the talk's data-id.
#
# i - results page number; sent as the "page" query parameter
#
# The search term is form-encoded so terms containing spaces or "&" yield a
# valid URL. Talks without a title link are skipped.
def single_results_page_scrape(i)
    search_url = "#{url}search?#{URI.encode_www_form(:page => i, :q => query)}"
    # URI.open comes from open-uri; plain Kernel#open stopped fetching URLs
    # in Ruby 3.0.
    doc = Nokogiri::HTML(URI.open(search_url))
    doc.css('div.talk').each do |presentation|
        # ensures a unique key in the hash
        pres_id = presentation.attr('data-id')

        href = presentation.css('h3.title a').attr('href')
        next if href.nil?
        pres_link = href.text

        title = presentation.css('h3.title').text.strip
        # NOTE(review): this searches the talk's parent element and takes the
        # last matching anchor - confirm that is the intended author lookup.
        author_name = presentation.parent.css('h3.title a').last.text

        case display
        when "-v" then verbose_display(title, author_name)
        when "-c" then concise_display
        end

        presentations[pres_id] = pres_link
    end
end

#verbose_display(pres_title, author_name) ⇒ Object

Display options: the verbose display prints one line for each presentation grabbed.



56
57
58
59
60
# File 'lib/spdeck-scrape/spdeck-scraper-class.rb', line 56

# Verbose progress line: prints "grabbed a <adjective> presentation <title>
# by <author>" with a randomly chosen adjective, then pauses 0.02 seconds.
#
# pres_title  - title text to print
# author_name - author text to print
def verbose_display(pres_title, author_name)
    good_words = ["awesome", "great", "amazing", "really cool", "tops", "mind-blowing", "super", "glittering", "thought-provoking", "glorious", "sweet", "classy","really great", "fun", "strong", "robust", "healthy", "fine", "superior", "quality", "thoughtful", "intelligent", "clever", "genius","incredible", "smart", "beautiful", "handsome", "pulchritudinous", "elegant", "bespoke", "crazy", "satisfying", "inspirational", "inspiring", "mind-exploding", "hot"]
    puts "grabbed a #{good_words.sample} presentation #{pres_title} by #{author_name}"
    sleep(0.02)
end