Class: Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/podcast_book_club/scraper.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



5
6
7
8
9
# File 'lib/podcast_book_club/scraper.rb', line 5

# Sets up the scraper: passes the URL returned by build_path to
# fetch_episodes, then keeps the value of googlebooks_config in @config.
def initialize
  fetch_episodes(build_path)
  @config = googlebooks_config
end

Instance Method Details

#build_books(episode) ⇒ Object



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/podcast_book_club/scraper.rb', line 33

# Looks up the books mentioned in an episode and records each one.
#
# Refreshes @description through describe_episode. When the description
# mentions "book"/"Book", asks send_to_parser for search queries, runs each
# query through GoogleBooks.search and hands the first hit's details to
# Book.find_or_create_by_title.
#
# @param episode [Object] passed to describe_episode and send_to_parser, and
#   stored under the :episode key of every book's attributes
# @return [Array, nil] the list of queries that were processed, or nil when
#   the description does not mention books
def build_books(episode)
  describe_episode(episode)
  return unless @description.match?(/\b(B|b)ook/)

  # Guard against a nil answer from send_to_parser: Array(nil) is [], so the
  # loop below is skipped instead of raising NoMethodError on nil.
  queries = Array(send_to_parser(episode))

  queries.each do |query|
    result = GoogleBooks.search(query, @config).first

    begin
      # compact drops the fields the search result left as nil.
      attributes = {
        url: result.info_link,
        title: result.title,
        author: result.authors_array,
        genre: result.categories,
        synopsis: result.description
      }.compact

      # An author list of [nil] or an empty category string carries no data.
      attributes.delete(:author) if attributes[:author] == [nil]
      attributes.delete(:genre) if attributes[:genre] == ""
      attributes[:episode] = episode

      Book.find_or_create_by_title(attributes)
    rescue StandardError
      # Best effort: one failing book (including a search with no hits, where
      # result is nil) is reported and must not stop the remaining queries.
      puts "I'm having trouble adding the book " + Rainbow("#{query}.").bg(:black).yellow
    end
  end
end

#build_path ⇒ Object



68
69
70
71
72
73
# File 'lib/podcast_book_club/scraper.rb', line 68

# Builds the player.fm listing URL for The Ezra Klein Show.
#
# The `limit` query parameter is 183 plus the number of Mondays and Thursdays
# from 2019-06-25 through `today` (inclusive).
# NOTE(review): presumably 183 was the episode count on the snapshot date and
# Monday/Thursday is the release schedule - confirm.
#
# @param today [Date] last day to count; defaults to the current date
# @return [String] the episode listing URL
def build_path(today = Date.today)
  snapshot_date = Date.new(2019, 6, 25)
  episodes_since_snapshot = (snapshot_date..today).count { |day| day.monday? || day.thursday? }

  "https://player.fm/series/the-ezra-klein-show/episodes?active=true&limit=#{episodes_since_snapshot + 183}&order=newest&query=&style=list&container=false&offset=0"
end

#describe_episode(episode) ⇒ Object



75
76
77
78
79
80
81
# File 'lib/podcast_book_club/scraper.rb', line 75

# Downloads an episode page and caches what the parsers need from it.
#
# Sets @episode_doc to the parsed Nokogiri document and @description to the
# text of the ".story .description" nodes.
#
# @param episode [#link] object whose `link` is the URL of the episode page
# @return [String] the description text (also stored in @description)
def describe_episode(episode)
  # URI.open (from open-uri) instead of Kernel#open: Kernel#open no longer
  # fetches URLs on Ruby 3.0+ and would run a "|command" string as a process.
  # The block form closes the downloaded stream once parsing is done.
  @episode_doc = URI.open(episode.link) { |page| Nokogiri::HTML(page) }
  @description = @episode_doc.css(".story .description").text
end

#fetch_episodes(path) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/podcast_book_club/scraper.rb', line 11

# Downloads the episode listing and creates one Episode per ".info" node.
#
# For each node the title is the stripped text of ".info-top a", the link is
# that anchor's href prefixed with "https://player.fm", and the date is parsed
# from the "datetime" attribute of ".timeago".
#
# @param path [String] URL of the listing page
# @return [Object] the ".info" node collection that was iterated
def fetch_episodes(path)
  # URI.open (from open-uri) instead of Kernel#open: Kernel#open no longer
  # fetches URLs on Ruby 3.0+ and would run a "|command" string as a process.
  # The block form closes the downloaded stream once parsing is done.
  doc = URI.open(path) { |page| Nokogiri::HTML(page, nil, Encoding::UTF_8.to_s) }

  doc.css(".info").each do |episode|
    anchor = episode.css(".info-top a")
    published = episode.css(".timeago").attribute("datetime").value

    attributes = {
      title: anchor.text.strip,
      link: "https://player.fm#{anchor.attribute("href").value}",
      date: Date.strptime(published)
    }

    Episode.create(attributes)
  end
end


99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/podcast_book_club/scraper.rb', line 99

# Builds "title author" search queries from the Amazon links in the episode
# description.
#
# Reads @episode_doc and @description (both set by describe_episode). Every
# anchor directly under ".description.prose" whose href contains "amazon" is
# treated as a book title; the description text between one title and the next
# is used as that book's author.
#
# @param episode [Object] unused; kept so both parsers share one signature
# @return [Array<String>] one query per linked title ([] when no Amazon links)
def parse_with_links(episode)
  links = @episode_doc.css(".description.prose > a")

  # link["href"] is nil for an anchor without href; to_s keeps that harmless.
  book_titles = links.select { |link| link["href"].to_s.include?("amazon") }.map(&:text)
  return [] if book_titles.empty?

  # Text that follows the first title (the last piece after splitting on it).
  remaining = @description.split(book_titles.first).pop.to_s

  book_titles.each_with_index.map do |title, index|
    next_title = book_titles[index + 1]

    if next_title
      segments = remaining.split(next_title)
      # to_s: split yields no segments when nothing is left to cut, and
      # calling strip/split on nil would raise.
      author = segments.first.to_s.strip
      remaining = segments.pop.to_s

      "#{title} #{author}"
    else
      # NOTE(review): `remaining` is a String here, so [0] is only its first
      # character (normally a space, giving "Title "). That matches the
      # original output; confirm whether the trailing author text was meant.
      "#{title} #{remaining[0].to_s.strip}"
    end
  end
end


123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/podcast_book_club/scraper.rb', line 123

# Builds "title author" search queries from a plain-text book list in
# @description.
#
# Keeps the text after the last "Books"/"books" heading, cuts it at
# "Notes from our sponsors" and at a "Find...ART19" footer, then splits the
# remainder on " by <Author Name>". The split returns titles and captured
# author names alternately; each pair becomes one query. The captured author
# starts with the whitespace that followed "by", so a query holds two spaces
# between title and author.
#
# @param episode [Object] unused by this method
# @return [Array<String>] one query per title found
def parse_without_links(episode)
  after_heading = @description.split(/(B|b)ooks:?\s/).last
  before_sponsors = after_heading.split("Notes from our sponsors").first
  listing = before_sponsors.split(/Find.*ART19/).first

  pieces = listing.strip.split(/\sby(\s[A-Z]\w*\s[a-z]*\s?[A-Z][.\w]*(?<![a-z])\s?[A-Z]?[.a-zA-Z]*)/)

  # Even positions are titles, odd positions are authors; a trailing title
  # without an author pairs with nil, which interpolates as "".
  pieces.each_slice(2).map { |title, author| "#{title.strip} #{author}" }
end

#send_to_parser(episode) ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/podcast_book_club/scraper.rb', line 83

# Chooses the description parser that matches the episode's date.
#
# Episodes dated 2019-01-14 through today go to parse_with_links; episodes
# dated 2017-03-28 up to (not including) 2019-01-14 go to parse_without_links.
# Any other date prints a notice.
#
# @param episode [#date] episode whose `date` selects the parser
# @return [Array] the parser's queries, or [] when no parser applies
def send_to_parser(episode)
  with_links = Date.new(2019, 1, 14)
  without_links = Date.new(2017, 3, 28)

  case episode.date
  when with_links..Date.today
    parse_with_links(episode)
  when without_links...with_links
    parse_without_links(episode)
  else
    puts "This episode has no recommendations."
    # Return an empty list rather than puts' nil so callers can iterate.
    []
  end
end