Class: CorpusGenerator::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/random_poetry_scraper/scraper.rb

Constant Summary collapse

ROOT_LINK = "https://w0.poemhunter.com"
BROWSE_LINK = ROOT_LINK + "/members/random-poem/"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Scraper

Returns a new instance of Scraper.



6
7
8
# File 'lib/random_poetry_scraper/scraper.rb', line 6

# Fetches the page at BROWSE_LINK, parses it with Nokogiri::HTML and stores
# the parsed document in html_doc.
#
# The page is fetched with URI.open instead of Kernel#open. open-uri's
# patching of Kernel#open to accept URLs was deprecated in Ruby 2.7 and
# removed in Ruby 3.0, where a bare open(url) looks for a local file with
# that name instead of making an HTTP request.
#
# NOTE(review): URI.open is provided by open-uri; the original open(url) call
# needed that library as well, so it is presumably already required by this
# file - confirm.
#
# @raise [OpenURI::HTTPError] if the server answers with an HTTP error status
def initialize
    self.html_doc = Nokogiri::HTML(URI.open(BROWSE_LINK))
end

Instance Attribute Details

#html_doc ⇒ Object

Returns the value of attribute html_doc.



2
3
4
# File 'lib/random_poetry_scraper/scraper.rb', line 2

# Reader for the document this scraper operates on.
#
# The constructor assigns it (through the matching writer) the result of
# Nokogiri::HTML(...), and scrape_poem_page queries it with CSS selectors.
#
# @return [Object] the current value of @html_doc (nil if never assigned)
def html_doc
  @html_doc
end

Instance Method Details

#scrape_poem_page ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/random_poetry_scraper/scraper.rb', line 10

# Extracts the poem shown in html_doc into a plain Hash.
#
# The result always contains:
#   :name - text of the <h2> element(s) inside ".poem"
#   :text - inner HTML of the <p> element(s) inside ".poem", with "<br>"
#           turned into newlines and "\r\n" + tab runs removed
#
# When ".poet" yields non-blank text the result also contains:
#   :poet - { name: <poet text>,
#             profile_url: ROOT_LINK + href of the first ".poem a" link,
#                          or nil when there is no such link / no href }
#
# Compared with the previous implementation:
# * The poet branch used to be guarded by `if poet_name = ...text`, which is
#   always truthy because an empty String is truthy in Ruby. It is now
#   skipped when no poet name was found.
# * A missing profile link no longer raises NoMethodError (nil.value).
# * The trailing `poem_attributes == [] ? nil : ...` compared a Hash with an
#   Array and could never be true; the method therefore always returned the
#   Hash, and still does.
#
# @return [Hash] the scraped attributes described above
def scrape_poem_page
    poem = html_doc.css(".poem")
    poem_attributes = {
        name: poem.css("h2").text,
        text: poem.css("p").inner_html.gsub("<br>", "\n").gsub(/\r\n[\t]+/, "")
    }

    poet_name = html_doc.css(".poet").text
    return poem_attributes if poet_name.strip.empty?

    # at_css returns nil when nothing matches, and Node#[] returns nil for a
    # missing attribute, so both "no link" and "link without href" end up as nil.
    profile_link = html_doc.at_css(".poem a")
    href = profile_link && profile_link["href"]

    poem_attributes[:poet] = {
        name: poet_name,
        profile_url: href && ROOT_LINK + href
    }
    poem_attributes
end