Class: Scraper
- Inherits:
-
Object
- Object
- Scraper
- Defined in:
- lib/rubedility/scraper.rb
Overview
require 'pry'
Class Method Summary collapse
- .scrape_index_page(index_url) ⇒ Object
- .scrape_lesson_page(lesson_url) ⇒ Object
- .scrape_task_page(task_url) ⇒ Object
Class Method Details
.scrape_index_page(index_url) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/rubedility/scraper.rb', line 7
#
# Fetches the lesson index page and builds one summary hash per lesson link
# found under "div.lessons_list". Prints a "." for every lesson processed.
#
# @param index_url [String] URL of the index page; each lesson URL is formed
#   by appending the last path segment of the link's href to this string, so
#   it is expected to end with "/" (TODO confirm against callers).
# @return [Array<Hash>] hashes with the keys :name, :number and :lesson_url
def self.scrape_index_page(index_url)
  # URI.open is used instead of Kernel#open: since Ruby 3.0 a bare open no
  # longer fetches URLs, and it would treat a leading "|" as a shell command.
  index = Nokogiri::HTML(URI.open(index_url))

  index.css("div.lessons_list a").map do |lesson|
    print "."
    name = lesson.css("div.title").text
    # Take the first run of digits from the label (e.g. "Lesson 12" => 12);
    # a label without digits yields 0.
    number = lesson.css("div.num").text[/\d+/].to_i
    lesson_url = "#{index_url}#{lesson.attr('href').split('/').last}"
    { :name => name, :number => number, :lesson_url => lesson_url }
  end
end
.scrape_lesson_page(lesson_url) ⇒ Object
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/rubedility/scraper.rb', line 22
#
# Fetches a lesson page and extracts the lesson-level details together with
# one hash per task box on the page. Prints a "." before fetching.
#
# @param lesson_url [String] absolute URL of the lesson page; its scheme and
#   host are reused to build absolute task URLs.
# @return [Array(Hash, Array<Hash>), nil] a two-element array of
#   [lesson details, task hashes]. Lesson details have the keys :reading_url,
#   :tests_started and :tests_solved; each task hash has :name, :task_url,
#   :difficulty, :tagline and :task_reading_url. Returns nil (after printing
#   a message, the URL and the error) when the fetch raises
#   OpenURI::HTTPError.
def self.scrape_lesson_page(lesson_url)
  print "."
  # URI.open is used instead of Kernel#open: since Ruby 3.0 a bare open no
  # longer fetches URLs, and it would treat a leading "|" as a shell command.
  lesson = Nokogiri::HTML(URI.open(lesson_url))

  # The readings link is optional; reading_url stays nil when it is absent.
  readings = lesson.css("a#readings")
  reading_url = readings.empty? ? nil : readings.attr("href").value

  tests_started = lesson.css("span.started span.num").text.to_i
  tests_solved = lesson.css("span.finished span.num").text.to_i

  # Task hrefs are site-relative, so prefix them with "scheme://host", which
  # is the first three "/"-separated pieces of the lesson URL.
  site_root = lesson_url.split("/")[0..2].join("/")

  task_hashes_array = lesson.css("div.task-box").map do |task_row|
    {
      :name => task_row.css("h4.title").text.strip,
      :task_url => "#{site_root}#{task_row.css('a').attr('href').value}",
      :difficulty => task_row.css("div.difficulty").text.strip,
      :tagline => task_row.css("div.synopsis").text.strip,
      :task_reading_url => reading_url
    }
  end

  lesson_details = {
    :reading_url => reading_url,
    :tests_started => tests_started,
    :tests_solved => tests_solved
  }

  [lesson_details, task_hashes_array]
rescue OpenURI::HTTPError => er
  puts "404, Lesson not found"
  puts lesson_url
  puts er
  nil
end
.scrape_task_page(task_url) ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/rubedility/scraper.rb', line 54
#
# Fetches a task page and returns the text of the first element matching
# "div.desc-rb-en div", reformatted for command-line display. Prints a "."
# before fetching.
#
# @param task_url [String] absolute URL of the task page
# @return [Hash, nil] a hash with the single key :content, or nil (after
#   printing a message, the URL and the error) when the fetch raises
#   OpenURI::HTTPError
def self.scrape_task_page(task_url)
  print "."
  # URI.open is used instead of Kernel#open: since Ruby 3.0 a bare open no
  # longer fetches URLs, and it would treat a leading "|" as a shell command.
  task = Nokogiri::HTML(URI.open(task_url))
  content = task.css("div.desc-rb-en div").first.text

  # The page mixes single and repeated newlines; the command line reads
  # better with a blank line between paragraphs, so every run of newlines
  # becomes exactly two.
  { :content => content.gsub(/\n+/, "\n\n") }
rescue OpenURI::HTTPError => er
  puts "404'd!"
  puts task_url
  puts er
  nil
end