Class: MartialArts::Scraper
- Inherits:
-
Object
- Object
- MartialArts::Scraper
- Defined in:
- lib/martial_arts/scraper.rb
Constant Summary collapse
- @@all =
[]
Class Method Summary collapse
- .all ⇒ Object
- .correct_errors(info_1, info_2 = nil, info_3 = nil, info_4 = nil, info_5 = nil) ⇒ Object
- .import_countries ⇒ Object
- .import_popular ⇒ Object
- .import_styles ⇒ Object
- .scrape_data ⇒ Object
Class Method Details
.all ⇒ Object
100 101 102 |
# File 'lib/martial_arts/scraper.rb', line 100

# Reader for the class-level store of scraped records.
#
# .scrape_data appends one " - "-separated data string per style to this
# collection and .import_styles later iterates over it.
#
# @return [Array] the object held in the @@all class variable
def self.all
  @@all
end
.correct_errors(info_1, info_2 = nil, info_3 = nil, info_4 = nil, info_5 = nil) ⇒ Object
104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
# File 'lib/martial_arts/scraper.rb', line 104

# Applies hand-written corrections for scraped values that are known to be
# wrong. The method is called from two places with different argument
# meanings:
#
# * .import_styles passes the style name (String) as info_1, followed by the
#   country, fighting focus, website and description of the record.
# * .import_popular passes the matched MartialArts::Styles instance (or nil
#   when nothing matched) as info_1 and the scraped style name as info_2.
#
# @param info_1 [String, MartialArts::Styles, nil] style name or style instance
# @param info_2 [String, nil] country (.import_styles) or style name (.import_popular)
# @param info_3 [Object, nil] not read here; accepted so callers can pass a full record
# @param info_4 [Object, nil] not read here
# @param info_5 [Object, nil] not read here
# @return [Object, nil] value of the final conditional append; callers in this
#   file do not use it
def self.correct_errors(info_1, info_2 = nil, info_3 = nil, info_4 = nil, info_5 = nil)
  # Called from .import_styles with the style name: this scraped title is
  # replaced by overwriting the instance variables that .import_styles passes
  # to MartialArts::Styles.new right after this call.
  if info_1 == "Karate In The United States"
    @style = "American Karate"
    @country = "United States"
    @fighting_focus = "Hybrid"
  end

  # Called from .import_popular with a resolved style instance: record it as
  # popular. is_a? (rather than an exact class comparison) also accepts
  # subclasses of MartialArts::Styles.
  MartialArts::Styles.popular << info_1 if info_1.is_a?(MartialArts::Styles)

  # Kung fu encompasses many of the Chinese martial arts, so it is recorded
  # under that umbrella name. The comparison ignores case because
  # .import_popular also matches style names case-insensitively.
  kung_fu = info_2.is_a?(String) && info_2.casecmp?("Kung fu")
  MartialArts::Styles.popular << "Chinese Martial Arts" if kung_fu

  # KNOWN ISSUE: some styles link to pages about a people (e.g. the Surma
  # people) that merely mention the style; their scraped data prints oddly
  # and is not corrected here.
end
.import_countries ⇒ Object
90 91 92 93 94 95 96 97 98 |
# File 'lib/martial_arts/scraper.rb', line 90

# Reads the Wikipedia "List of martial arts" page and appends the country
# names it finds to MartialArts::Countries.filtered.
#
# The page is fetched with URI.open (Kernel#open no longer opens URLs since
# Ruby 3.0) in block form so the downloaded handle is closed after parsing.
#
# @return [Object] whatever each_with_index returns for the matched column
#   blocks; callers in this file do not inspect it
def self.import_countries
  doc = URI.open("https://en.wikipedia.org/wiki/List_of_martial_arts") { |page| Nokogiri::HTML(page) }

  doc.css('.div-col.columns.column-width').each_with_index do |info, i|
    if i.zero?
      # First column block is the list for African countries: the country is
      # taken from the second link of each entry.
      info.css('li').each do |entry|
        country_link = entry.css('a')[1]
        # Entries with fewer than two links carry no country link; skip them
        # instead of calling #text on nil.
        MartialArts::Countries.filtered << country_link.text if country_link
      end
    else
      # Rest of the countries: each one is a linked definition term.
      info.css('dt a').each { |country| MartialArts::Countries.filtered << country.text }
    end
  end
end
.import_popular ⇒ Object
77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/martial_arts/scraper.rb', line 77

# Scrapes a "10 most popular martial arts" article (popular martial arts,
# 2018) and hands every style name found there to .correct_errors together
# with the matching MartialArts::Styles instance.
#
# The page is fetched with URI.open (Kernel#open no longer opens URLs since
# Ruby 3.0) in block form so the downloaded handle is closed after parsing.
#
# @return [Array<String>] the style names extracted from the article
def self.import_popular
  doc = URI.open("http://www.singpatong-sitnumnoi.com/10-most-popular-martial-arts/") { |page| Nokogiri::HTML(page) }

  # Splitting the concatenated <strong> text yields a mixture of style and
  # country fragments; the style names sit at the odd positions.
  info = doc.css('strong').text.split(/No.\w*. | from /)
  styles = info.select.with_index { |_, index| index.odd? }

  styles.each do |style|
    wanted = style.downcase
    # The lookup is case-insensitive. It yields nil when no scraped style
    # carries this name; .correct_errors receives that nil unchanged.
    style_instance = MartialArts::Styles.all.find { |candidate| candidate.name.downcase == wanted }
    self.correct_errors(style_instance, style)
  end
end
.import_styles ⇒ Object
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/martial_arts/scraper.rb', line 59

# Turns every scraped data string in .all into a MartialArts::Styles instance
# and appends it to MartialArts::Styles.all, skipping names for which
# MartialArts::Styles.duplicates? reports an existing entry.
#
# Each data string has the form
# "style - country - fighting focus - website - description".
#
# @return [Array] the collection returned by .all
def self.import_styles
  self.all.each do |data_string|
    # Limit the split to five fields so a description that itself contains
    # " - " is kept whole instead of being cut at its first separator.
    # The fields are stored in instance variables because .correct_errors
    # overwrites some of them before the style is instantiated.
    @style, @country, @fighting_focus, @website, @description = data_string.split(" - ", 5)

    # duplicates? answers nil when the style has not been stored yet.
    next unless MartialArts::Styles.duplicates?(@style).nil?

    self.correct_errors(@style, @country, @fighting_focus, @website, @description)
    MartialArts::Styles.all << MartialArts::Styles.new(@style, @country, @fighting_focus, @website, @description)
  end
end
.scrape_data ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/martial_arts/scraper.rb', line 5

# Runs the whole scrape in four steps:
#
# 1. visit every style linked from the Wikipedia "List of martial arts" page
#    and append one "style - country - focus - website - description" string
#    per style to .all
# 2. instantiate the collected information (.import_styles)
# 3. retrieve the popular martial arts from a second website (.import_popular)
# 4. retrieve an easier-to-access country list (.import_countries)
#
# Pages are fetched with URI.open (Kernel#open no longer opens URLs since
# Ruby 3.0) in block form so each downloaded handle is closed after parsing.
# As before, the values of the style being processed are kept in the
# @style, @country, @focus, @website and @description instance variables.
#
# @return [Object] whatever .import_countries returns
def self.scrape_data
  base_url = "https://en.wikipedia.org"

  # Step 1: retrieve all style information from the individual Wikipedia pages.
  index = URI.open("#{base_url}/wiki/List_of_martial_arts") { |page| Nokogiri::HTML(page) }
  index.css('.div-col.columns.column-width li').each do |style|
    link = style.css('a')[0]
    # A list entry without a link has no page to scrape.
    next unless link && link['href']

    @website = "#{base_url}#{link['href']}"
    html = URI.open(@website) { |page| Nokogiri::HTML(page) }

    # Reset per style: a page lacking these infobox rows must not inherit the
    # values scraped for the previous style.
    @focus = nil
    @country = nil
    html.css('table.infobox tr').each do |info|
      heading = info.css('th').text
      @focus = info.css('td a').text if heading == "Focus"
      @country = info.css('td').text if heading == "Country of origin"
    end
    # Missing data is converted to N/A.
    @country = "N/A" if @country.nil? || @country.empty?
    @focus = "N/A" if @focus.nil? || @focus.empty?

    # Style name: page title without a generic "(martial art)" style suffix,
    # with the first letter of each word capitalised.
    title = html.css('h1').text.downcase.gsub(/(\s\Dmartial\sarts?\D)/, '')
    @style = title.split.map(&:capitalize).join(' ')

    # Description: first paragraph of reasonable length that mentions the style.
    description_info = html.css('div.mw-parser-output p').find do |paragraph|
      text = paragraph.text
      text.size > 20 && text.include?(@style)
    end
    @description =
      if description_info
        # Strip "[12]"-style citation markers and the first "listen"
        # pronunciation marker. A regex is required here: String#tr takes a
        # character set, so the earlier tr call deleted every "D", digit 1-9,
        # parenthesis and bracket from the text.
        description_info.text
                        .gsub(/\[\d+\]/, '')
                        .sub(/\([[:space:]]*listen[[:space:]]*\)|listen/, '')
      else
        "N/A"
      end

    # Put it all together into one string of data.
    self.all << "#{@style} - #{@country} - #{@focus} - #{@website} - #{@description}"
  end

  # Step 2: instantiate the information.
  self.import_styles
  # Step 3: retrieve popular martial arts from a website.
  self.import_popular
  # Step 4: retrieve the easier-to-access country list.
  self.import_countries
end