Class: MartialArts::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/martial_arts/scraper.rb

Constant Summary collapse

@@all =
[]

Class Method Summary collapse

Class Method Details

.all ⇒ Object



100
101
102
# File 'lib/martial_arts/scraper.rb', line 100

# Accessor for the class-level @@all collection.
#
# scrape_data appends one " - "-separated record string per scraped style to
# this collection, and import_styles iterates over it.
#
# @return [Array] the object stored in @@all (the same object every call, not a copy)
def self.all
  @@all
end

.correct_errors(info_1, info_2 = nil, info_3 = nil, info_4 = nil, info_5 = nil) ⇒ Object



104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/martial_arts/scraper.rb', line 104

# Applies hand-written corrections for scraped values that are known to be wrong.
#
# The method is called in two different ways inside this file:
#   * import_styles passes (style, country, fighting_focus, website, description)
#   * import_popular passes (style_instance, style)
#
# @param info_1 [Object] a style name String (import_styles) or the looked-up
#   style object, possibly nil (import_popular)
# @param info_2 [Object, nil] country (import_styles) or style name (import_popular)
# @param info_3 [Object, nil] accepted but not read by this method
# @param info_4 [Object, nil] accepted but not read by this method
# @param info_5 [Object, nil] accepted but not read by this method
# @return [Object, nil] whatever the final "Kung fu" append evaluates to, or nil
#   when info_2 is not "Kung fu"
def self.correct_errors(info_1, info_2 = nil, info_3 = nil, info_4 = nil, info_5 = nil)
  # Style-info fix: this particular page title is replaced with a proper style
  # name, country and focus. The values land in instance variables that
  # import_styles reads right after calling this method.
  if info_1 == "Karate In The United States"
    @style, @country, @fighting_focus = "American Karate", "United States", "Hybrid"
  end

  # import_popular path: a successfully resolved style instance is recorded as popular.
  if info_1.instance_of?(MartialArts::Styles)
    MartialArts::Styles.popular << info_1
  end

  # import_popular path: "Kung fu" covers many of the Chinese martial arts, so the
  # umbrella name is recorded instead.
  # Known issue: some styles link to pages about a people rather than the style
  # itself (example: Surma people) and their info prints out oddly.
  if info_2 == "Kung fu"
    MartialArts::Styles.popular << "Chinese Martial Arts"
  end
end

.import_countries ⇒ Object



90
91
92
93
94
95
96
97
98
# File 'lib/martial_arts/scraper.rb', line 90

# Scrapes the country names from Wikipedia's "List of martial arts" page and
# appends each one to MartialArts::Countries.filtered.
#
# The first matched column block lists countries as the second link of each
# <li>; every later block exposes them as links inside <dt> elements.
#
# @return [Object] the node set that was iterated (result of each_with_index)
# @raise [StandardError] network/HTTP errors from URI.open are not rescued here
def self.import_countries
  # URI.open (open-uri) replaces Kernel#open, which no longer accepts URLs on
  # Ruby 3.0+. The block form closes the handle once parsing is done.
  doc = URI.open("https://en.wikipedia.org/wiki/List_of_martial_arts") { |io| Nokogiri::HTML(io) }

  doc.css('.div-col.columns.column-width').each_with_index do |column, index|
    if index.zero?
      # First block: the country is the second link in each item.
      column.css('li').each do |item|
        country_link = item.css('a')[1]
        # Items with fewer than two links are skipped instead of raising NoMethodError.
        MartialArts::Countries.filtered << country_link.text if country_link
      end
    else
      # Remaining blocks: the country is the link inside each <dt>.
      column.css('dt a').each { |country_link| MartialArts::Countries.filtered << country_link.text }
    end
  end
end


77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/martial_arts/scraper.rb', line 77

# Scrapes a "10 most popular martial arts" article (2018) and hands every style
# named there to correct_errors, which records the popular styles.
#
# @return [Array<String>] the style names extracted from the page
# @raise [StandardError] network/HTTP errors from URI.open are not rescued here
def self.import_popular
  # URI.open (open-uri) replaces Kernel#open, which no longer accepts URLs on
  # Ruby 3.0+. The block form closes the handle once parsing is done.
  doc = URI.open("http://www.singpatong-sitnumnoi.com/10-most-popular-martial-arts/") { |io| Nokogiri::HTML(io) }

  # The <strong> text is a mixture of country and style; after splitting, the
  # odd-indexed fragments are treated as the style names.
  # NOTE(review): the dots in this pattern are unescaped and match any
  # character - confirm against the live page before tightening it.
  fragments = doc.css('strong').text.split(/No.\w*. | from /)
  style_names = fragments.select.with_index { |_, index| index.odd? }

  style_names.each do |style_name|
    wanted = style_name.downcase
    # May be nil when no imported style carries this name; correct_errors
    # only records objects that are MartialArts::Styles instances.
    style_instance = MartialArts::Styles.all.find { |candidate| candidate.name.downcase == wanted }
    self.correct_errors(style_instance, style_name)
  end
end

.import_styles ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/martial_arts/scraper.rb', line 59

# Turns every raw record in .all into a MartialArts::Styles instance.
#
# Each record has the form
#   "style - country - fighting focus - website - description"
# and is skipped when MartialArts::Styles.duplicates? reports the style name
# as already present (any non-nil result).
#
# @return [Object] the collection returned by .all (result of #each)
def self.import_styles
  self.all.each do |data_string|
    # The limit of 5 keeps a description that itself contains " - " in one
    # piece instead of cutting it off at the first separator.
    @style, @country, @fighting_focus, @website, @description = data_string.split(" - ", 5)

    next unless MartialArts::Styles.duplicates?(@style).nil?

    # correct_errors may overwrite @style/@country/@fighting_focus, so the
    # instance is built from the instance variables only after it has run.
    self.correct_errors(@style, @country, @fighting_focus, @website, @description)
    MartialArts::Styles.all << MartialArts::Styles.new(@style, @country, @fighting_focus, @website, @description)
  end
end

.scrape_data ⇒ Object



5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/martial_arts/scraper.rb', line 5

# Entry point of the scraper. Runs in four steps:
#   1. visits every style linked from Wikipedia's "List of martial arts" page
#      and appends one "style - country - focus - website - description"
#      record string per style to .all
#   2. instantiates the records (import_styles)
#   3. retrieves the popular martial arts from another website (import_popular)
#   4. retrieves an easier-to-access country list (import_countries)
#
# @return [Object] whatever import_countries returns
# @raise [StandardError] network/HTTP errors from URI.open are not rescued here
def self.scrape_data
  # URI.open (open-uri) replaces Kernel#open, which no longer accepts URLs on
  # Ruby 3.0+. The block form closes each handle once parsing is done.
  doc = URI.open("https://en.wikipedia.org/wiki/List_of_martial_arts") { |io| Nokogiri::HTML(io) }

  doc.css('.div-col.columns.column-width li').each do |style|
    link = style.css('a')[0]
    # List items without a usable link cannot be followed; skip them
    # instead of raising NoMethodError on nil.
    next unless link && link['href']

    # Website data: the style's own Wikipedia page, which is opened next.
    @website = "https://en.wikipedia.org#{link['href']}"
    html = URI.open(@website) { |io| Nokogiri::HTML(io) }

    # Reset per page so a style without these infobox rows does not inherit
    # the focus/country of the previously scraped style.
    @focus = nil
    @country = nil
    html.css('table.infobox tr').each do |info|
      heading = info.css('th').text
      @focus = info.css('td a').text if heading == "Focus"
      @country = info.css('td').text if heading == "Country of origin"
    end

    # Style data: strip generic words such as " (martial art)" from the page
    # title, then capitalize the first letter of each word.
    style_edit = html.css('h1').text.downcase.gsub(/(\s\Dmartial\sarts?\D)/, '')
    @style = style_edit.split.map(&:capitalize).join(' ')

    # Description data: first paragraph of reasonable length that mentions the style.
    description_info = html.css('div.mw-parser-output p').find do |paragraph|
      text = paragraph.text
      text.size > 20 && text.include?(@style)
    end

    @description =
      if description_info.nil?
        "N/A"
      else
        # Remove numeric footnote markers such as "[1]". The previous String#tr
        # call treated its argument as a character set and therefore deleted
        # every "(", ")", "[", "]", "D" and digit 1-9 from the text.
        # Then remove the first occurrence of "listen".
        description_info.text.gsub(/\[\d+\]/, '').sub('listen', '')
      end

    # Missing data is converted to N/A.
    @country = "N/A" if @country.nil? || @country.empty?
    @focus = "N/A" if @focus.nil? || @focus.empty?

    # Put it all together into one string of data.
    self.all << "#{@style} - #{@country} - #{@focus} - #{@website} - #{@description}"
  end

  # 2: instantiate the information
  self.import_styles
  # 3: retrieve popular martial arts from a website
  self.import_popular
  # 4: retrieve the easier-to-access country list
  self.import_countries
end