Class: Chinese::Scraper

Inherits:

Object

Object
Chinese::Scraper

Includes: HelperMethods, WithValidations

Defined in: lib/chinese_vocab/scraper.rb

Constant Summary collapse

# Scraping configuration for each supported site, keyed by source name.
#
# Every entry carries the same keys:
#   url         - search URL prefix; the escaped word is appended to it
#   parent_sel  - CSS selector for the container holding the example sentences
#   cn_sel      - XPath selector for one half of a sentence pair
#   en_sel      - XPath selector for the other half of a sentence pair
#   select_pair - lambda deciding whether two neighbouring nodes form a sentence pair
#   text_sel    - selector applied to a node to extract its text
#   reorder     - lambda returning the two texts in the order they should be stored
Sources = {
  nciku: {
    url:         "http://www.nciku.com/search/all/examples/",
    parent_sel:  "div.examples_box > dl",
    cn_sel:      "//dt/span[1]",
    en_sel:      "//dd/span[@class='tc_sub']",
    # Two neighbouring nodes belong together only when the second one (and not the first)
    # carries the class 'tc_sub'.
    select_pair: ->(node1, node2) { node1['class'] != "tc_sub" && node2['class'] == "tc_sub" },
    # The text sits directly in the node; :text_sel matters mostly for jukuu.
    text_sel:    "text()",
    # nciku does not deliver its pairs in a strict order, so swap the texts whenever
    # is_unicode?(text2) holds.
    reorder:     ->(text1, text2) { is_unicode?(text2) ? [text2, text1] : [text1, text2] }
  },
  jukuu: {
    url:         "http://www.jukuu.com/search.php?q=",
    parent_sel:  "table#Table1 table[width = '680']",
    cn_sel:      "//tr[@class='c']",
    en_sel:      "//tr[@class='e']",
    # Two neighbouring nodes belong together only when the first one (and not the second)
    # carries the class 'e'.
    select_pair: ->(node1, node2) { node1['class'] == "e" && node2['class'] != "e" },
    text_sel:    "td[2]",
    # jukuu pairs are always swapped.
    reorder:     ->(text1, text2) { [text2, text1] }
  }
}

# Known options. Each key maps to a two-element array: a symbol followed by a lambda that
# reports whether a given value is acceptable for that option.
# NOTE(review): the leading symbol is presumably the default applied when the option is
# not supplied — confirm against the validation helper that consumes this table.
OPTIONS = {
  :source => [:nciku, ->(value) { Sources.key?(value) }],
  :size   => [:short, ->(value) { [:short, :average, :long].include?(value) }]
}

Instance Attribute Summary collapse

#sentences ⇒ Object

Returns the value of attribute sentences.
#source ⇒ Object readonly

Returns the value of attribute source.
#word ⇒ Object readonly

Returns the value of attribute word.

Class Method Summary collapse

.average_size(sentence_pairs) ⇒ Object
.longest_size(sentence_pairs) ⇒ Object
.pair_with_empty_string?(pair) ⇒ Boolean

Helper methods.
.sentence(word, options = {}) ⇒ Object
.sentence_times_longer_than_word?(sentence, word, factor) ⇒ Boolean
.sentences(word, options = {}) ⇒ Object

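Options: size => [:short, :average, :long], default = :average. (Note: the code listed for this method under Class Method Details validates only the :source option; :size is validated by .sentence, and OPTIONS lists :short, not :average, next to the :size validator, so the stated default should be confirmed.)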
.shortest_size(sentence_pairs) ⇒ Object

Methods included from HelperMethods

#distinct_words, #include_every_char?, included, #is_unicode?

Instance Attribute Details

#sentences ⇒ `Object`

Returns the value of attribute sentences.



16
17
18

# File 'lib/chinese_vocab/scraper.rb', line 16

# Reader for the +sentences+ attribute.
#
# @return [Object] whatever is currently stored in @sentences
def sentences
  @sentences
end

#source ⇒ `Object` (readonly)

Returns the value of attribute source.



15
16
17

# File 'lib/chinese_vocab/scraper.rb', line 15

# Reader for the +source+ attribute.
#
# @return [Object] whatever is currently stored in @source
def source
  @source
end

#word ⇒ `Object` (readonly)

Returns the value of attribute word.



15
16
17

# File 'lib/chinese_vocab/scraper.rb', line 15

# Reader for the +word+ attribute.
#
# @return [Object] whatever is currently stored in @word
def word
  @word
end

Class Method Details

.average_size(sentence_pairs) ⇒ `Object`

# File 'lib/chinese_vocab/scraper.rb', line 149

# Picks the sentence pair whose first element has the median length.
#
# @param sentence_pairs [Array<Array>] pairs whose first element responds to #length
# @return [Array, nil] the pair in the middle of the length-ordered list (the upper-middle
#   one when the number of pairs is even), or nil when +sentence_pairs+ is empty
def self.average_size(sentence_pairs)
  sorted = sentence_pairs.sort_by { |(cn, _)| cn.length }
  # Select by position in the ordered list. A sentence's character count and the number
  # of pairs are unrelated quantities, so they must not be compared with each other.
  sorted[sorted.length / 2]
end

.longest_size(sentence_pairs) ⇒ `Object`



145
146
147

# File 'lib/chinese_vocab/scraper.rb', line 145

# Picks the sentence pair whose first element is the longest.
#
# @param sentence_pairs [Array<Array>] pairs whose first element responds to #length
# @return [Array, nil] the pair with the longest first element, or nil when
#   +sentence_pairs+ is empty
def self.longest_size(sentence_pairs)
  # A single pass is enough; there is no need to sort the whole list to find its maximum.
  sentence_pairs.max_by { |(cn, _)| cn.length }
end

.pair_with_empty_string?(pair) ⇒ `Boolean`

Helper methods

Returns:

(Boolean)



130
131
132

# File 'lib/chinese_vocab/scraper.rb', line 130

# Tells whether either of the first two entries of +pair+ is empty.
#
# @param pair [Array] a two-element pair whose entries respond to #empty?
# @return [Boolean] true as soon as one of the two entries is empty, false otherwise
def self.pair_with_empty_string?(pair)
  [0, 1].any? { |position| pair[position].empty? }
end

.sentence(word, options = {}) ⇒ `Object`

# File 'lib/chinese_vocab/scraper.rb', line 109

# Returns a single sentence pair for +word+, chosen by length category.
#
# @param word [String] the word to look up; forwarded unchanged to .sentences
# @param options [Hash] forwarded unchanged to .sentences.
#   NOTE(review): `validate { :size }` presumably resolves the :size option from this hash
#   (validate is defined outside this listing) — confirm.
# @return [Array, nil] [] when no sentences were scraped; otherwise the pair chosen by
#   shortest_size, average_size or longest_size; nil when the validated size is none of
#   :short, :average, :long
def self.sentence(word, options={})
  size = validate { :size }

  candidates = sentences(word, options)
  return [] if candidates.empty?

  # Map each size category to the class method that picks the matching pair.
  picker = {
    :short   => :shortest_size,
    :average => :average_size,
    :long    => :longest_size
  }[size]

  picker && send(picker, candidates)
end

.sentence_times_longer_than_word?(sentence, word, factor) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/chinese_vocab/scraper.rb', line 135

# Checks whether +sentence+ has at least +factor+ times as many word characters as +word+.
# Only characters matching \p{Word} are counted on either side.
#
# @param sentence [String] the candidate sentence
# @param word [String] the word the sentence is measured against
# @param factor [Numeric] multiplier applied to the word's character count
# @return [Boolean] true when the sentence's count reaches factor * the word's count
def self.sentence_times_longer_than_word?(sentence, word, factor)
  sentence_count, word_count = [sentence, word].map { |text| text.scan(/\p{Word}/).length }
  sentence_count >= (factor * word_count)
end

.sentences(word, options = {}) ⇒ `Object`

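Options: size => [:short, :average, :long], default = :average. (Note: the code listed below validates only the :source option; :size is validated by .sentence, and OPTIONS lists :short, not :average, next to the :size validator, so the stated default should be confirmed.)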

# File 'lib/chinese_vocab/scraper.rb', line 47

# Downloads the example sentences for +word+ from the selected source and returns them as
# [cn_sentence, en_sentence] pairs.
#
# @param word [String] the word to look up; it is CGI-escaped and appended to the source URL
# @param options [Hash] not read directly in this method.
#   NOTE(review): `validate { :source }` presumably resolves the :source option from this
#   hash (validate is defined outside this listing) — confirm.
# @return [Array<Array<String>>] [cn_sentence, en_sentence] pairs; [] when the page contains
#   no matching nodes
# @raise [Timeout::Error] when the download takes longer than 30 seconds
def self.sentences(word, options={})
  download_source = validate { :source }

  source = Sources[download_source]

  CGI.accept_charset = 'UTF-8'
  # Note: Use + because << changes the object on its left hand side, but + doesn't:
  # http://stackoverflow.com/questions/377768/string-concatenation-and-ruby/378258#378258
  url       = source[:url] + CGI.escape(word)
  # Download the page exactly once, and only inside the timeout guard, so that a hanging
  # server cannot block the caller for longer than 30 seconds.
  # http://ruby-doc.org/stdlib-1.9.2/libdoc/timeout/rdoc/Timeout.html#method-c-timeout
  # NOTE(review): opening a URL through Kernel#open relies on open-uri and is no longer
  # supported on Ruby 3.0+ (URI.open is the replacement) — confirm the target Ruby version.
  content   = Timeout.timeout(30) { open(url) }
  main_node = Nokogiri::HTML(content).css(source[:parent_sel])
  return [] if main_node.to_a.empty?

  # CSS selector:   Returns the tags in the order they are specified
  # XPath selector: Return the tags in the order they appear in the document (that's what we want here).
  # Source:         http://stackoverflow.com/questions/5825136/nokogiri-and-finding-element-by-name/5845985#5845985
  target_nodes = main_node.search("#{source[:cn_sel]} | #{source[:en_sel]}")
  return [] if target_nodes.to_a.empty?

  # In order to make sure we only return text that also has a translation,
  # we need to first group each target node with Array#overlap_pairs like this:
  # Input:  [cn1, cn2, en2, cn3, en3, cn4]
  # Output: [[cn1,cn2],[cn2,en2],[en2,cn3],[cn3,en3],[en3,cn4]]
  # and then select the correct pairs: [[cn2,en2],[cn3,en3]].
  # Regarding #to_a: Nokogiri::XML::NodeSet => Array
  node_pairs = target_nodes.to_a.overlap_pairs.select { |(node1, node2)| source[:select_pair].call(node1, node2) }

  # Extract the text of both nodes.
  # 'text' returns an empty string when 'css' returns an empty array.
  sentence_pairs = node_pairs.map do |(cn_node, en_node)|
    [cn_node.css(source[:text_sel]).text.strip, en_node.css(source[:text_sel]).text.strip]
  end
  # Ensure that both the chinese and english selector have text (sometimes they don't).
  sentence_pairs = sentence_pairs.reject { |pair| pair_with_empty_string?(pair) }

  # Switch position of each pair if the first entry is the translation,
  # as we always return an array of [cn_sentence,en_sentence] pairs.
  # The following step is necessary because:
  # 1) Jukuu returns sentences in the order English first, Chinese second
  # 2) Nciku mostly returns sentences in the order Chinese first, English second
  #    (but sometimes it is the other way round.)
  sentence_pairs = sentence_pairs.map { |text1, text2| source[:reorder].call(text1, text2) }
  # Only select Chinese sentences that don't separate words, e.g., skip all sentences like the following:
  # 北边 => 树林边的河流向北方
  sentence_pairs = sentence_pairs.select { |cn, _| include_every_char?(word, cn) }

  # Only select Chinese sentences that are at least x times longer than the word (counting character length),
  # as sometimes only the word itself is listed as a sentence (or a short expression that does not really
  # count as a sentence).
  # Exception: If the result is an empty array (= none of the sentences fulfill the length constraint)
  # then just return the sentences selected so far.
  long_enough_pairs = sentence_pairs.select { |cn, _| sentence_times_longer_than_word?(cn, word, 2.2) }

  if long_enough_pairs.empty?
    sentence_pairs
  else
    long_enough_pairs
  end
end

.shortest_size(sentence_pairs) ⇒ `Object`



141
142
143

# File 'lib/chinese_vocab/scraper.rb', line 141

# Picks the sentence pair whose first element is the shortest.
#
# @param sentence_pairs [Array<Array>] pairs whose first element responds to #length
# @return [Array, nil] the pair with the shortest first element, or nil when
#   +sentence_pairs+ is empty
def self.shortest_size(sentence_pairs)
  # A single pass is enough; there is no need to sort the whole list to find its minimum.
  sentence_pairs.min_by { |(cn, _)| cn.length }
end

Class: Chinese::Scraper

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Helper methods.

Methods included from HelperMethods

Instance Attribute Details

#sentences ⇒ Object

#source ⇒ Object (readonly)

#word ⇒ Object (readonly)

Class Method Details

.average_size(sentence_pairs) ⇒ Object

.longest_size(sentence_pairs) ⇒ Object

.pair_with_empty_string?(pair) ⇒ Boolean

.sentence(word, options = {}) ⇒ Object

.sentence_times_longer_than_word?(sentence, word, factor) ⇒ Boolean

.sentences(word, options = {}) ⇒ Object

.shortest_size(sentence_pairs) ⇒ Object