Class: Chinese::Scraper
- Inherits:
-
Object
- Object
- Chinese::Scraper
- Includes:
- HelperMethods, WithValidations
- Defined in:
- lib/chinese_vocab/scraper.rb
Constant Summary collapse
# Scraping recipes, keyed by source name. Every recipe has the same keys:
#   :url         - search URL; `sentences` appends the CGI-escaped word to it.
#   :parent_sel  - CSS selector passed to Nokogiri's `css` to find the container(s).
#   :cn_sel      - XPath for one half of a sentence pair.
#   :en_sel      - XPath for the other half (`sentences` joins both with `|`).
#   :select_pair - lambda(node1, node2): true when two neighbouring nodes form a pair.
#   :text_sel    - selector applied to each node to pull out its text.
#   :reorder     - lambda(text1, text2): returns the two texts as [cn, en].
Sources = {
  nciku: {
    url: "http://www.nciku.com/search/all/examples/",
    parent_sel: "div.examples_box > dl",
    cn_sel: "//dt/span[1]",
    en_sel: "//dd/span[@class='tc_sub']",
    # Only cn/en sentence pairs where the second node has a class 'tc_sub' belong together.
    select_pair: ->(node1, node2) { node1['class'] != "tc_sub" && node2['class'] == "tc_sub" },
    # Just return the text stored in the node. :text_sel is mainly intended for jukuu (see below).
    text_sel: "text()",
    # We want cn first, en second, but nciku does not return cn/en sentence pairs in a strict order.
    reorder: ->(text1, text2) { is_unicode?(text2) ? [text2, text1] : [text1, text2] }
  },
  jukuu: {
    url: "http://www.jukuu.com/search.php?q=",
    parent_sel: "table#Table1 table[width = '680']",
    cn_sel: "//tr[@class='c']",
    en_sel: "//tr[@class='e']",
    # Only cn/en sentence pairs where the first node has a class 'e' belong together.
    select_pair: ->(node1, node2) { node1['class'] == "e" && node2['class'] != "e" },
    text_sel: "td[2]",
    # The 'e' row comes first in a selected pair, so swap to get [cn, en].
    reorder: ->(text1, text2) { [text2, text1] }
  }
}
# Recognised option keys. Each value is a two-element array: a value listed
# first (presumably the default - confirm in WithValidations) followed by a
# lambda that returns true when a supplied value is acceptable.
OPTIONS = {
  source: [:nciku, ->(value) { Sources.key?(value) }],
  size: [:short, ->(value) { [:short, :average, :long].include?(value) }]
}
Instance Attribute Summary collapse
-
#sentences ⇒ Object
Returns the value of attribute sentences.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#word ⇒ Object
readonly
Returns the value of attribute word.
Class Method Summary collapse
- .average_size(sentence_pairs) ⇒ Object
- .longest_size(sentence_pairs) ⇒ Object
-
.pair_with_empty_string?(pair) ⇒ Boolean
Helper method: returns true when either string in the pair is empty.
- .sentence(word, options = {}) ⇒ Object
- .sentence_times_longer_than_word?(sentence, word, factor) ⇒ Boolean
-
.sentences(word, options = {}) ⇒ Object
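Options: source => a key of Sources (:nciku or :jukuu); OPTIONS lists :nciku first, presumably as the default. The :size option (:short, :average, :long) is only read by .sentence.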
- .shortest_size(sentence_pairs) ⇒ Object
Methods included from HelperMethods
#distinct_words, #include_every_char?, included, #is_unicode?
Instance Attribute Details
#sentences ⇒ Object
Returns the value of attribute sentences.
16 17 18 |
# File 'lib/chinese_vocab/scraper.rb', line 16
# Attribute reader: hands back whatever is currently stored in @sentences.
def sentences
  @sentences
end
#source ⇒ Object (readonly)
Returns the value of attribute source.
15 16 17 |
# File 'lib/chinese_vocab/scraper.rb', line 15
# Read-only attribute: hands back whatever is currently stored in @source.
def source
  @source
end
#word ⇒ Object (readonly)
Returns the value of attribute word.
15 16 17 |
# File 'lib/chinese_vocab/scraper.rb', line 15
# Read-only attribute: hands back whatever is currently stored in @word.
def word
  @word
end
Class Method Details
.average_size(sentence_pairs) ⇒ Object
149 150 151 152 153 |
# File 'lib/chinese_vocab/scraper.rb', line 149
# Picks the pair whose first element (the Chinese sentence) has the median length.
#
# sentence_pairs - Array of [cn, en] string pairs.
#
# Returns the pair in the middle of the length-sorted list (the upper middle
# one for an even count), or nil when the list is empty.
def self.average_size(sentence_pairs)
  sorted = sentence_pairs.sort_by { |(cn, _)| cn.length }
  # Select by position in the sorted list. Comparing a sentence's character
  # count against half the number of pairs mixes two unrelated quantities:
  # it yields the shortest sentence for small lists and nil whenever every
  # sentence is shorter than count / 2.
  sorted[sorted.length / 2]
end
.longest_size(sentence_pairs) ⇒ Object
145 146 147 |
# File 'lib/chinese_vocab/scraper.rb', line 145
# Picks the pair whose first element (the Chinese sentence) is longest.
#
# sentence_pairs - Array of [cn, en] string pairs.
#
# Returns the pair with the greatest cn length, or nil for an empty list.
def self.longest_size(sentence_pairs)
  # max_by scans once instead of sorting the whole list to read one element.
  sentence_pairs.max_by { |(cn, _)| cn.length }
end
.pair_with_empty_string?(pair) ⇒ Boolean
Helper method: returns true when either string in the pair is empty.
130 131 132 |
# File 'lib/chinese_vocab/scraper.rb', line 130
# Tells whether one of the first two entries of +pair+ is an empty string.
#
# pair - two-element collection indexed with [0] and [1]; both entries must
#        respond to #empty?.
#
# Returns true or false.
def self.pair_with_empty_string?(pair)
  # The second entry is only inspected when the first one has content.
  return true if pair[0].empty?

  pair[1].empty?
end
.sentence(word, options = {}) ⇒ Object
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
# File 'lib/chinese_vocab/scraper.rb', line 109
# Scrapes example sentences for +word+ and returns a single pair chosen by length.
#
# word    - the word to look up; handed on to .sentences unchanged.
# options - Hash of options, handed on to .sentences as well. The :size entry
#           (:short, :average or :long) is obtained through `validate { :size }`.
#           NOTE(review): the parameter must keep the name `options` -
#           presumably `validate` looks it up through the block's binding;
#           confirm in WithValidations.
#
# Returns the pair selected by shortest_size / average_size / longest_size,
# [] when nothing was scraped, or nil when the validated size is none of the
# three known values.
def self.sentence(word, options = {})
  size = validate { :size }

  scraped_sentences = sentences(word, options)
  return [] if scraped_sentences.empty?

  case size
  when :short   then shortest_size(scraped_sentences)
  when :average then average_size(scraped_sentences)
  when :long    then longest_size(scraped_sentences)
  end
end
.sentence_times_longer_than_word?(sentence, word, factor) ⇒ Boolean
135 136 137 138 139 |
# File 'lib/chinese_vocab/scraper.rb', line 135
# Checks whether +sentence+ contains at least +factor+ times as many word
# characters as +word+. Only characters matching \p{Word} are counted, so
# punctuation and whitespace do not contribute to either length.
#
# sentence - String to measure.
# word     - String used as the yardstick.
# factor   - Numeric multiplier applied to the word's character count.
#
# Returns true or false.
def self.sentence_times_longer_than_word?(sentence, word, factor)
  word_char = /\p{Word}/
  sentence_length = sentence.scan(word_char).size
  required_length = factor * word.scan(word_char).size
  sentence_length >= required_length
end
.sentences(word, options = {}) ⇒ Object
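Options: source => a key of Sources (:nciku or :jukuu); OPTIONS lists :nciku first, presumably as the default. The :size option (:short, :average, :long) is only read by .sentence.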
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
# File 'lib/chinese_vocab/scraper.rb', line 47
# Downloads the example-sentence page for +word+ from the selected source and
# extracts the sentence pairs from it.
#
# word    - the word to search for; it is CGI-escaped and appended to the
#           source's :url.
# options - Hash of options. The :source entry (a key of Sources) is obtained
#           through `validate { :source }`.
#           NOTE(review): the parameter must keep the name `options` -
#           presumably `validate` looks it up through the block's binding;
#           confirm in WithValidations.
#
# Returns an Array of [cn_sentence, en_sentence] pairs; [] when the page has
# no matching container or no matching sentence nodes.
# Raises Timeout::Error when the download takes longer than 30 seconds.
def self.sentences(word, options = {})
  download_source = validate { :source }
  source = Sources[download_source]

  CGI.accept_charset = 'UTF-8'
  # Note: Use + because << changes the object on its left hand side, but + doesn't:
  # http://stackoverflow.com/questions/377768/string-concatenation-and-ruby/378258#378258
  url = source[:url] + CGI.escape(word)

  # Download exactly once and keep the request inside the timeout.
  # http://ruby-doc.org/stdlib-1.9.2/libdoc/timeout/rdoc/Timeout.html#method-c-timeout
  # open-uri stopped patching Kernel#open for URLs in Ruby 3.0, so prefer
  # URI.open where it exists and fall back to Kernel#open on older Rubies.
  content = Timeout.timeout(30) do
    URI.respond_to?(:open) ? URI.open(url) : open(url)
  end

  main_node = Nokogiri::HTML(content).css(source[:parent_sel])
  return [] if main_node.to_a.empty?

  # CSS selector: Returns the tags in the order they are specified
  # XPath selector: Return the tags in the order they appear in the document (that's what we want here).
  # Source: http://stackoverflow.com/questions/5825136/nokogiri-and-finding-element-by-name/5845985#5845985
  target_nodes = main_node.search("#{source[:cn_sel]} | #{source[:en_sel]}")
  return [] if target_nodes.to_a.empty?

  # In order to make sure we only return text that also has a translation,
  # we need to first group each target node with Array#overlap_pairs like this:
  # Input:  [cn1, cn2, en2, cn3, en3, cn4]
  # Output: [[cn1,cn2],[cn2,en2],[en2,cn3],[cn3,en3],[en3,cn4]]
  # and then select the correct pairs: [[cn2,en2],[cn3,en3]].
  # Regarding #to_a: Nokogiri::XML::NodeSet => Array
  node_pairs = target_nodes.to_a.overlap_pairs.select do |(node1, node2)|
    source[:select_pair].call(node1, node2)
  end

  # 'text' returns an empty string when 'css' returns an empty array.
  sentence_pairs = node_pairs.map do |first_node, second_node|
    [first_node.css(source[:text_sel]).text.strip,
     second_node.css(source[:text_sel]).text.strip]
  end
  # Ensure that both halves of a pair have text (sometimes they don't).
  sentence_pairs = sentence_pairs.reject { |pair| pair_with_empty_string?(pair) }

  # Switch position of each pair if the first entry is the translation,
  # as we always return an array of [cn_sentence,en_sentence] pairs.
  # The following step is necessary because:
  # 1) Jukuu returns sentences in the order English first, Chinese second
  # 2) Nciku mostly returns sentences in the order Chinese first, English second
  #    (but sometimes it is the other way round.)
  sentence_pairs = sentence_pairs.map { |text1, text2| source[:reorder].call(text1, text2) }

  # Only select Chinese sentences that don't separate words, e.g., skip all sentences like the following:
  # 北边 => 树林边的河流向北方
  sentence_pairs = sentence_pairs.select { |cn, _| include_every_char?(word, cn) }

  # Only select Chinese sentences that are at least x times longer than the word (counting character length),
  # as sometimes only the word itself is listed as a sentence (or a short expression that does not really
  # count as a sentence).
  # Exception: If the result is an empty array (= none of the sentences fulfill the length constraint)
  # then just return the sentences selected so far.
  long_enough = sentence_pairs.select { |cn, _| sentence_times_longer_than_word?(cn, word, 2.2) }
  long_enough.empty? ? sentence_pairs : long_enough
end
.shortest_size(sentence_pairs) ⇒ Object
141 142 143 |
# File 'lib/chinese_vocab/scraper.rb', line 141
# Picks the pair whose first element (the Chinese sentence) is shortest.
#
# sentence_pairs - Array of [cn, en] string pairs.
#
# Returns the pair with the smallest cn length, or nil for an empty list.
def self.shortest_size(sentence_pairs)
  # min_by scans once instead of sorting the whole list to read one element.
  sentence_pairs.min_by { |(cn, _)| cn.length }
end