Class: Tml::Tokenizers::Dom
- Inherits: Object
- Defined in: lib/tml/tokenizers/dom.rb
Instance Attribute Summary
- #context ⇒ Object: Returns the value of attribute context.
- #options ⇒ Object: Returns the value of attribute options.
- #tokens ⇒ Object: Returns the value of attribute tokens.
Instance Method Summary
- #adjust_name(node) ⇒ Object
- #between_separators?(node) ⇒ Boolean
- #container_node?(node) ⇒ Boolean
- #contextualize(name, context) ⇒ Object
- #debug(doc) ⇒ Object
- #debug_translation(translation) ⇒ Object
- #debug_tree(node, depth) ⇒ Object
- #empty_string?(tml) ⇒ Boolean
- #generate_data_tokens(text) ⇒ Object
- #generate_html_token(node, value = nil) ⇒ Object
- #generate_tml_tags(node) ⇒ Object
- #has_child_nodes?(node) ⇒ Boolean
- #has_inline_or_text_siblings?(node) ⇒ Boolean
- #ignored_node?(node) ⇒ Boolean
- #initialize(context = {}, options = {}) ⇒ Dom (constructor): A new instance of Dom.
- #inline_node?(node) ⇒ Boolean
- #no_translate_node?(node) ⇒ Boolean
- #node_info(node) ⇒ Object
- #non_translatable_node?(node) ⇒ Boolean
- #only_child?(node) ⇒ Boolean
- #option(name) ⇒ Object
- #reset_context ⇒ Object
- #sanitize_value(value) ⇒ Object
- #self_closing_node?(node) ⇒ Boolean
- #separator_node?(node) ⇒ Boolean
- #short_token?(token, value) ⇒ Boolean
- #translate(doc) ⇒ Object
- #translate_tml(tml) ⇒ Object
- #translate_tree(node) ⇒ Object
- #valid_text_node?(node) ⇒ Boolean
Constructor Details
#initialize(context = {}, options = {}) ⇒ Dom
Returns a new instance of Dom.
# File 'lib/tml/tokenizers/dom.rb', line 41

def initialize(context = {}, options = {})
  self.context = context
  self.options = options
  reset_context
end
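A minimal usage sketch (not from the library docs): it assumes a configured Tml application and default translator options, and passes a hypothetical 'debug' option so that strings are echoed through 'debug_format' instead of being sent to the translation service.

  require 'tml'
  require 'nokogiri'

  # Empty token context; options not passed here fall back to
  # Tml.config.translator_option(name).
  tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)

  # Accepts a raw HTML string (parsed with Nokogiri::HTML.fragment) or a parsed node.
  tokenizer.translate('<p>Hello <strong>World</strong></p>')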
Instance Attribute Details
#context ⇒ Object
Returns the value of attribute context.
# File 'lib/tml/tokenizers/dom.rb', line 39

def context
  @context
end
#options ⇒ Object
Returns the value of attribute options.
# File 'lib/tml/tokenizers/dom.rb', line 39

def options
  @options
end
#tokens ⇒ Object
Returns the value of attribute tokens.
# File 'lib/tml/tokenizers/dom.rb', line 39

def tokens
  @tokens
end
Instance Method Details
#adjust_name(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 319

def adjust_name(node)
  name = node.name.downcase
  map = option('name_mapping')
  map[name.to_sym] ? map[name.to_sym] : name
end
#between_separators?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 132

def between_separators?(node)
  (separator_node?(node.previous_sibling) and !valid_text_node?(node.next_sibling)) or
  (separator_node?(node.next_sibling) and !valid_text_node?(node.previous_sibling))
end
#container_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 208

def container_node?(node)
  node.type == 1 && !inline_node?(node)
end
#contextualize(name, context) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 325

def contextualize(name, context)
  if self.tokens[name] and self.tokens[name] != context
    index = 0
    matches = name.match(/\d+$/)
    if matches and matches.length > 0
      index = matches[matches.length-1].to_i
      name = name.gsub(index.to_s, '')
    end
    name += (index + 1).to_s
    return contextualize(name, context)
  end

  self.tokens[name] = context
  name
end
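For illustration: reusing a token name with a different context appends a numeric suffix, while an identical context reuses the existing name. The 'link' name and href values below are made up for the example.

  require 'tml'

  tokenizer = Tml::Tokenizers::Dom.new
  tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => "link"
  tokenizer.contextualize('link', "<a href='/b'>{$0}</a>")  # => "link1"
  tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => "link" (same context, name reused)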
#debug(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 341

def debug(doc)
  self.doc = doc
  debug_tree(self.doc, 0)
end
#debug_translation(translation) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 162

def debug_translation(translation)
  option('debug_format').gsub('{$0}', translation)
end
#debug_tree(node, depth) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 346

def debug_tree(node, depth)
  padding = ('=' * (depth+1))

  Tml.logger.log(padding + '=> ' + (node) + ': ' + node_info(node))

  (node.children || []).each do |child|
    debug_tree(child, depth+1)
  end
end
#empty_string?(tml) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 166

def empty_string?(tml)
  tml = tml.gsub(/[\s\n\r\t]/, '').gsub(/[\u0080-\u00ff]/, '')
  return true if tml == ''
  return true if tml.match(/\A\$\{[^\}]+\}\z/) # ignore variables ${var_name}
  return true if tml.match(/\A\$?\d+\.?\d+\z/) # ignore prices and numbers
  false
end
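A few illustrative inputs; the tokenizer skips strings like these before attempting translation.

  require 'tml'

  tokenizer = Tml::Tokenizers::Dom.new
  tokenizer.empty_string?("  \n\t ")        # => true  (whitespace only)
  tokenizer.empty_string?('${user_name}')   # => true  (variable placeholder)
  tokenizer.empty_string?('$19.99')         # => true  (price / number)
  tokenizer.empty_string?('Hello world')    # => false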
#generate_data_tokens(text) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 235

def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    matches = text.scan(option('data_tokens.special.regex'))
    matches.each do |match|
      token = match[1, - 2]
      self.context[token] = match
      text = text.gsub(match, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    formats = option('data_tokens.date.formats')

    formats.each do |format|
      regex = format[0]
      # date_format = format[1]

      matches = text.scan(regex)
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          date = match.first
          token = self.contextualize(token_name, date)
          replacement = "{#{token}}"
          text = text.gsub(date, replacement)
        end
      end
    end
  end

  rules = option('data_tokens.rules')
  if rules
    rules.each do |rule|
      next unless rule[:enabled]
      matches = text.scan(rule[:regex])

      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''

          value = match.first.strip

          unless value == ''
            token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
            text = text.gsub(value, value.gsub(value, "{#{token}}"))
          end
        end
      end
    end
  end

  text
end
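Because the behavior here is driven entirely by the 'data_tokens.*' options, a self-contained sketch of the underlying idea may be clearer than a live call. The regex and the 'date' token name below are made up for illustration; the real method registers values through contextualize.

  # Replace matched values with {token} placeholders and remember the original
  # value per token name, mirroring what generate_data_tokens does.
  text   = 'Your trial ends on 12/31/2024'
  tokens = {}

  text = text.gsub(/\d{1,2}\/\d{1,2}\/\d{4}/) do |date|
    tokens['date'] = date        # generate_data_tokens uses contextualize here
    '{date}'
  end

  text    # => "Your trial ends on {date}"
  tokens  # => {"date"=>"12/31/2024"}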
#generate_html_token(node, value = nil) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 288

def generate_html_token(node, value = nil)
  name = node.name.downcase
  attributes = node.attributes
  attributes_hash = {}
  value = (!value ? '{$0}' : value)

  if attributes.length == 0
    if self_closing_node?(node)
      return '<' + name + '/>' if %w(br hr).index(name)
      return '<' + name + '>' + '</' + name + '>'
    end
    return '<' + name + '>' + value + '</' + name + '>'
  end

  attributes.each do |name, attribute|
    attributes_hash[name] = attribute.value
  end

  keys = attributes_hash.keys.sort

  attr = []
  keys.each do |key|
    quote = attributes_hash[key].index("'") ? '"' : "'"
    attr << (key + '=' + quote + attributes_hash[key] + quote)
  end
  attr = attr.join(' ')

  return '<' + name + ' ' + attr + '>' + '</' + name + '>' if self_closing_node?(node)
  '<' + name + ' ' + attr + '>' + value + '</' + name + '>'
end
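For example, assuming Nokogiri (which #translate uses to parse HTML strings) and a made-up link:

  require 'tml'
  require 'nokogiri'

  node      = Nokogiri::HTML.fragment("<a href='/about'>About us</a>").children.first
  tokenizer = Tml::Tokenizers::Dom.new

  tokenizer.generate_html_token(node)
  # => "<a href='/about'>{$0}</a>"

  tokenizer.generate_html_token(node, 'About us')
  # => "<a href='/about'>About us</a>"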
#generate_tml_tags(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 137

def generate_tml_tags(node)
  buffer = ''
  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    else
      buffer += generate_tml_tags(child)
    end
  end

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return '{' + token + '}' if self_closing_node?(node)
  # return '[' + token + ': ' + value + ']' if short_token?(token, value)

  '<' + token + '>' + value + '</' + token + '>'
end
#has_child_nodes?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 128

def has_child_nodes?(node)
  node.children and node.children.length > 0
end
#has_inline_or_text_siblings?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 188

def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.each do |child|
    unless child == node
      return true if inline_node?(child) || valid_text_node?(child)
    end
  end

  false
end
#ignored_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 216

def ignored_node?(node)
  return true if (node.type != 1)
  (option('nodes.ignored') || []).index(node.name.downcase)
end
#inline_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 200

def inline_node?(node)
  (
    node.type == 1 and
    (option('nodes.inline') || []).index(node.name.downcase) and
    !only_child?(node)
  )
end
#no_translate_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 88

def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes
  node.attributes.each do |name, attribute|
    return true if name == 'notranslate' or attribute.value.index('notranslate')
  end
  false
end
#node_info(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 356

def node_info(node)
  info = []
  info << node.type

  info << node.tagName if node.type == 1

  if inline_node?(node)
    info << 'inline'
    if has_inline_or_text_siblings?(node)
      info << 'sentence'
    else
      info << 'only translatable'
    end
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  return "[#{info.join(', ')}]: " + node.inner_text if node.type == 3
  "[#{info.join(', ')}]"
end
#non_translatable_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 96

def non_translatable_node?(node)
  return false unless node
  return true if node.type == 1 && (option('nodes.scripts') || []).index(node.name.downcase)
  return true if node.type == 1 && node.children.length === 0 && node.inner_text == ''
  return true if no_translate_node?(node)
  false
end
#only_child?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 183

def only_child?(node)
  return false unless node.parent
  node.parent.children.count == 1
end
#option(name) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 157

def option(name)
  value = Tml::Utils.hash_value(self.options, name)
  value || Tml.config.translator_option(name)
end
#reset_context ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 175

def reset_context
  self.tokens = {}.merge(self.context)
end
#sanitize_value(value) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 231

def sanitize_value(value)
  value.gsub(/^\s+/, '')
end
#self_closing_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 212

def self_closing_node?(node)
  !node.children || !node.children.first
end
#separator_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 226

def separator_node?(node)
  return false unless node
  node.type == 1 && (option('nodes.splitters') || []).index(node.name.downcase)
end
#short_token?(token, value) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 179

def short_token?(token, value)
  option('nodes.short').index(token.downcase) || value.length < 20
end
#translate(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 47

def translate(doc)
  translate_tree(doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc)
end
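Both calls below are effectively equivalent; the string form is simply parsed with Nokogiri::HTML.fragment first. This is a sketch under the same assumptions as the constructor example above (configured Tml application, hypothetical 'debug' option).

  require 'tml'
  require 'nokogiri'

  tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)
  html = '<div>Welcome to <a href="/store">our store</a></div>'

  tokenizer.translate(html)                           # parses the string itself
  tokenizer.translate(Nokogiri::HTML.fragment(html))  # or accepts a parsed node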
#translate_tml(tml) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 104

def translate_tml(tml)
  return tml if empty_string?(tml)
  # pp tml

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    sentences = Tml::Utils.split_sentences(tml)
    translation = tml
    sentences.each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      translation = translation.gsub(sentence, sentence_translation)
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end
#translate_tree(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 51

def translate_tree(node)
  if non_translatable_node?(node)
    return node.inner_html
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    elsif inline_node?(child) and has_inline_or_text_siblings?(child) and !between_separators?(child)
      buffer += generate_tml_tags(child)
    elsif separator_node?(child)
      html += translate_tml(buffer) if buffer != ''
      html += generate_html_token(child)
      buffer = ''
    else
      html += translate_tml(buffer) if buffer != ''

      container_value = translate_tree(child)
      if ignored_node?(child)
        html += container_value
      else
        html += generate_html_token(child, container_value)
      end

      buffer = ''
    end
  end

  html += translate_tml(buffer) if buffer != ''
  html
end
#valid_text_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 221

def valid_text_node?(node)
  return false unless node
  node.type == 3 && !empty_string?(node.inner_text)
end