Class: Tml::Tokenizers::Dom

Inherits:
Object
Defined in:
lib/tml/tokenizers/dom.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(context = {}, options = {}) ⇒ Dom

Returns a new instance of Dom.



# File 'lib/tml/tokenizers/dom.rb', line 41

def initialize(context = {}, options = {})
  self.context = context
  self.options = options
  reset_context
end
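
A minimal construction sketch (the option values shown are illustrative assumptions, not gem defaults). The context hash seeds the token map, and any option not supplied here falls back to Tml.config.translator_option via #option:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new(
  {},                          # initial token context
  'split_sentences' => false,  # translate each extracted phrase as a whole
  'debug'           => false   # true wraps phrases with 'debug_format' instead of translating
)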

Instance Attribute Details

#context ⇒ Object

Returns the value of attribute context.



# File 'lib/tml/tokenizers/dom.rb', line 39

def context
  @context
end

#options ⇒ Object

Returns the value of attribute options.



# File 'lib/tml/tokenizers/dom.rb', line 39

def options
  @options
end

#tokens ⇒ Object

Returns the value of attribute tokens.



# File 'lib/tml/tokenizers/dom.rb', line 39

def tokens
  @tokens
end

Instance Method Details

#adjust_name(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 319

def adjust_name(node)
  name = node.name.downcase
  map = option('name_mapping')
  map[name.to_sym] ? map[name.to_sym] : name
end
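
Maps a tag name to a token name using the 'name_mapping' option. A small illustration, assuming the default translator configuration is loaded and maps b to 'bold'; tag names without a mapping entry are kept as is:

require 'tml'
require 'nokogiri'

tokenizer = Tml::Tokenizers::Dom.new
bold = Nokogiri::HTML.fragment('<b>Hi</b>').children.first
span = Nokogiri::HTML.fragment('<span>Hi</span>').children.first

tokenizer.adjust_name(bold)  # => 'bold'  (mapped)
tokenizer.adjust_name(span)  # => 'span'  (no mapping entry)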

#between_separators?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 132

def between_separators?(node)
  (separator_node?(node.previous_sibling) and !valid_text_node?(node.next_sibling)) or
  (separator_node?(node.next_sibling) and !valid_text_node?(node.previous_sibling))
end

#container_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 208

def container_node?(node)
  node.type == 1 && !inline_node?(node)
end

#contextualize(name, context) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 325

def contextualize(name, context)
  if self.tokens[name] and self.tokens[name] != context
    index = 0
    matches = name.match(/\d+$/)
    if matches and matches.length > 0
      index = matches[matches.length-1].to_i
      name = name.gsub(index.to_s, '')
    end
    name += (index + 1).to_s
    return contextualize(name, context)
  end

  self.tokens[name] = context
  name
end
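
Registers a token name for a given context (usually the HTML the token stands for). As the source above shows, reusing a name with a different context appends a numeric suffix; an illustrative trace:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new

tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => 'link'
tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => 'link'  (same context, reused)
tokenizer.contextualize('link', "<a href='/b'>{$0}</a>")  # => 'link1' (new context, suffixed)
tokenizer.contextualize('link', "<a href='/c'>{$0}</a>")  # => 'link2'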

#debug(doc) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 341

def debug(doc)
  self.doc = doc
  debug_tree(self.doc, 0)
end

#debug_translation(translation) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 162

def debug_translation(translation)
  option('debug_format').gsub('{$0}', translation)
end

#debug_tree(node, depth) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 346

def debug_tree(node, depth)
  padding = ('=' * (depth+1))

  Tml.logger.log(padding + '=> ' + (node) + ': ' + node_info(node))

  (node.children || []).each do |child|
    debug_tree(child, depth+1)
  end
end

#empty_string?(tml) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 166

def empty_string?(tml)
  tml = tml.gsub(/[\s\n\r\t]/, '').gsub(/[\u0080-\u00ff]/, '')
  return true if tml == ''
  return true if tml.match(/\A\$\{[^\}]+\}\z/)  # ignore variables ${var_name}
  return true if tml.match(/\A\$?\d+\.?\d+\z/) # ignore prices and numbers

  false
end
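
Illustrative results traced from the source above: strings containing only whitespace, a ${...} variable placeholder, or a bare number/price count as empty and are skipped:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new

tokenizer.empty_string?("  \n\t ")       # => true   (whitespace only)
tokenizer.empty_string?('${user_name}')  # => true   (variable placeholder)
tokenizer.empty_string?('$25.00')        # => true   (price/number)
tokenizer.empty_string?('Hello world')   # => false  (translatable text)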

#generate_data_tokens(text) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 235

def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    matches = text.scan(option('data_tokens.special.regex'))
    matches.each do  |match|
      token = match[1, - 2]
      self.context[token] = match
      text = text.gsub(match, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    formats = option('data_tokens.date.formats')
    formats.each do |format|
      regex = format[0]
      # date_format = format[1]

      matches = text.scan(regex)
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          date = match.first
          token = self.contextualize(token_name, date)
          replacement = "{#{token}}"
          text = text.gsub(date, replacement)
        end
      end
    end
  end

  rules = option('data_tokens.rules')
  if rules
    rules.each do |rule|
      next unless rule[:enabled]
      matches = text.scan(rule[:regex])

      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          value = match.first.strip

          unless value == ''
            token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
            text = text.gsub(value, value.gsub(value, "{#{token}}"))
          end
        end
      end
    end
  end

  text
end
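
Which substrings get replaced is driven entirely by the data_tokens.* options (special tokens, date formats, numeric rules). A hedged sketch, assuming the configuration enables a numeric rule named 'num':

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new

# Assumption: a rule like { name: 'num', regex: ..., enabled: true } is configured.
tokenizer.generate_data_tokens('You have 5 new messages')
# => "You have {num} new messages"
tokenizer.tokens['num']
# => 5   (the captured value is stored as the token's context)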

#generate_html_token(node, value = nil) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 288

def generate_html_token(node, value = nil)
  name = node.name.downcase
  attributes = node.attributes
  attributes_hash = {}
  value = (!value ? '{$0}' : value)

  if attributes.length == 0
    if self_closing_node?(node)
      return '<' + name + '/>' if %w(br hr).index(name)
      return '<' + name + '>' + '</' + name + '>'
    end
    return '<' + name + '>' + value + '</' + name + '>'
  end

  attributes.each do |name, attribute|
    attributes_hash[name] = attribute.value
  end

  keys = attributes_hash.keys.sort

  attr = []
  keys.each do |key|
    quote = attributes_hash[key].index("'") ? '"' : "'"
    attr << (key + '=' + quote + attributes_hash[key] + quote)
  end
  attr = attr.join(' ')

  return '<' + name + ' ' + attr + '>' + '</' + name + '>' if self_closing_node?(node)
  '<' + name + ' ' + attr + '>' + value + '</' + name + '>'
end
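
Illustrative outputs traced from the source above: attributes are sorted and re-quoted, the element body defaults to the '{$0}' placeholder, and childless br/hr nodes collapse to a self-closing tag:

require 'tml'
require 'nokogiri'

tokenizer = Tml::Tokenizers::Dom.new
frag = Nokogiri::HTML.fragment("<a href='/home' class='btn'>Home</a><br>")

tokenizer.generate_html_token(frag.css('a').first)
# => "<a class='btn' href='/home'>{$0}</a>"
tokenizer.generate_html_token(frag.css('br').first)
# => "<br/>"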

#generate_tml_tags(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 137

def generate_tml_tags(node)
  buffer = ''
  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    else
      buffer += generate_tml_tags(child)
    end
  end

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return '{' + token + '}' if self_closing_node?(node)
  # return '[' + token + ': ' + value + ']' if short_token?(token, value)

  '<' + token + '>' + value + '</' + token + '>'
end
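
An illustrative trace for an inline element, assuming the default translator configuration is loaded: the node's text becomes the token value, the surrounding HTML becomes the token context, and the (possibly remapped) tag name becomes the token:

require 'tml'
require 'nokogiri'

tokenizer = Tml::Tokenizers::Dom.new
node = Nokogiri::HTML.fragment('<strong>Warning:</strong> check this').children.first

tokenizer.generate_tml_tags(node)  # => "<strong>Warning:</strong>"
tokenizer.tokens['strong']         # => "<strong>{$0}</strong>"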

#has_child_nodes?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 128

def has_child_nodes?(node)
  node.children and node.children.length > 0
end

#has_inline_or_text_siblings?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 188

def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.each do |child|
    unless child == node
      return true if inline_node?(child) || valid_text_node?(child)
    end
  end

  false
end

#ignored_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 216

def ignored_node?(node)
  return true if (node.type != 1)
  (option('nodes.ignored') || []).index(node.name.downcase)
end

#inline_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 200

def inline_node?(node)
  (
    node.type == 1 and
    (option('nodes.inline') || []).index(node.name.downcase) and
    !only_child?(node)
  )
end

#no_translate_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 88

def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes
  node.attributes.each do |name, attribute|
    return true if name == 'notranslate' or attribute.value.index('notranslate')
  end
  false
end

#node_info(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 356

def node_info(node)
  info = []
  info << node.type

  info << node.tagName if node.type == 1

  if inline_node?(node)
    info << 'inline'
    if has_inline_or_text_siblings?(node)
      info << 'sentence'
    else
      info << 'only translatable'
    end
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  return "[#{info.join(', ')}]: " + node.inner_text if node.type == 3
  "[#{info.join(', ')}]"
end

#non_translatable_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 96

def non_translatable_node?(node)
  return false unless node
  return true if node.type == 1 && (option('nodes.scripts') || []).index(node.name.downcase)
  return true if node.type == 1 && node.children.length === 0 && node.inner_text == ''
  return true if no_translate_node?(node)
  false
end

#only_child?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 183

def only_child?(node)
  return false unless node.parent
  node.parent.children.count == 1
end

#option(name) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 157

def option(name)
  value = Tml::Utils.hash_value(self.options, name)
  value || Tml.config.translator_option(name)
end
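
Looks up an option in the tokenizer's own options first (dotted keys walk nested hashes via Tml::Utils.hash_value) and falls back to the global translator configuration. A sketch:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)

tokenizer.option('debug')          # => true (from the tokenizer's own options)
tokenizer.option('debug_format')   # not set here, falls back to Tml.config.translator_option('debug_format')
tokenizer.option('nodes.ignored')  # dotted keys traverse nested hashes, e.g. options['nodes']['ignored']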

#reset_context ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 175

def reset_context
  self.tokens = {}.merge(self.context)
end

#sanitize_value(value) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 231

def sanitize_value(value)
  value.gsub(/^\s+/, '')
end

#self_closing_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 212

def self_closing_node?(node)
  !node.children || !node.children.first
end

#separator_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 226

def separator_node?(node)
  return false unless node
  node.type == 1 && (option('nodes.splitters') || []).index(node.name.downcase)
end

#short_token?(token, value) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 179

def short_token?(token, value)
  option('nodes.short').index(token.downcase) || value.length < 20
end

#translate(doc) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 47

def translate(doc)
  translate_tree(doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc)
end
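
This is the main entry point: it accepts either an HTML string or an already-parsed Nokogiri node and returns the translated markup. A usage sketch with the 'debug' option enabled, so phrases are wrapped with 'debug_format' instead of being sent to the translation service; with a live Tml session, drop the option:

require 'tml'
require 'nokogiri'

tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)

puts tokenizer.translate('<p>Hello <b>beautiful</b> world!</p>')

# Without 'debug', the extracted TML phrase (something like
# "Hello <bold>beautiful</bold> world!", token name depending on 'name_mapping')
# is passed to Tml.session.target_language.translate along with its tokens.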

#translate_tml(tml) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 104

def translate_tml(tml)
  return tml if empty_string?(tml)
  # pp tml

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    sentences = Tml::Utils.split_sentences(tml)
    translation = tml
    sentences.each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      translation = translation.gsub(sentence, sentence_translation)
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end
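
The two branches above differ only in granularity: with 'split_sentences' each sentence returned by Tml::Utils.split_sentences is translated and substituted back separately; otherwise whitespace is collapsed and the phrase is translated as a whole. A sketch in debug mode, assuming the default 'debug_format' template:

require 'tml'

tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true, 'split_sentences' => true)

tokenizer.translate_tml('First sentence. Second sentence.')
# Each sentence is wrapped with the 'debug_format' template; with a live session,
# Tml.session.current_language.translate would be called once per sentence.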

#translate_tree(node) ⇒ Object



# File 'lib/tml/tokenizers/dom.rb', line 51

def translate_tree(node)
  if non_translatable_node?(node)
    return node.inner_html
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    elsif inline_node?(child) and has_inline_or_text_siblings?(child) and !between_separators?(child)
      buffer += generate_tml_tags(child)
    elsif separator_node?(child)
      html += translate_tml(buffer) if buffer != ''
      html += generate_html_token(child)
      buffer = ''
    else
      html += translate_tml(buffer) if buffer != ''

      container_value = translate_tree(child)
      if ignored_node?(child)
        html += container_value
      else
        html += generate_html_token(child, container_value)
      end

      buffer = ''
    end
  end

  html += translate_tml(buffer) if buffer != ''
  html
end

#valid_text_node?(node) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/tml/tokenizers/dom.rb', line 221

def valid_text_node?(node)
  return false unless node
  node.type == 3 && !empty_string?(node.inner_text)
end