Class: Tml::Tokenizers::Dom
- Inherits: Object
- Defined in: lib/tml/tokenizers/dom.rb
Instance Attribute Summary
- #context ⇒ Object: Returns the value of attribute context.
- #options ⇒ Object: Returns the value of attribute options.
- #tokens ⇒ Object: Returns the value of attribute tokens.
Instance Method Summary
- #adjust_name(node) ⇒ Object
- #between_separators?(node) ⇒ Boolean
- #container_node?(node) ⇒ Boolean
- #contextualize(name, context) ⇒ Object
- #debug(doc) ⇒ Object
- #debug_translation(translation) ⇒ Object
- #debug_tree(node, depth) ⇒ Object
- #empty_string?(tml) ⇒ Boolean
- #generate_data_tokens(text) ⇒ Object
- #generate_html_token(node, value = nil) ⇒ Object
- #generate_tml_tags(node) ⇒ Object
- #has_child_nodes?(node) ⇒ Boolean
- #has_inline_or_text_siblings?(node) ⇒ Boolean
- #ignored_node?(node) ⇒ Boolean
- #initialize(context = {}, options = {}) ⇒ Dom (constructor): A new instance of Dom.
- #inline_node?(node) ⇒ Boolean
- #no_translate_node?(node) ⇒ Boolean
- #node_info(node) ⇒ Object
- #non_translatable_node?(node) ⇒ Boolean
- #only_child?(node) ⇒ Boolean
- #option(name) ⇒ Object
- #reset_context ⇒ Object
- #sanitize_value(value) ⇒ Object
- #self_closing_node?(node) ⇒ Boolean
- #separator_node?(node) ⇒ Boolean
- #short_token?(token, value) ⇒ Boolean
- #translate(doc) ⇒ Object
- #translate_tml(tml) ⇒ Object
- #translate_tree(node) ⇒ Object
- #valid_text_node?(node) ⇒ Boolean
Constructor Details
#initialize(context = {}, options = {}) ⇒ Dom
Returns a new instance of Dom.
# File 'lib/tml/tokenizers/dom.rb', line 41

def initialize(context = {}, options = {})
  self.context = context
  self.options = options
  reset_context
end
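A minimal usage sketch (not from the library docs): it assumes a configured Tml application and default translator options, and passes a hypothetical 'debug' option so that strings are echoed through 'debug_format' instead of being sent to the translation service.

  require 'tml'
  require 'nokogiri'

  # Empty token context; options not passed here fall back to
  # Tml.config.translator_option(name).
  tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)

  # Accepts a raw HTML string (parsed with Nokogiri::HTML.fragment) or a parsed node.
  tokenizer.translate('<p>Hello <strong>World</strong></p>')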
Instance Attribute Details
#context ⇒ Object
Returns the value of attribute context.
# File 'lib/tml/tokenizers/dom.rb', line 39

def context
  @context
end
#options ⇒ Object
Returns the value of attribute options.
# File 'lib/tml/tokenizers/dom.rb', line 39

def options
  @options
end
#tokens ⇒ Object
Returns the value of attribute tokens.
# File 'lib/tml/tokenizers/dom.rb', line 39

def tokens
  @tokens
end
Instance Method Details
#adjust_name(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 319

def adjust_name(node)
  name = node.name.downcase
  map = option('name_mapping')
  map[name.to_sym] ? map[name.to_sym] : name
end
#between_separators?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 132

def between_separators?(node)
  (separator_node?(node.previous_sibling) and !valid_text_node?(node.next_sibling)) or
  (separator_node?(node.next_sibling) and !valid_text_node?(node.previous_sibling))
end
#container_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 208

def container_node?(node)
  node.type == 1 && !inline_node?(node)
end
#contextualize(name, context) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 325

def contextualize(name, context)
  if self.tokens[name] and self.tokens[name] != context
    index = 0
    matches = name.match(/\d+$/)
    if matches and matches.length > 0
      index = matches[matches.length-1].to_i
      name = name.gsub(index.to_s, '')
    end
    name += (index + 1).to_s
    return contextualize(name, context)
  end

  self.tokens[name] = context
  name
end
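For illustration: reusing a token name with a different context appends a numeric suffix, while an identical context reuses the existing name. The 'link' name and href values below are made up for the example.

  require 'tml'

  tokenizer = Tml::Tokenizers::Dom.new
  tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => "link"
  tokenizer.contextualize('link', "<a href='/b'>{$0}</a>")  # => "link1"
  tokenizer.contextualize('link', "<a href='/a'>{$0}</a>")  # => "link" (same context, name reused)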
#debug(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 341

def debug(doc)
  self.doc = doc
  debug_tree(self.doc, 0)
end
#debug_translation(translation) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 162

def debug_translation(translation)
  option('debug_format').gsub('{$0}', translation)
end
#debug_tree(node, depth) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 346

def debug_tree(node, depth)
  padding = ('=' * (depth+1))

  Tml.logger.log(padding + '=> ' + (node) + ': ' + node_info(node))

  (node.children || []).each do |child|
    debug_tree(child, depth+1)
  end
end
#empty_string?(tml) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 166

def empty_string?(tml)
  tml = tml.gsub(/[\s\n\r\t]/, '').gsub(/[\u0080-\u00ff]/, '')
  return true if tml == ''
  return true if tml.match(/\A\$\{[^\}]+\}\z/) # ignore variables ${var_name}
  return true if tml.match(/\A\$?\d+\.?\d+\z/) # ignore prices and numbers
  false
end
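A few illustrative inputs; the tokenizer skips strings like these before attempting translation.

  require 'tml'

  tokenizer = Tml::Tokenizers::Dom.new
  tokenizer.empty_string?("  \n\t ")        # => true  (whitespace only)
  tokenizer.empty_string?('${user_name}')   # => true  (variable placeholder)
  tokenizer.empty_string?('$19.99')         # => true  (price / number)
  tokenizer.empty_string?('Hello world')    # => false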
#generate_data_tokens(text) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 235

def generate_data_tokens(text)
  if option('data_tokens.special.enabled')
    matches = text.scan(option('data_tokens.special.regex'))
    matches.each do |match|
      token = match[1, - 2]
      self.context[token] = match
      text = text.gsub(match, "{#{token}}")
    end
  end

  if option('data_tokens.date.enabled')
    token_name = option('data_tokens.date.name')
    formats = option('data_tokens.date.formats')

    formats.each do |format|
      regex = format[0]
      # date_format = format[1]

      matches = text.scan(regex)
      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''
          date = match.first
          token = self.contextualize(token_name, date)
          replacement = "{#{token}}"
          text = text.gsub(date, replacement)
        end
      end
    end
  end

  rules = option('data_tokens.rules')
  if rules
    rules.each do |rule|
      next unless rule[:enabled]
      matches = text.scan(rule[:regex])

      if matches
        matches.each do |match|
          next if match.first.nil? or match.first == ''

          value = match.first.strip

          unless value == ''
            token = contextualize(rule[:name], value.gsub(/[.,;\s]/, '').to_i)
            text = text.gsub(value, value.gsub(value, "{#{token}}"))
          end
        end
      end
    end
  end

  text
end
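Because the behavior here is driven entirely by the 'data_tokens.*' options, a self-contained sketch of the underlying idea may be clearer than a live call. The regex and the 'date' token name below are made up for illustration; the real method registers values through contextualize.

  # Replace matched values with {token} placeholders and remember the original
  # value per token name, mirroring what generate_data_tokens does.
  text   = 'Your trial ends on 12/31/2024'
  tokens = {}

  text = text.gsub(/\d{1,2}\/\d{1,2}\/\d{4}/) do |date|
    tokens['date'] = date        # generate_data_tokens uses contextualize here
    '{date}'
  end

  text    # => "Your trial ends on {date}"
  tokens  # => {"date"=>"12/31/2024"}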
#generate_html_token(node, value = nil) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 288

def generate_html_token(node, value = nil)
  name = node.name.downcase
  attributes = node.attributes
  attributes_hash = {}
  value = (!value ? '{$0}' : value)

  if attributes.length == 0
    if self_closing_node?(node)
      return '<' + name + '/>' if %w(br hr).index(name)
      return '<' + name + '>' + '</' + name + '>'
    end
    return '<' + name + '>' + value + '</' + name + '>'
  end

  attributes.each do |name, attribute|
    attributes_hash[name] = attribute.value
  end

  keys = attributes_hash.keys.sort

  attr = []
  keys.each do |key|
    quote = attributes_hash[key].index("'") ? '"' : "'"
    attr << (key + '=' + quote + attributes_hash[key] + quote)
  end
  attr = attr.join(' ')

  return '<' + name + ' ' + attr + '>' + '</' + name + '>' if self_closing_node?(node)
  '<' + name + ' ' + attr + '>' + value + '</' + name + '>'
end
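For example, assuming Nokogiri (which #translate uses to parse HTML strings) and a made-up link:

  require 'tml'
  require 'nokogiri'

  node      = Nokogiri::HTML.fragment("<a href='/about'>About us</a>").children.first
  tokenizer = Tml::Tokenizers::Dom.new

  tokenizer.generate_html_token(node)
  # => "<a href='/about'>{$0}</a>"

  tokenizer.generate_html_token(node, 'About us')
  # => "<a href='/about'>About us</a>"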
#generate_tml_tags(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 137

def generate_tml_tags(node)
  buffer = ''
  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    else
      buffer += generate_tml_tags(child)
    end
  end

  token_context = generate_html_token(node)
  token = contextualize(adjust_name(node), token_context)
  value = sanitize_value(buffer)

  return '{' + token + '}' if self_closing_node?(node)
  # return '[' + token + ': ' + value + ']' if short_token?(token, value)

  '<' + token + '>' + value + '</' + token + '>'
end
#has_child_nodes?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 128

def has_child_nodes?(node)
  node.children and node.children.length > 0
end
#has_inline_or_text_siblings?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 188

def has_inline_or_text_siblings?(node)
  return false unless node.parent

  node.parent.children.each do |child|
    unless child == node
      return true if inline_node?(child) || valid_text_node?(child)
    end
  end

  false
end
#ignored_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 216

def ignored_node?(node)
  return true if (node.type != 1)
  (option('nodes.ignored') || []).index(node.name.downcase)
end
#inline_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 200

def inline_node?(node)
  (
    node.type == 1 and
    (option('nodes.inline') || []).index(node.name.downcase) and
    !only_child?(node)
  )
end
#no_translate_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 88

def no_translate_node?(node)
  return unless node && node.type == 1 && node.attributes
  node.attributes.each do |name, attribute|
    return true if name == 'notranslate' or attribute.value.index('notranslate')
  end
  false
end
#node_info(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 356

def node_info(node)
  info = []
  info << node.type

  info << node.tagName if node.type == 1

  if inline_node?(node)
    info << 'inline'
    if has_inline_or_text_siblings?(node)
      info << 'sentence'
    else
      info << 'only translatable'
    end
  end

  info << 'self closing' if self_closing_node?(node)
  info << 'only child' if only_child?(node)

  return "[#{info.join(', ')}]: " + node.inner_text if node.type == 3
  "[#{info.join(', ')}]"
end
#non_translatable_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 96

def non_translatable_node?(node)
  return false unless node
  return true if node.type == 1 && (option('nodes.scripts') || []).index(node.name.downcase)
  return true if node.type == 1 && node.children.length === 0 && node.inner_text == ''
  return true if no_translate_node?(node)
  false
end
#only_child?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 183

def only_child?(node)
  return false unless node.parent
  node.parent.children.count == 1
end
#option(name) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 157

def option(name)
  value = Tml::Utils.hash_value(self.options, name)
  value || Tml.config.translator_option(name)
end
#reset_context ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 175

def reset_context
  self.tokens = {}.merge(self.context)
end
#sanitize_value(value) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 231

def sanitize_value(value)
  value.gsub(/^\s+/, '')
end
#self_closing_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 212

def self_closing_node?(node)
  !node.children || !node.children.first
end
#separator_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 226

def separator_node?(node)
  return false unless node
  node.type == 1 && (option('nodes.splitters') || []).index(node.name.downcase)
end
#short_token?(token, value) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 179

def short_token?(token, value)
  option('nodes.short').index(token.downcase) || value.length < 20
end
#translate(doc) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 47

def translate(doc)
  translate_tree(doc.is_a?(String) ? Nokogiri::HTML.fragment(doc) : doc)
end
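Both calls below are effectively equivalent; the string form is simply parsed with Nokogiri::HTML.fragment first. This is a sketch under the same assumptions as the constructor example above (configured Tml application, hypothetical 'debug' option).

  require 'tml'
  require 'nokogiri'

  tokenizer = Tml::Tokenizers::Dom.new({}, 'debug' => true)
  html = '<div>Welcome to <a href="/store">our store</a></div>'

  tokenizer.translate(html)                           # parses the string itself
  tokenizer.translate(Nokogiri::HTML.fragment(html))  # or accepts a parsed node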
#translate_tml(tml) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 104

def translate_tml(tml)
  return tml if empty_string?(tml)
  # pp tml

  tml = generate_data_tokens(tml)

  if option('split_sentences')
    sentences = Tml::Utils.split_sentences(tml)
    translation = tml
    sentences.each do |sentence|
      sentence_translation = option('debug') ? debug_translation(sentence) : Tml.session.current_language.translate(sentence, tokens, options.dup)
      translation = translation.gsub(sentence, sentence_translation)
    end
    reset_context
    return translation
  end

  tml = tml.gsub(/[\n]/, '').gsub(/\s\s+/, ' ').strip

  translation = option('debug') ? debug_translation(tml) : Tml.session.target_language.translate(tml, tokens, options.dup)
  reset_context
  translation
end
#translate_tree(node) ⇒ Object
# File 'lib/tml/tokenizers/dom.rb', line 51

def translate_tree(node)
  if non_translatable_node?(node)
    return node.inner_html
  end

  return translate_tml(node.inner_text) if node.type == 3

  html = ''
  buffer = ''

  node.children.each do |child|
    if child.type == 3
      buffer += child.inner_text
    elsif inline_node?(child) and has_inline_or_text_siblings?(child) and !between_separators?(child)
      buffer += generate_tml_tags(child)
    elsif separator_node?(child)
      html += translate_tml(buffer) if buffer != ''
      html += generate_html_token(child)
      buffer = ''
    else
      html += translate_tml(buffer) if buffer != ''

      container_value = translate_tree(child)
      if ignored_node?(child)
        html += container_value
      else
        html += generate_html_token(child, container_value)
      end

      buffer = ''
    end
  end

  html += translate_tml(buffer) if buffer != ''
  html
end
#valid_text_node?(node) ⇒ Boolean
# File 'lib/tml/tokenizers/dom.rb', line 221

def valid_text_node?(node)
  return false unless node
  node.type == 3 && !empty_string?(node.inner_text)
end