Class: Opener::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/opener/tokenizer.rb,
lib/opener/tokenizer/cli.rb,
lib/opener/tokenizer/server.rb,
lib/opener/tokenizer/version.rb

Overview

Primary tokenizer class that delegates the work to the various language specific tokenizers.

Defined Under Namespace

Classes: CLI, Server

Constant Summary collapse

DEFAULT_LANGUAGE =

The default language to use when no custom one is specified.

Returns:

  • (String)
'en'.freeze
DEFAULT_OPTIONS =

Hash containing the default options to use.

Returns:

  • (Hash)
{
  :args     => [],
  :kaf      => true,
  :language => DEFAULT_LANGUAGE
}.freeze
VERSION =
"1.1.2"

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Tokenizer

Returns a new instance of Tokenizer.

Parameters:

  • options (Hash) (defaults to: {})

Options Hash (options):

  • :args (Array)

    Collection of arbitrary arguments to pass to the individual tokenizer commands.

  • :language (String)

    The language to use for the tokenization process.

  • :kaf (TrueClass|FalseClass)

    When set to `true` the input is assumed to be KAF.



49
50
51
# File 'lib/opener/tokenizer.rb', line 49

##
# Stores the options for this instance: a copy of DEFAULT_OPTIONS with the
# given options applied on top of it.
#
# @param [Hash] options Values that take precedence over the defaults.
#
def initialize(options = {})
  # DEFAULT_OPTIONS is frozen, so work on an unfrozen copy and let the
  # caller supplied keys overwrite the defaults.
  merged = DEFAULT_OPTIONS.dup
  merged.update(options)

  @options = merged
end

Instance Attribute Details

#options ⇒ Hash (readonly)

Returns:

  • (Hash)


18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/opener/tokenizer.rb', line 18

class Tokenizer
  attr_reader :options

  ##
  # The default language to use when no custom one is specified.
  #
  # @return [String]
  #
  DEFAULT_LANGUAGE = 'en'.freeze

  ##
  # Hash containing the default options to use.
  #
  # @return [Hash]
  #
  DEFAULT_OPTIONS = {
    :args     => [],
    :kaf      => true,
    :language => DEFAULT_LANGUAGE
  }.freeze

  ##
  # @param [Hash] options
  #
  # @option options [Array] :args Collection of arbitrary arguments to pass
  #  to the individual tokenizer commands.
  # @option options [String] :language The language to use for the
  #  tokenization process.
  # @option options [TrueClass|FalseClass] :kaf When set to `true` the input
  #  is assumed to be KAF.
  #
  def initialize(options = {})
    @options = DEFAULT_OPTIONS.merge(options)
  end

  ##
  # Runs the language specific tokenizer command on the input.
  #
  # When the `:kaf` option is set, the language and the text to tokenize are
  # read from the KAF document. Otherwise the `:language` option is used and
  # the input is handed to the command as-is.
  #
  # @param [String] input
  # @return [String] The STDOUT output of the tokenizer command. When a
  #  StandardError occurs (invalid language, failing command, ...) the value
  #  returned by `Opener::Core::ErrorLayer#add` is returned instead.
  #
  def run(input)
    if options[:kaf]
      language, input = kaf_elements(input)
    else
      language = options[:language]
    end

    unless valid_language?(language)
      raise ArgumentError, "The specified language (#{language}) is invalid"
    end

    kernel = language_constant(language).new(:args => options[:args])

    stdout, stderr, process = Open3.capture3(*kernel.command.split(" "), :stdin_data => input)

    # A failing command is reported using whatever it wrote to STDERR.
    raise stderr unless process.success?

    stdout
  rescue StandardError => error
    # Only StandardError is turned into an error layer; Interrupt, SystemExit
    # and other low level exceptions must keep propagating to the caller.
    # Note that `input` may already hold the raw text taken from the KAF
    # document at this point.
    Opener::Core::ErrorLayer.new(input, error.message, self.class).add
  end

  alias tokenize run

  private

  ##
  # Returns an Array containing the language and input from a KAF document.
  #
  # The language is read from the `xml:lang` attribute of the `KAF` element,
  # the input is the text of the `raw` element.
  #
  # @param [String] input The KAF document.
  # @return [Array]
  #
  def kaf_elements(input)
    document = Nokogiri::XML(input)
    language = document.at('KAF').attr('xml:lang')
    text     = document.at('raw').text

    [language, text]
  end

  ##
  # Returns the constant in Opener::Tokenizers named after the upcased
  # language.
  #
  # @param [String] language
  # @return [Class]
  #
  def language_constant(language)
    Opener::Tokenizers.const_get(language.upcase)
  end

  ##
  # Checks whether Opener::Tokenizers has a constant for the given language.
  #
  # A missing language (nil) or one that can not be a constant name (such as
  # "en-GB" or an empty String) is reported as invalid instead of raising.
  #
  # @param [String] language
  # @return [TrueClass|FalseClass]
  #
  def valid_language?(language)
    return false unless language.respond_to?(:upcase)

    Opener::Tokenizers.const_defined?(language.upcase)
  rescue NameError
    # const_defined? raises NameError for names that are not valid constants.
    false
  end
end

Instance Method Details

#run(input) ⇒ Array Also known as: tokenize

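Processes the input and returns the STDOUT output of the language-specific tokenizer command. If processing fails, the value returned by Opener::Core::ErrorLayer#add is returned instead. Note that the success path returns only STDOUT, not an array of STDOUT, STDERR and process information.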

Parameters:

  • input (String)

Returns:

  • (Array)


60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/opener/tokenizer.rb', line 60

##
# Runs the language specific tokenizer command on the input.
#
# When the `:kaf` option is set, the language and the text to tokenize are
# read from the KAF document. Otherwise the `:language` option is used and
# the input is handed to the command as-is.
#
# @param [String] input
# @return [String] The STDOUT output of the tokenizer command. When a
#  StandardError occurs (invalid language, failing command, ...) the value
#  returned by `Opener::Core::ErrorLayer#add` is returned instead.
#
def run(input)
  if options[:kaf]
    language, input = kaf_elements(input)
  else
    language = options[:language]
  end

  unless valid_language?(language)
    raise ArgumentError, "The specified language (#{language}) is invalid"
  end

  kernel = language_constant(language).new(:args => options[:args])

  stdout, stderr, process = Open3.capture3(*kernel.command.split(" "), :stdin_data => input)

  # A failing command is reported using whatever it wrote to STDERR.
  raise stderr unless process.success?

  stdout
rescue StandardError => error
  # Only StandardError is turned into an error layer; Interrupt, SystemExit
  # and other low level exceptions must keep propagating to the caller.
  # Note that `input` may already hold the raw text taken from the KAF
  # document at this point.
  Opener::Core::ErrorLayer.new(input, error.message, self.class).add
end