Class: TwitterCldr::Tokenizers::Tokenizer
- Inherits:
- Object
- Inheritance chain: Object → TwitterCldr::Tokenizers::Tokenizer
- Defined in:
- lib/twitter_cldr/tokenizers/tokenizer.rb
Instance Attribute Summary collapse
-
#custom_splitter ⇒ Object
readonly
Returns the value of attribute custom_splitter.
-
#recognizers ⇒ Object
readonly
Returns the value of attribute recognizers.
-
#remove_empty_entries ⇒ Object
readonly
Returns the value of attribute remove_empty_entries.
Class Method Summary collapse
- .union(*tokenizers) ⇒ Object
Instance Method Summary collapse
-
#initialize(recognizers, splitter = nil, remove_empty_entries = true) ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #insert_before(token_type, *new_recognizers) ⇒ Object
- #recognizer_at(token_type) ⇒ Object
- #tokenize(text) ⇒ Object
Constructor Details
#initialize(recognizers, splitter = nil, remove_empty_entries = true) ⇒ Tokenizer
Returns a new instance of Tokenizer.
59 60 61 62 63 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 59
#
# Stores the three constructor arguments on the new tokenizer.
#
# @param recognizers [Array] recognizer objects, kept by reference (not copied)
# @param splitter [Object, nil] optional custom splitter, exposed as
#   +custom_splitter+; defaults to nil
# @param remove_empty_entries [Boolean] exposed as +remove_empty_entries+;
#   defaults to true
def initialize(recognizers, splitter = nil, remove_empty_entries = true)
  @recognizers, @custom_splitter = recognizers, splitter
  @remove_empty_entries = remove_empty_entries
end
Instance Attribute Details
#custom_splitter ⇒ Object (readonly)
Returns the value of attribute custom_splitter.
36 37 38 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36
#
# Read-only accessor for the +custom_splitter+ attribute.
#
# @return [Object, nil] the value held in +@custom_splitter+
def custom_splitter
  @custom_splitter
end
#recognizers ⇒ Object (readonly)
Returns the value of attribute recognizers.
36 37 38 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36
#
# Read-only accessor for the +recognizers+ attribute.
#
# The returned object is the stored collection itself, not a copy.
#
# @return [Object] the value held in +@recognizers+
def recognizers
  @recognizers
end
#remove_empty_entries ⇒ Object (readonly)
Returns the value of attribute remove_empty_entries.
36 37 38 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 36
#
# Read-only accessor for the +remove_empty_entries+ attribute.
#
# @return [Object] the value held in +@remove_empty_entries+
def remove_empty_entries
  @remove_empty_entries
end
Class Method Details
.union(*tokenizers) ⇒ Object
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 38
#
# Builds a new tokenizer that combines the recognizers of all the given
# tokenizers, in argument order.
#
# When a block is given, only the recognizers for which the block returns a
# truthy value are kept; without a block every recognizer is kept.
#
# The combined tokenizer gets a custom splitter only when at least one
# tokenizer is given and every one of them has a custom splitter. That
# splitter is the alternation ("|") of the individual splitter sources.
#
# NOTE(review): only each splitter's +source+ is joined, so regexp options
# set on an individual splitter are not carried into the combined regexp.
#
# @param tokenizers [Array<#recognizers, #custom_splitter>] tokenizers to merge
# @yieldparam recognizer [Object] a candidate recognizer
# @yieldreturn [Boolean] truthy to keep the recognizer
# @return [Object] whatever +new(recognizers, splitter)+ returns
def self.union(*tokenizers)
  # select always returns a fresh array, so the source tokenizers' recognizer
  # lists are never shared with the combined tokenizer.
  recognizers = tokenizers.flat_map do |tokenizer|
    tokenizer.recognizers.select do |recognizer|
      !block_given? || yield(recognizer)
    end
  end

  # Guard against the vacuous truth of [].all?, which would otherwise compile
  # an empty pattern and present it as a custom splitter.
  splitter =
    if !tokenizers.empty? && tokenizers.all?(&:custom_splitter)
      Regexp.compile(
        tokenizers.map { |tokenizer| tokenizer.custom_splitter.source }.join("|")
      )
    end

  new(recognizers, splitter)
end
Instance Method Details
#insert_before(token_type, *new_recognizers) ⇒ Object
69 70 71 72 73 74 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 69
#
# Inserts +new_recognizers+ into +recognizers+ (in place) immediately before
# the first recognizer whose +token_type+ equals the given one, then calls
# +clear_splitter+.
#
# NOTE(review): +clear_splitter+ is defined outside this listing; presumably
# it discards a cached splitter so it is rebuilt from the updated list.
#
# @param token_type [Object] token type of the recognizer to insert before
# @param new_recognizers [Array] recognizers to insert, order preserved
# @return [nil]
# @raise [ArgumentError] if no recognizer has the given +token_type+
def insert_before(token_type, *new_recognizers)
  idx = recognizers.find_index { |rec| rec.token_type == token_type }

  # Without this guard Array#insert(nil, ...) fails with an opaque TypeError.
  unless idx
    raise ArgumentError, "no recognizer with token type #{token_type.inspect}"
  end

  recognizers.insert(idx, *new_recognizers)
  clear_splitter
  nil
end
#recognizer_at(token_type) ⇒ Object
65 66 67 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 65
#
# Looks up the first recognizer whose +token_type+ equals the given one.
#
# @param token_type [Object] token type to search for
# @return [Object, nil] the matching recognizer, or nil when none matches
def recognizer_at(token_type)
  has_type = ->(candidate) { candidate.token_type == token_type }
  recognizers.find(&has_type)
end
#tokenize(text) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/twitter_cldr/tokenizers/tokenizer.rb', line 76
#
# Splits +text+ with +splitter+ and turns each piece into a Token.
#
# For every piece, the first recognizer whose +recognizes?+ returns truthy is
# used; pieces that no recognizer accepts are dropped. The piece is passed
# through the recognizer's +clean+, and the result becomes the token value.
# When +remove_empty_entries+ is truthy, tokens whose cleaned text has size
# zero are dropped as well.
#
# @param text [String] the text to tokenize
# @return [Array<Token>] tokens in the order their pieces appear in +text+
def tokenize(text)
  text.split(splitter).each_with_object([]) do |token_text, tokens|
    recognizer = recognizers.find { |candidate| candidate.recognizes?(token_text) }
    next unless recognizer

    cleaned_text = recognizer.clean(token_text)
    # The size is consulted only when empty entries are to be removed.
    next unless !remove_empty_entries || cleaned_text.size > 0

    tokens << Token.new(
      value: cleaned_text, type: recognizer.token_type
    )
  end
end