Module: Token

Extended by:
Entity
Includes:
Segment
Defined in:
lib/rbbt/segment/token.rb

Class Method Summary

Methods included from Segment

align, ascii, bad_chars, clean_sort, #eend, index, overlaps, #range, #segment_length, sort, split

Methods included from SegmentRanges

collisions, #includes?, #make_relative, #overlaps, #overlaps?, #pull, #push, #range_in

Class Method Details

.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0) ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
# File 'lib/rbbt/segment/token.rb', line 9

# Splits +text+ into tokens, tagging each one with its character offset.
#
# The text is scanned left to right for matches of +split_at+. For every
# match, the non-empty text preceding it becomes a token, and if the first
# capture group of +split_at+ matched a non-empty string, that captured
# separator becomes a token as well. Separators matched outside the first
# capture group (whitespace, in the default pattern) are dropped. Whatever
# remains after the last match becomes the final token.
#
# Each token is built with <tt>Token.setup(string, :offset => position)</tt>,
# where +position+ is +start+ plus the token's index within +text+.
#
# @param text [String] the text to tokenize
# @param split_at [Regexp] separator pattern; only its first capture group
#   is kept as a token
# @param start [Integer] offset assigned to the first character of +text+
# @return [Array] the values returned by Token.setup, in order of appearance
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
  tokens = []
  remaining = text
  offset = start

  while (matchdata = remaining.match(split_at))
    # A match that ends at position 0 consumed nothing and has no text before
    # it, so the scan cannot advance; stop here instead of looping forever and
    # let the rest of the text be emitted as the final token.
    break if matchdata.end(0).zero?

    leading = matchdata.pre_match
    tokens << Token.setup(leading, :offset => offset) unless leading.empty?

    # Only the first capture group is kept. It is nil when the pattern has no
    # groups or when a different alternative matched, so guard against nil
    # before checking for emptiness.
    separator = matchdata.captures.first
    if separator && !separator.empty?
      tokens << Token.setup(separator, :offset => offset + matchdata.begin(1))
    end

    offset += matchdata.end(0)
    remaining = matchdata.post_match
  end

  tokens << Token.setup(remaining, :offset => offset) unless remaining.empty?

  tokens
end