Module: Segment

Extended by:
Entity
Includes:
SegmentRanges
Included in:
NLP::GdepChunk, NLP::GdepToken, NamedEntity, Token
Defined in:
lib/rbbt/segment.rb,
lib/rbbt/segment/encoding.rb,
lib/rbbt/segment/overlaps.rb

Defined Under Namespace

Modules: RangeIndex

Class Method Summary collapse

Instance Method Summary collapse

Methods included from SegmentRanges

#collisions, #includes?, #make_relative, #overlaps, #overlaps?, #pull, #push, #range_in

Class Method Details

.align(text, parts) ⇒ Object



163
164
165
166
167
168
169
170
171
172
173
# File 'lib/rbbt/segment.rb', line 163

# Locates every element of +parts+ inside +text+, in order, and registers the
# position found by calling Segment.setup(part, position, docid).
#
# After a match the remaining text is cut at (match offset + segment_length - 1),
# i.e. it still contains the last character of the matched part, so the next
# part may start on that character. Parts that cannot be found in the remaining
# text are skipped without calling Segment.setup.
#
# text  - text to search; when it responds to +docid+ that value is passed to
#         Segment.setup, otherwise nil is passed.
# parts - ordered collection of substrings to locate.
#
# Returns the value of +parts.each+.
def self.align(text, parts)
  docid = text.docid if text.respond_to?(:docid)
  consumed = 0
  remaining = text

  parts.each do |part|
    found_at = remaining.index(part)
    next if found_at.nil?

    Segment.setup(part, consumed + found_at, docid)

    # Step to the last character of the match (not past it).
    advance = found_at + part.segment_length - 1
    consumed += advance
    remaining = remaining[advance..-1]
  end
end

.ascii(text, replace = nil, &block) ⇒ Object



13
14
15
16
17
# File 'lib/rbbt/segment/encoding.rb', line 13

# Hands +text+, the segments returned by bad_chars(text) and a replacement
# value to Transformed.with_transform, forwarding the given block.
# NOTE(review): presumably this substitutes the non-ASCII characters while the
# block runs - confirm against Transformed.with_transform.
#
# text    - text to process.
# replace - replacement value; "?" is used only when this is nil.
# block   - forwarded unchanged to Transformed.with_transform.
#
# Returns whatever Transformed.with_transform returns.
def self.ascii(text, replace = nil, &block)
  replacement = replace.nil? ? "?" : replace
  Transformed.with_transform(text, bad_chars(text), replacement, &block)
end

.bad_chars(text) ⇒ Object



3
4
5
6
7
8
9
10
11
# File 'lib/rbbt/segment/encoding.rb', line 3

# Collects the non-ASCII characters of +text+.
#
# Each character for which String#ascii_only? is false is passed to
# Segment.setup together with :offset => its character index in +text+.
#
# text - object responding to +chars+.
#
# Returns an Array with the Segment.setup results, in text order.
def self.bad_chars(text)
  indexed = text.chars.each_with_index
  non_ascii = indexed.reject { |char, _position| char.ascii_only? }
  non_ascii.map { |char, position| Segment.setup(char, :offset => position) }
end

.clean_sort(segments) ⇒ Object



110
111
112
113
114
115
116
117
118
# File 'lib/rbbt/segment.rb', line 110

# Sorts +segments+ and drops the ones that cannot be used side by side.
#
# The result of sort(segments) is first stripped of elements whose offset is
# nil; then every element reported by overlaps is removed with Array#delete
# (which removes all entries == to it).
#
# segments - collection accepted by sort.
#
# Returns a new Array; +segments+ itself is not modified here.
def self.clean_sort(segments)
  placed = sort(segments).reject { |segment| segment.offset.nil? }
  overlaps(placed).each { |clashing| placed.delete(clashing) }
  placed
end

.index(*args) ⇒ Object



175
176
177
# File 'lib/rbbt/segment.rb', line 175

# Convenience entry point for Segment::RangeIndex.index.
#
# args - positional arguments, passed through unchanged (a block given to this
#        method is not forwarded, since none is declared).
#
# Returns whatever Segment::RangeIndex.index returns.
def self.index(*args)
  Segment::RangeIndex.index(*args)
end

.overlaps(sorted_segments) ⇒ Object



98
99
100
101
102
103
104
105
106
107
108
# File 'lib/rbbt/segment.rb', line 98

# Finds the segments that run into the segment following them.
#
# The list is walked from last to first; a segment is reported when the end of
# its range is greater than the start of the range of the element that comes
# right after it in +sorted_segments+.
# NOTE(review): the comparison is strict, so a segment whose range ends exactly
# where the next one begins is not reported - confirm this is intended.
#
# sorted_segments - Array of objects responding to +range+, already ordered.
#
# Returns an Array of the reported segments, in reverse input order.
def self.overlaps(sorted_segments)
  following_start = nil

  sorted_segments.reverse.each_with_object([]) do |current, flagged|
    span = current.range
    flagged << current if !following_start.nil? && span.end > following_start
    following_start = span.begin
  end
end

.sort(segments, inline = true) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/rbbt/segment.rb', line 77

# Orders a collection of segments.
#
# With +inline+ true (the default) the elements are sorted with a comparator:
#   * elements that are nil or have a nil offset compare equal to each other
#     and sort before every other element;
#   * two elements where neither range contains the other's offset are
#     ordered by offset;
#   * otherwise the shorter one (by segment_length) comes first.
#
# With +inline+ false the elements are ordered by offset (nil offsets count as
# 0 through to_i), highest offset first.
#
# Fix: the comparator used to call +offset+ on a nil element whenever exactly
# one of the two compared elements was nil (or one was nil and the other had a
# nil offset), raising NoMethodError although nil elements are explicitly
# meant to be handled. Each side is now checked for nil before its offset is
# read.
#
# segments - Array of segments (nil entries allowed when +inline+ is true).
# inline   - selects the comparator described above (true) or the plain
#            descending-offset order (false).
#
# Returns a new sorted Array.
def self.sort(segments, inline = true)
  if inline
    segments.sort do |a, b|
      a_unplaced = a.nil? || a.offset.nil?
      b_unplaced = b.nil? || b.offset.nil?

      if a_unplaced && b_unplaced
        0
      elsif a_unplaced
        -1
      elsif b_unplaced
        1
      elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
        # Disjoint at their starting points: plain positional order.
        a.offset.to_i <=> b.offset.to_i
      else
        # One starts inside the other: shorter segment first.
        a.segment_length <=> b.segment_length
      end
    end
  else
    # to_i already maps a nil offset to 0, so no extra fallback is needed.
    segments.sort_by { |segment| segment.offset.to_i }.reverse
  end
end

.split(text, segments, skip_segments = false) ⇒ Object



120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/rbbt/segment.rb', line 120

# Cuts +text+ into consecutive chunks along the given segments.
#
# The segments are first passed through clean_sort. For each one, the text
# that precedes it (if any) becomes a chunk, then - unless +skip_segments+ is
# true - the stretch covered by the segment becomes a chunk. Whatever text is
# left after the last segment becomes the final chunk. Every chunk is passed
# to Segment.setup with its offset relative to the start of the original text.
#
# Segments with a nil offset, or starting before the point already consumed,
# are skipped. Processing stops early once no text is left.
#
# text          - String to split (nil or empty yields no chunks).
# segments      - segments responding to +offset+ and +segment_length+.
# skip_segments - when true only the text between segments is returned.
#
# Returns an Array of chunk Strings in text order.
def self.split(text, segments, skip_segments = false)
  pieces    = []
  consumed  = 0     # characters of the original text already handled
  remaining = text  # tail of the text not yet handled

  clean_sort(segments).each do |segment|
    return pieces if remaining.nil? || remaining.empty?
    next if segment.offset.nil?

    # Position of the segment relative to the remaining text.
    start = segment.offset - consumed
    next if start < 0 # begins inside text already consumed

    if start > 0
      leading = remaining[0..start - 1]
      Segment.setup(leading, consumed)
      pieces << leading
    end

    stop = start + segment.segment_length - 1

    unless skip_segments
      covered = remaining[start..stop]
      Segment.setup(covered, consumed + start)
      pieces << covered
    end

    consumed += stop + 1
    remaining = remaining[stop + 1..-1]
  end

  unless remaining.nil? || remaining.empty?
    tail = remaining.dup
    Segment.setup(tail, consumed)
    pieces << tail
  end

  pieces
end

Instance Method Details

#eend ⇒ Object Also known as: end



57
58
59
# File 'lib/rbbt/segment.rb', line 57

# Position of the last element covered by this segment: the offset (nil is
# treated as 0 through to_i) plus the receiver's +length+, minus one.
#
# Returns an Integer.
def eend
  first_position = offset.to_i
  first_position + length - 1
end

#range ⇒ Object



63
64
65
# File 'lib/rbbt/segment.rb', line 63

# Span covered by this segment as an inclusive Range running from the offset
# (nil is treated as 0 through to_i) to +eend+.
#
# Returns a Range.
def range
  Range.new(offset.to_i, eend)
end

#segment_length ⇒ Object



52
53
54
# File 'lib/rbbt/segment.rb', line 52

# Length of this segment; simply the receiver's own +length+.
#
# Returns whatever +length+ returns for the receiver.
def segment_length
  length
end