Module: Ariel::LabelUtils

Defined in:
lib/ariel/label_utils.rb

Overview

A set of methods for use when dealing with strings from labeled documents.

Constant Summary collapse

S_LABEL =
"<"
E_LABEL =
">"

Class Method Summary collapse

Class Method Details

.any_label_regex ⇒ Object

Helper function that returns a regex that will match any opening or closing label tag.



20
21
22
# File 'lib/ariel/label_utils.rb', line 20

# Returns a single regex that matches either of the two patterns produced by
# label_regex with its default arguments (the opening and the closing label
# tag pattern).
def self.any_label_regex
  opening, closing = label_regex
  Regexp.union(opening, closing)
end

.clean_string(string) ⇒ Object

Removes all labels such as <l:title> from the given string and returns the result.



26
27
28
# File 'lib/ariel/label_utils.rb', line 26

# Returns a copy of +string+ with every label tag (anything matched by
# any_label_regex, e.g. <l:title> or </l:title>) removed. The argument itself
# is left unmodified.
def self.clean_string(string)
  label_pattern = any_label_regex
  string.gsub(label_pattern, '')
end

.extract_labeled_region(structure, parent_extracted_node) ⇒ Object

Extracts the labeled region representing the given structure node from the parent_extracted_node. A new Node::Extracted is returned to be added as a child to the parent_extracted_node. Used when loading labeled documents.



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/ariel/label_utils.rb', line 33

# Finds the region(s) labeled for +structure+ in the tokenstream of
# +parent_extracted_node+ and wraps each one in a new Node::Extracted. Every
# new node is added as a child of the parent, yielded to the optional block,
# and collected into the returned array.
#
# For a :list_item structure every open/close tag pair becomes a child named
# "<node_name>_<n>" (n counting from 0); any other node type uses only the
# first pair and keeps the plain node name. Pairing stops at the first opening
# tag that has no closing partner or whose start lies beyond its end.
def self.extract_labeled_region(structure, parent_extracted_node)
  tokenstream = parent_extracted_node.tokenstream

  open_positions = []
  tokenstream.rewind
  while (position = skip_to_label_tag(tokenstream, structure.node_name, :open))
    open_positions << position
    break unless structure.node_type == :list_item
  end

  close_positions = []
  tokenstream.rewind
  while (position = skip_to_label_tag(tokenstream, structure.node_name, :closed))
    # Step back two positions to land on the token before the closing tag.
    close_positions << position - 2
    break unless structure.node_type == :list_item
  end

  extracted = []
  open_positions.each_with_index do |first, ordinal|
    last = close_positions[ordinal]
    break unless last && first <= last

    region = tokenstream.slice_by_token_index(first, last)
    child_name =
      if structure.node_type == :list_item
        "#{structure.node_name}_#{ordinal}"
      else
        structure.node_name
      end
    child = Node::Extracted.new(child_name, region, structure)
    extracted << child
    parent_extracted_node.add_child(child)
    yield child if block_given?
  end
  extracted
end

.label_regex(tag_contents = '\w+', namespace = 'l') ⇒ Object

Returns an array containing a pair of regular expressions to match a start label tag and an end label tag. If tag_contents is left at its default, the regular expressions will match any properly formatted label tag. The namespace to search for can also be changed. The returned regular expressions are case insensitive.



13
14
15
16
# File 'lib/ariel/label_utils.rb', line 13

# Builds the pair [opening_tag_regex, closing_tag_regex] for label tags of the
# form S_LABEL + "namespace:tag_contents" + E_LABEL, the closing variant having
# a "/" right after S_LABEL. Both arguments are interpolated into the pattern
# as-is, so +tag_contents+ may itself be a regex fragment (the default '\w+'
# accepts any word-character tag name). Both patterns are case insensitive.
def self.label_regex(tag_contents = '\w+', namespace = 'l')
  tag_tail = "#{namespace}:#{tag_contents}#{E_LABEL}"
  opening = /#{S_LABEL}#{tag_tail}/i
  closing = /#{S_LABEL}\/#{tag_tail}/i
  [opening, closing]
end