Class: Phrasie::Tagger

Inherits:
Object
  • Object
show all
Includes:
Rules
Defined in:
lib/phrasie/tag.rb

Constant Summary collapse

TERM_SPEC =
/([^a-zA-Z]*)([a-zA-Z\-\.]*[a-zA-Z])([^a-zA-Z]*[a-zA-Z]*)/

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Rules

#correctDefaultNounTag, #determineVerbAfterModal, #normalizePluralForms, #verifyProperNounAtSentenceStart

Constructor Details

#initialize(options = {}) ⇒ Tagger

Returns a new instance of Tagger.



7
8
9
10
11
12
# File 'lib/phrasie/tag.rb', line 7

# Builds a tagger and loads its term => tag lexicon from a text file.
#
# options - Hash of optional settings:
#           :language - name used to pick the default lexicon file
#                       (defaults to 'english').
#           :lexicon  - path of the lexicon file to read (defaults to
#                       data/<language>-lexicon.txt beside this source file).
#
# Each lexicon line is split on whitespace; the first field is the term and
# the second its tag, any further fields are ignored.
#
# Raises a SystemCallError (e.g. Errno::ENOENT) from File.read when the
# lexicon file cannot be read.
def initialize(options={})
  self.language = options[:language] || 'english'
  self.lexicon = options[:lexicon] || File.expand_path("#{__FILE__}/../data/#{self.language}-lexicon.txt")

  entries = File.read(self.lexicon).split("\n").map { |line| line.split.first(2) }

  # Blank lines produce an empty pair, which makes Hash[] raise, and
  # single-field lines would store a nil tag. Neither carries a usable tag,
  # so only complete [term, tag] pairs are kept.
  self.tags_by_term = Hash[entries.select { |entry| entry.size == 2 }]
end

Instance Attribute Details

#language ⇒ Object

Returns the value of attribute language.



5
6
7
# File 'lib/phrasie/tag.rb', line 5

# Reader for the language attribute: hands back whatever is currently
# stored in @language (nil if it has not been assigned).
def language
  return @language
end

#lexicon ⇒ Object

Returns the value of attribute lexicon.



5
6
7
# File 'lib/phrasie/tag.rb', line 5

# Reader for the lexicon attribute: hands back whatever is currently
# stored in @lexicon (nil if it has not been assigned).
def lexicon
  return @lexicon
end

#tags_by_term ⇒ Object

Returns the value of attribute tags_by_term.



5
6
7
# File 'lib/phrasie/tag.rb', line 5

# Reader for the tags_by_term attribute: hands back whatever is currently
# stored in @tags_by_term (nil if it has not been assigned).
def tags_by_term
  return @tags_by_term
end

Instance Method Details

#tag(input) ⇒ Object

Takes an array from #tokenize, or a string which it pipes through #tokenize,

and returns the words with part-of-speech tags.


35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/phrasie/tag.rb', line 35

# Attaches a part-of-speech tag to every term of the input.
#
# input - a String, which is first split with #tokenize, or an Array that is
#         used as the list of terms as-is.
#
# Every term starts out as a [term, tag, term] triple, where the tag is
# looked up in tags_by_term and falls back to "NND" when the term has no
# entry. The rule methods are then called for each position in turn.
#
# Returns the Array of triples, or an empty Array when input is neither a
# String nor an Array.
def tag(input)
  terms =
    case input
    when String then self.tokenize(input)
    when Array then input
    else return []
    end

  tagged_terms = terms.map do |term|
    [term, self.tags_by_term[term] || "NND", term]
  end

  # Rule methods are defined in rules.rb and are applied in this order.
  rules = [
    :correctDefaultNounTag,
    :verifyProperNounAtSentenceStart,
    :determineVerbAfterModal,
    :normalizePluralForms
  ]

  tagged_terms.each_with_index do |tagged_term, id|
    rules.each do |rule|
      # A rule returns [id, tagged_term, tagged_terms]; the returned id is
      # assigned first, so the returned triple is stored at that index.
      id, tagged_terms[id], tagged_terms = self.send(rule, id, tagged_term, tagged_terms)
    end
  end

  tagged_terms
end

#tokenize(text) ⇒ Object

Takes some input text and outputs an array of the words contained in it.



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/phrasie/tag.rb', line 15

# Breaks text into an Array of terms.
#
# The text is split on single whitespace characters and empty pieces are
# dropped. Each remaining piece is matched once against TERM_SPEC: when it
# matches, the non-empty capture groups of that first match are emitted in
# order; when it does not, the piece is emitted unchanged.
def tokenize(text)
  pieces = text.split(/\s/).reject { |piece| piece == '' }

  pieces.flat_map do |piece|
    found = TERM_SPEC.match(piece)
    if found
      found.captures.reject { |part| part == '' }
    else
      [piece]
    end
  end
end