Class: Phrasie::Tagger
Constant Summary collapse
- TERM_SPEC =
/([^a-zA-Z]*)([a-zA-Z\-\.]*[a-zA-Z])([^a-zA-Z]*[a-zA-Z]*)/
Instance Attribute Summary collapse
-
#language ⇒ Object
Returns the value of attribute language.
-
#lexicon ⇒ Object
Returns the value of attribute lexicon.
-
#tags_by_term ⇒ Object
Returns the value of attribute tags_by_term.
Instance Method Summary collapse
-
#initialize(options = {}) ⇒ Tagger
constructor
A new instance of Tagger.
-
#tag(input) ⇒ Object
Takes an array from #tokenize, or a string which it pipes through #tokenize, and returns the words with part-of-speech tags.
-
#tokenize(text) ⇒ Object
Takes some input text and outputs an array of the words contained in it.
Methods included from Rules
#correctDefaultNounTag, #determineVerbAfterModal, #normalizePluralForms, #verifyProperNounAtSentenceStart
Constructor Details
#initialize(options = {}) ⇒ Tagger
Returns a new instance of Tagger.
7 8 9 10 11 12 |
# File 'lib/phrasie/tag.rb', line 7
#
# Builds a tagger and loads its lexicon into +tags_by_term+.
#
# @param options [Hash] optional settings
#   * +:language+ - language name used to build the default lexicon file name
#     (defaults to 'english')
#   * +:lexicon+  - path of the lexicon file to read (defaults to
#     data/<language>-lexicon.txt resolved relative to this source file)
# @return [Tagger] a new instance of Tagger
# @raise [SystemCallError] from File.read when the lexicon file cannot be read
def initialize(options = {})
  self.language = options[:language] || 'english'
  # NOTE(review): the method name after `File.` was lost when this listing was
  # extracted; `expand_path` is a reconstruction -- confirm against
  # lib/phrasie/tag.rb.
  self.lexicon = options[:lexicon] ||
                 File.expand_path("#{__FILE__}/../data/#{language}-lexicon.txt")

  file = File.read(lexicon)

  # Each lexicon line contributes its first two whitespace-separated fields as
  # a key/value pair. A blank line would produce an empty pair, which Hash[]
  # refuses on Ruby 3.0+, so such lines are skipped.
  pairs = file.split("\n").map { |line| line.split.first(2) }.reject(&:empty?)
  self.tags_by_term = Hash[pairs]
end
Instance Attribute Details
#language ⇒ Object
Returns the value of attribute language.
5 6 7 |
# File 'lib/phrasie/tag.rb', line 5
#
# Reader for the +language+ attribute.
#
# @return [Object] the current value of @language (nil until assigned)
def language
  @language
end
#lexicon ⇒ Object
Returns the value of attribute lexicon.
5 6 7 |
# File 'lib/phrasie/tag.rb', line 5
#
# Reader for the +lexicon+ attribute.
#
# @return [Object] the current value of @lexicon (nil until assigned)
def lexicon
  @lexicon
end
#tags_by_term ⇒ Object
Returns the value of attribute tags_by_term.
5 6 7 |
# File 'lib/phrasie/tag.rb', line 5
#
# Reader for the +tags_by_term+ attribute.
#
# @return [Object] the current value of @tags_by_term (nil until assigned)
def tags_by_term
  @tags_by_term
end
Instance Method Details
#tag(input) ⇒ Object
Takes an array from #tokenize, or a string which it pipes through #tokenize,
and returns the words with part-of-speech tags.
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/phrasie/tag.rb', line 35
#
# Takes an array from #tokenize, or a string which it pipes through #tokenize,
# and returns the words with part-of-speech tags.
#
# @param input [String, Array] raw text, or an already tokenized list of terms
# @return [Array<Array>] one [term, tag, term] entry per term, after the rules
#   have been applied; an empty array when +input+ is neither a String nor an
#   Array
def tag(input)
  terms =
    case input
    when String then tokenize(input)
    when Array  then input
    end
  return [] if terms.nil?

  # Terms missing from the lexicon start out with the "NND" tag. The third
  # slot initially holds the term itself.
  # NOTE(review): presumably the rules rewrite that slot (e.g.
  # normalizePluralForms) -- confirm in rules.rb.
  tagged_terms = terms.map { |term| [term, tags_by_term[term] || "NND", term] }

  # These rules are defined in rules.rb
  rules = [
    'correctDefaultNounTag',
    'verifyProperNounAtSentenceStart',
    'determineVerbAfterModal',
    'normalizePluralForms'
  ]

  tagged_terms.each_with_index do |tagged_term, id|
    rules.each do |rule|
      # Every rule gets the index, the entry yielded for this position and the
      # whole list, and hands back replacements for all three. The statement
      # is kept verbatim: its result depends on the order of these assignments.
      id, tagged_terms[id], tagged_terms = send(rule.to_sym, id, tagged_term, tagged_terms)
    end
  end

  tagged_terms
end
#tokenize(text) ⇒ Object
Takes some input text and outputs an array of the words contained in it.
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/phrasie/tag.rb', line 15
#
# Takes some input text and outputs an array of the words contained in it.
#
# The text is split on whitespace and each chunk is matched against TERM_SPEC.
# The non-empty capture groups of the match become separate tokens, in order.
# A chunk that TERM_SPEC does not match is passed through unchanged.
#
# Only the first TERM_SPEC match in a chunk is used, so any characters that
# follow that match are not emitted (e.g. "a,b,c" yields "a" and ",b").
#
# @param text [String] raw input text
# @return [Array<String>] tokens in order of appearance
def tokenize(text)
  chunks = text.split(/\s/).reject(&:empty?)

  chunks.flat_map do |chunk|
    match = TERM_SPEC.match(chunk)
    next [chunk] unless match

    match.captures.reject(&:empty?)
  end
end