Class: Brill::Tagger

Inherits:

Object

Object
Brill::Tagger

show all

Defined in: lib/brill/tagger.rb

Instance Method Summary collapse

#adjectives(text) ⇒ Object

Given a body of text, returns a list of adjectives.
#initialize(lexicon = nil, lexical_rules = nil, contextual_rules = nil) ⇒ Tagger constructor

Uses the Brown corpus files as the default when no paths are given.
#noun_phrases(text) ⇒ Object

see: cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm.
#nouns(text) ⇒ Object

Given a body of text, returns a list of nouns.
#suggest(text, max = 10) ⇒ Object

Returns results similar to tag, but further reduced to nouns only.
#tag(text) ⇒ Object

Tags a body of text and returns an array like [[token,tag],…[token,tag]].

Constructor Details

#initialize(lexicon = nil, lexical_rules = nil, contextual_rules = nil) ⇒ `Tagger`

Uses the Brown corpus files as the default when no paths are given.

# File 'lib/brill/tagger.rb', line 10

# Builds the underlying BrillTagger and loads its three data files.
#
# lexicon          - path to the lexicon file (optional).
# lexical_rules    - path to the lexical rule file (optional).
# contextual_rules - path to the contextual rule file (optional).
#
# Any path left nil (or false) falls back to the matching file inside the
# "brown" directory that sits next to this source file.
def initialize(lexicon = nil, lexical_rules = nil, contextual_rules = nil)
  brown_dir = File.join(File.dirname(__FILE__), "brown")

  lexicon          = File.join(brown_dir, "LEXICON") unless lexicon
  lexical_rules    = File.join(brown_dir, "LEXICALRULEFILE") unless lexical_rules
  contextual_rules = File.join(brown_dir, "CONTEXTUALRULEFILE") unless contextual_rules

  @tagger = ::Tagger::BrillTagger.new

  # Same load order as before: lexicon, lexical rules, contextual rules.
  Brill::Tagger.load_lexicon(@tagger, lexicon)
  Brill::Tagger.load_lexical_rules(@tagger, lexical_rules)
  Brill::Tagger.load_contextual_rules(@tagger, contextual_rules)
end

Instance Method Details

#adjectives(text) ⇒ `Object`

Given a body of text, returns a list of adjectives.



22
23
24

# File 'lib/brill/tagger.rb', line 22

# Given a body of text, return the adjectives found in it.
#
# text - the String handed to #tag.
#
# Returns the [token, tag] pairs produced by #tag whose tag is exactly 'JJ'.
def adjectives(text)
  tagged = tag(text)
  tagged.select { |_token, pos| pos == 'JJ' }
end

#noun_phrases(text) ⇒ `Object`

see: cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm

# File 'lib/brill/tagger.rb', line 32

# Extract simple noun phrases: a determiner or possessive pronoun, the tokens
# up to the first proper noun (NNP), and any noun-tagged tokens that follow.
# see: cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/Tagger.pm
#
# text - the String to analyse; every non-word character is replaced by a
#        space before the text is handed to #tag.
#
# Returns an Array of phrases. Each phrase is a flat Array of alternating
# tokens and tags: [token, tag, token, tag, ...].
def noun_phrases(text)
  # Pattern fragment left by the original author: ?:$PREP|$DET|$NUM)
  # NOTE(review): presumably taken from the Perl reference above; only
  # DT / PRP$ style tags open a phrase here.
  tags = tag(text.gsub(/[^\w]/,' '))
  phrases = []
  phrase = []
  mark = -1 # index of the most recent phrase opener, -1 when there is none

  tags.each_with_index do |pair, i|
    if phrase.empty?
      mark = i if pair.last.match(/PRP\$|DT/)
      # A proper noun after an opener starts the phrase at the opener.
      phrase = tags[mark..i] if pair.last == 'NNP' && mark != -1
      # Forget an opener that lies more than 8 tokens back.
      mark = -1 if i - mark > 8
    elsif pair.last.match(/NN/)
      phrase += pair
    else
      phrases << phrase.flatten
      phrase = []
      mark = -1
    end
  end

  # A phrase that runs up to the last token has no following token to close
  # it, so it must be emitted here; previously it was silently dropped.
  phrases << phrase.flatten unless phrase.empty?
  phrases
end

#nouns(text) ⇒ `Object`

Given a body of text, returns a list of nouns.



27
28
29

# File 'lib/brill/tagger.rb', line 27

# Given a body of text, return the nouns found in it.
#
# text - the String handed to #tag.
#
# Returns the [token, tag] pairs produced by #tag whose tag contains "NN".
def nouns(text)
  tagged = tag(text)
  tagged.select { |_token, pos| pos.match(/NN/) }
end

#suggest(text, max = 10) ⇒ `Object`

Returns results similar to tag, but further reduced to nouns only.

# File 'lib/brill/tagger.rb', line 60

# Returns results similar to #tag, but reduced to nouns, with adjacent proper
# nouns (NNP) joined into a single name.
#
# text - the String handed to #tag.
# max  - the largest number of entries to return (default 10).
#
# Returns an Array of [token, tag] pairs when the number of matching nouns
# does not exceed +max+. Otherwise the nouns are de-duplicated and scored
# (NNP occurrences count 3, NNS 2, anything else 1) and an Array of
# [token, tag, score] triples with at most +max+ entries is returned.
def suggest( text, max = 10 )
  pending = [nil, nil] # proper-noun run being accumulated; [nil, nil] when none
  reduced_tags = []
  mappings = {} # single word => the joined proper name it is part of

  tag(text).each do |pair|
    if pending.last == 'NNP' && pair.last == 'NNP' && !pending.first.match(/\.$/)
      # join NNP's together for names
      pending[0] += " " + pair.first
      # Map every word of the joined name to the full name so that a later
      # stand-alone occurrence can be expanded.
      words = pending.first.split(/\s/)
      words.each { |word| mappings[word] = pending.first } if words.size > 1
    elsif pair.last == 'NNP'
      # A new name starts. Emit the pending one first (it ended in "." and
      # therefore could not be joined); previously it was overwritten and lost.
      reduced_tags << pending unless pending.first.nil?
      pending = pair
    elsif !pending.first.nil?
      reduced_tags << pending
      reduced_tags << pair if pair.last.match( /^\w+$/ ) && pair.first.match(/^\w+$/)
      pending = [nil, nil]
    elsif pair.last.match( /^\w+$/ ) && pair.first.match(/^\w+$/)
      reduced_tags << pair
    end
  end
  # A name that runs up to the last token has nothing after it to trigger the
  # flush above; previously it was dropped.
  reduced_tags << pending unless pending.first.nil?

  # now expand any NNP that appear
  expanded = reduced_tags.map do |pair|
    pair[0] = mappings[pair.first] if pair.last == 'NNP' && mappings.key?(pair.first)
    pair
  end

  results = expanded.select { |pair| pair.last.match(/NN/) && pair.first.size > 3 }
  return results unless results.size > max

  # Score each distinct token, keeping the first pair seen for it.
  counts = Hash.new(0)
  ranked = []
  results.each do |pair|
    ranked << pair if counts[pair.first] == 0
    counts[pair.first] += pair.last == 'NNP' ? 3 : (pair.last == 'NNS' ? 2 : 1)
  end
  ranked.map! { |pair| [pair.first, pair.last, counts[pair.first]] }

  # Raise the score threshold step by step until the list fits.
  # NOTE(review): a single step can remove many entries at once, so the result
  # may hold far fewer than +max+ entries (even none) - confirm this is wanted.
  threshold = 1
  until ranked.size <= max
    ranked = ranked.sort_by { |entry| entry.last }.select { |entry| entry.last > threshold }
    threshold += 1
    if threshold == 5
      # Keep the +max+ highest scores; the old inclusive range kept max + 1.
      ranked = ranked.reverse[0...max]
      break
    end
  end
  ranked
end

#tag(text) ⇒ `Object`

Tags a body of text and returns an array like [[token,tag],…[token,tag]]

# File 'lib/brill/tagger.rb', line 126

# Tag a body of text.
#
# text - the String to tag.
#
# Returns an Array of [token, tag] pairs, like [[token,tag],…[token,tag]].
def tag( text )
  # XXX: the list of contractions is much larger than this... find'em
  # The \b anchors restrict the repair to whole words, so that a word such as
  # "orthodontist" is no longer rewritten to "orthodon'tist". The capture
  # keeps the original capitalisation ("Dont" => "Don't", "Youre" => "You're").
  text = text.gsub(/\b([Dd])ont\b/, "\\1on't").gsub(/\b([Yy])oure\b/, "\\1ou're")
  tokens = Brill::Tagger.tokenize( text )
  tags = Brill::Tagger.tag_start( tokens )

  @tagger.apply_lexical_rules( tokens, tags, [], 0 )
  @tagger.default_tag_finish( tokens, tags )

  # Brill uses these fake "STAART" tags to delimit the start & end of sentence.
  # Two are added at each end of both lists, in place.
  [tokens, tags].each do |list|
    2.times do
      list.unshift "STAART"
      list << "STAART"
    end
  end

  @tagger.apply_contextual_rules( tokens, tags, 1 )

  # Remove the padding again so that only the real tokens remain.
  [tokens, tags].each do |list|
    list.shift(2)
    list.pop(2)
  end

  tokens.zip(tags)
end

Class: Brill::Tagger

Instance Method Summary collapse

Constructor Details

#initialize(lexicon = nil, lexical_rules = nil, contextual_rules = nil) ⇒ Tagger

Instance Method Details

#adjectives(text) ⇒ Object

#noun_phrases(text) ⇒ Object

#nouns(text) ⇒ Object