Class: EngTagger

Inherits:

Object

Object
EngTagger

show all

Extended by:: BoundedSpaceMemoizable

Defined in:: lib/engtagger.rb,
lib/engtagger/version.rb

Overview

English part-of-speech tagger class

Constant Summary collapse

DEFAULT_LEXPATH = File paths

File.join(File.dirname(__FILE__), "engtagger")

DEFAULT_WORDPATH =

File.join(DEFAULT_LEXPATH, "pos_words.hash")

DEFAULT_TAGPATH =

File.join(DEFAULT_LEXPATH, "pos_tags.hash")

NUM = Regexps to match XML-style part-of-speech tags

get_ext("cd")

GER =

get_ext("vbg")

ADJ =

get_ext("jj[rs]*")

NN =

get_ext("nn[sp]*")

NNP =

get_ext("nnp")

PREP =

get_ext("in")

DET =

get_ext("det")

PAREN =

get_ext("[lr]rb")

QUOT =

get_ext("ppr")

SEN =

get_ext("pp")

WORD =

get_ext("\w+")

VB =

get_ext("vb")

VBG =

get_ext("vbg")

VBD =

get_ext("vbd")

PART =

get_ext("vbn")

VBP =

get_ext("vbp")

VBZ =

get_ext("vbz")

JJ =

get_ext("jj")

JJR =

get_ext("jjr")

JJS =

get_ext("jjs")

RB =

get_ext("rb")

RBR =

get_ext("rbr")

RBS =

get_ext("rbs")

RP =

get_ext("rp")

WRB =

get_ext("wrb")

WDT =

get_ext("wdt")

WP =

get_ext("wp")

WPS =

get_ext("wps")

CC =

get_ext("cc")

IN =

get_ext("in")

TAGS =

VERSION =

"0.4.1"

Instance Attribute Summary collapse

#conf ⇒ Object
Hash storing config values:.

Class Method Summary collapse

.explain_tag(tag) ⇒ String
Convert a Treebank-style, abbreviated tag into verbose definitions.
.get_ext(tag = nil) ⇒ Object
Return a regexp from a string argument that matches an XML-style pos tag.
.hmm ⇒ Hash
Return a class variable that holds probability data.
.lexicon ⇒ Hash
Return a class variable that holds lexical data.

Instance Method Summary collapse

#add_tags(text, verbose = false) ⇒ String
Examine the string provided and return it fully tagged in XML style.
#get_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_adverbs(tagged) ⇒ Hash
The hash of matches.
#get_base_present_verbs(tagged) ⇒ Hash
The hash of matches.
#get_comparative_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_conjunctions(tagged) ⇒ Hash
Returns all types of conjunctions and does not discriminate between the various kinds.
#get_gerund_verbs(tagged) ⇒ Hash
The hash of matches.
#get_infinitive_verbs(tagged) ⇒ Hash
The hash of matches.
#get_interrogatives(tagged) ⇒ Hash (also: #get_question_parts)
The hash of matches.
#get_max_noun_phrases(tagged) ⇒ Hash
Given a POS-tagged text, this method returns only the maximal noun phrases.
#get_noun_phrases(tagged) ⇒ Hash
Similar to get_words, but requires a POS-tagged text as an argument.
#get_nouns(tagged) ⇒ Hash
Given a POS-tagged text, this method returns all nouns and their occurrence frequencies.
#get_passive_verbs(tagged) ⇒ Hash
The hash of matches.
#get_past_tense_verbs(tagged) ⇒ Hash
The hash of matches.
#get_present_verbs(tagged) ⇒ Hash
The hash of matches.
#get_proper_nouns(tagged) ⇒ Object
Given a POS-tagged text, this method returns a hash of all proper nouns and their occurrence frequencies.
#get_readable(text, verbose = false) ⇒ Object
Return an easy-on-the-eyes tagged version of a text string.
#get_sentences(text) ⇒ Object
Return an array of sentences (without POS tags) from a text.
#get_superlative_adjectives(tagged) ⇒ Hash
The hash of matches.
#get_verbs(tagged) ⇒ Hash
Returns all types of verbs and does not discriminate between the various kinds.
#get_words(text) ⇒ Object
Given a text string, return as many nouns and noun phrases as possible.
#initialize(params = {}) ⇒ EngTagger constructor
Take a hash of parameters that override default values.
#install ⇒ Object
Reads some included corpus data and saves it in a stored hash on the local file system.
#tag_pairs(text) ⇒ Array
Return an array of pairs of the form ["word", :tag].

Methods included from BoundedSpaceMemoizable

memoize

Constructor Details

#initialize(params = {}) ⇒ `EngTagger`

Take a hash of parameters that override default values. See above for details.

# File 'lib/engtagger.rb', line 190

# Build a tagger: start from the default settings, overlay any caller
# overrides, then populate the class-level tables @@lexicon and @@hmm.
#
# When both @conf[:word_path] and @conf[:tag_path] exist, the tables are read
# from those files with Marshal; otherwise both start out as empty hashes.
# Finally @@mnp is set from get_max_noun_regex.
#
# @param params [Hash, nil] settings merged over the defaults below; a
#   nil/false value is ignored
def initialize(params = {})
  @conf = {
    unknown_word_tag: "",
    stem: false,
    weight_noun_phrases: false,
    longest_noun_phrase: 5,
    relax: false,
    tag_lex: "tags.yml",
    word_lex: "words.yml",
    unknown_lex: "unknown.yml",
    word_path: DEFAULT_WORDPATH,
    tag_path: DEFAULT_TAGPATH,
    debug: false,
    # assuming that we start analyzing from the beginning of a new sentence...
    current_tag: "pp"
  }
  @conf.merge!(params) if params

  if File.exist?(@conf[:word_path]) && File.exist?(@conf[:tag_path])
    # The block form of File.open closes each handle even when Marshal.load
    # raises, so a corrupt file no longer leaks an open descriptor.
    # NOTE: Marshal.load can instantiate arbitrary objects; :word_path and
    # :tag_path must only ever point at trusted files.
    @@lexicon = File.open(@conf[:word_path], "r") { |file| Marshal.load(file) }
    @@hmm = File.open(@conf[:tag_path], "r") { |file| Marshal.load(file) }
  else
    print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
    @@hmm = {}
    @@lexicon = {}
  end
  @@mnp = get_max_noun_regex
end

Instance Attribute Details

#conf ⇒ `Object`

Hash storing config values:

:unknown_word_tag => (String) Tag to assign to unknown words
:stem => (Boolean) Stem single words using Porter module
:weight_noun_phrases => (Boolean) When returning occurrence counts for a noun phrase, multiply the value by the number of words in the NP.
:longest_noun_phrase => (Integer) Will ignore noun phrases longer than this threshold. This affects only the get_words() and get_nouns() methods.
:relax => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for uncommon words, particularly words used polysemously
:tag_lex => (String) Name of the YAML file containing a hash of adjacent part of speech tags and the probability of each
:word_lex => (String) Name of the YAML file containing a hash of words and corresponding parts of speech
:unknown_lex => (String) Name of the YAML file containing a hash of tags for unknown words and corresponding parts of speech
:tag_path => (String) Directory path of tag_lex
:word_path => (String) Directory path of word_lex and unknown_lex
:debug => (Boolean) Print debug messages



182
183
184

# File 'lib/engtagger.rb', line 182

# Reader for the tagger's configuration.
#
# @return [Hash] the @conf hash itself (not a copy), so changes made by the
#   caller are seen by the tagger
def conf
  @conf
end

Class Method Details

.explain_tag(tag) ⇒ `String`

Convert a Treebank-style, abbreviated tag into verbose definitions

Parameters:

tag (#to_s) —
the tag in question

Returns:

(String) —
the definition, if available

# File 'lib/engtagger.rb', line 95

# Look up the verbose definition of an abbreviated tag.
#
# The tag is stringified and lower-cased before the TAGS lookup, so symbols
# and upper-case input are accepted.
#
# @param tag [#to_s] the tag in question
# @return [String] the TAGS entry when there is one, otherwise the
#   normalized tag itself
def self.explain_tag(tag)
  normalized = tag.to_s.downcase
  definition = TAGS[normalized]
  return normalized unless definition

  definition
end

.get_ext(tag = nil) ⇒ `Object`

Return a regexp from a string argument that matches an XML-style pos tag

# File 'lib/engtagger.rb', line 52

# Build a regexp matching one XML-style POS element, e.g. <nn>dog</nn>,
# together with any whitespace that follows it.
#
# The tag is interpolated as a regexp fragment, not escaped, so callers may
# pass patterns such as "jj[rs]*".
#
# @param tag [String, nil] tag name or tag pattern
# @return [Regexp, nil] the matcher, or nil when no tag is given
def self.get_ext(tag = nil)
  return nil unless tag

  # A regexp literal keeps \s as the whitespace class. Inside a
  # double-quoted string "\s" is a plain space, which limited the trailing
  # match to spaces and missed tabs and newlines.
  %r{<#{tag}>[^<]+</#{tag}>\s*}
end

.hmm ⇒ `Hash`

Return a class variable that holds probability data.

Returns:

(Hash) —
the probability data



39
40
41

# File 'lib/engtagger.rb', line 39

# Reader for the class variable @@hmm, which is assigned when an EngTagger
# is constructed.
#
# @return [Hash] the probability data
def self.hmm
  @@hmm
end

.lexicon ⇒ `Hash`

Return a class variable that holds lexical data.

Returns:

(Hash) —
the lexicon



47
48
49

# File 'lib/engtagger.rb', line 47

# Reader for the class variable @@lexicon, which is assigned when an
# EngTagger is constructed.
#
# @return [Hash] the lexicon
def self.lexicon
  @@lexicon
end

Instance Method Details

#add_tags(text, verbose = false) ⇒ `String`

Examine the string provided and return it fully tagged in XML style.

Examine the string provided and return it fully tagged in XML style

Parameters:

text (String) —
the input text
verbose (false, true) (defaults to: false) —
whether to use verbose tags

Returns:

(String) —
the marked-up string

# File 'lib/engtagger.rb', line 253

# Tag every token of the text and return the result as XML-style markup.
#
# Each token's tag is chosen by assign_tag from the previous tag
# (@conf[:current_tag]) and the cleaned token; a missing or empty tag falls
# back to "nn". The tagger state is reset once all tokens are processed.
#
# @param text [String] the input text
# @param verbose [Boolean] when true, tags are expanded with
#   EngTagger.explain_tag
# @return [String, nil] the marked-up string, or nil when valid_text rejects
#   the input
def add_tags(text, verbose = false)
  return unless valid_text(text)

  marked_up = clean_text(text).map do |token|
    normalized = clean_word(token)
    pos = assign_tag(@conf[:current_tag], normalized)
    pos = "nn" if !pos || pos == ""
    # The short tag is always what feeds the next prediction, even in
    # verbose mode.
    @conf[:current_tag] = pos
    label = verbose ? EngTagger.explain_tag(pos) : pos
    "<#{label}>#{token}</#{label}>"
  end
  reset
  marked_up.join(" ")
end

#get_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 448

# Collect the elements of a tagged text that match the JJ pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJ])
  build_matches_hash(trimmed)
end

#get_adverbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 481

# Collect the elements of a tagged text that match any of the RB, RBR, RBS
# or RP patterns.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_adverbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [RB, RBR, RBS, RP])
  build_matches_hash(trimmed)
end

#get_base_present_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 426

# Collect the elements of a tagged text that match the VBP pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_base_present_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBP])
  build_matches_hash(trimmed)
end

#get_comparative_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 459

# Collect the elements of a tagged text that match the JJR pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_comparative_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJR])
  build_matches_hash(trimmed)
end

#get_conjunctions(tagged) ⇒ `Hash`

Returns all types of conjunctions and does not discriminate between the various kinds. E.g. coordinating, subordinating, correlative...

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 510

# Collect the elements of a tagged text that match the CC or IN patterns,
# without distinguishing between the two.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_conjunctions(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [CC, IN])
  build_matches_hash(trimmed)
end

#get_gerund_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 404

# Collect the elements of a tagged text that match the VBG pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_gerund_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBG])
  build_matches_hash(trimmed)
end

#get_infinitive_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 382

# Collect the elements of a tagged text that match the VB pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_infinitive_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VB])
  build_matches_hash(trimmed)
end

#get_interrogatives(tagged) ⇒ `Hash` Also known as: get_question_parts

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 492

# Collect the elements of a tagged text that match any of the WRB, WDT, WP
# or WPS patterns.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_interrogatives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [WRB, WDT, WP, WPS])
  build_matches_hash(trimmed)
end

#get_max_noun_phrases(tagged) ⇒ `Hash`

Given a POS-tagged text, this method returns only the maximal noun phrases. May be called directly, but is also used by get_noun_phrases.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 523

# Count the maximal noun phrases of a tagged text, i.e. the spans matched by
# the @@mnp pattern.
#
# Phrases without whitespace are passed through stem before counting;
# entries that are empty or whitespace-only are not counted.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] phrase => occurrence count (default 0), or nil when
#   valid_text rejects the input
def get_max_noun_phrases(tagged)
  return unless valid_text(tagged)

  build_trimmed(tagged, [@@mnp]).each_with_object(Hash.new(0)) do |phrase, counts|
    # only single words are stemmed
    candidate = (phrase =~ /\s/) ? phrase : stem(phrase)
    next if candidate =~ /\A\s*\z/

    counts[candidate] += 1
  end
end

#get_noun_phrases(tagged) ⇒ `Hash`

Similar to get_words, but requires a POS-tagged text as an argument.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 541

# Count noun phrases, and the single nouns inside them, in a tagged text.
#
# Works in three passes:
# 1. scan for maximal noun phrases (@@mnp); count any that contain a
#    PREP/DET/NUM element as a whole and split them at those elements;
# 2. for every resulting fragment, count each multi-word suffix of the
#    fragment and each single token matching NN;
# 3. strip the tags, drop phrases longer than @conf[:longest_noun_phrase]
#    words, stem single words, and optionally weight counts by word count.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] phrase => (weighted) count with default 0, or nil when
#   valid_text rejects the input
def get_noun_phrases(tagged)
  return unless valid_text(tagged)

  extender = /(?:#{PREP}|#{DET}|#{NUM})+/xo
  tally = Hash.new(0)

  # Pass 1: record extended MNPs whole, then break them at the extenders.
  fragments = tagged.scan(@@mnp).flat_map do |maximal|
    tally[maximal] += 1 if extender =~ maximal
    maximal.split(extender)
  end

  # Pass 2: walk each fragment, dropping the leading token every round so
  # that every multi-word suffix and every single noun gets counted.
  fragments.each do |fragment|
    tokens = fragment.split
    until tokens.empty?
      tally[tokens.join(" ")] += 1 if tokens.length > 1
      head = tokens.shift
      tally[head] += 1 if head =~ /#{NN}/
    end
  end

  # Pass 3: convert tagged keys to plain text and apply length limit,
  # stemming and weighting.
  tally.each_with_object(Hash.new(0)) do |(marked_up, occurrences), result|
    phrase = strip_tags(marked_up)
    size = phrase.scan(/\s+/).length + 1
    # Throttle MNPs if necessary
    next if size > @conf[:longest_noun_phrase]

    phrase = stem(phrase) if size <= 1 # stem single words
    # Weighting by the word count favors long noun phrases
    weight = @conf[:weight_noun_phrases] ? size : 1
    result[phrase] += weight * occurrences
  end
end

#get_nouns(tagged) ⇒ `Hash`

Given a POS-tagged text, this method returns all nouns and their occurrence frequencies.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 356

# Collect the elements of a tagged text that match the NN pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_nouns(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [NN])
  build_matches_hash(trimmed)
end

#get_passive_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 415

# Collect the elements of a tagged text that match the PART pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_passive_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [PART])
  build_matches_hash(trimmed)
end

#get_past_tense_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 393

# Collect the elements of a tagged text that match the VBD pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_past_tense_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBD])
  build_matches_hash(trimmed)
end

#get_present_verbs(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 437

# Collect the elements of a tagged text that match the VBZ pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_present_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VBZ])
  build_matches_hash(trimmed)
end

#get_proper_nouns(tagged) ⇒ `Object`

Given a POS-tagged text, this method returns a hash of all proper nouns and their occurrence frequencies. The method is greedy and will return multi-word phrases, if possible, so it would find ``Linguistic Data Consortium'' as a single unit, rather than as three individual proper nouns. This method does not stem the found words.

# File 'lib/engtagger.rb', line 321

# Collect the proper nouns (NNP pattern) of a tagged text with their counts,
# then fold the count of a matching "acronym" entry into each name of three
# or more words.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches after acronym folding, or nil when
#   valid_text rejects the input
def get_proper_nouns(tagged)
  return unless valid_text(tagged)

  counts = build_matches_hash(build_trimmed(tagged, [NNP]))
  counts.each_key do |name|
    parts = name.split(/\s/)
    # Only names of at least three words are treated as candidates for
    # having an acronym elsewhere in the hash.
    next unless parts.length > 2

    # Naive acronym: first letter of every all-lowercase word; words that do
    # not match contribute nothing.
    # NOTE(review): the initials are joined with a space ("l d c"), which
    # only matches a key stored in exactly that form -- confirm this is
    # intended rather than an unseparated acronym.
    initials = parts.map { |part| part[/\A([a-z])[a-z]*\z/, 1] }.join(" ")
    next unless counts[initials]

    # The acronym has been seen: move its count onto the full name.
    counts[name] += counts[initials]
    counts.delete(initials)
  end
  counts
end

#get_readable(text, verbose = false) ⇒ `Object`

Return an easy-on-the-eyes tagged version of a text string. Applies add_tags and reformats to be easier to read.

# File 'lib/engtagger.rb', line 289

# Tag a text with add_tags and rewrite each <tag>word</tag> element as
# word/TAG, with the tag name upper-cased.
#
# @param text [String] the input text
# @param verbose [Boolean] forwarded to add_tags
# @return [String, nil] the reformatted text, or nil when valid_text rejects
#   the input
def get_readable(text, verbose = false)
  return unless valid_text(text)

  add_tags(text, verbose).gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
    found = Regexp.last_match
    "#{found[1]}/#{found[2].upcase}"
  end
end

#get_sentences(text) ⇒ `Object`

Return an array of sentences (without POS tags) from a text.

# File 'lib/engtagger.rb', line 299

# Split a text into sentences and return them without POS tags.
#
# The text is tagged with add_tags, cut after every </pp> element, and each
# piece is run through strip_tags. Punctuation tokens are then re-attached
# to their neighbouring words.
#
# The substitutions are chained so that each one builds on the previous
# result. gsub is non-destructive, so calling it five times on the same
# string kept only the final substitution. The patterns are regexp
# literals because "\W" inside a double-quoted string is the letter "W".
#
# @param text [String] the input text
# @return [Array<String>, nil] the sentences, or nil when valid_text rejects
#   the input
def get_sentences(text)
  return nil unless valid_text(text)

  add_tags(text).split(%r{</pp>}).map do |line|
    strip_tags(line)
      .gsub(/ ('s?) /, '\1 ')  # possessive/apostrophe joins the word before it
      .gsub(/ (\W+) /, '\1 ')  # inner punctuation joins the word before it
      .gsub(/ (`+) /, ' \1')   # opening backquotes join the word after them
      .gsub(/ (\W+)$/, '\1')   # final punctuation joins the last word
      .gsub(/^(`+) /, '\1')    # leading backquotes join the first word
  end
end

#get_superlative_adjectives(tagged) ⇒ `Hash`

Returns the hash of matches.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 470

# Collect the elements of a tagged text that match the JJS pattern.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_superlative_adjectives(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [JJS])
  build_matches_hash(trimmed)
end

#get_verbs(tagged) ⇒ `Hash`

Returns all types of verbs and does not discriminate between the various kinds. Combines all other verb methods listed in this class.

Parameters:

tagged (String) —
the tagged text

Returns:

(Hash) —
the hash of matches

# File 'lib/engtagger.rb', line 370

# Collect the elements of a tagged text that match any verb pattern: VB,
# VBD, VBG, PART, VBP or VBZ.
#
# @param tagged [String] the tagged text
# @return [Hash, nil] the hash of matches, or nil when valid_text rejects
#   the input
def get_verbs(tagged)
  return unless valid_text(tagged)

  trimmed = build_trimmed(tagged, [VB, VBD, VBG, PART, VBP, VBZ])
  build_matches_hash(trimmed)
end

#get_words(text) ⇒ `Object`

Given a text string, return as many nouns and noun phrases as possible. Applies add_tags and involves three stages:

Tag the text
Extract all the maximal noun phrases
Recursively extract all noun phrases from the MNPs

# File 'lib/engtagger.rb', line 276

# Tag a raw text and extract its nouns or noun phrases.
#
# When @conf[:longest_noun_phrase] is 1 or less only single nouns are
# collected (get_nouns); otherwise get_noun_phrases is used.
#
# @param text [String] the untagged input text
# @return [Hash, nil, false] the extraction result, or false when valid_text
#   rejects the input (note: false, not nil as in the other getters)
def get_words(text)
  return false unless valid_text(text)

  tagged = add_tags(text)
  return get_nouns(tagged) if @conf[:longest_noun_phrase] <= 1

  get_noun_phrases(tagged)
end

#install ⇒ `Object`

Reads some included corpus data and saves it in a stored hash on the local file system. This is called automatically if the tagger can't find the stored lexicon.

# File 'lib/engtagger.rb', line 586

# Load the tag and word data named in the configuration and store the
# resulting tables on disk with Marshal.
#
# load_tags is called with @conf[:tag_lex], load_words with
# @conf[:word_lex] and then @conf[:unknown_lex]. Afterwards @@lexicon is
# dumped to @conf[:word_path] and @@hmm to @conf[:tag_path], overwriting any
# existing files.
def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
  %i[word_lex unknown_lex].each { |lex| load_words(@conf[lex]) }
  File.open(@conf[:word_path], "w") { |out| Marshal.dump(@@lexicon, out) }
  File.open(@conf[:tag_path], "w") { |out| Marshal.dump(@@hmm, out) }
end

#tag_pairs(text) ⇒ `Array`

Return an array of pairs of the form ["word", :tag].

Parameters:

text (String) —
the input text

Returns:

(Array) —
the tagged words

# File 'lib/engtagger.rb', line 230

# Tag every token of the text and return [token, tag] pairs.
#
# Each tag is chosen by assign_tag from the previous tag
# (@conf[:current_tag]) and the cleaned token; a missing or empty tag falls
# back to "nn". Tags are returned as symbols.
#
# @param text [String] the input text
# @return [Array<Array>] pairs of the form ["word", :tag]; an empty array
#   when valid_text rejects the input
def tag_pairs(text)
  return [] unless valid_text(text)

  pairs = []
  clean_text(text).each do |token|
    normalized = clean_word(token)
    pos = assign_tag(@conf[:current_tag], normalized)
    pos = "nn" if !pos || pos.empty?
    @conf[:current_tag] = pos
    pairs << [token, pos.to_sym]
  end

  # put the tagger back into its start-of-text state
  reset

  pairs
end

Class: EngTagger

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from BoundedSpaceMemoizable

Constructor Details

#initialize(params = {}) ⇒ EngTagger

Instance Attribute Details

#conf ⇒ Object

Class Method Details

.explain_tag(tag) ⇒ String

.get_ext(tag = nil) ⇒ Object

.hmm ⇒ Hash

.lexicon ⇒ Hash

Instance Method Details

#add_tags(text, verbose = false) ⇒ String

#get_adjectives(tagged) ⇒ Hash

#get_adverbs(tagged) ⇒ Hash

#get_base_present_verbs(tagged) ⇒ Hash

#get_comparative_adjectives(tagged) ⇒ Hash

#get_conjunctions(tagged) ⇒ Hash

#get_gerund_verbs(tagged) ⇒ Hash

#get_infinitive_verbs(tagged) ⇒ Hash

#get_interrogatives(tagged) ⇒ Hash Also known as: get_question_parts

#get_max_noun_phrases(tagged) ⇒ Hash

#get_noun_phrases(tagged) ⇒ Hash

#get_nouns(tagged) ⇒ Hash

#get_passive_verbs(tagged) ⇒ Hash

#get_past_tense_verbs(tagged) ⇒ Hash

#get_present_verbs(tagged) ⇒ Hash

#get_proper_nouns(tagged) ⇒ Object

#get_readable(text, verbose = false) ⇒ Object

#get_sentences(text) ⇒ Object

#get_superlative_adjectives(tagged) ⇒ Hash

#get_verbs(tagged) ⇒ Hash

#get_words(text) ⇒ Object

#install ⇒ Object

#tag_pairs(text) ⇒ Array

#initialize(params = {}) ⇒ `EngTagger`

#conf ⇒ `Object`

.explain_tag(tag) ⇒ `String`

.get_ext(tag = nil) ⇒ `Object`

.hmm ⇒ `Hash`

.lexicon ⇒ `Hash`

#add_tags(text, verbose = false) ⇒ `String`

#get_adjectives(tagged) ⇒ `Hash`

#get_adverbs(tagged) ⇒ `Hash`

#get_base_present_verbs(tagged) ⇒ `Hash`

#get_comparative_adjectives(tagged) ⇒ `Hash`

#get_conjunctions(tagged) ⇒ `Hash`

#get_gerund_verbs(tagged) ⇒ `Hash`

#get_infinitive_verbs(tagged) ⇒ `Hash`

#get_interrogatives(tagged) ⇒ `Hash` Also known as: get_question_parts

#get_max_noun_phrases(tagged) ⇒ `Hash`

#get_noun_phrases(tagged) ⇒ `Hash`

#get_nouns(tagged) ⇒ `Hash`

#get_passive_verbs(tagged) ⇒ `Hash`

#get_past_tense_verbs(tagged) ⇒ `Hash`

#get_present_verbs(tagged) ⇒ `Hash`

#get_proper_nouns(tagged) ⇒ `Object`

#get_readable(text, verbose = false) ⇒ `Object`

#get_sentences(text) ⇒ `Object`

#get_superlative_adjectives(tagged) ⇒ `Hash`

#get_verbs(tagged) ⇒ `Hash`

#get_words(text) ⇒ `Object`

#install ⇒ `Object`

#tag_pairs(text) ⇒ `Array`