Class: TermExtract

Inherits:

Object

Object
TermExtract

show all

Defined in: lib/term-extract.rb

Overview

Based on the Python package topia.termextract: pypi.python.org/pypi/topia.termextract/

Constant Summary collapse

@@SEARCH =

@@NOUN =

@@TAGGER =

Brill::Tagger.new

Instance Attribute Summary collapse

#include_tags ⇒ Object

Returns the value of attribute include_tags.
#lazy ⇒ Object

Returns the value of attribute lazy.
#min_occurance ⇒ Object

Returns the value of attribute min_occurance.
#min_terms ⇒ Object

Returns the value of attribute min_terms.
#types ⇒ Object

Returns the value of attribute types.

Class Method Summary collapse

.extract(content, options = {}) ⇒ Object

Provide a class method for syntactic sugar.

Instance Method Summary collapse

#extract(content) ⇒ Object
#initialize(options = {}) ⇒ TermExtract constructor

A new instance of TermExtract.

Constructor Details

#initialize(options = {}) ⇒ `TermExtract`

Returns a new instance of TermExtract.

# File 'lib/term-extract.rb', line 21

# Builds a new extractor, reading its settings from +options+.
#
# The options hash is only read, never modified, so the caller can reuse the
# same hash (or pass a frozen one) for several extractors.
#
# @param options [Hash] optional settings, keyed by symbol
# @option options [Integer] :min_occurance (3) minimum number of times a
#   single word term must occur to be included in the results
# @option options [Integer] :min_terms (2) multiword terms made up of at
#   least this many words are always included
# @option options [Symbol] :types (:all) extract proper nouns (:nnp), nouns
#   (:nn) or both (:all)
# @option options [Boolean] :include_tags (false) include the extracted POS
#   tags in the results
# @option options [Boolean] :collapse_terms (true) remove shorter terms that
#   are part of larger ones
def initialize(options = {})
  @min_occurance = options.fetch(:min_occurance, 3)
  @min_terms = options.fetch(:min_terms, 2)
  @types = options.fetch(:types, :all)
  @include_tags = options.fetch(:include_tags, false)
  @collapse_terms = options.fetch(:collapse_terms, true)
  # The :lazy option is currently commented out, so @lazy is never assigned here.
  # @lazy = options.fetch(:lazy, false)
end

Instance Attribute Details

#include_tags ⇒ `Object`

Returns the value of attribute include_tags.



13
14
15

# File 'lib/term-extract.rb', line 13

# Reader for the +include_tags+ setting.
#
# @return [Object] the current value of @include_tags
def include_tags; @include_tags; end

#lazy ⇒ `Object`

Returns the value of attribute lazy.



13
14
15

# File 'lib/term-extract.rb', line 13

# Reader for the +lazy+ setting.
#
# @return [Object] the current value of @lazy (nil while nothing has
#   assigned it)
def lazy; @lazy; end

#min_occurance ⇒ `Object`

Returns the value of attribute min_occurance.



13
14
15

# File 'lib/term-extract.rb', line 13

# Reader for the +min_occurance+ setting.
#
# @return [Object] the current value of @min_occurance
def min_occurance; @min_occurance; end

#min_terms ⇒ `Object`

Returns the value of attribute min_terms.



13
14
15

# File 'lib/term-extract.rb', line 13

# Reader for the +min_terms+ setting.
#
# @return [Object] the current value of @min_terms
def min_terms; @min_terms; end

#types ⇒ `Object`

Returns the value of attribute types.



13
14
15

# File 'lib/term-extract.rb', line 13

# Reader for the +types+ setting.
#
# @return [Object] the current value of @types
def types; @types; end

Class Method Details

.extract(content, options = {}) ⇒ `Object`

Provide a class method for syntactic sugar

# File 'lib/term-extract.rb', line 16

# Convenience shortcut: builds an extractor configured with +options+ and
# immediately runs it over +content+.
#
# @param content [String] text to extract terms from
# @param options [Hash] settings passed unchanged to the constructor
# @return [Object] whatever the instance-level #extract returns
def self.extract(content, options = {})
  new(options).extract(content)
end

Instance Method Details

#extract(content) ⇒ `Object`

# File 'lib/term-extract.rb', line 35

# Extracts noun terms (single words and multi-word phrases) from +content+.
#
# The text is POS-tagged and then scanned with a two-state machine
# (@@SEARCH: looking for a noun, @@NOUN: inside a run of nouns) that collects
# nouns and runs of adjacent nouns. Candidates are then filtered by
# @min_occurance / @min_terms and, when @collapse_terms is set, shorter terms
# contained in longer ones are dropped.
#
# The caller's string is not modified.
#
# @param content [String] the text to analyse
# @return [Hash] term => occurrence count, or term => the entry stored for
#   that term (which carries :occurances) when @include_tags is set
def extract(content)
  # Tidy content punctuation: add a space after periods. The non-destructive
  # gsub leaves the caller's string untouched and accepts frozen strings.
  text = content.gsub(/([A-Za-z0-9])\./, '\1. ')

  # Assign POS tags and tidy tag stack
  tagger = @@TAGGER || Brill::Tagger.new
  tags = preprocess_tags(tagger.tag(text))

  # POS tags that identify nouns; built once instead of once per token
  noun_pattern =
    case @types
    when :nn then /^(NN|NNS)$/
    when :nnp then /^(NNP|NNPS)$/
    else /^NN/
    end

  terms = {}
  multiterm = []
  last_tag = ''
  state = @@SEARCH

  # Iterate through term list and identify nouns
  tags.each do |term, tag|
    if state == @@SEARCH && tag =~ noun_pattern
      # In search mode, found a noun
      state = @@NOUN
      add_term(term, tag, multiterm, terms)
    elsif state == @@SEARCH && tag == 'JJ' && term =~ /^[A-Z]/ # and @lazy
      # Allow things like 'Good' at the start of sentences
      state = @@NOUN
      add_term(term, tag, multiterm, terms)
    elsif state == @@NOUN && tag == 'POS'
      # Allow nouns with apostrophes : St Paul's Cathedral
      multiterm << [term, tag]
    elsif state == @@NOUN && last_tag =~ /^(NNP|NNPS)$/ && tag == 'IN' && term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
      # Allow preposition : "Secretary of State"
      # Only use when in NNP mode
      # NOTE(review): the pattern is unanchored, so any IN-tagged word merely
      # containing one of these fragments also matches - confirm that is intended.
      multiterm << [term, tag]
    elsif state == @@NOUN && tag =~ /^NN/
      # In noun mode, found a noun, add a multiterm noun
      add_term(term, tag, multiterm, terms)
    elsif state == @@NOUN
      # In noun mode, found a non-noun (every tag starting with NN was handled
      # by the branch above), do we have a possible multiterm ?
      state = @@SEARCH
      add_multiterm(multiterm, terms) if multiterm.length > 1
      multiterm = []
    end
    last_tag = tag
  end

  # Check the last term wasn't a possible multiterm
  add_multiterm(multiterm, terms) if last_tag =~ noun_pattern

  # Filter out terms that don't meet minimum requirements
  # It's possible for a term with multiple words to be returned even if it doesn't
  # meet the min_occurance requirements (as a multiterm noun is very likely to be
  # correct)
  terms.delete_if do |term, entry|
    occur = entry[:occurances]
    strength = term.split(/ /).length
    occur < 1 || !((strength == 1 && occur >= @min_occurance) || strength >= @min_terms)
  end

  # Remove shorter terms that form part of larger terms
  # This typically removes surname references when we already have a full name
  # This doesn't test that the larger term has more occurrences than the smaller
  # term as testing has shown issues with this approach
  if @collapse_terms
    terms.each_key do |longer|
      terms.each_key do |shorter|
        next unless longer.length > shorter.length

        # The shorter term must sit at the very end or very start of the
        # longer one, separated from the rest by a non-alphanumeric character
        escaped = Regexp.escape(shorter)
        terms.delete(shorter) if longer =~ /[^A-Za-z0-9]#{escaped}$/ || longer =~ /^#{escaped}[^A-Za-z0-9]/
      end
    end
  end

  # Filter out tags unless required
  unless @include_tags
    terms.each_key { |term| terms[term] = terms[term][:occurances] }
  end
  terms
end

Class: TermExtract

Overview

Constant Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ TermExtract

Instance Attribute Details

#include_tags ⇒ Object

#lazy ⇒ Object

#min_occurance ⇒ Object

#min_terms ⇒ Object

#types ⇒ Object