Class: TermExtract

Inherits:
Object
  • Object
show all
Defined in:
lib/term-extract.rb

Overview

Constant Summary collapse

# Scanner state used by #extract: not inside a term, looking for a tag that
# can start one. This is the state #extract begins in.
@@SEARCH =
0
# Scanner state used by #extract: a term has been started and following
# nouns, possessives and selected prepositions may extend it.
@@NOUN =
1
# Shared Brill::Tagger instance; #extract uses it whenever it is non-nil
# instead of building a new tagger.
@@TAGGER =
Brill::Tagger.new

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ TermExtract

Returns a new instance of TermExtract.



21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/term-extract.rb', line 21

# Builds an extractor configured from +options+.
#
# Every recognised key is removed from the +options+ hash that was passed in;
# a key that is absent falls back to the default shown below. A key that is
# present with a nil/false value is honoured as given.
#
#   :min_occurance  - minimum number of times a single word term must occur
#                     to be included in the results (default 3)
#   :min_terms      - multiword terms with at least this many words are
#                     always included (default 2)
#   :types          - extract proper nouns (:nnp), nouns (:nn) or both (:all)
#                     (default :all)
#   :include_tags   - include the extracted POS tags in the results
#                     (default false)
#   :collapse_terms - remove shorter terms that are part of larger ones
#                     (default true)
def initialize(options = {})
  # Hash#delete only runs its block when the key is missing, so each line
  # both consumes the option and supplies its default.
  @min_occurance  = options.delete(:min_occurance)  { 3 }
  @min_terms      = options.delete(:min_terms)      { 2 }
  @types          = options.delete(:types)          { :all }
  @include_tags   = options.delete(:include_tags)   { false }
  @collapse_terms = options.delete(:collapse_terms) { true }
  # The :lazy option is currently disabled, so @lazy is left unset:
  #@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
end

Instance Attribute Details

#include_tags ⇒ Object

Returns the value of attribute include_tags.



13
14
15
# File 'lib/term-extract.rb', line 13

# Reader for @include_tags. The constructor takes it from
# options[:include_tags] (default false). Unless it is truthy, #extract
# replaces each term's entry with just its occurrence count before returning.
def include_tags
  @include_tags
end

#lazy ⇒ Object

Returns the value of attribute lazy.



13
14
15
# File 'lib/term-extract.rb', line 13

# Reader for @lazy. The constructor's assignment to @lazy is commented out,
# so this returns nil unless the variable is set somewhere outside the
# constructor.
def lazy
  @lazy
end

#min_occurance ⇒ Object

Returns the value of attribute min_occurance.



13
14
15
# File 'lib/term-extract.rb', line 13

# Reader for @min_occurance: the minimum occurrence count a single-word term
# needs for #extract to keep it. The constructor takes it from
# options[:min_occurance] (default 3).
def min_occurance
  @min_occurance
end

#min_terms ⇒ Object

Returns the value of attribute min_terms.



13
14
15
# File 'lib/term-extract.rb', line 13

# Reader for @min_terms: #extract keeps any term made of at least this many
# space-separated words, provided it occurred at least once, regardless of
# min_occurance. The constructor takes it from options[:min_terms]
# (default 2).
def min_terms
  @min_terms
end

#types ⇒ Object

Returns the value of attribute types.



13
14
15
# File 'lib/term-extract.rb', line 13

# Reader for @types: selects which POS tags #extract accepts as the start of
# a term. :nn limits it to NN/NNS, :nnp to NNP/NNPS; any other value
# (the constructor default is :all) accepts every tag beginning with NN.
def types
  @types
end

Class Method Details

.extract(content, options = {}) ⇒ Object

Provide a class method for syntactic sugar



16
17
18
19
# File 'lib/term-extract.rb', line 16

# Class-level shortcut: builds an instance from +options+ and immediately
# runs its #extract on +content+, returning whatever that call returns.
def self.extract(content, options = {})
  new(options).extract(content)
end

Instance Method Details

#extract(content) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/term-extract.rb', line 35

# Extracts candidate noun terms from +content+.
#
# The text is POS-tagged, then a two-state scan (@@SEARCH / @@NOUN) walks the
# [term, tag] pairs: nouns are recorded through add_term, and runs of more
# than one collected word are recorded through add_multiterm. The collected
# terms are then filtered by @min_occurance / @min_terms, optionally
# collapsed (@collapse_terms) and optionally reduced to plain occurrence
# counts (unless @include_tags).
#
# content - the String to analyse. It is not modified.
#
# Returns a Hash keyed by term. Values are the :occurances count of each term,
# or, when @include_tags is truthy, the full per-term entry as stored by
# add_term/add_multiterm.
def extract(content)
  # Tidy content punctuation: add a space after periods.
  # Work on a copy; gsub! would alter the caller's string and raises on a
  # frozen one.
  content = content.gsub(/([A-Za-z0-9])\./, '\1. ')

  # Assign POS tags and tidy tag stack
  tagger = @@TAGGER || Brill::Tagger.new
  tags = preprocess_tags(tagger.tag(content))

  # POS tags that may start a term, compiled once rather than per token
  noun_tag =
    case @types
    when :nn  then /^(NN|NNS)$/
    when :nnp then /^(NNP|NNPS)$/
    else /^NN/
    end

  terms = {}
  multiterm = []
  last_tag = ''
  state = @@SEARCH

  # Iterate through term list and identify nouns
  tags.each do |term, tag|
    if state == @@SEARCH && tag =~ noun_tag
      # In search mode, found a noun
      state = @@NOUN
      add_term(term, tag, multiterm, terms)
    elsif state == @@SEARCH && tag == 'JJ' && term =~ /^[A-Z]/ #and @lazy
      # Allow things like 'Good' at the start of sentences
      state = @@NOUN
      add_term(term, tag, multiterm, terms)
    elsif state == @@NOUN && tag == 'POS'
      # Allow nouns with apostrophes : St Paul's Cathedral
      multiterm << [term, tag]
    elsif state == @@NOUN && last_tag =~ /^(NNP|NNPS)$/ && tag == 'IN' && term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
      # Allow preposition : "Secretary of State"
      # Only applies directly after a proper noun
      # NOTE(review): the pattern is unanchored, so any IN-tagged word that
      # merely contains one of these substrings also matches - confirm intent.
      multiterm << [term, tag]
    elsif state == @@NOUN && tag =~ /^NN/
      # In noun mode, found a noun, add a multiterm noun
      add_term(term, tag, multiterm, terms)
    elsif state == @@NOUN
      # In noun mode, found a non-noun, do we have a possible multiterm ?
      # Every NN* tag was consumed by the branch above, so no further tag test
      # is needed. (This used to read `tag !=~ /.../`, which Ruby parses as
      # `tag != (~ /.../)`, a comparison against a match on $_.)
      state = @@SEARCH
      add_multiterm(multiterm, terms) if multiterm.length > 1
      multiterm = []
    end
    last_tag = tag
  end

  # Check the last term wasn't a possible multiterm
  # NOTE(review): unlike the in-loop flush this is not guarded by
  # multiterm.length > 1 - confirm add_multiterm copes with a single word.
  add_multiterm(multiterm, terms) if last_tag =~ noun_tag

  # Filter out terms that don't meet minimum requirements
  # It's possible for a term with multiple words to be returned even if it doesn't
  # meet the min_occurance requirements (as a multiterm noun is very likely to be
  # correct)
  terms.delete_if do |term, entry|
    occurrences = entry[:occurances]
    word_count = term.split(/ /).length
    next true if occurrences < 1
    !((word_count == 1 && occurrences >= @min_occurance) || word_count >= @min_terms)
  end

  # Remove shorter terms that form part of larger terms
  # This typically removes surname references when we already have a full name
  # This doesn't test that the larger term has more occurrences than the smaller
  # term as testing has shown issues with this approach
  if @collapse_terms
    # Iterate over a snapshot of the keys so the hash is not modified while it
    # is being traversed; terms removed along the way are skipped as "longer"
    # candidates, exactly as a live traversal would skip them.
    candidates = terms.keys
    candidates.each do |longer|
      next unless terms.key?(longer)
      candidates.each do |shorter|
        next unless longer.length > shorter.length
        escaped = Regexp.escape(shorter)
        # The shorter term must sit at the very end or very start of the
        # longer one, separated from the rest by a non-alphanumeric character
        if longer =~ /[^A-Za-z0-9]#{escaped}$/ || longer =~ /^#{escaped}[^A-Za-z0-9]/
          terms.delete(shorter)
        end
      end
    end
  end

  # Filter out tags unless required
  unless @include_tags
    terms.each_key { |term| terms[term] = terms[term][:occurances] }
  end
  terms
end