Class: Phrasie::Extractor

Inherits:
Object
  • Object
show all
Defined in:
lib/phrasie/extractor.rb

Constant Summary collapse

SEARCH =

Simple state machine for use in the #phrases method.

0
NOUN =
1

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Extractor

Returns a new instance of Extractor.



9
10
11
12
# File 'lib/phrasie/extractor.rb', line 9

# Builds an extractor with a fresh Tagger and the phrase filter settings.
#
# @param options [Hash] construction options; only the :filter key is read
#   here. Its value (a Hash, if given) is merged over the defaults
#   :strength => 2 and :occur => 3.
def initialize(options={})
  self.tagger = Tagger.new

  filter_overrides = options[:filter] || {}
  filter_defaults  = {:strength => 2, :occur => 3}
  self.filter = filter_defaults.merge(filter_overrides)
end

Instance Attribute Details

#filter ⇒ Object

Returns the value of attribute filter.



7
8
9
# File 'lib/phrasie/extractor.rb', line 7

# Reader for the @filter instance variable.
#
# @return [Object] whatever is currently stored in @filter; #initialize
#   stores a Hash with :strength and :occur keys there, and #phrases
#   replaces it with a merged Hash when given filter overrides.
def filter
  @filter
end

#tagger ⇒ Object

Returns the value of attribute tagger.



7
8
9
# File 'lib/phrasie/extractor.rb', line 7

# Reader for the @tagger instance variable.
#
# @return [Object] whatever is currently stored in @tagger; #initialize
#   stores a new Tagger there, and #phrases calls its #tag method on
#   String input.
def tagger
  @tagger
end

Instance Method Details

#phrases(input, filter = nil) ⇒ Object

Returns an array of arrays in the format of:

[phrase, # of occurrences, # of words in phrase]


20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/phrasie/extractor.rb', line 20

# Extracts candidate phrases from raw text or from already-tagged terms.
#
# The tagged terms are walked with a two-state machine (SEARCH / NOUN).
# A noun-tagged term (tag starting with "N"), or a 'JJ'-tagged term whose
# first character equals its own upcased form, opens a run; following
# noun-tagged terms extend it. Every term in a run is passed to #add, and a
# run of more than one term is additionally counted as a multi-word phrase.
#
# @param input [String, Array] a String is tagged with #tagger; an Array is
#   used as-is and is expected to hold [term, tag, norm] triples. The Array
#   is only read, never modified.
# @param filter [Hash, nil] optional overrides merged into #filter. The
#   merged Hash is stored back on the instance, so it also applies to later
#   calls. If the merged :occur value contains a "%" it is replaced by 1% of
#   the number of tagged terms, rounded, with a minimum of 2.
# @return [Array<Array>] [phrase, occurrences, words in phrase] entries that
#   pass #validate, ordered by descending score
#   (occurrences + (occurrences / 5.0) * words); [] when input is neither a
#   String nor an Array.
def phrases(input, filter=nil)
  case input
  when String then tagged_terms = tagger.tag(input)
  when Array  then tagged_terms = input
  else return []
  end

  unless filter.nil?
    self.filter = self.filter.merge(filter)
    if self.filter[:occur].to_s[/%/]
      self.filter[:occur] = [tagged_terms.size * 0.01, 2].max.round
    end
  end

  terms = {}
  multiterm = []
  state = SEARCH

  # Counts a finished noun run as a phrase when it spans several terms.
  # Presumably #add appends [term, norm] pairs to the run (hence `first`) --
  # confirm against #add.
  count_run = lambda do |run|
    if run.size > 1
      phrase = run.map(&:first).join(' ')
      terms[phrase] = (terms[phrase] || 0) + 1
    end
  end

  # Iterate without consuming the array so a caller-supplied Array survives.
  tagged_terms.each do |term, tag, norm|
    noun = tag[0, 1] == "N"

    if state == SEARCH
      if noun || (tag == 'JJ' && term[0, 1].upcase == term[0, 1])
        state = NOUN
        add(term, norm, multiterm, terms)
      end
    elsif state == NOUN
      if noun
        add(term, norm, multiterm, terms)
      else
        state = SEARCH
        count_run.call(multiterm)
        multiterm = []
      end
    end
  end

  # A run that reaches the end of the input has no following non-noun term
  # to close it, so it has to be counted here.
  count_run.call(multiterm) if state == NOUN

  terms
    .map { |phrase, occurrences| [phrase, occurrences, phrase.split.size] }
    .select { |entry| validate(*entry) }
    .sort_by { |_phrase, occurrences, strength| occurrences + ((occurrences / 5.0) * strength) }
    .reverse
end

#to_s ⇒ Object



14
15
16
# File 'lib/phrasie/extractor.rb', line 14

# Fixed, compact string form of the extractor.
#
# @return [String] always "#<Phrasie::Extractor>"
def to_s
  '#<Phrasie::Extractor>'
end