Class: Gulp::PhraseExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/gulp/phrase_extractor.rb

Constant Summary collapse

ALLOWED_PHRASE_LENGTHS =
[2,3,4]
STOPWORDS =
%w(a an and except from has in into is made of one that the these this to with)

Instance Method Summary collapse

Instance Method Details

#extract(text) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/gulp/phrase_extractor.rb', line 6

# Extracts candidate multi-word phrases from +text+ and counts its words.
#
# The text is run through +preprocess_text+ and then +chunk_text+; each
# resulting chunk is split into whitespace-separated words. For every length
# in ALLOWED_PHRASE_LENGTHS, every run of that many consecutive words inside
# a single chunk is emitted as a phrase (words joined by one space), unless
# its first or last word is a stopword (compared case-insensitively against
# STOPWORDS). Phrases never span two chunks, and duplicates are kept.
#
# @param text [Object] raw input handed to +preprocess_text+
#   (presumably a String -- confirm against that helper)
# @return [Array(Integer, Array<String>)] the total number of words seen
#   across all chunks, followed by the extracted phrases, ordered by chunk,
#   then by phrase length, then by position within the chunk
def extract(text)
  word_count = 0
  phrases = []

  chunk_text(preprocess_text(text)).each do |string|
    # A chunk that starts with whitespace makes split(/\s+/) return a leading
    # "" element; discard it so it is neither counted as a word nor glued
    # into a phrase such as " foo".
    words = string.split(/\s+/).reject(&:empty?)
    next if words.empty?

    word_count += words.size

    ALLOWED_PHRASE_LENGTHS.each do |length|
      # each_cons yields nothing when the chunk has fewer words than +length+.
      words.each_cons(length) do |sub_phrase_words|
        next if STOPWORDS.include?(sub_phrase_words.first.downcase) ||
                STOPWORDS.include?(sub_phrase_words.last.downcase)

        phrases << sub_phrase_words.join(' ')
      end
    end
  end

  [word_count, phrases]
end