Class: Basset::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/basset/parser.rb

Class Method Summary collapse

Class Method Details

.clean_text(text) ⇒ Object



15
16
17
18
# File 'lib/basset/parser.rb', line 15

# Normalizes raw text for tokenizing.
#
# Every character that is not a word character (per the regexp class \W),
# every digit, and every underscore is replaced with a single space, one
# for one, so the string length is preserved. The result is lower-cased.
#
# @param text [String] the raw input text
# @return [String] a new string of lower-case letters and spaces
def self.clean_text(text)
  # \W, \d and "_" together cover everything except letters.
  blanked = text.gsub(/[\W\d_]/, ' ')
  blanked.downcase
end

.ngrams(unigrams, n) ⇒ Object



9
10
11
12
13
# File 'lib/basset/parser.rb', line 9

# Builds n-grams from a token list by joining each run of +n+ consecutive
# tokens with an underscore.
#
# @param unigrams [Array<String>] the tokens, in order
# @param n [Integer] how many consecutive tokens go into each gram
# @return [Array<String>] the joined grams; empty when there are fewer
#   than +n+ tokens
def self.ngrams(unigrams, n)
  unigrams.each_cons(n).map { |window| window.join("_") }
end

.parse(text, options = {}) ⇒ Object



2
3
4
5
6
7
# File 'lib/basset/parser.rb', line 2

# Turns +text+ into a flat list of tokens.
#
# The text is first normalized with +clean_text+ and split on whitespace to
# obtain the unigrams. When <tt>options[:ngrams]</tt> is 2 or more, the
# grams of every size from 2 up to that value (built by +ngrams+) are
# appended after the unigrams, smallest size first.
#
# @param text [String] the raw input text
# @param options [Hash] optional settings
# @option options [Integer] :ngrams largest gram size to generate; when the
#   key is missing or nil, 1 is used (unigrams only)
# @return [Array<String>] the unigrams followed by any higher-order grams
def self.parse(text, options = {})
  unigrams = clean_text(text).split

  # Named max_n (not ngrams) so the local does not shadow the ngrams method.
  max_n = options[:ngrams] || 1
  unigrams + (2..max_n).flat_map { |n| ngrams(unigrams, n) }
end