Class: RSemantic::Parser
- Inherits:
-
Object
- Object
- RSemantic::Parser
- Defined in:
- lib/rsemantic/parser.rb
Instance Method Summary
-
#clean(string) ⇒ Object
Normalises a string: removes periods, collapses runs of whitespace into single spaces, and lowercases it.
-
#initialize(options = {}) ⇒ Parser
constructor
A new instance of Parser.
-
#remove_stop_words(list) ⇒ Object
Removes stop words (common words which have no search value) from a list of words when stop-word filtering is enabled.
- #tokenise_and_filter(string) ⇒ Object
- #tokenise_and_stem(string) ⇒ Object
Constructor Details
#initialize(options = {}) ⇒ Parser
Returns a new instance of Parser.
6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/rsemantic/parser.rb', line 6
#
# Builds a parser and records its filtering and stemming configuration.
#
# English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
# TODO: nicer way to reference stop file location?
#
# @param options [Hash] parser configuration
# @option options [Boolean] :filter_stop_words when truthy, the stop list is
#   loaded into @stopwords
# @option options [Boolean] :stem_words stored as-is in @stem_words
# @option options [String] :locale selects resources/<locale>.stop
#   (defaults to 'en'); only consulted when :filter_stop_words is truthy
# @raise [Errno::ENOENT] if filtering is enabled and the stop file is missing
def initialize(options = {})
  @filter_stop_words = options[:filter_stop_words]
  @stem_words = options[:stem_words]
  # Nothing to load from disk unless stop-word filtering was requested.
  return unless @filter_stop_words

  locale = options[:locale] || 'en'
  stop_file = "#{File.dirname(__FILE__)}/../../resources/#{locale}.stop"
  # The stop file is whitespace-separated; a Set gives fast membership tests.
  @stopwords = Set.new(File.read(stop_file).split)
end
Instance Method Details
#clean(string) ⇒ Object
Normalises a string: removes periods, collapses runs of whitespace into single spaces, and lowercases it.
26 27 28 29 30 31 |
# File 'lib/rsemantic/parser.rb', line 26
#
# Normalises a string for tokenising: strips every period, collapses each run
# of whitespace into a single space, and lowercases the result. Leading and
# trailing whitespace is collapsed, not trimmed. The argument is not mutated.
#
# @param string [String] raw text
# @return [String] a new, normalised string
def clean(string)
  string.delete('.').gsub(/\s+/, ' ').downcase
end
#remove_stop_words(list) ⇒ Object
Removes stop words (common words which have no search value) from a list of words when stop-word filtering is enabled.
34 35 36 37 38 39 40 |
# File 'lib/rsemantic/parser.rb', line 34
#
# Drops stop words (common words which have no search value) from +list+.
# When @filter_stop_words is falsy the list is returned untouched (the very
# same object); otherwise a new collection without the members of @stopwords
# is returned.
#
# @param list [Array<String>] candidate words
# @return [Array<String>] the words that are not in @stopwords
def remove_stop_words(list)
  return list unless @filter_stop_words

  list.reject { |word| @stopwords.include?(word) }
end
#tokenise_and_filter(string) ⇒ Object
20 21 22 23 |
# File 'lib/rsemantic/parser.rb', line 20
#
# Splits +string+ into words with #tokenise_and_stem, then passes the result
# through #remove_stop_words.
#
# @param string [String] raw text
# @return [Object] whatever #remove_stop_words returns for the token list
def tokenise_and_filter(string)
  remove_stop_words(tokenise_and_stem(string))
end
#tokenise_and_stem(string) ⇒ Object
42 43 44 45 46 47 48 49 50 51 |
# File 'lib/rsemantic/parser.rb', line 42
#
# Runs +string+ through #clean and splits the result on whitespace. When
# @stem_words is truthy, each token is replaced by the result of calling
# #stem on it.
# NOTE(review): #stem is not a core String method — presumably supplied by a
# stemmer extension loaded elsewhere; confirm.
#
# @param string [String] raw text
# @return [Array] the tokens, stemmed only when @stem_words is set
def tokenise_and_stem(string)
  tokens = clean(string).split(" ")
  return tokens unless @stem_words

  tokens.map { |token| token.stem }
end