Class: IMatch
- Inherits:
- Object
- Inheritance hierarchy: Object → IMatch
- Defined in:
- lib/imatch.rb,
lib/lexicon.rb
Defined Under Namespace
Classes: Lexicon
Constant Summary collapse
- VERSION =
'0.1.0'
- DEFAULT_LEXICON_FILE =
File.join(File.dirname(__FILE__), 'data', 'en.dat')
- DEFAULT_NUMBER_OF_LEXICONS =
0
- DEFAULT_LEXICON_FRACTION =
0.66
Instance Method Summary collapse
- #initialize(file = DEFAULT_LEXICON_FILE, options = {}) ⇒ IMatch (constructor)
Returns a new instance of IMatch.
- #lexicon ⇒ Object
- #multiple_signatures(string, tokenize = /\s+/) ⇒ Object
- #signature(string, tokenize = /\s+/, lexicon = nil) ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize(file = DEFAULT_LEXICON_FILE, options = {}) ⇒ IMatch
Returns a new instance of IMatch.
16 17 18 19 20 21 22 23 24 25 26 27 28 |
# File 'lib/imatch.rb', line 16
#
# Builds a matcher around the lexicon loaded from +file+.
#
# @param file [String] path handed to IMatch::Lexicon.new
# @param options [Hash] optional settings
# @option options [Enumerable] :stop_words tokens to ignore; converted to a Set (default: empty)
# @option options [Object] :stemming any truthy value enables stemming
# @option options [#to_i] :lexicons how many lexicon subsets to build
#   (default: DEFAULT_NUMBER_OF_LEXICONS)
# @option options [#to_f] :lexicon_fraction value passed to Lexicon#subset for each subset
#   (default: DEFAULT_LEXICON_FRACTION)
def initialize(file = DEFAULT_LEXICON_FILE, options = {})
  @lexicon = IMatch::Lexicon.new(file).freeze
  @stop_words = (options[:stop_words] || []).to_set
  @should_stem = !!options[:stemming]
  @number_of_lexicons = (options[:lexicons] || DEFAULT_NUMBER_OF_LEXICONS).to_i
  @lexicon_fraction = (options[:lexicon_fraction] || DEFAULT_LEXICON_FRACTION).to_f

  # One extra lexicon per requested subset; presumably each call to
  # Lexicon#subset yields a different sample -- confirm in lib/lexicon.rb.
  @subsets = []
  if @number_of_lexicons > 0
    @number_of_lexicons.times { @subsets << @lexicon.subset(@lexicon_fraction) }
  end
end
Instance Method Details
#lexicon ⇒ Object
70 71 72 |
# File 'lib/imatch.rb', line 70
#
# Reader for the @lexicon instance variable.
#
# @return [Object] whatever is currently stored in @lexicon
def lexicon
  @lexicon
end
#multiple_signatures(string, tokenize = /\s+/) ⇒ Object
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/imatch.rb', line 30
#
# Computes the signature of +string+ against the default lexicon and then
# against every lexicon in @subsets, returning the distinct results.
#
# @param string [String, nil] text to fingerprint
# @param tokenize [Regexp, String] separator forwarded to #signature
# @return [Set] every truthy signature produced; empty when none was
def multiple_signatures(string, tokenize = /\s+/)
  # Default lexicon first, then each subset, in order.
  results = [signature(string, tokenize)]
  @subsets.each { |subset| results << signature(string, tokenize, subset) }

  # Drop the lexicons that produced no signature.
  results.each_with_object(Set.new) do |sig, found|
    found << sig if sig
  end
end
#signature(string, tokenize = /\s+/, lexicon = nil) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/imatch.rb', line 46
#
# Fingerprints +string+ from the tokens that survive filtering.
#
# Each token is downcased, stemmed when stemming is enabled and the token
# responds to #stem, and then kept only if it is not a stop word and is
# present in the lexicon. The surviving tokens are de-duplicated, sorted
# and handed to #finger_print.
#
# @param string [String, nil] text to fingerprint
# @param tokenize [Regexp, String] separator passed to String#split
# @param lexicon [#include?, nil] lexicon to filter against; falls back to @lexicon
# @return [Object, nil] the result of #finger_print, or nil when +string+ is
#   nil, splits into no tokens, or has no usable tokens
def signature(string, tokenize = /\s+/, lexicon = nil)
  return nil unless string

  tokens = string.split(tokenize)
  return nil if tokens.empty?

  current_lexicon = lexicon || @lexicon
  usable_tokens = Set.new
  tokens.each do |t|
    token = t.downcase
    token = token.stem if @should_stem && token.respond_to?(:stem)
    next if @stop_words.include?(token)
    next unless current_lexicon.include?(token)

    usable_tokens << token
  end
  return nil if usable_tokens.empty?

  # Sorting makes the fingerprint independent of token order in the input.
  finger_print(usable_tokens.to_a.sort)
end
#to_s ⇒ Object
74 75 76 |
# File 'lib/imatch.rb', line 74
#
# Renders the matcher as an XML-like string: an <IMatch> element carrying the
# stemming flag and the stop-word count as attributes, wrapping the string
# form of the lexicon.
#
# @return [String]
def to_s
  # Interpolation already calls #to_s on @lexicon, so no explicit coercion.
  %(<IMatch stemming="#{@should_stem}" stop_word_count="#{@stop_words.size}">#{@lexicon}</IMatch>)
end