Class: Raingrams::Model

Inherits:
Object show all
Includes:
Helpers::Commonality, Helpers::Frequency, Helpers::Probability, Helpers::Random, Helpers::Similarity
Defined in:
lib/raingrams/model.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Helpers::Random

#random_gram, #random_gram_sentence, #random_ngram, #random_paragraph, #random_sentence, #random_text

Methods included from Helpers::Commonality

#common_ngrams_from_fragment, #common_ngrams_from_sentence, #common_ngrams_from_text, #common_ngrams_from_words, #fragment_commonality, included, #sentence_commonality, #text_commonality

Methods included from Helpers::Similarity

#fragment_similarity, included, #sentence_similarity, #text_similarity

Methods included from Helpers::Probability

#fragment_probability, #probabilities_for, #probability_of_ngram, #probability_of_ngrams, #sentence_probability, #text_probability

Methods included from Helpers::Frequency

#frequencies_for, #frequency_of_ngram, #frequency_of_ngrams

Constructor Details

#initialize(options = {}, &block) ⇒ Model

Creates a new Model with the specified options.

options must contain the following keys:

:ngram_size

The size of each gram.

options may contain the following keys:

:ignore_case

Defaults to false.

:ignore_punctuation

Defaults to true.

:ignore_urls

Defaults to true.

:ignore_phone_numbers

Defaults to false.

:ignore_references

Defaults to false.



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/raingrams/model.rb', line 59

#
# Sets up an empty model from +options+.
#
# +options+ must provide <tt>:ngram_size</tt>. The flags
# <tt>:ignore_case</tt> (false), <tt>:ignore_punctuation</tt> (true),
# <tt>:ignore_urls</tt> (true), <tt>:ignore_phone_numbers</tt> (false) and
# <tt>:ignore_references</tt> (false) fall back to the default shown in
# parentheses when their key is absent. If a block is given, it is called
# with the new model after all state has been initialised.
#
def initialize(options={},&block)
  @ngram_size = options[:ngram_size]

  # boundary ngrams, built from Tokens.start / Tokens.stop multiplied by
  # the ngram size
  @starting_ngram = Ngram.new(Tokens.start * @ngram_size)
  @stoping_ngram = Ngram.new(Tokens.stop * @ngram_size)

  # Hash#fetch only uses the fallback when the key is missing, so an
  # explicit nil or false supplied by the caller is kept as given
  @ignore_case = options.fetch(:ignore_case, false)
  @ignore_punctuation = options.fetch(:ignore_punctuation, true)
  @ignore_urls = options.fetch(:ignore_urls, true)
  @ignore_phone_numbers = options.fetch(:ignore_phone_numbers, false)
  @ignore_references = options.fetch(:ignore_references, false)

  @prefixes = {}

  yield self if block_given?
end

Instance Attribute Details

#ignore_caseObject (readonly)

Ignore case of parsed text



30
31
32
# File 'lib/raingrams/model.rb', line 30

# Whether the case of parsed text is ignored (set by the constructor from
# the :ignore_case option).
def ignore_case
  @ignore_case
end

#ignore_phone_numbersObject (readonly)

Ignore Phone numbers



39
40
41
# File 'lib/raingrams/model.rb', line 39

# Whether phone numbers are removed from parsed sentences (set by the
# constructor from the :ignore_phone_numbers option).
def ignore_phone_numbers
  @ignore_phone_numbers
end

#ignore_punctuationObject (readonly)

Ignore the punctuation of parsed text



33
34
35
# File 'lib/raingrams/model.rb', line 33

# Whether punctuation is ignored when parsing text (set by the constructor
# from the :ignore_punctuation option).
def ignore_punctuation
  @ignore_punctuation
end

#ignore_referencesObject (readonly)

Ignore References



42
43
44
# File 'lib/raingrams/model.rb', line 42

# Whether bracketed numeric references are removed from parsed sentences
# (set by the constructor from the :ignore_references option).
def ignore_references
  @ignore_references
end

#ignore_urlsObject (readonly)

Ignore URLs



36
37
38
# File 'lib/raingrams/model.rb', line 36

# Whether URLs are removed from parsed text (set by the constructor from
# the :ignore_urls option).
def ignore_urls
  @ignore_urls
end

#ngram_sizeObject (readonly)

Size of ngrams to use



21
22
23
# File 'lib/raingrams/model.rb', line 21

# The size of the ngrams used by the model, as given by the :ngram_size
# option of the constructor.
def ngram_size
  @ngram_size
end

#prefixesObject (readonly)

Probabilities of all (n-1) grams



45
46
47
# File 'lib/raingrams/model.rb', line 45

# The Hash that maps each ngram prefix to its table of following grams.
# This is the model's internal Hash itself, not a copy.
def prefixes
  @prefixes
end

#starting_ngramObject (readonly)

The sentence starting ngram



24
25
26
# File 'lib/raingrams/model.rb', line 24

# The sentence starting ngram, built by the constructor from Tokens.start.
def starting_ngram
  @starting_ngram
end

#stoping_ngramObject (readonly)

The sentence stopping ngram



27
28
29
# File 'lib/raingrams/model.rb', line 27

# The sentence stopping ngram, built by the constructor from Tokens.stop.
def stoping_ngram
  @stoping_ngram
end

Class Method Details

.build(options = {}, &block) ⇒ Object

Creates a new model object with the given options. If a block is given, it will be passed the newly created model. After the block has been called, the model will be built.



100
101
102
103
104
# File 'lib/raingrams/model.rb', line 100

#
# Instantiates a model with +options+ and, from inside the constructor's
# block, calls the new model's #build with the caller's block.
# Returns whatever +new+ returns.
#
def self.build(options={},&block)
  new(options) { |model| model.build(&block) }
end

.open(path) ⇒ Object

Loads (unmarshals) a model from the contents of the file at the specified path.



150
151
152
153
154
155
156
157
158
# File 'lib/raingrams/model.rb', line 150

#
# Loads a model from the Marshal data stored in the file at +path+.
#
# The file is opened in binary mode ('rb'): Marshal output is a byte
# stream, and reading it in text mode can corrupt it on platforms that
# translate line endings.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects; only use
# this with files that come from a trusted source.
#
# Returns the object produced by Marshal.load.
#
def self.open(path)
  File.open(path,'rb') { |file| Marshal.load(file) }
end

.train_with_file(path, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the contents of the specified path.



130
131
132
133
134
# File 'lib/raingrams/model.rb', line 130

#
# Builds a new model from +options+ and, while it is being built, trains it
# with the contents of the file at +path+. Returns the result of +build+.
#
def self.train_with_file(path,options={})
  build(options) { |model| model.train_with_file(path) }
end

.train_with_paragraph(paragraph, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the specified paragraph.



110
111
112
113
114
# File 'lib/raingrams/model.rb', line 110

#
# Builds a new model from +options+ and, while it is being built, trains it
# with +paragraph+. Returns the result of +build+.
#
def self.train_with_paragraph(paragraph,options={})
  build(options) { |model| model.train_with_paragraph(paragraph) }
end

.train_with_text(text, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the specified text.



120
121
122
123
124
# File 'lib/raingrams/model.rb', line 120

#
# Builds a new model from +options+ and, while it is being built, trains it
# with +text+. Returns the result of +build+.
#
def self.train_with_text(text,options={})
  build(options) { |model| model.train_with_text(text) }
end

.train_with_url(url, options = {}) ⇒ Object

Creates a new model object with the given options and trains it with the inner text of the paragraph tags at the specified url.



140
141
142
143
144
# File 'lib/raingrams/model.rb', line 140

#
# Builds a new model from +options+ and, while it is being built, trains it
# with the paragraphs found at +url+. Returns the result of +build+.
#
def self.train_with_url(url,options={})
  build(options) { |model| model.train_with_url(url) }
end

Instance Method Details

#build(&block) ⇒ Object

Clears and rebuilds the model.



549
550
551
552
553
554
555
# File 'lib/raingrams/model.rb', line 549

#
# Rebuilds the model from scratch. Inside #refresh, the model is first
# cleared and then handed to the given block (if any) so it can be
# re-trained. Returns the result of #refresh.
#
def build(&block)
  refresh do
    clear

    yield self if block_given?
  end
end

#clearObject

Clears the model of any training data.



560
561
562
563
# File 'lib/raingrams/model.rb', line 560

#
# Empties the prefix Hash, discarding all training data.
# Returns the model itself.
#
def clear
  tap { @prefixes.clear }
end

#each_ngram(&block) ⇒ Object

Iterates over the ngrams that compose the model, passing each one to the given block.



243
244
245
246
247
248
249
250
251
# File 'lib/raingrams/model.rb', line 243

#
# Calls the given block once per ngram of the model. Each ngram is formed by
# appending a gram from a prefix's table to that prefix. Without a block
# nothing is yielded. Returns the model itself.
#
def each_ngram(&block)
  @prefixes.each_pair do |prefix, table|
    table.each_gram { |last_gram| block.call(prefix + last_gram) if block }
  end

  self
end

#gramsObject

Returns all grams within the model.



433
434
435
436
437
# File 'lib/raingrams/model.rb', line 433

#
# Returns a Set of every gram within the model.
#
# Grams are gathered from the prefixes and from the grams recorded in each
# prefix's table. Looking only at the prefixes would miss any gram that
# only ever occurs as the last gram of an ngram, and would miss every gram
# of a model whose prefixes are empty (ngram size 1).
#
def grams
  @prefixes.each_with_object(Set.new) do |(prefix, table), all_grams|
    all_grams.merge(prefix)
    all_grams.merge(table.grams)
  end
end

#grams_following(gram) ⇒ Object

Returns all grams which occur directly after the specified gram.



465
466
467
468
469
470
471
472
473
# File 'lib/raingrams/model.rb', line 465

#
# Returns a Set of the grams that occur directly after +gram+, i.e. the
# second gram of every ngram that starts with +gram+.
#
def grams_following(gram)
  gram_set = Set.new

  # the lookup method is #ngrams_starting_with (plural); the singular
  # name is not defined and raised NoMethodError
  ngrams_starting_with(gram).each do |ngram|
    gram_set << ngram[1]
  end

  return gram_set
end

#grams_preceeding(gram) ⇒ Object

Returns all grams which precede the specified gram.



452
453
454
455
456
457
458
459
460
# File 'lib/raingrams/model.rb', line 452

#
# Returns a Set of the grams that come directly before +gram+, i.e. the
# second-to-last gram of every ngram that ends with +gram+.
#
def grams_preceeding(gram)
  Set.new.tap do |preceding|
    ngrams_ending_with(gram).each { |ngram| preceding << ngram[-2] }
  end
end

#has_gram?(gram) ⇒ Boolean

Returns true if the model contains the specified gram, returns false otherwise.

Returns:

  • (Boolean)


443
444
445
446
447
# File 'lib/raingrams/model.rb', line 443

#
# Returns true if +gram+ occurs anywhere in the model, false otherwise.
#
# Both the prefixes and the grams recorded in each prefix's table are
# checked. Checking prefixes alone would miss a gram that only occurs as
# the last gram of an ngram, and every gram of a model with empty prefixes
# (ngram size 1).
#
def has_gram?(gram)
  @prefixes.any? do |prefix,table|
    prefix.include?(gram) || table.has_gram?(gram)
  end
end

#has_ngram?(ngram) ⇒ Boolean

Returns true if the model contains the specified ngram, returns false otherwise.

Returns:

  • (Boolean)


231
232
233
234
235
236
237
# File 'lib/raingrams/model.rb', line 231

#
# Returns true when the model holds +ngram+: its prefix must be a known
# prefix and that prefix's table must contain the ngram's last gram.
# Returns false when the prefix is unknown.
#
def has_ngram?(ngram)
  prefix = ngram.prefix
  return false unless @prefixes.key?(prefix)

  @prefixes[prefix].has_gram?(ngram.last)
end

#ngramsObject

Returns the ngrams that compose the model.



215
216
217
218
219
220
221
222
223
224
225
# File 'lib/raingrams/model.rb', line 215

#
# Returns an NgramSet holding every ngram of the model. Each ngram is a
# prefix extended by one of the grams from that prefix's table.
#
def ngrams
  NgramSet.new.tap do |all_ngrams|
    @prefixes.each do |prefix, table|
      table.each_gram { |last_gram| all_ngrams << (prefix + last_gram) }
    end
  end
end

#ngrams_ending_with(gram) ⇒ Object

Returns the ngrams which end with the specified gram.



318
319
320
321
322
323
324
325
326
327
328
# File 'lib/raingrams/model.rb', line 318

#
# Returns an NgramSet of the ngrams whose last gram is +gram+: one ngram
# for every prefix whose table contains +gram+.
#
def ngrams_ending_with(gram)
  NgramSet.new.tap do |matches|
    @prefixes.each do |prefix, table|
      matches << (prefix + gram) if table.has_gram?(gram)
    end
  end
end

#ngrams_following(gram) ⇒ Object

Returns all ngrams which occur directly after the specified gram.



418
419
420
421
422
423
424
425
426
427
428
# File 'lib/raingrams/model.rb', line 418

#
# Returns an NgramSet of the ngrams that follow +gram+: for every ngram
# starting with +gram+, all ngrams prefixed by that ngram's postfix.
#
def ngrams_following(gram)
  NgramSet.new.tap do |followers|
    ngrams_starting_with(gram).each do |leading|
      ngrams_prefixed_by(leading.postfix).each { |ngram| followers << ngram }
    end
  end
end

#ngrams_from_fragment(fragment) ⇒ Object

Returns the ngrams extracted from the specified fragment of text.



378
379
380
# File 'lib/raingrams/model.rb', line 378

#
# Parses +fragment+ into words with #parse_sentence and returns the ngrams
# that #ngrams_from_words extracts from them.
#
def ngrams_from_fragment(fragment)
  words = parse_sentence(fragment)
  ngrams_from_words(words)
end

#ngrams_from_sentence(sentence) ⇒ Object

Returns the ngrams extracted from the specified sentence.



385
386
387
# File 'lib/raingrams/model.rb', line 385

#
# Parses +sentence+ into words with #parse_sentence, passes them through
# #wrap_sentence, and returns the ngrams that #ngrams_from_words extracts
# from the wrapped words.
#
def ngrams_from_sentence(sentence)
  words = parse_sentence(sentence)
  wrapped_words = wrap_sentence(words)

  ngrams_from_words(wrapped_words)
end

#ngrams_from_text(text) ⇒ Object Also known as: ngrams_from_paragraph

Returns the ngrams extracted from the specified text.



392
393
394
395
396
# File 'lib/raingrams/model.rb', line 392

#
# Splits +text+ into sentences with #parse_text and returns one flat Array
# holding the ngrams of every sentence, in sentence order.
#
def ngrams_from_text(text)
  parse_text(text).flat_map { |sentence| ngrams_from_sentence(sentence) }
end

#ngrams_from_words(words) ⇒ Object

Returns the ngrams extracted from the specified words.



369
370
371
372
373
# File 'lib/raingrams/model.rb', line 369

#
# Returns an Array with one Ngram for each run of @ngram_size consecutive
# +words+, in order. Fewer words than the ngram size give an empty Array.
#
def ngrams_from_words(words)
  # number of full windows that fit; never negative
  window_count = [words.length - @ngram_size + 1, 0].max

  Array.new(window_count) { |offset| Ngram.new(words[offset, @ngram_size]) }
end

#ngrams_including_all(*grams) ⇒ Object

Returns the ngrams including all of the specified grams.



356
357
358
359
360
361
362
363
364
# File 'lib/raingrams/model.rb', line 356

#
# Returns an NgramSet of the model's ngrams for which
# <tt>includes_all?(*grams)</tt> is true.
#
def ngrams_including_all(*grams)
  NgramSet.new.tap do |matches|
    each_ngram { |ngram| matches << ngram if ngram.includes_all?(*grams) }
  end
end

#ngrams_including_any(*grams) ⇒ Object

Returns the ngrams including any of the specified grams.



333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'lib/raingrams/model.rb', line 333

#
# Returns an NgramSet of the model's ngrams that include at least one of
# +grams+, either within the prefix or as the last gram.
#
def ngrams_including_any(*grams)
  NgramSet.new.tap do |matches|
    @prefixes.each do |prefix, table|
      # a matching prefix qualifies every ngram built on top of it
      prefix_matches = prefix.includes_any?(*grams)

      table.each_gram do |last_gram|
        if prefix_matches || grams.include?(last_gram)
          matches << (prefix + last_gram)
        end
      end
    end
  end
end

#ngrams_postfixed_by(postfix) ⇒ Object

Returns the ngrams postfixed by the specified postfix.



284
285
286
287
288
289
290
291
292
293
294
295
296
# File 'lib/raingrams/model.rb', line 284

#
# Returns an NgramSet of the model's ngrams that end with +postfix+: the
# tail of the prefix must equal all but the last gram of +postfix+, and the
# prefix's table must contain the last gram of +postfix+.
#
def ngrams_postfixed_by(postfix)
  NgramSet.new.tap do |matches|
    @prefixes.each do |prefix, table|
      next unless prefix[1..-1] == postfix[0..-2]
      next unless table.has_gram?(postfix.last)

      matches << (prefix + postfix.last)
    end
  end
end

#ngrams_preceeding(gram) ⇒ Object

Returns all ngrams which precede the specified gram.



403
404
405
406
407
408
409
410
411
412
413
# File 'lib/raingrams/model.rb', line 403

#
# Returns an NgramSet of the ngrams that come before +gram+: for every
# ngram ending with +gram+, all ngrams postfixed by that ngram's prefix.
#
def ngrams_preceeding(gram)
  NgramSet.new.tap do |predecessors|
    ngrams_ending_with(gram).each do |trailing|
      ngrams_postfixed_by(trailing.prefix).each { |ngram| predecessors << ngram }
    end
  end
end

#ngrams_prefixed_by(prefix) ⇒ Object

Returns the ngrams prefixed by the specified prefix.



269
270
271
272
273
274
275
276
277
278
279
# File 'lib/raingrams/model.rb', line 269

#
# Returns an NgramSet of the model's ngrams that begin with +prefix+, built
# by extending +prefix+ with each gram of its table. An unknown prefix
# gives an empty NgramSet.
#
def ngrams_prefixed_by(prefix)
  empty_set = NgramSet.new
  return empty_set unless @prefixes.key?(prefix)

  completions = @prefixes[prefix].grams.map { |last_gram| prefix + last_gram }
  empty_set + completions
end

#ngrams_starting_with(gram) ⇒ Object

Returns the ngrams starting with the specified gram.



301
302
303
304
305
306
307
308
309
310
311
312
313
# File 'lib/raingrams/model.rb', line 301

#
# Returns an NgramSet of the model's ngrams whose first gram is +gram+:
# every ngram built on a prefix that starts with +gram+.
#
def ngrams_starting_with(gram)
  NgramSet.new.tap do |matches|
    @prefixes.each do |prefix, table|
      next unless prefix.first == gram

      table.each_gram { |last_gram| matches << (prefix + last_gram) }
    end
  end
end

#ngrams_with(&block) ⇒ Object

Selects the ngrams that match the given block.



256
257
258
259
260
261
262
263
264
# File 'lib/raingrams/model.rb', line 256

#
# Returns an NgramSet of the model's ngrams for which the given block
# returns a true value.
#
def ngrams_with(&block)
  NgramSet.new.tap do |chosen|
    each_ngram { |ngram| chosen << ngram if block.call(ngram) }
  end
end

#parse_sentence(sentence) ⇒ Object

Parses the specified sentence and returns an Array of tokens.



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# File 'lib/raingrams/model.rb', line 163

#
# Parses +sentence+ and returns an Array of its tokens.
#
# Depending on the model's flags, trailing punctuation is stripped, the
# text is downcased, and URLs, phone numbers and bracketed numeric
# references are replaced by a single space before the text is split into
# tokens.
#
# The argument is never modified: the in-place edits run on a private copy.
# String#to_s returns the receiver itself, so without the copy the caller's
# String would be altered, and frozen input (including the String returned
# by nil.to_s on recent Rubies) would raise.
#
def parse_sentence(sentence)
  sentence = sentence.to_s.dup

  if @ignore_punctuation
    # eat tailing punctuation
    sentence.gsub!(/[\.\?!]*$/,'')
  end

  if @ignore_case
    # downcase the sentence
    sentence.downcase!
  end

  if @ignore_urls
    # remove URLs
    sentence.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
  end

  if @ignore_phone_numbers
    # remove phone numbers
    sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
  end

  if @ignore_references
    # remove RFC style references
    sentence.gsub!(/\s*[\(\{\[]\d+[\)\}\]]\s*/,' ')
  end

  if @ignore_punctuation
    # split and ignore punctuation characters
    return sentence.scan(/\w+[\-_\.:']\w+|\w+/)
  else
    # split and accept punctuation characters
    return sentence.scan(/[\w\-_,:;\.\?\!'"\\\/]+/)
  end
end

#parse_text(text) ⇒ Object

Parses the specified text and returns an Array of sentences.



202
203
204
205
206
207
208
209
210
# File 'lib/raingrams/model.rb', line 202

#
# Parses +text+ and returns an Array of its sentences. A sentence runs from
# a character that is neither whitespace nor terminal punctuation up to and
# including the next '.', '?' or '!'.
#
# When the model ignores URLs they are replaced by a single space first.
# That edit runs on a private copy: String#to_s returns the receiver itself,
# so without the copy the caller's String would be altered and frozen input
# would raise.
#
def parse_text(text)
  text = text.to_s.dup

  if @ignore_urls
    text.gsub!(/\s*\w+:\/\/[\w\/\+_\-,:%\d\.\-\?&=]*\s*/,' ')
  end

  return text.scan(/[^\s\.\?!][^\.\?!]*[\.\?\!]/)
end

#refresh(&block) ⇒ Object

Refreshes the probability tables of the model.



539
540
541
542
543
544
# File 'lib/raingrams/model.rb', line 539

#
# Refreshes the model: passes the model to the given block (if any), then
# calls +build+ on the table of every prefix. Returns the model itself.
#
def refresh(&block)
  yield self if block_given?

  @prefixes.each_value(&:build)
  self
end

#save(path) ⇒ Object

Saves the model to the file at the specified path.



568
569
570
571
572
573
574
# File 'lib/raingrams/model.rb', line 568

#
# Saves the model to the file at +path+ using Marshal.
#
# The file is opened in binary mode ('wb'): Marshal output is a byte
# stream, and writing it in text mode can corrupt it on platforms that
# translate line endings.
#
# Returns the model itself.
#
def save(path)
  File.open(path,'wb') do |file|
    Marshal.dump(self,file)
  end

  return self
end

#set_ngram_frequency(ngram, value) ⇒ Object

Sets the frequency of the specified ngram to the specified value.



478
479
480
# File 'lib/raingrams/model.rb', line 478

#
# Sets the count of +ngram+ to +value+ by calling +set_count+, with the
# ngram's last gram, on the table that #probability_table returns for it.
#
def set_ngram_frequency(ngram,value)
  table = probability_table(ngram)
  table.set_count(ngram.last,value)
end

#to_hashObject

Returns a Hash representation of the model.



579
580
581
# File 'lib/raingrams/model.rb', line 579

# Returns the Hash representation of the model. This is the internal
# @prefixes Hash itself, not a copy, so changes made to the returned Hash
# affect the model.
def to_hash
  @prefixes
end

#train_with_file(path) ⇒ Object

Train the model with the contents of the specified path.



520
521
522
# File 'lib/raingrams/model.rb', line 520

#
# Reads the whole file at +path+ and trains the model with its contents via
# #train_with_text.
#
def train_with_file(path)
  contents = File.read(path)
  train_with_text(contents)
end

#train_with_ngram(ngram) ⇒ Object

Train the model with the specified ngram.



485
486
487
# File 'lib/raingrams/model.rb', line 485

#
# Trains the model with +ngram+ by calling +count+, with the ngram's last
# gram, on the table that #probability_table returns for it.
#
def train_with_ngram(ngram)
  table = probability_table(ngram)
  table.count(ngram.last)
end

#train_with_ngrams(ngrams) ⇒ Object

Train the model with the specified ngrams.



492
493
494
# File 'lib/raingrams/model.rb', line 492

#
# Trains the model with each of +ngrams+ in turn via #train_with_ngram.
# Returns the result of calling +each+ on +ngrams+.
#
def train_with_ngrams(ngrams)
  ngrams.each(&method(:train_with_ngram))
end

#train_with_paragraph(paragraph) ⇒ Object

Train the model with the specified paragraph.



506
507
508
# File 'lib/raingrams/model.rb', line 506

#
# Extracts the ngrams of +paragraph+ with #ngrams_from_paragraph and trains
# the model with them via #train_with_ngrams.
#
def train_with_paragraph(paragraph)
  paragraph_ngrams = ngrams_from_paragraph(paragraph)
  train_with_ngrams(paragraph_ngrams)
end

#train_with_sentence(sentence) ⇒ Object

Train the model with the specified sentence.



499
500
501
# File 'lib/raingrams/model.rb', line 499

#
# Extracts the ngrams of +sentence+ with #ngrams_from_sentence and trains
# the model with them via #train_with_ngrams.
#
def train_with_sentence(sentence)
  sentence_ngrams = ngrams_from_sentence(sentence)
  train_with_ngrams(sentence_ngrams)
end

#train_with_text(text) ⇒ Object

Train the model with the specified text.



513
514
515
# File 'lib/raingrams/model.rb', line 513

#
# Extracts the ngrams of +text+ with #ngrams_from_text and trains the model
# with them via #train_with_ngrams.
#
def train_with_text(text)
  text_ngrams = ngrams_from_text(text)
  train_with_ngrams(text_ngrams)
end

#train_with_url(url) ⇒ Object

Train the model with the inner text of the paragraph tags at the specified url.



528
529
530
531
532
533
534
# File 'lib/raingrams/model.rb', line 528

#
# Trains the model with the inner text of every <p> element of the HTML
# document fetched from +url+.
#
# The document is fetched with URI.open from the open-uri standard library.
# A bare open(url) only understood URLs through open-uri's patch of
# Kernel#open, which Ruby 3.0 removed. The block form closes the downloaded
# stream as soon as Nokogiri has parsed it.
#
# Returns the results of #train_with_paragraph, one per paragraph.
#
def train_with_url(url)
  require 'open-uri' # no-op when the library is already loaded

  doc = URI.open(url) { |io| Nokogiri::HTML(io) }

  return doc.search('p').map do |p|
    train_with_paragraph(p.inner_text)
  end
end