Class: Ebooks::SuffixGenerator

Inherits:
Object
Defined in:
lib/twitter_ebooks/suffix.rb

Overview

This generator uses the same data as the markov model, but instead of building a chain by looking up bigrams, it uses the recorded positions to randomly replace the suffix of one sentence with a matching suffix from another.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentences) ⇒ SuffixGenerator

Returns a new instance of SuffixGenerator.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/twitter_ebooks/suffix.rb', line 13

# Builds positional unigram and bigram indexes over the corpus.
#
# @unigrams maps token => list of [sentence_index, position] sites that
# follow an occurrence of that token; @bigrams maps token => next_token =>
# the same kind of site list. INTERIM marks both sentence starts (as the
# pseudo-predecessor of the first token) and sentence endings (as a
# position value).
#
# @param sentences [Array<Array>] tokenized sentences; any with fewer
#   than two tokens are discarded
def initialize(sentences)
  @sentences = sentences.select { |s| s.length >= 2 }
  @unigrams = {}
  @bigrams = {}

  @sentences.each_with_index do |tokens, sent_idx|
    prev = INTERIM
    final = tokens.length - 1

    tokens.each_with_index do |token, pos|
      (@unigrams[prev] ||= []) << [sent_idx, pos]

      pair_sites = ((@bigrams[prev] ||= {})[token] ||= [])

      if pos == final
        # Sentence-final token: record INTERIM so endings are identifiable
        (@unigrams[token] ||= []) << [sent_idx, INTERIM]
        pair_sites << [sent_idx, INTERIM]
      else
        pair_sites << [sent_idx, pos + 1]
      end

      prev = token
    end
  end

  self
end

Class Method Details

.build(sentences) ⇒ Object



9
10
11
# File 'lib/twitter_ebooks/suffix.rb', line 9

# Factory method: constructs a SuffixGenerator over a tokenized corpus.
#
# @param sentences [Array<Array>] tokenized sentences
# @return [SuffixGenerator]
def self.build(sentences)
  SuffixGenerator.new(sentences)
end

Instance Method Details

#generate(passes = 5, n = :unigrams) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/twitter_ebooks/suffix.rb', line 42

# Generates a sentence by picking a random seed sentence and repeatedly
# splicing on matching suffixes drawn from other sentences.
#
# @param passes [Integer] number of suffix-replacement passes to attempt
# @param n [Symbol] :unigrams to match on the following token alone,
#   :bigrams to require the full (token, next_token) pair to match
# @return [Array] the resulting token sequence (empty for an empty corpus)
def generate(passes=5, n=:unigrams)
  # Guard: rand(0) returns a Float and @sentences[float] would misbehave.
  return [] if @sentences.empty?

  index = rand(@sentences.length)
  tokens = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tokens] # Verbatim sentences to avoid reproducing

  0.upto(passes-1) do
    log NLP.reconstruct(tokens) if $debug
    varsites = {} # Map bigram start site => next token alternatives

    tokens.each_with_index do |token, i|
      next_token = tokens[i+1]
      break if next_token.nil?

      alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
      # Filter out sentence endings and sentences we've already drawn from.
      # NOTE: must be a non-destructive reject -- reject! would permanently
      # delete entries from the arrays shared with @unigrams/@bigrams,
      # corrupting the model for every subsequent generate call.
      alternatives = alternatives.reject { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |start, alternatives|
      # Try each alternative once, in random order. (A previous version
      # clobbered the shuffled element with `site[1].sample`, so the same
      # alternative could be retried while others were never considered.)
      alternatives.shuffle.each do |alt|
        verbatim << @sentences[alt[0]]
        suffix = @sentences[alt[0]][alt[1]..-1]
        potential = tokens[0..start+1] + suffix

        # Ensure we're not just rebuilding some segment of another sentence
        unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
          used << alt[0]
          variant = potential
          break
        end
      end

      break if variant
    end

    tokens = variant if variant
  end

  tokens
end