Class: Ebooks::SuffixGenerator

Inherits:
Object
Defined in:
lib/twitter_ebooks/suffix.rb

Overview

This generator uses the same data as the markov model, but instead of building a chain by looking up bigrams, it uses the recorded positions to randomly replace the suffix of one sentence with a matching suffix from another.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(sentences) ⇒ SuffixGenerator

Returns a new instance of SuffixGenerator.



13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/twitter_ebooks/suffix.rb', line 13

# Builds positional unigram and bigram indexes over the corpus.
#
# @unigrams maps token => list of [sentence_index, position] sites that
# follow an occurrence of that token; @bigrams maps token => next_token =>
# the same kind of site list. INTERIM marks both sentence starts (as the
# pseudo-predecessor of the first token) and sentence endings (as a
# position value).
#
# @param sentences [Array<Array>] tokenized sentences; any with fewer
#   than two tokens are discarded
def initialize(sentences)
  @sentences = sentences.select { |s| s.length >= 2 }
  @unigrams = {}
  @bigrams = {}

  @sentences.each_with_index do |tokens, sent_idx|
    prev = INTERIM
    final = tokens.length - 1

    tokens.each_with_index do |token, pos|
      (@unigrams[prev] ||= []) << [sent_idx, pos]

      pair_sites = ((@bigrams[prev] ||= {})[token] ||= [])

      if pos == final
        # Sentence-final token: record INTERIM so endings are identifiable
        (@unigrams[token] ||= []) << [sent_idx, INTERIM]
        pair_sites << [sent_idx, INTERIM]
      else
        pair_sites << [sent_idx, pos + 1]
      end

      prev = token
    end
  end

  self
end

Class Method Details

.build(sentences) ⇒ Object



9
10
11
# File 'lib/twitter_ebooks/suffix.rb', line 9

# Factory method: constructs a SuffixGenerator over a tokenized corpus.
#
# @param sentences [Array<Array>] tokenized sentences
# @return [SuffixGenerator]
def self.build(sentences)
  SuffixGenerator.new(sentences)
end

Instance Method Details

#generate(passes = 5, n = :unigrams) ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/twitter_ebooks/suffix.rb', line 42

# Generates a sentence by picking a random seed sentence and repeatedly
# splicing on matching suffixes drawn from other sentences.
#
# @param passes [Integer] number of suffix-replacement passes to attempt
# @param n [Symbol] :unigrams to match on the following token alone,
#   :bigrams to require the full (token, next_token) pair to match
# @return [Array] the resulting token sequence (empty for an empty corpus)
def generate(passes=5, n=:unigrams)
  # Guard: rand(0) returns a Float and @sentences[float] would misbehave.
  return [] if @sentences.empty?

  index = rand(@sentences.length)
  tokens = @sentences[index]
  used = [index] # Sentences we've already used
  verbatim = [tokens] # Verbatim sentences to avoid reproducing

  0.upto(passes-1) do
    log NLP.reconstruct(tokens) if $debug
    varsites = {} # Map bigram start site => next token alternatives

    tokens.each_with_index do |token, i|
      next_token = tokens[i+1]
      break if next_token.nil?

      alternatives = (n == :unigrams) ? @unigrams[next_token] : @bigrams[token][next_token]
      # Filter out sentence endings and sentences we've already drawn from.
      # NOTE: must be a non-destructive reject -- reject! would permanently
      # delete entries from the arrays shared with @unigrams/@bigrams,
      # corrupting the model for every subsequent generate call.
      alternatives = alternatives.reject { |a| a[1] == INTERIM || used.include?(a[0]) }
      varsites[i] = alternatives unless alternatives.empty?
    end

    variant = nil
    varsites.to_a.shuffle.each do |start, alternatives|
      # Try each alternative once, in random order. (A previous version
      # clobbered the shuffled element with `site[1].sample`, so the same
      # alternative could be retried while others were never considered.)
      alternatives.shuffle.each do |alt|
        verbatim << @sentences[alt[0]]
        suffix = @sentences[alt[0]][alt[1]..-1]
        potential = tokens[0..start+1] + suffix

        # Ensure we're not just rebuilding some segment of another sentence
        unless verbatim.find { |v| NLP.subseq?(v, potential) || NLP.subseq?(potential, v) }
          used << alt[0]
          variant = potential
          break
        end
      end

      break if variant
    end

    tokens = variant if variant
  end

  tokens
end