Class: Ebooks::Model
- Inherits: Object
- Defined in: lib/twitter_ebooks/model.rb
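A minimal end-to-end sketch of typical usage (the file names here are illustrative, not part of the API):

  require 'twitter_ebooks'

  # Build a model from a plaintext corpus, persist it, then generate text.
  model = Ebooks::Model.consume("corpus.txt")
  model.save("corpus.model")

  model = Ebooks::Model.load("corpus.model")
  puts model.make_statement(140)          # standalone statement within 140 chars
  puts model.make_response("hello bot")   # reply seeded by the input text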
Instance Attribute Summary
- #hash ⇒ Object
  Returns the value of attribute hash.
- #keywords ⇒ Object
  Returns the value of attribute keywords.
- #mentions ⇒ Object
  Returns the value of attribute mentions.
- #sentences ⇒ Object
  Returns the value of attribute sentences.
Class Method Summary
- .consume(txtpath) ⇒ Object
- .load(path) ⇒ Object
Instance Method Summary
- #consume(path) ⇒ Object
- #find_relevant(sentences, input) ⇒ Object
  Finds all tokenized sentences relevant to the given input by comparing non-stopword token overlaps.
- #fix(tweet) ⇒ Object
- #make_response(input, limit = 140, sentences = @mentions) ⇒ Object
  Generates a response by looking for related sentences in the corpus and building a smaller generator from them.
- #make_statement(limit = 140, generator = nil, retry_limit = 10) ⇒ Object
- #save(path) ⇒ Object
- #valid_tweet?(tokens, limit) ⇒ Boolean
- #verbatim?(tokens) ⇒ Boolean
  Tests whether a sentence has been copied verbatim from the original corpus.
Instance Attribute Details
#hash ⇒ Object
Returns the value of attribute hash.
# File 'lib/twitter_ebooks/model.rb', line 11
def hash
  @hash
end
#keywords ⇒ Object
Returns the value of attribute keywords.
# File 'lib/twitter_ebooks/model.rb', line 11
def keywords
  @keywords
end
#mentions ⇒ Object
Returns the value of attribute mentions.
# File 'lib/twitter_ebooks/model.rb', line 11
def mentions
  @mentions
end
#sentences ⇒ Object
Returns the value of attribute sentences.
# File 'lib/twitter_ebooks/model.rb', line 11
def sentences
  @sentences
end
Class Method Details
.consume(txtpath) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 13
def self.consume(txtpath)
  Model.new.consume(txtpath)
end
.load(path) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 17
def self.load(path)
  Marshal.load(File.open(path, 'rb') { |f| f.read })
end
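Loading is a straight Marshal deserialization, so it only makes sense for files written by #save (and by a compatible gem version). A short sketch, assuming "corpus.model" was produced earlier by #save:

  model = Ebooks::Model.load("corpus.model")
  puts model.make_statement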
Instance Method Details
#consume(path) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 21
def consume(path)
  content = File.read(path, :encoding => 'utf-8')
  @hash = Digest::MD5.hexdigest(content)

  if path.split('.')[-1] == "json"
    log "Reading json corpus from #{path}"
    lines = JSON.parse(content, symbolize_names: true).map do |tweet|
      tweet[:text]
    end
  elsif path.split('.')[-1] == "csv"
    log "Reading CSV corpus from #{path}"
    content = CSV.parse(content)
    header = content.shift
    text_col = header.index('text')
    lines = content.map do |tweet|
      tweet[text_col]
    end
  else
    log "Reading plaintext corpus from #{path}"
    lines = content.split("\n")
  end

  log "Removing commented lines and sorting mentions"

  keeping = []
  mentions = []
  lines.each do |l|
    next if l.start_with?('#') # Remove commented lines
    next if l.include?('RT') || l.include?('MT') # Remove soft retweets

    if l.include?('@')
      mentions << l
    else
      keeping << l
    end
  end

  text = NLP.normalize(keeping.join("\n")) # Normalize weird characters
  mention_text = NLP.normalize(mentions.join("\n"))

  log "Segmenting text into sentences"

  statements = NLP.sentences(text)
  mentions = NLP.sentences(mention_text)

  log "Tokenizing #{statements.length} statements and #{mentions.length} mentions"

  @sentences = []
  @mentions = []

  statements.each do |s|
    @sentences << NLP.tokenize(s).reject do |t|
      t.include?('@') || t.include?('http')
    end
  end

  mentions.each do |s|
    @mentions << NLP.tokenize(s).reject do |t|
      t.include?('@') || t.include?('http')
    end
  end

  log "Ranking keywords"
  @keywords = NLP.keywords(@sentences)

  self
end
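The corpus format is chosen purely by file extension: .json is treated as a tweet archive whose entries have a text field, .csv must contain a text column, and anything else is read as plaintext with one status per line. Commented lines (starting with #) and soft retweets (containing RT or MT) are dropped, lines containing @ feed the mentions corpus, and everything else feeds the sentences corpus. A small sketch (file names are illustrative):

  model = Ebooks::Model.new

  # consume returns self, so it chains with #save.
  model.consume("tweets.json").save("tweets.model")

  # Plaintext works the same way, dispatched on the extension.
  model.consume("corpus.txt")
  model.sentences.length  # tokenized non-mention sentences
  model.mentions.length   # tokenized mention sentences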
#find_relevant(sentences, input) ⇒ Object
Finds all tokenized sentences relevant to the given input by comparing non-stopword token overlaps.
# File 'lib/twitter_ebooks/model.rb', line 153
def find_relevant(sentences, input)
  relevant = []
  slightly_relevant = []

  tokenized = NLP.tokenize(input).map(&:downcase)

  sentences.each do |sent|
    tokenized.each do |token|
      if sent.map(&:downcase).include?(token)
        relevant << sent unless NLP.stopword?(token)
        slightly_relevant << sent
      end
    end
  end

  [relevant, slightly_relevant]
end
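The two returned lists differ only in stopword filtering: any shared token puts a sentence into the second (slightly relevant) list, while a shared non-stopword token also puts it into the first. A small sketch, assuming the model has already consumed a corpus:

  relevant, slightly_relevant = model.find_relevant(model.mentions, "coffee in the morning")

  # Every sentence added to relevant was also added to slightly_relevant,
  # so the first list can never be the longer one.
  relevant.length <= slightly_relevant.length  # => true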
#fix(tweet) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 94
def fix(tweet)
  # This seems to require an external api call
  #begin
  #  fixer = NLP.gingerice.parse(tweet)
  #  log fixer if fixer['corrections']
  #  tweet = fixer['result']
  #rescue Exception => e
  #  log e.message
  #  log e.backtrace
  #end

  NLP.htmlentities.decode tweet
end
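With the grammar-correction call left commented out, #fix only decodes HTML entities that the Twitter API leaves in tweet text. A quick sketch:

  model.fix("ham &amp; eggs &gt; toast")  # => "ham & eggs > toast"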
#make_response(input, limit = 140, sentences = @mentions) ⇒ Object
Generates a response by looking for related sentences in the corpus and building a smaller generator from them.
# File 'lib/twitter_ebooks/model.rb', line 173
def make_response(input, limit=140, sentences=@mentions)
  # Prefer mentions
  relevant, slightly_relevant = find_relevant(sentences, input)

  if relevant.length >= 3
    generator = SuffixGenerator.build(relevant)
    make_statement(limit, generator)
  elsif slightly_relevant.length >= 5
    generator = SuffixGenerator.build(slightly_relevant)
    make_statement(limit, generator)
  elsif sentences.equal?(@mentions)
    make_response(input, limit, @sentences)
  else
    make_statement(limit)
  end
end
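The cascade works as follows: with at least 3 relevant sentences a generator is built from them; failing that, at least 5 slightly relevant sentences are used; failing that, the search is retried against the whole sentence corpus instead of mentions; and as a last resort an unseeded statement is produced. A short sketch:

  # Reply seeded by the input; drawn from mention sentences when enough of
  # them share tokens with it, otherwise from the wider corpus.
  reply = model.make_response("what do you think about pizza?", 140)

  # A larger limit only loosens the length check in valid_tweet?.
  long_reply = model.make_response("tell me a story", 280)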
#make_statement(limit = 140, generator = nil, retry_limit = 10) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 113
def make_statement(limit=140, generator=nil, retry_limit=10)
  responding = !generator.nil?
  generator ||= SuffixGenerator.build(@sentences)

  retries = 0
  tweet = ""

  while (tokens = generator.generate(3, :bigrams)) do
    next if tokens.length <= 3 && !responding
    break if valid_tweet?(tokens, limit)

    retries += 1
    break if retries >= retry_limit
  end

  if verbatim?(tokens) && tokens.length > 3 # We made a verbatim tweet by accident
    while (tokens = generator.generate(3, :unigrams)) do
      break if valid_tweet?(tokens, limit) && !verbatim?(tokens)

      retries += 1
      break if retries >= retry_limit
    end
  end

  tweet = NLP.reconstruct(tokens)

  if retries >= retry_limit
    log "Unable to produce valid non-verbatim tweet; using \"#{tweet}\""
  end

  fix tweet
end
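Statements are drawn from a SuffixGenerator over the tokenized corpus: bigram-based candidates are generated until one passes valid_tweet?, falling back to unigram generation if the result is an accidental verbatim copy, with retry_limit bounding the attempts. A sketch:

  # Unseeded statement from the whole corpus, capped at 140 characters.
  puts model.make_statement

  # A generator can be prebuilt and reused; note that passing one marks the
  # call as a response, which also permits very short outputs.
  generator = Ebooks::SuffixGenerator.build(model.sentences)
  3.times { puts model.make_statement(140, generator) }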
#save(path) ⇒ Object
# File 'lib/twitter_ebooks/model.rb', line 87
def save(path)
  File.open(path, 'wb') do |f|
    f.write(Marshal.dump(self))
  end
  self
end
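Saving writes a Marshal dump of the whole model and returns self, so it chains directly onto #consume. A sketch (file names are illustrative):

  Ebooks::Model.consume("corpus.txt").save("corpus.model")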
#valid_tweet?(tokens, limit) ⇒ Boolean
# File 'lib/twitter_ebooks/model.rb', line 108
def valid_tweet?(tokens, limit)
  tweet = NLP.reconstruct(tokens)
  tweet.length <= limit && !NLP.unmatched_enclosers?(tweet)
end
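The check is purely superficial: the reconstructed text must fit within the limit and contain no unmatched quotes or brackets. A sketch, assuming NLP.tokenize and NLP.reconstruct round-trip plain text cleanly:

  tokens = Ebooks::NLP.tokenize("A short, well-formed sentence.")
  model.valid_tweet?(tokens, 140)  # => true
  model.valid_tweet?(tokens, 10)   # => false: too long once reconstructed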
#verbatim?(tokens) ⇒ Boolean
Tests whether a sentence has been copied verbatim from the original corpus.
# File 'lib/twitter_ebooks/model.rb', line 147
def verbatim?(tokens)
  @sentences.include?(tokens) || @mentions.include?(tokens)
end