Class: CorrectHorseBatteryStaple::Corpus::Base

Inherits:
CorrectHorseBatteryStaple::Corpus show all
Extended by:
Forwardable
Includes:
CorrectHorseBatteryStaple::Common, Memoize, Enumerable
Defined in:
lib/correct_horse_battery_staple/corpus/base.rb

Direct Known Subclasses

Isam, IsamKD, Redis, Serialized, Sqlite

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from Memoize

included

Methods included from CorrectHorseBatteryStaple::Common

#array_sample, #logger, #random_in_range, #random_number, #set_sample

Methods inherited from CorrectHorseBatteryStaple::Corpus

format_for

Constructor Details

#initialize(*args) ⇒ Base

Returns a new instance of Base.



17
18
19
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 17

# Builds a corpus object. Positional arguments are accepted but not used
# here; the only work done is to call #initialize_backend_variables when the
# object publicly responds to it (respond_to? does not see private methods).
def initialize(*args)
  return unless respond_to?(:initialize_backend_variables)

  initialize_backend_variables
end

Instance Attribute Details

#frequency_mean ⇒ Object

Returns the value of attribute frequency_mean.



8
9
10
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 8

# Reader for the @frequency_mean instance variable (nil until assigned).
# NOTE(review): #recalculate assigns this via `self.frequency_mean =` from
# frequencies.mean_and_standard_deviation; the writer itself is not shown
# here -- presumably an attr_accessor, confirm in base.rb.
def frequency_mean
  @frequency_mean
end

#frequency_stddev ⇒ Object

Returns the value of attribute frequency_stddev.



8
9
10
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 8

# Reader for the @frequency_stddev instance variable (nil until assigned).
# NOTE(review): #recalculate assigns this via `self.frequency_stddev =` from
# frequencies.mean_and_standard_deviation; the writer itself is not shown
# here -- presumably an attr_accessor, confirm in base.rb.
def frequency_stddev
  @frequency_stddev
end

#original_size ⇒ Object

Returns the value of attribute original_size.



10
11
12
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 10

# Reader for the @original_size instance variable (nil until assigned).
# #result copies this value onto the corpus it builds from the filtered
# entries.
def original_size
  @original_size
end

#probability_mean ⇒ Object

Returns the value of attribute probability_mean.



9
10
11
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 9

# Reader for the @probability_mean instance variable (nil until assigned).
# NOTE(review): #recalculate assigns this from the mean of
# (frequency / weighted_size) * 100 over all frequencies, i.e. a value on a
# percent scale; the writer itself is not shown here -- confirm in base.rb.
def probability_mean
  @probability_mean
end

#probability_stddev ⇒ Object

Returns the value of attribute probability_stddev.



9
10
11
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 9

# Reader for the @probability_stddev instance variable (nil until assigned).
# NOTE(review): #recalculate assigns this from the standard deviation of
# (frequency / weighted_size) * 100 over all frequencies; the writer itself
# is not shown here -- confirm in base.rb.
def probability_stddev
  @probability_stddev
end

#weighted_size ⇒ Object

Returns the value of attribute weighted_size.



11
12
13
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 11

# Reader for the @weighted_size instance variable (nil until assigned).
# NOTE(review): #recalculate assigns this to the sum of all entry
# frequencies; the writer itself is not shown here -- confirm in base.rb.
def weighted_size
  @weighted_size
end

Class Method Details

.read(dest) ⇒ Object



21
22
23
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 21

# Builds an instance of the receiving class by handing +dest+ straight to
# .new and returns it.
def self.read(dest)
  new(dest)
end

Instance Method Details

#candidates(options = {}) ⇒ Object

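Returns all the candidate entries matching the given options. Note that when no options are given (or the options yield no filter), the corpus size is returned instead of a list of entries.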



65
66
67
68
69
70
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 65

# Selects the entries accepted by the filter built from +options+
# (see #filter_for_options).
#
# Returns an Array of matching entries when a filter applies.
# NOTE(review): when +options+ is nil/empty, or yields no filter, this
# returns #size (a number) rather than a collection of entries -- the return
# type is inconsistent; confirm whether callers rely on it before changing.
def candidates(options = {})
  matcher = options && !options.empty? ? filter_for_options(options) : nil
  return size unless matcher

  entries.select { |candidate| matcher.call(candidate) }
end

#compose_filters(filters) ⇒ Object

create a single composed function of all the filters



168
169
170
171
172
173
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 168

# Folds a list of callables into one callable that is truthy only when every
# filter accepts the value. Filters are evaluated left to right and
# short-circuit on the first rejection.
#
# Returns nil when +filters+ is nil/false or empty; a single-element list
# yields that element itself.
def compose_filters(filters)
  return nil unless filters && !filters.empty?

  filters.inject do |combined, following|
    lambda { |candidate| combined.call(candidate) && following.call(candidate) }
  end
end

#count(*args, &block) ⇒ Object Also known as: length

other methods you should implement if possible:

Enumerable

size

CHBS::Corpus

pick
words
frequencies


42
43
44
45
46
47
48
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 42

# Counts entries. With no argument and no block this short-circuits to
# #size; otherwise the call is forwarded unchanged to the inherited #count.
def count(*args, &block)
  return size if args.empty? && block.nil?

  super(*args, &block)
end

#count_by_options(options = {}) ⇒ Object



51
52
53
54
55
56
57
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 51

# Counts the entries accepted by the filter built from +options+
# (see #filter_for_options).
#
# With nil or empty options the plain #count is returned. nil is accepted
# here for consistency with #candidates, #count_candidates and
# #filter_for_options, which all tolerate nil options.
#
# When the options produce no filter, #filter_for_options returns nil and
# `count(&nil)` is an ordinary block-less #count.
def count_by_options(options = {})
  return count if !options || options.empty?

  # Explicit parentheses avoid the "`&' interpreted as argument prefix"
  # parser warning that `count &...` triggers.
  count(&filter_for_options(options))
end

#count_candidates(options = {}) ⇒ Object



72
73
74
75
76
77
78
79
80
81
82
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 72

# Counts the entries accepted by the filter built from +options+ by walking
# the corpus with #each.
#
# Returns #size when +options+ is nil/empty or yields no filter.
def count_candidates(options = {})
  matcher = filter_for_options(options) if options && !options.empty?
  return size unless matcher

  matches = 0
  each { |entry| matches += 1 if matcher.call(entry) }
  matches
end

#each(&block) ⇒ Object

You MUST override this method: the Enumerable mixin is built on top of it.

Raises:

  • (NotImplementedError)


27
28
29
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 27

# Abstract iteration hook: this base implementation always raises
# NotImplementedError and never yields to +block+.
# NOTE: in Ruby, NotImplementedError descends from ScriptError rather than
# StandardError, so a bare `rescue` clause will not catch it.
def each(&block)
  raise NotImplementedError
end

#entropy_per_word(options = {}) ⇒ Object



148
149
150
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 148

# Base-2 logarithm of the number of entries selected by +options+
# (as reported by #count_by_options), i.e. bits of entropy contributed by
# one uniformly chosen word.
def entropy_per_word(options = {})
  candidate_count = count_by_options(options)
  Math.log(candidate_count) / Math.log(2)
end

#entropy_per_word_by_filter(&filter) ⇒ Object



152
153
154
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 152

# Base-2 logarithm of the number of entries accepted by the given block.
# Without a block, the whole corpus (#size) is used.
def entropy_per_word_by_filter(&filter)
  pool_size = if filter
                count(&filter)
              else
                size
              end
  Math.log(pool_size) / Math.log(2)
end

#filter(&block) ⇒ Object

filtering



158
159
160
161
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 158

# Queues +block+ on the @filters list (creating the list on first use) and
# returns self so calls can be chained.
def filter(&block)
  @filters ||= []
  @filters.push(block)
  self
end

#filter_for_options(options = {}) ⇒ Object

Return a single lambda that will return true/false given a Word object

Respects the :word_length, :percentile, and :filter options. :word_length and :percentile should be Range objects; :filter can be a single Proc/lambda or an array of them.



279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 279

# Builds a single predicate (a lambda taking one entry) from +options+.
#
# Recognised options:
#   :filter      - one callable or an array of callables taking an entry
#   :percentile  - object responding to #include?, tested against
#                  entry.percentile
#   :word_length - object responding to #include?, tested against
#                  entry.word.length
#
# Returns nil when +options+ is nil/empty or contains none of the keys
# above; otherwise the conjunction produced by #compose_filters.
def filter_for_options(options = {})
  return nil if !options || options.empty?

  # Array() hands back the caller's own array when :filter is already an
  # Array, so copy it before appending; otherwise every call would leave
  # extra predicates behind in the caller's options.
  filters = Array(options[:filter]).dup

  if options[:percentile]
    p_range = options[:percentile]
    filters << lambda { |entry| p_range.include? entry.percentile }
  end

  if options[:word_length]
    wl_range = options[:word_length]
    filters << lambda { |entry| wl_range.include? entry.word.length }
  end

  filters.empty? ? nil : compose_filters(filters)
end

#frequencies ⇒ Object



143
144
145
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 143

# Collects the #frequency of every entry and wraps the resulting list in a
# CorrectHorseBatteryStaple::StatisticalArray.
def frequencies
  raw_frequencies = entries.map(&:frequency)
  CorrectHorseBatteryStaple::StatisticalArray.new(raw_frequencies)
end

#inspect ⇒ Object



232
233
234
235
236
237
238
239
240
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 232

# Multi-line, human-readable summary: the class name, the entry count, and
# one indented "key: value" line per pair in #stats, followed by a blank
# line.
def inspect
  header = "Type: #{self.class.name}\nEntry count: #{count}\n\nStats:\n"
  stat_lines = stats.map { |name, value| "  #{name}: #{value}\n" }.join("")
  header + stat_lines + "\n"
end

#load_stats_from_hash(hash) ⇒ Object

statistics



186
187
188
189
190
191
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 186

# Copies values from +hash+ onto this object: for each key a "<key>=" writer
# is invoked when the object publicly responds to it; other keys are
# skipped. Returns +hash+ (the result of Hash#each).
def load_stats_from_hash(hash)
  hash.each do |name, value|
    writer = :"#{name}="
    next unless respond_to?(writer)

    send(writer, value)
  end
end

#pick(count, options = {}) ⇒ Object

This is the core password picker method. It is not especially efficient, but it is relatively generic: if a corpus supports Enumerable, it will work.



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 92

# Core, backend-agnostic word picker: draws random entries from the sorted
# corpus until +count+ of them pass the filters.
#
# Recognised options (all optional):
#   :percentile     - handed to StatisticalArray#index_range_for_percentile
#                     to restrict the index range that is sampled
#   :word_length    - object responding to #include?, tested against
#                     entry.word.length
#   :filter         - one callable or an array of callables taking an entry
#   :max_iterations - upper bound on the number of random draws (default 1000)
#
# Nothing removes a picked entry from the pool, so the same entry can appear
# more than once in the result.
#
# Returns an Array of +count+ entries.
# Raises ArgumentError when the sampled index range holds fewer entries than
# +count+, and RuntimeError when +count+ matches were not found within
# :max_iterations draws.
def pick(count, options = {})
  array = CorrectHorseBatteryStaple::StatisticalArray.new(sorted_entries)

  # Array() hands back the caller's own array when :filter is already an
  # Array; copy it so the word-length predicate appended below does not leak
  # into the caller's options.
  filters = Array(options[:filter]).dup

  if options[:percentile]
    range = array.index_range_for_percentile(options[:percentile])
  else
    range = 0..array.size-1
  end
  span = range_size(range)

  if span < count
    raise ArgumentError, "Percentile range contains fewer words than requested count"
  end

  if options[:word_length]
    wl = options[:word_length]
    filters << lambda {|entry| wl.include? entry.word.length }
  end

  filter = filters.empty? ? nil : compose_filters(filters)

  max_iterations = options[:max_iterations] || 1000

  result = []
  iterations = 0
  while result.length < count && iterations < max_iterations
    # presumably random_number(n) yields an integer in 0...n -- confirm in Common
    i = random_number(span)
    entry = array[i + range.first]
    if entry && (!filter || filter.call(entry))
      result << entry
    end
    iterations += 1
  end

  raise "Cannot find #{count} words matching criteria" if result.length < count
  result
end

#precache(max = 0) ⇒ Object

no-op for serialized forms



140
141
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 140

# Intentionally empty: accepts an optional +max+ (default 0), does nothing
# and returns nil.
# NOTE(review): presumably a hook that backends able to cache entries
# override -- confirm against the subclasses.
def precache(max=0)
end

#recalculate ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 193

# Recomputes the corpus-wide statistics and then the per-entry derived
# fields.
#
# Corpus-wide (assigned through the corresponding writers):
#   weighted_size                        - sum of all entry frequencies
#   probability_mean/probability_stddev  - over (frequency / weighted_size) * 100
#   frequency_mean/frequency_stddev      - over the raw frequencies
#
# Per entry, in #each_with_index order: rank, distance, probability,
# distance_probability and percentile.
#
# Returns self.
def recalculate
  total = size
  freqs = frequencies

  # Kernel#BigDecimal replaces BigDecimal.new, which was removed from the
  # bigdecimal library (2.0, shipped with Ruby 2.7).
  self.weighted_size = freqs.reduce(BigDecimal("0"), :+)

  percent_shares = freqs.map { |freq| (freq / weighted_size) * 100 }
  self.probability_mean, self.probability_stddev =
    CorrectHorseBatteryStaple::StatisticalArray.new(percent_shares).mean_and_standard_deviation

  self.frequency_mean, self.frequency_stddev = freqs.mean_and_standard_deviation

  each_with_index do |entry, index|
    entry.rank                 = total - index
    entry.distance             = (entry.frequency - frequency_mean) / frequency_stddev
    entry.probability          = entry.frequency / weighted_size
    # NOTE(review): entry.probability is a fraction while probability_mean and
    # probability_stddev are computed on a percent scale (* 100) above --
    # confirm this mix is intended.
    entry.distance_probability = (entry.probability - probability_mean) / probability_stddev
    # NOTE(review): with a zero-based index this is negative for the first
    # entry; (index + 0.5) may have been intended -- confirm before changing,
    # since stored corpora may depend on the current values.
    entry.percentile           = (index - 0.5) / total * 100
  end

  self
end

#reset ⇒ Object



163
164
165
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 163

# Replaces @filters with a fresh empty array, discarding any blocks queued
# by #filter. Returns that new (empty) array.
def reset
  @filters = []
end

#result ⇒ Object



175
176
177
178
179
180
181
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 175

# Materialises the queued filters into a corpus.
#
# Returns self when no filters are queued. @filters is only created by
# #filter or #reset, so a nil list (fresh object) is treated the same as an
# empty one instead of raising NoMethodError.
#
# Otherwise returns a new instance of the same class built from
# #execute_filters, carrying over this corpus's original_size.
def result
  return self if @filters.nil? || @filters.empty?

  self.class.new(execute_filters).tap do |new_corpus|
    new_corpus.original_size = original_size
  end
end

#sorted_entries ⇒ Object



60
61
62
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 60

# Returns the result of calling #sort on #entries. No block is passed, so
# ordering is whatever the entries' own <=> defines.
def sorted_entries
  entries.sort
end

#stats ⇒ Object



225
226
227
228
229
230
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 225

# Snapshot of the corpus statistics as a Hash keyed by symbol:
# :frequency_mean, :frequency_stddev, :probability_mean,
# :probability_stddev, :size (from #count), :original_size and
# :weighted_size (converted with #to_f).
def stats
  summary = {}
  summary[:frequency_mean]     = frequency_mean
  summary[:frequency_stddev]   = frequency_stddev
  summary[:probability_mean]   = probability_mean
  summary[:probability_stddev] = probability_stddev
  summary[:size]               = count
  summary[:original_size]      = original_size
  summary[:weighted_size]      = weighted_size.to_f
  summary
end

#words ⇒ Object



134
135
136
# File 'lib/correct_horse_battery_staple/corpus/base.rb', line 134

# Maps each entry returned by #execute_filters to its #word.
def words
  execute_filters.map(&:word)
end