Class: CorrectHorseBatteryStaple::Corpus::Sqlite
- Inherits:
-
Base
show all
- Defined in:
- lib/correct_horse_battery_staple/corpus/sqlite.rb
Constant Summary
collapse
- MAX_ITERATIONS =
1000
Instance Attribute Summary
Attributes inherited from Base
#frequency_mean, #frequency_stddev, #original_size, #probability_mean, #probability_stddev, #weighted_size
Class Method Summary
collapse
Instance Method Summary
collapse
-
#_pick_discrete_n(percentile, length, count = 1) ⇒ Object
-
#_pick_discrete_n_ids(percentile, length, count = 1) ⇒ Object
-
#close ⇒ Object
-
#each(&block) ⇒ Object
Core Enumerable building block; delegates to the collection returned by #entries.
-
#entries ⇒ Object
our own collection operations.
-
#frequencies ⇒ Object
-
#get_words_for_ids(ids) ⇒ Object
-
#initialize(file) ⇒ Sqlite
constructor
A new instance of Sqlite.
-
#pick(count, options = {}) ⇒ Object
Dispatches to an optimized pick variant; these variants do NOT support the :filter option.
-
#pick_discrete(count, options = {}) ⇒ Object
-
#pick_discrete2(count, options = {}) ⇒ Object
-
#pick_rtree(count, options = {}) ⇒ Object
-
#pick_standard(count, options = {}) ⇒ Object
-
#pick_standard2(count, options = {}) ⇒ Object
-
#prepare(statement) ⇒ Object
-
#size ⇒ Object
-
#sorted_entries ⇒ Object
Methods inherited from Base
#candidates, #compose_filters, #count, #count_by_options, #count_candidates, #entropy_per_word, #entropy_per_word_by_filter, #filter, #filter_for_options, #inspect, #load_stats_from_hash, #precache, #recalculate, #reset, #result, #stats, #words
Methods included from Memoize
included
#array_sample, #logger, #random_in_range, #random_number, #set_sample
format_for
Constructor Details
#initialize(file) ⇒ Sqlite
Returns a new instance of Sqlite.
7
8
9
10
11
12
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 7
# Opens the SQLite database stored at +file+ and loads the corpus statistics.
#
# @param file [String] database path; also forwarded to the superclass constructor
def initialize(file)
  super(file)
  # Every statement created through #prepare is recorded here so #close can release it.
  @statements = []
  @db = SQLite3::Database.open(file)
  load_stats
end
|
Class Method Details
.read(file) ⇒ Object
14
15
16
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 14
# Convenience constructor: builds a corpus from the database at +file+.
#
# @param file [String] passed unchanged to the constructor
# @return [Object] the newly created instance
def self.read(file)
  new(file)
end
|
Instance Method Details
#_pick_discrete_n(percentile, length, count = 1) ⇒ Object
194
195
196
197
198
199
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 194
# Fetches up to +count+ full rows from +entries+ for one exact percentile and
# word length, restricted to rows whose +randunit+ is below a fresh random number.
#
# @param percentile [Object] bound to the "percentile = ?" placeholder
# @param length [Object] bound to the "wordlength = ?" placeholder
# @param count [Integer] bound to the "limit ?" placeholder
# @return [Object] whatever the statement's execute! returns (presumably an array of rows)
def _pick_discrete_n(percentile, length, count = 1)
  column_list = COLUMNS.join(", ")
  sql = "select #{column_list} from entries where " \
        " percentile = ? and wordlength = ? and randunit < ? limit ?"
  prepare(sql).execute!(percentile, length, random_number, count)
end
|
#_pick_discrete_n_ids(percentile, length, count = 1) ⇒ Object
232
233
234
235
236
237
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 232
# Fetches up to +count+ id rows from +entries+ for one exact percentile and
# word length, restricted to rows whose +randunit+ is above a fresh random number.
#
# @param percentile [Object] bound to the "percentile = ?" placeholder
# @param length [Object] bound to the "wordlength = ?" placeholder
# @param count [Integer] bound to the "limit ?" placeholder
# @return [Object] whatever the statement's execute! returns (presumably one-column rows)
def _pick_discrete_n_ids(percentile, length, count = 1)
  sql = "select id from entries where " \
        " percentile = ? and wordlength = ? and randunit > ? limit ?"
  prepare(sql).execute!(percentile, length, random_number, count)
end
|
#close ⇒ Object
241
242
243
244
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 241
# Closes every statement recorded in @statements, then hands over to the
# superclass implementation of close.
def close
  @statements.each(&:close)
  super
end
|
#each(&block) ⇒ Object
Core Enumerable building block; delegates to the collection returned by #entries.
20
21
22
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 20
# Enumerable building block: forwards the given block to entries.each.
#
# @yield [entry] each element of the collection returned by #entries
# @return [Object] whatever entries.each returns
def each(&block)
  entries.each(&block)
end
|
#entries ⇒ Object
our own collection operations
32
33
34
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 32
# Returns the corpus entries, computing them through +table+ on first use and
# reusing the cached value afterwards.
#
# @return [Object] the (memoized) result of +table+
def entries
  return @entries if @entries

  @entries = table
end
|
#frequencies ⇒ Object
40
41
42
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 40
# Returns the +frequency+ column of every row in +entries+, memoized after the
# first query.
#
# @return [Array] first column of each row returned by the query
def frequencies
  @frequencies ||= begin
    rows = @db.execute("select frequency from entries")
    rows.map(&:first)
  end
end
|
#get_words_for_ids(ids) ⇒ Object
98
99
100
101
102
103
104
105
106
107
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 98
# Loads the rows for the given ids and converts them with word_from_row,
# preserving the order (and any repetition) of +ids+.
#
# Ids that the query does not return are skipped instead of being forwarded
# to word_from_row as nil, so callers that compare the result length with the
# requested count can report the shortfall themselves.
#
# @param ids [Array, Object] one id or a list of ids; coerced with Array()
# @return [Array] one converted word per id that was found
def get_words_for_ids(ids)
  ids = Array(ids)
  return [] if ids.empty?

  # NOTE(review): ids are interpolated into the SQL text, so they must come
  # from a trusted source (the callers in this class pass ids read from the db).
  rows = @db.execute("select #{COLUMNS.join(", ")} from entries where id in (#{ids.join(',')})")

  # Index rows by their first column (the id) so each lookup is O(1); keep the
  # first row seen for an id, like the original linear search did.
  rows_by_id = {}
  rows.each { |row| rows_by_id[row[0]] ||= row }

  ids.map { |id| rows_by_id[id] }.compact.map { |row| word_from_row(row) }
end
|
#pick(count, options = {}) ⇒ Object
Dispatches to an optimized pick variant; these variants do NOT support the :filter option.
47
48
49
50
51
52
53
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 47
# Picks +count+ words using one of the pick_* strategies defined on this object.
#
# The strategy name is taken from options[:strategy], then from the
# 'pick_strategy' environment variable, and finally defaults to "discrete".
# The :strategy key is removed from the options handed to the strategy method;
# this is done on a copy so the caller's hash is left untouched and can be reused.
#
# @param count [Integer] number of words requested
# @param options [Hash] pick options; :filter is rejected
# @return [Object] whatever the selected pick_<strategy> method returns
# @raise [NotImplementedError] when options[:filter] is set
def pick(count, options = {})
  raise NotImplementedError, "SQLite does not support :filter option" if options[:filter]

  options = options.dup
  strategy = options.delete(:strategy) || ENV['pick_strategy'] || "discrete"
  send("pick_#{strategy}", count, options)
end
|
#pick_discrete(count, options = {}) ⇒ Object
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 168
# Picks +count+ words by repeatedly drawing a random (percentile, word length)
# pair and fetching one matching row for it.
#
# At least four draws are always made; drawing stops once enough rows have been
# collected or MAX_ITERATIONS is reached.
#
# @param count [Integer] number of words requested
# @param options [Hash] :percentile and :word_length ranges; they default to
#   0..100 and 4..12 when absent
# @return [Array] +count+ sampled rows converted with word_from_row
# @raise [RuntimeError] when fewer than +count+ rows were collected
def pick_discrete(count, options = {})
  # `||` with a parenthesised range: the previous `x = options[...] or 0..100`
  # parsed as `(x = options[...]) or (0..100)` and never applied the default.
  p_range = options[:percentile] || (0..100)
  l_range = options[:word_length] || (4..12)

  rows = []
  iterations = 0
  while (iterations < 4 || rows.length < count) && iterations < MAX_ITERATIONS
    percentile = random_in_range(p_range)
    length = random_in_range(l_range)
    rows.concat(_pick_discrete_n(percentile, length, 1))
    iterations += 1
  end

  raise "Cannot find #{count} words matching criteria" if rows.length < count

  array_sample(rows, count).map { |row| word_from_row(row) }
end
|
#pick_discrete2(count, options = {}) ⇒ Object
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 203
# Picks +count+ words by repeatedly drawing a random (percentile, word length)
# pair, collecting up to 25 candidate id rows per draw, then sampling ids and
# loading the words for them.
#
# At least three draws are always made; drawing stops once enough distinct
# candidates have been collected or MAX_ITERATIONS is reached.
#
# @param count [Integer] number of words requested
# @param options [Hash] :percentile and :word_length ranges; they default to
#   0..100 and 4..12 when absent
# @return [Object] the result of get_words_for_ids for the sampled ids
# @raise [RuntimeError] when fewer than +count+ words were obtained
def pick_discrete2(count, options = {})
  # `||` with a parenthesised range: the previous `x = options[...] or 0..100`
  # parsed as `(x = options[...]) or (0..100)` and never applied the default.
  p_range = options[:percentile] || (0..100)
  l_range = options[:word_length] || (4..12)

  ids = []
  iterations = 0
  while (iterations < 3 || ids.length < count) && iterations < MAX_ITERATIONS
    percentile = random_in_range(p_range)
    length = random_in_range(l_range)
    # Candidates are id rows; duplicates across draws are dropped.
    ids = ids.concat(_pick_discrete_n_ids(percentile, length, 25)).uniq
    iterations += 1
  end

  ids = array_sample(ids, count).map { |r| r[0] }
  result = get_words_for_ids(ids)
  raise "Cannot find #{count} words matching criteria" if result.length < count

  result
end
|
#pick_rtree(count, options = {}) ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 55
# Picks +count+ words by querying the +index3d+ table for ids inside a random
# window on the R axis, optionally bounded on word length (L) and percentile (P).
#
# The R window spans 0.20 on either side of a random number; when that number
# is above 0.8 or below 0.2 the window is shifted by the overshoot.
#
# @param count [Integer] number of words requested
# @param options [Hash] optional :word_length and :percentile ranges
# @return [Array] result of get_words_for_ids for the sampled ids
# @raise [RuntimeError] when fewer than +count+ words were obtained
def pick_rtree(count, options = {})
  rnd = random_number
  offset =
    if rnd > 0.8
      0.8 - rnd
    elsif rnd < 0.2
      0.2 - rnd
    else
      0.0
    end

  wheres = ["minR >= ? and maxR <= ?"]
  params = [rnd - 0.20 + offset, rnd + 0.20 + offset]

  word_length = options[:word_length]
  if word_length
    wheres << " minL >= ? and maxL <= ? "
    params.push(word_length.first, word_length.last)
  end

  percentile = options[:percentile]
  if percentile
    wheres << " minP >= ? and maxP <= ? "
    params.push(percentile.first, percentile.last)
  end

  # Fetch at least 250 candidate ids so the sample has something to choose from.
  params << [count, 250].max

  # The R condition is always present, so the WHERE clause is never empty.
  sql = ["select id from index3d ", " WHERE " + wheres.join(" AND "), "limit ?"].join(" ")
  ids = array_sample(prepare(sql).execute!(*params), count).map { |r| r[0] }

  words = ids.empty? ? [] : get_words_for_ids(ids)
  raise "Cannot find #{count} words matching criteria" if words.length < count

  words
end
|
#pick_standard(count, options = {}) ⇒ Object
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 110
# Picks +count+ words by selecting randomly ordered rows from +entries+,
# optionally bounded by word length and percentile, and sampling from them.
#
# @param count [Integer] number of words requested
# @param options [Hash] optional :word_length and :percentile ranges
# @return [Array] sampled rows converted with word_from_row
# @raise [RuntimeError] when fewer than +count+ words were obtained
def pick_standard(count, options = {})
  wheres = []
  params = []
  # Each supplied range becomes an inclusive "column >= ? and column <= ?" bound.
  [[:word_length, "wordlength"], [:percentile, "percentile"]].each do |key, column|
    range = options[key]
    next unless range

    wheres << " #{column} >= ? and #{column} <= ? "
    params.push(range.first, range.last)
  end

  where_clause = wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")
  sql = ["select #{COLUMNS.join(", ")} from entries ",
         where_clause,
         "order by RANDOM()",
         "limit ?"].join(" ")
  # Always ask for at least 20 rows, even for smaller counts.
  params << [count, 20].max

  rows = prepare(sql).execute!(*params)
  words = array_sample(rows, count).map { |row| word_from_row(row) }
  raise "Cannot find #{count} words matching criteria" if words.length < count

  words
end
|
#pick_standard2(count, options = {}) ⇒ Object
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 137
# Picks +count+ words by selecting ids from +entries+ (optionally bounded by
# word length and percentile), sampling +count+ of them and loading their words.
#
# @param count [Integer] number of words requested
# @param options [Hash] optional :word_length and :percentile ranges
# @return [Object] result of get_words_for_ids for the sampled ids
# @raise [RuntimeError] when fewer than +count+ words were obtained
def pick_standard2(count, options = {})
  wheres = []
  params = []
  # Each supplied range becomes an inclusive "column >= ? and column <= ?" bound.
  [[:word_length, "wordlength"], [:percentile, "percentile"]].each do |key, column|
    range = options[key]
    next unless range

    wheres << " #{column} >= ? and #{column} <= ? "
    params.push(range.first, range.last)
  end

  where_clause = wheres.empty? ? "" : " WHERE " + wheres.join(" AND ")
  sql = ["select id from entries ", where_clause, "limit ?"].join(" ")
  # Always ask for at least 1000 candidate ids, even for smaller counts.
  params << [count, 1000].max

  id_rows = prepare(sql).execute!(*params)
  ids = array_sample(id_rows, count).map { |r| r[0] }
  words = get_words_for_ids(ids)
  raise "Cannot find #{count} words matching criteria" if words.length < count

  words
end
|
#prepare(statement) ⇒ Object
187
188
189
190
191
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 187
# Returns a prepared statement for the given SQL text, preparing it on first
# use and reusing it for later calls with the same text.
#
# The pick_* methods call this with the same SQL on every loop iteration;
# without the cache each call created another statement that stayed open (and
# referenced from @statements) until #close.
#
# Newly prepared statements are still appended to @statements so #close
# releases them. A cached statement that has since been closed is replaced by
# a fresh one.
#
# @param statement [String] SQL text to prepare
# @return [Object] the statement object returned by @db.prepare
def prepare(statement)
  @statement_cache ||= {}
  cached = @statement_cache[statement]
  return cached if cached && !cached.closed?

  res = @db.prepare(statement)
  @statements << res
  @statement_cache[statement] = res
end
|
#size ⇒ Object
24
25
26
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 24
# Returns the number of rows in +entries+, querying the database once and
# caching the answer.
#
# @return [Object] first column of the first row of "select count(*) from entries"
def size
  return @size if @size

  count_row = @db.execute("select count(*) from entries").first
  @size = count_row.first
end
|
#sorted_entries ⇒ Object
36
37
38
|
# File 'lib/correct_horse_battery_staple/corpus/sqlite.rb', line 36
# Returns the same collection as #entries; no sorting is performed here.
# NOTE(review): presumably the stored rows are already in sorted order — confirm.
#
# @return [Object] the value of #entries
def sorted_entries
  entries
end
|