Class: Bishop::Bayes
- Inherits:
-
Object
- Object
- Bishop::Bayes
- Defined in:
- lib/bishop.rb
Instance Attribute Summary collapse
-
#cache ⇒ Object
Returns the value of attribute cache.
-
#combiner ⇒ Object
Returns the value of attribute combiner.
-
#corpus ⇒ Object
Returns the value of attribute corpus.
-
#data_class ⇒ Object
Returns the value of attribute data_class.
-
#dirty ⇒ Object
Returns the value of attribute dirty.
-
#pools ⇒ Object
Returns the value of attribute pools.
-
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
-
#train_count ⇒ Object
Returns the value of attribute train_count.
Instance Method Summary collapse
-
#build_cache ⇒ Object
Create a cache of the metrics for each pool.
- #commit ⇒ Object
- #dirty? ⇒ Boolean
- #export ⇒ Object
-
#get_probs(pool, words) ⇒ Object
For each word trained in the pool, collect its occurrence data in the pool into a sorted array.
-
#get_tokens(input) ⇒ Object
Create a token array from the specified input.
-
#guess(msg) ⇒ Object
Call this method to classify a “message”.
-
#initialize(tokenizer = nil, data_class = BayesData, &combiner) ⇒ Bayes
constructor
A new instance of Bayes.
- #load(file = 'bayesdata.yml') ⇒ Object
- #load_data(source) ⇒ Object
-
#merge_pools(dest_name, source_name) ⇒ Object
Merge the contents of the source pool into the destination pool.
-
#new_pool(pool_name) ⇒ Object
Create a new, empty, pool without training.
-
#pool_data(pool_name) ⇒ Object
Return an array of token counts for the specified pool.
- #pool_names ⇒ Object
-
#pool_probs ⇒ Object
Get the probabilities for each pool, recreating the cached information if any token information for any of the pools has changed.
-
#pool_tokens(pool_name) ⇒ Object
Return an array of tokens trained in the specified pool.
- #remove_pool(pool_name) ⇒ Object
- #rename_pool(pool_name, new_name) ⇒ Object
- #save(file = 'bayesdata.yml') ⇒ Object
- #train(pool_name, item, uid = nil) ⇒ Object
- #trained_on?(msg) ⇒ Boolean
- #untrain(pool_name, item, uid = nil) ⇒ Object
Constructor Details
#initialize(tokenizer = nil, data_class = BayesData, &combiner) ⇒ Bayes
Returns a new instance of Bayes.
56 57 58 59 60 61 62 63 64 65 |
# File 'lib/bishop.rb', line 56 def initialize( tokenizer = nil, data_class = BayesData, &combiner ) @tokenizer = tokenizer || Tokenizer.new @combiner = combiner || Proc.new { |probs,ignore| Bishop.robinson( probs, ignore ) } @data_class = data_class @pools = {} @corpus = new_pool( '__Corpus__' ) @pools['__Corpus__'] = @corpus @train_count = 0 @dirty = true end |
Instance Attribute Details
#cache ⇒ Object
Returns the value of attribute cache.
53 54 55 |
# Reader for @cache: the per-pool probability cache built by #build_cache
# and consumed by #pool_probs / #trained_on?.
# File 'lib/bishop.rb', line 53 def cache @cache end |
#combiner ⇒ Object
Returns the value of attribute combiner.
53 54 55 |
# Reader for @combiner: the callable that folds per-token probabilities
# into one score — invoked as combiner.call(probs, pool_name) in #guess.
# File 'lib/bishop.rb', line 53 def combiner @combiner end |
#corpus ⇒ Object
Returns the value of attribute corpus.
53 54 55 |
# Reader for @corpus: the aggregate '__Corpus__' pool holding totals
# across all training (see #initialize and #build_cache).
# File 'lib/bishop.rb', line 53 def corpus @corpus end |
#data_class ⇒ Object
Returns the value of attribute data_class.
53 54 55 |
# Reader for @data_class: the class instantiated for each new pool
# (BayesData by default; see #new_pool).
# File 'lib/bishop.rb', line 53 def data_class @data_class end |
#dirty ⇒ Object
Returns the value of attribute dirty.
53 54 55 |
# Reader for @dirty: set true whenever training data changes so that
# #pool_probs knows to rebuild the cache.
# File 'lib/bishop.rb', line 53 def dirty @dirty end |
#pools ⇒ Object
Returns the value of attribute pools.
53 54 55 |
# Reader for @pools: Hash mapping pool name (String) to its pool object,
# including the special '__Corpus__' entry.
# File 'lib/bishop.rb', line 53 def pools @pools end |
#tokenizer ⇒ Object
Returns the value of attribute tokenizer.
53 54 55 |
# Reader for @tokenizer: the object whose #tokenize method turns raw
# input into a token array (see #get_tokens).
# File 'lib/bishop.rb', line 53 def tokenizer @tokenizer end |
#train_count ⇒ Object
Returns the value of attribute train_count.
53 54 55 |
# Reader for @train_count: initialized to 0 in #initialize.
# NOTE(review): #train bumps corpus.train_count / pool.train_count, not
# this instance counter — confirm intended usage against lib/bishop.rb.
# File 'lib/bishop.rb', line 53 def train_count @train_count end |
Instance Method Details
#build_cache ⇒ Object
Create a cache of the metrics for each pool.
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
# Rebuild the per-pool probability cache consumed by #pool_probs/#guess.
# For every pool except the aggregate '__Corpus__', and every token seen
# in the corpus, a Robinson-style score f = bad / (good + bad) is derived
# from the token's in-pool count versus its count in all other pools,
# clamped to [0.0001, 0.9999]. Only tokens whose score deviates from the
# neutral 0.5 by at least 0.1 are cached — weak evidence is dropped.
# NOTE(review): assumes the counts are Floats (pool.data defaults to 0.0)
# so the divisions below are not integer division — confirm in BayesData.
# File 'lib/bishop.rb', line 145 def build_cache self.cache = {} self.pools.each do |name,pool| unless name == '__Corpus__' pool_count = pool.token_count them_count = [ 1, self.corpus.token_count - pool_count ].max cache_dict = self.cache[ name ] ||= @data_class.new( name ) self.corpus.data.each do |token,tot_count| this_count = pool.data[token] unless this_count == 0.0 other_count = tot_count - this_count if pool_count > 0 good_metric = [ 1.0, other_count / pool_count ].min else good_metric = 1.0 end bad_metric = [ 1.0, this_count / them_count ].min f = bad_metric / ( good_metric + bad_metric ) if ( f - 0.5 ).abs >= 0.1 cache_dict.data[token] = [ 0.0001, [ 0.9999, f ].min ].max end end end end end end |
#commit ⇒ Object
67 68 69 |
# File 'lib/bishop.rb', line 67
# Persist the classifier; alias-style shorthand for #save with its
# default file name.
def commit
  self.save
end
#dirty? ⇒ Boolean
71 72 73 |
# File 'lib/bishop.rb', line 71
# Predicate wrapper over the dirty attribute: truthy when the cache is
# stale and must be rebuilt by #pool_probs.
def dirty?
  self.dirty
end
#export ⇒ Object
121 122 123 |
# File 'lib/bishop.rb', line 121
# Serialize all pools (including '__Corpus__') to a YAML string.
def export
  self.pools.to_yaml
end
#get_probs(pool, words) ⇒ Object
For each word trained in the pool, collect its occurrence data in the pool into a sorted array.
196 197 198 |
# File 'lib/bishop.rb', line 196
# For each word trained in the given pool, collect its occurrence datum
# as a [word, value] pair; unknown words are skipped and the result is
# sorted.
def get_probs( pool, words )
  known = words.select { |word| pool.data.has_key? word }
  known.map { |word| [ word, pool.data[word] ] }.sort
end
#get_tokens(input) ⇒ Object
Create a token array from the specified input.
191 192 193 |
# File 'lib/bishop.rb', line 191
# Turn the raw input into a token array via the configured tokenizer.
def get_tokens( input )
  self.tokenizer.tokenize( input )
end
#guess(msg) ⇒ Object
Call this method to classify a “message”. The return value will be an array containing tuples (pool, probability) for each pool which is a likely match for the message.
264 265 266 267 268 269 270 271 272 273 274 275 276 |
# File 'lib/bishop.rb', line 264
# Classify a "message": for every pool with at least one matching token,
# combine the token probabilities via the configured combiner. Returns
# a name-sorted array of [pool_name, score] pairs.
def guess( msg )
  tokens  = get_tokens( msg )
  results = {}
  pool_probs.each do |pool_name, pool|
    probs = get_probs( pool, tokens )
    results[pool_name] = self.combiner.call( probs, pool_name ) unless probs.empty?
  end
  results.sort
end
#load(file = 'bayesdata.yml') ⇒ Object
125 126 127 128 129 130 131 |
# File 'lib/bishop.rb', line 125
# Load previously saved training data from a YAML file. A missing file
# is not an error: the classifier simply starts untrained.
def load( file = 'bayesdata.yml' )
  File.open( file ) { |f| load_data( f ) }
rescue Errno::ENOENT
  # File does not exist — nothing to load.
end
#load_data(source) ⇒ Object
133 134 135 136 137 138 |
# File 'lib/bishop.rb', line 133
# Replace the current pools with those deserialized from the YAML
# source, restore the 0.0 default that serialization drops from each
# pool's count hash, re-point corpus at '__Corpus__', and mark the
# cache stale.
# NOTE(review): on psych >= 4 YAML.load refuses arbitrary object types;
# round-tripping saved pool objects may require unsafe_load — confirm
# the target Ruby/psych version.
def load_data( source )
  self.pools = YAML.load( source )
  self.pools.each { |pool_name, pool| pool.data.default = 0.0 }
  self.corpus = self.pools['__Corpus__']
  self.dirty = true
end
#merge_pools(dest_name, source_name) ⇒ Object
Merge the contents of the source pool into the destination pool.
94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/bishop.rb', line 94
# Fold every token count from the source pool into the destination pool.
# Counts for tokens already present accumulate; tokens new to the
# destination also bump its token_count. Marks the cache stale.
def merge_pools( dest_name, source_name )
  target = self.pools[dest_name]
  self.pools[source_name].data.each do |token, count|
    if target.data.has_key?( token )
      target.data[token] += count
    else
      target.data[token] = count
      target.token_count += 1
    end
  end
  self.dirty = true
end
#new_pool(pool_name) ⇒ Object
Create a new, empty, pool without training.
76 77 78 79 |
# File 'lib/bishop.rb', line 76
# Return the named pool, creating an empty, untrained one (via the
# configured data class) if it does not exist yet. Always marks the
# cache stale.
def new_pool( pool_name )
  self.dirty = true
  self.pools[pool_name] = @data_class.new( pool_name ) unless self.pools[pool_name]
  self.pools[pool_name]
end
#pool_data(pool_name) ⇒ Object
Return an array of token counts for the specified pool.
108 109 110 |
# File 'lib/bishop.rb', line 108
# Return the [token, count] pairs of the specified pool as an array.
def pool_data( pool_name )
  self.pools[pool_name].data.entries
end
#pool_names ⇒ Object
140 141 142 |
# File 'lib/bishop.rb', line 140
# Sorted names of all user pools; the internal '__Corpus__' aggregate
# is excluded.
def pool_names
  ( self.pools.keys - [ '__Corpus__' ] ).sort
end
#pool_probs ⇒ Object
Get the probabilities for each pool, recreating the cached information if any token information for any of the pools has changed.
182 183 184 185 186 187 188 |
# File 'lib/bishop.rb', line 182
# Return the per-pool probability cache, rebuilding it first if any
# pool's token information changed since the last build.
def pool_probs
  return self.cache unless self.dirty?
  self.build_cache
  self.dirty = false
  self.cache
end
#pool_tokens(pool_name) ⇒ Object
Return an array of tokens trained in the specified pool.
113 114 115 |
# File 'lib/bishop.rb', line 113
# Tokens that have been trained into the specified pool.
def pool_tokens( pool_name )
  self.pools[pool_name].data.each_key.to_a
end
#remove_pool(pool_name) ⇒ Object
81 82 83 |
# File 'lib/bishop.rb', line 81
# Drop the named pool entirely; returns the removed pool, or nil if it
# was not present.
def remove_pool( pool_name )
  self.pools.delete( pool_name )
end
#rename_pool(pool_name, new_name) ⇒ Object
85 86 87 88 89 90 |
# File 'lib/bishop.rb', line 85
# Re-key the pool under new_name, update the pool object's own name,
# drop the old key, and mark the cache stale.
def rename_pool( pool_name, new_name )
  renamed = self.pools[pool_name]
  self.pools[new_name] = renamed
  renamed.name = new_name
  self.pools.delete( pool_name )
  self.dirty = true
end
#save(file = 'bayesdata.yml') ⇒ Object
117 118 119 |
# File 'lib/bishop.rb', line 117
# Write all pools to the given file as YAML (see #load for the inverse).
def save( file = 'bayesdata.yml' )
  File.open( file, 'w' ) do |io|
    YAML.dump( self.pools, io )
  end
end
#train(pool_name, item, uid = nil) ⇒ Object
200 201 202 203 204 205 206 207 208 209 210 |
# File 'lib/bishop.rb', line 200
# Train the named pool (created on demand) on an item: tokenize it,
# feed the tokens to train_, bump the corpus and pool training
# counters, optionally record the uid, and mark the cache stale.
def train( pool_name, item, uid = nil )
  tokens = get_tokens( item )
  pool = new_pool( pool_name )
  train_( pool, tokens )
  self.corpus.train_count += 1
  pool.train_count += 1
  pool.training.push( uid ) if uid
  self.dirty = true
end
#trained_on?(msg) ⇒ Boolean
257 258 259 |
# File 'lib/bishop.rb', line 257
# True if any cached pool reports having been trained on the message.
def trained_on?( msg )
  self.cache.values.any? { |pool| pool.trained_on? msg }
end
#untrain(pool_name, item, uid = nil) ⇒ Object
223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/bishop.rb', line 223
# Remove a previously trained item from the pool: tokenize it, back the
# tokens out via untrain_, reverse the per-pool bookkeeping, and mark
# the cache stale. Mirrors #train.
#
# FIX: the original INCREMENTED corpus.train_count and pool.train_count
# here (copy-paste from #train), so untraining inflated the training
# counters instead of undoing them. Decrement instead, floored at zero
# so repeated untrains cannot drive the counters negative.
def untrain( pool_name, item, uid = nil )
  tokens = get_tokens( item )
  pool = new_pool( pool_name )
  untrain_( pool, tokens )
  self.corpus.train_count = [ 0, self.corpus.train_count - 1 ].max
  pool.train_count = [ 0, pool.train_count - 1 ].max
  pool.training.delete( uid ) if uid
  self.dirty = true
end