Class: LooseTightDictionary

Inherits:
Object
  • Object
show all
Defined in:
lib/loose_tight_dictionary.rb,
lib/loose_tight_dictionary/score.rb,
lib/loose_tight_dictionary/result.rb,
lib/loose_tight_dictionary/version.rb,
lib/loose_tight_dictionary/wrapper.rb,
lib/loose_tight_dictionary/blocking.rb,
lib/loose_tight_dictionary/identity.rb,
lib/loose_tight_dictionary/tightener.rb,
lib/loose_tight_dictionary/similarity.rb,
lib/loose_tight_dictionary/cached_result.rb

Overview

See the README for more information.

Defined Under Namespace

Classes: Blocking, CachedResult, Identity, Result, Score, Similarity, Tightener, Wrapper

Constant Summary collapse

VERSION =
'1.0.2'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(records, options = {}) ⇒ LooseTightDictionary

records - the haystack: a bunch of records to search. Options:

  • tighteners: regexps (see readme)

  • identities: regexps

  • blockings: regexps

  • read: how to interpret each entry in the ‘haystack’, either a Proc or a symbol



29
30
31
32
33
34
# File 'lib/loose_tight_dictionary.rb', line 29

# Builds a dictionary over +records+ (the haystack).
#
# records - an enumerable of records; each one is wrapped in a Wrapper.
# options - a Hash; keys may be strings or symbols (they are symbolized):
#           :read (or the alternate key :haystack_reader) is handed to every
#           Wrapper as its reader; the remaining keys (:tighteners,
#           :identities, :blockings, ...) are consulted lazily by other methods.
def initialize(records, options = {})
  @options = options.symbolize_keys
  @records = records
  # Look the reader up in the symbolized copy so that string-keyed options
  # ('read' => ...) are honoured just like every other option.
  read = @options[:read] || @options[:haystack_reader]
  @haystack = records.map { |record| Wrapper.new self, record, read }
end

Instance Attribute Details

#haystackObject (readonly)

Returns the value of attribute haystack.



20
21
22
# File 'lib/loose_tight_dictionary.rb', line 20

# Returns the wrapped records built by #initialize (one Wrapper per record).
# Returns nil once #free has been called.
def haystack
  @haystack
end

#optionsObject (readonly)

Returns the value of attribute options.



19
20
21
# File 'lib/loose_tight_dictionary.rb', line 19

# Returns the options hash stored by #initialize (keys symbolized).
# Returns nil once #free has been called.
def options
  @options
end

#recordsObject (readonly)

Returns the value of attribute records.



21
22
23
# File 'lib/loose_tight_dictionary.rb', line 21

# Returns the records collection exactly as it was passed to #initialize.
def records
  @records
end

Instance Method Details

#blockingsObject



214
215
216
217
218
# File 'lib/loose_tight_dictionary.rb', line 214

# Memoized list of Blocking objects, one per entry of options[:blockings]
# (an empty list when that option is missing or nil).
def blockings
  return @blockings if @blockings
  patterns = options[:blockings] || []
  @blockings = patterns.map { |pattern| Blocking.new(pattern) }
end

#explain(needle) ⇒ Object

Explain is like MySQL’s EXPLAIN command. You give it a needle and it tells you how that needle was located (successfully or not) in the haystack.

d = LooseTightDictionary.new ['737', '747', '757' ]
d.explain 'boeing 737-100'


143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# File 'lib/loose_tight_dictionary.rb', line 143

# Explain is like MySQL's EXPLAIN command: it runs #find for +needle+ with
# :gather_last_result => true and then writes, via #log, every stage recorded
# on the last result (needle, haystack, tighteners, blockings, identities,
# joint/disjoint, possibly identical/certainly different, the top ten
# similarities) followed by the matched record.
#
#   d = LooseTightDictionary.new ['737', '747', '757' ]
#   d.explain 'boeing 737-100'
#
# needle - the value to look up.
#
# Returns whatever the final #log call returns.
def explain(needle)
  record = find needle, :gather_last_result => true
  result = last_result
  banner = "#" * 150
  rule = "-" * 150

  # Writes one titled section: blank line, title, horizontal rule, body.
  section = lambda do |title, body|
    log
    log title
    log rule
    log body
  end

  # Renders a collection one item per line (each item converted by calling
  # +formatter+ on it), or '(none)' when the collection is blank/nil.
  listing = lambda do |items, formatter|
    items.blank? ? '(none)' : items.map(&formatter).join("\n")
  end

  log banner
  log "# Match #{needle.inspect} => #{record.inspect}"
  log banner

  section.call "Needle", result.needle.to_str
  # The block variable is deliberately NOT called `record`: on Ruby 1.8 a block
  # parameter with that name would overwrite the match logged at the end.
  section.call "Haystack", result.haystack.map { |straw| straw.to_str }.join("\n")
  section.call "Tighteners", listing.call(result.tighteners, :inspect)
  section.call "Blockings", listing.call(result.blockings, :inspect)
  section.call "Identities", listing.call(result.identities, :inspect)
  section.call "Joint", listing.call(result.joint, :to_str)
  section.call "Disjoint", listing.call(result.disjoint, :to_str)
  section.call "Possibly identical", listing.call(result.possibly_identical, :to_str)
  section.call "Certainly different", listing.call(result.certainly_different, :to_str)

  # Similarities are stored sorted ascending by #find; show the best ten first.
  top_similarities = result.similarities.blank? ? nil : result.similarities.reverse[0..9]
  section.call "Similarities", listing.call(top_similarities, :inspect)

  section.call "Match", record.inspect
end

#find(needle, options = {}) ⇒ Object

Raises:

  • (::RuntimeError)


49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/loose_tight_dictionary.rb', line 49

# Looks +needle+ up in the haystack.
#
# needle  - the value to search for; it is wrapped in a Wrapper first.
# options - a Hash (string or symbol keys):
#           :find_all           - when truthy, return every record that is
#                                 "possibly identical" instead of the best one.
#           :gather_last_result - when truthy, record every intermediate stage
#                                 on a fresh Result available via #last_result.
#
# Stages:
# 1. Blockings split the haystack into joint/disjoint straws. If nothing is
#    joint, the whole haystack is tried instead.
# 2. Identities split the joint straws into possibly identical / certainly
#    different; a straw survives when every identity answers nil or true.
# 3. Unless :find_all, the survivors are ranked by similarity and the record
#    of the highest-ranked one is returned if its best score is above zero.
#
# Returns an Array of records with :find_all; otherwise the best record or nil.
# Raises ::RuntimeError if the dictionary has already been freed.
def find(needle, options = {})
  raise ::RuntimeError, "[loose_tight_dictionary] Dictionary has already been freed, can't perform more finds" if freed?

  options = options.symbolize_keys
  gather_last_result = options.fetch(:gather_last_result, false)
  find_all = options.fetch(:find_all, false)

  if gather_last_result
    free_last_result
    @last_result = Result.new
    last_result.tighteners = tighteners
    last_result.identities = identities
    last_result.blockings = blockings
  end

  needle = Wrapper.new self, needle
  last_result.needle = needle if gather_last_result

  if must_match_blocking && blockings.any? && blockings.none? { |blocking| blocking.match? needle }
    return find_all ? [] : nil
  end

  joint, disjoint = if blockings.any?
    if first_blocking_decides
      # The deciding blocking depends only on the needle, so find it once
      # instead of re-scanning the blockings for every straw.
      deciding_blocking = blockings.detect { |blocking| blocking.match? needle }
      haystack.partition { |straw| deciding_blocking.try(:join?, needle, straw) }
    else
      haystack.partition do |straw|
        blockings.any? { |blocking| blocking.join? needle, straw }
      end
    end
  else
    [ haystack.dup, [] ]
  end

  # special case: the needle didn't fit anywhere, but must_match_blocking is false, so we'll try it against everything
  if joint.none?
    joint = disjoint
    disjoint = []
  end

  if gather_last_result
    last_result.joint = joint
    last_result.disjoint = disjoint
  end

  possibly_identical, certainly_different = if identities.any?
    joint.partition do |straw|
      identities.all? do |identity|
        # nil means "this identity has no opinion"; only an explicit
        # non-true answer rules the straw out.
        answer = identity.identical? needle, straw
        answer.nil? || answer == true
      end
    end
  else
    [ joint.dup, [] ]
  end

  if gather_last_result
    last_result.possibly_identical = possibly_identical
    last_result.certainly_different = certainly_different
  end

  return possibly_identical.map { |straw| straw.record } if find_all

  # Sorted ascending, so the best candidate is the last element.
  similarities = possibly_identical.map { |straw| needle.similarity straw }.sort
  last_result.similarities = similarities if gather_last_result

  best_similarity = similarities[-1]
  if best_similarity && best_similarity.best_score.to_f > 0
    record = best_similarity.wrapper2.record
    if gather_last_result
      last_result.record = record
      last_result.score = best_similarity.best_score.to_f
    end
    record
  end
end

#find_all(needle, options = {}) ⇒ Object



44
45
46
47
# File 'lib/loose_tight_dictionary.rb', line 44

# Delegates to #find with :find_all forced to true (overriding any value the
# caller supplied); all other options are passed through with symbolized keys.
def find_all(needle, options = {})
  find(needle, options.symbolize_keys.merge(:find_all => true))
end

#first_blocking_decidesObject



198
199
200
# File 'lib/loose_tight_dictionary.rb', line 198

# Value of the :first_blocking_decides option; false when the key is absent.
def first_blocking_decides
  options.fetch(:first_blocking_decides) { false }
end

#freeObject



224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# File 'lib/loose_tight_dictionary.rb', line 224

# Releases everything the dictionary holds: calls free_last_result, then
# clears (when present) and nils out the options, haystack, tighteners,
# identities and blockings. The dictionary is marked as freed even if
# something raises along the way.
def free
  free_last_result
  [:@options, :@haystack, :@tighteners, :@identities, :@blockings].each do |ivar|
    instance_variable_get(ivar).try :clear
    instance_variable_set ivar, nil
  end
  nil
ensure
  @freed = true
end

#freed?Boolean

Returns:

  • (Boolean)


220
221
222
# File 'lib/loose_tight_dictionary.rb', line 220

# True only once the @freed flag has been set to true (which #free does).
def freed?
  @freed.equal?(true)
end

#identitiesObject



208
209
210
211
212
# File 'lib/loose_tight_dictionary.rb', line 208

# Memoized list of Identity objects, one per entry of options[:identities]
# (an empty list when that option is missing or nil).
def identities
  return @identities if @identities
  patterns = options[:identities] || []
  @identities = patterns.map { |pattern| Identity.new(pattern) }
end

#last_resultObject



36
37
38
# File 'lib/loose_tight_dictionary.rb', line 36

# Returns the Result gathered by the most recent #find that was run with
# :gather_last_result => true.
#
# Raises ::RuntimeError when no such result is available.
def last_result
  return @last_result if @last_result
  raise ::RuntimeError, "[loose_tight_dictionary] You can't access the last result until you've run a find with :gather_last_result => true"
end

#log(str = '') ⇒ Object

:nodoc:



40
41
42
# File 'lib/loose_tight_dictionary.rb', line 40

# Writes +str+ (an empty line by default) to options[:log], falling back to
# $stderr when that option is unset. Passing :log => false silences output.
def log(str = '') #:nodoc:
  sink = options[:log]
  return if sink == false
  (sink || $stderr).puts str
end

#must_match_blockingObject



194
195
196
# File 'lib/loose_tight_dictionary.rb', line 194

# Value of the :must_match_blocking option; false when the key is absent.
def must_match_blocking
  options.fetch(:must_match_blocking) { false }
end

#tightenersObject



202
203
204
205
206
# File 'lib/loose_tight_dictionary.rb', line 202

# Memoized list of Tightener objects, one per entry of options[:tighteners]
# (an empty list when that option is missing or nil).
def tighteners
  return @tighteners if @tighteners
  patterns = options[:tighteners] || []
  @tighteners = patterns.map { |pattern| Tightener.new(pattern) }
end