Class: LooseTightDictionary

Inherits:
Object
  • Object
show all
Defined in:
lib/loose_tight_dictionary.rb,
lib/loose_tight_dictionary/score.rb,
lib/loose_tight_dictionary/result.rb,
lib/loose_tight_dictionary/version.rb,
lib/loose_tight_dictionary/wrapper.rb,
lib/loose_tight_dictionary/blocking.rb,
lib/loose_tight_dictionary/identity.rb,
lib/loose_tight_dictionary/stop_word.rb,
lib/loose_tight_dictionary/tightener.rb,
lib/loose_tight_dictionary/similarity.rb,
lib/loose_tight_dictionary/cached_result.rb

Overview

See the README for more information.

Defined Under Namespace

Classes: Blocking, CachedResult, Identity, Result, Score, Similarity, StopWord, Tightener, Wrapper

Constant Summary collapse

VERSION =
'1.0.4'

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(records, options = {}) ⇒ LooseTightDictionary

records - the haystack: a collection of records to search. Options:

  • tighteners: regexps (see readme)

  • identities: regexps

  • blockings: regexps

  • stop_words: regexps

  • read: how to interpret each entry in the ‘haystack’, either a Proc or a symbol



36
37
38
39
40
41
42
43
44
45
46
47
# File 'lib/loose_tight_dictionary.rb', line 36

# Builds a dictionary over +records+ (the haystack).
#
# records - an enumerable of records; each one is wrapped in a Wrapper.
# options - a Hash, passed through symbolize_keys before being read:
#           :first_blocking_decides, :must_match_blocking,
#           :must_match_at_least_one_word - flags, each defaulting to false
#           :blockings, :identities, :tighteners, :stop_words - lists
#             (default empty) whose entries are wrapped in Blocking,
#             Identity, Tightener and StopWord respectively
#           :read - handed to every Wrapper together with the record;
#             :haystack_reader is used when :read is missing or falsy
def initialize(records, options = {})
  opts = options.symbolize_keys

  @first_blocking_decides = opts.fetch(:first_blocking_decides, false)
  @must_match_blocking = opts.fetch(:must_match_blocking, false)
  @must_match_at_least_one_word = opts.fetch(:must_match_at_least_one_word, false)

  @blockings = opts.fetch(:blockings, []).map { |pattern| Blocking.new(pattern) }
  @identities = opts.fetch(:identities, []).map { |pattern| Identity.new(pattern) }
  @tighteners = opts.fetch(:tighteners, []).map { |pattern| Tightener.new(pattern) }
  @stop_words = opts.fetch(:stop_words, []).map { |pattern| StopWord.new(pattern) }

  reader = opts[:read] || opts[:haystack_reader]
  @haystack = records.map { |item| Wrapper.new(self, item, reader) }
end

Instance Attribute Details

#blockings ⇒ Object (readonly)

Returns the value of attribute blockings.



21
22
23
# File 'lib/loose_tight_dictionary.rb', line 21

# Returns the Blocking objects that #initialize built from the :blockings
# option (an empty list when the option was not given).
def blockings
  @blockings
end

#first_blocking_decides ⇒ Object (readonly)

Returns the value of attribute first_blocking_decides.



25
26
27
# File 'lib/loose_tight_dictionary.rb', line 25

# Returns the :first_blocking_decides option (false unless given). When set,
# #find lets only the first blocking that matches the needle decide which
# straws are joined, instead of asking every blocking.
def first_blocking_decides
  @first_blocking_decides
end

#haystack ⇒ Object (readonly)

Returns the value of attribute haystack.



20
21
22
# File 'lib/loose_tight_dictionary.rb', line 20

# Returns the wrapped records (one Wrapper per record passed to #initialize).
# Becomes nil once #free has run.
def haystack
  @haystack
end

#identities ⇒ Object (readonly)

Returns the value of attribute identities.



22
23
24
# File 'lib/loose_tight_dictionary.rb', line 22

# Returns the Identity objects that #initialize built from the :identities
# option. #find uses them to split joined straws into "possibly identical"
# and "certainly different".
def identities
  @identities
end

#must_match_at_least_one_word ⇒ Object (readonly)

Returns the value of attribute must_match_at_least_one_word.



27
28
29
# File 'lib/loose_tight_dictionary.rb', line 27

# Returns the :must_match_at_least_one_word option (false unless given).
# When set, #find only considers straws that share at least one word with
# the needle.
def must_match_at_least_one_word
  @must_match_at_least_one_word
end

#must_match_blocking ⇒ Object (readonly)

Returns the value of attribute must_match_blocking.



26
27
28
# File 'lib/loose_tight_dictionary.rb', line 26

# Returns the :must_match_blocking option (false unless given). When set and
# blockings exist, #find gives up early (nil, or [] for find_all) if no
# blocking matches the needle.
def must_match_blocking
  @must_match_blocking
end

#stop_words ⇒ Object (readonly)

Returns the value of attribute stop_words.



24
25
26
# File 'lib/loose_tight_dictionary.rb', line 24

# Returns the StopWord objects that #initialize built from the :stop_words
# option (an empty list when the option was not given).
def stop_words
  @stop_words
end

#tighteners ⇒ Object (readonly)

Returns the value of attribute tighteners.



23
24
25
# File 'lib/loose_tight_dictionary.rb', line 23

# Returns the Tightener objects that #initialize built from the :tighteners
# option (an empty list when the option was not given).
def tighteners
  @tighteners
end

Instance Method Details

#explain(needle) ⇒ Object

Explain is like MySQL’s EXPLAIN command: you give it a needle and it tells you how that needle was located (successfully or not) in the haystack.

d = LooseTightDictionary.new ['737', '747', '757' ]
d.explain 'boeing 737-100'


166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/loose_tight_dictionary.rb', line 166

# Explain is like MySQL's EXPLAIN command: give it a needle and it reports how
# that needle was located (successfully or not) in the haystack.
#
# Runs #find with :gather_last_result => true and then writes, through #log,
# the needle, the configured rules, every intermediate candidate set recorded
# on last_result, the ten highest-sorted similarities and the final match.
#
#   d = LooseTightDictionary.new ['737', '747', '757' ]
#   d.explain 'boeing 737-100'
#
# needle - the value to look up.
#
# Raises whatever #find raises (e.g. ::RuntimeError once the dictionary has
# been freed).
def explain(needle)
  record = find needle, :gather_last_result => true

  banner = "#" * 150
  rule = "-" * 150

  log banner
  log "# Match #{needle.inspect} => #{record.inspect}"
  log banner
  log

  result = last_result

  log "Needle"
  log rule
  log result.needle.render
  log
  log "Stop words"
  log rule
  log(result.stop_words.blank? ? '(none)' : result.stop_words.map { |stop_word| stop_word.inspect }.join("\n"))
  log
  log "Candidates"
  log rule
  # #find can return before it records any candidates (must_match_blocking
  # with no matching blocking), so this list needs the same guard as the
  # others. The block parameter is deliberately not called `record`, which
  # would shadow the match logged at the end.
  log(result.candidates.blank? ? '(none)' : result.candidates.map { |candidate| candidate.render }.join("\n"))
  log
  log "Tighteners"
  log rule
  log(result.tighteners.blank? ? '(none)' : result.tighteners.map { |tightener| tightener.inspect }.join("\n"))
  log
  log "Blockings"
  log rule
  log(result.blockings.blank? ? '(none)' : result.blockings.map { |blocking| blocking.inspect }.join("\n"))
  log
  log "Identities"
  log rule
  log(result.identities.blank? ? '(none)' : result.identities.map { |identity| identity.inspect }.join("\n"))
  log
  log "Joint"
  log rule
  log(result.joint.blank? ? '(none)' : result.joint.map { |straw| straw.render }.join("\n"))
  log
  log "Disjoint"
  log rule
  log(result.disjoint.blank? ? '(none)' : result.disjoint.map { |straw| straw.render }.join("\n"))
  log
  log "Possibly identical"
  log rule
  log(result.possibly_identical.blank? ? '(none)' : result.possibly_identical.map { |straw| straw.render }.join("\n"))
  log
  log "Certainly different"
  log rule
  log(result.certainly_different.blank? ? '(none)' : result.certainly_different.map { |straw| straw.render }.join("\n"))
  log
  log "Similarities"
  log rule
  # similarities are stored in ascending sort order; show the top ten, best first
  log(result.similarities.blank? ? '(none)' : result.similarities.reverse.first(10).map { |similarity| similarity.inspect }.join("\n"))
  log
  log "Match"
  log rule
  log record.inspect
end

#find(needle, options = {}) ⇒ Object

Raises:

  • (::RuntimeError)


58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/loose_tight_dictionary.rb', line 58

# Looks +needle+ up in the haystack.
#
# The search narrows the haystack in stages:
# 1. candidates - every straw, or only those sharing a word with the needle
#    when must_match_at_least_one_word is set
# 2. joint / disjoint - candidates accepted / rejected by the blockings
#    (if nothing is joint, the disjoint straws are used instead)
# 3. possibly identical / certainly different - joint straws for which every
#    identity answers nil or true, versus the rest
# 4. the possibly identical straws are ranked by similarity to the needle
#
# needle  - the value to look up; it is wrapped in a Wrapper.
# options - a Hash, passed through symbolize_keys before being read:
#           :gather_last_result - when true, every intermediate stage is
#                                 recorded on a fresh Result (see last_result)
#           :find_all           - when true, return the records of all
#                                 possibly identical straws instead of one
#
# Returns the record of the most similar straw, or nil when there is none or
# its best Dice's coefficient is not above zero. With :find_all, returns an
# Array of records (possibly empty).
# Raises ::RuntimeError if the dictionary has been freed.
def find(needle, options = {})
  raise ::RuntimeError, "[loose_tight_dictionary] Dictionary has already been freed, can't perform more finds" if freed?

  options = options.symbolize_keys
  gather_last_result = options.fetch(:gather_last_result, false)
  is_find_all = options.fetch(:find_all, false)

  if gather_last_result
    free_last_result
    @last_result = Result.new
    last_result.tighteners = tighteners
    last_result.identities = identities
    last_result.blockings = blockings
    last_result.stop_words = stop_words
  end

  needle = Wrapper.new self, needle
  last_result.needle = needle if gather_last_result

  if must_match_blocking && blockings.any? && blockings.none? { |blocking| blocking.match?(needle) }
    return is_find_all ? [] : nil
  end

  candidates = if must_match_at_least_one_word
    needle_words = needle.words
    haystack.select { |straw| (needle_words & straw.words).any? }
  else
    haystack
  end
  last_result.candidates = candidates if gather_last_result

  joint, disjoint = if blockings.any?
    if first_blocking_decides
      # Which blocking matches the needle does not depend on the straw, so
      # look it up once instead of once per candidate.
      deciding_blocking = blockings.detect { |blocking| blocking.match?(needle) }
      candidates.partition { |straw| deciding_blocking && deciding_blocking.join?(needle, straw) }
    else
      candidates.partition { |straw| blockings.any? { |blocking| blocking.join?(needle, straw) } }
    end
  else
    [candidates.dup, []]
  end

  # special case: the needle didn't fit anywhere, so we'll try it against everything
  # NOTE(review): must_match_blocking is not consulted here - confirm that is intended
  if joint.empty?
    joint = disjoint
    disjoint = []
  end

  if gather_last_result
    last_result.joint = joint
    last_result.disjoint = disjoint
  end

  possibly_identical, certainly_different = if identities.any?
    joint.partition do |straw|
      identities.all? do |identity|
        answer = identity.identical?(needle, straw)
        # only an explicit nil ("no opinion") or true keeps the straw
        answer.nil? || answer == true
      end
    end
  else
    [joint.dup, []]
  end

  if gather_last_result
    last_result.possibly_identical = possibly_identical
    last_result.certainly_different = certainly_different
  end

  return possibly_identical.map { |straw| straw.record } if is_find_all

  # ascending sort: the best similarity ends up last
  similarities = possibly_identical.map { |straw| needle.similarity(straw) }.sort
  last_result.similarities = similarities if gather_last_result

  best_similarity = similarities.last
  return nil unless best_similarity && best_similarity.best_score.dices_coefficient > 0

  record = best_similarity.wrapper2.record
  if gather_last_result
    last_result.record = record
    last_result.score = best_similarity.best_score.dices_coefficient
  end
  record
end

#find_all(needle, options = {}) ⇒ Object



53
54
55
56
# File 'lib/loose_tight_dictionary.rb', line 53

# Same as #find, but with the :find_all option forced to true, so the records
# of every straw that survives the blockings and identities are returned
# rather than only the single best match.
#
# needle  - the value to look up.
# options - a Hash forwarded to #find after symbolize_keys; any :find_all
#           entry supplied by the caller is overridden.
def find_all(needle, options = {})
  find(needle, options.symbolize_keys.merge(:find_all => true))
end

#free ⇒ Object



228
229
230
231
232
233
234
# File 'lib/loose_tight_dictionary.rb', line 228

# Releases what the dictionary holds on to: drops the last result, empties
# the haystack and forgets it. The dictionary is marked as freed even when
# one of those steps raises, after which #find refuses to run.
def free
  free_last_result
  @haystack.clear unless @haystack.nil?
  @haystack = nil
ensure
  # runs on the error path too, so a half-freed dictionary is never reused
  @freed = true
end

#freed? ⇒ Boolean

Returns:

  • (Boolean)


224
225
226
# File 'lib/loose_tight_dictionary.rb', line 224

# Tells whether #free has been called on this dictionary.
#
# Returns true only when the freed flag is exactly true; an unset flag
# yields false.
def freed?
  @freed.equal?(true)
end

#last_result ⇒ Object



49
50
51
# File 'lib/loose_tight_dictionary.rb', line 49

# Returns the Result recorded by the most recent #find that was run with
# :gather_last_result => true.
#
# Raises ::RuntimeError when no such result is available.
def last_result
  return @last_result if @last_result

  raise ::RuntimeError, "[loose_tight_dictionary] You can't access the last result until you've run a find with :gather_last_result => true"
end

#log(str = '') ⇒ Object

:nodoc:



220
221
222
# File 'lib/loose_tight_dictionary.rb', line 220

# Writes +str+ (an empty line by default) to $stderr with puts. #explain
# sends all of its output through here.
def log(str = '') #:nodoc:
  $stderr.puts str
end