Class: CrossLanguageSpotter::JaroReferencesProducer
- Inherits:
-
Object
- Object
- CrossLanguageSpotter::JaroReferencesProducer
- Defined in:
- lib/crosslanguagespotter/methods/jaro.rb
Instance Attribute Summary collapse
-
#verbose ⇒ Object
Returns the value of attribute verbose.
Instance Method Summary collapse
-
#initialize(parameters) ⇒ JaroReferencesProducer
constructor
A new instance of JaroReferencesProducer.
- #jaro_coefficient(s1, s2) ⇒ Object
- #jaro_coefficient_from_context(context_ni, context_nj) ⇒ Object
- #jaro_coefficient_from_nodes(ni, nj) ⇒ Object
-
#produce_set(project) ⇒ Object
Produces a set of cross-language relations, each built from the ids of a pair of nodes whose Jaro coefficient exceeds the threshold.
Constructor Details
#initialize(parameters) ⇒ JaroReferencesProducer
Returns a new instance of JaroReferencesProducer.
29 30 31 32 33 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 29
#
# Stores the configuration read from +parameters+.
#
# @param parameters [Hash] looked up with the keys :threshold, :verbose
#   and :winkleradjust; each value is kept in the instance variable of the
#   same name
def initialize(parameters)
  @threshold, @verbose, @winkleradjust =
    parameters[:threshold], parameters[:verbose], parameters[:winkleradjust]
end
Instance Attribute Details
#verbose ⇒ Object
Returns the value of attribute verbose.
27 28 29 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 27
#
# Reader for the verbose flag.
#
# @return [Object] the current value of @verbose
def verbose
  @verbose
end
Instance Method Details
#jaro_coefficient(s1, s2) ⇒ Object
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 64
#
# Computes a Jaro-style similarity between two sequences and, when
# @winkleradjust is truthy, applies the Winkler common-prefix adjustment.
#
# @param s1 [Array] first sequence of elements (callers in this file pass
#   arrays of strings)
# @param s2 [Array] second sequence of elements
# @return [Float] the similarity score; 0.0 when no element matches, which
#   includes the case where either sequence is empty
def jaro_coefficient(s1, s2)
  # Lengths are captured before the swap below; every later use of them is
  # symmetric in the two lengths.
  s1l = s1.length
  s2l = s2.length

  # Walk the shorter sequence and look its elements up in the longer one.
  s1, s2 = s2, s1 if s1l > s2l

  # For each distinct element of the longer sequence, the ascending list of
  # positions at which it occurs. Built once instead of rescanning the
  # sequence for every element of the shorter one.
  positions = s2.each_with_index.each_with_object({}) do |(element, index), table|
    (table[element] ||= []) << index
  end
  # How many occurrences of each element have already been matched.
  consumed = Hash.new(0)

  # Maximum allowed distance between the positions of two matching elements.
  md = ([s1l, s2l].max / 2) - 1

  m = 0  # number of matching elements
  tr = 0 # number of matches found at different positions
  s1.each_with_index do |c, i|
    candidates = positions[c]
    next unless candidates # element does not occur in the longer sequence

    target = candidates[consumed[c]]
    next unless target # every occurrence has been matched already

    x = (target - i).abs
    next if x > md # too far apart to count as a match

    consumed[c] += 1
    m += 1
    tr += 1 unless x.zero?
  end

  # Without any match the formula below would divide by zero and yield NaN.
  return 0.0 if m.zero?

  tr /= 2 # each transposition is seen twice, once per displaced element

  third = 1.0 / 3
  jd = (third * m / s1l) + (third * m / s2l) + (third * (m - tr) / m)
  return jd unless @winkleradjust

  # Winkler adjustment: reward a shared prefix. The prefix length is capped
  # at 4, as in the standard Jaro-Winkler definition, so that the adjusted
  # score can never exceed 1.0.
  max_prefix = 4
  prefix = s1.zip(s2).take_while { |left, right| left == right }.size
  prefix = max_prefix if prefix > max_prefix
  jd + (prefix * 0.1 * (1 - jd))
end
#jaro_coefficient_from_context(context_ni, context_nj) ⇒ Object
58 59 60 61 62 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 58
#
# Computes the Jaro coefficient of two contexts by comparing their
# sequences of values, each value converted to a string first.
#
# @param context_ni [Object] must respond to #sequence_of_values
# @param context_nj [Object] must respond to #sequence_of_values
# @return [Object] whatever #jaro_coefficient returns for the two sequences
def jaro_coefficient_from_context(context_ni, context_nj)
  values_i, values_j = [context_ni, context_nj].map do |ctx|
    ctx.sequence_of_values.map(&:to_s)
  end
  jaro_coefficient(values_i, values_j)
end
#jaro_coefficient_from_nodes(ni, nj) ⇒ Object
54 55 56 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 54
#
# Computes the Jaro coefficient of two nodes from the contexts returned by
# #context for each of them.
#
# @param ni [Object] first node, passed to #context
# @param nj [Object] second node, passed to #context
# @return [Object] the result of #jaro_coefficient_from_context
def jaro_coefficient_from_nodes(ni, nj)
  first_context = context(ni)
  second_context = context(nj)
  jaro_coefficient_from_context(first_context, second_context)
end
#produce_set(project) ⇒ Object
Produces a set of cross-language relations, each built from the ids of a pair of nodes whose Jaro coefficient exceeds the threshold.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 36
#
# Builds the set of cross-language relations found by the Jaro method.
#
# Every pair of nodes yielded by project.iter_over_shared_ids_instances is
# scored with #jaro_coefficient_from_nodes; pairs scoring strictly above
# @threshold are added to the result as a CrossLanguageRelation built from
# the two node ids. Progress messages are printed when @verbose is truthy.
#
# @param project [Object] must respond to #iter_over_shared_ids_instances,
#   yielding pairs of nodes
# @return [Set] the CrossLanguageRelation objects collected
def produce_set(project)
  relations = Set.new
  puts "Jaro method:" if @verbose

  project.iter_over_shared_ids_instances do |ni, nj|
    # Score through the shared helper instead of rebuilding both contexts
    # inline, so the comparison logic lives in one place.
    if jaro_coefficient_from_nodes(ni, nj) > @threshold
      id_i = NodeId.from_node(ni)
      id_j = NodeId.from_node(nj)
      relations << CrossLanguageRelation.new([id_i, id_j])
    end
  end

  puts "Jaro method, set produced: #{relations.count} elements" if @verbose
  relations
end