Class: CrossLanguageSpotter::JaroReferencesProducer

Inherits:
Object
  • Object
show all
Defined in:
lib/crosslanguagespotter/methods/jaro.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parameters) ⇒ JaroReferencesProducer

Returns a new instance of JaroReferencesProducer.



29
30
31
32
33
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 29

# Builds a producer configured from a parameters collection.
#
# Three entries are read, each via parameters[key]; a missing entry simply
# leaves the corresponding instance variable set to whatever [] returns
# (nil for a plain Hash):
#   :threshold     - lower bound used by produce_set (a pair is kept only
#                    when its coefficient is strictly greater than this)
#   :verbose       - when truthy, produce_set prints progress messages
#   :winkleradjust - when truthy, jaro_coefficient applies the Winkler
#                    common-prefix adjustment
#
# @param parameters [#[]] presumably a Hash keyed by symbols
def initialize(parameters)
    @threshold, @verbose, @winkleradjust =
        parameters[:threshold], parameters[:verbose], parameters[:winkleradjust]
end

Instance Attribute Details

#verbose ⇒ Object

Returns the value of attribute verbose.



27
28
29
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 27

# Reader for the verbosity flag.
#
# The value is whatever was stored in @verbose; the constructor fills it
# from parameters[:verbose], and produce_set prints progress messages when
# it is truthy.
#
# @return [Object] the current value of @verbose (nil if never assigned)
def verbose
  @verbose
end

Instance Method Details

#jaro_coefficient(s1, s2) ⇒ Object



64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 64

# Computes a Jaro-style similarity between two sequences of values,
# optionally boosted by the Winkler common-prefix adjustment when
# @winkleradjust is truthy.
#
# Matching rule (unchanged from the previous implementation): each element
# of the shorter sequence is compared against the next not-yet-consumed
# occurrence of the same value in the longer sequence, and counts as a match
# only if the two positions differ by at most the match window. A match at
# differing positions counts as half a transposition. This is a greedy
# approximation and can differ from the textbook Jaro definition.
#
# Fixes relative to the previous implementation:
# * no matches / empty input now yields 0.0 instead of NaN (0.0 / 0);
# * the match window is clamped to 0, so identical one-element sequences
#   score 1.0 instead of never matching;
# * the Winkler prefix is capped at 4 elements, keeping the result <= 1.0;
# * occurrence positions are indexed once instead of being recomputed for
#   every element (this assumes Array#aindices, used previously, returned
#   every index of a value in order - TODO confirm against its definition).
#
# @param s1 [Array] first sequence (callers in this file pass arrays of strings)
# @param s2 [Array] second sequence; argument order does not affect the result
# @return [Float] similarity in 0.0..1.0; 0.0 when nothing matches
def jaro_coefficient(s1,s2)
    # from here on s1 is the shorter sequence and s2 the longer one
    s1, s2 = s2, s1 if s1.length > s2.length
    shorter_len = s1.length
    longer_len = s2.length
    return 0.0 if shorter_len.zero?

    # value => ascending indices at which it occurs in the longer sequence
    positions = s2.each_index.group_by { |index| s2[index] }
    # value => how many of its occurrences have already been matched
    consumed = Hash.new(0)
    # maximum positional offset for a match; never negative
    window = [longer_len / 2 - 1, 0].max

    matches = 0
    displaced = 0 # matches whose positions differ
    s1.each_with_index do |value, i|
        candidates = positions[value]
        next if candidates.nil? # value absent from the longer sequence

        j = candidates[consumed[value]]
        next if j.nil? # every occurrence already consumed

        offset = (j - i).abs
        next if offset > window

        consumed[value] += 1
        matches += 1
        displaced += 1 unless offset.zero?
    end
    return 0.0 if matches.zero?

    transpositions = displaced / 2
    jd = (matches.fdiv(shorter_len) +
          matches.fdiv(longer_len) +
          (matches - transpositions).fdiv(matches)) / 3
    return jd unless @winkleradjust

    # Winkler adjustment: reward a shared prefix, capped at 4 elements so
    # that with the 0.1 scaling factor the score cannot exceed 1.0
    prefix = s1.zip(s2).take_while { |left, right| left == right }.size
    prefix = [prefix, 4].min
    jd + (prefix * 0.1 * (1 - jd))
end

#jaro_coefficient_from_context(context_ni, context_nj) ⇒ Object



58
59
60
61
62
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 58

# Computes the Jaro coefficient of two contexts.
#
# Each context's sequence_of_values is converted element-wise to strings
# and the two resulting arrays are handed to jaro_coefficient.
#
# @param context_ni [#sequence_of_values] first context
# @param context_nj [#sequence_of_values] second context
# @return [Object] whatever jaro_coefficient returns for the two arrays
def jaro_coefficient_from_context(context_ni,context_nj)
    values_i = context_ni.sequence_of_values.map(&:to_s)
    values_j = context_nj.sequence_of_values.map(&:to_s)
    jaro_coefficient(values_i, values_j)
end

#jaro_coefficient_from_nodes(ni, nj) ⇒ Object



54
55
56
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 54

# Computes the Jaro coefficient of two nodes by comparing their contexts.
#
# The context of each node is obtained through the context helper and the
# pair is delegated to jaro_coefficient_from_context.
#
# @param ni first node
# @param nj second node
# @return [Object] the result of jaro_coefficient_from_context
def jaro_coefficient_from_nodes(ni,nj)
    context_ni = context(ni)
    context_nj = context(nj)
    jaro_coefficient_from_context(context_ni, context_nj)
end

#produce_set(project) ⇒ Object

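Produces a set of CrossLanguageRelation objects, each built from the ids of a pair of nodes whose Jaro coefficient exceeds the threshold.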



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/crosslanguagespotter/methods/jaro.rb', line 36

# Collects relations between node pairs whose contexts are similar.
#
# For every pair yielded by project.iter_over_shared_ids_instances, the
# string forms of both nodes' context values are scored with
# jaro_coefficient; a pair is recorded only when the score is strictly
# greater than @threshold. Progress messages are printed when @verbose is
# truthy.
#
# @param project [#iter_over_shared_ids_instances] source of node pairs
# @return [Set] CrossLanguageRelation instances, one per accepted pair
def produce_set(project)
    puts "Jaro method:" if @verbose
    relations = Set.new

    project.iter_over_shared_ids_instances do |ni, nj|
        values_i = context(ni).sequence_of_values.map(&:to_s)
        values_j = context(nj).sequence_of_values.map(&:to_s)
        next unless jaro_coefficient(values_i, values_j) > @threshold

        relations << CrossLanguageRelation.new([NodeId.from_node(ni), NodeId.from_node(nj)])
    end

    puts "Jaro method, set produced: #{relations.count} elements" if @verbose
    relations
end