Top Level Namespace
Defined Under Namespace
Modules: BLEU, DAG, HG, TFIDF
Classes: Array, BooleanSemiring, CountingSemiring, InsideSemiring, Integer, PriorityQueue, ReadFile, RealSemiring, RealxSemiring, Semiring, SparseVector, String, Translation, ViterbiLogSemiring, ViterbiSemiring, WriteFile
Instance Method Summary
collapse
-
#bag_of_words(s, stopwords = []) ⇒ Object
-
#cdec_kbest(cdec_bin, input, ini, weights, k, unique = true) ⇒ Object
-
#ngrams(s, n, fix = false) ⇒ Object
-
#read_config(fn) ⇒ Object
-
#read_kbest_lists(fn, translation_type = Translation) ⇒ Object
-
#read_phrase_table(fn) ⇒ Object
-
#spawn_with_timeout(cmd, t = 4, ignore_fail = false, debug = false) ⇒ Object
-
#splitpipe(s, n = 3) ⇒ Object
-
#tokenize(s) ⇒ Object
Instance Method Details
#bag_of_words(s, stopwords = []) ⇒ Object
15
16
17
|
# File 'lib/zipf/stringutil.rb', line 15
# Returns the distinct whitespace-separated words of +s+ in sorted order,
# with every word that occurs in +stopwords+ removed.
#
# s         - String to split into words.
# stopwords - collection of words to leave out (anything answering include?).
def bag_of_words s, stopwords=[]
  vocabulary = s.strip.split.uniq.sort
  vocabulary.reject { |word| stopwords.include?(word) }
end
|
#cdec_kbest(cdec_bin, input, ini, weights, k, unique = true) ⇒ Object
89
90
91
92
93
94
95
96
97
|
# File 'lib/zipf/misc.rb', line 89
# Runs +cdec_bin+ on +input+ and parses every line of its standard output
# into a Translation.
#
# cdec_bin - command to execute (interpolated into a shell command line).
# input    - text handed to the command on its standard input.
# ini      - value passed after -c.
# weights  - value passed after -w.
# k        - value passed after -k.
# unique   - when true, the -r flag is appended to the command line.
#
# Returns an Array of Translation objects, one per output line; each is
# filled via Translation#from_s(line, false, index) with a 0-based index.
# The command's standard error is discarded.
def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
  require 'open3'
  cmd = "#{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
  cmd += " -r" if unique
  # The input is written to the child's stdin instead of being spliced into
  # an `echo "..."` shell fragment: quotes, `$` or backticks in the sentence
  # would otherwise be expanded (or executed) by the shell. The trailing
  # newline mirrors what `echo` used to append.
  o,_ = Open3.capture2("#{cmd} 2>/dev/null", :stdin_data => "#{input}\n")
  o.split("\n").each_with_index.map { |line, j|
    t = Translation.new
    t.from_s(line, false, j)
    t
  }
end
|
#ngrams(s, n, fix = false) ⇒ Object
5
6
7
8
9
10
11
12
13
|
# File 'lib/zipf/stringutil.rb', line 5
# Yields the n-grams of the tokens of +s+ (as produced by tokenize).
#
# s   - String to tokenize.
# n   - maximum n-gram length.
# fix - when false, every n-gram of length 1..n is yielded for each start
#       position; when true, only n-grams of exactly length n are yielded.
#
# Each n-gram is yielded as an Array of token Strings. For every start
# position, shorter n-grams are yielded before longer ones.
#
# Returns the token Array when a block is given, otherwise an Enumerator
# over the n-grams (previously a missing block raised LocalJumpError).
def ngrams(s, n, fix=false)
  return enum_for(:ngrams, s, n, fix) unless block_given?
  tokens = tokenize s
  tokens.each { |tok| tok.strip! }
  tokens.each_index { |i|
    # Longest n-gram that still fits between position i and the end.
    longest = [n, tokens.size-i].min
    if fix
      # Only a full-length window qualifies; skip building shorter slices.
      yield tokens[i, n] if n > 0 && longest == n
    else
      1.upto(longest) { |len| yield tokens[i, len] }
    end
  }
  tokens
end
|
#read_config(fn) ⇒ Object
99
100
101
102
103
104
105
106
107
108
109
110
111
112
|
# File 'lib/zipf/misc.rb', line 99
# Reads a "key = value" configuration file into a Hash.
#
# fn - path handed to ReadFile.new.
#
# Blank lines and lines whose first non-blank character is '#' are skipped.
# On the remaining lines everything from the first '#' on is dropped, the
# rest is split at the first '=', and both sides are stripped of
# surrounding whitespace.
#
# Returns a Hash mapping key Strings to value Strings (later keys overwrite
# earlier ones).
# Raises ArgumentError for a line without '=' (this used to surface as a
# NoMethodError on nil).
def read_config fn
  f = ReadFile.new fn
  cfg = {}
  begin
    while line = f.gets
      line.strip!
      next if line.empty? || line.start_with?('#')
      content = line.split('#', 2).first
      k, v = content.split(/\s*=\s*/, 2)
      raise ArgumentError, "malformed config line in #{fn}: #{line.inspect}" if v.nil?
      cfg[k.strip] = v.strip
    end
  ensure
    # The handle was previously never closed, not even on success.
    f.close
  end
  cfg
end
|
#read_kbest_lists(fn, translation_type = Translation) ⇒ Object
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
|
# File 'lib/zipf/Translation.rb', line 45
# Reads a file of k-best entries and groups consecutive lines that share
# the same leading id into one list each.
#
# fn               - path handed to ReadFile.new.
# translation_type - class instantiated per line; instances must respond to
#                    from_s(line) and id=.
#
# The group id of a line is the integer value of its first splitpipe field.
# Within a group, entries receive ids 0, 1, 2, ... in file order.
#
# Returns an Array of Arrays of translation_type instances. An empty file
# yields an empty Array.
def read_kbest_lists fn, translation_type=Translation
  kbest_lists = []
  cur = []
  prev = -1
  id = 0
  f = ReadFile.new fn
  begin
    while line = f.gets
      t = translation_type.new
      t.from_s line
      c = splitpipe(line)[0].to_i
      if c != prev
        # A new group starts: flush the one collected so far.
        kbest_lists << cur unless cur.empty?
        cur = []
        prev = c
        id = 0
      end
      t.id = id
      cur << t
      id += 1
    end
  ensure
    f.close
  end
  # Flush the final group; skipped when nothing was read at all.
  kbest_lists << cur unless cur.empty?
  kbest_lists
end
|
#read_phrase_table(fn) ⇒ Object
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
|
# File 'lib/zipf/misc.rb', line 73
# Reads a phrase table file into a Hash keyed by source phrase.
#
# fn - path handed to ReadFile.new.
#
# Every line is split with splitpipe into three fields: source phrase,
# target phrase and a feature string; the feature string is converted with
# SparseVector.from_kv.
#
# Returns a Hash mapping each source phrase to an Array of
# [target phrase, feature map] pairs, in file order.
def read_phrase_table fn
  table = {}
  f = ReadFile.new fn
  begin
    while raw_rule = f.gets
      french, english, features = splitpipe(raw_rule)
      feature_map = SparseVector.from_kv features
      (table[french] ||= []) << [english, feature_map]
    end
  ensure
    # Close the handle even when a line fails to parse.
    f.close
  end
  table
end
|
#spawn_with_timeout(cmd, t = 4, ignore_fail = false, debug = false) ⇒ Object
60
61
62
63
64
65
66
67
68
69
70
71
|
# File 'lib/zipf/misc.rb', line 60
# Runs +cmd+ as a child process and returns what it wrote to standard
# output, waiting at most +t+ seconds for it to exit.
#
# cmd         - command line handed to Process.spawn.
# t           - timeout in seconds.
# ignore_fail - when false, a child still running after +t+ seconds is sent
#               SIGTERM; when true, it is left running.
# debug       - when true, the command line is echoed to STDERR first.
#
# Returns the child's standard output as a String. Reading continues until
# the write end of the pipe is closed by the child (and anything it spawned
# that inherited it).
def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
  STDERR.write cmd+"\n" if debug
  pipe_in, pipe_out = IO.pipe
  pid = Process.spawn(cmd, :out => pipe_out)
  # Close our copy of the write end right away so the reader sees EOF as
  # soon as the child side is done.
  pipe_out.close
  # Drain the pipe while waiting: a child that writes more than the pipe
  # buffer holds would otherwise block and be mistaken for a timeout.
  reader = Thread.new { pipe_in.read }
  begin
    Timeout.timeout(t) { Process.wait pid }
  rescue Timeout::Error
    begin
      Process.kill('TERM', pid) if !ignore_fail
    rescue Errno::ESRCH
      # The child exited on its own in the meantime.
    end
    # Reap the child in the background so it does not linger as a zombie.
    Process.detach(pid)
  end
  reader.value
ensure
  pipe_out.close if pipe_out && !pipe_out.closed?
  pipe_in.close if pipe_in && !pipe_in.closed?
end
|
#splitpipe(s, n = 3) ⇒ Object
19
20
21
|
# File 'lib/zipf/stringutil.rb', line 19
# Splits +s+ at every run of +n+ pipe characters ("|||" by default).
#
# Only the string as a whole is stripped beforehand; whitespace around the
# individual fields is kept.
def splitpipe s, n=3
  delimiter = "|" * n
  s.strip.split(delimiter)
end
|
#tokenize(s) ⇒ Object
1
2
3
|
# File 'lib/zipf/stringutil.rb', line 1
# Splits +s+ into whitespace-separated tokens after stripping it.
#
# Returns an Array of Strings (empty for a blank string).
def tokenize s
  trimmed = s.strip
  trimmed.split
end
|