Class: StreamRunner

Inherits:
Object
  • Object
show all
Defined in:
lib/stream_runner.rb

Instance Method Summary collapse

Instance Method Details

#expand_path(file) ⇒ Object



31
32
33
34
35
36
37
38
39
40
# File 'lib/stream_runner.rb', line 31

# Resolves +file+ to a path that exists on disk.
#
# +file+ is returned unchanged when it already exists as given. Otherwise
# every directory listed in the RUBYLIB environment variable, followed by the
# directory containing this source file, is searched for it, and the first
# match is returned.
#
# @param file [String] path or bare file name to resolve
# @return [String] +file+ itself, or "<dir>/<file>" for the first search
#   directory that contains it
# @raise [RuntimeError] when the file is found in none of the locations
def expand_path(file)
  return file if File.exist?(file)

  # This file's directory is always the last candidate, so an unset RUBYLIB
  # simply leaves it as the only place searched.
  search_dirs = ENV.fetch('RUBYLIB', '').split(File::PATH_SEPARATOR)
  search_dirs << File.dirname(__FILE__)

  search_dirs.uniq.each do |dir|
    candidate = "#{dir}/#{file}"
    # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
    return candidate if File.exist?(candidate)
  end

  raise "Cannot resolve path to #{file}. Is it in RUBYLIB?"
end

#expand_paths(extra) ⇒ Object



41
42
43
44
# File 'lib/stream_runner.rb', line 41

# Resolves every entry of +extra+ with #expand_path.
#
# @param extra [Array<String>] paths or file names to resolve
# @return [Array<String>] a new array holding the resolved path of each
#   entry, in the same order
# @raise [RuntimeError] propagated from #expand_path when an entry cannot be
#   resolved
def expand_paths(extra)
  extra.map { |path| expand_path(path) }
end

#run_hadoop_stream(input, out, mapper, reducer, reducers, extra, map_opts, reduce_opts, opts) ⇒ Object



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/stream_runner.rb', line 46

# Builds a Hadoop streaming command line and runs it with +system+.
#
# The first word of +mapper+ and of +reducer+ (the script name) is shipped
# with the job via "-file", together with everything in +extra+; all of them
# are resolved through #expand_paths. When +reducer+ is nil a map-only job is
# built: output goes to "NONE", mapred.reduce.tasks is 0, and +reduce_opts+
# and +reducers+ are not used.
#
# @param input [String, Array<String>] one input path, or several
# @param out [String] output path for the job (used only with a reducer)
# @param mapper [String] mapper script, optionally followed by arguments
# @param reducer [String, nil] reducer script with optional arguments, or
#   nil for a map-only job
# @param reducers [Integer] value passed as mapred.reduce.tasks
# @param extra [Array<String>, nil] additional files to ship with the job;
#   the array is not modified
# @param map_opts [Hash] name/value pairs emitted as "-jobconf name=value"
#   after the mapper
# @param reduce_opts [Hash] name/value pairs emitted as "-jobconf name=value"
#   after the reducer
# @param opts [Hash] +:verbose+ (presence of the key adds "-verbose" and
#   prints the command) and +:hadoop_opts+ (string appended to the command)
# @return [true, false, nil] the result of +system+
def run_hadoop_stream(input, out, mapper, reducer, reducers, extra,
  map_opts, reduce_opts, opts)
  # Collect the scripts into a fresh list: appending to +extra+ would leak
  # the additions back into the caller's array, and a nil reducer must be
  # skipped here so the map-only branch below is reachable.
  scripts = [mapper, reducer].compact.map { |command| command.split(' ').first }
  files = (Array(extra) + scripts).uniq
  extras = expand_paths(files).map { |path| "-file #{path} " }.join

  map_opt = map_opts.map { |name, value| "-jobconf #{name}=#{value} " }.join
  reduce_opt = reduce_opts.map { |name, value| "-jobconf #{name}=#{value} " }.join

  inputs = input.is_a?(Array) ? input : [input]
  input_args = inputs.map { |path| "-input #{path}" }.join(' ')

  cmd =
    if reducer.nil?
      "hadoop jar #{HADOOP_STREAMING} " \
        "#{input_args} " \
        "-output NONE " \
        "-mapper \"ruby #{mapper}\" " \
        "-jobconf mapred.reduce.tasks=0 " \
        "#{map_opt}" \
        "#{extras}"
    else
      "hadoop jar #{HADOOP_STREAMING} " \
        "#{input_args} " \
        "-output #{out} " \
        "-mapper \"ruby #{mapper}\" " \
        "#{map_opt}" \
        "-reducer \"ruby #{reducer}\" " \
        "-jobconf mapred.reduce.tasks=#{reducers} " \
        "#{reduce_opt}" \
        "#{extras}"
    end

  # Only the presence of the keys matters here, not their values.
  cmd += " -verbose " if opts.key?(:verbose)
  cmd += " #{opts[:hadoop_opts]}" if opts.key?(:hadoop_opts)
  puts cmd if opts.key?(:verbose)
  system(cmd)
end

#run_map_reduce(input, out, map, reduce, reducers, extra, map_opts = {}, reduce_opts = {}, opts = {}) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
# File 'lib/stream_runner.rb', line 87

# Runs a streaming job from a clean slate and copies its part files locally.
#
# Any previous output at +out+ is removed from the Hadoop file system, the
# local directory "out/<out>" is recreated empty, the job is launched through
# #run_hadoop_stream, and finally "part-00000" .. "part-<reducers-1>" are
# copied from +out+ into "out/<out>" with "hadoop fs -cat".
#
# @param input [String, Array<String>] job input, passed to #run_hadoop_stream
# @param out [String] job output path; also names the local directory
#   under "out/"
# @param map [String] mapper command, passed to #run_hadoop_stream
# @param reduce [String] reducer command, passed to #run_hadoop_stream
# @param reducers [Integer] number of reducers, and of part files fetched
# @param extra [Array<String>] extra files, passed to #run_hadoop_stream
# @param map_opts [Hash] passed to #run_hadoop_stream
# @param reduce_opts [Hash] passed to #run_hadoop_stream
# @param opts [Hash] passed to #run_hadoop_stream
# @return [Range] the range of part indices that were fetched
def run_map_reduce(input, out, map, reduce, reducers, extra,
  map_opts = {}, reduce_opts = {}, opts = {})
  local_dir = "out/#{out}"

  # Clear earlier results, both on the Hadoop file system and locally.
  system("hadoop fs -rmr #{out}")
  system("rm -rf #{local_dir}")
  system("mkdir -p #{local_dir}")

  run_hadoop_stream(input, out, map, reduce, reducers, extra,
    map_opts, reduce_opts, opts)

  # Part files are numbered from zero with five zero-padded digits.
  (0..reducers - 1).each do |index|
    part = "part-#{format('%05d', index)}"
    system("hadoop fs -cat #{out}/#{part} >#{local_dir}/#{part}")
  end
end