Module: Sent

Defined in:
lib/sent.rb,
lib/sent/main.rb

Defined Under Namespace

Classes: NoConfigError, NoGenesError, ProcessAbortedError, WSError

Constant Summary collapse

# Root of the library: the parent of the directory that contains this file.
@@rootdir =
File.dirname(File.dirname(__FILE__))
# Data, work and temporary directories; all three start out unset (nil).
@@datadir =
@@workdir = @@tmpdir = nil
# WSDL location handed to SOAP::WSDLDriverFactory by the CCC and NMF methods.
@@bionmf_wsdl =
"http://bionmf.dacya.ucm.es/WebService/BioNMFWS.wsdl"

Class Method Summary collapse

Class Method Details

.analyze(prefix, output, clusters = nil, num_words = 15) ⇒ Object



319
320
321
322
323
324
325
326
327
328
329
330
331
332
# File 'lib/sent/main.rb', line 319

# Runs the R function SENT.analyze and collects the files it leaves behind.
#
# Files matching "<output>*.words" and "<output>*.genes" are deleted first,
# then R is invoked, and finally the files matching the same two patterns are
# read (in sorted order) and paired up position by position.
#
# Returns an Array of Hashes of the form {:words => [...], :genes => [...]},
# each list holding the lines of one file.
def self.analyze(prefix, output, clusters = nil, num_words = 15)
  words_pattern = output + '*.words'
  genes_pattern = output + '*.genes'

  # Drop results of earlier runs so that only fresh files are read below.
  stale_files = Dir.glob(words_pattern) + Dir.glob(genes_pattern)
  FileUtils.rm stale_files

  run_R("SENT.analyze('#{ prefix }', '#{ output }', #{clusters || 'NULL'}, #{num_words})")

  # Reads every file matching the pattern, in sorted order, as a list of lines.
  read_lines = lambda do |pattern|
    Dir.glob(pattern).sort.collect { |file| Open.read(file).split(/\n/) }
  end

  word_lists = read_lines.call(words_pattern)
  gene_lists = read_lines.call(genes_pattern)

  word_lists.zip(gene_lists).collect do |word_list, gene_list|
    {:words => word_list, :genes => gene_list}
  end
end

.CCC(matrix, kstart, kend) ⇒ Object



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# File 'lib/sent/main.rb', line 281

# Runs a sample classification on the BioNMF web service for the factor range
# kstart..kend and returns the tab-separated fields of the last line of the
# first result.
#
# matrix - path of the matrix file to upload.
# kstart - first factor; converted with to_i.
# kend   - last factor; converted with to_i. Must be greater than kstart.
#
# While the job is being polled an INT handler is installed that asks the
# service to abort the job. The previous handler is restored and the uploaded
# matrix is cleaned on the service whether polling ends normally or raises.
#
# Raises RuntimeError for an empty or inverted range,
# Sent::ProcessAbortedError if the job was interrupted, and Sent::WSError if
# the service reports status -1.
def self.CCC(matrix, kstart, kend)
  # Compare the converted values: the bounds are sent to the service as
  # integers, and comparing raw strings would order "10" before "2".
  first_k = kstart.to_i
  last_k  = kend.to_i
  raise "Error in range: #{ kstart } to #{ kend }" if first_k >= last_k

  driver = SOAP::WSDLDriverFactory.new(@@bionmf_wsdl).create_rpc_driver

  # Prepare matrix for processing. File.read closes the file handle, which
  # File.open(...).read left to the garbage collector.
  nmf_matrix = driver.upload_matrix(File.read(matrix))

  job_id       = nil
  status       = nil
  aborted      = false
  trap_set     = false
  previous_int = nil

  begin
    driver.preprocess(nmf_matrix, 1, "No", "No", true, true)

    job_id = driver.sample_classification(nmf_matrix, first_k, last_k, 10)

    previous_int = Signal.trap("INT") do
      puts "Aborting bestK process"
      driver.abort(job_id)
      aborted = true
    end
    trap_set = true

    # Status 0 means the job is still running.
    while (status = driver.status(job_id)) == 0
      sleep(5)
    end
  ensure
    # Always give the INT handler back and release the uploaded matrix, even
    # when a web-service call above raised.
    Signal.trap("INT", previous_int) if trap_set
    driver.clean_matrix(nmf_matrix)
  end

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if status == -1
    raise Sent::WSError, "Error processing matrix:\n" + driver.info(job_id)
  end

  results = driver.results(job_id)
  text = driver.get_result(results[0])
  # Plain /\n/: the former /s flag only pinned the regexp to a legacy encoding.
  text.split(/\n/).last.split(/\t/)
end

.datadir ⇒ Object



48
49
50
# File 'lib/sent.rb', line 48

# Returns the data directory stored in @@datadir. It is nil until load_config
# assigns it from the 'datadir' key of a configuration file.
def self.datadir
  @@datadir
end

.dictionary(associations, options = {}) ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/sent/main.rb', line 112

# Builds a TF-IDF dictionary from the literature associated with each gene.
#
# associations - enumerable yielding (gene, pmids) pairs.
# options      - overrides for the defaults passed to the dictionary's
#                weights call: :low => 0.001, :hi => 0.65, :limit => 3000.
#
# For every gene the texts of its articles are joined, split into bigrams and
# added to the dictionary as one bag of words.
#
# Returns a two-element Array: the term => weight Hash, and the result of
# String.stem_list for the distinct single words that make up the terms.
def self.dictionary(associations, options = {})
  defaults     = {:low => 0.001, :hi => 0.65, :limit => 3000}
  dict_options = defaults.merge(options)
  tf_idf       = Dictionary::TF_IDF.new

  String.reset_stem_list

  associations.each do |gene, pmids|
    try3times do
      articles  = PubMed.get_article(pmids)
      full_text = articles.collect { |pmid, article| article.text }.join("\n")
      tf_idf.add(BagOfWords.count(full_text.bigrams))
    end
  end

  weights      = tf_idf.weights(dict_options)
  sorted_terms = weights.keys.sort

  # Terms may be multi-word; stems are looked up for each distinct word.
  words = sorted_terms.flat_map { |term| term.split(/ /) }.uniq

  [weights, String.stem_list(words)]
end

.literature_index(pmids, outfile) ⇒ Object



334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
# File 'lib/sent/main.rb', line 334

# Builds a Ferret index at +outfile+ holding the title and abstract terms of
# the given PubMed articles.
#
# pmids   - list of article ids; duplicates are indexed once.
# outfile - path handed to Ferret as the index location.
#
# Each document stores the article id under :id and, under :title and
# :abstract, the terms reported by BagOfWords.terms, each repeated as many
# times as its count. Articles that fail with a StandardError are reported on
# standard output and skipped. The index is closed even when indexing is
# interrupted.
def self.literature_index(pmids, outfile)

  index = Ferret::Index::Index.new(:path => outfile)

  begin
    index.field_infos.add_field(:title,  :index => :yes, :boost => 0.67)
    index.field_infos.add_field(:abstract, :index => :yes, :boost => 0.33)

    # Expands a term => count mapping into a space-separated string in which
    # every term appears count times.
    expand = lambda do |text|
      BagOfWords.terms(text).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")
    end

    unique_pmids = pmids.uniq

    Progress.monitor("Building index for #{unique_pmids.length} articles")
    unique_pmids.each{|pmid|
      begin
        article = PubMed.get_article(pmid)
        abstract = article.abstract
        title    = article.title

        abstract_content = expand.call(abstract)
        title_content    = expand.call(title)

        # The title goes into the :title field declared above; it used to be
        # stored under :name, which left the boosted :title field empty.
        index << {:id => pmid, :abstract => abstract_content, :title => title_content}
      rescue StandardError => e
        # Best effort: report and carry on with the next article. Interrupt
        # and other non-StandardError exceptions are deliberately not caught.
        puts e.backtrace
        puts e.message
      end

    }
  ensure
    index.close
  end
end

.load_config ⇒ Object



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/sent.rb', line 16

# Loads the SENT configuration and makes sure the configured directories
# exist.
#
# Two YAML files are consulted, in this order, when they exist:
# 'sent.config' under the root directory and '.sent' under $HOME. Values from
# the second file override those from the first. Only the 'datadir',
# 'workdir' and 'tmpdir' keys are used, and a file is ignored unless it
# parses to a Hash.
#
# Raises Sent::NoConfigError when any of the three directories is still
# unset after both files have been processed.
def self.load_config
  config_files = [File.join(@@rootdir, 'sent.config')]
  # HOME may be missing from the environment; File.join would raise on nil.
  config_files << File.join(ENV['HOME'], '.sent') if ENV['HOME']

  config_files.each do |path|
    next unless File.exist?(path)

    config = YAML.load_file(path)
    next unless config.is_a? Hash

    @@datadir = config['datadir'] if config['datadir']
    @@workdir = config['workdir'] if config['workdir']
    @@tmpdir  = config['tmpdir']  if config['tmpdir']
  end

  if @@datadir.nil? || @@workdir.nil? || @@tmpdir.nil?
    raise Sent::NoConfigError, "sent not configured. Edit #{File.join(@@rootdir, 'sent.config')} or $HOME/.sent"
  end

  FileUtils.mkdir_p @@datadir unless File.exist? @@datadir
  FileUtils.mkdir_p @@workdir unless File.exist? @@workdir
  FileUtils.mkdir_p @@tmpdir  unless File.exist? @@tmpdir

end

.matrix(metadoc_file, genes) ⇒ Object



144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/sent/main.rb', line 144

# Extracts from a metadoc file the header line plus the rows of the given
# genes.
#
# metadoc_file - path of a tab-separated file whose first line is a header
#                and whose other lines start with a gene name followed by a
#                tab.
# genes        - collection answering include?; a row is kept when the text
#                before its first tab is included.
#
# Lines without a tab (for instance a blank line) are skipped, and an empty
# file yields an empty string.
#
# Returns the selected lines concatenated into one String.
def self.matrix(metadoc_file, genes)
  kept = []

  File.open(metadoc_file) do |f|
    header = f.gets
    kept << header if header

    # Stream the remaining lines instead of loading the whole file at once.
    f.each_line do |line|
      tab = line.index("\t")
      next unless tab

      kept << line if genes.include?(line[0, tab])
    end
  end

  kept.join
end

.mentions(org) ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/sent/main.rb', line 80

# Finds, for an organism, which articles mention which gene codes.
#
# The organism's literature is split with chunk(100) and each chunk of
# articles is fetched from PubMed. For every article the NER extracts the
# mentions from the text, and the normalizer matches and selects codes for
# each mention.
#
# Returns a Hash mapping each code to the list of pmids of the articles in
# which it was found.
def self.mentions(org)
  ner     = Organism.ner(org, :rner)
  norm    = Organism.norm(org)
  batches = Organism.literature(org).chunk(100)

  associations = {}
  Progress.monitor("Finding gene-article associations in text", :step => 1000)
  batches.each do |batch|
    try3times do
      PubMed.get_article(batch).each do |pmid, article|
        text  = article.text
        found = ner.extract(text)

        Progress.monitor("Resolving mentions", :step => 1000)
        codes = found.collect { |mention|
          norm.select(norm.match(mention), mention, text)
        }.flatten.uniq.sort

        codes.each do |code|
          (associations[code] ||= []) << pmid
        end
      end
    end
  end

  associations
end

.metadoc(associations, terms) ⇒ Object



134
135
136
137
138
139
140
141
142
# File 'lib/sent/main.rb', line 134

# Builds the tab-separated "metadocument" table for a set of genes.
#
# associations - enumerable yielding (gene, pmids) pairs.
# terms        - list of terms; columns follow their sorted order.
#
# The first line is "Entity" followed by the sorted terms. Each following
# line holds a gene and the values BagOfWords.features returns for the joined
# text of its articles. Lines are separated by "\n" with no trailing newline.
#
# Returns the table as a String.
def self.metadoc(associations, terms)
  # Sort once: the order is the same for the header and for every gene row.
  sorted_terms = terms.sort

  header = "%s\t%s\n" % ["Entity", sorted_terms * "\t"]

  rows = associations.collect do |gene, pmids|
    try3times do
      text = PubMed.get_article(pmids).collect{|pmid, article| article.text }.join("\n")
      "%s\t%s" % [gene, BagOfWords.features(text, sorted_terms).join("\t")]
    end
  end

  header + rows.join("\n")
end

.NMF(matrix, out, k, executions = 10) ⇒ Object



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/sent/main.rb', line 159

# Factorizes +matrix+ with the BioNMF web service and joins the results in R.
#
# The matrix is uploaded through a SOAP driver built from @@bionmf_wsdl. Once
# the service reports the upload as done, +executions+ threads are started,
# one second apart. Each thread submits a standardNMF job with +k+ as both
# factor start and factor end, polls until the job is done, and writes the
# two Base64-decoded results to "<out>.matrix_w.<n>" and "<out>.matrix_h.<n>".
# After all threads are joined, SENT.join.results is run in R on +out+ and
# the per-execution matrix files are removed.
#
# Raises RuntimeError if the service reports an error for the uploaded
# matrix, Sent::ProcessAbortedError if an INT signal arrived while the
# threads were being joined, and Exception if any job reported an error.
def self.NMF(matrix, out, k, executions = 10)
  driver = SOAP::WSDLDriverFactory.new( @@bionmf_wsdl).create_rpc_driver

  # Upload matrix
  nmf_matrix = driver.upload_matrix(
     matrix,   # matrix
     false,    # binary 
     true,     # column labels 
     true,     # row labels
     true,     # transpose
     "No",     # positive
     "No",     # normalization 
     "matrix") # Suggested name

  # Wait until the service has finished processing the uploaded matrix
  while !driver.done(nmf_matrix)
    sleep(5)
  end

  if driver.error(nmf_matrix) 
    # NOTE(review): this assignment is not read before the raise below.
    error = driver.messages(nmf_matrix).join("\n")
    raise "Error pre-processing matrix!"  + driver.messages(nmf_matrix).join("\n")
  end

  # Send several executions in parallel, one thread per execution
  threads = []
  # Shared by all threads: holds the messages of the last job that failed.
  # NOTE(review): it is not cleared when a later retry of that job succeeds,
  # so a recovered failure still makes the method raise at the end.
  error = nil
  executions.times{|i|
    threads << Thread.new(i){ |num|
      # Number of retries left for this execution.
      times = 3
      begin
        
        job_id = driver.standardNMF(
          nmf_matrix, # Matrix job
          "Standard", # Algorithm
          k,          # Factor Start
          k,          # Factor End
          1,          # Runs
          2000,       # Iterations
          40,         # Stop criteria
          0,          # Not used (nsnmf smoothness)
          false,      # extra info
          nmf_matrix)         # Suggested name

        # Poll the job every five seconds until the service marks it done.
        while !driver.done(job_id)
          sleep(5)
        end

        if driver.error(job_id) 
          error = driver.messages(job_id).join("\n")
          # This RuntimeError is caught by the `rescue Exception` clause
          # below, which retries the whole execution.
          raise "Error in NMF"  + driver.messages(job_id).join("\n")
        end

        results =  driver.results(job_id)
        
        # First result goes to the W file, second to the H file.
        File.open(out + ".matrix_w.#{num}",'w') do |f| 
          f.write Base64.decode64 driver.result(results[0]) #.sub(/\t(.*)\t$/,'\1')
        end

        File.open(out + ".matrix_h.#{num}",'w') do |f|
          f.write Base64.decode64 driver.result(results[1]) #.sub(/\t(.*)\t$/,'\1')
        end

        driver.clean(job_id)
      rescue Sent::ProcessAbortedError
        # Raised into the thread by the INT handler installed further down.
        # NOTE(review): job_id is still nil here if the abort arrives before
        # standardNMF has returned.
        puts "Process aborted for #{ num }"
        driver.abort(job_id)
      rescue Timeout::Error
        if times > 0
          times -= 1
          sleep 2
          retry
        else
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } timed out"
        end
      rescue Exception
        puts $!.message
        if times > 0
          times -= 1
          puts "Retrying thread #{ num }"
          retry
        else

          puts "NMF Execution #{ num } Produced Exception"
          puts $!.class
          puts $!.message
          puts $!.backtrace
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } Produced Exception"
        end
      ensure
        # NOTE(review): ending the thread from the ensure clause presumably
        # keeps the errors raised in the rescue clauses above from reaching
        # `join` -- confirm this is intended.
        Thread.exit
      end
    }
    # Stagger the job submissions by one second.
    sleep 1

  }

  # Allow threads to be aborted
  aborted = false
  old_int = Signal.trap("INT") do
    STDERR.puts "Killing threads"
    threads.each{|t| t.raise Sent::ProcessAbortedError, "Process Aborted"}
    aborted = true
  end

  threads.each { |aThread|  aThread.join }

  # Restore the previous INT handler and release the matrix on the service.
  Signal.trap("INT", old_int)
  driver.clean(nmf_matrix)

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if error
    raise Exception, "Error in NMF:\n" + error
  end
  
  run_R("SENT.join.results('#{ out }')")

  # Remove the per-execution W and H files written by the threads.
  FileUtils.rm Dir.glob(out + '.matrix_*.*')
end

.rdir ⇒ Object



63
64
65
# File 'lib/sent.rb', line 63

# Returns the path of the 'R' directory under the library root; run_R sources
# 'matrix.R' from it.
def self.rdir
  File.join(@@rootdir, 'R')
end

.rootdir ⇒ Object



58
59
60
# File 'lib/sent.rb', line 58

# Returns the library root directory stored in @@rootdir.
def self.rootdir
  @@rootdir
end

.run_R(command) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/sent/main.rb', line 63

# Runs a command in a fresh "R --vanilla --slave" process after sourcing
# 'matrix.R' from Sent.rdir.
#
# command - R code to evaluate; a ';' and a newline are appended.
#
# Returns the process' standard output followed by its standard error, and
# also prints that text when it is not empty.
#
# Raises Sent::ProcessAbortedError, carrying the collected output, when the
# process does not finish with exit status 0.
def self.run_R(command)
  pid, stdin, stdout, stderr = Open4::popen4 "R --vanilla --slave"

  begin
    stdin.write "source('#{File.join(Sent.rdir,'matrix.R')}');\n"
    stdin.write "#{ command };\n"
    stdin.close

    # Drain both pipes BEFORE waiting for the process: a child that fills a
    # pipe buffer blocks until somebody reads it, so waiting first can hang
    # forever. stderr is read in a thread so neither pipe can stall the other.
    stderr_reader = Thread.new { stderr.read }
    output = stdout.read
    errors = stderr_reader.value

    Process.wait pid
    status = $?
  ensure
    [stdin, stdout, stderr].each { |io| io.close unless io.closed? }
  end

  result = output + errors
  raise Sent::ProcessAbortedError, "Error in R process: #{result}" if status.exitstatus != 0

  puts result if result != ""
  result
end

.search_index(words, index) ⇒ Object



361
362
363
364
365
366
367
368
369
370
371
# File 'lib/sent/main.rb', line 361

# Searches a Ferret index for a list of words.
#
# words - list of Strings; each one is stemmed with String#stem and the stems
#         are joined with spaces to form the query.
# index - path of the Ferret index to open.
#
# At most 8000 hits are requested and hits scoring 0.0001 or less are
# dropped. The index is closed before returning, also when the search raises.
#
# Returns an Array of [id, score] pairs, where id is the :id field stored in
# the matching document.
def self.search_index(words, index)
  # Keep the path parameter and the opened index under different names.
  ferret = Ferret::Index::Index.new(:path => index)

  ranks = []
  begin
    query = words.collect{|w| w.stem}.join(" ")

    ferret.search_each(query, :limit => 8000) do |id,score|
      next unless score > 0.0001
      ranks << [ferret[id][:id],score]
    end
  ensure
    # The index used to be left open after every search.
    ferret.close
  end

  ranks
end

.tmpdir ⇒ Object



54
55
56
# File 'lib/sent.rb', line 54

# Returns the temporary directory stored in @@tmpdir. It is nil until
# load_config assigns it from the 'tmpdir' key of a configuration file.
def self.tmpdir
  @@tmpdir
end

.workdir ⇒ Object



51
52
53
# File 'lib/sent.rb', line 51

# Returns the work directory stored in @@workdir. It is nil until load_config
# assigns it from the 'workdir' key of a configuration file.
def self.workdir
  @@workdir
end