Module: Sent
- Defined in:
- lib/sent.rb,
lib/sent/main.rb
Defined Under Namespace
Classes: NoConfigError, NoGenesError, ProcessAbortedError, WSError
Constant Summary collapse
- @@rootdir =
File.dirname(File.dirname(__FILE__))
- @@datadir =
@@workdir = @@tmpdir = nil
- @@bionmf_wsdl =
"http://bionmf.dacya.ucm.es/WebService/BioNMFWS.wsdl"
Class Method Summary collapse
- .analyze(prefix, output, clusters = nil, num_words = 15) ⇒ Object
- .CCC(matrix, kstart, kend) ⇒ Object
- .datadir ⇒ Object
- .dictionary(associations, options = {}) ⇒ Object
- .literature_index(pmids, outfile) ⇒ Object
- .load_config ⇒ Object
- .matrix(metadoc_file, genes) ⇒ Object
- .mentions(org) ⇒ Object
- .metadoc(associations, terms) ⇒ Object
- .NMF(matrix, out, k, executions = 10) ⇒ Object
- .rdir ⇒ Object
- .rootdir ⇒ Object
- .run_R(command) ⇒ Object
- .search_index(words, index) ⇒ Object
- .tmpdir ⇒ Object
- .workdir ⇒ Object
Class Method Details
.analyze(prefix, output, clusters = nil, num_words = 15) ⇒ Object
319 320 321 322 323 324 325 326 327 328 329 330 331 332 |
# Runs the SENT.analyze R routine over +prefix+, writing per-cluster
# *.words and *.genes files under +output+, and returns the parsed groups.
#
# prefix    - path prefix of the NMF results to analyze
# output    - path prefix where the R code writes its cluster files
# clusters  - number of clusters, or nil to let R decide (passed as NULL)
# num_words - number of representative words per cluster
#
# Returns an Array of Hashes, one per cluster: {:words => [...], :genes => [...]}.
def self.analyze(prefix, output, clusters = nil, num_words = 15)
  # Remove stale result files so the globs below only see fresh output.
  stale = Dir.glob(output + '*.words') + Dir.glob(output + '*.genes')
  FileUtils.rm stale

  run_R("SENT.analyze('#{ prefix }', '#{ output }', #{clusters || 'NULL'}, #{num_words})")

  word_lists = Dir.glob(output + '*.words').sort.collect { |file| Open.read(file).split(/\n/) }
  gene_lists = Dir.glob(output + '*.genes').sort.collect { |file| Open.read(file).split(/\n/) }

  # Sorting both globs keeps words and genes aligned cluster-by-cluster.
  word_lists.zip(gene_lists).collect do |word_list, gene_list|
    { :words => word_list, :genes => gene_list }
  end
end
.CCC(matrix, kstart, kend) ⇒ Object
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 |
# Estimates the best factorization rank via the bioNMF web service's
# sample-classification (cophenetic correlation) routine.
#
# matrix - path to the matrix file to upload
# kstart - first k to try (must be < kend)
# kend   - last k to try
#
# Returns the fields of the last line of the service's first result file.
# Raises Sent::ProcessAbortedError on SIGINT, Sent::WSError on service error.
def self.CCC(matrix, kstart, kend)
  raise "Error in range: #{ kstart } to #{ kend }" if kstart >= kend

  driver = SOAP::WSDLDriverFactory.new(@@bionmf_wsdl).create_rpc_driver

  # Prepare matrix for processing.
  # FIX: File.read closes the handle; File.open(...).read leaked it.
  nmf_matrix = driver.upload_matrix(File.read(matrix))
  driver.preprocess(nmf_matrix, 1, "No", "No", true, true)

  job_id = driver.sample_classification(nmf_matrix, kstart.to_i, kend.to_i, 10)

  # Let Ctrl-C abort the remote job instead of killing us mid-poll.
  aborted = false
  old_int = Signal.trap("INT") do
    puts "Aborting bestK process"
    driver.abort(job_id)
    aborted = true
  end

  # Poll until the service reports a terminal status (0 means running).
  while (status = driver.status(job_id)) == 0
    sleep(5)
  end

  driver.clean_matrix(nmf_matrix)
  Signal.trap("INT", old_int)

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if status == -1
    raise Sent::WSError, "Error processing matrix:\n" + driver.info(job_id)
  end

  results = driver.results(job_id)
  text = driver.get_result(results[0])

  # The CCC values live on the last line, tab-separated.
  text.split(/\n/s).last.split(/\t/)
end
.datadir ⇒ Object
48 49 50 |
# Configured data directory (set by load_config or defaults).
def self.datadir
  @@datadir
end
.dictionary(associations, options = {}) ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# Builds a TF-IDF dictionary of bigrams from the literature associated
# with each gene.
#
# associations - Hash-like mapping gene => list of PubMed ids
# options      - weighting options; defaults {:low => 0.001, :hi => 0.65,
#                :limit => 3000} are merged under caller-supplied values
#
# Returns [term_weights, stems]: the term => weight Hash and the stem
# list for the individual words of the selected terms.
#
# FIX: the parameter and merge target were the stripped identifier
# +options+ (see the .dictionary signature in the class summary).
def self.dictionary(associations, options = {})
  options = {:low => 0.001, :hi => 0.65, :limit => 3000}.merge options

  dict = Dictionary::TF_IDF.new
  String.reset_stem_list

  associations.each do |gene, pmids|
    try3times do
      text = PubMed.get_article(pmids).collect{|pmid, article| article.text }.join("\n")
      dict.add(BagOfWords.count(text.bigrams))
    end
  end

  # NOTE(review): the receiver argument was stripped in the garbled
  # source; +options+ is the only otherwise-unused value here — confirm
  # against upstream that weights takes the options hash.
  term_weights = dict.weights(options)

  terms = term_weights.keys.sort
  stems = String.stem_list(terms.collect{|p| p.split(/ /) }.flatten.uniq)

  [term_weights, stems]
end
.literature_index(pmids, outfile) ⇒ Object
334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 |
# Builds a Ferret full-text index over the titles and abstracts of the
# given PubMed articles, writing it to +outfile+.
#
# pmids   - list of PubMed ids (duplicates are skipped)
# outfile - path for the Ferret index
#
# Errors fetching or indexing a single article are logged and skipped.
def self.literature_index(pmids, outfile)
  index = Ferret::Index::Index.new(:path => outfile)

  # Titles are weighted higher than abstracts when searching.
  index.field_infos.add_field(:title, :index => :yes, :boost => 0.67)
  index.field_infos.add_field(:abstract, :index => :yes, :boost => 0.33)

  Progress.monitor("Building index for #{pmids.length} articles")
  pmids.uniq.each{|pmid|
    begin
      article = PubMed.get_article(pmid)
      abstract = article.abstract
      title = article.title

      # Repeat each term +n+ times so Ferret sees raw term frequencies.
      abstract_content = BagOfWords.terms(abstract).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")
      title_content = BagOfWords.terms(title).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")

      # NOTE(review): the declared fields are :title/:abstract but the
      # document stores :name — kept as-is for compatibility; confirm
      # whether :name should be :title.
      index << {:id => pmid, :abstract => abstract_content, :name => title_content}
    rescue StandardError
      # FIX: restored the stripped $!.message; narrowed rescue Exception
      # to StandardError so signals/exits still propagate.
      puts $!.backtrace
      puts $!.message
    end
  }
  index.close
end
.load_config ⇒ Object
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# Loads configuration from <rootdir>/sent.config and then $HOME/.sent
# (the latter overrides), populating @@datadir, @@workdir and @@tmpdir,
# and creates those directories if missing.
#
# Raises Sent::NoConfigError when any of the three paths is still unset.
def self.load_config
  if File.exist?(File.join(@@rootdir, 'sent.config'))
    # FIX: was @@rotdir (typo) — would raise NameError whenever the
    # gem-level config file existed.
    config = YAML.load_file(File.join(@@rootdir, 'sent.config'))
    if config.is_a? Hash
      @@datadir = config['datadir'] if config['datadir']
      @@workdir = config['workdir'] if config['workdir']
      @@tmpdir  = config['tmpdir']  if config['tmpdir']
    end
  end

  # Per-user config overrides the gem-level one.
  if File.exist?(File.join(ENV['HOME'], '.sent'))
    config = YAML.load_file(File.join(ENV['HOME'], '.sent'))
    if config.is_a? Hash
      @@datadir = config['datadir'] if config['datadir']
      @@workdir = config['workdir'] if config['workdir']
      @@tmpdir  = config['tmpdir']  if config['tmpdir']
    end
  end

  if @@datadir.nil? || @@workdir.nil? || @@tmpdir.nil?
    # FIX: was Sent::NoConfig — the class defined in this module is
    # NoConfigError (see Defined Under Namespace).
    raise Sent::NoConfigError, "sent not configured. Edit #{File.join(@@rootdir, 'sent.config')} or $HOME/.sent"
  end

  FileUtils.mkdir_p @@datadir unless File.exist? @@datadir
  FileUtils.mkdir_p @@workdir unless File.exist? @@workdir
  FileUtils.mkdir_p @@tmpdir  unless File.exist? @@tmpdir
end
.matrix(metadoc_file, genes) ⇒ Object
144 145 146 147 148 149 150 151 152 153 154 155 156 |
# Builds a reduced metadoc matrix: the header line plus only the rows
# whose first tab-separated field (the gene) appears in +genes+.
#
# metadoc_file - path to a tab-separated metadoc file whose first line
#                is a header (FIX: parameter name restored from the
#                stripped identifier; see the .matrix summary signature)
# genes        - collection of gene identifiers to keep (uses include?)
#
# Returns the filtered matrix as a single String.
def self.matrix(metadoc_file, genes)
  filtered = ""
  File.open(metadoc_file) do |f|
    header = f.gets
    # Guard: an empty file made the original append nil and crash.
    filtered << header unless header.nil?
    f.read.each_line do |line|
      gene = line.match(/^(.*?)\t/)
      # Skip malformed lines with no tab instead of crashing on nil.
      next if gene.nil?
      filtered << line if genes.include? gene[1]
    end
  end
  filtered
end
.mentions(org) ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# Scans the organism's literature for gene mentions and returns a Hash
# mapping each normalized gene code to the PubMed ids mentioning it.
#
# org - organism identifier understood by Organism.ner/norm/literature
def self.mentions(org)
  ner  = Organism.ner(org, :rner)
  norm = Organism.norm(org)
  article_ids = Organism.literature(org)
  id_chunks = article_ids.chunk(100)

  associations = {}
  Progress.monitor("Finding gene-article associations in text", :step => 1000)
  id_chunks.each do |chunk|
    try3times do
      PubMed.get_article(chunk).each do |pmid, article|
        text = article.text
        found = ner.extract(text)

        Progress.monitor("Resolving mentions", :step => 1000)
        codes = found.collect do |mention|
          candidates = norm.match(mention)
          norm.select(candidates, mention, text)
        end.flatten.uniq.sort

        codes.each do |code|
          (associations[code] ||= []) << pmid
        end
      end
    end
  end

  associations
end
.metadoc(associations, terms) ⇒ Object
134 135 136 137 138 139 140 141 142 |
# Renders the metadoc matrix as tab-separated text: a header line
# ("Entity" plus the sorted terms) followed by one feature row per gene.
#
# associations - Hash-like mapping gene => list of PubMed ids
# terms        - dictionary terms used as feature columns
#
# Returns the matrix as a String.
#
# FIX: the method name was the stripped identifier +metadoc+ (see the
# .metadoc(associations, terms) entry in the class summary).
def self.metadoc(associations, terms)
  # Hoisted: terms.sort was computed once per gene in the original.
  sorted_terms = terms.sort

  "%s\t%s\n" % ["Entity", sorted_terms * "\t"] +
    associations.collect do |gene, pmids|
      try3times do
        text = PubMed.get_article(pmids).collect{|pmid, article| article.text }.join("\n")
        "%s\t%s" % [gene, BagOfWords.features(text, sorted_terms).join("\t")]
      end
    end * "\n"
end
.NMF(matrix, out, k, executions = 10) ⇒ Object
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 |
# Runs rank-k NMF on the bioNMF web service, launching +executions+
# parallel runs, saving each W/H matrix pair to files named
# "<out>.matrix_w.<i>" / "<out>.matrix_h.<i>", then joining results
# with R and removing the intermediates.
#
# matrix     - matrix content to upload
# out        - output path prefix
# k          - factorization rank (used as both factor start and end)
# executions - number of parallel NMF runs
#
# Raises Sent::ProcessAbortedError on SIGINT or per-run failure, and
# Exception with the accumulated service error message on NMF errors.
def self.NMF(matrix, out, k, executions = 10)
  driver = SOAP::WSDLDriverFactory.new(@@bionmf_wsdl).create_rpc_driver

  # Upload matrix
  nmf_matrix = driver.upload_matrix(
    matrix,   # matrix
    false,    # binary
    true,     # column labels
    true,     # row labels
    true,     # transpose
    "No",     # positive
    "No",     # normalization
    "matrix") # Suggested name

  sleep(5) while !driver.done(nmf_matrix)

  if driver.error(nmf_matrix)
    # NOTE(review): accessor restored from a garbled source line
    # (driver.<stripped>(...)); confirm it is +messages+ in the WSDL.
    error = driver.messages(nmf_matrix).join("\n")
    raise "Error pre-processing matrix!" + error
  end

  # Send several executions in parallel
  threads = []
  error = nil
  executions.times do |i|
    threads << Thread.new(i) do |num|
      times = 3
      begin
        job_id = driver.standardNMF(
          nmf_matrix, # Matrix job
          "Standard", # Algorithm
          k,          # Factor Start
          k,          # Factor End
          1,          # Runs
          2000,       # Iterations
          40,         # Stop criteria
          0,          # Not used (nsnmf smoothness)
          false,      # extra info
          nmf_matrix) # Suggested name

        sleep(5) while !driver.done(job_id)

        if driver.error(job_id)
          error = driver.messages(job_id).join("\n")
          raise "Error in NMF" + error
        end

        results = driver.results(job_id)
        File.open(out + ".matrix_w.#{num}", 'w') do |f|
          f.write Base64.decode64 driver.result(results[0])
        end
        File.open(out + ".matrix_h.#{num}", 'w') do |f|
          f.write Base64.decode64 driver.result(results[1])
        end

        driver.clean(job_id)
      rescue Sent::ProcessAbortedError
        puts "Process aborted for #{ num }"
        driver.abort(job_id)
      rescue Timeout::Error
        # Bounded retry on timeouts before giving up on this run.
        if times > 0
          times -= 1
          sleep 2
          retry
        else
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } timed out"
        end
      rescue Exception
        # NOTE(review): broad rescue kept for compatibility with the
        # original retry behavior. FIX: restored the stripped $!.message.
        puts $!.message
        if times > 0
          times -= 1
          puts "Retrying thread #{ num }"
          retry
        else
          puts "NMF Execution #{ num } Produced Exception"
          puts $!.class
          puts $!.message
          puts $!.backtrace
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } Produced Exception"
        end
      ensure
        # NOTE(review): Thread.exit in ensure also suppresses the raises
        # above from reaching join — preserved from the original; verify.
        Thread.exit
      end
    end
    sleep 1
  end

  # Allow threads to be aborted
  aborted = false
  old_int = Signal.trap("INT") do
    STDERR.puts "Killing threads"
    threads.each{|t| t.raise Sent::ProcessAbortedError, "Process Aborted"}
    aborted = true
  end

  threads.each { |aThread| aThread.join }
  Signal.trap("INT", old_int)

  driver.clean(nmf_matrix)

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if error
    raise Exception, "Error in NMF:\n" + error
  end

  run_R("SENT.join.results('#{ out }')")
  FileUtils.rm Dir.glob(out + '.matrix_*.*')
end
.rdir ⇒ Object
63 64 65 |
# Path of the bundled R scripts directory (<rootdir>/R).
def self.rdir
  File.join(@@rootdir, 'R')
end
.rootdir ⇒ Object
58 59 60 |
# Root directory of the installed gem.
def self.rootdir
  @@rootdir
end
.run_R(command) ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
# Pipes +command+ into a fresh R session preloaded with matrix.R and
# returns the session's combined stdout/stderr output (echoed to stdout
# when non-empty).
#
# Raises Sent::ProcessAbortedError when R exits non-zero.
def self.run_R(command)
  pid, r_in, r_out, r_err = Open4::popen4 "R --vanilla --slave"

  # Feed the helper sources and the requested command, then signal EOF.
  script = "source('#{File.join(Sent.rdir,'matrix.R')}');\n" \
           "#{ command };\n"
  r_in.write script
  r_in.close

  Process.wait pid

  unless $?.exitstatus == 0
    raise Sent::ProcessAbortedError, "Error in R process: #{r_out.read + r_err.read}"
  end

  output = r_out.read + r_err.read
  r_out.close
  r_err.close
  puts output unless output.empty?

  output
end
.search_index(words, index) ⇒ Object
361 362 363 364 365 366 367 368 369 370 371 |
# Searches a Ferret literature index with the stemmed +words+ and
# returns [stored_id, score] pairs for hits scoring above 0.0001.
#
# words - query words (stemmed before searching)
# index - path to a Ferret index built by literature_index
def self.search_index(words, index)
  ferret = Ferret::Index::Index.new(:path => index)
  query = "#{ words.collect{|w| w.stem}.join(" ") }"

  hits = []
  ferret.search_each(query, :limit => 8000) do |doc, score|
    hits << [ferret[doc][:id], score] if score > 0.0001
  end
  hits
end
.tmpdir ⇒ Object
54 55 56 |
# Configured temporary-files directory.
def self.tmpdir
  @@tmpdir
end
.workdir ⇒ Object
51 52 53 |
# Configured working directory.
def self.workdir
  @@workdir
end