Class: Bio::CNLS::Screenscraper

Inherits:
Object
  • Object
show all
Defined in:
lib/bio/cnls_screenscraper/cnls_screenscraper.rb

Overview

A class used to automatically submit amino acid sequences to the cNLS webserver and parse the HTML results.

Constant Summary collapse

# Cutoff values accepted when validating a request. They are strings, not
# floating point values. Frozen so the list cannot be altered at runtime.
ACCEPTABLE_CUTOFFS =
%w(2.0 3.0 4.0 5.0 6.0).freeze

Class Method Summary collapse

Class Method Details

.get_raw_html_result(amino_acid_sequence, cut_off = '3.0', seconds_pause = 1) ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/bio/cnls_screenscraper/cnls_screenscraper.rb', line 93

# Post an amino acid sequence to the cNLS prediction server and return the
# raw HTML body of the response.
#
# @param amino_acid_sequence the sequence, sent as the 'typedseq' form field
# @param cut_off [String] score cutoff, sent as the 'cut_off' form field; must
#   be one of ACCEPTABLE_CUTOFFS (strings, not floating point values)
# @param seconds_pause [Numeric] number of seconds to sleep after a
#   successful request, before returning
# @return the body of the HTTP response
# @raise [Exception] if cut_off is not in ACCEPTABLE_CUTOFFS, or if the
#   server's response is not a Net::HTTPOK
def self.get_raw_html_result(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  # Validate before touching the network
  unless ACCEPTABLE_CUTOFFS.include?(cut_off)
    raise Exception, "Specified cutoff `#{cut_off}' for the cNLS screenscraper is invalid. Valid cutoffs are #{ACCEPTABLE_CUTOFFS.join(', ')}. They are strings, not floating point values."
  end

  # retrieve the webpage
  res = Net::HTTP.post_form(
    URI.parse('http://nls-mapper.iab.keio.ac.jp/cgi-bin/NLS_Mapper_y.cgi'),
    {'cut_off' => cut_off, 'typedseq' => amino_acid_sequence}
  )

  # Anything other than a 200 OK is reported as a failure. The message must
  # interpolate `cut_off` (the parameter name); a misspelt name here would
  # raise NameError and hide the real problem.
  unless res.kind_of?(Net::HTTPOK)
    raise Exception, "Failed to retrieve cNLS, internet connectivity problem? Using cutoff/sequence #{cut_off}/#{amino_acid_sequence}"
  end

  # pause the specified number of seconds, so as not to overload the server
  sleep seconds_pause

  res.body
end

.parse_html_result(html) ⇒ Object

Given HTML corresponding to a result, return a parse object that is more programmatically palatable.



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# File 'lib/bio/cnls_screenscraper/cnls_screenscraper.rb', line 114

# Convert the HTML of a cNLS result page into a Result object.
#
# Monopartite signals are pushed onto result.signals first, then bipartite
# ones. For each signal type the page must contain either a populated table
# or one of the two recognised empty-table layouts; otherwise an Exception
# is raised. An Exception is also raised when the page carries the server's
# "query too long" message.
def self.parse_html_result(html)
  result = Result.new

  # The mono and bi-partite regular expressions are equivalent except for the Predicted X NLS bit at the beginning, thankfully. However, they sometimes appear to be slightly different, which is rather odd.
  monopartite_regex = /Predicted monopartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i
  bipartite_regex = /Predicted bipartite NLS<\/th>\s+<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><td align="center"><strong><big><code>(.*?)<\/code><\/big><\/strong><br.{0,2}><strong><big><code.{2,8}><\/big><\/strong><\/td><\/TR>/i

  # Empty result tables come in two layouts: "<code></code>" and "<code />"
  monopartite_no_hits = /Predicted monopartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
  bipartite_no_hits = /Predicted bipartite NLS<\/th>\s*<\/tr>\s*<tr bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/tr>\s*<tr><td><strong><big><code><\/code><\/big><\/strong><\/td><td><strong><big><code><\/code><\/big><\/strong><\/td><td align="center"><strong><big><code><\/code><\/big><\/strong><\/td><\/tr>/i
  monopartite_no_hits2 = /Predicted monopartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i
  bipartite_no_hits2 = /Predicted bipartite NLS<\/th>\s*<\/TR>\s*<TR bgcolor="#d0d0d0">\s*<th>Pos.<\/th>\s*<th>Sequence<\/th>\s*<th>Score<\/th>\s*<\/TR>\s*<TR><td><strong><big><code \/><\/big><\/strong><\/td><td><strong><big><code \/><\/big><\/strong><\/td><td align="center"><strong><big><code \/><\/big><\/strong><\/td><\/TR>/i

  # Within one table cell, individual values are separated by this markup
  split_regex = /<\/code><\/big><\/strong><br.{0,2}><strong><big><code>/

  # Shared by both signal types: split the three captured columns of a
  # matched table and push one signal object per position onto the result.
  record_signals = lambda do |table, signal_class|
    position_column = table[1].split(split_regex)
    sequence_column = table[2].split(split_regex)
    score_column = table[3].split(split_regex)

    position_column.zip(sequence_column, score_column).each do |position, sequence, score|
      signal = signal_class.new
      signal.position = position.to_i
      signal.sequence = sequence
      signal.score = score.to_f
      result.signals.push signal
    end
  end

  # Make sure the sequence isn't too long
  if html.match(/Query sequence should be < 5000 aa/)
    raise Exception, "Query sequence provided was too long (> 5000 aa)"
  end

  # parse out monopartite signals
  monopartite_table = html.match(monopartite_regex)
  if monopartite_table
    record_signals.call(monopartite_table, Result::MonopartiteNLS)
  elsif !(html.match(monopartite_no_hits) || html.match(monopartite_no_hits2))
    # neither a populated nor an empty table was recognised
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for monopartite signals, but the whole document is likely problematic.\n#{html}"
  end

  # parse out the bipartite signals
  bipartite_table = html.match(bipartite_regex)
  if bipartite_table
    record_signals.call(bipartite_table, Result::BipartiteNLS)
  elsif !(html.match(bipartite_no_hits) || html.match(bipartite_no_hits2))
    # neither a populated nor an empty table was recognised
    raise Exception, "Could not parse HTML output returned from cNLS prediction server. In particular, looking for bipartite signals, monopartite signals seemed to be parsed OK.\n#{html}"
  end

  result
end

.submit(amino_acid_sequence, cut_off = '3.0', seconds_pause = 1) ⇒ Object

Contact the cNLS prediction server and submit the amino acid sequence for prediction. Return a Bio::CNLS::Result object. Pause after each round for seconds_pause seconds, so as not to overload the server.



85
86
87
88
89
90
91
# File 'lib/bio/cnls_screenscraper/cnls_screenscraper.rb', line 85

# Fetch the prediction page for the given sequence via get_raw_html_result
# (which also performs the pause), then hand the HTML to parse_html_result
# and return whatever it produces.
def self.submit(amino_acid_sequence, cut_off='3.0', seconds_pause=1)
  raw_page = get_raw_html_result(amino_acid_sequence, cut_off, seconds_pause)
  parse_html_result(raw_page)
end