Class: RptMskr::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/bio-repeatmasker.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Parser

Returns a new instance of Parser.



13
# File 'lib/bio-repeatmasker.rb', line 13

# Builds a Parser instance. The constructor takes no arguments and sets up no
# instance state.
def initialize
end

Class Method Details

.open(filename) ⇒ Object

Input: a raw RepeatMasker output file, e.g. one of the files contained in hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/chromOut.tar.gz



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/bio-repeatmasker.rb', line 17

# Parses a raw RepeatMasker output file and stores the result in the global
# $rpt_data, replacing whatever was loaded before.
#
# Each line is split on whitespace. A line is taken as an annotation row when
# its first field (stored as "sw_score") contains a digit; column headers and
# blank lines do not, so they are skipped. Rows with fewer than 14 fields are
# skipped too, because the columns read below would be missing.
#
# When the strand field contains "C", the 12th and 14th fields swap roles:
# "start_match" is read from the 14th field and "n_prior_match" from the 12th.
# Parentheses are stripped from "n_beyond_match" and "n_prior_match".
#
# @param filename [String] path of the RepeatMasker output file
# @return [Hash{Symbol => Array<Hash{String => String}>}] events grouped by
#   the 15th field (the ID) converted to a Symbol; looking up a missing key
#   creates an empty array for it
def self.open(filename)
  $rpt_data = Hash.new { |hash, key| hash[key] = [] }

  # File.foreach closes the file once iteration finishes, so no handle leaks.
  File.foreach(filename) do |line|
    fields = line.split
    next if fields.length < 14
    next unless fields[0] =~ /[0-9]+/

    if fields[8].include?("C")
      start_match = fields[13]
      n_prior = fields[11]
    else
      start_match = fields[11]
      n_prior = fields[13]
    end

    # Key order is kept stable because the search methods embed the inspected
    # hashes in their result strings.
    $rpt_data[:"#{fields[14]}"] << {
      "repeat_name" => fields[9],
      "class_name" => fields[10],
      "chromosome" => fields[4],
      "start_coordinate" => fields[5],
      "end_coordinate" => fields[6],
      "strand" => fields[8],
      "n_beyond_match" => fields[7].delete("()"),
      "start_match" => start_match,
      "end_match" => fields[12],
      "n_prior_match" => n_prior.delete("()"),
      "sw_score" => fields[0],
      "subs_percent" => fields[1],
      "del_percent" => fields[2],
      "ins_percent" => fields[3]
    }
  end

  $rpt_data
end

.search_by_coordinate_interval(chr, start_c, end_c) ⇒ Object

Searches for events that are located within a coordinate interval, e.g. chromosome 1 from 10000 to 20000: (chr1, 10000, 20000).



62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/bio-repeatmasker.rb', line 62

# Finds the events in $rpt_data that lie inside a coordinate interval on one
# chromosome.
#
# An event matches when its chromosome equals +chr+ and
# start >= start_c, end <= end_c, start < end_c and end > start_c.
# Coordinates are compared as integers: the stored values are strings, and
# comparing strings would order "9000" after "10000".
#
# @param chr [#to_s] chromosome name, e.g. "chr1"
# @param start_c [#to_i] lower bound of the interval
# @param end_c [#to_i] upper bound of the interval
# @return [Array<String>] one "id,events" string per ID that has at least one
#   matching event, where events is the inspected array of all events stored
#   under that ID; empty when nothing matches or no data has been loaded
def self.search_by_coordinate_interval(chr, start_c, end_c)
  chromosome = chr.to_s
  lower = start_c.to_i
  upper = end_c.to_i

  result = []
  ($rpt_data || {}).each do |key, value|
    inside = value.any? do |event|
      next false unless event["chromosome"] == chromosome

      first = event["start_coordinate"].to_i
      last = event["end_coordinate"].to_i
      first >= lower && last <= upper && first < upper && last > lower
    end
    result.push("#{key},#{value}") if inside
  end
  result.uniq
end

.search_by_field(field, array_of_values) ⇒ Object

Searches for events based on a given field and values. The input is one field and an array of values (e.g. chromosome, [“chr1”,“chr2”,“chr3”,“chr4”]); it returns all events matching this pattern. The fields, based on the RepeatMasker output documentation (www.repeatmasker.org/), are: id, repeat_name, class_name, chromosome, start_coordinate, end_coordinate, strand, n_beyond_match, start_match, end_match, n_prior_match, sw_score, subs_percent, del_percent, ins_percent.



46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/bio-repeatmasker.rb', line 46

# Finds the events in $rpt_data whose +field+ equals any of the given values.
#
# Every value in +array_of_values+ is searched; the result is returned only
# after the whole list has been processed. The field name "id" is matched
# against the key each group of events is stored under, since the event
# hashes themselves carry no "id" entry.
#
# @param field [#to_s] field name, e.g. "chromosome" or "id"
# @param array_of_values [Array<#to_s>] values to look for; each is compared
#   as a string
# @return [Array<String>] unique "id,events" strings, ordered by search value
#   and then by storage order, where events is the inspected array of all
#   events stored under that ID; empty when nothing matches or no data has
#   been loaded
def self.search_by_field(field, array_of_values)
  field_name = field.to_s
  by_id = field_name == "id"

  result = []
  array_of_values.each do |search_val|
    wanted = search_val.to_s
    ($rpt_data || {}).each do |key, value|
      matched = (by_id && key.to_s == wanted) ||
                value.any? { |event| event[field_name] == wanted }
      result.push("#{key},#{value}") if matched
    end
  end
  result.uniq
end

.search_by_id(ids) ⇒ Object

Searches for events by their IDs; the input is an array of IDs.



35
36
37
38
39
40
41
# File 'lib/bio-repeatmasker.rb', line 35

# Prints the events stored in $rpt_data under each requested ID.
#
# For every entry of +ids+, each key of $rpt_data whose string form equals the
# ID's string form is written to standard output as one "id,events" line,
# where events is the inspected array stored under that key. IDs with no
# matching key print nothing.
#
# @param ids [Array<#to_s>] IDs to look up
# @return [Array] the +ids+ argument itself
def self.search_by_id(ids)
  ids.each do |id|
    wanted = id.to_s
    $rpt_data.each do |rpt_id, events|
      next unless rpt_id.to_s == wanted

      puts "#{rpt_id},#{events}"
    end
  end
end