Class: Cdhit
- Inherits:
-
Object
- Object
- Cdhit
- Defined in:
- lib/full_lengther_next/classes/cdhit.rb
Constant Summary
- NAME =
0
- COMMENTS =
1
- SEQ_FASTA =
2
Instance Attribute Summary
-
#clusters ⇒ Object
Returns the value of attribute clusters.
-
#sequence_hash_fasta ⇒ Object
Returns the value of attribute sequence_hash_fasta.
Instance Method Summary
- #cd_hit_clusters(clust_file) ⇒ Object
- #each_cluster ⇒ Object
- #get_all_master ⇒ Object
- #get_master(cluster) ⇒ Object
- #get_sp(cluster) ⇒ Object
- #hash_fasta(file) ⇒ Object
-
#initialize(fasta_file, clust_file) ⇒ Cdhit
constructor
A new instance of Cdhit.
- #master_fasta(file_name) ⇒ Object
- #master_to_sp_seq ⇒ Object
- #parse_member(member) ⇒ Object
- #recover_different_lengths(percentage) ⇒ Object
Constructor Details
#initialize(fasta_file, clust_file) ⇒ Cdhit
Returns a new instance of Cdhit.
35 36 37 38 39 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 35
#
# Sets up a new Cdhit: starts with an empty cluster list, builds the
# sequence index with #hash_fasta, then fills the cluster list with
# #cd_hit_clusters. The index is built first because #cd_hit_clusters
# reads @sequence_hash_fasta.
#
# @param fasta_file handed unchanged to #hash_fasta
# @param clust_file handed unchanged to #cd_hit_clusters
def initialize(fasta_file, clust_file)
  @clusters = []
  @sequence_hash_fasta = hash_fasta(fasta_file)
  cd_hit_clusters(clust_file)
end
Instance Attribute Details
#clusters ⇒ Object
Returns the value of attribute clusters.
29 30 31 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 29
#
# Reader for the @clusters instance variable.
#
# @return the current value of @clusters (nil when it was never assigned)
def clusters
  @clusters
end
#sequence_hash_fasta ⇒ Object
Returns the value of attribute sequence_hash_fasta.
29 30 31 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 29
#
# Reader for the @sequence_hash_fasta instance variable.
#
# @return the current value of @sequence_hash_fasta (nil when it was never
#   assigned)
def sequence_hash_fasta
  @sequence_hash_fasta
end
Instance Method Details
#cd_hit_clusters(clust_file) ⇒ Object
111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 111
#
# Reads a cluster report through Bio::CdHitReport and appends one array of
# Seq objects per reported cluster to @clusters.
#
# Every member line is decoded with #parse_member; the resulting name is
# looked up in @sequence_hash_fasta and the stored record is turned into a
# Seq carrying the member's master flag.
#
# NOTE(review): a member name missing from @sequence_hash_fasta is not
# handled here (the lookup result is used directly).
#
# @param clust_file handed unchanged to Bio::CdHitReport.new
def cd_hit_clusters(clust_file)
  # Loaded lazily, exactly where the report parser is needed.
  require 'bio-cd-hit-report'
  Bio::CdHitReport.new(clust_file).each_cluster do |cluster|
    members = []
    cluster.data.each do |member|
      key, is_master = parse_member(member)
      record = @sequence_hash_fasta[key]
      seq_name, seq_comments, seq_residues = record.values_at(NAME, COMMENTS, SEQ_FASTA)
      members.push(Seq.new(seq_name, seq_comments, seq_residues, is_master))
    end
    @clusters.push(members)
  end
end
#each_cluster ⇒ Object
41 42 43 44 45 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 41
#
# Yields every entry of @clusters, in order, to the given block.
#
# When called without a block an Enumerator over the same entries is
# returned, so callers can chain (each_cluster.map, .select, ...) instead of
# hitting a LocalJumpError.
#
# @yield [cluster] one element of @clusters per iteration
# @return @clusters when a block is given, otherwise an Enumerator
def each_cluster
  return enum_for(:each_cluster) unless block_given?

  @clusters.each { |cluster| yield cluster }
end
#get_all_master ⇒ Object
92 93 94 95 96 97 98 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 92
#
# Collects the result of #get_master for every cluster visited by
# #each_cluster.
#
# @return [Array] one entry per cluster, in iteration order; an entry is nil
#   when #get_master found nothing for that cluster
def get_all_master
  [].tap do |masters|
    each_cluster { |cluster| masters.push(get_master(cluster)) }
  end
end
#get_master(cluster) ⇒ Object
87 88 89 90 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 87
#
# Picks the first element of the cluster whose #master is truthy.
#
# @param cluster collection of objects responding to #master
# @return the first matching element, or nil when there is none
def get_master(cluster)
  cluster.find(&:master)
end
#get_sp(cluster) ⇒ Object
100 101 102 103 104 105 106 107 108 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 100
#
# Picks the first element of the cluster whose #db equals 'sp'.
#
# @param cluster collection of objects responding to #db
# @return the first matching element, or nil when there is none
def get_sp(cluster)
  cluster.find { |seq| seq.db == 'sp' }
end
#hash_fasta(file) ⇒ Object
139 140 141 142 143 144 145 146 147 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 139
#
# Indexes the records produced by FastaQualFile for the given file.
#
# Each record is stored as [name, comments, seq_fasta] (the positions named
# by NAME, COMMENTS and SEQ_FASTA) under the first 19 characters of its
# name. A later record whose name shares those 19 characters replaces the
# earlier one.
#
# The reader is closed in an ensure clause so it is released even when
# iteration raises.
#
# @param file handed unchanged to FastaQualFile.new
# @return [Hash] truncated name => [name, comments, seq_fasta]
def hash_fasta(file)
  sequence_hash_fasta = {}
  fqr = FastaQualFile.new(file)
  begin
    fqr.each do |name, seq_fasta, comments|
      # Cd-hit cuts a sequence's name to 20 characters (the '>' character
      # included), so 'name[0..18]' is used as the hash key.
      sequence_hash_fasta[name[0..18]] = [name, comments, seq_fasta]
    end
  ensure
    fqr.close
  end
  sequence_hash_fasta
end
#master_fasta(file_name) ⇒ Object
47 48 49 50 51 52 53 54 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 47
#
# Writes the master of every cluster to file_name, one record per cluster:
#
#   >NAME COMMENTS
#   SEQ_FASTA
#
# The file is opened in 'w' mode (existing content is replaced) using the
# block form of File.open, so the handle is closed even if a cluster raises
# while being written. Fields are interpolated, so a nil field is written
# as an empty string.
#
# @param file_name path of the file to write
# @return [nil]
def master_fasta(file_name)
  File.open(file_name, 'w') do |fasta|
    each_cluster do |cluster|
      master = get_master(cluster)
      fasta.print ">#{master.name} #{master.comments}\n#{master.seq_fasta}\n"
    end
  end
  nil
end
#master_to_sp_seq ⇒ Object
56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 56
#
# Re-assigns the master flag inside each cluster in favour of an 'sp' entry.
#
# For every cluster whose current master (per #get_master) has a #db other
# than 'sp': if #get_sp finds an entry, every member's master flag is
# cleared and that entry becomes the only master. Clusters whose master is
# already 'sp', or that contain no 'sp' entry, are left untouched.
#
# Uses the true/false literals; the TRUE/FALSE constants no longer exist in
# current Ruby versions.
#
# @return whatever #each_cluster returns
def master_to_sp_seq
  each_cluster do |cluster|
    next if get_master(cluster).db == 'sp'

    sp_seq = get_sp(cluster)
    next if sp_seq.nil?

    cluster.each { |seq| seq.master = false }
    sp_seq.master = true
  end
end
#parse_member(member) ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 126
#
# Extracts the sequence name and the master flag from one member line.
#
# Every '...' and '>' is removed from the line, the text after the first
# comma is taken, and that text is split on whitespace into the name and
# the remainder. The member is a master when the remainder is exactly '*'.
#
# The argument is not modified (frozen strings are accepted), and only the
# first comma is treated as a separator so names containing commas are kept
# whole.
#
# @param member [String] one member line, e.g. "0\t2799aa, >seq1... *"
# @return [Array] two elements: the name and true/false for the master flag
def parse_member(member)
  cleaned = member.gsub('...', '').delete('>')
  _prefix, entry = cleaned.split(',', 2)
  name, marker = entry.split(' ', 2)
  [name, marker == '*']
end
#recover_different_lengths(percentage) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
# File 'lib/full_lengther_next/classes/cdhit.rb', line 70
#
# Returns the non-master members whose two length ratios against their
# cluster master are both below the given percentage.
#
# For each member whose name differs from the master's name, two values are
# computed with integer arithmetic:
#   member length / master length * 100
#   master length / member length * 100
# and the member is kept when both are smaller than percentage.
#
# NOTE(review): because the divisions are integer divisions, each value is a
# multiple of 100 (0, 100, 200, ...). This looks unintended, but the desired
# semantics are not visible here, so the arithmetic is left as it was.
#
# @param percentage threshold both ratios are compared against
# @return [Array] the selected members, in cluster/member order
def recover_different_lengths(percentage)
  selected = []
  each_cluster do |cluster|
    representative = get_master(cluster)
    cluster.each do |candidate|
      next if candidate.name == representative.name

      candidate_len = candidate.seq_fasta.length
      representative_len = representative.seq_fasta.length
      candidate_over_master = candidate_len / representative_len * 100
      master_over_candidate = representative_len / candidate_len * 100
      next unless master_over_candidate < percentage && candidate_over_master < percentage

      selected.push(candidate)
    end
  end
  selected
end