Module: BioPangenome

Defined in:
lib/bio-pangenome/pangenome.rb

Defined Under Namespace

Classes: GeneFlankingRegion, Transcript

Class Method Summary collapse

Class Method Details

.align_gene_groups(seqs: {}, tmp_folder: "/Volumes/PanGenome/GeneRegions/201910_v2_v3/tmp", output: "../pairwise_blast_oct_2019/varieties_6A_identites", distance: 0) ⇒ Object



103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/bio-pangenome/pangenome.rb', line 103

def self.align_gene_groups( seqs:{}, tmp_folder:"/Volumes/PanGenome/GeneRegions/201910_v2_v3/tmp", output:"../pairwise_blast_oct_2019/varieties_6A_identites", distance: 0 )
	out_tmp="#{tmp_folder}/out.blast"
	FileUtils.mkdir_p(tmp_folder)
	out = File.open("#{output}_#{distance}bp.tab", "w")
	out.puts [ "transcript" , "query", "subject" ,  "var_query", "var_subject", "aln_type", "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total", "Flanking"   ].join("\t")
	seqs.each_pair do |transcript, transcript_seqs|
		vars = transcript_seqs.keys
		vars_done = []
		ns = {}
		vars.each do |v1|
			tmp =  tmp_folder  + "/" + v1 + ".fa"
			s = transcript_seqs[v1]
			seq = ">#{s.id}\n#{s.sequence}"
			File.open(tmp, 'w') {|f| f.write(seq) }
			ns[v1] = s.sequence.count('Nn')
		end
		vars.each do |v1|
			tmp1 =  tmp_folder  + "/" + v1 + ".fa"
			s1 = transcript_seqs[v1]
			next unless s1.sequence.length > 0
			vars.each do |v2|
				next if v1 == v2
				next if vars_done.include? v2
				s2 = transcript_seqs[v2]
				next unless s2.sequence.length > 0
				tmp2 =  tmp_folder  + "/" + v2 + ".fa"
				to_print = [transcript, s1.id , s2.id , v1,v2,"#{v1}->#{v2}"]
				to_print << blast_pair_fast(tmp1, tmp2, out_tmp) 
				to_print << ns[v1] 
				to_print << ns[v2]
				to_print << ns[v1] + ns[v2]
				to_print << distance
				out.puts to_print.join("\t")
			end
			vars_done << v1
		end
	end
	out.close
end

.blast_pair_fast(path_a, path_b, out_path, program: "blastn") ⇒ Object



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/bio-pangenome/pangenome.rb', line 58

def self.blast_pair_fast(path_a, path_b, out_path, program: "blastn")
	cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
	system cmd
	n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
	max_length = 0
	max_pident = 0.0
	n.each do | iter |
		iter.each do | hit |
			hit.each do | hsp |
				if hsp.align_len > max_length
					max_length = hsp.align_len
					max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
				end
			end
		end
	end
	[max_length, max_pident]
end

.load_cds_sequences(varieties: [], genes: {}, prefix: "../flanking/filtered/", suffix: ".cds.fa.gz", set_id: "cds") ⇒ Object



143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/bio-pangenome/pangenome.rb', line 143

def self.load_cds_sequences( varieties:[], genes:{}, prefix: "../flanking/filtered/",  suffix: ".cds.fa.gz", set_id: "cds" )
	ret = Hash.new { |h, k| h[k] = Hash.new }
	varieties.each do |variety|
		path = "#{prefix}/#{variety}#{suffix}"
		infile = open(path)
		io = Zlib::GzipReader.new(infile) 
		Bio::FlatFile.open(Bio::FastaFormat, io) do |fasta_file|
			fasta_file.each do |entry|
				arr = entry.definition.split(".")
				next unless genes[arr[0]]
				row = genes[arr[0]]
				seq_name = GeneFlankingRegion.new(entry.definition,
					row["gene"], "",
					"", entry.definition, set_id, nil, variety )
				seq = entry.seq
				seq.gsub!(/N*$/, '')
				seq.gsub!(/^N*/, '')
				seq_name.sequence = seq
				base_gene = seq_name.gene
				ret[base_gene][variety] = seq_name unless ret[base_gene][variety]
			end
		end
		io.close
	end
	ret
end

.load_genes(filename, window: 0, no_windows: 0) ⇒ Object



182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/bio-pangenome/pangenome.rb', line 182

def self.load_genes(filename, window: 0, no_windows: 0)
	genes = File.readlines(filename).map do |t|
		t.chomp!.split(".")[0]
	end
	if no_windows > 0
		puts "'loading window #{window} of #{no_windows}'"
		window_size = genes.size/no_windows
		start = window * window_size
		genes = genes[start, window_size]
	end
	genes
end

.load_lines(filename) ⇒ Object



195
196
197
198
199
# File 'lib/bio-pangenome/pangenome.rb', line 195

def self.load_lines(filename)
	File.readlines(filename).map do |t|
		t.chomp!.rstrip
	end
end

.load_mapping_hash(varieties: [], transcripts: [], genes: [], distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1") ⇒ Object



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/bio-pangenome/pangenome.rb', line 38

def self.load_mapping_hash(varieties:[],  transcripts:[], genes:[], distance: 1000, prefix: "../flanking/releasePGSBV1/",  suffix: ".RefSeqv1.1")
	ret = Hash.new { |h, k| h[k] = Hash.new }
	varieties.each do |v|
		path = "#{prefix}#{distance}bp/#{v}_#{distance}bp_#{suffix}.reg.map"
		$stderr.puts path
		File.foreach(path) do |line|
			line.chomp!
			arr = line.split("\t")
			begin
				parsed = parseSequenceName(arr[0], arr[1])
			rescue Exception => e
				throw "Unable to parse #{line} (#{v}) [#{e.to_s}]" 
			end
			next unless transcripts.include? parsed.transcript or genes.include? parsed.gene
			ret[v][parsed.region] = parsed
		end
	end
	ret
end

.load_projected_genes(transcript_mapping, genes: []) ⇒ Object



170
171
172
173
174
175
176
177
178
179
180
# File 'lib/bio-pangenome/pangenome.rb', line 170

def self.load_projected_genes(transcript_mapping, genes:[])
	projected_genes = {}
	Zlib::GzipReader.open(transcript_mapping) do |gzip|
		csv = CSV.new(gzip, headers: true)
		csv.each do |row|
			next unless genes.include? row["gene"]
			projected_genes[row["projected_gene"]] = row
		end
	end
	projected_genes
end

.load_sequences_from_hash(coordinates: {}, prefix: "../flanking/filtered/", suffix: "RefSeqv1.1", distance: 1000, projected_genes: {}) ⇒ Object



78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/bio-pangenome/pangenome.rb', line 78

def self.load_sequences_from_hash(coordinates:{},  prefix: "../flanking/filtered/",  suffix: "RefSeqv1.1", distance: 1000, projected_genes: {})
	ret = Hash.new { |h, k| h[k] = Hash.new }
	coordinates.each_pair do |variety, coords|

		path = "#{prefix}/#{distance}bp/#{variety}_#{distance}bp_#{suffix}.fa.gz"
		puts "Loading: #{path}"
		infile = open(path)
		io = Zlib::GzipReader.new(infile) 
		Bio::FlatFile.open(Bio::FastaFormat, io) do |fasta_file|
			fasta_file.each do |entry|
				next unless coords[entry.definition]
				seq_name = coords[entry.definition]
				seq = entry.seq
				seq.gsub!(/N*$/, '')
				seq.gsub!(/^N*/, '')
				seq_name.sequence = seq
				base_gene = projected_genes[seq_name.gene]["gene"]
				ret[base_gene][variety] = seq_name unless ret[base_gene][variety]
			end
		end
		io.close
	end
	ret
end

.parseEITranscript(name) ⇒ Object



18
19
20
21
22
23
# File 'lib/bio-pangenome/pangenome.rb', line 18

def self.parseEITranscript name
	arr=name.split(".")
	match = /Traes(?<chr>[[:upper:]]{3}_scaffold_[[:digit:]]*)_(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]
	raise "Unable to parse: #{name}" unless match
	Transcript.new(name, arr[0],match[:chr].downcase,match[:ver],match[:count],arr[1],match[:conf], match[:count].to_i, arr[1])
end

.parsePGSBTranscript(name) ⇒ Object



25
26
27
28
29
30
31
# File 'lib/bio-pangenome/pangenome.rb', line 25

def self.parsePGSBTranscript name
	arr=name.split(".")
	match = /Traes(?<variety>[[:upper:]]{3})(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]

	raise "Unable to parse: #{name}" unless match
	Transcript.new(name, arr[0],match[:chr],match[:ver],match[:count],match[:variety], match[:conf],match[:count].to_i, arr[1])
end

.parseSequenceName(region, name) ⇒ Object



32
33
34
35
36
# File 'lib/bio-pangenome/pangenome.rb', line 32

def self.parseSequenceName region, name
	match = /(?<transcript>[[:alnum:]].+)_(?<ann>.+)_(?<flank_length>[[:digit:]]+bp)/.match name
	arr2=match[:transcript].split "."
	GeneFlankingRegion.new(match[:transcript],arr2[0],match[:ann], region, name, match[:flank_length] , nil, nil)
end

.parseTranscript(name) ⇒ Object



12
13
14
15
16
17
# File 'lib/bio-pangenome/pangenome.rb', line 12

def self.parseTranscript name
	arr=name.split(".")
	match = /TraesCS(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/.match arr[0]
	raise "Unable to parse: #{name}" unless match
	Transcript.new(name, arr[0],match[:chr],match[:ver],match[:count],arr[1],match[:conf], match[:count].to_i, arr[1])
end