Class: GFA

Inherits:
Object
  • Object
show all
Defined in:
lib/gfa/graph.rb,
lib/gfa/common.rb,
lib/gfa/parser.rb,
lib/gfa/modules.rb,
lib/gfa/version.rb,
lib/gfa/generator.rb

Defined Under Namespace

Classes: Field, GraphVertex, Matrix, Record, RecordSet

Constant Summary collapse

MIN_VERSION =

Class-level

'1.0'
MAX_VERSION =
'1.2'
VERSION =
'0.9.4'
VERSION_ARRAY =

:nodoc:

VERSION.split(/\./).map { |x| x.to_i }
VERSION_MAJOR =

:nodoc:

VERSION_ARRAY[0]
VERSION_MINOR =

:nodoc:

VERSION_ARRAY[1]
VERSION_BUILD =

:nodoc:

VERSION_ARRAY[2]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ GFA

Returns a new instance of GFA.



59
60
61
62
63
64
65
# File 'lib/gfa/common.rb', line 59

def initialize(opts = {})
  @records = {}
  @opts = { index: true, index_id: false, comments: false }.merge(opts)
  GFA::Record.TYPES.each do |t|
    @records[t] = GFA::RecordSet.name_class(t).new(self)
  end
end

Instance Attribute Details

#gfa_versionObject (readonly)

Instance-level



48
49
50
# File 'lib/gfa/common.rb', line 48

def gfa_version
  @gfa_version
end

#optsObject (readonly)

Instance-level



48
49
50
# File 'lib/gfa/common.rb', line 48

def opts
  @opts
end

#recordsObject (readonly)

Instance-level



48
49
50
# File 'lib/gfa/common.rb', line 48

def records
  @records
end

Class Method Details

.advanceObject



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/gfa/common.rb', line 23

def self.advance
  @advance_bar_i += 1
  while 50 * @advance_bar_i / @advance_bar_n > @advance_bar_p
    $stderr.print "\b=>"
    @advance_bar_p += 1
  end
  return unless @advance_bar_i == @advance_bar_n

  $stderr.print "\b]\r"
  t_t = Time.now - @advance_bar_s
  t_u = 'sec'

  [[60, 'min'], [60, 'h'], [24, 'd'], [30, 'mo']].each do |t_a|
    break if t_t < t_a[0]
    t_t /= t_a[0]
    t_u  = t_a[1]
  end

  ram_gb = `ps -o rss= -p #{$$}`.to_f / 1024 / 1024
  info1 = 'Time elapsed: %.1f %s' % [t_t, t_u]
  info2 = '%.1fG RAM' % ram_gb
  $stderr.puts '  [ %-36s%12s ]' % [info1, info2]
end

.advance_bar(n) ⇒ Object



14
15
16
17
18
19
20
21
# File 'lib/gfa/common.rb', line 14

def self.advance_bar(n)
  @advance_bar_n = n
  @advance_bar_i = 0
  @advance_bar_p = 0
  @advance_bar_s = Time.now
  $stderr.print '  [' + (' ' * 50) + ']' + " #{n}\r"
  $stderr.print '  [>'
end

.assert_format(value, regex, message) ⇒ Object

Class-level



8
9
10
11
12
# File 'lib/gfa/common.rb', line 8

def self.assert_format(value, regex, message)
  unless value =~ /^(?:#{regex})$/
    raise "#{message}: #{value}"
  end
end

.load(file, opts = {}) ⇒ Object

Load a GFA object from a gfa file with options opts:

  • index: If the records should be indexed as loaded (default: true)

  • index_id: If the records should also be index by ID (default: false)

  • comments: If the comment records should be saved (default: false)

  • line_range: Two-integer array indicating the first and last lines to read

  • file_seek: Seek to this file position before start reading

  • until_line: Read until reaching this line only (default: nil, read the entire file)



17
18
19
20
21
22
23
# File 'lib/gfa/parser.rb', line 17

def self.load(file, opts = {})
  gfa = GFA.new(opts)
  read_records(file, opts) do |record|
    gfa << record
  end
  gfa
end

.load_parallel(file, thr, opts = {}) ⇒ Object

Load a GFA object from a gfa file in parallel using thr threads, and the same opts supported by load. Defaults to the load method instead if thr <= 1.



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/gfa/parser.rb', line 43

def self.load_parallel(file, thr, opts = {})
  return self.load(file, opts) if thr < 1

  # Prepare data
  lsize = []
  File.open(file, 'r') { |fh| fh.each { |ln| lsize << ln.size } }
  thr = [lsize.size, thr].min
  blk = (lsize.size.to_f / thr).ceil

  # Launch children processes
  advance_bar(blk + 1)
  io  = []
  pid = []
  thr.times do |i|
    io[i] = IO.pipe
    pid << fork do
      io[i][0].close
      o = opts.merge(file_seek: lsize[0, blk * i].inject(:+), until_line: blk)
      #o = opts.merge(line_range: [i * blk, (i + 1) * blk - 1])
      records = []
      read_records(file, o) do |record|
        records << record
        advance if i == 0
      end
      Marshal.dump(records, io[i][1])
      advance if i == 0
      exit!(0)
    end
    io[i][1].close
  end

  # Collect and merge results
  gfa = GFA.new(opts)
  io.each_with_index do |pipe, k|
    result = pipe[0].read
    Process.wait(pid[k])
    advance_bar(io.size) if k == 0
    raise "Child process failed: #{k}" if result.empty?
    Marshal.load(result).each { |record| gfa << record }
    pipe[0].close
    advance
  end

  return gfa
end

.read_records(file, opts = {}) ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/gfa/parser.rb', line 25

def self.read_records(file, opts = {})
  rng = opts[:line_range]
  File.open(file, 'r') do |fh|
    fh.seek(opts[:file_seek], :SET) unless opts[:file_seek].nil?
    fh.each_with_index do |ln, lno|
      next  if !rng.nil? && (lno < rng[0] || lno > rng[1])
      break if !opts[:until_line].nil? && (lno == opts[:until_line])
      next  if !opts[:comments] && ln[0] == '#'

      yield(GFA::Record[ln])
    end
  end
end

.supported_version?(v) ⇒ Boolean

Returns:

  • (Boolean)


89
90
91
# File 'lib/gfa/parser.rb', line 89

def self.supported_version?(v)
  v.to_f >= MIN_VERSION.to_f and v.to_f <= MAX_VERSION.to_f
end

Instance Method Details

#<<(obj) ⇒ Object

Instance-level



94
95
96
97
98
99
100
101
102
# File 'lib/gfa/parser.rb', line 94

def <<(obj)
  obj = GFA::Record[obj] unless obj.is_a? GFA::Record
  return if obj.nil? || obj.empty?
  @records[obj.type] << obj

  if obj.type == :Header && !obj.VN.nil?
    set_gfa_version(obj.VN.value)
  end
end

#==(gfa) ⇒ Object



75
76
77
# File 'lib/gfa/common.rb', line 75

def ==(gfa)
  eql?(gfa)
end

#adjacency_graph(opts = {}) ⇒ Object

Generates a RGL::DirectedAdjacencyGraph or RGL::AdjacencyGraph object. The opts argument is a hash with the same supported key-value pairs as in #implicit_graph.



21
22
23
# File 'lib/gfa/graph.rb', line 21

def adjacency_graph(opts = {})
  implicit_graph(opts).to_adjacency
end

#all_edgesObject

Returns an array of GFA::Record objects including all possible edges from the GFA. I.e., all links, jumps, containments, and paths.



186
187
188
189
# File 'lib/gfa/graph.rb', line 186

def all_edges
  edge_t = %i[Link Jump Containment Path]
  @edges ||= edge_t.flat_map { |t| records[t].set }
end

#calculate_edge_matrix!Object

Calculate and store internally a matrix representing all edges.



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/gfa/graph.rb', line 27

def calculate_edge_matrix!
  $stderr.puts '- Building edge matrix'
  @edge_matrix = GFA::Matrix.new(segments.size, segments.size)
  self.class.advance_bar(all_edges.size)
  all_edges.each do |edge|
    self.class.advance
    idx = edge.segments(self).map { |i| segments.position(i) }
    idx.each do |i|
      idx.each do |j|
        @edge_matrix[i, j] = true unless i == j
      end
    end
  end
  @edge_matrix
end

#each_line(&blk) ⇒ Object



10
11
12
13
14
15
16
17
# File 'lib/gfa/generator.rb', line 10

def each_line(&blk)
  set_version_header('1.2') if gfa_version.nil?
  GFA::Record.TYPES.each do |r_type|
    records[r_type].set.each do |record|
      blk[record.to_s]
    end
  end
end

#edge_matrixObject

Returns the matrix representing all edges



45
46
47
# File 'lib/gfa/graph.rb', line 45

def edge_matrix
  @edge_matrix or calculate_edge_matrix!
end

#empty?Boolean

Returns:

  • (Boolean)


67
68
69
# File 'lib/gfa/common.rb', line 67

def empty?
  records.empty? || records.values.all?(&:empty?)
end

#eql?(gfa) ⇒ Boolean

Returns:

  • (Boolean)


71
72
73
# File 'lib/gfa/common.rb', line 71

def eql?(gfa)
  records == gfa.records
end

#implicit_graph(opts = {}) ⇒ Object

Generates a RGL::ImplicitGraph object describing the links in the GFA. The opts argument is a hash with any of the following key-value pairs:

  • :orient => bool. If false, ignores strandness of the links. By default true.

  • :directed => bool. If false, ignores direction of the links. By defaut the same value as :orient.



13
14
15
# File 'lib/gfa/graph.rb', line 13

def implicit_graph(opts = {})
  rgl_implicit_graph(opts)
end

#indexed?Boolean

Returns:

  • (Boolean)


91
92
93
# File 'lib/gfa/common.rb', line 91

def indexed?
  records.values.all?(&:indexed?)
end

#internally_linking_records(segments, edges) ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/gfa/graph.rb', line 169

def internally_linking_records(segments, edges)
  unless segments.is_a? GFA::RecordSet::SegmentSet
    raise "Unrecognised class: #{segments.class}"
  end

  $stderr.puts '- Gathering internally linking records'
  s_names = Hash[segments.set.map { |i| [i.name.value, true]}]
  self.class.advance_bar(edges.size)
  edges.select do |record|
    self.class.advance
    record.segment_names_a.all? { |s| s_names[s] }
  end
end

#linking_records(segments, degree: 1, threads: 2) ⇒ Object

Finds all the records linking to any segments in segments, a GFA::RecordSet::SegmentSet object, and expands to links with up to degree degrees of separation. This operation uses threads Threads (with shared RAM)

Returns an array of GFA::Record objects with all the identified linking records (edges). Edge GFA::Record objects can be of type: Link, Containment, Jump, or Path

IMPORTANT NOTE: The object segments will be modified to include all linked segments. If you don’t want this behaviour, please make sure to pass a duplicate of the object instead



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
# File 'lib/gfa/graph.rb', line 115

def linking_records(segments, degree: 1, threads: 2)
  unless segments.is_a? GFA::RecordSet::SegmentSet
    raise "Unrecognised class: #{segments.class}"
  end

  edge_matrix unless degree == 0 # Just to trigger matrix calculation
  degree.times do |round|
    $stderr.puts "- Expansion round #{round + 1}"
    self.class.advance_bar(segments.size + 1)
    pre_expansion = segments.size

    # Launch children processes
    io  = []
    pid = []
    threads.times do |t|
      io[t] = IO.pipe
      pid << fork do
        new_segments = Set.new
        segments.set.each_with_index do |segment, k|
          self.class.advance if t == 0
          next unless (k % threads) == t
          idx = self.segments.position(segment)
          edge_matrix[nil, idx].each_with_index do |edge, target_k|
            new_segments << target_k if edge
          end
        end
        Marshal.dump(new_segments, io[t][1])
        self.class.advance if t == 0
        exit!(0)
      end
      io[t][1].close
    end

    # Collect and merge results
    new_segments = Set.new
    io.each_with_index do |pipe, k|
      result = pipe[0].read
      Process.wait(pid[k])
      self.class.advance_bar(io.size) if k == 0
      raise "Child process failed: #{k}" if result.empty?
      new_segments += Marshal.load(result)
      pipe[0].close
      self.class.advance
    end
    new_segments = new_segments.map { |i| self.segments[i] }
    new_segments.each { |i| segments << i unless segments[i.name] }
    new = segments.size - pre_expansion
    $stderr.puts "  #{new} segments found, total: #{segments.size}"
    break if new == 0
  end

  internally_linking_records(segments, all_edges)
end

#matrix_find_module(segment_index) ⇒ Object

Finds the entire module containing the segment with index segment_index in the matrix (requires calling recalculate_matrix first!). Returns the module as a new GFA



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/gfa/modules.rb', line 26

def matrix_find_module(segment_index)
  # Initialize
  segs = [segment_index]
  edges = []
  new_segs = true

  # Iterate until no new segments are found
  while new_segs
    new_segs = false
    segs.each do |seg|
      @matrix.size.times do |k|
        next if seg == k
        v = @matrix[[seg, k].max][[seg, k].min]
        next if v.empty?
        edges += v
        unless segs.include?(k)
          new_segs = true
          segs << k
        end
      end
    end
  end

  # Save as GFA and return
  o = GFA.new
  segs.each { |k| o << segments[k] }
  edges.uniq.each { |k| o << @matrix_edges[k] }
  o
end

#merge(gfa) ⇒ Object

Creates a new GFA based on itself and appends all entries in gfa



114
115
116
# File 'lib/gfa/common.rb', line 114

def merge(gfa)
  GFA.new(opts).merge!(self).merge!(gfa)
end

#merge!(gfa) ⇒ Object

Adds the entrie of gfa to itself



107
108
109
110
111
112
113
# File 'lib/gfa/common.rb', line 107

def merge!(gfa)
  raise "Unsupported object: #{gfa}" unless gfa.is_a? GFA

  GFA::Record.TYPES.each do |t|
    @records[t].merge!(gfa.records[t])
  end
end

#rebuild_index!Object



95
96
97
# File 'lib/gfa/common.rb', line 95

def rebuild_index!
  @records.each_value(&:rebuild_index!)
end

#recalculate_matrixObject

Calculates a matrix where all links between segments are represented by the following variables:

@matrix_segment_names includes the names of all segments (as String) with the order indicating the segment index in the matrix

@matrix is an Array of Arrays of Arrays, where the first index indicates the row index segment, the second index indicates the column index segment, and the third index indicates each of the links between those two. Note that matrix only stores the lower triangle, so the row index must be stictly less than the column index. For example, @matrix[1] returns an Array of all index links between the segment with index 3 and the segment with index 1: “‘ [

[             ], # Row 0 is always empty
[[]           ], # Row 1 stores connections to column 0
[[], []       ], # Row 2 stores connections to columns 0 and 1
[[], [], []   ], # Row 3 stores connections to columns 0, 1, and 2
...              # &c

] “‘

@matrix_edges is an Array of GFA::Record objects representing all edges in the GFA. The order indicates the index used by the values of @matrix



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/gfa/modules.rb', line 81

def recalculate_matrix
  @matrix_segment_names = segments.set.map(&:name).map(&:value)
  @matrix = @matrix_segment_names.size.times.map { |i| Array.new(i) { [] } }
  @matrix_edges = all_edges
  @matrix_edges.each_with_index do |edge, k|
    names = edge.segments(self).map(&:name).map(&:value)
    indices = names.map { |i| @matrix_segment_names.index(i) }
    indices.each do |a|
      indices.each do |b|
        break if a == b
        @matrix[[a, b].max][[a, b].min] << k
      end
    end
  end
end

#save(file) ⇒ Object



2
3
4
5
6
7
8
# File 'lib/gfa/generator.rb', line 2

def save(file)
  fh = File.open(file, 'w')
  each_line do |ln|
    fh.puts ln
  end
  fh.close
end

#set_gfa_version(v) ⇒ Object



104
105
106
107
108
109
110
111
# File 'lib/gfa/parser.rb', line 104

def set_gfa_version(v)
  v = v.value if v.is_a? GFA::Field
  unless GFA::supported_version? v
    raise "GFA version currently unsupported: #{v}"
  end

  @gfa_version = v
end

#set_version_header(v) ⇒ Object



19
20
21
22
23
# File 'lib/gfa/generator.rb', line 19

def set_version_header(v)
  unset_version
  @records[:Header] << GFA::Record::Header.new("VN:Z:#{v}")
  @gfa_version = v
end

#sizeObject



79
80
81
# File 'lib/gfa/common.rb', line 79

def size
  records.values.map(&:size).inject(0, :+)
end

#split_modules(recalculate = true) ⇒ Object

Find all independent modules by greedily crawling the linking entries for each segment, and returns an Array of GFA objects containing each individual module. If recalculate is false, it trusts the current calculated matrix unless none exists



8
9
10
11
12
13
14
15
16
17
18
19
20
# File 'lib/gfa/modules.rb', line 8

def split_modules(recalculate = true)
  recalculate_matrix if recalculate || @matrix.nil?
  missing_segments = (0 .. @matrix_segment_names.size - 1).to_a
  modules = []
  until missing_segments.empty?
    mod = matrix_find_module(missing_segments[0])
    mod.segments.set.map(&:name).map(&:value).each do |name|
      missing_segments.delete(@matrix_segment_names.index(name))
    end
    modules << mod
  end
  modules
end

#subgraph(segments, degree: 1, headers: true, threads: 2) ⇒ Object

Extracts the subset of records associated to segments, which is an Array with values of any class in:

  • Integer: segment index,

  • String or GFA::Field::String: segment names, or

  • GFA::Record::Segment: the actual segments themselves

degree indicates the maximum degree of separation between the original segment set and any additional segments. Use 0 to include only the segments in the set. Use 1 to include those, the records linking to them, and the additional segments linked by those records. Use any integer greater than 1 to prompt additional rounds of greedy graph expansion.

If headers, it includes all the original headers. Otherwise it only only includes the version header (might be inferred).

threads indicates the number of threads to use in the processing of this operation, currently only affecting expansion rounds.

All comments are ignored even if originally parsed. Walks are currently ignored too. If the current GFA object doesn’t have an index, it builds one and forces index: true. The output object inherits all options.



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/gfa/graph.rb', line 71

def subgraph(segments, degree: 1, headers: true, threads: 2)
  # Prepare objects
  unless opts[:index]
    opts[:index] = true
    rebuild_index!
  end
  gfa = GFA.new(opts)
  segments =
    segments.map do |i|
      i.is_a?(GFA::Record::Segment) ? i :
        segment(i) or raise "Cannot find segment: #{i}"
    end

  # Headers
  if headers
    self.headers.set.each { |record| gfa << record }
  else
    gfa << GFA::Record::Header.new("VN:Z:#{gfa_version}")
  end

  # Original segments
  segments.each { |segment| gfa << segment }

  # Expand graph
  linking = linking_records(gfa.segments, degree: degree, threads: threads)
  linking.each { |record| gfa << record }

  # Return
  gfa
end

#to_sObject



30
31
32
33
34
# File 'lib/gfa/generator.rb', line 30

def to_s
  o = ''
  each_line { |ln| o += ln + "\n" }
  o
end

#total_lengthObject

Computes the sum of all individual segment lengths



101
102
103
# File 'lib/gfa/common.rb', line 101

def total_length
  segments.total_length
end

#unset_versionObject



25
26
27
28
# File 'lib/gfa/generator.rb', line 25

def unset_version
  headers.set.delete_if { |o| !o.fields[:VN].nil? }
  @gfa_version = nil
end