Module: Bio::BioAlignment::MaskIslands

Includes:
MarkRows
Defined in:
lib/bio-alignment/edit/mask_islands.rb

Defined Under Namespace

Classes: IslandElementState

Instance Method Summary collapse

Methods included from MarkRows

#mark_row_elements, #mark_rows

Instance Method Details

#mark_islandsObject

Drop all ‘islands’ in a sequence with low consensus, i.e. islands that show a gap larger than ‘min_gap_size’ (default 3) on both sides, and are shorter than ‘max_island_size’ (default 30). An island larger than 30 elements is arguably no longer an island, and low consensus stretches may be loops - it is up to the alignment procedure to get that right. We also allow for micro deletions inside an alignment (1 or 2 elements). The island consensus is calculated by column. If more than 50% of the island shows consensus, the island is retained. Consensus for each element is defined as the number of matches in the column (default 1).



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/bio-alignment/edit/mask_islands.rb', line 26

def mark_islands
  logger = Bio::Log::LoggerPlus['bio-alignment']
  count_marked_islands = 0
  count_marked_elements = 0

  # Traverse each row in the alignment
  marked_aln = mark_row_elements { |row,rownum|
    # for each element create a state object, and find unique elements (i.e. consensus) across a column
    row.each_with_index do |e,colnum|
      e.state = IslandElementState.new
      column = columns[colnum]
      e.state.unique = (column.count{|e2| !e2.gap? and e2 == e } == 1)
      # p [e,e.state,e.state.unique]
    end
    # at this stage all elements of the row have been set to unique,
    # which are unique. Now group elements into islands (split on gap)
    # and mask
    gap = []
    island = []
    in_island = true
    row.each do |e|
      if not in_island
        if e.gap?
          gap << e
        else
          island << e
          in_island = true
          gap = []
        end
      else # in_island
        if not e.gap?
          island << e
          gap = []
        else
          gap << e
          if gap.length > 2
            in_island = false
            ci, ce = mark_island(island)
            count_marked_islands += ci
            count_marked_elements += ce
            # print_island(island)
            island = []
          end
        end
      end
    end
    if in_island
      ci, ce = mark_island(island)
      count_marked_islands += ci
      count_marked_elements += ce
      # print_island(island) if island.length > 0
    end
    row # always return the row to mark_row_elements
  }
  logger.info("#{count_marked_islands} islands marked (#{count_marked_elements} elements)")
  return marked_aln
end