Class: EupathDBGFF

Inherits:
JgiGenesGff show all
Defined in:
lib/eupathdb_gff.rb

Overview

ApiDB files differ from JGI genes files in several ways:

- genes on the reverse strand appear in order of their exons, and so
  the exons are not all in the correct order with respect to the underlying
  sequence.

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from JgiGenesGff

#distance_iterator

Constructor Details

#initialize(path) ⇒ EupathDBGFF

Returns a new instance of EupathDBGFF.



14
15
16
17
18
19
20
21
22
23
24
# File 'lib/eupathdb_gff.rb', line 14

# Open the GFF file at +path+ for reading and prime the parser.
#
# The first record is read immediately and kept in @next_gff, and
# @features_to_ignore is set to the list of feature names that
# ignore_record? checks records against.
def initialize(path)
  @file = File.open(path, 'r')
  @next_gff = read_record
  @features_to_ignore = %w[rRNA tRNA snRNA transcript ncRNA]
end

Instance Attribute Details

#features_to_ignoreObject

Returns the value of attribute features_to_ignore.



12
13
14
# File 'lib/eupathdb_gff.rb', line 12

# Reader for the list of feature names held in @features_to_ignore.
# Returns the stored object itself, not a copy.
def features_to_ignore
  return @features_to_ignore
end

Instance Method Details

#ignore_line?(cur) ⇒ Boolean

Whether to ignore this line when parsing the file.

Returns:

  • (Boolean)


117
118
119
# File 'lib/eupathdb_gff.rb', line 117

# Tell whether a parsed line should be skipped outright, judged only by
# its feature type.
#
# cur:: a record responding to #feature
#
# Returns true for 'supercontig' and 'introgressed_chromosome_region'
# features, false for anything else.
def ignore_line?(cur)
  case cur.feature
  when 'supercontig', 'introgressed_chromosome_region'
    true
  else
    false
  end
end

#ignore_record?(record) ⇒ Boolean

Certain things I don’t want uploaded, like apicoplast genome, etc.

Returns:

  • (Boolean)


122
123
124
125
126
127
128
129
130
131
132
133
134
# File 'lib/eupathdb_gff.rb', line 122

# Decide whether a record should be left out entirely.
#
# record:: a record responding to #seqname and #feature, or nil
#
# Returns true when the record is nil, has no seqname, has a feature
# listed in @features_to_ignore, or has a seqname matching one of the
# excluded apidb sequence-name patterns below; false otherwise.
def ignore_record?(record)
  return true unless record && record.seqname
  return true if @features_to_ignore.include?(record.feature)

  excluded_seqnames = [
    /^apidb\|NC\_/,
    /^apidb\|API_IRAB/,
    /^apidb\|M76611/,
    /^apidb\|X95276/
    # disabled: /^apidb\|Pf/
  ]
  excluded_seqnames.any? { |pattern| record.seqname.match(pattern) }
end

#next_geneObject



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/eupathdb_gff.rb', line 26

# Parse and return the next gene from the file.
#
# @next_gff holds the record at which parsing resumes. Starting there,
# the method expects a 'gene' record, then an 'mRNA' record, then one or
# more 'CDS' records, then one or more 'exon' records. The gene object
# comes from setup_gene_from_first_line and is given go_identifiers
# (from the mRNA 'Ontology_term' attribute, when present), cds and
# exons (arrays of Bio::Location).
#
# Genes whose record after the gene line is rejected by ignore_record?
# are skipped up to the next 'gene' record.
#
# Returns the gene, or nil once no further records remain. After nil has
# been returned, further calls keep returning nil.
#
# Raises Exception when the records do not arrive in the expected order,
# including when the file ends part-way through a gene.
def next_gene
  cur = @next_gff
  return nil unless cur

  # Ignore the supercontigs at the start of the file, plus anything else
  # the ignore predicates reject.
  while ignore_line?(cur) || ignore_record?(cur)
    @next_gff = read_record
    cur = @next_gff
    return nil unless cur
  end

  if cur.feature != 'gene'
    raise Exception, "Badly parsed apidb line: #{cur}. Expected gene first."
  end

  # Keep the gene line; the gene object is only built once the following
  # record shows that this gene is wanted.
  gene_line = cur

  # First mRNA
  cur = read_record
  if cur.nil? || cur.feature != 'mRNA'
    # ignore_record? answers true for nil, so check for end of file
    # explicitly rather than treating a truncated file as a skippable gene.
    if cur && ignore_record?(cur)
      # Skip rRNA-type genes: move forward to the next gene record.
      while cur && cur.feature != 'gene'
        cur = read_record
      end
      # Store the resume point even at end of file (nil), so a later call
      # does not re-read the gene line that has just been consumed.
      @next_gff = cur
      return next_gene
    end
    raise Exception, "Badly parsed apidb line: #{cur || 'end of file'}. Expected mRNA next."
  end

  # Set up the gene itself from its own line
  gene = setup_gene_from_first_line gene_line

  # GO terms come from the mRNA line
  ids = cur.attributes['Ontology_term']
  gene.go_identifiers = ids.split(',') if ids

  # Next CDS
  cur = read_record
  if cur.nil? || cur.feature != 'CDS'
    raise Exception, "Badly parsed apidb line: #{cur || 'end of file'}. Expected CDS next."
  end
  gene.cds = []
  while cur && cur.feature == 'CDS'
    f = Bio::Location.new
    f.from = cur.start
    f.to = cur.end
    gene.cds.push f

    cur = read_record
  end

  # Next exons
  if cur.nil? || cur.feature != 'exon'
    raise Exception, "Badly parsed apidb line: #{cur || 'end of file'}. Expected exon next."
  end
  gene.exons = []
  while cur && cur.feature == 'exon'
    f = Bio::Location.new
    f.from = cur.start
    f.to = cur.end
    gene.exons.push f

    cur = read_record
  end

  # The first record that is not an exon is where the next call resumes.
  @next_gff = cur

  gene
end