Class: BioInterchange::TextMining::PDFxXMLReader::MyListener

Inherits:

Object

Object
BioInterchange::TextMining::PDFxXMLReader::MyListener

show all

Includes: REXML::StreamListener

Defined in: lib/biointerchange/textmining/pdfx_xml_reader.rb

Instance Method Summary collapse

#document ⇒ Object
#initialize ⇒ MyListener constructor

A new instance of MyListener.
#tag_end(name) ⇒ Object

TODO: add handling for <author> type tags.
#tag_start(name, attr) ⇒ Object
#text(data) ⇒ Object

Constructor Details

#initialize ⇒ `MyListener`

Returns a new instance of MyListener.

# File 'lib/biointerchange/textmining/pdfx_xml_reader.rb', line 56

# Creates the listener with an empty parser-state table.
#
# @map holds every flag, offset and length the stream callbacks track while
# reading the document. It starts with only the two section stacks, because
# <section> elements can nest: 'sec_s' receives the start offset and 'sec_l'
# the running length of each section that is currently open.
def initialize
  @map = { 'sec_s' => [], 'sec_l' => [] }
end

Instance Method Details

#document ⇒ `Object`



158
159
160

# File 'lib/biointerchange/textmining/pdfx_xml_reader.rb', line 158

# Returns the document assembled so far.
#
# @doc is assigned by #text when character data is seen inside a <job>
# element, so this is nil until that has happened.
#
# @return [Object, nil] the current value of @doc
def document
  @doc
end

#tag_end(name) ⇒ `Object`

TODO: add handling for <author> type tags.

# File 'lib/biointerchange/textmining/pdfx_xml_reader.rb', line 121

# REXML::StreamListener callback, invoked with the name of each closing tag.
#
# * </job> ends capture of the document identifier and marks it as seen.
# * </article-title>, </abstract>, </body> and </article> stop length
#   tracking for that element, add a Content built from its recorded start
#   offset and length to @doc, and mark the element as seen.
# * </section> pops the innermost open section off both section stacks and
#   adds a Content for it to @doc.
# * Any other tag is ignored.
#
# TODO: add handling for <author> type tags.
#
# @param name [String] name of the tag being closed
# @raise [RuntimeError] if the two section stacks differ in size when a
#   </section> is seen
def tag_end(name)
  case name
  when /^job$/
    @map['id'] = false
    @map['id_done'] = true
  when /^article-title$/
    @map['title'] = false
    add_content(@map['title_s'], @map['title_l'], BioInterchange::TextMining::Content::TITLE)
    @map['title_done'] = true
  when /^abstract$/
    @map['abs'] = false
    add_content(@map['abs_s'], @map['abs_l'], BioInterchange::TextMining::Content::ABSTRACT)
    @map['abs_done'] = true
  when /^body$/
    @map['body'] = false
    # The body is recorded with the SECTION content type.
    add_content(@map['body_s'], @map['body_l'], BioInterchange::TextMining::Content::SECTION)
    @map['body_done'] = true
  when /^article$/
    @map['art'] = false
    add_content(@map['art_s'], @map['art_l'], BioInterchange::TextMining::Content::DOCUMENT)
    @map['art_done'] = true
  when /^section$/
    unless @map['sec_s'].size == @map['sec_l'].size
      raise 'Error with section stack, stacks not equal in size'
    end
    add_content(@map['sec_s'].pop, @map['sec_l'].pop, BioInterchange::TextMining::Content::SECTION)
  end
end

# Builds a Content for the given span, attaches @doc as its context and adds
# it to @doc. Returns whatever @doc.add returns.
def add_content(offset, length, type)
  content = BioInterchange::TextMining::Content.new(offset, length, type, @process)
  content.setContext(@doc)
  @doc.add(content)
end
private :add_content

#tag_start(name, attr) ⇒ `Object`

# File 'lib/biointerchange/textmining/pdfx_xml_reader.rb', line 64

# REXML::StreamListener callback, invoked for each opening tag.
#
# * <job> switches on capture of the document identifier.
# * <article> switches on article tracking and resets its start offset and
#   length to zero.
# * <article-title>, <abstract> and <body> switch on tracking for that
#   element, record the current article length ('art_l') as its start offset
#   and reset its length to zero.
# * <section> pushes the current article length and a zero length onto the
#   section stacks, so nested sections are tracked independently.
# * Any other tag is ignored.
#
# Each of the non-section elements may occur only once per input: a second
# occurrence after the first has been closed raises.
#
# @param name [String] name of the tag being opened
# @param attr [Object] attributes of the tag; not used here
# @raise [BioInterchange::Exceptions::InputFormatError] on a repeated
#   single-occurrence element, or if the section stacks differ in size
def tag_start(name, attr)
  case name
  when /^job$/
    if @map['id_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <job> tags, cannot parse multiple documents within a single file.'
    end
    @map['id'] = true
  when /^article-title$/
    if @map['title_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article-title> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['title'] = true
    @map['title_s'] = @map['art_l']
    @map['title_l'] = 0
  when /^abstract$/
    if @map['abs_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <abstract> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['abs'] = true
    @map['abs_s'] = @map['art_l']
    @map['abs_l'] = 0
  when /^body$/
    if @map['body_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <body> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['body'] = true
    @map['body_s'] = @map['art_l']
    @map['body_l'] = 0
  when /^article$/
    if @map['art_done']
      raise BioInterchange::Exceptions::InputFormatError, 'Input document XML has multiple <article> tags defined, cannot parse multiple documents within a single file.'
    end
    @map['art'] = true
    @map['art_s'] = 0
    @map['art_l'] = 0
  when /^section$/
    starts = @map['sec_s']
    lengths = @map['sec_l']
    if starts.size != lengths.size
      raise BioInterchange::Exceptions::InputFormatError, 'Error with section stack, stacks not equal in size: Possibly not a well formed XML input file. Check <section> tags all match up and do not overlap (nesting is fine).'
    end
    # A section starts wherever the article text currently ends.
    starts.push @map['art_l']
    lengths.push 0
  end
end

#text(data) ⇒ `Object`

# File 'lib/biointerchange/textmining/pdfx_xml_reader.rb', line 96

# REXML::StreamListener callback, invoked with each run of character data.
#
# The length of +data+ is added to the running length of every element that
# is currently being tracked:
#
# * the article ('art_l'), whenever the article flag is set;
# * exactly one of job / title / abstract, checked in that order. Inside
#   <job> the text is not counted; it is used as the document identifier and
#   a new Document is created for it;
# * the body ('body_l'), independently of the above;
# * every section on the section stack, so text in a nested section also
#   counts towards all enclosing sections.
#
# @param data [String] the character data reported by the parser
def text(data)
  @map['art_l'] += data.length if @map['art']

  if @map['id']
    @doc = BioInterchange::TextMining::Document.new("http://pdfx.cs.man.ac.uk/" + data)
  elsif @map['title']
    @map['title_l'] += data.length
  elsif @map['abs']
    @map['abs_l'] += data.length
  end

  @map['body_l'] += data.length if @map['body']

  open_lengths = @map['sec_l']
  return if open_lengths.empty?

  # Add the length to *all* currently open sections.
  open_lengths.size.times { |depth| open_lengths[depth] += data.length }
end

Class: BioInterchange::TextMining::PDFxXMLReader::MyListener

Instance Method Summary collapse

Constructor Details

#initialize ⇒ MyListener

Instance Method Details

#document ⇒ Object

#tag_end(name) ⇒ Object

#tag_start(name, attr) ⇒ Object

#text(data) ⇒ Object

#initialize ⇒ `MyListener`

#document ⇒ `Object`

#tag_end(name) ⇒ `Object`

#tag_start(name, attr) ⇒ `Object`

#text(data) ⇒ `Object`