Class: RTika::GenericParser

Inherits:
Object
  • Object
show all
Defined in:
lib/rtika.rb

Direct Known Subclasses

FileParser, StringParser, UrlParser

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ GenericParser

Returns a new instance of GenericParser.



52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/rtika.rb', line 52

# Sets up the content handler(s) and the metadata container for a parse run.
#
# @param args [Array] constructor arguments. When the last one is a Hash it
#   is kept as the options hash; the only option read here is
#   +:remove_boilerplate+ (via #remove_boilerplate?).
def initialize(*args)
  # Fall back to an empty Hash so that option lookups never hit nil (no
  # arguments given) or a non-Hash trailing argument such as a path String.
  trailing = args.last
  @options = trailing.is_a?(Hash) ? trailing : {}

  if remove_boilerplate?
    # Keep a reference to the inner write-out handler: the boilerplate
    # handler wraps it, and #parse hands the inner one to the result.
    # NOTE(review): -1 is presumably Tika's "no write limit" value — confirm.
    @writeout_content = RTika::WriteOutContentHandler.new(-1)
    @content = RTika::BoilerpipeContentHandler.new(@writeout_content)
  else
    @content = RTika::BodyContentHandler.new(-1)
  end

  @metadata = RTika::Metadata.new
end

Class Method Details

.parse(*args) ⇒ Object



44
45
46
# File 'lib/rtika.rb', line 44

# Convenience entry point: instantiates the parser with +args+ and
# immediately runs #parse on the new instance.
#
# @param args [Array] forwarded unchanged to .new
# @return [Object] whatever the instance-level #parse returns
def self.parse(*args)
  parser = new(*args)
  parser.parse
end

Instance Method Details

#parse ⇒ Object



65
66
67
68
69
70
71
72
73
74
# File 'lib/rtika.rb', line 65

# Runs the parse and wraps the outcome in an RTika::ParsedResult.
#
# A fresh RTika::AutoDetectParser is stored in @parser before #process is
# invoked (presumably so #process implementations can use it — confirm
# against the subclasses). The pair returned by #process replaces @content
# and @metadata.
#
# @return [RTika::ParsedResult] built from @writeout_content when
#   boilerplate removal is enabled, otherwise from @content; @metadata is
#   passed as the second argument in both cases
def parse
  @parser = RTika::AutoDetectParser.new
  @content, @metadata = process

  handler = remove_boilerplate? ? @writeout_content : @content
  RTika::ParsedResult.new(handler, @metadata)
end

#process ⇒ Object



76
77
78
# File 'lib/rtika.rb', line 76

# Abstract hook: concrete parsers are expected to override this and return
# the content and metadata pair that #parse destructures.
#
# @raise [RuntimeError] always, in this base implementation
def process
  raise RuntimeError, "override this in your parser, return content and metadata"
end

#remove_boilerplate? ⇒ Boolean

Returns:

  • (Boolean)


48
49
50
# File 'lib/rtika.rb', line 48

# Whether boilerplate removal was requested through the options hash.
#
# Only the literal value +true+ under +:remove_boilerplate+ enables it; any
# other value, a missing key, or a missing/non-Hash @options yields +false+.
#
# @return [Boolean]
def remove_boilerplate?
  # Guard first: @options is whatever the constructor stored and may be nil
  # or a non-Hash, on which a Symbol lookup would raise.
  @options.is_a?(Hash) && @options[:remove_boilerplate] == true
end