Class: Dap::Input::InputWARC
- Inherits:
-
Object
- Object
- Dap::Input::InputWARC
- Includes:
- FileSource
- Defined in:
- lib/dap/input/warc.rb
Overview
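Input source that reads records from a WARC file. The leading warcinfo record is parsed on construction; subsequent records are returned one at a time by #read_record.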
Instance Attribute Summary
-
#header ⇒ Object
Returns the value of attribute header.
-
#info ⇒ Object
Returns the value of attribute info.
Attributes included from FileSource
Instance Method Summary
-
#initialize(args) ⇒ InputWARC
constructor
A new instance of InputWARC.
- #read_record ⇒ Object
- #read_warc_header ⇒ Object
Methods included from FileSource
Constructor Details
#initialize(args) ⇒ InputWARC
Returns a new instance of InputWARC.
13 14 15 16 |
# File 'lib/dap/input/warc.rb', line 13
#
# Opens the input named by the first element of +args+ and immediately
# parses the leading WARC header record.
#
# @param args [Array] argument list; only +args.first+ is used and it is
#   handed to +open+ (presumably supplied by FileSource -- confirm there).
# @raise [RuntimeError] propagated from #read_warc_header when the first
#   record is missing or is not a warcinfo record.
def initialize(args)
  source = args.first
  self.open(source)
  read_warc_header
end
Instance Attribute Details
#header ⇒ Object
Returns the value of attribute header.
11 12 13 |
# File 'lib/dap/input/warc.rb', line 11
#
# Reader for the +header+ attribute.
#
# @return [Object] the current value of +@header+ (nil until it is assigned).
def header
  @header
end
#info ⇒ Object
Returns the value of attribute info.
11 12 13 |
# File 'lib/dap/input/warc.rb', line 11
#
# Reader for the +info+ attribute.
#
# @return [Object] the current value of +@info+ (nil until it is assigned).
def info
  @info
end
Instance Method Details
#read_record ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/dap/input/warc.rb', line 37
#
# Reads the next WARC record from +fd+.
#
# A record is a version line matching +WARC/<major>.<minor>+, a run of
# "Name: value" header lines terminated by a blank line, +Content-Length+
# bytes of content, and two trailing lines of which the second must be blank.
#
# Header names are lower-cased and have '-' replaced by '_' before being
# used as hash keys (e.g. "Content-Length" becomes 'content_length'); the
# raw payload is stored under the 'content' key.
#
# @return [Hash, Object] the parsed record as a Hash of String keys, or
#   +Error::EOF+ when the input is exhausted or the record is malformed
#   (bad version line, missing or negative Content-Length, non-blank
#   trailer, or end of input in the middle of the record).
def read_record
  version = self.fd.readline
  return Error::EOF unless version && version =~ /^WARC\/\d+\.\d+/

  warc = {}
  loop do
    line = self.fd.readline.strip
    # A blank line terminates the header section.
    break if line.empty?

    k, v = line.split(/\s*:\s*/, 2)
    # v is nil for a line without a colon; to_s stores it as "".
    warc[k.downcase.gsub('-', '_')] = v.to_s
  end

  return Error::EOF unless warc['content_length']

  length = warc['content_length'].to_i
  # A negative length would make read raise ArgumentError, which the
  # EOFError rescue below does not cover; treat it as a malformed record.
  return Error::EOF if length < 0

  warc['content'] = self.fd.read(length)

  # The first line read here is whatever follows the content on its last
  # line; it is discarded unchecked. The second must be blank.
  self.fd.readline
  trailer = self.fd.readline
  return Error::EOF unless trailer.strip.empty?

  warc
rescue ::EOFError
  Error::EOF
end
#read_warc_header ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/dap/input/warc.rb', line 18
#
# Reads the first record via #read_record, stores it in +header+, and
# parses its content into +info+.
#
# The content is split on "\n"; each line of the form "key: value" becomes
# an entry in +info+ (keys are kept exactly as written). Lines without a
# colon are ignored.
#
# @raise [RuntimeError] "Invalid WARC header" when #read_record returns
#   +Error::EOF+.
# @raise [RuntimeError] "Invalid WARC header (missing warcinfo)" when the
#   record's 'warc_type' is not "warcinfo".
def read_warc_header
  self.header = read_record

  raise RuntimeError, "Invalid WARC header" if self.header == Error::EOF

  if self.header['warc_type'].to_s != "warcinfo"
    raise RuntimeError, "Invalid WARC header (missing warcinfo)"
  end

  self.info = {}
  self.header['content'].to_s.split("\n").each do |raw|
    name, value = raw.strip.split(/\s*:\s*/, 2)
    self.info[name] = value if value
  end
end