Class: Warc::Record::Header

Inherits:
HeaderHash show all
Includes:
ActiveModel::Validations
Defined in:
lib/warc/record/header.rb

Constant Summary collapse

NAMED_FIELDS =

Set of field names defined in the spec

[
  "WARC-Type",
  "WARC-Record-ID",
  "WARC-Date",
  "Content-Length",
  "Content-Type",
  "WARC-Concurrent-To",
  "WARC-Block-Digest",
  "WARC-Payload-Digest",
  "WARC-IP-Address",
  "WARC-Refers-To",
  "WARC-Target-URI",
  "WARC-Truncated",
  "WARC-Warcinfo-ID",
  "WARC-Filename",                # warcinfo only
  "WARC-Profile",                 # revisit only
  "WARC-Identified-Payload-Type",
  "WARC-Segment-Origin-ID",       # continuation only
  "WARC-Segment-Number",
  "WARC-Segment-Total-Length"     # continuation only
]
# Names of the header fields treated as required.
REQUIRED_FIELDS = %w[WARC-Record-ID Content-Length WARC-Date WARC-Type]

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from HeaderHash

#[], #[]=, #delete, #each, #include?, #merge, #merge!, #replace, #to_hash

Constructor Details

#initialize(record, h = {}) ⇒ Header

Returns a new instance of Header.



38
39
40
41
# File 'lib/warc/record/header.rb', line 38

# Builds a header attached to the record that owns it.
#
# @param record the owning record; kept in @record and exposed via #record
# @param h [Hash] initial field name/value pairs, passed on to HeaderHash
def initialize(record, h = {})
  @record = record
  super(h)
end

Instance Attribute Details

#recordObject (readonly)

WARC field names are case-insensitive, e.g. header["content-length"] == header["Content-Length"].



9
10
11
# File 'lib/warc/record/header.rb', line 9

# Reader for the record handed to the constructor.
#
# @return the object stored in @record
def record
  @record
end

Instance Method Details

#block_digestObject



63
64
65
# File 'lib/warc/record/header.rb', line 63

# Returns the "warc-block-digest" field. When the field is not set yet, the
# digest is computed from the record's content and stored in the header.
#
# @return the stored or newly computed digest value
def block_digest
  self["warc-block-digest"] ||= begin
    payload = record.content
    compute_digest(payload)
  end
end

#compute_digest(content) ⇒ Object



67
68
69
# File 'lib/warc/record/header.rb', line 67

# Builds a labelled SHA-256 digest string for the given content.
#
# @param content [String] data to hash
# @return [String] "sha256:" followed by the hex-encoded SHA-256 of content
def compute_digest(content)
  hex = Digest::SHA256.hexdigest(content)
  "sha256:#{hex}"
end

#content_lengthObject



43
44
45
# File 'lib/warc/record/header.rb', line 43

def content_length
(self["content-length"] ||= self.record.content.length rescue 0).to_i
end

#dateObject



47
48
49
# File 'lib/warc/record/header.rb', line 47

# Returns the WARC-Date field as an ISO 8601 string.
#
# When the field is missing it is first set to the current time in UTC and
# stored in the header. The default must be applied to the header value
# before parsing, because Time.parse raises on nil.
#
# @return [String] ISO 8601 timestamp
# @raise [ArgumentError] if the stored value cannot be parsed as a time
def date
  self["warc-date"] ||= Time.now.utc.iso8601
  Time.parse(self["warc-date"]).iso8601
end

#date=(d) ⇒ Object



51
52
53
# File 'lib/warc/record/header.rb', line 51

# Sets the WARC-Date field, normalised to an ISO 8601 string.
#
# @param d [String, Time] a parseable time string, or a Time instance
#   (a Time is used directly instead of being passed to Time.parse, which
#   only accepts strings)
# @raise [ArgumentError] if a string value cannot be parsed as a time
def date=(d)
  timestamp = d.is_a?(Time) ? d : Time.parse(d)
  self["warc-date"] = timestamp.iso8601
end

#record_idObject



59
60
61
# File 'lib/warc/record/header.rb', line 59

# Returns the "warc-record-id" field. When the field is not set yet, a new
# identifier of the form "<urn:uuid:...>" is generated via UUID.generate and
# stored in the header.
#
# @return the stored or newly generated record id
def record_id
  self["warc-record-id"] ||= "<urn:uuid:#{UUID.generate}>"
end

#to_sObject



75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/warc/record/header.rb', line 75

# Serialises the header as CRLF-terminated "Name: value" lines.
#
# The four required fields are written first, using their accessor methods.
# Every remaining field follows in iteration order. Required fields are
# skipped in that second pass by a case-insensitive name comparison, so a
# key stored as "WARC-Type" is not written a second time.
#
# @return [String] the serialised header lines
def to_s
  crlf = "\r\n"
  # Downcase once up front rather than on every iteration.
  required = REQUIRED_FIELDS.map(&:downcase)

  out = String.new
  out << "WARC-Type: #{type}#{crlf}"
  out << "WARC-Record-ID: #{record_id}#{crlf}"
  out << "WARC-Date: #{date}#{crlf}"
  out << "Content-Length: #{content_length}#{crlf}"

  each do |name, value|
    next if required.include?(name.to_s.downcase)

    out << "#{name}: #{value}#{crlf}"
  end
  out
end

#typeObject



55
56
57
# File 'lib/warc/record/header.rb', line 55

# Looks up the "warc-type" field of this header.
#
# @return the stored field value, as returned by #[]
def type
  self["warc-type"]
end

#uriObject



71
72
73
# File 'lib/warc/record/header.rb', line 71

# Looks up the "warc-target-uri" field of this header.
#
# @return the stored field value, as returned by #[]
def uri
  self["warc-target-uri"]
end