Class: RDF::SAK::Context::Document

Inherits:

Object

Object
RDF::SAK::Context::Document

show all

Includes:: Util, XML::Mixup

Defined in:: lib/rdf/sak.rb

Overview

document context class -

Constant Summary collapse

RDFA_ATTR = # note: these are only the RDFa attributes that take URIs

[:about, :resource, :typeof].freeze

LINK_ATTR =

[:href, :src, :data, :action, :longdesc].freeze

LINK_XPATH =

('.//html:*[not(self::html:base)][%s]' %
(LINK_ATTR + RDFA_ATTR).map { |a| "@#{a.to_s}" }.join('|')).freeze

OBJS =

[:href, :src].freeze

LITXP = # ancestor node, always with (@property and not @content), and not @resource|@href|@src unless @rel|@rev

['(ancestor::*[@property][not(@content)]',
'[not(@resource|@href|@src) or @rel|@rev])[1]' ].join('').freeze

Constants included from Util

Util::SCHEME_RANK, Util::XHTMLNS, Util::XHV, Util::XPATHNS

Instance Attribute Summary collapse

#doc ⇒ Object readonly

Returns the value of attribute doc.
#uri ⇒ Object readonly

Returns the value of attribute uri.
#uuid ⇒ Object readonly

Returns the value of attribute uuid.

Instance Method Summary collapse

#base_for(node = nil) ⇒ Object
#generate_backlinks(published: true, ignore: nil) ⇒ Object

backlink structure.
#generate_twitter_meta ⇒ Object

goofy twitter-specific metadata.
#initialize(context, uuid, doc: nil, uri: nil, mtime: nil) ⇒ Document constructor

A new instance of Document.
#prefixes_for(node, prefixes = {}) ⇒ Object
#published? ⇒ Boolean

proxy for context published.
#rewrite_links(node = @doc, uuids: {}, uris: {}, &block) ⇒ Object
#subject_for(node = nil, rdf: false, is_ancestor: false) ⇒ Object

give us the rdf subject of the node itself.
#transform_xhtml(published: true) ⇒ Object
#triples_for ⇒ Object

sponge the document for rdfa.
#vocab_for(node) ⇒ Object

note parentheses cause the index to be counted from the root.
#write_to_target(published: true) ⇒ Array

Actually write the transformed document to the target.

Methods included from Util

#abbreviate, #all_related, asserted_types, #authors_for, base_for, canonical_uri, canonical_uuid, cmp_label, #cmp_resource, #coerce_node_spec, #coerce_resource, #coerce_uuid_urn, dates_for, #dehydrate, #formats_for, #get_base, #get_prefixes, #invert_struct, label_for, #modernize, #node_matches?, objects_for, #predicate_set, #prefix_subset, #prepare_collation, published?, #rehydrate, #reindent, replacements_for, #resolve_curie, #smush_struct, #split_pp, #split_pp2, #split_qp, struct_for, subjects_for, #subtree, #terminal_slug, #title_tag, traverse_links, #type_strata, #uri_pp

Constructor Details

#initialize(context, uuid, doc: nil, uri: nil, mtime: nil) ⇒ `Document`

Returns a new instance of Document.

# File 'lib/rdf/sak.rb', line 1861

# Build a document wrapper around the resource identified by `uuid`.
#
# @param context [RDF::SAK::Context] the owning context
# @param uuid [RDF::URI] a `urn:uuid:` identifier for the document
# @param doc [Pathname, IO, Nokogiri::XML::Node, #to_s, nil] the
#   document source; when omitted, `context.locate(uuid)` is used
# @param uri [#to_s, nil] overrides `context.canonical_uri(uuid)`
# @param mtime [Time, nil] modification time; when omitted, falls
#   back to `doc.mtime` (if `doc` responds to it) and then `Time.now`
#
# @raise [RuntimeError] if `context` or `uuid` are the wrong type, or
#   if the detected content type cannot be parsed
#
def initialize context, uuid, doc: nil, uri: nil, mtime: nil
  raise 'context must be a RDF::SAK::Context' unless
    context.is_a? RDF::SAK::Context
  raise 'uuid must be an RDF::URI' unless
    uuid.is_a? RDF::URI and uuid.to_s.start_with? 'urn:uuid:'

  doc ||= context.locate uuid
  # NOTE(review): ordinary objects (including nil) respond to #to_s,
  # so this guard will rarely if ever fire; unparseable input is
  # instead rejected further down by the content-type dispatch.
  raise 'doc must be Pathname, IO, or Nokogiri node' unless
    C_OK.any? { |c| doc.is_a? c } || doc.respond_to?(:to_s)

  # set some instance variables
  @context = context
  @uuid    = uuid
  # the parentheses matter: without them the ternary swallows the
  # `||` and an explicitly-passed mtime is never used
  @mtime   = mtime || (doc.respond_to?(:mtime) ? doc.mtime : Time.now)
  @target  = context.target_for uuid

  # now process the document

  # turn the document into an XML::Document
  if doc.is_a? Nokogiri::XML::Node
    # a node that is not a document should be wrapped with one
    unless doc.is_a? Nokogiri::XML::Document
      d = doc.dup 1
      doc = Nokogiri::XML::Document.new
      doc << d
    end
  else
    type = nil

    # pathnames turned into IO objects
    if doc.is_a? Pathname
      type = RDF::SAK::MimeMagic.by_path doc
      doc  = doc.open # this may raise if the file isn't there
    end

    # squash everything else to a string
    doc = doc.to_s unless doc.is_a? IO

    # check type by content
    type ||= RDF::SAK::MimeMagic.by_magic(doc)

    # can you believe there is a special bookmarks mime type good grief
    type = 'text/html' if type == 'application/x-mozilla-bookmarks'

    # now we try to parse the blob
    if type.to_s =~ /xml/i
      doc = Nokogiri.XML doc
    elsif type == 'text/html'
      # if the detected type is html, try it as strict xml first
      attempt = nil
      begin
        attempt = Nokogiri.XML doc, nil, nil, (1 << 11) # NONET
      rescue Nokogiri::XML::SyntaxError
        # the strict attempt has already read from the stream, so
        # start over or the lenient parser only sees the remainder
        doc.rewind if doc.is_a? IO
        # do not wrap this a second time; let it fail if it's gonna
        tmp = Nokogiri.HTML doc
        attempt = Nokogiri::XML::Document.new
        attempt << tmp.root.dup(1)
      end
      doc = attempt
    elsif type.to_s =~ /^text\/(?:plain|(?:x-)?markdown)/i
      # just assume plain text is markdown
      doc = ::MD::Noko.new.ingest doc
    else
      raise "Don't know what to do with #{uuid} (#{type})"
    end
  end

  # now fix the namespaces for mangled html documents
  root = doc.root
  if root.name == 'html'
    unless root.namespace
      # clear this off or it will be duplicated in the output
      root.remove_attribute('xmlns')
      # now generate a new ns object
      ns = root.add_namespace(nil, XHTMLNS)
      # *now* scan the document and add the namespace declaration
      root.traverse do |node|
        if node.element? && node.namespace.nil?
          # downcasing the name may be cargo culting; need to check
          # node.name = node.name.downcase # yup it is
          node.namespace = ns
        end
      end
    end

    # also add the magic blank doctype declaration if it's missing
    unless doc.internal_subset
      doc.create_internal_subset('html', nil, nil)
    end
  end

  # aaand set some more instance variables

  @uri = URI(uri || @context.canonical_uri(uuid))

  # voilà
  @doc = doc
end

Instance Attribute Details

#doc ⇒ `Object` (readonly)

Returns the value of attribute doc.



1859
1860
1861

# File 'lib/rdf/sak.rb', line 1859

# Reader for @doc, which the constructor assigns last, after parsing
# and namespace repair.
#
# @return [Object] the value of @doc
def doc
  @doc
end

#uri ⇒ `Object` (readonly)

Returns the value of attribute uri.



1859
1860
1861

# File 'lib/rdf/sak.rb', line 1859

# Reader for @uri, which the constructor sets to
# `URI(uri || @context.canonical_uri(uuid))`.
#
# @return [Object] the value of @uri
def uri
  @uri
end

#uuid ⇒ `Object` (readonly)

Returns the value of attribute uuid.



1859
1860
1861

# File 'lib/rdf/sak.rb', line 1859

# Reader for @uuid, the `urn:uuid:` identifier handed to the
# constructor.
#
# @return [Object] the value of @uuid
def uuid
  @uuid
end

Instance Method Details

#base_for(node = nil) ⇒ `Object`

# File 'lib/rdf/sak.rb', line 1965

# Determine the base URI in effect for a node.
#
# For documents whose root element is named `html`, the first
# `<base href>` in the head wins if it is absolute. For anything
# else, the nearest `xml:base` on the node or its ancestors wins if
# it is absolute. Otherwise the document's own URI is used.
#
# @param node [Nokogiri::XML::Node, nil] defaults to the document
#
# @return [URI] the base URI
#
def base_for node = nil
  node ||= @doc
  doc  = node.document
  base = @uri.to_s
  if doc.root.name.to_sym == :html
    # a missing <base> stringifies to '', which is not absolute
    b = doc.at_xpath(
      '(/html:html/html:head/html:base[@href])[1]/@href',
      { html: XHTMLNS }).to_s.strip
    base = b if URI(b).absolute?
  elsif b = node.at_xpath('ancestor-or-self::*[@xml:base][1]/@xml:base')
    # this has to be evaluated from the node: the document node has
    # no element ancestors, so from there it would never match
    b = b.to_s.strip
    base = b if URI(b).absolute?
  end

  URI(base)
end

#generate_backlinks(published: true, ignore: nil) ⇒ `Object`

backlink structure



2180
2181
2182

# File 'lib/rdf/sak.rb', line 2180

# Ask the context for this document's backlink structure.
#
# @param published [true, false] passed through to the context
# @param ignore [Object, nil] passed through to the context
#
# @return [Object] whatever `@context.generate_backlinks` returns
#
def generate_backlinks published: true, ignore: nil
  options = { published: published, ignore: ignore }
  @context.generate_backlinks(@uuid, **options)
end

#generate_twitter_meta ⇒ `Object`

goofy twitter-specific metadata



2185
2186
2187

# File 'lib/rdf/sak.rb', line 2185

# Ask the context for the twitter-specific metadata of this document.
#
# @return [Object] whatever `@context.generate_twitter_meta` returns
#
def generate_twitter_meta
  @context.generate_twitter_meta(@uuid)
end

#prefixes_for(node, prefixes = {}) ⇒ `Object`

# File 'lib/rdf/sak.rb', line 2059

# Collect the prefix mappings in scope for a node, ascending the tree.
#
# Mappings come from `xmlns:*` namespace declarations, overlaid with
# the RDFa `@prefix` attribute of the same element. Mappings found
# closer to the starting node win over those found further up.
#
# @param node [Nokogiri::XML::Node] the node to start from
# @param prefixes [Hash{String=>String}] mappings that take
#   precedence over anything found at or above `node`
#
# @return [Hash{String=>String}] prefix (without colon) to namespace
#
def prefixes_for node, prefixes = {}
  # start with namespaces
  pfx = node.namespaces.select do |k, _|
    k.start_with? 'xmlns:'
  end.transform_keys do |k|
    k.delete_prefix 'xmlns:'
  end

  # then add @prefix overtop of the namespaces
  if node[:prefix]
    # @prefix is a whitespace-separated list of `name: iri` pairs
    node[:prefix].strip.split(/\s+/).each_slice(2) do |name, iri|
      # if the size is uneven the last value will be nil, so we drop it
      next if iri.nil?
      # lose the trailing colon so keys line up with the xmlns ones
      pfx[name.delete_suffix(':')] = iri
    end
  end

  # since we're ascending the tree, input takes precedence
  prefixes = pfx.merge prefixes

  if node.parent and node.parent.element?
    prefixes_for(node.parent, prefixes)
  else
    prefixes
  end
end

#published? ⇒ `Boolean`

proxy for context published

Returns:

(Boolean)



1961
1962
1963

# File 'lib/rdf/sak.rb', line 1961

# Ask the context whether this document counts as published.
#
# @return [Boolean] whatever `@context.published?` says for @uuid
#
def published?
  @context.published?(@uuid)
end

#rewrite_links(node = @doc, uuids: {}, uris: {}, &block) ⇒ `Object`

# File 'lib/rdf/sak.rb', line 1988

# Rewrite every link-bearing attribute (see LINK_ATTR) under `node`
# to the canonical address of its target, relative to this document.
#
# @param node [Nokogiri::XML::Node] subtree to operate on
# @param uuids [Hash] lookup table consulted (and filled in) with the
#   absolute target as key and the result of
#   `@context.canonical_uuid` as value
# @param uris [Hash] accepted but not used by this method
# @yieldparam elem [Nokogiri::XML::Element] each element matched by
#   LINK_XPATH, after its attributes have been processed
#
# @return [Integer] the number of attributes rewritten
#
def rewrite_links node = @doc, uuids: {}, uris: {}, &block
  base      = base_for node
  rewritten = 0
  canon     = {} # absolute target -> canonical uri

  node.xpath(LINK_XPATH, { html: XHTMLNS }).each do |elem|
    LINK_ATTR.map(&:to_s).each do |name|
      next unless elem.has_attribute? name

      target = base.merge uri_pp(elem[name].strip)

      # same host but different scheme (e.g. http vs https): graft the
      # path, query and fragment onto a copy of our own uri
      if target.host == @uri.host && target.scheme != @uri.scheme
        grafted          = @uri.dup
        grafted.path     = target.path
        grafted.query    = target.query
        grafted.fragment = target.fragment
        target           = grafted
      end

      # set the path parameters aside before canonicalizing
      params = split_pp target, only: true

      target = RDF::URI(target.to_s)

      # go by way of the uuid when there is one, otherwise
      # canonicalize the address itself
      known  = uuids[target] ||= @context.canonical_uuid(target)
      target = canon[target] ||= @context.canonical_uri(known || target)

      # put the path parameters back if canonicalizing dropped them
      if !params.empty? && split_pp(target, only: true).empty?
        target = target.dup
        target.path = ([target.path] + params).join(';')
      end

      elem[name] = @uri.route_to(target.to_s).to_s
      rewritten += 1
    end

    block.call elem if block
  end

  rewritten
end

#subject_for(node = nil, rdf: false, is_ancestor: false) ⇒ `Object`

give us the rdf subject of the node itself

# File 'lib/rdf/sak.rb', line 2088

# Give us the RDF subject in effect for the node itself.
#
# @param node [Nokogiri::XML::Element, nil] defaults to the root
#   element of the document
# @param rdf [true, false] whether to return an RDF::URI rather than
#   a URI; blank nodes are returned as RDF::Node either way
# @param is_ancestor [true, false] set on recursive calls when `node`
#   is being examined on behalf of one of its descendants
#
# @return [URI, RDF::URI, RDF::Node, String] the subject; an ancestor
#   @resource is returned as a bare string when `rdf` is false
#
# @raise [RuntimeError] if the node is not an element
#
def subject_for node = nil, rdf: false, is_ancestor: false
  node ||= @doc.root
  raise 'Node must be an element' unless
    node.is_a? Nokogiri::XML::Element

  # first we check for an ancestor element with @property and no
  # @content; if we find one then we reevaluate with that
  # element as the starting point (keeping the caller's rdf flag)
  if n = node.at_xpath(LITXP)
    return subject_for n, rdf: rdf
  end

  # answer a bunch of helpful questions about this element
  subject = nil
  base    = base_for node
  parent  = node.parent
  ns_href = node.namespace.href if node.namespace
  up_ok   = %i{rel rev}.none? { |a| node[a] }
  # these use &&/|| on purpose: with and/or the assignment binds
  # first and only the leftmost operand ends up in the variable
  is_root = parent.nil? || parent.document?
  special = /^(?:[^:]+:)?(?:head|body)$/i.match?(node.name) &&
    (ns_href == 'http://www.w3.org/1999/xhtml' ||
     (!parent.nil? && /^(?:[^:]+:)?html$/xi.match?(parent.name)))

  # if the node is being inspected as an ancestor to the
  # original node, we have to check it backwards.
  if is_ancestor
    # ah right @resource gets special treatment
    if subject = node[:resource]
      # XXX safe CURIEs ([prefix:ref]) are not resolved yet
      subject = subject.strip
    else
      OBJS.each do |attr|
        if node[attr]
          # merge with the root and return it
          subject = base + node[attr]
          break
        end
      end
    end

    return (rdf ? RDF::URI(subject.to_s) : subject) if subject

    # note if we are being called with is_ancestor, that means
    # the original node (or indeed any of the nodes previously
    # tested) have anything resembling a resource in them. this
    # means @rel/@rev should be ignored, and we should keep
    # looking for a subject.
  end

  if node[:about]

    if m = /^_:(.*)$/.match(node[:about])
      return RDF::Node(m[1])
    end

    # XXX resolve @about against potential curie
    subject = base + node[:about]

  elsif is_root
    subject = base
  elsif special
    subject = subject_for parent
  elsif node[:resource]
    # XXX resolve @about against potential curie
    subject = base + node[:resource]
  elsif node[:href]
    subject = base + node[:href]
  elsif node[:src]
    subject = base + node[:src]
  elsif node[:typeof]
    # bnode the typeof attr

    # note we return bnodes irrespective of the rdf flag
    return RDF::Node('id-%016x' % node.attributes['typeof'].pointer_id)
  elsif node[:inlist]
    # bnode the inlist attr
    return RDF::Node('id-%016x' % node.attributes['inlist'].pointer_id)
  elsif (parent[:inlist] && OBJS.none? { |a| parent[a] }) ||
      (is_ancestor && !up_ok)
    # bnode the element
    return RDF::Node('id-%016x' % node.pointer_id)
  # elsif node[:id]
  else
    subject = subject_for parent, is_ancestor: true
  end

  # a bnode inherited from further up must not be squeezed through
  # the URI constructors below
  return subject if subject.is_a? RDF::Node

  rdf ? RDF::URI(subject.to_s) : URI(subject.to_s)
end

#transform_xhtml(published: true) ⇒ `Object`

# File 'lib/rdf/sak.rb', line 2189

# Produce a fresh XHTML document from a copy of @doc: comments are
# stripped, links in the body are rewritten and annotated, and a new
# head (title, link, meta, style, extras) is assembled from what the
# context knows about @uuid.
#
# @param published [true, false] forwarded to #generate_backlinks
#
# @return [Object, nil] the result of `xhtml_stub(...).document`, or
#   nil when the copy has no html:body element
#
def transform_xhtml published: true
  # before we do any more work make sure this is html
  doc  = @doc.dup 1
  body = doc.at_xpath('//html:body[1]', { html: XHTMLNS }) or return

  # eliminate comments
  doc.xpath('//comment()[not(ancestor::html:script)]',
    { html: XHTMLNS }).each { |c| c.unlink }

  # initial stuff
  struct    = @context.struct_for @uuid, uuids: true, canon: true
  # rstruct   = @context.struct_for @uuid, uuids: true, rev: true
  resources = {}
  literals  = {}
  ufwd      = {} # uuid -> uri
  urev      = {} # uri  -> uuid
  datatypes = Set.new
  types     = Set.new
  authors   = @context.authors_for(@uuid)
  title     = @context.label_for @uuid, candidates: struct
  desc      = @context.label_for @uuid, candidates: struct, desc: true

  # rewrite content
  # (label_for results are indexed pairs; only element 1 is kept here)
  title = title[1] if title
  desc  = desc[1]  if desc

  # `struct` and `rstruct` will contain all the links and
  # metadata for forward and backward neighbours, respectively,
  # which we need to mine (predicates, classes, datatypes) for
  # prefixes among other things.

  # sort every object of every predicate into `literals` or
  # `resources`, each mapping the object to the set of predicates
  struct.each do |p, v|
    v.each do |o|
      if o.literal?
        literals[o] ||= Set.new
        literals[o].add p

        # collect the datatype
        datatypes.add o.datatype if o.has_datatype?
      else
        # normalize URIs
        if o.to_s.start_with? 'urn:uuid:'
          ufwd[o] ||= @context.canonical_uri o
        elsif cu = @context.canonical_uuid(o)
          o = urev[o] ||= cu
        end


        # collect the resource
        resources[o] ||= Set.new
        resources[o].add p

        # add to type
        types.add o if p == RDF::RDFV.type
      end
    end
  end
  urev.merge! ufwd.invert

  labels = resources.keys.map do |k|
    # turn this into a pair which subsequently gets turned into a hash
    [k, @context.label_for(k) ]
  end.to_h

  #warn labels

  # handle the title
  title ||= RDF::Literal('')
  # literals[title] may be nil (e.g. for the empty fallback title);
  # nil.to_a is an empty array
  tm = { '#title' => title,
    property: @context.abbreviate(literals[title].to_a, vocab: XHV) }
  if tl = title.language
    tm['xml:lang'] = tl # if xmlns
    tm['lang'] = tl
  elsif tdt = title.datatype and tdt != RDF::XSD.string
    tm[:datatype] = @context.abbreviate(tdt)
  end

  # we accumulate a record of the links in the body so we know
  # which ones to skip in the head
  bodylinks = {}
  # NOTE(review): rewrite_links indexes its `uuids:` table by the
  # link target and ignores `uris:`, whereas `ufwd` is described
  # above as uuid -> uri; confirm this pairing is the intended one
  rewrite_links body, uuids: ufwd, uris: urev do |elem|
    vocab = elem.at_xpath('ancestor-or-self::*[@vocab][1]/@vocab')
    vocab = uri_pp(vocab.to_s) if vocab

    if elem.key?('href') or elem.key?('src')
      vu = uri_pp(elem['href'] || elem['src'])
      ru = RDF::URI(@uri.merge(vu))
      bodylinks[urev[ru] || ru] = true

      if rel = resources[urev[ru] || ru]
        elem['rel'] = (@context.abbreviate rel, vocab: vocab).join ' '
      end

      # only supply a title where there isn't a non-blank one already
      label = labels[urev[ru] || ru]
      if label and (!elem.key?('title') or elem['title'].strip == '')
        elem['title'] = label[1].to_s
      end
    end
  end

  # and now we do the head
  links = []
  resources.reject { |k, _| bodylinks[k] }.each do |k, v|
    # work on a copy so the set held in `resources` is left alone;
    # Set#delete hands back the set itself
    v = v.dup.delete RDF::RDFV.type
    next if v.empty?
    mts = @context.formats_for k

    # warn k, v.inspect

    # warn k, mts.inspect

    rel = @context.abbreviate v.to_a, vocab: XHV
    ru  = @uri.route_to(uri_pp (ufwd[k] || k).to_s)
    # the nil key carries the element name
    ln  = { nil => :link, rel: rel, href: ru.to_s }
    if (label = labels[urev[k] || k])
      ln[:title] = label[1].to_s
    end

    # add type=lol/wut
    ln[:type] = mts.first.to_s unless mts.empty?

    if !ln[:type] and v.include?(RDF::Vocab::XHV.stylesheet)
      ln[:type] = 'text/css'
    elsif ln[:type] =~ /(java|ecma)script/i or
        v.include?(RDF::Vocab::DC.requires)
      # turn the link into a script element
      ln[nil]  = :script
      ln[:src] = ln.delete :href
      ln[:type] ||= 'text/javascript'
    end
    links.push ln
  end

  links.sort! do |a, b|
    # sort by rel, then by href
    # warn a.inspect, b.inspect
    s = 0
    [nil, :rel, :rev, :href, :title].each do |k|
      s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
      break if s != 0
    end
    s
  end

  # we want to duplicate links from particular subjects (eg the root)
  (@context.config[:duplicate] || {}).sort do |a, b|
    a.first <=> b.first
  end.each do |s, preds|

    o = {}
    # NOTE(review): in this section `ufwd` is keyed by the subject and
    # `urev` by the uuid, the reverse of how they are labelled above
    u = ufwd[s] ||= @context.canonical_uuid s
    s = urev[u] ||= @context.canonical_uri u if u
    f = {}

    # do not include this subject as these links are already included!
    next if u == @uuid

    # gather up the objects, then gather up the predicates

    @context.objects_for u || s, preds, only: :resource do |obj, rel|
      # XXX do not know why += |= etc does not work
      x = @context.canonical_uuid(obj) || obj
      urev[x] ||= @context.canonical_uri x
      y = o[x] ||= Set.new
      o[x] = y | rel
      f[x] = @context.formats_for x
    end

    srel = @uri.route_to((u ? urev[u] || s : s).to_s)

    # now collect all the other predicates
    o.keys.each do |obj|
      hrel = @uri.route_to((urev[obj] || obj).to_s)
      o[obj] |= @context.graph.query([u || s, nil, obj]).predicates.to_set
      rels = @context.abbreviate o[obj].to_a, vocab: XHV
      ln = { nil => :link, about: srel, rel: rels, href: hrel }
      ln[:type] = f[obj].first if f[obj]

      # add to links
      links << ln
    end
  end

  meta = []

  # include author names as old school meta tags
  authors.each do |a|
    # authors without a label are skipped
    name  = labels[urev[a] || a] or next
    datatypes.add name[0] # a convenient place to chuck this
    prop  = @context.abbreviate(name[0])
    name  = name[1]
    about = @uri.route_to((ufwd[a] || a).to_s)
    tag   = { nil => :meta, about: about.to_s, name: :author,
             property: prop, content: name.to_s }

    if name.has_datatype? and name.datatype != RDF::XSD.string
      tag[:datatype] = @context.abbreviate(name.datatype)
    elsif name.has_language?
      tag['xml:lang'] = tag[:lang] = name.language
    end
    meta.push tag
  end

  literals.each do |k, v|
    # the title has its own element
    next if k == title
    rel = @context.abbreviate v.to_a, vocab: XHV
    elem = { nil => :meta, property: rel, content: k.to_s }
    elem[:name] = :description if k == desc

    if k.has_datatype?
      datatypes.add k.datatype # so we get the prefix
      elem[:datatype] = @context.abbreviate k.datatype, vocab: XHV
    end

    meta.push(elem)
  end

  meta.sort! do |a, b|
    s = 0
    [:about, :property, :datatype, :content, :name].each do |k|
      # warn a.inspect, b.inspect
      s = a.fetch(k, '').to_s <=> b.fetch(k, '').to_s
      break if s != 0
    end
    s
  end

  # don't forget style tag
  style = doc.xpath('/html:html/html:head/html:style', { html: XHTMLNS })

  body = body.dup 1
  body = { '#body' => body.children.to_a, about: '' }
  body[:typeof] = @context.abbreviate(types.to_a, vocab: XHV) unless
    types.empty?

  # prepare only the prefixes we need to resolve the data we need
  # (everything before the first colon of each abbreviated term)
  rsc = @context.abbreviate(
    (struct.keys + resources.keys + datatypes.to_a + types.to_a).uniq,
    noop: false).map do |x|
    next if x.nil?
    x.split(?:)[0].to_sym
  end.select { |x| not x.nil? }.to_set

  pfx = @context.prefixes.select do |k, _|
    rsc.include? k
  end.transform_values { |v| v.to_s }

  # XXX deal with the qb:Observation separately (just nuke it for now)
  extra = generate_twitter_meta || []
  if bl = generate_backlinks(published: published,
    ignore: @context.graph.query(
      [nil, CI.document, @uuid]).subjects.to_set)
    extra << { [bl] => :object }
  end

  # and now for the document
  xf  = @context.config[:transform]
  doc = xhtml_stub(
    base: @uri, prefix: pfx, vocab: XHV, lang: 'en', title: tm,
    link: links, meta: meta, style: style, transform: xf,
    extra: extra, body: body).document

  # goddamn script tags and text/html
  # (an empty text node is appended to every childless script[@src])
  doc.xpath('//html:script[@src][not(node())]',
    { html: XHTMLNS }).each do |script|
    script << doc.create_text_node('')
  end

  doc
end

#triples_for ⇒ `Object`

sponge the document for rdfa



2038
2039

# File 'lib/rdf/sak.rb', line 2038

# Sponge the document for RDFa.
#
# Not implemented: the body is empty, so this always returns nil.
def triples_for
end

#vocab_for(node) ⇒ `Object`

note parentheses cause the index to be counted from the root

# File 'lib/rdf/sak.rb', line 2049

# Find the RDFa @vocab in effect for a node by walking up through its
# element ancestors.
#
# The first @vocab encountered settles the matter: its stripped value
# is returned, or nil if that value is blank.
#
# @param node [Nokogiri::XML::Node] where to start looking
#
# @return [String, nil] the vocabulary, or nil if none is in effect
#
def vocab_for node
  current = node
  loop do
    declared = current[:vocab]
    if declared
      trimmed = declared.strip
      return trimmed.empty? ? nil : trimmed
    end

    up = current.parent
    return nil unless up && up.element?
    current = up
  end
end

#write_to_target(published: true) ⇒ `Array`

Actually write the transformed document to the target

Parameters:

published (true, false) (defaults to: true)

Returns:

(Array) —

pathname(s) written

# File 'lib/rdf/sak.rb', line 2464

# Actually write the transformed document to the target.
#
# The document is always written to the private target, and
# additionally to the public one when `published` is set and the
# context says the document is published. Each copy is written to a
# temporary file in the destination directory and then renamed into
# place as `<uuid>.xml`. Failures are reported with `warn` and do not
# stop the remaining targets from being attempted.
#
# @param published [true, false] whether to also write the public copy
#
# @return [Array<Pathname>] the pathname(s) actually written
#
def write_to_target published: true

  # in all cases we write to private target
  states = [false]
  # document has to be publishable
  states.push true if published && @context.published?(@uuid)

  ok = []
  states.each do |state|
    target = @context.config[state ? :target : :private]

    # XXX this is dumb; it should do something more robust if it
    # fails
    doc = transform_xhtml(published: state) or next

    fh = path = nil
    begin
      fh   = Tempfile.create('xml-', target)
      path = Pathname(fh.path)

      # write the doc to the target
      doc.write_to fh
      fh.close

      uuid    = URI(@uuid.to_s)
      newpath = path.dirname + "#{uuid.uuid}.xml"

      File.chmod(0644, path)
      File.rename(path, newpath)

      # only report the path once the file is really in place
      ok.push newpath

      File.utime(@mtime, @mtime, newpath)
    rescue StandardError => e
      # XXX this should only rescue a specific class of errors
      warn e.class, e
    ensure
      # path is nil if the temp file was never created, and no longer
      # exists if the rename went through
      fh.close if fh && !fh.closed?
      path.unlink if path && path.exist?
    end
  end

  ok
end

Class: RDF::SAK::Context::Document

Overview

Constant Summary collapse

Constants included from Util

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Util

Constructor Details

#initialize(context, uuid, doc: nil, uri: nil, mtime: nil) ⇒ Document

Instance Attribute Details

#doc ⇒ Object (readonly)

#uri ⇒ Object (readonly)

#uuid ⇒ Object (readonly)

Instance Method Details

#base_for(node = nil) ⇒ Object

#generate_backlinks(published: true, ignore: nil) ⇒ Object

#generate_twitter_meta ⇒ Object

#prefixes_for(node, prefixes = {}) ⇒ Object

#published? ⇒ Boolean

#rewrite_links(node = @doc, uuids: {}, uris: {}, &block) ⇒ Object

#subject_for(node = nil, rdf: false, is_ancestor: false) ⇒ Object

#transform_xhtml(published: true) ⇒ Object

#triples_for ⇒ Object

#vocab_for(node) ⇒ Object

#write_to_target(published: true) ⇒ Array