Class: RDF::Tabular::Reader

Inherits:

Reader

Object
Reader
RDF::Tabular::Reader

show all

Includes:: Util::Logger

Defined in:: lib/rdf/tabular/reader.rb

Overview

A Tabular Data to RDF parser in Ruby.

Author:

[Gregg Kellogg](https://greggkellogg.net/)

Instance Attribute Summary collapse

#input ⇒ :read readonly

Input open to read.
#metadata ⇒ Metadata readonly

Metadata associated with the CSV.

Class Method Summary collapse

.options ⇒ Object

Reader options.

Instance Method Summary collapse

#each_statement(&block) ⇒ Object
#each_triple(&block) ⇒ Object
#initialize(input = $stdin, **options) {|reader| ... } ⇒ Reader constructor

Initializes the RDF::Tabular Reader instance.
#minimal? ⇒ Boolean
#prov? ⇒ Boolean
#to_hash(**options) ⇒ Hash, Array

Return a hash representation of the data for JSON serialization.
#to_json(options = @options) ⇒ String

Transform to JSON.
#validate! ⇒ Object

Do we have valid metadata?

Constructor Details

#initialize(input = $stdin, **options) {|reader| ... } ⇒ `Reader`

Initializes the RDF::Tabular Reader instance.

Parameters:

input (Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>, String) (defaults to: $stdin) —

An opened file (possibly containing JSON metadata), a String of tabular data, or an Array used as an internalized array of arrays
options (Hash{Symbol => Object}) —

any additional options (see `RDF::Reader#initialize`)

Options Hash (**options):

:decode_uri (Boolean) —

Decode %-encodings in the result of a URI Template operation.
:fks_referencing_table (Array<Hash>) —

When called with Table metadata, a list of the foreign keys referencing this table
:metadata (Metadata, Hash, String, RDF::URI) —

user-supplied metadata, merged on top of extracted metadata. If provided as a URL, the metadata is loaded from that location
:minimal (Boolean) —

includes only the information gleaned from the cells of the tabular data
:noProv (Boolean) —

do not output optional provenance information

Yields:

(reader) —

`self`

Yield Parameters:

reader (RDF::Reader)

Yield Returns:

(void) —

ignored

Raises:

(RDF::ReaderError) —

if the CSV document cannot be loaded

# File 'lib/rdf/tabular/reader.rb', line 75

# Initializes the RDF::Tabular Reader instance.
#
# After establishing a base (from the options, or from the input's
# `base_uri`, `path` or `filename`), the input is classified as one of:
#
# * JSON metadata (by file extension or content type),
# * an HTML document carrying `application/csvm+json` script blocks,
# * tabular data whose metadata was supplied by the caller
#   (`:no_found_metadata`), or
# * tabular data for which metadata is located via `Metadata.for_input`.
#
# @param input [Util::File::RemoteDoc, IO, StringIO, Array<Array<String>>, String]
#   an opened file, possibly JSON Metadata, or an Array used as an
#   internalized array of arrays
# @param options [Hash{Symbol => Object}]
#   any additional options (see `RDF::Reader#initialize`)
# @option options [Metadata, Hash, String, RDF::URI] :metadata
#   user supplied metadata
# @option options [Boolean] :minimal
#   includes only the information gleaned from the cells of the tabular data
# @option options [Boolean] :noProv
#   do not output optional provenance information
# @yield [reader] `self`
# @yieldparam reader [RDF::Reader]
# @yieldreturn [void] ignored
# @raise [RDF::ReaderError] if the CSV document cannot be loaded
def initialize(input = $stdin, **options, &block)
  super do
    # Base would be how we are to take this
    @options[:base] ||= base_uri.to_s if base_uri
    @options[:base] ||= input.base_uri if input.respond_to?(:base_uri)
    @options[:base] ||= input.path if input.respond_to?(:path)
    @options[:base] ||= input.filename if input.respond_to?(:filename)
    # A relative base naming an existing local file is promoted to a file: URI
    if RDF::URI(@options[:base]).relative? && File.exist?(@options[:base].to_s)
      file_uri = "file:" + File.expand_path(@options[:base])
      @options[:base] = RDF::URI(file_uri.to_s).normalize
    end

    log_debug("Reader#initialize") {"input: #{input.inspect}, base: #{@options[:base]}"}

    # Minimal implies noProv
    @options[:noProv] ||= @options[:minimal]

    # Strings and arrays of rows are wrapped so they can be read like an IO
    @input = case input
    when String then StringIO.new(input)
    when Array then StringIO.new(input.map {|r| r.join(",")}.join("\n"))
    else input
    end

    log_depth do
      # If input is JSON, then the input is the metadata
      content_type = @input.respond_to?(:content_type) ? @input.content_type : ""
      if @options[:base] =~ /\.json(?:ld)?$/ || content_type =~ %r(application/(csvm\+|ld\+)?json)
        @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
        # If @metadata is for a Table, turn it into a TableGroup
        @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
        @metadata.normalize!
        @input = @metadata
      elsif (@options[:base].to_s.end_with?(".html") ||
             # "application/xhtml+xml" is the registered XHTML type; the previously
             # listed "application/xhtml+html" is kept for backward compatibility.
             %w(text/html application/xhtml+xml application/xhtml+html).include?(content_type)) &&
            !RDF::URI(@options[:base].to_s).fragment
        # Lazy-load Nokogiri. `defined?` must be given the constant itself: a
        # Symbol literal is always "defined", which would skip the require.
        require 'nokogiri' unless defined?(::Nokogiri)
        doc = Nokogiri::HTML.parse(input)
        doc.xpath("//script[@type='application/csvm+json']/text()").each do |script|
          # Let the script node advertise a content type like a remote document
          def script.content_type; "application/csvm+json"; end
          log_debug("Reader#initialize") {"Process HTML script block"}
          @input = script
          @metadata = Metadata.new(@input, filenames: @options[:base], **@options)
          # If @metadata is for a Table, turn it into a TableGroup
          @metadata = @metadata.to_table_group if @metadata.is_a?(Table)
          @metadata.normalize!
          @input = @metadata
        end
      elsif @options[:no_found_metadata]
        # Extract embedded metadata and merge
        dialect_metadata = @options[:metadata] || Table.new({}, context: "http://www.w3.org/ns/csvw")
        dialect = dialect_metadata.dialect.dup

        # HTTP flags for setting header values
        # (inputs without headers/charset/content_type are tolerated via `rescue`)
        dialect.header = false if (input.headers.fetch(:content_type, '').split(';').include?('header=absent') rescue false)
        dialect.encoding = input.charset if (input.charset rescue nil)
        dialect.separator = "\t" if (input.content_type == "text/tsv" rescue nil)
        embed_options = @options.dup
        embed_options[:lang] = dialect_metadata.lang if dialect_metadata.lang
        embedded_metadata = dialect.embedded_metadata(input, @options[:metadata], **embed_options)

        # Supplied metadata with a schema wins, but must be compatible with
        # what is embedded in the file; otherwise use the embedded metadata.
        if (@metadata = @options[:metadata]) && @metadata.tableSchema
          @metadata.verify_compatible!(embedded_metadata)
        else
          @metadata = embedded_metadata.normalize!
        end

        lang = input.headers[:content_language] rescue nil
        lang = nil if lang.to_s.include?(',') # Not for multiple languages
        # Set language, if unset and provided
        @metadata.lang ||= lang if lang

        @metadata.dialect = dialect
      else
        # It's tabular data. Find metadata and proceed as if it was specified in the first place
        @options[:original_input] = @input unless @options[:metadata]
        @input = @metadata = Metadata.for_input(@input, **@options).normalize!
      end

      log_debug("Reader#initialize") {"input: #{input}, metadata: #{metadata.inspect}"}

      # Zero-arity blocks are evaluated in the reader's context; others receive the reader
      if block_given?
        case block.arity
          when 0 then instance_eval(&block)
          else block.call(self)
        end
      end
    end
  end
end

Instance Attribute Details

#input ⇒ `:read` (readonly)

Input open to read

Returns:

(:read)



21
22
23

# File 'lib/rdf/tabular/reader.rb', line 21

# Input open to read.
#
# @return [:read] the value held in @input
def input; @input; end

#metadata ⇒ `Metadata` (readonly)

Metadata associated with the CSV

Returns:

(Metadata)



16
17
18

# File 'lib/rdf/tabular/reader.rb', line 16

# Metadata associated with the CSV.
#
# @return [Metadata] the value held in @metadata
def metadata; @metadata; end

Class Method Details

.options ⇒ `Object`

Reader options

Instance Method Details

#each_statement(&block) ⇒ `Object`

See Also:

Reader#each_statement

# File 'lib/rdf/tabular/reader.rb', line 167

# Emits RDF statements for the input, handing each one to `add_statement`.
#
# When `input` is a Metadata instance, table-group level statements are
# emitted and every table is delegated to a nested Reader, which is driven
# with the same block. Otherwise `input` is read as the tabular data described
# by `metadata`, and table, row and cell level statements are emitted.
#
# @yield [statement] the block is stored in @callback; presumably it is
#   invoked by add_statement, which is not visible here (TODO confirm)
# @return [Enumerator, nil] `enum_for(:each_statement)`; the Metadata path
#   exits through a bare `return` when a block is given
# @raise [RDF::ReaderError] on the Metadata path when validating and errors
#   were logged, or whenever an IOError is raised during processing
# @see Reader#each_statement
def each_statement(&block)
  if block_given?
    @callback = block

    # Recorded up front; used for prov:startedAtTime below
    start_time = Time.now

    # Construct metadata from that passed from file open, along with information from the file.
    if input.is_a?(Metadata)
      log_debug("each_statement: metadata") {input.inspect}

      log_depth do
        begin
          # Validate metadata
          input.validate!

          # Use resolved @id of TableGroup, if available
          table_group = input.id || RDF::Node.new
          add_statement(0, table_group, RDF.type, CSVW.TableGroup) unless minimal?

          # Common Properties
          # (only keys containing ':' and the :notes key are treated as common properties)
          input.each do |key, value|
            next unless key.to_s.include?(':') || key == :notes
            input.common_properties(table_group, key, value) do |statement|
              add_statement(0, statement)
            end
          end unless minimal?

          # If we were originally given tabular data as input, simply use that, rather than opening the table URL. This allows buffered data to be used as input.
          # This case also handles found metadata that doesn't describe the input file
          if options[:original_input] && !input.describes_file?(options[:base_uri])
            table_resource = RDF::Node.new
            add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
            # Nested reader over the original input, using the first table's metadata
            Reader.new(options[:original_input], **options.merge(
                metadata: input.tables.first,
                base: input.tables.first.url,
                no_found_metadata: true,
                table_resource: table_resource,
            )) do |r|
              r.each_statement(&block)
            end
          else
            input.each_table do |table|
              # If validating, continue on to process value restrictions
              next if table.suppressOutput && !validate?

              # Foreign Keys referencing this table
              fks = input.tables.map do |t|
                t.tableSchema && t.tableSchema.foreign_keys_referencing(table)
              end.flatten.compact
              table_resource = table.id || RDF::Node.new
              add_statement(0, table_group, CSVW.table, table_resource) unless minimal?
              # Nested reader opened on the table's own URL
              Reader.open(table.url, **options.merge(
                  metadata: table,
                  base: table.url,
                  no_found_metadata: true,
                  table_resource: table_resource,
                  fks_referencing_table: fks,
              )) do |r|
                r.each_statement(&block)
              end
            end

            # Lastly, if validating, validate foreign key integrity
            validate_foreign_keys(input) if validate?
          end

          # Provenance
          if prov?
            activity = RDF::Node.new
            add_statement(0, table_group, RDF::Vocab::PROV.wasGeneratedBy, activity)
            add_statement(0, activity, RDF.type, RDF::Vocab::PROV.Activity)
            add_statement(0, activity, RDF::Vocab::PROV.wasAssociatedWith, RDF::URI("https://rubygems.org/gems/rdf-tabular"))
            add_statement(0, activity, RDF::Vocab::PROV.startedAtTime, RDF::Literal::DateTime.new(start_time))
            add_statement(0, activity, RDF::Vocab::PROV.endedAtTime, RDF::Literal::DateTime.new(Time.now))

            # One usage node listing the table URLs, with the tabular-data role
            unless (urls = input.tables.map(&:url)).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
              urls.each do |url|
                add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(url))
              end
              add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.csvEncodedTabularData)
            end

            # A second usage node listing the metadata filenames, with the metadata role
            unless Array(input.filenames).empty?
              usage = RDF::Node.new
              add_statement(0, activity, RDF::Vocab::PROV.qualifiedUsage, usage)
              add_statement(0, usage, RDF.type, RDF::Vocab::PROV.Usage)
              Array(input.filenames).each do |fn|
                add_statement(0, usage, RDF::Vocab::PROV.entity, RDF::URI(fn))
              end
              add_statement(0, usage, RDF::Vocab::PROV.hadRole, CSVW.tabularMetadata)
            end
          end
        end
      end

      if validate? && log_statistics[:error]
        raise RDF::ReaderError, "Errors found during processing"
      end
      # The Metadata path is complete; skip the single-table processing below
      return
    end

    # Output Table-Level RDF triples
    # (a resource supplied by a parent reader through :table_resource takes precedence)
    table_resource = options.fetch(:table_resource, (metadata.id || RDF::Node.new))
    unless minimal? || metadata.suppressOutput
      add_statement(0, table_resource, RDF.type, CSVW.Table)
      add_statement(0, table_resource, CSVW.url, RDF::URI(metadata.url))
    end

    # Input is file containing CSV data.
    # Output ROW-Level statements
    last_row_num = 0
    primary_keys = []
    metadata.each_row(input) do |row|
      if row.is_a?(RDF::Statement)
        # May add additional comments
        row.subject = table_resource
        add_statement(last_row_num + 1, row) unless metadata.suppressOutput
        next
      else
        last_row_num = row.sourceNumber
      end

      # Collect primary and foreign keys if validating
      if validate?
        primary_keys << row.primaryKey
        collect_foreign_key_references(metadata, options[:fks_referencing_table], row)
      end

      # Keys were collected above, so validation still sees suppressed tables
      next if metadata.suppressOutput

      # Output row-level metadata
      row_resource = RDF::Node.new
      # Shared subject for every cell in this row that has no aboutUrl
      default_cell_subject = RDF::Node.new
      unless minimal?
        add_statement(row.sourceNumber, table_resource, CSVW.row, row_resource)
        add_statement(row.sourceNumber, row_resource, CSVW.rownum, row.number)
        add_statement(row.sourceNumber, row_resource, RDF.type, CSVW.Row)
        add_statement(row.sourceNumber, row_resource, CSVW.url, row.id)
        row.titles.each do |t|
          add_statement(row.sourceNumber, row_resource, CSVW.title, t)
        end
      end
      row.values.each_with_index do |cell, index|
        # Collect cell errors
        # (logged as errors when validating, as warnings otherwise)
        unless Array(cell.errors).empty?
          self.send((validate? ? :log_error : :log_warn),
                   "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber})") do
            cell.errors.join("\n")
          end
        end
        next if cell.column.suppressOutput # Skip ignored cells
        cell_subject = cell.aboutUrl || default_cell_subject
        propertyUrl = cell.propertyUrl || begin
          # It's possible that the metadata URL already has a fragment, in which case we need to override it.
          u = metadata.url.dup
          u.fragment = cell.column.name
          u
        end
        add_statement(row.sourceNumber, row_resource, CSVW.describes, cell_subject) unless minimal?

        if cell.column.valueUrl
          # Columns with a valueUrl emit only that URL, and only when it resolved for this cell
          add_statement(row.sourceNumber, cell_subject, propertyUrl, cell.valueUrl) if cell.valueUrl
        elsif cell.column.ordered && cell.column.separator
          # Ordered, separated values become an RDF list
          list = RDF::List[*Array(cell.value)]
          add_statement(row.sourceNumber, cell_subject, propertyUrl, list.subject)
          list.each_statement do |statement|
            # The explicit rdf:List type statement is not emitted
            next if statement.predicate == RDF.type && statement.object == RDF.List
            add_statement(row.sourceNumber, statement.subject, statement.predicate, statement.object)
          end
        else
          # One statement per value
          Array(cell.value).each do |v|
            add_statement(row.sourceNumber, cell_subject, propertyUrl, v)
          end
        end
      end
    end

    # Validate primary keys
    validate_primary_keys(metadata, primary_keys) if validate?

    # Common Properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      metadata.common_properties(table_resource, key, value) do |statement|
        add_statement(0, statement)
      end
    end unless minimal?
  end
  # Reached with or without a block (except for the early `return` above)
  enum_for(:each_statement)
rescue IOError => e
  # Surface I/O failures as reader errors, keeping the original backtrace
  raise RDF::ReaderError, e.message, e.backtrace
end

#each_triple(&block) ⇒ `Object`

#minimal? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/rdf/tabular/reader.rb', line 639

# Whether the reader was asked (via the :minimal option) to include only the
# information gleaned from the cells of the tabular data.
#
# @return [Boolean] strictly true or false, even when the option is unset
def minimal?
  !!@options[:minimal]
end

#prov? ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/rdf/tabular/reader.rb', line 640

# Whether optional provenance information should be output; this is the
# negation of the :noProv option.
#
# @return [Boolean]
def prov?
  !@options[:noProv]
end

#to_hash(**options) ⇒ `Hash`, `Array`

Return a hash representation of the data for JSON serialization

Produces an array if run in minimal mode.

Parameters:

options (Hash{Symbol => Object})

Returns:

(Hash, Array)

# File 'lib/rdf/tabular/reader.rb', line 456

# Return a hash representation of the data for JSON serialization.
#
# Produces an array if run in minimal mode.
#
# When `input` is a Metadata instance, each table is converted by a nested
# Reader and collected under the "tables" key of a table-group hash.
# Otherwise `input` is read as the tabular data described by `metadata` and a
# single table hash (or, in minimal mode, its array of row objects) results.
#
# @param options [Hash{Symbol => Object}]
#   options merged into those given to nested readers; `:original_input`,
#   `:base_uri` and `:fks_referencing_table` are read here
# @return [Hash, Array]
# @raise [RuntimeError] if a value URL recorded for nesting is no longer
#   present under its referencing property
def to_hash(**options)
  # Construct metadata from that passed from file open, along with information from the file.
  if input.is_a?(Metadata)
    log_debug("each_statement: metadata") {input.inspect}
    log_depth do
      # Get Metadata to invoke and open referenced files
      begin
        # Validate metadata
        input.validate!

        tables = []
        table_group = {}
        table_group['@id'] = input.id.to_s if input.id

        # Common Properties
        input.each do |key, value|
          next unless key.to_s.include?(':') || key == :notes
          table_group[key] = input.common_properties(nil, key, value)
          table_group[key] = [table_group[key]] if key == :notes && !table_group[key].is_a?(Array)
        end

        table_group['tables'] = tables

        if options[:original_input] && !input.describes_file?(options[:base_uri])
          Reader.new(options[:original_input], **options.merge(
              metadata:           input.tables.first,
              base:               input.tables.first.url,
              minimal:            minimal?,
              no_found_metadata:  true,
          )) do |r|
            # Append in place (rather than `tables += t`) so that `tables` and
            # table_group['tables'] keep referring to the same array.
            case t = r.to_hash(**options)
            when Array then tables.concat(t) unless input.tables.first.suppressOutput
            when Hash  then tables << t unless input.tables.first.suppressOutput
            end
          end
        else
          input.each_table do |table|
            next if table.suppressOutput && !validate?
            Reader.open(table.url, **options.merge(
              metadata:           table,
              base:               table.url,
              minimal:            minimal?,
              no_found_metadata:  true,
            )) do |r|
              # The table is still read when validating, but its output is dropped if suppressed
              case t = r.to_hash(**options)
              when Array then tables.concat(t) unless table.suppressOutput
              when Hash  then tables << t unless table.suppressOutput
              end
            end
          end
        end

        # Lastly, if validating, validate foreign key integrity
        validate_foreign_keys(input) if validate?

        # Result is table_group or array
        minimal? ? tables : table_group
      end
    end
  else
    rows = []
    table = {}
    table['@id'] = metadata.id.to_s if metadata.id
    table['url'] = metadata.url.to_s

    table.merge!("row" => rows)

    # Input is file containing CSV data.
    # Output ROW-Level statements
    primary_keys = []
    metadata.each_row(input) do |row|
      if row.is_a?(RDF::Statement)
        # May add additional comments
        table['rdfs:comment'] ||= []
        table['rdfs:comment'] << row.object.to_s
        next
      end

      # Collect primary and foreign keys if validating
      if validate?
        primary_keys << row.primaryKey
        collect_foreign_key_references(metadata, options[:fks_referencing_table], row)
      end

      # Output row-level metadata
      # r: the row object; a: described objects keyed by subject;
      # values: bookkeeping of value URLs that may be nested under their referrer
      r, a, values = {}, {}, {}
      r["url"] = row.id.to_s
      r["rownum"] = row.number

      # Row titles
      Array(row.titles).each { |t| merge_compacted_value(r, "titles", t.to_s) unless t.nil?}

      row.values.each_with_index do |cell, index|
        column = metadata.tableSchema.columns[index]

        # Collect cell errors
        unless Array(cell.errors).empty?
          self.send(validate? ? :log_error : :log_warn,
            "Table #{metadata.url} row #{row.number}(src #{row.sourceNumber}, col #{cell.column.sourceNumber}): ") do
            cell.errors.join("\n")
          end
        end

        # Ignore suppressed columns
        next if column.suppressOutput

        # Skip valueUrl cells where the valueUrl is null
        next if cell.column.valueUrl && cell.valueUrl.nil?

        # Skip empty sequences
        next if !cell.column.valueUrl && cell.value.is_a?(Array) && cell.value.empty?

        # Cells without an aboutUrl share the 'null' subject, which gets no @id
        subject = cell.aboutUrl || 'null'
        co = (a[subject.to_s] ||= {})
        co['@id'] = subject.to_s unless subject == 'null'
        prop = case cell.propertyUrl
        when RDF.type then '@type'
        when nil then CGI.unescape(column.name) # Use URI-decoded name
        else
          # Compact the property to a term or prefixed name
          metadata.context.compact_iri(cell.propertyUrl, vocab: true)
        end

        value = case
        when prop == '@type'
          metadata.context.compact_iri(cell.valueUrl || cell.value, vocab: true)
        when cell.valueUrl
          # Remember who references this URL, and how often, for the nesting pass below
          unless subject == cell.valueUrl
            values[cell.valueUrl.to_s] ||= {o: co, prop: prop, count: 0}
            values[cell.valueUrl.to_s][:count] += 1
          end
          cell.valueUrl.to_s
        when cell.value.is_a?(RDF::Literal::Double)
          # NaN and infinities stay as literals; other doubles use the native value
          cell.value.object.nan? || cell.value.object.infinite? ? cell.value : cell.value.object
        when cell.value.is_a?(RDF::Literal::Integer)
          cell.value.object.to_i
        when cell.value.is_a?(RDF::Literal::Numeric)
          cell.value.object.to_f
        when cell.value.is_a?(RDF::Literal::Boolean)
          cell.value.object
        when cell.value
          cell.value
        end

        # Add or merge value
        merge_compacted_value(co, prop, value) unless value.nil?
      end

      # Check for nesting
      # An object referenced exactly once by a value URL is moved inside its referrer
      values.keys.each do |valueUrl|
        next unless a.has_key?(valueUrl)
        ref = values[valueUrl]
        co = ref[:o]
        prop = ref[:prop]
        next if ref[:count] != 1
        # Report the referrer's current value; `co` is ref[:o] (this previously
        # read an undefined local and so raised NameError instead of this message).
        raise "Expected #{co[prop].inspect} to include #{valueUrl.inspect}" unless Array(co[prop]).include?(valueUrl)
        co[prop] = Array(co[prop]).map {|e| e == valueUrl ? a.delete(valueUrl) : e}
        co[prop] = co[prop].first if co[prop].length == 1
      end

      r["describes"] = a.values

      # Minimal mode flattens the described objects straight into the row list
      if minimal?
        rows.concat(r["describes"])
      else
        rows << r
      end
    end

    # Validate primary keys
    validate_primary_keys(metadata, primary_keys) if validate?

    # Use string values notes and common properties
    metadata.each do |key, value|
      next unless key.to_s.include?(':') || key == :notes
      table[key] = metadata.common_properties(nil, key, value)
      table[key] = [table[key]] if key == :notes && !table[key].is_a?(Array)
    end unless minimal?

    minimal? ? table["row"] : table
  end
end

#to_json(options = @options) ⇒ `String`

Transform to JSON. Note that this must be run from within the reader context if the input is an open IO stream.

Examples:

outputting annotated CSV as JSON

# Capture the JSON inside the block: to_json must be run from within the
# reader context when the input is an open IO stream.
result = nil
RDF::Tabular::Reader.open("etc/doap.csv") do |reader|
  result = reader.to_json
end
result #=> {...}

outputting annotated CSV as JSON from an in-memory structure

# The gsub strips leading whitespace from every line (including the initial
# newline), so the string starts directly with the header row.
csv = %(
  GID,On Street,Species,Trim Cycle,Inventory Date
  1,ADDISON AV,Celtis australis,Large Tree Routine Prune,10/18/2010
  2,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
  3,EMERSON ST,Liquidambar styraciflua,Large Tree Routine Prune,6/2/2010
).gsub(/^\s+/, '')
# A String can be passed directly as the reader input
r = RDF::Tabular::Reader.new(csv)
r.to_json #=> {...}

Parameters:

options (Hash{Symbol => Object}) (defaults to: @options) —

may also be a JSON state

Options Hash (options):

io (IO, StringIO) —

to output to file
:state (::JSON::State) —

used when dumping
:atd (Boolean) —

output Abstract Table representation instead

Returns:

(String)

Raises:

(RDF::Tabular::Error)

# File 'lib/rdf/tabular/reader.rb', line 411

# Transform to JSON. Note that this must be run from within the reader
# context if the input is an open IO stream.
#
# @param options [Hash{Symbol => Object}, IO, StringIO, ::JSON::State]
#   an options hash, an output stream, or a JSON generator state
# @option options [IO, StringIO] :io stream the JSON is dumped to
# @option options [::JSON::State] :state used when dumping
# @return [String] the generated JSON (or the result of ::JSON.dump when an
#   output stream is used)
# @raise [RDF::Tabular::Error] when validating and errors were logged, or on IOError
def to_json(options = @options)
  # Where to write: the argument itself if it is a stream, else its :io entry
  io =
    if options.is_a?(IO) || options.is_a?(StringIO)
      options
    elsif options.is_a?(Hash)
      options[:io]
    end

  # Generator state: explicit :state, a hash carrying :indent, a state object
  # passed directly, or the JSON-LD default
  json_state =
    if options.is_a?(Hash)
      if options.has_key?(:state)
        options[:state]
      elsif options.has_key?(:indent)
        options
      else
        ::JSON::LD::JSON_STATE
      end
    elsif options.is_a?(::JSON::State) ||
          options.is_a?(::JSON::Ext::Generator::State) ||
          options.is_a?(::JSON::Pure::Generator::State)
      options
    else
      ::JSON::LD::JSON_STATE
    end

  # Anything that is not a hash contributes no further options
  hash_options = options.is_a?(Hash) ? options : {}
  hash_options = hash_options.merge(noProv: @options[:noProv])

  json =
    if io
      ::JSON::dump_default_options = json_state
      ::JSON.dump(to_hash(**hash_options), io)
    else
      ::JSON.generate(to_hash(**hash_options), json_state)
    end

  raise RDF::Tabular::Error, "Errors found during processing" if validate? && log_statistics[:error]

  json
rescue IOError => e
  raise RDF::Tabular::Error, e.message
end

#validate! ⇒ `Object`

Do we have valid metadata?

Raises:

(RDF::ReaderError)

# File 'lib/rdf/tabular/reader.rb', line 378

# Do we have valid metadata?
#
# Switches the :validate option on and runs each_statement with an empty
# block, discarding the statements.
#
# @return [Object] whatever each_statement returns
# @raise [Error] carrying the message of any RDF::ReaderError raised while processing
def validate!
  @options[:validate] = true
  begin
    each_statement {}
  rescue RDF::ReaderError => reader_error
    raise Error, reader_error.message
  end
end

Class: RDF::Tabular::Reader

Overview

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(input = $stdin, **options) {|reader| ... } ⇒ Reader

Instance Attribute Details

#input ⇒ :read (readonly)

#metadata ⇒ Metadata (readonly)

Class Method Details

.options ⇒ Object

Instance Method Details

#each_statement(&block) ⇒ Object

#each_triple(&block) ⇒ Object

#minimal? ⇒ Boolean

#prov? ⇒ Boolean

#to_hash(**options) ⇒ Hash, Array

#to_json(options = @options) ⇒ String

Examples:

outputting annotated CSV as JSON

outputting annotated CSV as JSON from an in-memory structure

#validate! ⇒ Object

#initialize(input = $stdin, **options) {|reader| ... } ⇒ `Reader`

#input ⇒ `:read` (readonly)

#metadata ⇒ `Metadata` (readonly)

.options ⇒ `Object`

#each_statement(&block) ⇒ `Object`

#each_triple(&block) ⇒ `Object`

#minimal? ⇒ `Boolean`

#prov? ⇒ `Boolean`

#to_hash(**options) ⇒ `Hash`, `Array`

#to_json(options = @options) ⇒ `String`

#validate! ⇒ `Object`