Module: Bolognese::Readers::DataciteReader

Included in: MetadataUtils

Defined in: lib/bolognese/readers/datacite_reader.rb

Instance Method Summary collapse

Instance Method Details

#get_datacite(id: nil, **options) ⇒ `Object`

# File 'lib/bolognese/readers/datacite_reader.rb', line 6

# Fetch a record from the URL built by doi_api_url and return its XML
# together with a few registration attributes.
#
# @param id [String, nil] identifier passed to doi_api_url
# @param options [Hash] forwarded unchanged to doi_api_url
# @return [Hash] "string" (decoded, re-indented XML or the raw attribute value
#   when it is blank), "url", "state", "date_registered", "date_updated",
#   "provider_id", "client_id" and "content_url"; only "string" => nil and
#   "state" => "not_found" when id or the response attributes are missing
def get_datacite(id: nil, **options)
  not_found = { "string" => nil, "state" => "not_found" }
  return not_found unless id.present?

  body = Maremma.get(doi_api_url(id, options)).body
  attributes = body.dig("data", "attributes")
  return not_found unless attributes.present?

  # the xml attribute is Base64-encoded; a blank value is passed through as is
  xml = attributes.fetch("xml", nil)
  xml = Base64.decode64(xml) if xml.present?

  if xml.present?
    doc = Nokogiri::XML(xml, nil, "UTF-8", &:noblanks)

    # trim text nodes and drop the ones that hold only whitespace
    doc.xpath("//text()").each do |text_node|
      text = text_node.content
      if /\S/.match?(text)
        text_node.content = text.strip
      else
        text_node.remove
      end
    end

    xml = doc.to_xml(indent: 2)
  end

  # side-loaded resources: the owning client and any media entries
  included = Array.wrap(body.fetch("included", nil))

  client = included.find { |item| item["type"] == "clients" }.to_h
  provider_relationship = Array.wrap(client.fetch("relationships", nil)).find do |relationship|
    relationship["provider"].present?
  end

  # an explicit contentUrl attribute wins over the media urls
  content_url = attributes.fetch("contentUrl", nil)
  content_url ||= included
                  .select { |item| item["type"] == "media" }
                  .map { |item| item.dig("attributes", "url") }
                  .compact

  {
    "string" => xml,
    "url" => attributes.fetch("url", nil),
    "state" => attributes.fetch("state", nil),
    "date_registered" => attributes.fetch("registered", nil),
    "date_updated" => attributes.fetch("updated", nil),
    "provider_id" => provider_relationship.to_h.dig("provider", "data", "id"),
    "client_id" => client.fetch("id", nil),
    "content_url" => content_url
  }
end

#get_titles(meta) ⇒ `Object`

# File 'lib/bolognese/readers/datacite_reader.rb', line 348

# Collect the titles found under meta["titles"]["title"].
#
# @param meta [Hash] parsed record (or related item) that may hold a "titles" key
# @return [Array<Hash>] one hash per non-blank title with a "title" key and,
#   when the source entry carries them, "titleType" and "lang"
def get_titles(meta)
  Array.wrap(meta.dig("titles", "title")).reject(&:blank?).map do |entry|
    case entry
    when String
      # plain string: no attributes to carry over
      { "title" => sanitize(entry) }
    else
      {
        "title" => sanitize(entry["__content__"]),
        "titleType" => entry["titleType"],
        "lang" => entry["lang"]
      }.compact
    end
  end
end

#read_datacite(string: nil, **options) ⇒ `Object`

# File 'lib/bolognese/readers/datacite_reader.rb', line 49

# Parse a DataCite XML record into a metadata hash.
#
# @param string [String, nil] the XML document to parse
# @param options [Hash] :doi, :id, :url, :sandbox, :validate and :ra are control
#   options; every other option is merged over the parsed result, and their
#   presence also forces schema kernel-4 and the "findable" state
# @return [Hash] the parsed metadata, or { "errors" => ... } when
#   options[:validate] is set and datacite_errors reports errors
def read_datacite(string: nil, **options)
  read_options = ActiveSupport::HashWithIndifferentAccess.new(options.except(:doi, :id, :url, :sandbox, :validate, :ra))

  doc = Nokogiri::XML(string, nil, 'UTF-8', &:noblanks)
  if read_options.present?
    schema_version = "http://datacite.org/schema/kernel-4"
  else
    # take the schema version from the document's own namespace, default to kernel-4
    ns = doc.collect_namespaces.find { |_k, v| v.start_with?("http://datacite.org/schema/kernel") }
    schema_version = Array.wrap(ns).last || "http://datacite.org/schema/kernel-4"
  end
  doc.remove_namespaces!
  string = doc.to_xml(:indent => 2)

  meta = Maremma.from_xml(string).to_h.fetch("resource", {})

  # validate only when option is set, as this step is expensive and
  # not needed if XML comes from DataCite MDS
  if options[:validate]
    errors = datacite_errors(xml: string, schema_version: schema_version)
    return { "errors" => errors } if errors.present?
  end

  # an explicit :doi option wins over the identifier in the XML, which wins over :id
  if options[:doi]
    id = normalize_doi(options[:doi], sandbox: options[:sandbox])
  else
    id = normalize_doi(meta.dig("identifier", "__content__") || options[:id], sandbox: options[:sandbox])
  end

  # nil entries are dropped before indexing into them, as fundingReferences does below
  identifiers = Array.wrap(meta.dig("alternateIdentifiers", "alternateIdentifier")).compact.map do |r|
    if r["__content__"].present?
      { "identifierType" => get_identifier_type(r["alternateIdentifierType"]), "identifier" => r["__content__"] }
    end
  end.compact

  resource_type_general = meta.dig("resourceType", "resourceTypeGeneral")
  resource_type = meta.dig("resourceType", "__content__")
  schema_org = Bolognese::Utils::CR_TO_SO_TRANSLATIONS[resource_type.to_s.underscore.camelcase] || Bolognese::Utils::DC_TO_SO_TRANSLATIONS[resource_type_general.to_s.dasherize] || "CreativeWork"
  types = {
    "resourceTypeGeneral" => resource_type_general,
    "resourceType" => resource_type,
    "schemaOrg" => schema_org,
    "citeproc" => Bolognese::Utils::CR_TO_CP_TRANSLATIONS[resource_type.to_s.underscore.camelcase] || Bolognese::Utils::SO_TO_CP_TRANSLATIONS[schema_org] || "article",
    "bibtex" => Bolognese::Utils::CR_TO_BIB_TRANSLATIONS[resource_type.to_s.underscore.camelcase] || Bolognese::Utils::SO_TO_BIB_TRANSLATIONS[schema_org] || "misc",
    "ris" => Bolognese::Utils::CR_TO_RIS_TRANSLATIONS[resource_type.to_s.underscore.camelcase] || Bolognese::Utils::DC_TO_RIS_TRANSLATIONS[resource_type_general.to_s.dasherize] || "GEN"
  }.compact

  titles = get_titles(meta)

  # only the first non-blank publisher is kept
  publisher = Array.wrap(meta.dig("publisher")).map do |r|
    if r.blank?
      nil
    elsif r.is_a?(String)
      { "name" => r.strip }
    elsif r.is_a?(Hash)
      {
        "name" => r["__content__"].present? ? r["__content__"].strip : nil,
        "publisherIdentifier" => r["publisherIdentifierScheme"] == "ROR" ? normalize_ror(r["publisherIdentifier"]) : r["publisherIdentifier"],
        "publisherIdentifierScheme" => r["publisherIdentifierScheme"],
        "schemeUri" => r["schemeURI"],
        "lang" => r["lang"],
      }.compact
    end
  end.compact.first

  descriptions = Array.wrap(meta.dig("descriptions", "description")).map do |r|
    if r.blank?
      nil
    elsif r.is_a?(String)
      # a description without attributes is treated as an abstract
      { "description" => sanitize(r, new_line: true), "descriptionType" => "Abstract" }
    elsif r.is_a?(Hash)
      { "description" => sanitize(r["__content__"], new_line: true), "descriptionType" => r["descriptionType"], "lang" => r["lang"] }.compact
    end
  end.compact

  rights_list = Array.wrap(meta.dig("rightsList", "rights")).map do |r|
    if r.blank?
      nil
    elsif r.is_a?(String)
      name_to_spdx(r)
    elsif r.is_a?(Hash)
      hsh_to_spdx(r)
    end
  end.compact

  subjects = Array.wrap(meta.dig("subjects", "subject")).reduce([]) do |sum, subject|
    if subject.is_a?(String)
      sum += name_to_fos(subject)
    elsif subject.is_a?(Hash)
      sum += hsh_to_fos(subject)
    end

    sum
  end.uniq

  # keep only dates that parse as EDTF or are a known "unknown information" code
  dates = Array.wrap(meta.dig("dates", "date")).map do |r|
    if r.is_a?(Hash) && date = sanitize(r["__content__"]).presence
      if Date.edtf(date).present? || Bolognese::Utils::UNKNOWN_INFORMATION.key?(date)
        { "date" => date,
          "dateType" => parse_attributes(r, content: "dateType"),
          "dateInformation" => parse_attributes(r, content: "dateInformation")
        }.compact
      end
    end
  end.compact
  # fall back to publicationYear when no Issued date was given
  dates << { "date" => meta.fetch("publicationYear", nil), "dateType" => "Issued" } if meta.fetch("publicationYear", nil).present? && get_date(dates, "Issued").blank?

  sizes = Array.wrap(meta.dig("sizes", "size")).map do |k|
    if k.blank?
      nil
    elsif k.is_a?(String)
      sanitize(k).presence
    elsif k.is_a?(Hash)
      sanitize(k["__content__"]).presence
    end
  end.compact

  formats = Array.wrap(meta.dig("formats", "format")).map do |k|
    if k.blank?
      nil
    elsif k.is_a?(String)
      sanitize(k).presence
    elsif k.is_a?(Hash)
      sanitize(k["__content__"]).presence
    end
  end.compact.map { |s| s.to_s.squish.presence }.compact

  funding_references = Array.wrap(meta.dig("fundingReferences", "fundingReference")).compact.map do |fr|
    scheme_uri = parse_attributes(fr["funderIdentifier"], content: "schemeURI")
    funder_identifier = parse_attributes(fr["funderIdentifier"])
    funder_identifier_type = parse_attributes(fr["funderIdentifier"], content: "funderIdentifierType")

    if funder_identifier_type == "Crossref Funder ID"
      funder_identifier = validate_funder_doi(funder_identifier)
    elsif funder_identifier_type == "ROR"
      funder_identifier = normalize_ror(funder_identifier)
      scheme_uri = "https://ror.org"
    else
      # keep the raw value when it cannot be normalized
      funder_identifier = normalize_id(funder_identifier) || funder_identifier
    end

    {
      "funderName" => fr["funderName"],
      "funderIdentifier" => funder_identifier,
      "funderIdentifierType" => funder_identifier_type,
      "schemeUri" => scheme_uri,
      "awardNumber" => parse_attributes(fr["awardNumber"]),
      "awardUri" => parse_attributes(fr["awardNumber"], content: "awardURI"),
      "awardTitle" => fr["awardTitle"] }.compact
  end

  # nil entries are dropped so indexing below cannot raise NoMethodError
  related_identifiers = Array.wrap(meta.dig("relatedIdentifiers", "relatedIdentifier")).compact.map do |ri|
    if ri["relatedIdentifierType"] == "DOI"
      rid = validate_doi(ri["__content__"].to_s.downcase)
    else
      rid = ri["__content__"]
    end

    {
      "relatedIdentifier" => rid,
      "relatedIdentifierType" => ri["relatedIdentifierType"],
      "relationType" => ri["relationType"],
      "resourceTypeGeneral" => ri["resourceTypeGeneral"],
      "relatedMetadataScheme" => ri["relatedMetadataScheme"],
      "schemeUri" => ri["schemeURI"],
      "schemeType" => ri["schemeType"]
    }.compact
  end

  # nil entries are dropped so indexing below cannot raise NoMethodError
  related_items = Array.wrap(meta.dig("relatedItems", "relatedItem")).compact.map do |ri|
    rii = ri["relatedItemIdentifier"]
    related_item_identifier = nil
    if rii
      if rii["relatedItemIdentifierType"] == "DOI"
        rid = validate_doi(rii["__content__"].to_s.downcase)
      else
        rid = rii["__content__"]
      end

      related_item_identifier = {
        "relatedItemIdentifier" => rid,
        "relatedItemIdentifierType" => rii["relatedItemIdentifierType"],
        "relatedMetadataScheme" => rii["relatedMetadataScheme"],
        "schemeURI" => rii["schemeURI"],
        "schemeType" => rii["schemeType"]
      }.compact
    end

    # number is either a plain string or an element carrying a numberType attribute
    number = ri["number"]
    if number.is_a?(String)
      number_type = nil
    else
      number = ri.dig("number", "__content__")
      number_type = ri.dig("number", "numberType")
    end

    {
      "relationType" => ri["relationType"],
      "relatedItemType" => ri["relatedItemType"],
      "relatedItemIdentifier" => related_item_identifier,
      "creators" => get_authors(Array.wrap(ri.dig("creators", "creator"))),
      "titles" => get_titles(ri),
      "publicationYear" => ri["publicationYear"],
      "volume" => parse_attributes(ri["volume"]),
      "issue" => parse_attributes(ri["issue"]),
      "number" => number,
      "numberType" => number_type,
      "firstPage" => parse_attributes(ri["firstPage"]),
      "lastPage" => parse_attributes(ri["lastPage"]),
      "publisher" => parse_attributes(ri["publisher"]),
      "edition" => parse_attributes(ri["edition"]),
      "contributors" => get_authors(Array.wrap(ri.dig("contributors", "contributor"))),
    }.compact
  end

  geo_locations = Array.wrap(meta.dig("geoLocations", "geoLocation")).map do |gl|
    # skip entries whose point, box or polygon is a bare string instead of a structure
    if !gl.is_a?(Hash) || gl["geoLocationPoint"].is_a?(String) || gl["geoLocationBox"].is_a?(String) || gl["geoLocationPolygon"].is_a?(String)
      nil
    else

      # Handle scenario where multiple geoLocationPolygons are allowed within a single geoLocation
      # we want to return an array if it's already an array (i.e. multiple geoLocationPolygons)
      # vs if it's singular just return the object
      # This is for backwards compatibility to allow both scenarios.
      if gl.dig("geoLocationPolygon").kind_of?(Array)
        geo_location_polygon = gl.dig("geoLocationPolygon").map do |glp|
          Array.wrap(glp.dig("polygonPoint")).map { |glpp| { "polygonPoint" => glpp } }.compact.presence
        end.compact.presence
      else
        geo_location_polygon = Array.wrap(gl.dig("geoLocationPolygon", "polygonPoint")).map { |glp| { "polygonPoint" => glp } }.compact.presence
      end

      {
        "geoLocationPoint" => {
          "pointLatitude" => gl.dig("geoLocationPoint", "pointLatitude"),
          "pointLongitude" => gl.dig("geoLocationPoint", "pointLongitude")
        }.compact.presence,
        "geoLocationBox" => {
          "westBoundLongitude" => gl.dig("geoLocationBox", "westBoundLongitude"),
          "eastBoundLongitude" => gl.dig("geoLocationBox", "eastBoundLongitude"),
          "southBoundLatitude" => gl.dig("geoLocationBox", "southBoundLatitude"),
          "northBoundLatitude" => gl.dig("geoLocationBox", "northBoundLatitude")
        }.compact.presence,
        "geoLocationPolygon" => geo_location_polygon,
        "geoLocationPlace" => parse_attributes(gl["geoLocationPlace"], first: true).to_s.strip.presence
      }.compact
    end
  end.compact

  state = id.present? || read_options.present? ? "findable" : "not_found"

  # caller-supplied read_options override the parsed values
  { "id" => id,
    "types" => types,
    "doi" => doi_from_url(id),
    "identifiers" => identifiers,
    "url" => options.fetch(:url, nil).to_s.strip.presence,
    "titles" => titles,
    "creators" => get_authors(Array.wrap(meta.dig("creators", "creator"))),
    "contributors" => get_authors(Array.wrap(meta.dig("contributors", "contributor"))),
    "container" => set_container(meta),
    "publisher" => publisher,
    "agency" => "datacite",
    "funding_references" => funding_references,
    "dates" => dates,
    "publication_year" => parse_attributes(meta.fetch("publicationYear", nil), first: true).to_s.strip.presence,
    "descriptions" => descriptions,
    "rights_list" => Array.wrap(rights_list),
    "version_info" => meta.fetch("version", nil).to_s.presence,
    "subjects" => subjects,
    "language" => parse_attributes(meta.fetch("language", nil), first: true).to_s.strip.presence,
    "geo_locations" => geo_locations,
    "related_identifiers" => related_identifiers,
    "related_items" => related_items,
    "formats" => formats,
    "sizes" => sizes,
    "schema_version" => schema_version,
    "state" => state
  }.merge(read_options)
end

#set_container(meta) ⇒ `Object`

# File 'lib/bolognese/readers/datacite_reader.rb', line 326

# Build the container (series or repository) description for a record from
# its SeriesInformation description and its IsPartOf related identifier.
#
# @param meta [Hash] parsed record; reads "descriptions", "relatedIdentifiers"
#   and "resourceType"
# @return [Hash] container hash with nil values removed, or {} when there is
#   neither a series title nor an IsPartOf related identifier
def set_container(meta)
  # only hash entries carry attributes; the type guard also skips nil entries,
  # which would otherwise raise NoMethodError when indexed
  series_description = Array.wrap(meta.dig("descriptions", "description")).find do |r|
    r.is_a?(Hash) && r["descriptionType"] == "SeriesInformation"
  end
  si = get_series_information(series_description.to_h.fetch("__content__", nil))

  is_part_of = Array.wrap(meta.dig("relatedIdentifiers", "relatedIdentifier")).find do |ri|
    ri.is_a?(Hash) && ri["relationType"] == "IsPartOf"
  end.to_h

  return {} unless si["title"].present? || is_part_of.present?

  {
    "type" => meta.dig("resourceType", "resourceTypeGeneral") == "Dataset" ? "DataRepository" : "Series",
    "identifier" => is_part_of["__content__"],
    "identifierType" => is_part_of["relatedIdentifierType"],
    "title" => si["title"],
    "volume" => si["volume"],
    "issue" => si["issue"],
    "firstPage" => si["firstPage"],
    "lastPage" => si["lastPage"]
  }.compact
end

Module: Bolognese::Readers::DataciteReader

Instance Method Summary collapse

Instance Method Details

#get_datacite(id: nil, **options) ⇒ Object

#get_titles(meta) ⇒ Object

#read_datacite(string: nil, **options) ⇒ Object

#set_container(meta) ⇒ Object

#get_datacite(id: nil, **options) ⇒ `Object`

#get_titles(meta) ⇒ `Object`

#read_datacite(string: nil, **options) ⇒ `Object`

#set_container(meta) ⇒ `Object`