Module: Bolognese::Utils
Constant Summary collapse
# Display names for licenses, keyed by the http:// form of the Creative Commons URL (with trailing slash).
- LICENSE_NAMES =
{ "http://creativecommons.org/publicdomain/zero/1.0/" => "Public Domain (CC0 1.0)", "http://creativecommons.org/licenses/by/3.0/" => "Creative Commons Attribution 3.0 (CC-BY 3.0)", "http://creativecommons.org/licenses/by/4.0/" => "Creative Commons Attribution 4.0 (CC-BY 4.0)", "http://creativecommons.org/licenses/by-nc/4.0/" => "Creative Commons Attribution Noncommercial 4.0 (CC-BY-NC 4.0)", "http://creativecommons.org/licenses/by-sa/4.0/" => "Creative Commons Attribution Share Alike 4.0 (CC-BY-SA 4.0)", "http://creativecommons.org/licenses/by-nc-nd/4.0/" => "Creative Commons Attribution Noncommercial No Derivatives 4.0 (CC-BY-NC-ND 4.0)" }
# Type lookup keyed by resource-type names such as "Dataset"; normalize_ids indexes it with a
# record's "resourceTypeGeneral" value to fill in a missing "@type". nil entries have no mapped type.
- DC_TO_SO_TRANSLATIONS =
{ "Audiovisual" => "VideoObject", "Collection" => "Collection", "Dataset" => "Dataset", "Event" => "Event", "Image" => "ImageObject", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => "Service", "Software" => "SoftwareSourceCode", "Sound" => "AudioObject", "Text" => "ScholarlyArticle", "Workflow" => nil, "Other" => "CreativeWork" }
# Keyed by resource-type names; values are lower-case citation type strings such as "dataset"
# (presumably citeproc/CSL types — confirm). NOTE(review): there is no "Software" key here,
# unlike DC_TO_SO_TRANSLATIONS and DC_TO_RIS_TRANSLATIONS — confirm whether that is intended.
- DC_TO_CP_TRANSLATIONS =
{ "Audiovisual" => "motion_picture", "Collection" => nil, "Dataset" => "dataset", "Event" => nil, "Image" => "graphic", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Sound" => "song", "Text" => "report", "Workflow" => nil, "Other" => nil }
# Keyed by hyphenated work-type names such as "journal-article" (presumably Crossref types — confirm);
# values are citation type strings such as "article-journal". nil entries have no mapped type.
- CR_TO_CP_TRANSLATIONS =
{ "proceedings" => nil, "reference-book" => nil, "journal-issue" => nil, "proceedings-article" => "paper-conference", "other" => nil, "dissertation" => "thesis", "dataset" => "dataset", "edited-book" => "book", "journal-article" => "article-journal", "journal" => nil, "report" => "report", "book-series" => nil, "report-series" => nil, "book-track" => nil, "standard" => nil, "book-section" => "chapter", "book-part" => nil, "book" => "book", "book-chapter" => "chapter", "standard-series" => nil, "monograph" => "book", "component" => nil, "reference-entry" => "entry-dictionary", "journal-volume" => nil, "book-set" => nil }
# Reverse direction of DC_TO_SO_TRANSLATIONS: keyed by type names such as "ScholarlyArticle",
# giving resource-type names such as "Text". Every key has a non-nil value.
- SO_TO_DC_TRANSLATIONS =
{ "Article" => "Text", "AudioObject" => "Sound", "Blog" => "Text", "BlogPosting" => "Text", "Collection" => "Collection", "CreativeWork" => "Other", "DataCatalog" => "Dataset", "Dataset" => "Dataset", "Event" => "Event", "ImageObject" => "Image", "Movie" => "Audiovisual", "PublicationIssue" => "Text", "ScholarlyArticle" => "Text", "Service" => "Service", "SoftwareSourceCode" => "Software", "VideoObject" => "Audiovisual", "WebPage" => "Text", "WebSite" => "Text" }
# Keyed by the same type names as SO_TO_DC_TRANSLATIONS; values are citation type strings
# (presumably citeproc/CSL types — confirm).
# NOTE(review): "Article" maps to an empty string rather than nil or a type name, and there is
# no "SoftwareSourceCode" key — confirm whether both are intended.
- SO_TO_CP_TRANSLATIONS =
{ "Article" => "", "AudioObject" => "song", "Blog" => "report", "BlogPosting" => "post-weblog", "Collection" => nil, "CreativeWork" => nil, "DataCatalog" => "dataset", "Dataset" => "dataset", "Event" => nil, "ImageObject" => "graphic", "Movie" => "motion_picture", "PublicationIssue" => nil, "ScholarlyArticle" => "article-journal", "Service" => nil, "VideoObject" => "broadcast", "WebPage" => "webpage", "WebSite" => "webpage" }
# Keyed by the same type names as SO_TO_DC_TRANSLATIONS; values are upper-case tags such as
# "JOUR" and "DATA" (presumably RIS reference types — confirm). nil entries have no mapped tag.
- SO_TO_RIS_TRANSLATIONS =
{ "Article" => nil, "AudioObject" => nil, "Blog" => nil, "BlogPosting" => "BLOG", "Collection" => nil, "CreativeWork" => "GEN", "DataCatalog" => "CTLG", "Dataset" => "DATA", "Event" => nil, "ImageObject" => "FIGURE", "Movie" => "MPCT", "PublicationIssue" => nil, "ScholarlyArticle" => "JOUR", "Service" => nil, "SoftwareSourceCode" => "COMP", "VideoObject" => "VIDEO", "WebPage" => "ELEC", "WebSite" => nil }
# Keyed by the same hyphenated work-type names as CR_TO_CP_TRANSLATIONS; values are upper-case
# tags such as "JOUR" and "CHAP" (presumably RIS reference types — confirm).
- CR_TO_RIS_TRANSLATIONS =
{ "proceedings" => "CONF", "reference-book" => "BOOK", "journal-issue" => nil, "proceedings-article" => "CPAPER", "other" => "GEN", "dissertation" => "THES", "dataset" => "DATA", "edited-book" => "BOOK", "journal-article" => "JOUR", "journal" => nil, "report" => nil, "book-series" => nil, "report-series" => nil, "book-track" => nil, "standard" => nil, "book-section" => "CHAP", "book-part" => "CHAP", "book" => "BOOK", "book-chapter" => "CHAP", "standard-series" => nil, "monograph" => "BOOK", "component" => nil, "reference-entry" => "DICT", "journal-volume" => nil, "book-set" => nil }
# Keyed by the same resource-type names as DC_TO_SO_TRANSLATIONS; values are upper-case tags
# such as "DATA" and "RPRT" (presumably RIS reference types — confirm).
- DC_TO_RIS_TRANSLATIONS =
{ "Audiovisual" => "MPCT", "Collection" => nil, "Dataset" => "DATA", "Event" => nil, "Image" => "FIGURE", "InteractiveResource" => nil, "Model" => nil, "PhysicalObject" => nil, "Service" => nil, "Software" => "COMP", "Sound" => "SOUND", "Text" => "RPRT", "Workflow" => nil, "Other" => nil }
# Keyed by the same type names as SO_TO_DC_TRANSLATIONS; every value is either "article" or
# "misc" (presumably BibTeX entry types — confirm).
- SO_TO_BIB_TRANSLATIONS =
{ "Article" => "article", "AudioObject" => "misc", "Blog" => "misc", "BlogPosting" => "article", "Collection" => "misc", "CreativeWork" => "misc", "DataCatalog" => "misc", "Dataset" => "misc", "Event" => "misc", "ImageObject" => "misc", "Movie" => "misc", "PublicationIssue" => "misc", "ScholarlyArticle" => "article", "Service" => "misc", "SoftwareSourceCode" => "misc", "VideoObject" => "misc", "WebPage" => "misc", "WebSite" => "misc" }
Instance Method Summary collapse
- #find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
- #find_from_format_by_ext(string, options = {}) ⇒ Object
- #find_from_format_by_id(id) ⇒ Object
- #find_from_format_by_string(string) ⇒ Object
- #from_citeproc(element) ⇒ Object
- #from_schema_org(element) ⇒ Object
- #get_date_from_date_parts(date_as_parts) ⇒ Object
- #get_date_from_parts(year, month = nil, day = nil) ⇒ Object
- #get_date_parts(iso8601_time) ⇒ Object
- #github_as_codemeta_url(url) ⇒ Object
- #github_as_owner_url(url) ⇒ Object
- #github_as_release_url(url) ⇒ Object
- #github_as_repo_url(url) ⇒ Object
- #github_from_url(url) ⇒ Object
- #github_owner_from_url(url) ⇒ Object
- #github_release_from_url(url) ⇒ Object
- #github_repo_from_url(url) ⇒ Object
- #jsonlint(json) ⇒ Object
- #map_hash_keys(element: nil, mapping: nil) ⇒ Object
- #normalize_id(id) ⇒ Object
- #normalize_ids(ids: nil) ⇒ Object
-
#normalize_licenses(licenses) ⇒ Object
Finds Creative Commons or OSI licenses in the licenses array and normalizes their URL and name.
- #normalize_orcid(orcid) ⇒ Object
- #normalize_url(id) ⇒ Object
- #orcid_as_url(orcid) ⇒ Object
- #orcid_from_url(url) ⇒ Object
- #parse_attributes(element, options = {}) ⇒ Object
- #sanitize(text, options = {}) ⇒ Object
- #to_citeproc(element) ⇒ Object
- #to_ris(element) ⇒ Object
- #to_schema_org(element) ⇒ Object
- #validate_orcid(orcid) ⇒ Object
- #validate_url(str) ⇒ Object
Instance Method Details
#find_from_format(id: nil, string: nil, ext: nil) ⇒ Object
201 202 203 204 205 206 207 208 209 |
# File 'lib/bolognese/utils.rb', line 201
# Picks the detection strategy from whichever argument is supplied.
# Precedence is id, then ext, then string; returns nil when none is present.
#
# @param id [String, nil] identifier handed to find_from_format_by_id
# @param string [String, nil] raw content to inspect
# @param ext [String, nil] file extension handed to find_from_format_by_ext
# @return [String, nil] whatever the selected finder returns
def find_from_format(id: nil, string: nil, ext: nil)
  return find_from_format_by_id(id) if id.present?
  return find_from_format_by_ext(string, ext: ext) if ext.present?

  find_from_format_by_string(string) if string.present?
end
#find_from_format_by_ext(string, options = {}) ⇒ Object
226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
# File 'lib/bolognese/utils.rb', line 226
# Detects the metadata format of +string+ using the file extension in options[:ext].
# ".bib" and ".ris" are decided by extension alone. ".xml" and ".json" payloads are
# parsed once with Maremma and then inspected for format-specific keys.
#
# @param string [String] file content
# @param options [Hash] :ext is the file extension including the leading dot
# @return [String, nil] format name, or nil when the extension/content is not recognized
def find_from_format_by_ext(string, options = {})
  case options[:ext]
  when ".bib"
    "bibtex"
  when ".ris"
    "ris"
  when ".xml"
    xml = Maremma.from_xml(string).to_h
    if xml.dig("doi_records", "doi_record", "crossref")
      "crossref"
    elsif xml.dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")
      "datacite"
    end
  when ".json"
    json = Maremma.from_json(string).to_h
    # order matters: the first matching marker wins
    if json.dig("ris_type")
      "crosscite"
    elsif json.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
      "datacite_json"
    elsif json.dig("issued", "date-parts").present?
      "citeproc"
    elsif json.dig("@context").to_s.start_with?("http://schema.org")
      "schema_org"
    elsif json.dig("@context") == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
      "codemeta"
    end
  end
end
#find_from_format_by_id(id) ⇒ Object
211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
# File 'lib/bolognese/utils.rb', line 211
# Detects the metadata format from an identifier, after passing it through normalize_id.
# DOIs are resolved to their registration agency via get_doi_ra; only DataCite and
# Crossref are accepted (returned lower-cased). ORCID URLs/IDs give "orcid", GitHub URLs
# give "codemeta", and anything else falls back to "schema_org".
#
# @param id [String] DOI, ORCID, GitHub URL or other URL
# @return [String, nil] format name; nil for a DOI of another registration agency
def find_from_format_by_id(id)
  id = normalize_id(id)

  # the dot in "doi.org" is escaped so that only the literal host matches
  if %r{\A(?:(http|https):/(/)?(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}.match?(id)
    ra = get_doi_ra(id)
    %w(DataCite Crossref).include?(ra) ? ra.downcase : nil
  elsif %r{\A(?:(http|https):/(/)?orcid\.org/)?(\d{4}-\d{4}-\d{4}-\d{3}[0-9X]+)\z}.match?(id)
    "orcid"
  elsif %r{\A(http|https):/(/)?github\.com/(.+)\z}.match?(id)
    "codemeta"
  else
    "schema_org"
  end
end
#find_from_format_by_string(string) ⇒ Object
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
# File 'lib/bolognese/utils.rb', line 248
# Detects the metadata format by inspecting the content itself.
# The string is parsed at most once as XML and once as JSON (the JSON parse is skipped
# when an XML format already matched); the checks run in the order listed below.
#
# @param string [String] raw XML or JSON content
# @return [String, nil] format name, or nil when no known marker is found
def find_from_format_by_string(string)
  xml = Maremma.from_xml(string).to_h
  return "crossref" if xml.dig("doi_records", "doi_record", "crossref").present?
  return "datacite" if xml.dig("resource", "xmlns").to_s.start_with?("http://datacite.org/schema/kernel")

  json = Maremma.from_json(string).to_h
  return "crosscite" if json.dig("ris_type").present?
  return "datacite_json" if json.dig("schemaVersion").to_s.start_with?("http://datacite.org/schema/kernel")
  return "citeproc" if json.dig("issued", "date-parts").present?
  return "schema_org" if json.dig("@context").to_s.start_with?("http://schema.org")

  "codemeta" if json.dig("@context") == "https://raw.githubusercontent.com/codemeta/codemeta/master/codemeta.jsonld"
end
#from_citeproc(element) ⇒ Object
413 414 415 416 417 418 419 420 421 422 423 424 425 426 |
# File 'lib/bolognese/utils.rb', line 413
# Converts citeproc-style name hashes ("given"/"family"/"literal") into hashes with
# "@type", "name", "givenName" and "familyName". An entry with a "literal" becomes an
# "Organization"; otherwise it is a "Person" whose name is "given family".
# The input hashes are not modified.
#
# @param element [Hash, Array<Hash>, nil] one name hash or a list of them
# @return [Hash, Array<Hash>, nil] converted hash(es), unwrapped when there is a single entry
def from_citeproc(element)
  Array.wrap(element).map do |author|
    converted =
      if author["literal"].present?
        { "@type" => "Organization", "name" => author["literal"] }
      else
        { "@type" => "Person", "name" => [author["given"], author["family"]].compact.join(" ") }
      end
    converted["givenName"] = author["given"]
    converted["familyName"] = author["family"]

    # merge into a copy so the caller's hash keeps its citeproc keys
    author.merge(converted).except("given", "family", "literal").compact
  end.unwrap
end
#from_schema_org(element) ⇒ Object
393 394 395 396 397 |
# File 'lib/bolognese/utils.rb', line 393
# Renames the "@type" and "@id" keys of +element+ to "type" and "id" via map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] hash(es) to convert
# @return [Object] the result of map_hash_keys
def from_schema_org(element)
  map_hash_keys(element: element, mapping: { "@type" => "type", "@id" => "id" })
end
#get_date_from_date_parts(date_as_parts) ⇒ Object
504 505 506 507 508 |
# File 'lib/bolognese/utils.rb', line 504
# Builds a date string from a hash of the form { "date-parts" => [[year, month, day]] }
# by passing the first date-parts entry to get_date_from_parts.
# Returns nil instead of raising when the argument is not a hash or holds no date parts.
#
# @param date_as_parts [Hash, nil] hash with a "date-parts" array of arrays
# @return [String, nil] date string, or nil when no date parts are available
def get_date_from_date_parts(date_as_parts)
  return nil unless date_as_parts.is_a?(Hash)

  date_parts = Array(date_as_parts["date-parts"]).first
  return nil if date_parts.nil?

  year, month, day = Array(date_parts)
  get_date_from_parts(year, month, day)
end
#get_date_from_parts(year, month = nil, day = nil) ⇒ Object
510 511 512 |
# File 'lib/bolognese/utils.rb', line 510
# Joins year, month and day into a zero-padded "YYYY", "YYYY-MM" or "YYYY-MM-DD" string.
# The year is always rendered (padded to four digits). Month and day are appended only
# up to the first missing/zero part, so a day without a month is dropped rather than
# being rendered in the month position.
#
# @param year [Integer, String, nil]
# @param month [Integer, String, nil]
# @param day [Integer, String, nil]
# @return [String]
def get_date_from_parts(year, month = nil, day = nil)
  month_and_day = [month, day].map { |part| part.to_s.rjust(2, '0') }
                              .take_while { |part| part != "00" }
  [year.to_s.rjust(4, '0'), *month_and_day].join("-")
end
#get_date_parts(iso8601_time) ⇒ Object
495 496 497 498 499 500 501 502 |
# File 'lib/bolognese/utils.rb', line 495
# Splits a date/time string into { 'date-parts' => [[year, month, day]] } by reading
# characters 0-3, 5-6 and 8-9 as integers; parts that come out as 0 are left out.
#
# @param iso8601_time [String, nil] string starting with "YYYY-MM-DD"
# @return [Hash, nil] nil when the argument is nil
def get_date_parts(iso8601_time)
  return nil if iso8601_time.nil?

  parts = [0..3, 5..6, 8..9].map { |range| iso8601_time[range].to_i }
  { 'date-parts' => [parts.reject(&:zero?)] }
end
#github_as_codemeta_url(url) ⇒ Object
490 491 492 493 |
# File 'lib/bolognese/utils.rb', line 490
# Builds the raw.githubusercontent.com URL of the codemeta.json file on the master
# branch of the repository that +url+ points at.
# Both owner and repo are required; an owner-only URL would otherwise produce a URL
# with an empty repository segment.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] codemeta.json URL, or nil when owner or repo is missing
def github_as_codemeta_url(url)
  github_hash = github_from_url(url)
  return nil unless github_hash[:owner].present? && github_hash[:repo].present?

  "https://raw.githubusercontent.com/#{github_hash[:owner]}/#{github_hash[:repo]}/master/codemeta.json"
end
#github_as_owner_url(url) ⇒ Object
475 476 477 478 |
# File 'lib/bolognese/utils.rb', line 475
# Returns the GitHub URL of the owner found in +url+.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] nil when no owner is found
def github_as_owner_url(url)
  owner = github_from_url(url)[:owner]
  return unless owner.present?

  "https://github.com/#{owner}"
end
#github_as_release_url(url) ⇒ Object
485 486 487 488 |
# File 'lib/bolognese/utils.rb', line 485
# Returns the ".../tree/<release>" GitHub URL for the release found in +url+.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] nil when no release is found
def github_as_release_url(url)
  owner, repo, release = github_from_url(url).values_at(:owner, :repo, :release)
  return unless release.present?

  "https://github.com/#{owner}/#{repo}/tree/#{release}"
end
#github_as_repo_url(url) ⇒ Object
480 481 482 483 |
# File 'lib/bolognese/utils.rb', line 480
# Returns the GitHub URL of the repository found in +url+.
#
# @param url [String] GitHub URL, parsed with github_from_url
# @return [String, nil] nil when no repository is found
def github_as_repo_url(url)
  owner, repo = github_from_url(url).values_at(:owner, :repo)
  return unless repo.present?

  "https://github.com/#{owner}/#{repo}"
end
#github_from_url(url) ⇒ Object
454 455 456 457 458 459 460 461 |
# File 'lib/bolognese/utils.rb', line 454
# Splits an https://github.com/ URL into its owner, repo and release parts.
# The release is the path segment that follows "/tree/"; other four-segment paths
# (for example ".../issues/12") do not produce a release.
#
# @param url [String, nil] GitHub URL
# @return [Hash] hash with :owner, :repo and :release keys (absent parts are left out);
#   empty when +url+ is not an https GitHub URL
def github_from_url(url)
  return {} unless %r{\Ahttps://github\.com/.+\z}.match?(url)

  owner, repo, kind, release = URI.parse(url).path[1..-1].split('/')
  { owner: owner,
    repo: repo,
    release: (release if kind == "tree") }.compact
end
#github_owner_from_url(url) ⇒ Object
471 472 473 |
# File 'lib/bolognese/utils.rb', line 471
# Returns the :owner entry of github_from_url(url).
#
# @param url [String] GitHub URL
# @return [String, nil] nil when the URL has no owner
def github_owner_from_url(url)
  parts = github_from_url(url)
  parts.fetch(:owner) { nil }
end
#github_release_from_url(url) ⇒ Object
467 468 469 |
# File 'lib/bolognese/utils.rb', line 467
# Returns the :release entry of github_from_url(url).
#
# @param url [String] GitHub URL
# @return [String, nil] nil when the URL has no release
def github_release_from_url(url)
  parts = github_from_url(url)
  parts.fetch(:release) { nil }
end
#github_repo_from_url(url) ⇒ Object
463 464 465 |
# File 'lib/bolognese/utils.rb', line 463
# Returns the :repo entry of github_from_url(url).
#
# @param url [String] GitHub URL
# @return [String, nil] nil when the URL has no repository
def github_repo_from_url(url)
  parts = github_from_url(url)
  parts.fetch(:repo) { nil }
end
#jsonlint(json) ⇒ Object
514 515 516 517 518 519 520 521 |
# File 'lib/bolognese/utils.rb', line 514
# Runs JsonLint's check_data over +json+ and returns the errors it collected.
# check_data is invoked through send, with the error array passed in to be filled.
#
# @param json [String, nil] JSON text to check
# @return [Array] collected errors; ["No JSON provided"] when +json+ is blank
def jsonlint(json)
  return ["No JSON provided"] unless json.present?

  [].tap do |errors|
    JsonLint::Linter.new.send(:check_data, json, errors)
  end
end
#map_hash_keys(element: nil, mapping: nil) ⇒ Object
399 400 401 402 403 404 405 406 407 408 409 410 411 |
# File 'lib/bolognese/utils.rb', line 399
# Renames the keys of one hash or a list of hashes according to +mapping+; keys that are
# not in the mapping are kept. Values that are hashes are converted recursively with the
# same mapping, so both conversion directions treat nested hashes consistently.
# Arrays nested inside values are not traversed.
#
# @param element [Hash, Array<Hash>, nil] hash(es) to convert
# @param mapping [Hash] old key => new key
# @return [Hash, Array<Hash>, nil] converted hash(es), unwrapped when there is a single entry
def map_hash_keys(element: nil, mapping: nil)
  Array.wrap(element).map do |hash|
    hash.each_with_object({}) do |(key, value), mapped|
      mapped[mapping.fetch(key, key)] =
        value.is_a?(Hash) ? map_hash_keys(element: value, mapping: mapping) : value
    end
  end.unwrap
end
#normalize_id(id) ⇒ Object
304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 |
# File 'lib/bolognese/utils.rb', line 304
# Normalizes an identifier: a value that normalize_doi recognizes is returned in its
# normalized DOI form; otherwise the value must be an http(s) URL with a host and is
# cleaned with PostRank::URI.clean.
#
# @param id [String, nil] DOI or URL
# @return [String, nil] nil for blank input, non-http(s) URLs and unparsable URIs
def normalize_id(id)
  return nil unless id.present?

  doi = normalize_doi(id)
  if doi.present?
    doi
  else
    uri = Addressable::URI.parse(id)
    http_url = uri && uri.host && %w(http https).include?(uri.scheme)
    http_url ? PostRank::URI.clean(id) : nil
  end
rescue Addressable::URI::InvalidURIError
  nil
end
#normalize_ids(ids: nil) ⇒ Object
342 343 344 345 346 347 348 |
# File 'lib/bolognese/utils.rb', line 342
# Converts identifier hashes into { "id", "type", "title" } hashes.
# "id" is the normalized "@id"; "type" is "@type", else the DC_TO_SO_TRANSLATIONS entry
# for "resourceTypeGeneral", else "CreativeWork"; "title" is "title" or "name".
# Entries whose value is nil are dropped.
#
# @param ids [Hash, Array<Hash>, nil] identifier hash(es)
# @return [Hash, Array<Hash>, nil] converted hash(es), unwrapped when there is a single entry
def normalize_ids(ids: nil)
  normalized = Array.wrap(ids).map do |id|
    type = id["@type"] || Metadata::DC_TO_SO_TRANSLATIONS[id["resourceTypeGeneral"]] || "CreativeWork"
    title = id["title"] || id["name"]

    { "id" => normalize_id(id["@id"]), "type" => type, "title" => title }.compact
  end
  normalized.unwrap
end
#normalize_licenses(licenses) ⇒ Object
Finds Creative Commons or OSI licenses in the licenses array and normalizes their URL and name.
351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 |
# File 'lib/bolognese/utils.rb', line 351
# Finds Creative Commons or OSI licenses in +licenses+ and normalizes their URL and name.
#
# The previous body referenced an undefined local +uri+ and raised NameError as soon as a
# standard license was found. Each standard license is now replaced by a copy whose "url"
# is normalized (https, no subdomain, canonical path) and whose "name" is taken from
# LICENSE_NAMES when the URL is listed there. Other entries are returned untouched and
# the input is not modified.
# NOTE(review): the intended return shape is not visible here; this keeps the shape of the
# input (a hash stays a hash, an array stays an array) — confirm against callers.
#
# @param licenses [Hash, Array<Hash>, nil] license hash(es) with a "url" key
# @return [Hash, Array<Hash>, nil] the input unchanged when it holds no standard license;
#   nil when a URL cannot be parsed
def normalize_licenses(licenses)
  standard_host = /(creativecommons\.org|opensource\.org)\z/
  list = Array.wrap(licenses)
  uris = list.map { |l| URI.parse(l["url"]) }
  return licenses unless uris.any? { |uri| uri.host && uri.host[standard_host] }

  normalized = list.zip(uris).map do |license, uri|
    next license unless uri.host && uri.host[standard_host]

    url = normalize_license_uri(uri)
    # LICENSE_NAMES is keyed by the http:// form of the URL
    name = LICENSE_NAMES[url.sub(/\Ahttps:/, "http:")]
    result = license.merge("url" => url)
    result["name"] = name if name
    result
  end

  licenses.is_a?(Array) ? normalized : normalized.first
rescue URI::InvalidURIError
  nil
end

# Returns the canonical https URL string for a Creative Commons or OSI license URI.
private def normalize_license_uri(uri)
  uri = uri.dup

  # use HTTPS
  uri.scheme = "https"

  # use host name without subdomain
  uri.host = uri.host[/(creativecommons\.org|opensource\.org)\z/]

  path = uri.path
  if uri.host == "creativecommons.org"
    # drop a trailing "legalcode" segment and make sure the path ends with a slash
    segments = path.split('/')
    path = segments[0..-2].join("/") if segments.last == "legalcode"
    path += '/' unless path.end_with?('/')
  else
    path = path.gsub(/(-license|\.php|\.html)/, '')
    path = path.sub(/(mit|afl|apl|osl|gpl|ecl)/) { |match| match.upcase }
    path = path.sub(/(artistic|apache)/) { |match| match.capitalize }
    # rewrite a trailing version to "-<major>.<minor>", defaulting the minor to 0
    path = path.sub(/([^0-9\-]+)(-)?([1-9])?(\.)?([0-9])?$/) do
      m = Regexp.last_match
      m[3] ? "#{m[1]}-#{m[3]}.#{m[5] || '0'}" : m[1]
    end
  end
  uri.path = path

  uri.to_s
end
#normalize_orcid(orcid) ⇒ Object
334 335 336 337 338 339 340 |
# File 'lib/bolognese/utils.rb', line 334
# Validates +orcid+ with validate_orcid and returns it as an http://orcid.org/ URL.
#
# @param orcid [String, nil] ORCID ID or ORCID URL
# @return [String, nil] nil when the value does not validate
def normalize_orcid(orcid)
  valid_orcid = validate_orcid(orcid)

  # turn ORCID ID into URL
  valid_orcid.present? ? "http://orcid.org/" + Addressable::URI.encode(valid_orcid) : nil
end
#normalize_url(id) ⇒ Object
321 322 323 324 325 326 327 328 329 330 331 332 |
# File 'lib/bolognese/utils.rb', line 321
# Cleans an http(s) URL with PostRank::URI.clean.
#
# @param id [String, nil] URL
# @return [String, nil] nil for blank input, URLs without a host, schemes other than
#   http/https, and unparsable URIs
def normalize_url(id)
  return nil unless id.present?

  # check for valid HTTP uri
  uri = Addressable::URI.parse(id)
  http_url = uri && uri.host && %w(http https).include?(uri.scheme)

  # clean up URL
  http_url ? PostRank::URI.clean(id) : nil
rescue Addressable::URI::InvalidURIError
  nil
end
#orcid_as_url(orcid) ⇒ Object
270 271 272 |
# File 'lib/bolognese/utils.rb', line 270
# Prefixes an ORCID ID with "https://orcid.org/".
#
# @param orcid [String, nil] ORCID ID
# @return [String, nil] nil when +orcid+ is blank
def orcid_as_url(orcid)
  return unless orcid.present?

  "https://orcid.org/#{orcid}"
end
#orcid_from_url(url) ⇒ Object
266 267 268 |
# File 'lib/bolognese/utils.rb', line 266
# Extracts the part after the host from an orcid.org URL.
# The previous pattern began with a stray ":" and lacked the ":" after the scheme, so it
# never matched a real URL. An optional "www." is accepted, as in validate_orcid.
#
# @param url [String, nil] ORCID URL
# @return [String, nil] the ORCID ID, or nil when +url+ is not an orcid.org URL
def orcid_from_url(url)
  url.to_s[%r{\A(?:http|https)://(?:www\.)?orcid\.org/(.+)}, 1]
end
#parse_attributes(element, options = {}) ⇒ Object
289 290 291 292 293 294 295 296 297 298 299 300 301 302 |
# File 'lib/bolognese/utils.rb', line 289
# Extracts the text content from a parsed element.
# A String is returned as is; a Hash yields the value under the content key; an Array
# yields the unique contents of its entries (hashes are read through the content key).
#
# @param element [String, Hash, Array, nil] parsed element
# @param options [Hash] :content overrides the content key (default "__content__");
#   :first returns only the first array entry instead of the unwrapped array
# @return [Object, nil] nil for any other element type
def parse_attributes(element, options = {})
  content = options[:content] || "__content__"

  case element
  when String
    element
  when Hash
    element.fetch(content, nil)
  when Array
    values = element.map { |e| e.is_a?(Hash) ? e.fetch(content, nil) : e }.uniq
    options[:first] ? values.first : values.unwrap
  end
end
#sanitize(text, options = {}) ⇒ Object
447 448 449 450 451 452 |
# File 'lib/bolognese/utils.rb', line 447
# Scrubs +text+ with a Bolognese::WhitelistScrubber, replaces non-breaking spaces with
# regular spaces and strips surrounding whitespace.
# The caller's options hash is left unmodified; the default tag set is merged into a copy.
# NOTE(review): the scraped source shows `WhitelistScrubber.new()` with no argument;
# passing the options is reconstructed from the otherwise unused :tags default — confirm.
#
# @param text [String] HTML fragment
# @param options [Hash] :tags is the set of tag names handed to the scrubber
#   (default: strong em b i code pre sub sup br)
# @return [String] scrubbed text
def sanitize(text, options = {})
  tags = options[:tags] || Set.new(%w(strong em b i code pre sub sup br))
  custom_scrubber = Bolognese::WhitelistScrubber.new(options.merge(tags: tags))

  Loofah.scrub_fragment(text, custom_scrubber).to_s.gsub(/\u00a0/, ' ').strip
end
#to_citeproc(element) ⇒ Object
428 429 430 431 432 433 434 435 |
# File 'lib/bolognese/utils.rb', line 428
# Converts name hashes with "familyName"/"givenName"/"name" into citeproc-style hashes
# with "family", "given" and, when there is no family name, "literal".
# The type, id and original name keys are removed. The input hashes are not modified.
#
# @param element [Hash, Array<Hash>, nil] one name hash or a list of them
# @return [Array<Hash>] always an array (not unwrapped)
def to_citeproc(element)
  Array.wrap(element).map do |author|
    converted = { "family" => author["familyName"], "given" => author["givenName"] }
    converted["literal"] = author["name"] unless author["familyName"].present?

    # merge into a copy so the caller's hash is left as it was
    author.merge(converted)
          .except("type", "@type", "id", "@id", "name", "familyName", "givenName")
          .compact
  end
end
#to_ris(element) ⇒ Object
437 438 439 440 441 442 443 444 445 |
# File 'lib/bolognese/utils.rb', line 437
# Formats name hashes as "familyName, givenName" strings, falling back to "name" when
# there is no family name. A missing given name no longer leaves a trailing ", ".
#
# @param element [Hash, Array<Hash>, nil] one name hash or a list of them
# @return [String, Array<String>, nil] formatted name(s), unwrapped when there is a single entry
def to_ris(element)
  Array.wrap(element).map do |a|
    if a["familyName"].present?
      [a["familyName"], a["givenName"]].compact.join(", ")
    else
      a["name"]
    end
  end.unwrap
end
#to_schema_org(element) ⇒ Object
387 388 389 390 391 |
# File 'lib/bolognese/utils.rb', line 387
# Renames the "type", "id" and "title" keys of +element+ to "@type", "@id" and "name"
# via map_hash_keys.
#
# @param element [Hash, Array<Hash>, nil] hash(es) to convert
# @return [Object] the result of map_hash_keys
def to_schema_org(element)
  map_hash_keys(element: element,
                mapping: { "type" => "@type", "id" => "@id", "title" => "name" })
end
#validate_orcid(orcid) ⇒ Object
274 275 276 277 |
# File 'lib/bolognese/utils.rb', line 274
# Extracts an ORCID ID from a bare ID or an http(s)://(www.)orcid.org/ URL.
# The four groups may be separated by hyphens or whitespace; whitespace separators are
# replaced with hyphens in the result.
#
# @param orcid [String, nil] ORCID ID or ORCID URL
# @return [String, nil] hyphenated ORCID ID, or nil when the value does not match
def validate_orcid(orcid)
  pattern = /\A(?:(http|https):\/\/(www\.)?orcid\.org\/)?(\d{4}[[:space:]-]\d{4}[[:space:]-]\d{4}[[:space:]-]\d{3}[0-9X]+)\z/
  match = pattern.match(orcid)
  return nil if match.nil?

  # the last capture group holds the ID itself
  match[3].gsub(/[[:space:]]/, "-")
end
#validate_url(str) ⇒ Object
279 280 281 282 283 284 285 286 287 |
# File 'lib/bolognese/utils.rb', line 279
# Classifies a string as "DOI", "URL" or "ISSN".
# A DOI may be bare, prefixed with "doi:", or given as a (dx.)doi.org URL. The dot in
# "doi.org" is escaped so that look-alike hosts are classified as plain URLs.
#
# @param str [String, nil] value to classify
# @return [String, nil] "DOI", "URL", "ISSN", or nil when nothing matches
def validate_url(str)
  case str
  when %r{\A(?:(http|https)://(dx\.)?doi\.org/)?(doi:)?(10\.\d{4,5}/.+)\z}
    "DOI"
  when %r{\A(http|https)://}
    "URL"
  when /\A(ISSN|eISSN) (\d{4}-\d{3}[0-9X]+)\z/
    "ISSN"
  end
end