Module: Extcite

Defined in:
lib/extcite.rb,
lib/extcite/version.rb

Constant Summary collapse

VERSION =
"0.1.0"

Class Method Summary collapse

Class Method Details

.cont_neg(ids:) ⇒ Object

Get citation(s) using Crossref content negotiation

Return: an array of bib data

Examples:

require 'extcite'
Extcite.cont_neg(ids: "10.1016/j.dendro.2014.01.004")

Parameters:



212
213
214
215
# File 'lib/extcite.rb', line 212

# Look up citation data for one or more identifiers by delegating to
# Serrano.content_negotiation.
#
# @param ids the identifier(s) handed straight through to Serrano
# @return whatever Serrano.content_negotiation returns for those ids
def self.cont_neg(ids:)
  Serrano.content_negotiation(ids: ids)
end

.extract(path:, file: "out.bib", output: "bib") ⇒ Object

Extract DOIs from one or more PDFs

Return: writes BibTeX entries to a .bib file, or returns them as an array if file is nil

When writing to a file, `extract` by default appends to the end
of the file so you can build up your bibtex file with your
citations

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format
path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
f = File.new(path, "wb")
f.write(res.body)
f.close()
# extract doi from the pdf
Extcite.extract(path: path)
Extcite.extract(path: path, file: nil)

Parameters:

  • path (String)

    Path to a pdf file, or a folder of PDF files

  • file (String) (defaults to: "out.bib")

    File name to write data to - or nil to return the citations instead of writing them

  • output (String) (defaults to: "bib")

    Type of output. Only bibtex for now



38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/extcite.rb', line 38

# Extract a DOI (or arXiv id) from each PDF and fetch its citation.
#
# For every PDF the embedded metadata is consulted first: the dc:identifier
# attribute, then the prism:doi, pdf:WPS-ARTICLEDOI and pdfx:WPS-ARTICLEDOI
# nodes, in that order. If none of them yields an id, the PDF text is
# searched via Extcite.get_ids.
#
# @param path [String] path to a PDF file, or a folder of PDF files
# @param file [String, nil] file to write citations to, or nil to return them
# @param output [String] type of output; not referenced by this method yet
# @return [Array] when file is nil, the citations gathered from every PDF
#   (empty when none were found); otherwise the list of paths processed
def self.extract(path:, file: "out.bib", output: "bib")
  path = make_paths(path)
  collected = []

  path.each do |x|
    # try PDF metadata first
    ids = nil
    rr = PDF::Reader.new(x)
    pdfmeta = rr.metadata
    unless pdfmeta.nil?
      xml = Oga.parse_xml(pdfmeta)
      begin
        tt = xml.xpath('//rdf:Description')
        # try dc:identifier attribute
        ss = tt.attr('dc:identifier')[0]
        if !ss.nil?
          ids = ss.text.sub(/doi:/, '')
        else
          # fall through the known DOI nodes; the first one that occurs
          # exactly once wins
          [
            '//rdf:Description//prism:doi',
            '//rdf:Description//pdf:WPS-ARTICLEDOI',
            '//rdf:Description//pdfx:WPS-ARTICLEDOI'
          ].each do |query|
            nodes = xml.xpath(query)
            if nodes.length == 1
              ids = nodes.text
              break
            end
          end
        end
      rescue StandardError
        # metadata lookup is best effort; fall back to the text search below
        ids = nil
      end
    end

    # if not found (or blank) in the metadata, try regexing the text for a DOI
    if ids.nil? || ids.empty?
      ids = Extcite.get_ids(txt: Extcite.extract_text_one(x))
    end

    if ids.nil? || ids.empty?
      puts "no DOI found in " + x
      next
    end

    if !ids.match(/arxiv/i).nil? && ids.length < 200
      arxiv_id = ids.gsub(/arxiv:/i, '')
      conn = Faraday.new(:url => 'http://export.arxiv.org/api/query?id_list=' + arxiv_id).get
      bibs = conn.body.make_bib_arxiv(arxiv_id)
    else
      bibs = Extcite.cont_neg(ids: ids)
    end

    # only an Array result is usable; anything else is skipped
    bibstest = bibs.class == Array ? bibs[0] : nil
    next if bibstest.nil?

    if !bibstest.match(/error|not found/i).nil? || !bibstest.match(/<\/html>/i).nil?
      puts "DOI found: " + ids + " ; but citation not found via content negotation - passing"
      # do something else?
    elsif file.nil?
      # keep going so every PDF contributes, instead of returning after the first
      collected.concat(bibs)
    else
      puts "writing " + ids + " to " + file
      bibs.write_bib(file)
    end
  end

  file.nil? ? collected : path
end

.extract_dois(path:) ⇒ Object

Extract DOIs from one or more PDFs after extracting text

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format
path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
f = File.new(path, "wb")
f.write(res.body)
f.close()
# extract doi from the pdf
Extcite.extract_dois(path: path)

Parameters:

  • path (String)

    Path to a pdf file, or a folder of PDF files



133
134
135
136
# File 'lib/extcite.rb', line 133

# Pull one DOI-like token out of the text of each PDF.
#
# The text comes from Extcite.extract_text; for every document the first
# "digits.digits/..." match is taken and cut at its first whitespace.
# Documents without a match contribute an empty string.
#
# @param path [String] path to a PDF file, or a folder of PDF files
# @return [Array<String>] one entry per extracted document
def self.extract_dois(path:)
  Extcite.extract_text(path: path).map do |doc|
    hit = doc.match("[0-9]+\\.[0-9]+/.+")
    hit.to_s.gsub(/\s.+/, '')
  end
end

.extract_text(path:) ⇒ Object

Extract text from a pdf, or many pdfs

This method is used internally within fetch to parse PDFs.

Examples:

require 'extcite'
require 'faraday'
# get a paper in pdf format
path = '2068.pdf'
res = Faraday.new(:url => "https://peerj.com/articles/2068.pdf").get
f = File.new(path, "wb")
f.write(res.body)
f.close()
# extract text from the pdf
Extcite.extract_text(path: path)

Parameters:

  • path (String)

    Path to a pdf file, or a folder of PDF files



185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# File 'lib/extcite.rb', line 185

# Extract the text of one or more PDFs.
#
# When a single directory is given, the files listed by dir_files are
# narrowed to those whose extension is ".pdf" (case-insensitive). The
# extension is checked with File.extname so names that merely contain
# "pdf" somewhere are not picked up.
#
# @param path [String, Array<String>] a PDF file, several PDF files, or a
#   folder of PDF files
# @return [Array<String>] the text of each PDF, pages joined by newlines
def self.extract_text(path:)
  path = Array(path)
  if path.length == 1 && File.directory?(path[0])
    # select (not keep_if) so the array returned by dir_files is not mutated
    path = dir_files(path[0]).select { |z| File.extname(z).downcase == '.pdf' }
  end

  path.map do |x|
    PDF::Reader.new(x).pages.map(&:text).join("\n")
  end
end

.get_ids(txt:) ⇒ Object

Get DOIs from a String or Array of Strings

Return: the DOI (or arXiv id) found in the first string, as a String

Examples:

require 'extcite'
Extcite.get_ids(txt: '10.1016/j.dendro.2014.01.004 adfasdf asd fas df asdfsd')

Parameters:

  • txt (String)

    String or Array of Strings



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# File 'lib/extcite.rb', line 148

# Find an identifier in the first piece of text supplied.
#
# Only the first element of txt is examined. An arXiv id
# ("arxiv:NNNN.XXXX", case-insensitive) takes precedence; otherwise the
# first DOI-shaped token is used. Trailing punctuation is then trimmed.
#
# @param txt [String, Array<String>] text to search
# @return [String, nil] the identifier, an empty string when nothing
#   matches, or nil when txt is empty
def self.get_ids(txt:)
  candidates = Array(txt)
  return nil if candidates.empty?

  text = candidates[0]
  arxiv_hit = text.match(/arxiv:[0-9]+\.[0-9A-Za-z]+/i)
  if arxiv_hit
    # keep the arxiv id so the citation can later be fetched from their API
    id = arxiv_hit.to_s
  else
    doi_pattern = '(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\\S)+)'
    id = text.match(doi_pattern).to_s.gsub(/\s.+/, '')
  end

  # clean up: first a trailing dot (optionally followed by a closer) or comma,
  # then any closer that is still left at the end
  trimmed = id.gsub(/\.$|\.;$|\.\]$|\.\}$|\.\)$|,$/, '')
  trimmed.gsub(/;$|\]$|\}$|\)$/, '')
end