Class: PubMed::Article

Inherits:
Object
  • Object
show all
Defined in:
lib/rbbt/sources/pubmed.rb

Overview

Processes the XML of a single article as served by MEDLINE and extracts the abstract, title, and journal information.

Constant Summary collapse

# Pairs of [attribute name, search path]. parse_xml runs each path with
# `search` on the document's Article element and stores the contents of the
# matching nodes, joined by blank lines, under the attribute name.
XML_KEYS =
[
  [:title    , "ArticleTitle"],
  [:journal  , "Journal/Title"],
  [:issue    , "Journal/JournalIssue/Issue"],
  [:volume   , "Journal/JournalIssue/Volume"],
  [:issn     , "Journal/ISSN"],
  [:year     , "Journal/JournalIssue/PubDate/Year"],
  [:month    , "Journal/JournalIssue/PubDate/Month"],
  [:pages    , "Pagination/MedlinePgn"],
  [:author    , "AuthorList/Author"],
  [:abstract , "Abstract/AbstractText"],
]
# URL template for an article's PDF; parse_xml replaces the literal "PMCID"
# with the content of the article's ArticleId whose IdType is "pmc".
PMC_PDF_URL =
"http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(xml) ⇒ Article

Returns a new instance of Article.



119
120
121
122
123
124
125
126
# File 'lib/rbbt/sources/pubmed.rb', line 119

# Builds an article from its XML representation.
#
# When +xml+ is nil/false or empty nothing is parsed and the attributes are
# left unset. Otherwise every key/value pair returned by
# PubMed::Article.parse_xml is assigned through the matching "<key>=" writer.
#
# @param xml [String, nil] XML of one article
def initialize(xml)
  return if !xml || xml.empty?

  PubMed::Article.parse_xml(xml).each_pair do |attribute, parsed|
    send("#{ attribute }=", parsed)
  end
end

Instance Attribute Details

#abstract ⇒ Object

Returns the value of attribute abstract.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @abstract. parse_xml fills :abstract with the contents of the
# "Abstract/AbstractText" nodes joined by blank lines, and initialize assigns
# each parsed key through its writer.
def abstract
  @abstract
end

#author ⇒ Object

Returns the value of attribute author.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @author. parse_xml builds :author as one "LastName, ForeName"
# entry per Author element, joined with " and ".
def author
  @author
end

#bibentry ⇒ Object

Returns the value of attribute bibentry.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @bibentry. parse_xml stores the downcased result of
# make_bibentry under :bibentry when a key could be built; bibtex uses it as
# the entry's citation key.
def bibentry
  @bibentry
end

#gscholar_pdf ⇒ Object

Returns the value of attribute gscholar_pdf.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @gscholar_pdf.
# NOTE(review): parse_xml does not produce a :gscholar_pdf key, so this is
# presumably assigned elsewhere -- confirm against the rest of the file.
def gscholar_pdf
  @gscholar_pdf
end

#journal ⇒ Object

Returns the value of attribute journal.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @journal. parse_xml fills :journal from the "Journal/Title"
# nodes of the Article element.
def journal
  @journal
end

#mesh ⇒ Object

Returns the value of attribute mesh.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @mesh. parse_xml fills :mesh with a flat list holding, for each
# MeshHeading, the DescriptorName 'UI' attribute followed by that value
# concatenated with each QualifierName 'UI' attribute.
def mesh
  @mesh
end

#pdf_url ⇒ Object

Returns the value of attribute pdf_url.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @pdf_url; bibtex and full_text read it to locate the article PDF.
# NOTE(review): parse_xml does not produce a :pdf_url key, so this is
# presumably assigned elsewhere -- confirm against the rest of the file.
def pdf_url
  @pdf_url
end

#pmc_pdf ⇒ Object

Returns the value of attribute pmc_pdf.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @pmc_pdf. parse_xml sets :pmc_pdf to PMC_PDF_URL with "PMCID"
# replaced by the article's "pmc" ArticleId, or to nil when there is none.
def pmc_pdf
  @pmc_pdf
end

#pmid ⇒ Object

Returns the value of attribute pmid.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @pmid. parse_xml fills :pmid with the content of the first PMID
# element found under MedlineCitation.
def pmid
  @pmid
end

#substance ⇒ Object

Returns the value of attribute substance.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @substance. parse_xml fills :substance with the 'UI' attribute
# of every NameOfSubstance element in the document.
def substance
  @substance
end

#title ⇒ Object

Returns the value of attribute title.



116
117
118
# File 'lib/rbbt/sources/pubmed.rb', line 116

# Reader for @title. parse_xml fills :title from the "ArticleTitle" nodes of
# the Article element.
def title
  @title
end

Class Method Details

.escape_title(title) ⇒ Object



41
42
43
# File 'lib/rbbt/sources/pubmed.rb', line 41

# Wraps every word containing two or more consecutive capital letters in
# braces, e.g. "DNA repair" becomes "{DNA} repair". bibtex applies this to
# the title field.
#
# @param title [String]
# @return [String] a new string with the matching words wrapped in braces
def self.escape_title(title)
  title.gsub(/\w*[A-Z][A-Z]+\w*/) { |acronym| "{#{acronym}}" }
end

.make_bibentry(lastname, year, title) ⇒ Object



45
46
47
48
49
50
51
52
53
# File 'lib/rbbt/sources/pubmed.rb', line 45

# Builds a citation key of the form <lastname><year><abbreviation>.
#
# The abbreviation comes from the downcased title: its first word when that
# word is longer than three characters, otherwise the initials of its first
# three words. A nil title, or one without any word characters, yields an
# empty abbreviation instead of raising NoMethodError.
#
# @param lastname [String] author last name; whitespace is replaced by "_"
# @param year [String, nil] publication year; "NOYEAR" is used when nil
# @param title [String, nil] article title
# @return [String] the citation key (case is left untouched)
def self.make_bibentry(lastname, year, title)
  words = title.to_s.downcase.scan(/\w+/)
  abrev = if words.empty?
            ""
          elsif words.first.length > 3
            words.first
          else
            words[0..2].collect{|w| w.chars.first} * ""
          end
  [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
end

.parse_xml(xml) ⇒ Object



55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/rbbt/sources/pubmed.rb', line 55

# Parses the XML of one PubMed article into a Hash of article fields.
#
# Keys of the returned Hash:
# * :pmid      - content of the first PMID element under MedlineCitation
# * XML_KEYS   - one entry per [name, path] pair that matches at least one
#                node under Article; node contents joined by blank lines
# * :author    - "LastName, ForeName" per Author element, joined with
#                " and "; single-letter forename words get a trailing "."
# * :mesh      - descriptor UIs, plus descriptor UI + qualifier UI strings
# * :substance - 'UI' attribute of every NameOfSubstance element
# * :bibentry  - downcased make_bibentry key (only when one could be built)
# * :pmc_pdf   - PMC_PDF_URL with the "pmc" ArticleId filled in, or nil
#
# Author elements missing a ForeName are rendered as just the last name (no
# trailing ", "), and Author elements with neither LastName nor ForeName
# (e.g. collective names) are left out of :author instead of contributing
# an empty ", " entry.
#
# @param xml [String] XML containing a MedlineCitation element
# @return [Hash{Symbol => Object}]
# @raise [NoMethodError] when the MedlineCitation, Article or PMID element
#   is missing from the document
def self.parse_xml(xml)
  # Loaded lazily so nokogiri is only needed when XML is actually parsed.
  require 'nokogiri'

  parser  = Nokogiri.XML(xml)
  medline = parser.search("MedlineCitation").first
  article = medline.search("Article").first

  info = {}

  info[:pmid] = medline.search("PMID").first.content

  XML_KEYS.each do |name, key|
    nodes = article.search(key)

    next if nodes.nil? || nodes.empty?

    info[name] = nodes.collect{|n| n.content } * "\n\n"
  end

  # :author from XML_KEYS is the raw node text; it is replaced below by a
  # formatted author list.
  bibentry = nil
  authors = article.search("AuthorList/Author").collect do |author|
    lastname_node = author.search("LastName").first
    forename_node = author.search("ForeName").first

    lastname = lastname_node && lastname_node.content
    forename = forename_node && forename_node.content.split(/\s/).collect{|word| word.length == 1 ? word + '.' : word } * " "

    # The citation key is taken from the first author that has a last name.
    if bibentry.nil? && lastname
      begin
        bibentry = make_bibentry(lastname, info[:year], info[:title])
      rescue StandardError
        # Best effort: a key that cannot be built leaves :bibentry unset
        # (or lets a later author provide it) rather than failing the parse.
      end
    end

    # compact avoids a dangling ", " when one of the two parts is missing.
    [lastname, forename].compact * ", "
  end
  info[:author] = authors.reject{|name| name.empty? } * " and "

  info[:mesh] = parser.search("MeshHeadingList/MeshHeading").collect do |mesh|
    descriptor = mesh.search("DescriptorName").first.attr('UI')
    qualifiers = mesh.search("QualifierName").collect{|q| q.attr('UI')}
    [descriptor] + qualifiers.collect{|q| descriptor + q }
  end.compact.flatten

  info[:substance] = parser.search("NameOfSubstance").collect do |substance|
    substance.attr('UI')
  end

  info[:bibentry] = bibentry.downcase if bibentry

  pmc_id = parser.search("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
  info[:pmc_pdf] = pmc_id && PMC_PDF_URL.sub(/PMCID/, pmc_id.content)

  info
end

Instance Method Details

#bibtex ⇒ Object



168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/rbbt/sources/pubmed.rb', line 168

# Renders the article as a BibTeX @article entry keyed by #bibentry.
#
# Emits one field per attribute named in XML_KEYS whose value is not nil,
# with :author first. The title is passed through
# PubMed::Article.escape_title and :issue is written as the "number" field.
# A "fulltext" field is added when pdf_url is set, and "pmid" always closes
# the entry.
#
# XML_KEYS itself contains :author, so the key list is de-duplicated;
# otherwise the author field would be written twice.
#
# @return [String] the BibTeX entry
def bibtex
  header = "@article{#{bibentry},\n"
  keys = ([:author] + XML_KEYS.collect{|name, _path| name } - [:bibentry]).uniq

  fields = []
  keys.each do |key|
    value = send(key)
    next if value.nil?

    case key
    when :title
      fields << "  title = { #{ PubMed::Article.escape_title value } },\n"
    when :issue
      fields << "  number = { #{ value } },\n"
    else
      fields << "  #{ key } = { #{ value } },\n"
    end
  end

  fields << "  fulltext = { #{ pdf_url } },\n" if pdf_url
  fields << "  pmid = { #{ pmid } }\n}"

  header + fields.join
end

#full_text ⇒ Object



147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/rbbt/sources/pubmed.rb', line 147

# Returns the article's full text, passed through Misc.fixutf8.
#
# When pdf_url is set, the PDF is downloaded with wget into a temporary file
# and converted with pdftotext; the converted text is used when the output
# file exists. Without a pdf_url the result of pmc_full_xml is used instead
# (fetched once).
#
# The URL and the temporary paths are shell-escaped before being
# interpolated into the commands, so URLs containing characters such as
# "&", "?" or ";" are passed to wget intact instead of being interpreted by
# the shell.
#
# NOTE(review): text may be nil here (failed download/conversion, or no PMC
# record); this relies on Misc.fixutf8 accepting nil -- confirm.
#
# @return [String, nil] presumably the repaired text, or nil when none was
#   obtained
def full_text
  # Required locally, as parse_xml does with nokogiri.
  require 'shellwords'

  url = pdf_url
  text = if url
           extracted = nil
           TmpFile.with_file do |pdf|
             # Change user-agent, oh well...
             `wget --user-agent=firefox #{ Shellwords.escape(url) } -O #{ Shellwords.escape(pdf) } -t 3`
             TmpFile.with_file do |txt|
               `pdftotext #{ Shellwords.escape(pdf) } #{ Shellwords.escape(txt) }`
               extracted = Open.read(txt) if File.exist?(txt)
             end
           end
           extracted
         else
           # pmc_full_xml already returns nil on failure, so one call covers
           # both the "available" and the "unavailable" case.
           pmc_full_xml
         end

  Misc.fixutf8(text)
end

#pmc_full_xml ⇒ Object



128
129
130
131
132
133
134
# File 'lib/rbbt/sources/pubmed.rb', line 128

# Reads the NCBI efetch record (db=pmc) for this article's pmid via
# Open.read.
#
# @return [Object, nil] whatever Open.read returns, or nil when building the
#   URL or reading it raises a StandardError
def pmc_full_xml
  efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=#{pmid}"
  Open.read(efetch_url)
rescue StandardError
  nil
end

#text ⇒ Object

Join the text from title and abstract



197
198
199
200
201
# File 'lib/rbbt/sources/pubmed.rb', line 197

# Title and abstract joined by a newline and passed through Misc.fixutf8.
# A nil title or abstract contributes an empty string to the join.
def text
  combined = [title, abstract] * "\n"
  Misc.fixutf8 combined
end