Module: Whitepaper::Engine::CiteSeerX

Defined in:
lib/whitepaper/engine/citeseerx.rb

Overview

This engine uses the CiteSeerX database to query metadata about a paper.

Constant Summary collapse

DOMAIN =

The domain to use for CiteSeerX.

"http://citeseerx.ist.psu.edu"
SEARCH_BY_TITLE_URL =

The url to use to search by title.

"search?q=title%3A{title}&t=doc&sort=cite"

Class Method Summary collapse

Class Method Details

.find_by_title(title) ⇒ Object

Returns a Whitepaper::Paper by searching for the paper with the given title keywords, or nil when the search yields no results.



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/whitepaper/engine/citeseerx.rb', line 22

# Searches CiteSeerX for the given title keywords and loads the details of
# the first search result.
#
# @param title [String] the title keywords to search for
# @return [Object, nil] whatever retrieve_details returns for the first
#   result, or nil when the result list is empty
def find_by_title(title)
  @agent = Mechanize.new
  results_page = @agent.get(find_by_title_url(title))

  # Each search hit is rendered as a heading link inside the result list.
  hits = results_page.search('//div[@id="result_list"]/div[@class="result"]/h3/a')
  return nil if hits.empty?

  # The href is joined directly onto DOMAIN, so only the first hit is followed.
  retrieve_details("#{DOMAIN}#{hits.first.attribute("href")}")
end

.find_by_title_url(title) ⇒ Object

Returns a URL that will query for the given title keywords.



17
18
19
# File 'lib/whitepaper/engine/citeseerx.rb', line 17

# Builds the CiteSeerX search URL for the given title keywords.
#
# The title is percent-encoded before it is substituted into the query
# template, so characters such as "&", "#", "+" or "%" remain part of the
# search term instead of altering the query string.
#
# @param title [String] the title keywords to search for
# @return [String] DOMAIN joined with the filled-in SEARCH_BY_TITLE_URL
def find_by_title_url(title)
  # Local require keeps this method self-contained; it is a no-op once loaded.
  require "uri"

  encoded_title = URI.encode_www_form_component(title)

  # Block form inserts the replacement literally, so sequences like "\0" or
  # "\1" can never be interpreted as back-references.
  "#{DOMAIN}/#{SEARCH_BY_TITLE_URL.gsub(/\{title\}/) { encoded_title }}"
end

.retrieve_details(url) ⇒ Object

Returns a Whitepaper::Paper by reading the direct page for a particular paper.



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/whitepaper/engine/citeseerx.rb', line 40

# Fetches the page at the given url and builds a Paper from its <meta> tags
# and its document links.
#
# @param url [String] the address of the paper's page; also stored as the
#   paper's :metadata_url
# @return [Paper] a paper built from the title, the author list and a hash of
#   :description, :keywords, :year, :conference, :metadata_url, :pdf_urls
#   and :ps_urls
def retrieve_details(url)
  @agent = Mechanize.new
  page = @agent.get(url)

  # Content of the first <meta> tag with the given name, or "" when the page
  # has no such tag.
  meta_content = lambda do |name|
    hits = page.search "//meta[@name=\"#{name}\"]"
    first_hit = hits.nil? ? nil : hits.first
    first_hit.nil? ? "" : first_hit.attribute("content").to_s
  end

  description = meta_content.call("description")
  keyword_list = meta_content.call("keywords")
  title = meta_content.call("citation_title")
  author_list = meta_content.call("citation_authors")
  year = meta_content.call("citation_year")
  conference = meta_content.call("citation_conference")

  # Both lists arrive as comma-separated strings.
  authors = author_list.split(',').map { |name| name.strip }
  keywords = keyword_list.split(',').map { |word| word.strip }

  pdf_urls = []
  ps_urls = []

  # Files a link under PDF and/or PostScript according to its suffix. The
  # suffix has no leading dot, so query-style endings are matched as well.
  file_link = lambda do |href|
    pdf_urls << href if href.end_with?("pdf")
    ps_urls << href if href.end_with?("ps")
  end

  # Links in the "clinks" list are prefixed with DOMAIN ...
  page.search('//ul[@id="clinks"]/li/a').each do |anchor|
    file_link.call("#{DOMAIN}#{anchor.attribute("href")}")
  end

  # ... while links in the "dlinks" list are used exactly as given.
  page.search('//ul[@id="dlinks"]/li/a').each do |anchor|
    file_link.call(anchor.attribute("href").to_s)
  end

  details = {
    :description  => description,
    :keywords     => keywords,
    :year         => year,
    :conference   => conference,
    :metadata_url => url,
    :pdf_urls     => pdf_urls,
    :ps_urls      => ps_urls
  }

  Paper.new(title, authors, details)
end