Module: Whitepaper::Engine::ACM

Defined in:
lib/whitepaper/engine/acm.rb

Overview

This engine uses the ACM database to query metadata about a paper.

Constant Summary collapse

DOMAIN =

The domain to use for ACM.

"https://dl.acm.org"
SEARCH_BY_TITLE_URL2 =

The url to use to search by title.

"results.cfm?within={title_query}&adv=1&DL=ACM&termzone=Title&allofem={title}"
SEARCH_BY_TITLE_URL =

The alternate URL template to use to search by title; this is the template that find_by_title_url fills in.

"results.cfm?query={title}&querydisp={title}&srt=score%20dsc&short=0&coll=DL&dl=GUIDE&source_disp=&source_query=&since_month=&since_year=&before_month=&before_year=&termshow=matchall&range_query="

Class Method Summary collapse

Class Method Details

.find_by_title(title) ⇒ Object

Returns a Whitepaper::Paper by searching for the paper with the given title keywords.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/whitepaper/engine/acm.rb', line 25

# Searches ACM for the given title keywords and returns the details of the
# first search result.
#
# The search page is fetched from the URL built by find_by_title_url, the
# first <a class="medium-text"> link on it is taken as the best match, and
# that link (made absolute with DOMAIN) is handed to retrieve_details.
#
# @param title [String] the title keywords to search for
# @return [Paper, nil] whatever retrieve_details returns for the first
#   result, or nil when the search page contains no result link
def find_by_title(title)
  @agent = Mechanize.new

  # In case cookies are ever necessary to establish:
  #page = @agent.get("#{DOMAIN}")
  #search_url = page.search('//form[@name="qiksearch"]').first.attribute("action").to_s

  page = @agent.get(find_by_title_url(title))

  # Only the first link is considered.
  first_result = page.search('//a[@class="medium-text"]').first

  # A search without hits has no such link; report "not found" as nil
  # instead of failing with NoMethodError on nil.
  return nil if first_result.nil?

  retrieve_details "#{DOMAIN}/#{first_result.attribute("href")}"
end

.find_by_title_url(title) ⇒ Object

Returns a url that will query for the given title keywords.



18
19
20
21
22
# File 'lib/whitepaper/engine/acm.rb', line 18

# Builds the absolute ACM search URL for the given title keywords by filling
# the placeholders of SEARCH_BY_TITLE_URL and prefixing DOMAIN.
#
# Two placeholders are recognised in the template:
# * {title}       - the title with every whitespace character turned into "+"
# * {title_query} - an expression of the form
#                   (Title:"word1"+or+Title:"word2"...) built from the
#                   whitespace-separated words of the title
#
# The title text itself is percent-encoded, so characters such as "&", "#"
# or "%" in a title can no longer break the query string.
#
# @param title [String] the title keywords to search for
# @return [String] the URL to request
def find_by_title_url(title)
  # Required locally because this method is the only user of URI here;
  # the call is a no-op when the library is already loaded.
  require "uri"

  # Normalise every whitespace character to a plain space first so that each
  # one is encoded as a single "+", exactly as the unencoded version did.
  escaped_title = URI.encode_www_form_component(title.gsub(/\s/, " "))

  # Only the words are encoded; the surrounding query syntax is kept verbatim.
  escaped_words = title.split(" ").map { |word| URI.encode_www_form_component(word) }
  title_query = "(Title:\"" + escaped_words.join("\"+or+Title:\"") + "\")"

  # The hash form of gsub substitutes both placeholders in a single pass and
  # inserts the values literally (a string replacement would interpret
  # backslash sequences such as \0 coming from the title).
  substitutions = {
    "{title}"       => escaped_title,
    "{title_query}" => title_query
  }
  path = SEARCH_BY_TITLE_URL.gsub(/\{title(?:_query)?\}/, substitutions)

  "#{DOMAIN}/#{path}"
end

.retrieve_details(url) ⇒ Object

Returns a Whitepaper::Paper by reading the direct page for a particular paper.



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/whitepaper/engine/acm.rb', line 43

# Builds a Paper from the ACM page of a single paper.
#
# The page at +url+ is fetched and its <meta> tags are read:
# citation_title, citation_authors, citation_date (only its last four
# characters are kept as the year) and citation_conference. A missing tag
# yields an empty string.
#
# citation_authors is split on ";". Entries written as "Last, First" are
# turned into "First Last"; entries without a comma are kept as they are and
# empty entries are skipped.
#
# If the page source contains a link to tab_abstract.cfm, that page is
# fetched as well and its text becomes the description; otherwise the
# description is an empty string.
#
# @param url [String] the address of the paper's page
# @return [Paper] the paper described by the page; :keywords, :pdf_urls and
#   :ps_urls are always passed as empty arrays
def retrieve_details(url)
  @agent = Mechanize.new

  page = @agent.get url

  # Content of the first <meta name="..."> tag, or "" when there is none.
  get_meta = lambda { |name|
    nodes = page.search "//meta[@name=\"#{name}\"]"
    node = nodes && nodes.first
    node.nil? ? "" : node.attribute("content").to_s
  }

  title = get_meta.call("citation_title")
  authors_raw = get_meta.call("citation_authors")
  year = get_meta.call("citation_date")
  year = year[-4..-1] unless year.empty?
  conference = get_meta.call("citation_conference")

  authors = authors_raw.split(';').map(&:strip).reject(&:empty?).map do |s|
    index = s.index(',')
    # index is nil for a name without a comma; such names are used verbatim.
    if index && index > 0
      # Strip instead of assuming exactly one space after the comma, so that
      # "Last,First" does not lose the first letter of the first name.
      "#{s[index + 1..-1].strip} #{s[0..index - 1].strip}"
    else
      s
    end
  end

  links = []
  ps_links = []

  # get abstract; not every page carries the abstract tab link
  abstract = ""
  abstract_link = page.content.match(/tab_abstract\.cfm\?.*cftoken\=\d+/)
  if abstract_link
    abstract = @agent.get(abstract_link[0]).root.text.to_s.strip
  end

  Paper.new title, authors, {:description  => abstract,
                             :keywords     => [],
                             :metadata_url => url,
                             :year         => year,
                             :conference   => conference,
                             :pdf_urls     => links,
                             :ps_urls      => ps_links}
end