Class: BentoSearch::DoajArticlesEngine

Inherits:
Object
  • Object
show all
Extended by:
HTTPClientPatch::IncludeClient
Includes:
ActionView::Helpers::SanitizeHelper, SearchEngine
Defined in:
app/search_engines/bento_search/doaj_articles_engine.rb

Overview

DOAJ Articles search. doaj.org/api/v1/docs

Phrase searches with double quotes are respected.

Supports #get by unique_id feature

Constant Summary

Constants included from SearchEngine

SearchEngine::DefaultPerPage

Instance Method Summary collapse

Methods included from HTTPClientPatch::IncludeClient

include_http_client

Methods included from SearchEngine

#display_configuration, #engine_id, #fill_in_search_metadata_for, #initialize, #normalized_search_arguments, #public_settable_search_args, #search

Methods included from SearchEngine::Capabilities

#search_keys, #semantic_search_keys, #semantic_search_map, #sort_keys

Instance Method Details

#args_to_search_url(arguments) ⇒ Object



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 77

# Builds the complete request URL for a search.
#
# The query (already turned into ElasticSearch syntax by #fielded_query) is
# appended to base_url as a path component; paging and sorting are added
# as ordinary URL query parameters.
#
# arguments - Hash of search arguments. Keys read here:
#             :query        - a String, or a Hash of field => value for a
#                             multi-field search
#             :search_field - field name used when :query is not a Hash
#             :per_page     - sent as "pageSize" when set
#             :page         - sent as "page" when set
#             :sort         - key into #sort_definitions; sent as "sort" only
#                             when that definition has an :implementation
#
# Returns the URL as a String.
def args_to_search_url(arguments)
  requested = arguments[:query]

  es_query =
    if requested.kind_of?(Hash)
      # multi-field query: one fielded clause per entry, space separated
      requested.map { |field, query_value| fielded_query(query_value, field) }.join(" ")
    else
      fielded_query(requested, arguments[:search_field])
    end

  # The query is placed in a PATH component, not a query string, so a
  # space has to be "%20" rather than "+" (the DOAJ API does not accept
  # "+" there). CGI.escape followed by swapping "+" for "%20" gives the
  # escaping that works.
  url = self.base_url + CGI.escape(es_query).gsub('+', '%20')

  params = {}
  params["pageSize"] = arguments[:per_page] if arguments[:per_page]
  params["page"]     = arguments[:page]     if arguments[:page]

  # Only send a sort when the key is known AND maps to a real
  # implementation value (the default sort has none).
  sort_definition = arguments[:sort] && sort_definitions[arguments[:sort]]
  sort_value      = sort_definition && sort_definition[:implementation]
  params["sort"]  = sort_value if sort_value

  query_string = params.to_query
  query_string.present? ? url + "?" + query_string : url
end

#escape_query(q) ⇒ Object

Escapes special characters in the query. DOAJ says its API is backed by Elasticsearch; the punctuation that needs to be escaped, and how to escape it (with a backslash), is documented here: www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html

We do not escape double quotes, want to allow them for phrases.

This method does NOT return URI-escaped, it returns literal, escaped for ES.



228
229
230
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 228

# Backslash-escapes every character in q that the pattern below lists as
# special: + - = & | > < ! ( ) { } [ ] ^ ~ * ? : \ /
#
# Double quotes are deliberately absent from the pattern so that phrases
# pass through untouched. The result is NOT URI-escaped.
#
# q - the query String.
#
# Returns a new String.
def escape_query(q)
  special_chars = /([\+\-\=\&\|\>\<\!\(\)\{\}\[\]\^\~\*\?\:\\\/])/
  q.gsub(special_chars) { |char| "\\#{char}" }
end

#fielded_query(query, field = nil) ⇒ Object

Prepares a DOAJ API (elastic search) query component for given textual query in a given field (or default non-fielded search)

Separates the query string into tokens (bare words and phrases) so that each can be made mandatory for ElasticSearch. By default the DOAJ API makes them all optional, with a very low mm (minimum-should-match), which leads to low-precision, odd-looking results for standard use cases.

Escapes all remaining special characters as literals (except double quotes, which are left alone so they can be used for phrases).

Eg:

fielded_query('apple orange "strawberry banana"', field_name)
# => '+field_name:(+apple +orange +"strawberry banana")'

The “+” prefixed before field-name is to make sure all separate fields are also mandatory when doing multi-field searches. It should make no difference for a single-field search.



137
138
139
140
141
142
143
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 137

# Builds one query clause for the given text.
#
# query - the user's query text, passed to #prepare_mandatory_terms.
# field - optional field name. When present the prepared terms are wrapped
#         as "+field:(terms)"; the leading "+" makes the whole clause
#         mandatory.
#
# Returns the clause as a String; with no field, just the prepared terms.
def fielded_query(query, field = nil)
  terms = prepare_mandatory_terms(query)
  return terms unless field.present?

  "+#{field}:(#{terms})"
end

#get(unique_id) ⇒ Object



66
67
68
69
70
71
72
73
74
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 66

# Fetches a single item by running a search on the "id" field.
#
# unique_id - the id to look up.
#
# Returns the one matching result.
# Raises the stored exception when the search failed with one, otherwise a
#   StandardError built from the error message or status.
# Raises BentoSearch::NotFound when the search returns nothing.
# Raises BentoSearch::TooManyFound when it returns more than one result.
def get(unique_id)
  found = search(unique_id, :search_field => "id")

  if found.failed?
    failure = found.error
    raise(failure[:exception] || StandardError.new(failure[:message] || failure[:status]))
  end

  case found.length
  when 0
    raise BentoSearch::NotFound.new("For id: #{unique_id}")
  when 1
    found.first
  else
    raise BentoSearch::TooManyFound.new("For id: #{unique_id}")
  end
end

#hash_to_item(hash) ⇒ Object

Converts from item found in DOAJ results to BentoSearch::ResultItem



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 162

# Converts one item from a DOAJ results list into a ResultItem.
#
# hash - Hash for a single result. Keys read: "id" and "bibjson"; inside
#        "bibjson": "title", "start_page", "end_page", "year", "month",
#        "abstract", "journal", "identifier", "author" and "link". Any of
#        them may be missing.
#
# Returns the populated ResultItem.
def hash_to_item(hash)
  item = ResultItem.new

  bibjson = hash["bibjson"] || {}

  item.unique_id  = hash["id"]

  # Hard-code to Article, we don't get any format information
  item.format     = "Article"

  item.title      = bibjson["title"]

  item.start_page = bibjson["start_page"]
  item.end_page   = bibjson["end_page"]

  item.year       = bibjson["year"]

  # Only build a publication date from a usable year/month pair. A missing
  # or non-numeric value becomes 0 via to_i and is skipped. A month outside
  # 1..12 is skipped as well: Date.new would raise on e.g. 13, and one bad
  # record must not abort conversion of the whole result set.
  year  = bibjson["year"].to_i
  month = bibjson["month"].to_i
  if year != 0 && (1..12).cover?(month)
    item.publication_date = Date.new(year, month)
  end

  item.abstract   = sanitize(bibjson["abstract"]) if bibjson.has_key?("abstract")

  journal           = bibjson["journal"] || {}
  item.volume       = journal["volume"]
  item.issue        = journal["number"]
  item.source_title = journal["title"]
  item.publisher    = journal["publisher"]
  item.language_str = journal["language"].try(:first)

  # Only "doi" and "pissn" identifiers are copied; other types are ignored.
  (bibjson["identifier"] || []).each do |id_hash|
    case id_hash["type"]
    when "doi"
      item.doi = id_hash["id"]
    when "pissn"
      item.issn = id_hash["id"]
    end
  end

  # Author entries without a "name" are skipped.
  (bibjson["author"] || []).each do |author_hash|
    if author_hash.has_key?("name")
      item.authors << Author.new(:display => author_hash["name"])
    end
  end

  # I _think_ DOAJ articles results always only have one link,
  # and it may always be of type 'fulltext'
  link_hash = (bibjson["link"] || []).first
  if link_hash && link_hash["url"]
    item.link             = link_hash["url"]
    item.link_is_fulltext = true if link_hash["type"] == "fulltext"
  end

  return item
end

#max_per_page ⇒ Object

BentoSearch::SearchEngine API



237
238
239
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 237

# Returns 100, the fixed maximum per-page value this engine reports.
def max_per_page
  100
end

#multi_field_search? ⇒ Boolean

Returns:

  • (Boolean)


260
261
262
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 260

# Always returns true.
# NOTE(review): presumably this advertises that :query may be a Hash of
# field => value, which args_to_search_url handles — confirm against
# SearchEngine.
def multi_field_search?
  true
end

#prepare_mandatory_terms(query) ⇒ Object

Takes a query string, prepares an ElasticSearch query doing what we want:

* tokenizes into bare words and double-quoted phrases
* Escapes other punctuation to be literal not ElasticSearch operator.
  (Does NOT do URI escaping)
* Makes each token mandatory with an ElasticSearch "+" operator prefixed.


151
152
153
154
155
156
157
158
159
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 151

# Turns free text into a list of mandatory ElasticSearch terms.
#
# query - the query String.
#
# Returns a String in which every bare word and every double-quoted phrase
# is escaped with #escape_query and prefixed with "+", joined by single
# spaces. No URI escaping is done here.
def prepare_mandatory_terms(query)
  # Splitting on "whitespace OR a captured quoted phrase" yields the bare
  # words as the split fields and the phrases as the captured delimiters,
  # so each phrase survives as a single unit. The split also produces some
  # empty strings, which are discarded.
  tokens = query.split(%r{[[:space:]]+|("[^"]+")}).reject { |piece| piece.blank? }

  tokens.map { |token| "+" + escape_query(token) }.join(" ")
end

#search_field_definitions ⇒ Object



241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 241

# Lists the search fields this engine accepts.
#
# Returns a Hash keyed by DOAJ field name (nil is the default, non-fielded
# search). Each value is a Hash that carries a :semantic key where one is
# defined; "license" and "id" have none.
def search_field_definitions
  semantic_by_field = {
    nil                      => :general,
    "bibjson.title"          => :title,
    # NOTE(review): an earlier comment here said an 'exact' variant gave
    # much better author results, but the plain field is what is
    # configured — confirm which is intended.
    "bibjson.author.name"    => :author,
    "publisher"              => :publisher,
    "bibjson.subject.term"   => :subject,
    "bibjson.journal.title"  => :source_title,
    "issn"                   => :issn,
    "doi"                    => :doi,
    "bibjson.journal.volume" => :volume,
    "bibjson.journal.number" => :issue,
    "bibjson.start_page"     => :start_page
  }

  definitions = {}
  semantic_by_field.each do |field, semantic|
    definitions[field] = {:semantic => semantic}
  end

  # Searchable, but with no semantic mapping.
  definitions["license"] = {}
  definitions["id"]      = {}

  definitions
end

#search_implementation(arguments) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 30

# Runs one search against the DOAJ API.
#
# arguments - Hash of search arguments, passed to #args_to_search_url.
#
# Returns a Results object. On success it has total_items set and one item
# per entry in the response's "results" list. On failure (timeout, HTTP
# client error, unparseable body, non-successful HTTP status, or an "error"
# key in the response) it is returned early with results.error filled in
# with whichever of :exception, :status and :message are available.
def search_implementation(arguments)
  query_url = args_to_search_url(arguments)

  results = Results.new

  begin
    Rails.logger.debug("DoajEngine: requesting #{query_url}")
    response = http_client.get( query_url )
    json = JSON.parse(response.body)
  rescue BentoSearch::RubyTimeoutClass, HTTPClient::TimeoutError,
         HTTPClient::ConfigurationError, HTTPClient::BadResponseError,
         JSON::ParserError  => e
    results.error ||= {}
    results.error[:exception] = e
  end

  # json is nil when the request or the parse raised above, so it must be
  # nil-checked before it is indexed; otherwise the error path itself
  # raises NoMethodError and the caller never sees the failed Results.
  api_error = json && json["error"]

  if response.nil? || json.nil? ||
      (! HTTP::Status.successful?(response.status)) ||
      api_error

    results.error ||= {}
    results.error[:status] = response.status if response
    results.error[:message] = api_error if api_error

    return results
  end

  results.total_items = json["total"]

  (json["results"] || []).each do |item_response|
    results << hash_to_item(item_response)
  end

  return results
end

#sort_definitions ⇒ Object



264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'app/search_engines/bento_search/doaj_articles_engine.rb', line 264

# Lists the sort options this engine offers.
#
# Returns a Hash keyed by sort name; each value is a Hash whose
# :implementation is the value sent as the "sort" URL parameter (nil for
# the default relevance ordering, in which case no parameter is sent).
def sort_definitions
  # Don't believe DOAJ supports sorting by author
  implementations = {
    "relevance"        => nil, # default
    "title"            => "title:asc",
    # There is no true publication-date sort, so DOAJ's created_date
    # stands in for it.
    "date_desc"        => "article.created_date:desc",
    "date_asc"         => "article.created_date:asc",
    # custom one not previously standardized
    "publication_name" => "bibjson.journal.title:asc"
  }

  implementations.each_with_object({}) do |(sort_key, implementation), definitions|
    definitions[sort_key] = {:implementation => implementation}
  end
end