Module: Buzzsaw::DSL

Included in:
Document
Defined in:
lib/buzzsaw/dsl.rb

Constant Summary collapse

# Exception class rescued by the text-extraction helpers in this module.
# When a `Java` constant is defined (presumably when running on JRuby) the
# Java charset exception is used; otherwise Ruby's own
# Encoding::CompatibilityError.
ENCODING_EXCEPTION =
  if defined?(Java)
    Java::JavaNioCharset::UnsupportedCharsetException
  else
    Encoding::CompatibilityError
  end

Instance Method Summary collapse

Instance Method Details

#asciify_target_text(target) ⇒ Object



240
241
242
243
244
245
# File 'lib/buzzsaw/dsl.rb', line 240

# Rebuilds +target+ character by character and converts the result to ASCII.
#
# A character is replaced by a double quote when its String#dump form
# contains the substring "u{e2}"; every other character is copied through
# unchanged. The rebuilt string is then passed to String#to_ascii.
#
# NOTE(review): String#to_ascii is not core Ruby -- presumably provided by a
# transliteration library loaded elsewhere; confirm. The dump-based check
# depends on how the running interpreter formats \u escapes -- verify it
# still matches on the Ruby version in use.
#
# @param target [String, nil] text to convert
# @return [String, nil] the converted text, or nil when +target+ is nil/false
def asciify_target_text(target)
  return nil if !target

  rebuilt = target.each_char.each_with_object("") do |char, buffer|
    buffer << (char.dump["u{e2}"] ? '"' : char)
  end
  rebuilt.to_ascii
end

#capture_target_text(text, pattern) ⇒ Object



200
201
202
203
# File 'lib/buzzsaw/dsl.rb', line 200

# Narrows +text+ with +pattern+, or normalizes its whitespace when no
# pattern is given.
#
# @param text [String, nil] text to process
# @param pattern [Regexp, String, nil] passed to String#[] when present
# @return [String, nil] nil when +text+ is nil/false; the String#[] result
#   when +pattern+ is given (nil if it does not match); otherwise +text+
#   with every whitespace run collapsed to a single space
def capture_target_text(text, pattern)
  if !text
    nil
  elsif pattern
    text[pattern]
  else
    text.gsub(/\s+/, " ")
  end
end

#collect_by_xpath(args) ⇒ Object



19
20
21
22
23
24
25
26
27
# File 'lib/buzzsaw/dsl.rb', line 19

# Collects the text of all nodes found via args[:xpath], combined by
# #collect_target_text.
#
# Keys of +args+ are symbolized in place. When :pattern is given it is
# copied into both :capture and :match before the lookup.
#
# @param args [Hash] options; :xpath, :pattern, :match, :capture, :join and
#   :label are read here or by the helpers this method calls
# @return [Object, nil] args[:label] when a label is given and text was
#   found; otherwise the result of #asciify_target_text on the collected text
def collect_by_xpath(args)
  args.symbolize_keys!
  if args[:pattern]
    args[:capture] = args[:pattern]
    args[:match] = args[:pattern]
  end

  text = collect_target_text(args, get_nodes(args))
  if args[:label] && text.present?
    args[:label]
  else
    asciify_target_text(text)
  end
end

#collect_target_text(args, nodes) ⇒ Object



182
183
184
185
186
187
188
189
190
191
# File 'lib/buzzsaw/dsl.rb', line 182

# Filters +nodes+ in place with args[:match], joins what is left using
# args[:join], and finally applies args[:capture] to the joined text.
#
# @param args [Hash] options; :match, :join and :capture are read
# @param nodes [Array<String>] node texts; filtered in place
# @return [String, nil] the captured text, or nil when nothing remains or
#   when ENCODING_EXCEPTION is raised along the way
def collect_target_text(args, nodes)
  match_target_text!(nodes, args[:match])
  joined = join_target_text(nodes, args[:join])
  capture_target_text(joined, args[:capture])
rescue ENCODING_EXCEPTION
  # Encoding failures are treated as "no text found".
  nil
end

#filter_target_text(target, filter_list) ⇒ Object Also known as: filters



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# File 'lib/buzzsaw/dsl.rb', line 139

# Runs +target+ through an ordered list of filters and returns the result
# with surrounding whitespace stripped.
#
# Each entry of +filter_list+ is either
# * a String naming a method this object responds to; the method is called
#   with the current text and its return value becomes the new text. String
#   entries that do not name such a method are skipped; or
# * a hash-like object (Hash keys are symbolized in place) carrying one of:
#   - :accept  -- keep only text[value]
#   - :reject  -- remove the first occurrence of value
#   - :prefix  -- prepend value
#   - :postfix -- append value
#
# Remaining filters are skipped once the text is no longer present. The
# string passed in by the caller is never modified.
#
# NOTE(review): String filters are dispatched dynamically via `send`, so
# filter lists must come from trusted configuration, never from scraped or
# user-supplied data.
#
# @param target [String, nil] text to filter
# @param filter_list [Array<String, Hash>, nil] filters to apply in order
# @return [String, nil] the filtered, stripped text
def filter_target_text(target, filter_list)
  (filter_list || []).each do |filter|
    next unless target.present?

    if filter.is_a?(String)
      # Indexing a String filter with a Symbol would raise TypeError, so
      # unknown method names are ignored instead of falling through.
      target = send(filter, target) if respond_to?(filter)
      next
    end

    filter.symbolize_keys! if filter.is_a?(Hash)
    if filter[:accept]
      target = target[filter[:accept]]
    elsif filter[:reject]
      # slice! mutates its receiver; work on a copy so the caller's string
      # is left untouched.
      target = target.dup
      target.slice!(filter[:reject])
    elsif filter[:prefix]
      target = "#{filter[:prefix]}#{target}"
    elsif filter[:postfix]
      target = "#{target}#{filter[:postfix]}"
    end
  end
  target.try(:strip)
end

#find_by_meta_tag(args) ⇒ Object Also known as: label_by_meta_tag



70
71
72
73
74
75
76
77
78
79
# File 'lib/buzzsaw/dsl.rb', line 70

# Finds a meta tag via #get_nodes_for_meta_attribute and returns its
# content as extracted by #get_content_for_meta_nodes.
#
# Keys of +args+ are symbolized in place. :match is accepted as a fallback
# for :pattern. When a pattern is present the content is narrowed with
# String#[].
#
# @param args [Hash] options; :pattern, :match and :label are read here,
#   the rest is consumed by #get_nodes_for_meta_attribute
# @return [Object, nil] nil when no content was found; args[:label] when a
#   label is given and the (possibly narrowed) content is present;
#   otherwise the content itself
def find_by_meta_tag(args)
  args.symbolize_keys!
  args[:pattern] ||= args[:match] # Backwards compatibility

  content = get_content_for_meta_nodes(get_nodes_for_meta_attribute(args))
  return unless content

  content = content[args[:pattern]] if args[:pattern]
  if args[:label] && content.present?
    args[:label]
  else
    content
  end
end

#find_by_schema_tag(value) ⇒ Object



82
83
84
85
86
87
88
89
90
91
# File 'lib/buzzsaw/dsl.rb', line 82

# Returns the whitespace-normalized text of the first element whose
# itemprop attribute equals +value+.
#
# The upper-case, lower-case and capitalized forms of +value+ are looked
# up first, in that order, followed by +value+ exactly as given. The last
# lookup lets mixed-case property names such as "priceCurrency" match,
# which none of the three case variants can.
#
# NOTE(review): +value+ is interpolated into the XPath expression
# unescaped; only pass trusted property names.
#
# @param value [String] itemprop name to look for
# @return [String, nil] the element text with whitespace runs collapsed, or
#   nil when no element matches or its text is blank
def find_by_schema_tag(value)
  candidates = [value.upcase, value.downcase, value.capitalize, value].uniq
  nodes = candidates.map { |name| doc.at_xpath("//*[@itemprop=\"#{name}\"]") }.compact
  return if nodes.empty?

  content = nodes.first.text.strip.gsub(/\s+/, " ")
  content if content.present?
end

#find_by_xpath(args) ⇒ Object

Main DSL methods



9
10
11
12
13
14
15
16
17
# File 'lib/buzzsaw/dsl.rb', line 9

# Finds the text of the first node found via args[:xpath] that survives
# matching, as selected by #find_target_text.
#
# Keys of +args+ are symbolized in place. When :pattern is given it is
# copied into both :capture and :match before the lookup.
#
# @param args [Hash] options; :xpath, :pattern, :match, :capture and :label
#   are read here or by the helpers this method calls
# @return [Object, nil] args[:label] when a label is given and text was
#   found; otherwise the result of #asciify_target_text on the found text
def find_by_xpath(args)
  args.symbolize_keys!
  if args[:pattern]
    args[:capture] = args[:pattern]
    args[:match] = args[:pattern]
  end

  text = find_target_text(args, get_nodes(args))
  if args[:label] && text.present?
    args[:label]
  else
    asciify_target_text(text)
  end
end

#find_in_table(args) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/buzzsaw/dsl.rb', line 29

# Extracts text from a table located by XPath, optionally narrowed to one
# row and then to one column of that row.
#
# Options (keys of +args+ are symbolized in place):
# * :xpath   -- XPath expression locating the table
# * :row     -- an Integer is handed to #match_table_element as an index;
#               any other value is handed over as a match pattern
# * :column  -- same convention as :row, applied within the selected row
# * :capture -- optional pattern applied to the resulting text via String#[]
#
# @param args [Hash] lookup options, see above
# @return [String, nil] the row text when no column is requested, otherwise
#   the cell text, narrowed by :capture when given; nil when the table, row
#   or column cannot be found
def find_in_table(args)
  args.symbolize_keys!

  capture = args[:capture]
  row     = args[:row]
  column  = args[:column]

  # Integer is used instead of Fixnum: the Fixnum constant was deprecated in
  # Ruby 2.4 and later removed, and Integer covers it on every version.
  row_index, match_row       = row.is_a?(Integer) ? [row, nil] : [nil, row]
  column_index, match_column = column.is_a?(Integer) ? [column, nil] : [nil, column]

  table = doc.at_xpath(args[:xpath])
  return unless table

  # Rows match first
  selected = match_table_element(table, "tr", match_row, row_index)
  return unless selected

  # Now columns, but only when one was requested
  if match_column || column_index
    selected = match_table_element(selected, "td", match_column, column_index)
    return unless selected
  end

  text = selected.text
  capture ? text[capture] : text
end

#find_target_text(args, nodes) ⇒ Object



171
172
173
174
175
176
177
178
179
180
# File 'lib/buzzsaw/dsl.rb', line 171

# Filters +nodes+ in place with args[:match], takes the first remaining
# entry (stripped), and applies args[:capture] to it.
#
# @param args [Hash] options; :match and :capture are read
# @param nodes [Array<String>] node texts; filtered in place
# @return [String, nil] the captured text, or nil when nothing remains or
#   when ENCODING_EXCEPTION is raised along the way
def find_target_text(args, nodes)
  match_target_text!(nodes, args[:match])
  first_match = nodes.first
  capture_target_text(first_match.try(:strip), args[:capture])
rescue ENCODING_EXCEPTION
  # Encoding failures are treated as "no text found".
  nil
end

#get_content_for_meta_nodes(nodes) ⇒ Object



231
232
233
234
235
236
237
238
# File 'lib/buzzsaw/dsl.rb', line 231

# Returns the cleaned-up "content" attribute of the first node that has one.
#
# The attribute value is stripped and runs of spaces are squeezed to a
# single space.
#
# @param nodes [Enumerable, nil] nodes responding to #attribute
# @return [String, nil] the cleaned content, or nil when +nodes+ is
#   nil/empty, no node carries a content attribute, or the content is blank
def get_content_for_meta_nodes(nodes)
  return if !nodes || nodes.none?

  first_attr = nodes.map { |node| node.attribute("content") }.compact.first
  return if first_attr.nil?

  text = first_attr.value.strip.squeeze(" ")
  text.present? ? text : nil
end

#get_nodes(args) ⇒ Object



216
217
218
219
# File 'lib/buzzsaw/dsl.rb', line 216

# Runs the XPath query in args[:xpath] against the document and returns the
# text of each matching node, with nil texts dropped.
#
# @param args [Hash] options; only :xpath is read
# @return [Array<String>] text of every matching node
def get_nodes(args)
  doc.xpath(args[:xpath]).map { |node| node.text }.compact
end

#get_nodes_for_meta_attribute(args) ⇒ Object



221
222
223
224
225
226
227
228
229
# File 'lib/buzzsaw/dsl.rb', line 221

# Looks up head meta elements whose args[:attribute] attribute equals
# args[:value].
#
# The upper-case, lower-case and capitalized forms of the value are queried
# first, in that order, followed by the value exactly as given. The last
# query lets mixed-case names match, which none of the three case variants
# can. Each query contributes at most one node.
#
# NOTE(review): both the attribute name and the value are interpolated into
# the XPath expression unescaped; only pass trusted values.
#
# @param args [Hash] options; :attribute and :value are read
# @return [Array, nil] the matching nodes, or nil when there are none
def get_nodes_for_meta_attribute(args)
  attribute = args[:attribute]
  value = args[:value]
  variations = [value.upcase, value.downcase, value.capitalize, value].uniq
  nodes = variations.map { |variant| doc.at_xpath("//head/meta[@#{attribute}=\"#{variant}\"]") }.compact
  nodes unless nodes.empty?
end

#join_target_text(nodes, delimiter) ⇒ Object



205
206
207
208
209
# File 'lib/buzzsaw/dsl.rb', line 205

# Concatenates the entries of +nodes+ into one string, separated by
# +delimiter+.
#
# @param nodes [Array, nil] pieces to join; each is converted with #to_s
# @param delimiter [#to_s, nil] separator; nil results in no separator
# @return [Object, nil] the joined text, or nil when +nodes+ is not present.
#   A single-element list yields that element itself.
def join_target_text(nodes, delimiter)
  if nodes.present?
    separator = delimiter.to_s
    nodes.reduce do |joined, piece|
      joined.to_s + separator + piece.to_s
    end
  end
end

#label_by_meta_keywords(args) ⇒ Object



126
127
128
129
# File 'lib/buzzsaw/dsl.rb', line 126

# Returns args[:label] when the page's meta keywords match args[:pattern].
#
# Keys of +args+ are symbolized in place. #meta_keywords is evaluated once
# and reused, so the underlying document lookup is not repeated for the
# nil check and the pattern match.
#
# @param args [Hash] options; :label and :pattern are read
# @return [Object, nil] the label on a match, otherwise nil
def label_by_meta_keywords(args)
  args.symbolize_keys!
  keywords = meta_keywords
  args[:label] if keywords && keywords[args[:pattern]]
end

#label_by_url(args) ⇒ Object



93
94
95
96
# File 'lib/buzzsaw/dsl.rb', line 93

# Returns args[:label] when the string form of +url+ matches args[:pattern].
#
# Keys of +args+ are symbolized in place.
#
# @param args [Hash] options; :label and :pattern are read
# @return [Object, nil] the label on a match, otherwise nil
def label_by_url(args)
  args.symbolize_keys!
  matched = url.to_s[args[:pattern]]
  matched ? args[:label] : nil
end

#match_table_element(table, element, match, index) ⇒ Object

Private



164
165
166
167
168
169
# File 'lib/buzzsaw/dsl.rb', line 164

# Selects a descendant +element+ (e.g. "tr" or "td") of +table+ by text
# match, falling back to a positional lookup.
#
# When +match+ is given, the first descendant whose text matches it (via
# String#[]) is returned. If that yields nothing and +index+ is given, the
# result of the XPath query ".//element[index]" is returned as-is.
#
# NOTE(review): the two paths return different shapes -- a single element
# from the match path versus whatever #xpath returns from the index path.
# Callers in this file only call #text on the result.
#
# @param table [#xpath] node to search within
# @param element [String] element name to look for
# @param match [Regexp, String, nil] pattern tested against each element's text
# @param index [Integer, nil] position used in the XPath predicate
# @return [Object, nil] the selected element(s), or nil when neither
#   +match+ nor +index+ selects anything
def match_table_element(table, element, match, index)
  found = nil
  if match
    found = table.xpath(".//#{element}").detect do |candidate|
      text = candidate.text
      text && text[match]
    end
  end
  found = table.xpath(".//#{element}[#{index}]") if index && !found
  found
end

#match_target_text!(nodes, pattern) ⇒ Object



193
194
195
196
197
198
# File 'lib/buzzsaw/dsl.rb', line 193

# Filters +nodes+ in place.
#
# With a +pattern+, only entries whose pattern match (String#[]) is present
# are kept; without one, only entries that are themselves present are kept.
#
# @param nodes [Array<String>, nil] texts to filter; modified in place
# @param pattern [Regexp, String, nil] optional pattern each entry must match
# @return [Array, nil] the result of Array#select!, or nil when +nodes+ is
#   not present
def match_target_text!(nodes, pattern)
  return if !nodes.present?

  if pattern
    nodes.select! { |text| text[pattern].present? }
  else
    nodes.select!(&:present?)
  end
end

#meta_description ⇒ Object



117
# File 'lib/buzzsaw/dsl.rb', line 117

# Convenience lookup: delegates to #meta_name with value 'description'.
#
# @return [Object, nil] whatever #meta_name returns for that value
def meta_description
  meta_name(value: 'description')
end

#meta_image ⇒ Object



118
# File 'lib/buzzsaw/dsl.rb', line 118

# Convenience lookup: delegates to #meta_name with value 'image'.
#
# @return [Object, nil] whatever #meta_name returns for that value
def meta_image
  meta_name(value: 'image')
end

#meta_keywords ⇒ Object



116
# File 'lib/buzzsaw/dsl.rb', line 116

# Convenience lookup: delegates to #meta_name with value 'keywords'.
#
# @return [Object, nil] whatever #meta_name returns for that value
def meta_keywords
  meta_name(value: 'keywords')
end

#meta_name(args) ⇒ Object



107
108
109
110
111
# File 'lib/buzzsaw/dsl.rb', line 107

# Looks up a meta tag by its "name" attribute.
#
# Keys of +args+ are symbolized in place and :attribute is set to 'name'
# before the hash is handed to #find_by_meta_tag.
#
# @param args [Hash] options forwarded to #find_by_meta_tag
# @return [Object, nil] the result of #find_by_meta_tag
def meta_name(args)
  args.symbolize_keys!
  find_by_meta_tag(args.merge!(attribute: 'name'))
end

#meta_og(value) ⇒ Object



113
# File 'lib/buzzsaw/dsl.rb', line 113

# Convenience lookup for Open Graph style properties: prefixes +value+ with
# "og:" and delegates to #meta_property.
#
# @param value [#to_s] property name without the "og:" prefix
# @return [Object, nil] whatever #meta_property returns
def meta_og(value)
  og_key = "og:#{value}"
  meta_property(value: og_key)
end

#meta_og_description ⇒ Object



123
# File 'lib/buzzsaw/dsl.rb', line 123

# Convenience lookup: delegates to #meta_og with 'description'.
#
# @return [Object, nil] whatever #meta_og returns for that name
def meta_og_description
  meta_og('description')
end

#meta_og_image ⇒ Object



124
# File 'lib/buzzsaw/dsl.rb', line 124

# Convenience lookup: delegates to #meta_og with 'image'.
#
# @return [Object, nil] whatever #meta_og returns for that name
def meta_og_image
  meta_og('image')
end

#meta_og_keywords ⇒ Object



122
# File 'lib/buzzsaw/dsl.rb', line 122

# Convenience lookup: delegates to #meta_og with 'keywords'.
#
# @return [Object, nil] whatever #meta_og returns for that name
def meta_og_keywords
  meta_og('keywords')
end

#meta_og_title ⇒ Object



121
# File 'lib/buzzsaw/dsl.rb', line 121

# Convenience lookup: delegates to #meta_og with 'title'.
#
# @return [Object, nil] whatever #meta_og returns for that name
def meta_og_title
  meta_og('title')
end

#meta_price ⇒ Object



119
# File 'lib/buzzsaw/dsl.rb', line 119

# Convenience lookup: delegates to #meta_name with value 'price'.
#
# @return [Object, nil] whatever #meta_name returns for that value
def meta_price
  meta_name(value: 'price')
end

#meta_property(args) ⇒ Object

Meta tag convenience methods



101
102
103
104
105
# File 'lib/buzzsaw/dsl.rb', line 101

# Looks up a meta tag by its "property" attribute.
#
# Keys of +args+ are symbolized in place and :attribute is set to
# 'property' before the hash is handed to #find_by_meta_tag.
#
# @param args [Hash] options forwarded to #find_by_meta_tag
# @return [Object, nil] the result of #find_by_meta_tag
def meta_property(args)
  args.symbolize_keys!
  find_by_meta_tag(args.merge!(attribute: 'property'))
end

#meta_title ⇒ Object



115
# File 'lib/buzzsaw/dsl.rb', line 115

# Convenience lookup: delegates to #meta_name with value 'title'.
#
# @return [Object, nil] whatever #meta_name returns for that value
def meta_title
  meta_name(value: 'title')
end

#sanitize(text) ⇒ Object



211
212
213
214
# File 'lib/buzzsaw/dsl.rb', line 211

# Cleans +text+ with Sanitize.clean using an empty element whitelist, then
# decodes HTML entities in the result.
#
# @param text [String, nil] markup to clean
# @return [String, nil] the decoded text, or nil when Sanitize.clean
#   returns nil/false
def sanitize(text)
  stripped = Sanitize.clean(text, elements: [])
  return unless stripped

  HTMLEntities.new.decode(stripped)
end

#schema_description ⇒ Object



137
# File 'lib/buzzsaw/dsl.rb', line 137

# Convenience lookup: delegates to #find_by_schema_tag with "description".
#
# @return [String, nil] whatever #find_by_schema_tag returns for that name
def schema_description
  find_by_schema_tag("description")
end

#schema_name ⇒ Object



136
# File 'lib/buzzsaw/dsl.rb', line 136

# Convenience lookup: delegates to #find_by_schema_tag with "name".
#
# @return [String, nil] whatever #find_by_schema_tag returns for that name
def schema_name
  find_by_schema_tag("name")
end

#schema_price ⇒ Object

Schema.org convenience methods



135
# File 'lib/buzzsaw/dsl.rb', line 135

# Convenience lookup: delegates to #find_by_schema_tag with "price".
#
# @return [String, nil] whatever #find_by_schema_tag returns for that name
def schema_price
  find_by_schema_tag("price")
end