Module: Buzzsaw::DSL
- Included in:
- Document
- Defined in:
- lib/buzzsaw/dsl.rb
Constant Summary collapse
- ENCODING_EXCEPTION =
defined?(Java) ? Java::JavaNioCharset::UnsupportedCharsetException : Encoding::CompatibilityError
Instance Method Summary collapse
- #asciify_target_text(target) ⇒ Object
- #capture_target_text(text, pattern) ⇒ Object
- #collect_by_xpath(args) ⇒ Object
- #collect_target_text(args, nodes) ⇒ Object
- #filter_target_text(target, filter_list) ⇒ Object (also: #filters)
- #find_by_meta_tag(args) ⇒ Object (also: #label_by_meta_tag)
- #find_by_schema_tag(value) ⇒ Object
-
#find_by_xpath(args) ⇒ Object
Main DSL methods.
- #find_in_table(args) ⇒ Object
- #find_target_text(args, nodes) ⇒ Object
- #get_content_for_meta_nodes(nodes) ⇒ Object
- #get_nodes(args) ⇒ Object
- #get_nodes_for_meta_attribute(args) ⇒ Object
- #join_target_text(nodes, delimiter) ⇒ Object
- #label_by_meta_keywords(args) ⇒ Object
- #label_by_url(args) ⇒ Object
-
#match_table_element(table, element, match, index) ⇒ Object
Private.
- #match_target_text!(nodes, pattern) ⇒ Object
- #meta_description ⇒ Object
- #meta_image ⇒ Object
- #meta_keywords ⇒ Object
- #meta_name(args) ⇒ Object
- #meta_og(value) ⇒ Object
- #meta_og_description ⇒ Object
- #meta_og_image ⇒ Object
- #meta_og_keywords ⇒ Object
- #meta_og_title ⇒ Object
- #meta_price ⇒ Object
-
#meta_property(args) ⇒ Object
Meta tag convenience methods.
- #meta_title ⇒ Object
- #sanitize(text) ⇒ Object
- #schema_description ⇒ Object
- #schema_name ⇒ Object
-
#schema_price ⇒ Object
Schema.org convenience methods.
Instance Method Details
#asciify_target_text(target) ⇒ Object
240 241 242 243 244 245 |
# File 'lib/buzzsaw/dsl.rb', line 240
# Replaces every "â" (U+00E2) character in +target+ with a double quote and
# then calls #to_ascii on the result.
#
# NOTE(review): String#to_ascii is not defined in this file; presumably it
# comes from a transliteration gem — confirm.
#
# @param target [String, nil] text to clean up
# @return [String, nil] the converted text, or nil when +target+ is nil/false
def asciify_target_text(target)
  return unless target

  cleaned = "".dup
  # Compare the character itself rather than searching String#dump output:
  # dump renders U+00E2 as "\u00E2" on current Rubies, so the previous
  # "u{e2}" substring test never matched there.
  target.each_char { |chr| cleaned << (chr == "\u00E2" ? '"' : chr) }
  cleaned.to_ascii
end
#capture_target_text(text, pattern) ⇒ Object
200 201 202 203 |
# File 'lib/buzzsaw/dsl.rb', line 200
# Narrows +text+ to the part selected by +pattern+, or collapses runs of
# whitespace into single spaces when no pattern is given.
#
# @param text [String, nil] text to process
# @param pattern [Regexp, String, nil] selector passed to String#[]
# @return [String, nil] nil when +text+ is nil/false or the pattern misses
def capture_target_text(text, pattern)
  return nil unless text

  if pattern
    text[pattern]
  else
    text.gsub(/\s+/, " ")
  end
end
#collect_by_xpath(args) ⇒ Object
19 20 21 22 23 24 25 26 27 |
# File 'lib/buzzsaw/dsl.rb', line 19
# Gathers the text of all nodes selected by args[:xpath] via
# collect_target_text and returns either args[:label] or the asciified text.
#
# @param args [Hash] options; keys are symbolized in place. When :pattern is
#   set it is copied into both :match and :capture.
# @return [Object, nil] args[:label] when a label is given and text was found,
#   otherwise the result of asciify_target_text
def collect_by_xpath(args)
  args.symbolize_keys!
  if (pattern = args[:pattern])
    args[:capture] = pattern
    args[:match] = pattern
  end

  text = collect_target_text(args, get_nodes(args))
  label = args[:label]
  return label if label && text.present?

  asciify_target_text(text)
end
#collect_target_text(args, nodes) ⇒ Object
182 183 184 185 186 187 188 189 190 191 |
# File 'lib/buzzsaw/dsl.rb', line 182
# Filters +nodes+ by args[:match], joins the survivors with args[:join] and
# applies the args[:capture] pattern to the joined string.
#
# @param args [Hash] reads :match, :join and :capture
# @param nodes [Array] node texts; filtered in place by match_target_text!
# @return [String, nil] the captured text; nil when ENCODING_EXCEPTION is raised
def collect_target_text(args, nodes)
  begin
    # Reduce the matching nodes
    match_target_text!(nodes, args[:match])
    joined = join_target_text(nodes, args[:join])
    # Filter the string with the :capture regex
    capture_target_text(joined, args[:capture])
  rescue ENCODING_EXCEPTION
    nil
  end
end
#filter_target_text(target, filter_list) ⇒ Object Also known as: filters
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/buzzsaw/dsl.rb', line 139
# Applies each filter in +filter_list+ to +target+ in order and returns the
# stripped result. Filters are skipped once the target is no longer present.
#
# A String filter names a public method on this object, called with the
# current target; unknown names are ignored. A Hash filter (keys symbolized
# in place) applies the first of these keys that is set:
#   :accept  - keep only target[value]
#   :reject  - remove the first occurrence of value
#   :prefix  - prepend value
#   :postfix - append value
#
# @param target [String, nil] text to filter; never mutated
# @param filter_list [Array<String, Hash>] filters to apply in order
# @return [String, nil] the filtered text, stripped
def filter_target_text(target, filter_list)
  filter_list.each do |filter|
    next unless target.present?

    if filter.is_a?(String)
      # Handle String filters completely here: falling through to the Hash
      # branches would evaluate "name"[:accept] and raise a TypeError.
      # NOTE(review): this dispatches on a name taken from the filter list;
      # make sure that list never comes from untrusted input.
      target = public_send(filter, target) if respond_to?(filter)
      next
    end

    filter.symbolize_keys! if filter.is_a?(Hash)
    if filter[:accept]
      target = target[filter[:accept]]
    elsif filter[:reject]
      # Work on a copy so the caller's (possibly frozen) string is untouched.
      target = target.dup
      target.slice!(filter[:reject])
    elsif filter[:prefix]
      target = "#{filter[:prefix]}#{target}"
    elsif filter[:postfix]
      target = "#{target}#{filter[:postfix]}"
    end
  end
  target.try(:strip)
end
#find_by_meta_tag(args) ⇒ Object Also known as: label_by_meta_tag
70 71 72 73 74 75 76 77 78 79 |
# File 'lib/buzzsaw/dsl.rb', line 70
# Looks up a <meta> tag and returns its content, optionally narrowed by a
# pattern or replaced by a label.
#
# NOTE(review): the method and callee names were missing from the extracted
# listing and have been restored from the method summary; confirm against
# lib/buzzsaw/dsl.rb.
#
# @param args [Hash] options; keys are symbolized in place. Reads :pattern
#   (falling back to :match) and :label, and passes the hash on to
#   get_nodes_for_meta_attribute.
# @return [Object, nil] nil when no content is found; args[:label] when a
#   label is given and the (narrowed) content is present; otherwise the content
def find_by_meta_tag(args)
  args.symbolize_keys!
  args[:pattern] ||= args[:match] # Backwards compatibility
  nodes = get_nodes_for_meta_attribute(args)
  target = get_content_for_meta_nodes(nodes)
  return unless target

  target = target[args[:pattern]] if args[:pattern]
  return args[:label] if args[:label] && target.present?

  target
end
#find_by_schema_tag(value) ⇒ Object
82 83 84 85 86 87 88 89 90 91 |
# File 'lib/buzzsaw/dsl.rb', line 82
# Finds an element whose itemprop attribute equals +value+ in upper-case,
# lower-case or capitalized form and returns its whitespace-normalized text.
#
# @param value [String] itemprop value to look for
# @return [String, nil] text of the first hit, or nil when nothing matches or
#   the text is blank
def find_by_schema_tag(value)
  candidates = [value.upcase, value.downcase, value.capitalize]
  matches = candidates.map { |name| doc.at_xpath("//*[@itemprop=\"#{name}\"]") }.compact
  return if matches.empty?

  text = matches.first.text.strip.gsub(/\s+/, " ")
  text if text.present?
end
#find_by_xpath(args) ⇒ Object
Main DSL methods
9 10 11 12 13 14 15 16 17 |
# File 'lib/buzzsaw/dsl.rb', line 9
# Picks the first matching node text for args[:xpath] via find_target_text
# and returns either args[:label] or the asciified text.
#
# @param args [Hash] options; keys are symbolized in place. When :pattern is
#   set it is copied into both :match and :capture.
# @return [Object, nil] args[:label] when a label is given and text was found,
#   otherwise the result of asciify_target_text
def find_by_xpath(args)
  args.symbolize_keys!
  if (pattern = args[:pattern])
    args[:capture] = pattern
    args[:match] = pattern
  end

  text = find_target_text(args, get_nodes(args))
  label = args[:label]
  return label if label && text.present?

  asciify_target_text(text)
end
#find_in_table(args) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/buzzsaw/dsl.rb', line 29
# Extracts text from a table located by XPath, narrowing first to a row and
# then, optionally, to a cell within that row.
#
# @param args [Hash] options; keys are symbolized in place:
#   :xpath   - XPath passed to doc.at_xpath to locate the table
#   :row     - Integer position, or a pattern matched against row text
#   :column  - Integer position, or a pattern matched against cell text
#   :capture - optional pattern applied to the resulting text via String#[]
# @return [String, nil] the row or cell text (narrowed by :capture), or nil
#   when the table, row or cell is not found
def find_in_table(args)
  args.symbolize_keys!
  capture = args[:capture]

  # Integer replaces Fixnum, which was deprecated in Ruby 2.4 and removed in
  # Ruby 3.2 (where referencing it raises NameError).
  row_index, match_row = args[:row].is_a?(Integer) ? [args[:row], nil] : [nil, args[:row]]
  column_index, match_column = args[:column].is_a?(Integer) ? [args[:column], nil] : [nil, args[:column]]

  table = doc.at_xpath(args[:xpath])
  return unless table

  # Rows match first
  row = match_table_element(table, "tr", match_row, row_index)
  return unless row

  # Without a column selector the whole row's text is the result.
  unless match_column || column_index
    return capture ? row.text[capture] : row.text
  end

  # Now columns
  col = match_table_element(row, "td", match_column, column_index)
  return unless col

  capture ? col.text[capture] : col.text
end
#find_target_text(args, nodes) ⇒ Object
171 172 173 174 175 176 177 178 179 180 |
# File 'lib/buzzsaw/dsl.rb', line 171
# Filters +nodes+ by args[:match], takes the first survivor, strips it and
# applies the args[:capture] pattern.
#
# @param args [Hash] reads :match and :capture
# @param nodes [Array] node texts; filtered in place by match_target_text!
# @return [String, nil] the captured text; nil when ENCODING_EXCEPTION is raised
def find_target_text(args, nodes)
  begin
    match_target_text!(nodes, args[:match])
    # Select the first match
    first_match = nodes.first.try(:strip)
    # Filter match with the :capture regex
    capture_target_text(first_match, args[:capture])
  rescue ENCODING_EXCEPTION
    nil
  end
end
#get_content_for_meta_nodes(nodes) ⇒ Object
231 232 233 234 235 236 237 238 |
# File 'lib/buzzsaw/dsl.rb', line 231
# Returns the cleaned "content" attribute value of the first node in +nodes+
# that has one.
#
# NOTE(review): the method name was missing from the extracted listing and
# has been restored from the section heading; confirm against
# lib/buzzsaw/dsl.rb.
#
# @param nodes [Array, nil] nodes responding to #attribute
# @return [String, nil] the value, stripped and with runs of spaces squeezed;
#   nil when there are no nodes, no content attribute, or the value is blank
def get_content_for_meta_nodes(nodes)
  return unless nodes && nodes.any?

  contents = nodes.map { |node| node.attribute("content") }.compact
  return if contents.empty?

  content = contents.first.value.strip.squeeze(" ")
  return unless content.present?

  content
end
#get_nodes(args) ⇒ Object
216 217 218 219 |
# File 'lib/buzzsaw/dsl.rb', line 216
# Runs args[:xpath] against the document and returns the text of each hit.
#
# @param args [Hash] reads :xpath
# @return [Array] non-nil text values of the selected nodes
def get_nodes(args)
  doc.xpath(args[:xpath]).map { |node| node.text }.compact
end
#get_nodes_for_meta_attribute(args) ⇒ Object
221 222 223 224 225 226 227 228 229 |
# File 'lib/buzzsaw/dsl.rb', line 221
# Finds <meta> elements under <head> whose given attribute equals
# args[:value] in upper-case, lower-case or capitalized form.
#
# NOTE(review): the method name was missing from the extracted listing and
# has been restored from the section heading; confirm against
# lib/buzzsaw/dsl.rb.
#
# @param args [Hash] reads :attribute (attribute name) and :value
# @return [Array, nil] the first hit for each case variation that matched,
#   or nil when none matched
def get_nodes_for_meta_attribute(args)
  attribute = args[:attribute]
  value_variations = [:upcase, :downcase, :capitalize].map { |method| args[:value].send(method) }
  nodes = value_variations.map do |value|
    doc.at_xpath("//head/meta[@#{attribute}=\"#{value}\"]")
  end.compact
  return if nodes.empty?

  nodes
end
#join_target_text(nodes, delimiter) ⇒ Object
205 206 207 208 209 |
# File 'lib/buzzsaw/dsl.rb', line 205
# Concatenates +nodes+ into one string, separated by +delimiter+.
#
# @param nodes [Array, nil] values to join
# @param delimiter [#to_s] separator; nil yields no separator
# @return [Object, nil] nil when +nodes+ is not present; a single-element
#   list yields that element unchanged
def join_target_text(nodes, delimiter)
  return nil unless nodes.present?

  separator = delimiter.to_s
  nodes.reduce { |joined, node| "#{joined}#{separator}#{node}" }
end
#label_by_meta_keywords(args) ⇒ Object
126 127 128 129 |
# File 'lib/buzzsaw/dsl.rb', line 126
# Returns args[:label] when the page's meta keywords match args[:pattern].
#
# NOTE(review): the method name and both callee names were missing from the
# extracted listing; they are restored here as meta_keywords based on the
# method's name — confirm against lib/buzzsaw/dsl.rb.
#
# @param args [Hash] options; keys are symbolized in place. Reads :label and
#   :pattern.
# @return [Object, nil] args[:label] on a match, otherwise nil
def label_by_meta_keywords(args)
  args.symbolize_keys!
  return args[:label] if meta_keywords && meta_keywords[args[:pattern]]
end
#label_by_url(args) ⇒ Object
93 94 95 96 |
# File 'lib/buzzsaw/dsl.rb', line 93
# Returns args[:label] when the string form of +url+ matches args[:pattern].
#
# @param args [Hash] options; keys are symbolized in place. Reads :label and
#   :pattern.
# @return [Object, nil] args[:label] on a match, otherwise nil
def label_by_url(args)
  args.symbolize_keys!
  address = url.to_s
  args[:label] if address[args[:pattern]]
end
#match_table_element(table, element, match, index) ⇒ Object
Private
164 165 166 167 168 169 |
# File 'lib/buzzsaw/dsl.rb', line 164
# Locates a descendant +element+ (e.g. "tr" or "td") of +table+, first by
# matching its text against +match+ and otherwise by position.
#
# @param table [#xpath] node to search within
# @param element [String] element name to look for
# @param match [Regexp, String, nil] pattern tested against each element's text
# @param index [Integer, nil] position used in the XPath predicate when
#   +match+ is absent or finds nothing
# @return [Object, nil] the first element whose text matches, else the
#   non-empty result of the positional lookup, else nil
def match_table_element(table, element, match, index)
  if match
    found = table.xpath(".//#{element}").find { |node| node.text && node.text[match] }
    return found if found
  end
  return unless index

  # An empty result set is still truthy, which defeated the callers'
  # `return unless row = ...` guards; report "not found" as nil instead.
  indexed = table.xpath(".//#{element}[#{index}]")
  indexed unless indexed.empty?
end
#match_target_text!(nodes, pattern) ⇒ Object
193 194 195 196 197 198 |
# File 'lib/buzzsaw/dsl.rb', line 193
# Filters +nodes+ in place, keeping entries that are present or, when
# +pattern+ is given, entries whose pattern match is present.
#
# @param nodes [Array, nil] node texts; modified in place
# @param pattern [Regexp, String, nil] selector applied with #[]
# @return [Array, nil] the result of Array#select!, or nil when +nodes+ is
#   not present
def match_target_text!(nodes, pattern)
  return nil unless nodes.present?

  nodes.select! do |node|
    candidate = pattern ? node[pattern] : node
    candidate.present?
  end
end
#meta_description ⇒ Object
117 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_description; the callee receiving (value: 'description') is missing — presumably meta_name or meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 117 def ; (value: 'description'); end |
#meta_image ⇒ Object
118 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_image; the callee receiving (value: 'image') is missing — presumably meta_name or meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 118 def ; (value: 'image'); end |
#meta_keywords ⇒ Object
116 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_keywords; the callee receiving (value: 'keywords') is missing — presumably meta_name or meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 116 def ; (value: 'keywords'); end |
#meta_name(args) ⇒ Object
107 108 109 110 111 |
# File 'lib/buzzsaw/dsl.rb', line 107
# Looks up a <meta> tag by its "name" attribute.
#
# NOTE(review): the method name and the find_by_meta_tag callee were missing
# from the extracted listing and have been restored from the method summary;
# confirm against lib/buzzsaw/dsl.rb.
#
# @param args [Hash] options; keys are symbolized in place and
#   attribute: 'name' is merged into the hash before it is passed on
# @return [Object, nil] whatever find_by_meta_tag returns
def meta_name(args)
  args.symbolize_keys!
  args.merge!(attribute: 'name')
  find_by_meta_tag(args)
end
#meta_og(value) ⇒ Object
113 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_og(value); the callee receiving (value: "og:#{value}") is missing — presumably meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 113 def (value); (value: "og:#{value}"); end |
#meta_og_description ⇒ Object
123 |
# File 'lib/buzzsaw/dsl.rb', line 123
# Convenience wrapper: meta_og('description').
#
# NOTE(review): the method and callee names were missing from the extracted
# listing and have been restored from the method summary; confirm against
# lib/buzzsaw/dsl.rb.
#
# @return [Object, nil] whatever meta_og returns
def meta_og_description
  meta_og('description')
end
#meta_og_image ⇒ Object
124 |
# File 'lib/buzzsaw/dsl.rb', line 124
# Convenience wrapper: meta_og('image').
#
# NOTE(review): the method and callee names were missing from the extracted
# listing and have been restored from the method summary; confirm against
# lib/buzzsaw/dsl.rb.
#
# @return [Object, nil] whatever meta_og returns
def meta_og_image
  meta_og('image')
end
#meta_og_keywords ⇒ Object
122 |
# File 'lib/buzzsaw/dsl.rb', line 122
# Convenience wrapper: meta_og('keywords').
#
# NOTE(review): the method and callee names were missing from the extracted
# listing and have been restored from the method summary; confirm against
# lib/buzzsaw/dsl.rb.
#
# @return [Object, nil] whatever meta_og returns
def meta_og_keywords
  meta_og('keywords')
end
#meta_og_title ⇒ Object
121 |
# File 'lib/buzzsaw/dsl.rb', line 121
# Convenience wrapper: meta_og('title').
#
# NOTE(review): the method and callee names were missing from the extracted
# listing and have been restored from the method summary; confirm against
# lib/buzzsaw/dsl.rb.
#
# @return [Object, nil] whatever meta_og returns
def meta_og_title
  meta_og('title')
end
#meta_price ⇒ Object
119 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_price; the callee receiving (value: 'price') is missing — presumably meta_name or meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 119 def ; (value: 'price'); end |
#meta_property(args) ⇒ Object
Meta tag convenience methods
101 102 103 104 105 |
# File 'lib/buzzsaw/dsl.rb', line 101
# Looks up a <meta> tag by its "property" attribute.
#
# NOTE(review): the method name and the find_by_meta_tag callee were missing
# from the extracted listing and have been restored from the method summary;
# confirm against lib/buzzsaw/dsl.rb.
#
# @param args [Hash] options; keys are symbolized in place and
#   attribute: 'property' is merged into the hash before it is passed on
# @return [Object, nil] whatever find_by_meta_tag returns
def meta_property(args)
  args.symbolize_keys!
  args.merge!(attribute: 'property')
  find_by_meta_tag(args)
end
#meta_title ⇒ Object
115 |
# NOTE(review): identifiers were lost when this listing was extracted. Per the section heading this is #meta_title; the callee receiving (value: 'title') is missing — presumably meta_name or meta_property. Restore both from lib/buzzsaw/dsl.rb.
# File 'lib/buzzsaw/dsl.rb', line 115 def ; (value: 'title'); end |
#sanitize(text) ⇒ Object
211 212 213 214 |
# File 'lib/buzzsaw/dsl.rb', line 211
# Passes +text+ through Sanitize.clean with an empty element list, then
# decodes HTML entities in the result.
#
# @param text [String] markup to clean
# @return [String, nil] the decoded text, or nil when Sanitize.clean returns
#   nil/false
def sanitize(text)
  stripped = Sanitize.clean(text, elements: [])
  return unless stripped

  HTMLEntities.new.decode(stripped)
end
#schema_description ⇒ Object
137 |
# File 'lib/buzzsaw/dsl.rb', line 137
# Convenience wrapper: find_by_schema_tag("description").
#
# @return [String, nil] whatever find_by_schema_tag returns
def schema_description
  find_by_schema_tag("description")
end
#schema_name ⇒ Object
136 |
# File 'lib/buzzsaw/dsl.rb', line 136
# Convenience wrapper: find_by_schema_tag("name").
#
# @return [String, nil] whatever find_by_schema_tag returns
def schema_name
  find_by_schema_tag("name")
end
#schema_price ⇒ Object
Schema.org convenience methods
135 |
# File 'lib/buzzsaw/dsl.rb', line 135
# Convenience wrapper: find_by_schema_tag("price").
#
# @return [String, nil] whatever find_by_schema_tag returns
def schema_price
  find_by_schema_tag("price")
end