Class: Curl::Html
- Inherits:
-
Object
- Object
- Curl::Html
- Defined in:
- lib/searchlink/curl/html.rb
Overview
Class for CURLing an HTML page
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
Returns the value of attribute body.
-
#body_images ⇒ Object
readonly
Returns the value of attribute body_images.
-
#body_links ⇒ Object
readonly
Returns the value of attribute body_links.
-
#code ⇒ Object
readonly
Returns the value of attribute code.
-
#description ⇒ Object
readonly
Returns the value of attribute description.
-
#head ⇒ Object
readonly
Returns the value of attribute head.
-
#headers ⇒ Object
readonly
Returns the value of attribute headers.
-
#links ⇒ Object
readonly
Returns the value of attribute links.
-
#meta ⇒ Object
readonly
Returns the value of attribute meta.
-
#source ⇒ Object
readonly
Returns the value of attribute source.
-
#title ⇒ Object
readonly
Returns the value of attribute title.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
-
#extract(before, after) ⇒ Array
Extract text between two regular expressions.
-
#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array
Extract an array of tags or tag attributes.
-
#extract_tag_contents(tag, source: false) ⇒ Array
Extract tag contents or full tag source.
-
#h(level = '\d') ⇒ Array
Return all headers of given level.
-
#images ⇒ Array
Get all images from the page.
-
#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ HTMLCurl
constructor
Create a new page object from a URL.
-
#tags(tag = nil) ⇒ Array
Return all tags in body, or a specific tag.
- #to_s ⇒ Object
Constructor Details
#initialize(url, headers: nil, headers_only: false, compressed: false) ⇒ HTMLCurl
Create a new page object from a URL
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/searchlink/curl/html.rb', line 26

# Create a new page object from a URL.
#
# Fetches the page through the +curl_html+ helper and copies the pieces of
# its result hash (+:url+, +:code+, +:headers+, +:meta+, +:links+, +:head+,
# +:body+, +:source+) into instance variables.
#
# @param url [String] the URL to fetch
# @param headers [Object, nil] forwarded unchanged to +curl_html+
#   (presumably extra request headers -- confirm against curl_html)
# @param headers_only [Boolean] forwarded unchanged to +curl_html+
# @param compressed [Boolean] forwarded unchanged to +curl_html+
def initialize(url, headers: nil, headers_only: false, compressed: false)
  @curl = TTY::Which.which('curl')
  res = curl_html(url, headers: headers, headers_only: headers_only, compressed: compressed)

  @url = res[:url]
  @code = res[:code]
  @headers = res[:headers]
  @meta = res[:meta]
  @links = res[:links]
  @head = res[:head] unless res[:head].nil?
  @body = reencode(res[:body])
  @source = res[:source]

  # Title and description prefer the Open Graph value and fall back to the
  # plain meta entry; both stay unset when no meta hash was returned.
  unless @meta.nil?
    @title = @meta['og:title'] || @meta['title']
    @description = @meta['og:description'] || @meta['description']
  end

  @body_links = content_links
  @body_images = content_images
end
Instance Attribute Details
#body ⇒ Object (readonly)
Returns the value of attribute body.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute body (the re-encoded +:body+ entry of the
# fetch result, as assigned in #initialize).
def body
  @body
end
#body_images ⇒ Object (readonly)
Returns the value of attribute body_images.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute body_images (the result of
# +content_images+, as assigned in #initialize).
def body_images
  @body_images
end
#body_links ⇒ Object (readonly)
Returns the value of attribute body_links.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute body_links (the result of
# +content_links+, as assigned in #initialize).
def body_links
  @body_links
end
#code ⇒ Object (readonly)
Returns the value of attribute code.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute code (the +:code+ entry of the fetch
# result, as assigned in #initialize).
def code
  @code
end
#description ⇒ Object (readonly)
Returns the value of attribute description.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute description (the +og:description+ meta
# entry, falling back to +description+; nil when no meta was available).
def description
  @description
end
#head ⇒ Object (readonly)
Returns the value of attribute head.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute head (the +:head+ entry of the fetch
# result; nil when the result had none).
def head
  @head
end
#headers ⇒ Object (readonly)
Returns the value of attribute headers.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute headers (the +:headers+ entry of the fetch
# result, as assigned in #initialize).
def headers
  @headers
end
#links ⇒ Object (readonly)
Returns the value of attribute links.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute links (the +:links+ entry of the fetch
# result, as assigned in #initialize).
def links
  @links
end
#meta ⇒ Object (readonly)
Returns the value of attribute meta.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute meta (the +:meta+ entry of the fetch
# result, as assigned in #initialize).
def meta
  @meta
end
#source ⇒ Object (readonly)
Returns the value of attribute source.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute source (the +:source+ entry of the fetch
# result, as assigned in #initialize).
def source
  @source
end
#title ⇒ Object (readonly)
Returns the value of attribute title.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute title (the +og:title+ meta entry, falling
# back to +title+; nil when no meta was available).
def title
  @title
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
13 14 15 |
# File 'lib/searchlink/curl/html.rb', line 13

# Returns the value of attribute url (the +:url+ entry of the fetch result,
# as assigned in #initialize).
def url
  @url
end
Instance Method Details
#extract(before, after) ⇒ Array
Extract text between two regular expressions
51 52 53 54 55 |
# File 'lib/searchlink/curl/html.rb', line 51

# Extract the text found between two delimiters in the page body.
#
# @param before [String, Regexp] opening delimiter; a String is matched
#   literally
# @param after [String, Regexp] closing delimiter; a String is matched
#   literally
# @return [Array<Array<String>>] one entry per match, as produced by
#   String#scan with a capture group (the text between the delimiters;
#   capture groups inside the delimiters add further elements)
def extract(before, after)
  return [] if @body.nil?

  opening = before.is_a?(Regexp) ? before : Regexp.new(Regexp.escape(before))
  closing = after.is_a?(Regexp) ? after : Regexp.new(Regexp.escape(after))

  # Interpolating the Regexp objects (not their #source) keeps each
  # delimiter's own flags and wraps it in a group, so /a|b/ or /x/i behave
  # as written instead of bleeding into the rest of the pattern.
  @body.scan(/#{opening}(.*?)#{closing}/)
end
#extract_tag(tag, attribute = nil, source: false, content: false) ⇒ Hash, Array
Extract an array of tags or tag attributes
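If neither source nor content is set and attribute is not given, an array of hashes (tag, source, attrs, content) is returned.
Examples: page.extract_tag('h1', content: true) ⇒ [Array of h1 tag contents]; page.extract_tag('img', 'src') ⇒ [Array of img src attributes]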
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# File 'lib/searchlink/curl/html.rb', line 79

# Extract an array of tags or tag attributes.
#
# @param tag [String] tag name, passed on to #extract_tag_contents
# @param attribute [String, nil] attribute name to return for each tag
# @param source [Boolean] return the full tag sources (takes precedence
#   over attribute and content)
# @param content [Boolean] return only each tag's content
# @return [Array] array of tag sources if source; array of contents if
#   content; array of attribute values if attribute is given; otherwise an
#   array of hashes with :tag, :source, :attrs and :content
def extract_tag(tag, attribute = nil, source: false, content: false)
  sources = extract_tag_contents(tag, source: true)
  return sources if source

  parsed = sources.map do |tag_source|
    # Read attributes from the opening tag only, so attributes of nested
    # elements are not attributed to this tag.
    opening = tag_source[/\A<.*?>/] || tag_source
    attrs = opening.scan(/(\S+)=(['"])(.*?)\2/).to_h { |name, _quote, value| [name, value] }

    inner = tag_source.match(/<.*?>(?<content>.*?)</)
    contents = inner.nil? ? nil : inner['content']

    { tag: tag, source: tag_source, attrs: attrs, content: contents }
  end

  return parsed.map { |r| r[:content] } if content
  return parsed if attribute.nil?

  parsed.map { |r| r[:attrs][attribute] }
end
#extract_tag_contents(tag, source: false) ⇒ Array
Extract tag contents or full tag source
111 112 113 114 115 |
# File 'lib/searchlink/curl/html.rb', line 111

# Extract tag contents or full tag source from the page body.
#
# @param tag [String] tag name; interpolated into the pattern as-is, so
#   regex syntax in it is honoured
# @param source [Boolean] return the full tag source instead of contents
# @return [Array<String>] full tag sources if source; otherwise the text
#   between each opening tag and the next "<"
def extract_tag_contents(tag, source: false)
  return [] if @body.nil?

  # The lookahead requires the tag name to end here, so asking for "a"
  # does not also match <abbr> or <article>.
  opening = "<#{tag}(?=[\\s/>]).*?>"
  return @body.scan(%r{#{opening}(?:.*?</#{tag}>)?}) if source

  @body.scan(/#{opening}(.*?)</).map(&:first)
end
#h(level = '\d') ⇒ Array
Return all headers of given level
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/searchlink/curl/html.rb', line 203

# Return all headers of given level.
#
# @param level [String, Integer] heading level, interpolated into the
#   pattern; the default '\d' matches any single-digit level
# @return [Array<Hash>] one hash per heading with :level (String), one
#   symbol key per attribute found on the tag, and :text (the heading text
#   after String#remove_entities, a helper defined outside this listing)
def h(level = '\d')
  return [] if @body.nil?

  # The closing tag back-references the captured level so that <h1>...</h2>
  # is not mistaken for a heading when level is a pattern such as '\d'.
  heading = %r{<h(?<level>#{level})(?<tag> .*?)?>(?<text>.*?)</h\k<level>>}i
  attribute = /(?<attr>\w+)=(?<quot>["'])(?<content>.*?)\k<quot>/

  @body.to_enum(:scan, heading).map do
    m = Regexp.last_match
    headline = { level: m['level'] }
    m['tag'].to_s.scan(attribute) { |attr, _quot, content| headline[attr.to_sym] = content }
    headline[:text] = m['text'].remove_entities
    headline
  end
end
#images ⇒ Array
Get all images from the page
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# File 'lib/searchlink/curl/html.rb', line 142

# Get all images from the page.
#
# Collects the +og:image+ / +twitter:image+ meta entries, then every
# +img+ and +source+ tag returned by #tags. Tag attributes are read as a
# list of entries responding to +[:key]+ and +[:value]+.
#
# @return [Array<Hash>] hashes of type 'opengraph' (:src), 'img' (:src,
#   :attrs) or 'srcset' (:attrs, :images => [{ src:, media: }])
def images
  output = []

  %w[og:image twitter:image].each do |src|
    next unless @meta&.key?(src)

    output << { type: 'opengraph', attrs: nil, src: @meta[src] }
  end

  tags(%w[img source]).each do |img|
    attrs = img[:attrs] || []

    case img[:tag].downcase
    when /source/
      srcsets = attrs.select { |a| a[:key] =~ /srcset/i }
      next if srcsets.empty?

      srcset = []
      srcsets.each do |src|
        # Candidates are comma separated; splitting each on any whitespace
        # tolerates the newlines and indentation common in srcset values.
        src[:value].split(',').each do |candidate|
          image, media = candidate.split
          next if image.nil?

          srcset << { src: image, media: media }
        end
      end
      output << { type: 'srcset', attrs: img[:attrs], images: srcset }
    when /img/
      # Skip <img> tags with no src-like attribute rather than raising.
      src = attrs.find { |a| a[:key] =~ /src/i }
      next if src.nil?

      output << { type: 'img', src: src[:value], attrs: img[:attrs] }
    end
  end

  output
end
#tags(tag = nil) ⇒ Array
Return all tags in body, or a specific tag
128 129 130 131 132 133 134 135 |
# File 'lib/searchlink/curl/html.rb', line 128

# Return all tags in body, or a specific tag.
#
# NOTE(review): the helper names were lost when this listing was
# extracted; they are restored here as +content_tags+ and +flatten_tags+
# -- confirm against lib/searchlink/curl/html.rb.
#
# @param tag [String, Array<String>, nil] tag name(s) to keep, compared
#   case-insensitively; nil returns every tag
# @return [Array] all parsed tags when tag is nil; otherwise the flattened
#   tags whose +:tag+ matches one of the requested names
def tags(tag = nil)
  all_tags = content_tags(@body)
  return all_tags if tag.nil?

  # Build a new list instead of map!-ing the argument, so a caller's
  # (possibly frozen) array is left untouched.
  wanted = Array(tag).map(&:downcase)
  flatten_tags(all_tags).select { |t| wanted.include?(t[:tag].downcase) }
end
#to_s ⇒ Object
186 187 188 189 190 191 192 193 194 |
# File 'lib/searchlink/curl/html.rb', line 186

# Short one-line summary of the page: response code, URL, title and
# description, plus the number of headers, meta entries and links (0 for
# any of those that is nil).
#
# @return [String]
def to_s
  header_count = @headers&.count || 0
  meta_count = @meta&.count || 0
  link_count = @links&.count || 0

  [
    %(<HTMLCurl: @code="#{@code}" @url="#{@url}" @title="#{@title}"),
    %(@description=#{@description} @headers:#{header_count} @meta:#{meta_count} @links:#{link_count}>)
  ].join(' ')
end