Class: Raev::Url
- Inherits:
-
Object
- Object
- Raev::Url
- Defined in:
- lib/raev/url.rb
Constant Summary collapse
- AUTHOR_CSS_SELECTORS =
[ '.c-byline__item a'.freeze, '.author-info .name'.freeze, '.author-top a'.freeze, '.yt-user-info a'.freeze, 'a[rel~="author"]'.freeze, 'a[itemprop~="author"]'.freeze, '.author h3 a'.freeze, '.author'.freeze, '.posted-by a'.freeze, '.entryAuthor a'.freeze, 'a.names'.freeze, 'a.byline-author'.freeze, '.byline a'.freeze, '.author.vcard a'.freeze, 'p.info a'.freeze, '.author-name'.freeze, '.upcased'.freeze, 'a[rel~="nofollow"]'.freeze ]
- REGEX_UTM =
/(\?|&)utm_/
- REGEX_URL_DATE =
/[0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2}/
- REGEX_ENTRY_DATE =
/[^a-zA-Z0-9\s]/
- REGEX_PAGE_TITLE =
/ +/
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
Returns the value of attribute body.
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Class Method Summary collapse
- .base(url) ⇒ Object
- .remove_utm(url) ⇒ Object
Instance Method Summary collapse
- #author ⇒ Object
- #bestRating ⇒ Object
- #document ⇒ Object
- #feed ⇒ Object
- #headline ⇒ Object
-
#initialize(url) ⇒ Url
constructor
A new instance of Url.
- #pubdate ⇒ Object
- #ratingValue ⇒ Object
- #twitter ⇒ Object
- #without_http ⇒ Object
Constructor Details
#initialize(url) ⇒ Url
Returns a new instance of Url.
40 41 42 43 44 45 |
# File 'lib/raev/url.rb', line 40
#
# Builds a Url for the given address.
#
# Hands +url+ to +fetch+ (defined outside this excerpt), then replaces @url
# with the result of Url.remove_utm and resets @doc and @linked_data to nil.
# NOTE(review): presumably +fetch+ is what assigns @url and @body -- confirm
# against its definition.
#
# @param url address passed straight to +fetch+
def initialize(url)
  fetch(url)
  @doc = @linked_data = nil
  @url = Url.remove_utm(@url)
end
Instance Attribute Details
#body ⇒ Object (readonly)
Returns the value of attribute body.
37 38 39 |
# File 'lib/raev/url.rb', line 37
#
# Read-only accessor for the @body instance variable.
#
# @return [Object] whatever is currently stored in @body (nil when unset)
def body; @body; end
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
38 39 40 |
# File 'lib/raev/url.rb', line 38
#
# Read-only accessor for the @doc instance variable. Unlike #document it
# never triggers parsing; it just reports the current value.
#
# @return [Object] whatever is currently stored in @doc (nil when unset)
def doc; @doc; end
#url ⇒ Object (readonly)
Returns the value of attribute url.
36 37 38 |
# File 'lib/raev/url.rb', line 36
#
# Read-only accessor for the @url instance variable.
#
# @return [Object] whatever is currently stored in @url (nil when unset)
def url; @url; end
Class Method Details
.base(url) ⇒ Object
47 48 49 50 51 |
# File 'lib/raev/url.rb', line 47
#
# Extracts the host part of a URL and drops a leading "www." label.
#
# The host is taken to be the third "/"-separated piece of the input (what
# follows "scheme://"), so a string without that piece yields nil.
#
# @param url [String, nil] absolute URL such as "http://www.example.com/a"
# @return [String, nil] host without a leading "www.", or nil when +url+ is
#   nil or has no third "/"-separated segment
def self.base(url)
  # Mirror remove_utm, which also tolerates a nil URL.
  return nil if url.nil?

  host = url.split('/'.freeze)[2]
  return nil if host.nil?

  # Anchor at the start so only a genuine "www." prefix is removed; a plain
  # substring replacement would also mangle hosts like "awww.example.com".
  host.sub(/\Awww\./, ''.freeze)
end
.remove_utm(url) ⇒ Object
53 54 55 56 57 58 59 60 61 62 |
# File 'lib/raev/url.rb', line 53
#
# Truncates a URL at the first place REGEX_UTM matches, i.e. at the "?" or
# "&" that introduces the first "utm_" parameter. Everything from that
# separator onwards is discarded, including any parameters that follow.
#
# @param url [String, nil] URL to clean
# @return [String, nil] the shortened URL, or +url+ unchanged when it is nil
#   or contains no match
def self.remove_utm(url)
  return url if url.nil?

  cut_at = url.index(REGEX_UTM)
  cut_at.nil? ? url : url.slice(0, cut_at)
end
Instance Method Details
#author ⇒ Object
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/raev/url.rb', line 165
#
# Best-effort extraction of the page author's name.
#
# Strategy, in order:
# 1. the +content+ attribute of the first <meta name="author"> element,
#    returned as-is;
# 2. the text of the first element matching any AUTHOR_CSS_SELECTORS entry,
#    accepted only when it has at most four whitespace-separated words, then
#    passed through Sanitize.clean, stripped and capped at 256 characters.
#
# @return [String] the author name, or an empty string when nothing usable
#   is found
def author
  meta = document.search('meta[name="author"]'.freeze).first
  content_attr = meta && meta.attribute("content".freeze)
  return content_attr.value if content_attr

  byline = document.search(AUTHOR_CSS_SELECTORS.join(", ".freeze)).first
  if byline
    text = byline.content
    # Longer matches are treated as not being a name and are ignored.
    return Sanitize.clean(text).strip[0..255] if text.split.size <= 4
  end

  "".freeze
end
#bestRating ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
# File 'lib/raev/url.rb', line 203
#
# Reads the +content+ attribute of the first element carrying
# itemprop="bestRating" and converts it with String#to_f.
#
# @return [Float, nil] the converted value, or nil when there is no such
#   element or it has no +content+ attribute
def bestRating
  node = document.search('*[itemprop="bestRating"]'.freeze).first
  return nil unless node

  content_attr = node.attribute("content".freeze)
  return nil unless content_attr

  raw = content_attr.value
  raw ? raw.to_f : nil
end
#document ⇒ Object
219 220 221 222 223 224 225 |
# File 'lib/raev/url.rb', line 219
#
# Lazily parsed view of the page: the first call feeds @body to
# Nokogiri::HTML and stores the result in @doc; later calls reuse it.
#
# @return [Object] the value cached in @doc (the Nokogiri::HTML result)
def document
  @doc ||= Nokogiri::HTML(@body)
end
#feed ⇒ Object
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
# File 'lib/raev/url.rb', line 79
#
# Locates the page's feed URL.
#
# Looks first for <link type="application/rss+xml" rel="alternate">; if none
# exists, falls back to the first anchor selected by the custom
# +match_href("http://feeds.")+ pseudo-class (implemented by Raev::Parser,
# not shown here). An href starting with "/" is made absolute against the
# origin (scheme + host) of @url rather than being appended to the full page
# URL, which produced wrong addresses for any page below the site root.
#
# @return [String, nil] the feed URL, or nil when no candidate is found
def feed
  feed_url = nil

  rss_links = document.css('link[type="application/rss+xml"][rel="alternate"]'.freeze)
  if rss_links.first
    feed_url = rss_links.first["href"]
  else
    feed_anchors = document.css('a:match_href("http://feeds.")'.freeze, Raev::Parser.new)
    feed_url = feed_anchors.first["href"] if feed_anchors.first
  end

  if feed_url && feed_url[0, 1] == "/".freeze
    origin = @url[%r{\A[a-z][a-z0-9+.\-]*://[^/?#]+}i]
    if origin.nil?
      # @url is not of the form scheme://host...; keep the historical concatenation.
      feed_url = @url + feed_url
    elsif feed_url[0, 2] == "//".freeze
      # Protocol-relative href: it already names a host, so borrow only the scheme.
      feed_url = origin[/\A[^:]+:/] + feed_url
    else
      feed_url = origin + feed_url
    end
  end

  feed_url
end
#headline ⇒ Object
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/raev/url.rb', line 101
#
# Determines the page headline.
#
# Strategy, in order:
# 1. the "headline" entry of +linked_data+ (defined outside this excerpt),
#    passed through Sanitize.clean and returned immediately;
# 2. the +data-text+ attribute of the first ".twitter-share-button" element;
# 3. the +content+ of a <head> <meta> whose +property+ is "og:title" or
#    "twitter:title" (when several match, the last one wins);
# 4. the text of the first "#article h1", a[rel="bookmark"] or
#    h2[itemprop="name"] element.
# For 2-4, every match of REGEX_PAGE_TITLE is replaced by a single space.
#
# @return [String, nil] the headline, or nil when no strategy yields one
def headline
  if linked_data && linked_data["headline"]
    return Sanitize.clean(linked_data["headline"])
  end

  page_title = nil

  share_button = document.css(".twitter-share-button".freeze).first
  page_title = share_button['data-text'] if share_button && share_button['data-text']

  if page_title.nil?
    document.css("head meta".freeze).each do |meta|
      if meta['property'] == 'og:title'.freeze || meta['property'] == 'twitter:title'.freeze
        page_title = meta['content']
      end
    end
  end

  if page_title.nil?
    heading = document.css("#article h1, a[rel=\"bookmark\"], h2[itemprop=\"name\"]".freeze).first
    page_title = heading.content if heading
  end

  # Non-mutating gsub: same result, but does not modify the string handed
  # back by the parsed document.
  page_title.nil? ? nil : page_title.gsub(REGEX_PAGE_TITLE, ' '.freeze)
end
#pubdate ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
# File 'lib/raev/url.rb', line 139
#
# Determines the publication date of the page.
#
# Strategy, in order:
# 1. the "datePublished" entry of +linked_data+ (defined outside this
#    excerpt), parsed with Date.parse;
# 2. a "YYYY/M/D" run in @url (REGEX_URL_DATE), used only when it forms a
#    real calendar date, so paths such as ".../2014/13/45/..." no longer
#    raise and instead fall through to the next strategy;
# 3. the +content+ attribute of the first meta[itemprop='datePublished'] or
#    meta[name='pub_date'] element, parsed with Date.parse;
# 4. the text of the first ".entryDate"/".entrydate" element, stripped of
#    REGEX_ENTRY_DATE matches and handed to Chronic.parse.
#
# @return [Date, Object, nil] a Date for strategies 1-3, whatever
#   Chronic.parse returns for strategy 4, or nil when nothing is found
# @raise [ArgumentError] when Date.parse is given text it cannot interpret
def pubdate
  if linked_data && linked_data["datePublished"]
    return Date.parse(linked_data["datePublished"])
  end

  year, month, day = @url.match(REGEX_URL_DATE).to_s.split("/".freeze).map(&:to_i)
  return Date.new(year, month, day) if day && Date.valid_date?(year, month, day)

  meta = document.search("meta[itemprop='datePublished'], meta[name='pub_date']".freeze).first
  # Read the attribute's string value explicitly, as #author and #bestRating
  # do, and skip the element when it carries no content attribute.
  content_attr = meta && meta.attribute("content".freeze)
  return Date.parse(content_attr.value) if content_attr

  entry = document.search(".entryDate, .entrydate".freeze).first
  return Chronic.parse(entry.content.gsub(REGEX_ENTRY_DATE, "".freeze).strip) if entry

  nil
end
#ratingValue ⇒ Object
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/raev/url.rb', line 185
#
# Reads the rating from the first element carrying itemprop="ratingValue".
# The +content+ attribute is preferred; when it is absent the element's text
# is used instead. The chosen string is converted with String#to_f.
#
# @return [Float, nil] the converted rating, or nil when no such element
#   exists
def ratingValue
  node = document.search('*[itemprop="ratingValue"]'.freeze).first
  return nil unless node

  content_attr = node.attribute("content".freeze)
  raw = content_attr ? content_attr.value : node.content
  raw ? raw.to_f : nil
end
#twitter ⇒ Object
68 69 70 71 72 73 74 75 76 77 |
# File 'lib/raev/url.rb', line 68
#
# Returns the last "/"-separated segment of the href of the first anchor
# selected by the custom +match_href("twitter.com")+ pseudo-class.
# NOTE(review): the pseudo-class is implemented by Raev::Parser, which is not
# visible here; presumably it matches anchors whose href contains the given
# text -- confirm.
#
# @return [String, nil] the trailing href segment, or nil when no anchor
#   matches
def twitter
  link = document.css('a:match_href("twitter.com")'.freeze, Raev::Parser.new).first
  return nil unless link

  link["href"].split('/'.freeze).last
end
#without_http ⇒ Object
64 65 66 |
# File 'lib/raev/url.rb', line 64
#
# Returns @url with a leading "http://" removed.
#
# The match is anchored to the start of the string, so an "http://" that
# appears later (for example inside a query parameter) is left alone.
# URLs using any other scheme, including "https://", are returned unchanged.
#
# @return [String] @url without its "http://" prefix
def without_http
  @url.sub(%r{\Ahttp://}, "".freeze)
end