Class: ReadabilityJs::Extended

Inherits:
Object
  • Object
show all
Defined in:
lib/readability_js/extended.rb

Constant Summary collapse

# CSS selectors removed from every document by pre_parser before the HTML is
# handed to Readability. Frozen so the shared default list cannot be mutated
# in place; pre_parser combines it with caller-supplied selectors using the
# non-mutating Array#+.
DEFAULT_SELECTOR_BLACKLIST =
[
  ".Article-Partner",
  ".Article-Partner-Text",
  ".Article-Comments-Button",
  "#isl-5-AdCarousel",
  "#isl-10-ArticleComments",
  "*[data-element-tracking-name]",
  "*[aria-label='Anzeige']",
  "nav[aria-label='breadcrumb']",
  "a-video",
  "a-gift",
  "a-collapse",
  "a-opt-in",
  "[data-area='related_articles']",
  "nav[aria-label='Breadcrumb']",
  ".c-inline-teaser-list",
  "[width='1'][height='1']",
  ".go-alink-list",
  "[data-external-selector='related-articles-entries']",
  ".BigBox",
  ".id-Breadcrumb-item",
  ".id-Story-interactionBar",
  "revenue-reel",
  ".id-StoryElement-factBox",
  ".breadcrumb",
  ".teaser",
  ".group-teaserblock__items",
  ".title__kicker",
  "ws-adtag",
  "[data-for='webelement_bio']",
  "[data-for='webelement_citation']",
  "#articleTeaser",
  ".article-produktteaser-container",
  "[x-data='{}']",
  "#komune",
  ".community",
  ".article-head__topline",
  ".article__audioicon",
  ".auplayer",
].freeze

Class Method Summary collapse

Class Method Details

.after_cleanup(result, html) ⇒ Object



49
50
51
52
# File 'lib/readability_js/extended.rb', line 49

# Post-processing entry point: first runs find_and_add_picture on the result,
# then passes the same result to clean_up_and_enrich_result.
#
# @param result [Hash] the result hash to post-process
# @param html [String] the HTML handed on to find_and_add_picture
# @return [Object] the return value of clean_up_and_enrich_result
def self.after_cleanup(result, html)
  find_and_add_picture(result, html)
  clean_up_and_enrich_result(result)
end

.beautify_html_and_text(result) ⇒ Hash

Beautify HTML and text content by adding the title if not present, inserting the lead image if no image exists, and fixing link spacing

Parameters:

  • result (Hash)

    The result hash from Readability parsing.

Returns:

  • (Hash)

    The result hash with the beautified "content" and "text_content" entries.



221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/readability_js/extended.rb', line 221

# Beautifies the "content" (HTML) and "text_content" entries of the result.
#
# Steps:
# 1. Prepends the title as an <h1> (and as a first text line) when a
#    non-blank title exists and either no <h1>/<h2> tag is present, or the
#    first such tag starts beyond offset 128 and the title text does not
#    occur in the HTML.
# 2. Inserts result["image_url"] after the first <h1> when the HTML holds no
#    <img>/<picture> element.
# 3. Adds a space after a link that is directly followed by a letter/digit.
#
# @param result [Hash] the result hash; reads "content", "text_content",
#   "title" and "image_url"
# @return [Hash] the same hash with "content" and "text_content" updated
def self.beautify_html_and_text(result)
  html = result["content"].to_s
  text = result["text_content"]
  title = result["title"].to_s

  # Look for a real heading tag; a bare /h[1-2]/ would also hit text such as
  # "h264" or URLs containing "h1".
  heading_index = html.index(/<h[12]\b/i)
  title_missing = heading_index.nil? || (heading_index > 128 && !html.include?(title))
  # The title must be present in every branch, otherwise an empty <h1> would
  # be emitted and the text concatenation would fail on nil.
  if !title.strip.empty? && title_missing
    # NOTE(review): title is interpolated into HTML without escaping — confirm
    # upstream guarantees it contains no markup-significant characters.
    html = "<h1>#{title}</h1>\n" + html
    text = title + "\n\n" + text.to_s
  end

  # Check for image and if none is found, add after title if available
  image_url = result["image_url"].to_s
  unless image_url.strip.empty?
    doc = Nokogiri::HTML(html)
    # check for img tags but also for picture tags
    if doc.css('img, picture').empty?
      h1 = doc.at_css('h1')
      if h1
        # NOTE(review): image_url is placed in the src attribute unescaped.
        img_tag = "<p><img src=\"#{image_url}\"></p>\n"
        h1.add_next_sibling(Nokogiri::HTML::DocumentFragment.parse(img_tag))
        html = doc.to_html
      end
    end
  end

  # Add a space after a link if immediately followed by an alphanumeric char (missing separation).
  doc = Nokogiri::HTML(html)
  doc.css('a').each do |link|
    following = link.next_sibling
    next if following.nil?
    if following.text? && following.content.match?(/\A[A-Za-z0-9ÄÖÜäöüß]/)
      link.add_next_sibling(Nokogiri::XML::Text.new(' ', doc))
    end
  end

  result["content"] = doc.to_html
  result["text_content"] = text
  result
end

.beautify_markdown(result) ⇒ Hash

Beautify Markdown content by adding title if not present and fixing link spacing

Parameters:

  • result (Hash)

    The result hash from Readability parsing.

Returns:

  • (Hash)

    The beautified result hash.



195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# File 'lib/readability_js/extended.rb', line 195

# Beautifies the "markdown_content" entry of the result.
#
# Steps:
# 1. Prepends "# <title>" when the markdown does not start with a level-1
#    heading, a non-blank title exists and the title text is not already
#    contained in the markdown.
# 2. Inserts result["image_url"] as a markdown image after the first
#    "# ..." line when the markdown holds no image (markdown or HTML form).
# 3. Adds a space after a markdown link that is directly followed by a
#    letter/digit.
#
# @param result [Hash] the result hash; reads "markdown_content", "title"
#   and "image_url"
# @return [Hash] the same hash with "markdown_content" updated
def self.beautify_markdown(result)
  # to_s guards against a missing entry; dup keeps the caller's string
  # (possibly frozen) untouched by the in-place gsub! below.
  mark_down = result["markdown_content"].to_s.dup
  title = result["title"].to_s

  # add title to markdown if not present
  if !mark_down.start_with?("# ") && !title.strip.empty? && !mark_down.include?(title)
    mark_down = "# #{title}\n\n" + mark_down
  end

  # Check for image and if none is found, add after title if available
  image_url = result["image_url"].to_s
  unless image_url.strip.empty?
    has_image = mark_down.match?(/!\[.*?\]\(.*?\)/) ||
                mark_down.match?(/<img\b[^>]*>/) ||
                mark_down.match?(/<picture\b[^>]*>.*?<\/picture>/m)
    unless has_image
      img_md = "![image](#{image_url})\n\n"
      # Block form: the URL is inserted literally, so backslash sequences in
      # it are not interpreted as replacement back-references.
      mark_down = mark_down.sub(/^# .+?\n/) { |heading| heading + img_md }
    end
  end

  # Add a space after markdown links if immediately followed by an alphanumeric char (missing separation).
  # The atomic group (?>...) matches the same link targets as a plain group
  # but cannot backtrack exponentially on an unclosed "](".
  mark_down.gsub!(/(\[[^\]]+\]\((?>[^\)"']+|"[^"]*"|'[^']*')*\))(?=[A-Za-z0-9ÄÖÜäöüß])/, '\1 ')
  result["markdown_content"] = mark_down
  result
end

.before_cleanup(html, blacklist_selectors: []) ⇒ Object



45
46
47
# File 'lib/readability_js/extended.rb', line 45

# Pre-processing entry point: delegates to pre_parser with the same
# arguments.
#
# @param html [String] the HTML document handed on to pre_parser
# @param blacklist_selectors [Array] selectors handed on to pre_parser
# @return [Object] the return value of pre_parser
def self.before_cleanup(html, blacklist_selectors: [])
  pre_parser(html, blacklist_selectors: blacklist_selectors)
end

.clean_up_and_enrich_result(result) ⇒ Hash

Post-parser to clean up extracted content after Readability processing

Cleans up comment artifacts and beautifies HTML and adds beautified Markdown content.

Parameters:

  • result (Hash)

    The result hash from Readability parsing.

Returns:

  • (Hash)

    The cleaned result hash.



127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/readability_js/extended.rb', line 127

# Post-parser step: runs clean_up_comments over the textual entries that are
# present in the result and, when "content" exists, beautifies the HTML/text
# and derives a beautified "markdown_content" from the HTML.
#
# @param result [Hash] the result hash from Readability parsing
# @return [Hash] the cleaned and enriched result hash
def self.clean_up_and_enrich_result(result)
  # Only keys that are already present are touched.
  %w[content text_content excerpt byline].each do |field|
    result[field] = clean_up_comments(result[field]) if result.key?(field)
  end
  return result unless result.key?("content")

  enriched = beautify_html_and_text(result)
  if enriched.key?("content")
    enriched["markdown_content"] = ReverseMarkdown.convert(enriched["content"])
  end
  beautify_markdown(enriched)
end

.clean_up_comments(html) ⇒ String

Remove/replace comment and artifact noise such as <!--[-->, <!----> etc.

Parameters:

  • html (String)

    The HTML content as a string.

Returns:

  • (String)

    The cleaned HTML content as a string.



146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# File 'lib/readability_js/extended.rb', line 146

# Removes HTML comments, comment artifacts (e.g. <!--[-->, <!--]-->), script
# elements and superfluous whitespace from the given string. The contents of
# <pre>, <code> and <textarea> elements are set aside before the whitespace
# between/inside tags is normalized and are put back verbatim afterwards.
#
# @param html [String, nil] the HTML (or text) to clean; nil yields ""
# @return [String] a cleaned, stripped copy; the argument is not modified
def self.clean_up_comments(html)
  copy = html.to_s.dup
  # Turn \x3C before comment start into '<'
  copy.gsub!(/\\x3C(?=!--)/, '<')
  # Decode encoded comment end --&gt; to -->
  copy.gsub!(/--&gt;/, '-->')
  # Remove fully empty or artifact comments ([], only whitespace)
  copy.gsub!(/<!--\s*(?:\[|\]|)*\s*-->/, '')
  # Collapse multiple dummy comment chains
  copy.gsub!(/(?:<!--\s*-->\s*)+/, '')
  # Remove remaining comment artifacts like <!--[-->, <!--]-->
  copy.gsub!(/<!--\[\]-->|<!--\[\s*-->|<!--\]\s*-->/, '')
  # Remove any remaining regular comments
  copy.gsub!(/<!--.*?-->/m, '')
  # Reduce excessive whitespace / blank lines (real newlines)
  # NOTE(review): this runs before the preserve step below, so blank lines
  # inside <pre>/<code>/<textarea> are reduced as well — confirm intended.
  copy.gsub!(/\n[ \t]+\n/, "\n")
  copy.gsub!(/\n{3,}/, "\n\n")
  # Remove any remaining script tags (including encoded variants)
  copy.gsub!(/(?:\\x3C|<)script\b[^>]*?(?:>|\\x3E|&gt;).*?(?:\\x3C|<)\/script(?:>|\\x3E|&gt;)/im, '')
  # Preserve blocks where whitespace/newlines matter: each block is swapped
  # for a unique placeholder key. \b keeps e.g. <preview> from being treated
  # as <pre>.
  preserved = {}
  %w[pre code textarea].each_with_index do |tag, idx|
    copy.gsub!(/<#{tag}\b[^>]*>.*?<\/#{tag}>/mi) do |block|
      key = "__PRESERVE_BLOCK_#{tag.upcase}_#{idx}_#{preserved.size}__"
      preserved[key] = block
      key
    end
  end
  # Remove literal backslash+n sequences (if they exist as textual artifacts) outside preserved blocks
  copy.gsub!(/\\n\s*/, ' ')
  # Remove whitespace-only text between tags
  copy.gsub!(/>\s+</, '><')
  # Normalize multiple spaces to a single space
  copy.gsub!(/ {2,}/, ' ')
  # Trim leading whitespace directly after a tag (e.g., <p> text)
  copy.gsub!(/>\s+([^<])/) { ">#{$1}" }
  # Restore preserved blocks. Reverse order: a block stored later may contain
  # the placeholder of a block stored earlier (e.g. <code> inside <textarea>),
  # so the outer one has to be put back first. The block form inserts the
  # content literally, so backslashes in code samples are not interpreted as
  # replacement escapes.
  preserved.to_a.reverse_each { |key, block| copy.sub!(key) { block } }
  copy.strip
end

.find_and_add_picture(result, html) ⇒ Hash

Post-parser to find and add lead image URL if missing.

Will add a picture into the result hash under the key “image_url”.

Looks for Open Graph and Twitter Card meta tags to find a lead image URL. If none is found, it falls back to the first image in the markdown content.

Parameters:

  • result (Hash)

    The result hash from Readability parsing.

  • html (String)

    The original HTML document as a string.

Returns:

  • (Hash)

    The updated result hash.



88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/readability_js/extended.rb', line 88

# Looks up a lead image and stores its URL under result["image_url"].
#
# Returns early when result["lead_image_url"] is already non-blank.
# Otherwise the first non-blank content of an og:image / twitter:image meta
# tag in the HTML is used. If that yields nothing, the first markdown image
# in result["markdown_content"] whose URL ends in a known image extension
# (optionally followed by a query string) is used.
#
# @param result [Hash] the result hash from Readability parsing
# @param html [String] the original HTML document
# @return [Hash] the same result hash, possibly with "image_url" set
def self.find_and_add_picture(result, html)
  return result unless result["lead_image_url"].to_s.strip.empty?

  doc = Nokogiri::HTML(html)
  # try to find og:image or twitter:image meta tags
  meta_tags = doc.css('meta[property="og:image"], meta[name="og:image"], meta[name="twitter:image"]')
  meta_tags.each do |meta_tag|
    content = meta_tag['content'].to_s.strip
    next if content.empty?
    result["image_url"] = content
    break
  end
  return result unless result["image_url"].to_s.strip.empty?

  # try to find first image in markdown content if no meta tag found before
  result["markdown_content"].to_s.scan(/!\[.*?\]\((.*?)\)/).each do |match|
    # A markdown image may carry a quoted title after the URL
    # (![alt](url "title")); drop it so the extension check sees the URL only.
    img_url = match[0].to_s.strip.sub(/\s+(?:"[^"]*"|'[^']*')\z/, '')
    next if img_url.empty?
    # check if img ends with common image file extensions
    next unless img_url.match?(/\.(jpg|jpeg|png|gif|webp|svg|tif|avif)(\?.*)?\z/i)
    result["image_url"] = img_url
    break
  end
  result
end

.pre_parser(html, blacklist_selectors: []) ⇒ String

Pre-parser to clean up HTML before passing it to Readability

DEFAULT_SELECTOR_BLACKLIST and the given blacklist_selectors contain CSS selectors of elements that are removed from the HTML before parsing, to improve content extraction.

Parameters:

  • html (String)

    The HTML document as a string.

Returns:

  • (String)

    The cleaned HTML document as a string.



65
66
67
68
69
70
71
72
73
74
# File 'lib/readability_js/extended.rb', line 65

# Pre-parser that strips unwanted elements from the HTML before it is passed
# to Readability. Every element matching a selector from
# DEFAULT_SELECTOR_BLACKLIST, or from blacklist_selectors when that is a
# non-empty Array, is removed.
#
# @param html [String] the HTML document as a string
# @param blacklist_selectors [Array] additional CSS selectors to remove
# @return [String] the cleaned HTML document as a string
def self.pre_parser(html, blacklist_selectors: [])
  selectors = DEFAULT_SELECTOR_BLACKLIST
  if blacklist_selectors.is_a?(Array) && !blacklist_selectors.empty?
    # Array#+ builds a new array; the default list itself stays untouched.
    selectors = selectors + blacklist_selectors
  end

  document = Nokogiri::HTML(html)
  # Remove blacklisted elements by selector
  selectors.each { |selector| document.css(selector.to_s).remove }
  document.to_html
end