Module: ReadabilityJs
- Defined in:
- lib/readability_js.rb,
lib/custom_errors/error.rb,
lib/readability_js/nodo.rb,
lib/readability_js/version.rb,
lib/readability_js/extended.rb
Overview
ReadabilityJs
Defined Under Namespace
Classes: Error, Extended, Nodo
Constant Summary collapse
- VERSION =
'0.0.3'.freeze
Class Method Summary collapse
-
.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil) ⇒ Boolean
Decides whether a document is probably readerable without parsing the whole document.
-
.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0) ⇒ Hash
Parse an HTML document and extract its main content using Mozilla’s Readability library.
-
.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0) ⇒ Hash
Like .parse, but with additional pre- and post-processing to enhance content extraction.
-
.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil) ⇒ Boolean
Decides whether a document is probably readerable without parsing the whole document.
Class Method Details
.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil) ⇒ Boolean
Decides whether a document is probably readerable without parsing the whole document.
Only ‘html’ is a required parameter, all others are optional.
html = "<html>…</html>"
visibility_checker = <<~JS
  (node) => {
    const style = node.ownerDocument.defaultView.getComputedStyle(node);
    return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
  }
JS
ReadabilityJs.is_probably_readerable(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
102 103 104 105 106 107 108 |
# File 'lib/readability_js.rb', line 102
def self.is_probably_readerable(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
  begin
    ReadabilityJs::Nodo.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
  rescue => e
    raise ReadabilityJs::Error.new e.message
  end
end
.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0) ⇒ Hash
Parse an HTML document and extract its main content using Mozilla’s Readability library.
‘html’ is the only required parameter; all others are optional.
40 41 42 43 44 45 46 47 |
# File 'lib/readability_js.rb', line 40
def self.parse(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
  begin
    result = ReadabilityJs::Nodo.parse(html, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier)
    normalize_result(result)
  rescue => e
    raise ReadabilityJs::Error.new e.message
  end
end
.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0) ⇒ Hash
Like .parse, but with additional pre- and post-processing to enhance content extraction.
‘html’ is the only required parameter; all others are optional.
70 71 72 73 74 |
# File 'lib/readability_js.rb', line 70
def self.parse_extended(html, url: nil, debug: false, max_elems_to_parse: 0, nb_top_candidates: 5, char_threshold: 500, classes_to_preserve: [], keep_classes: false, disable_json_ld: false, serializer: nil, allow_video_regex: nil, link_density_modifier: 0)
  result = Extended::before_cleanup html
  result = parse result, url: url, debug: debug, max_elems_to_parse: max_elems_to_parse, nb_top_candidates: nb_top_candidates, char_threshold: char_threshold, classes_to_preserve: classes_to_preserve, keep_classes: keep_classes, disable_json_ld: disable_json_ld, serializer: serializer, allow_video_regex: allow_video_regex, link_density_modifier: link_density_modifier
  Extended::after_cleanup result, html
end
.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil) ⇒ Boolean
Decides whether a document is probably readerable without parsing the whole document.
Only ‘html’ is a required parameter, all others are optional.
html = "<html>…</html>"
visibility_checker = <<~JS
  (node) => {
    const style = node.ownerDocument.defaultView.getComputedStyle(node);
    return (style && style.display !== 'none' && style.visibility !== 'hidden' && parseFloat(style.opacity) > 0);
  }
JS
ReadabilityJs.probably_readerable?(html, min_content_length: 200, min_score: 25, visibility_checker: visibility_checker)
137 138 139 |
# File 'lib/readability_js.rb', line 137
def self.probably_readerable?(html, min_content_length: 140, min_score: 20, visibility_checker: nil)
  self.is_probably_readerable(html, min_content_length: min_content_length, min_score: min_score, visibility_checker: visibility_checker)
end