Module: Html2rss

Defined in:
lib/html2rss.rb,
lib/html2rss/cli.rb,
lib/html2rss/item.rb,
lib/html2rss/utils.rb,
lib/html2rss/config.rb,
lib/html2rss/version.rb,
lib/html2rss/auto_source.rb,
lib/html2rss/rss_builder.rb,
lib/html2rss/config/channel.rb,
lib/html2rss/item_extractors.rb,
lib/html2rss/request_service.rb,
lib/html2rss/config/selectors.rb,
lib/html2rss/rss_builder/item.rb,
lib/html2rss/auto_source/article.rb,
lib/html2rss/auto_source/channel.rb,
lib/html2rss/auto_source/cleanup.rb,
lib/html2rss/auto_source/reducer.rb,
lib/html2rss/auto_source/scraper.rb,
lib/html2rss/rss_builder/channel.rb,
lib/html2rss/item_extractors/href.rb,
lib/html2rss/item_extractors/html.rb,
lib/html2rss/item_extractors/text.rb,
lib/html2rss/item_extractors/static.rb,
lib/html2rss/rss_builder/stylesheet.rb,
lib/html2rss/auto_source/rss_builder.rb,
lib/html2rss/object_to_xml_converter.rb,
lib/html2rss/request_service/context.rb,
lib/html2rss/auto_source/scraper/html.rb,
lib/html2rss/request_service/response.rb,
lib/html2rss/request_service/strategy.rb,
lib/html2rss/attribute_post_processors.rb,
lib/html2rss/item_extractors/attribute.rb,
lib/html2rss/auto_source/scraper/schema.rb,
lib/html2rss/attribute_post_processors/base.rb,
lib/html2rss/attribute_post_processors/gsub.rb,
lib/html2rss/auto_source/scraper/schema/thing.rb,
lib/html2rss/request_service/faraday_strategy.rb,
lib/html2rss/request_service/puppet_commander.rb,
lib/html2rss/auto_source/scraper/semantic_html.rb,
lib/html2rss/attribute_post_processors/template.rb,
lib/html2rss/attribute_post_processors/parse_uri.rb,
lib/html2rss/attribute_post_processors/substring.rb,
lib/html2rss/attribute_post_processors/parse_time.rb,
lib/html2rss/auto_source/scraper/schema/item_list.rb,
lib/html2rss/auto_source/scraper/schema/list_item.rb,
lib/html2rss/request_service/browserless_strategy.rb,
lib/html2rss/attribute_post_processors/sanitize_html.rb,
lib/html2rss/auto_source/scraper/semantic_html/image.rb,
lib/html2rss/attribute_post_processors/html_to_markdown.rb,
lib/html2rss/attribute_post_processors/markdown_to_html.rb,
lib/html2rss/auto_source/scraper/semantic_html/extractor.rb,
lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb,
lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb

Overview

The Html2rss namespace.

Defined Under Namespace

Modules: AttributePostProcessors, ItemExtractors, RssBuilder, Utils

Classes: AutoSource, CLI, Config, Error, Item, ObjectToXmlConverter, RequestService

Constant Summary collapse

Log =

The logger instance.

Logger.new($stderr)
CONFIG_KEY_FEEDS =

Key for the feeds configuration in the YAML file.

:feeds
VERSION =
'0.17.0'

Class Method Summary collapse

Class Method Details

.auto_source(url, strategy: :faraday) ⇒ RSS::Rss

Scrapes the provided URL and returns an RSS object. No need for a “feed config”.

Parameters:

  • url (String)

    the URL to automatically source the feed from

  • strategy (Symbol) (defaults to: :faraday)

    the request strategy to use

Returns:

  • (RSS::Rss)

104
105
106
107
108
109
# File 'lib/html2rss.rb', line 104

# Fetches +url+ through the request service and builds a feed from the response
# via AutoSource, without requiring a feed config.
#
# @param url [String] the URL to automatically source the feed from
# @param strategy [Symbol] the request strategy handed to RequestService.execute
# @return [RSS::Rss] whatever AutoSource#build returns for the fetched page
def self.auto_source(url, strategy: :faraday)
  request_context = RequestService::Context.new(url:, headers: {})

  RequestService.execute(request_context, strategy:).then do |fetched|
    # The URL is read back from the context rather than taken from the argument.
    Html2rss::AutoSource.new(request_context.url, body: fetched.body, headers: fetched.headers).build
  end
end

.feed(config) ⇒ RSS::Rss

Returns an RSS object generated from the provided configuration.

Example:

feed = Html2rss.feed(
  channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
  selectors: {
    items: { selector: '#hot-network-questions > ul > li' },
    title: { selector: 'a' },
    link: { selector: 'a', extractor: 'href' }
  }
)
# => #<RSS::Rss:0x00007fb2f48d14a0 ...>

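Parameters:

  • config (Hash, Html2rss::Config)

    the feed configuration; a Hash is wrapped in an Html2rss::Config before the feed is built.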

Returns:

  • (RSS::Rss)

    RSS object generated from the configuration.


72
73
74
75
# File 'lib/html2rss.rb', line 72

# Builds a feed from +config+.
#
# @param config [Hash, Config] a Config instance is used as-is; anything else
#   is passed to Config.new first
# @return [RSS::Rss] the result of RssBuilder.build
def self.feed(config)
  feed_config = config.is_a?(Config) ? config : Config.new(config)

  RssBuilder.build(feed_config)
end

.feed_from_yaml_config(file, name = nil, global_config: {}, params: {}) ⇒ RSS::Rss

Returns an RSS object generated from the provided YAML file configuration.

Example:

feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
# => #<RSS::Rss:0x00007fb2f6331228 ...>

Parameters:

  • file (String)

    Path to the YAML file.

  • name (String, Symbol, nil) (defaults to: nil)

    Name of the feed in the YAML file.

  • global_config (Hash) (defaults to: {})

    Global options (e.g., HTTP headers).

  • params (Hash) (defaults to: {})

    Dynamic parameters for the feed configuration.

Returns:

  • (RSS::Rss)

    RSS object generated from the configuration.


46
47
48
49
50
51
52
53
# File 'lib/html2rss.rb', line 46

# Loads +file+ as YAML (keys symbolized), picks the feed configuration via
# find_feed_config, and builds the feed from it.
#
# @param file [String] path to the YAML file
# @param name [String, Symbol, nil] name of the feed in the YAML file
# @param global_config [Hash] global options, forwarded to find_feed_config and Config.new
# @param params [Hash] dynamic parameters, forwarded to Config.new
# @return [RSS::Rss] the result of .feed
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
  parsed = YAML.safe_load_file(file, symbolize_names: true)

  # Falls back to an empty Hash when the document has no feeds section.
  selected = find_feed_config(parsed, parsed[CONFIG_KEY_FEEDS] || {}, name, global_config)

  feed(Config.new(selected, global_config, params))
end