Class: TitleGrabber::MultiThreadedGrabber
- Inherits: Object
- Inheritance chain: Object → TitleGrabber::MultiThreadedGrabber
- Includes:
- HTTPHelper, TextHelper
- Defined in:
- lib/title_grabber.rb
Constant Summary
Constants included from TextHelper
Constants included from HTTPHelper
HTTPHelper::CONNECTION_ERRORS, HTTPHelper::HTTP_REDIR, HTTPHelper::INVALID_BYTE_SEQ, HTTPHelper::REDIR_FORBIDDEN, HTTPHelper::REST_INTERVAL, HTTPHelper::SRV_UNAVAILABLE
Instance Attribute Summary collapse
-
#connect_to ⇒ Object
readonly
Returns the value of attribute connect_to.
-
#file_paths ⇒ Object
readonly
Returns the value of attribute file_paths.
-
#logger ⇒ Object
readonly
Returns the value of attribute logger.
-
#max_retries ⇒ Object
readonly
Returns the value of attribute max_retries.
-
#max_threads ⇒ Object
readonly
Returns the value of attribute max_threads.
-
#out_path ⇒ Object
readonly
Returns the value of attribute out_path.
-
#read_to ⇒ Object
readonly
Returns the value of attribute read_to.
-
#tmp_path ⇒ Object
readonly
Returns the value of attribute tmp_path.
Instance Method Summary collapse
- #call ⇒ Object
-
#initialize(options) ⇒ MultiThreadedGrabber
constructor
A new instance of MultiThreadedGrabber.
Methods included from TextHelper
#clean_up_whitespace, #utf8_encode
Methods included from HTTPHelper
#open_w_timeout, #read_w_timeout
Constructor Details
#initialize(options) ⇒ MultiThreadedGrabber
Returns a new instance of MultiThreadedGrabber.
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/title_grabber.rb', line 45

# Builds a grabber from an options hash.
#
# @param options [Hash] configuration; the keys read here are:
#   * +:file_paths+  - input files, stored as-is (may be nil if absent)
#   * +:output+      - output path, defaults to +DEF_OUT_PATH+
#   * +:connect_to+  - defaults to +CONNECT_TO+
#   * +:read_to+     - defaults to +READ_TO+
#   * +:max_retries+ - defaults to +MAX_RETRIES+
#   * +:max_threads+ - defaults to +Etc.nprocessors+
#   * +:debug+       - when truthy, log to STDOUT instead of a log file
def initialize(options)
  @file_paths = options[:file_paths]
  # Coerce so that a plain String path works too; a Pathname passes through
  # unchanged. Both #sub_ext and #extname below require a Pathname.
  @out_path = Pathname(options.fetch(:output, DEF_OUT_PATH))
  # Same name as the output file with ".tmp" inserted before the extension.
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")
  @connect_to = options.fetch(:connect_to, CONNECT_TO)
  @read_to = options.fetch(:read_to, READ_TO)
  @max_retries = options.fetch(:max_retries, MAX_RETRIES)
  @max_threads = options.fetch(:max_threads, Etc.nprocessors)

  logging_target =
    if options[:debug]
      STDOUT
    else
      # "<this file's basename>.log", opened for writing (truncating) relative
      # to the current working directory because only the basename is kept.
      Pathname(__FILE__).sub_ext(".log").basename.open("w")
    end
  @logger = Logger.new(logging_target)
end
Instance Attribute Details
#connect_to ⇒ Object (readonly)
Returns the value of attribute connect_to.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@connect_to+
def connect_to
  @connect_to
end
#file_paths ⇒ Object (readonly)
Returns the value of attribute file_paths.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@file_paths+
def file_paths
  @file_paths
end
#logger ⇒ Object (readonly)
Returns the value of attribute logger.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@logger+
def logger
  @logger
end
#max_retries ⇒ Object (readonly)
Returns the value of attribute max_retries.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@max_retries+
def max_retries
  @max_retries
end
#max_threads ⇒ Object (readonly)
Returns the value of attribute max_threads.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@max_threads+
def max_threads
  @max_threads
end
#out_path ⇒ Object (readonly)
Returns the value of attribute out_path.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@out_path+
def out_path
  @out_path
end
#read_to ⇒ Object (readonly)
Returns the value of attribute read_to.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@read_to+
def read_to
  @read_to
end
#tmp_path ⇒ Object (readonly)
Returns the value of attribute tmp_path.
42 43 44 |
# File 'lib/title_grabber.rb', line 42

# Read-only accessor.
#
# @return [Object] the value held in +@tmp_path+
def tmp_path
  @tmp_path
end
Instance Method Details
#call ⇒ Object
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
# File 'lib/title_grabber.rb', line 66

# Scans every input file for URLs, fetches the not-yet-processed ones on a
# pool of worker threads and writes one CSV row per URL to +tmp_path+
# (columns: original URL, end URL, page title, article title).
#
# URLs found in +processed_urls+ are copied into the CSV directly from the
# stored record and are not fetched again.
#
# Whether the run succeeds or raises, the +ensure+ clause promotes a non-empty
# +tmp_path+ to +out_path+ and deletes an empty one.
def call
  queue = Queue.new
  # All workers write to the one CSV writer opened below; rows are written
  # under this lock so writes from different threads cannot interleave.
  csv_lock = Mutex.new

  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
    csv << HEADERS

    file_paths.each do |file_path|
      file_path.each_line do |line|
        md = line.match(URL_RE)
        next unless md

        url = md.to_s
        # NOTE(review): processed_urls is defined outside this listing; it is
        # used here as a lookup from URL to a record indexable by the *_HEAD
        # constants - confirm against its definition.
        if (known = processed_urls[url])
          csv << [url, known[END_URL_HEAD], known[PAGE_TIT_HEAD], known[ART_TIT_HEAD]]
          next
        end

        queue << url
      end
    end

    thr_cnt = [max_threads, queue.size].min
    # Nothing is enqueued past this point. On a closed queue, Queue#pop returns
    # nil once the queue is drained instead of blocking, which ends each
    # worker's loop without a non-blocking pop and a rescued ThreadError.
    queue.close

    workers = thr_cnt.times.map do |i|
      Thread.new(i) do |idx|
        Thread.current.name = "Thread ##{idx + 1}"

        while (url = queue.pop)
          end_url, html = read_w_timeout(url, **http_opts)
          next unless end_url && html && !html.empty?

          doc = begin
            Oga.parse_html(html)
          rescue ArgumentError, LL::ParserError => err
            logger.error "[#{Thread.current.name}] Unable to parse HTML from URL '#{url}' - #{err.message}"
            nil
          end
          next unless doc

          # Prefer an end URL found in the document itself, when there is one.
          if (e_url = parse_end_url_from(doc))
            end_url = e_url
          end

          page_title = doc.at_css('title')&.text || -""
          # Return value is ignored, so clean_up_whitespace presumably edits
          # the string in place - TODO confirm in TextHelper.
          clean_up_whitespace(page_title) unless page_title.empty?

          # First selector that yields non-empty text wins.
          article_title = nil
          ART_TIT_SEL.each do |selector|
            article_title = doc.at_css(selector)&.text
            break if article_title && !article_title.empty?
          end
          article_title ||= -""
          clean_up_whitespace(article_title) unless article_title.empty?

          csv_lock.synchronize do
            csv << [url, end_url, page_title, article_title]
          end
        end
      end
    end
    workers.each(&:join)
  end
ensure
  # Pathname#size? is nil for a missing or zero-byte file.
  if tmp_path.size?
    FileUtils.mv(tmp_path, out_path)
  else
    tmp_path.unlink if tmp_path.exist?
  end
end