Class: TitleGrabber::MultiThreadedGrabber

# File 'lib/title_grabber.rb', line 45

# Builds a grabber from an options hash.
#
# @param options [Hash] configuration. Keys read here:
#   * +:file_paths+  - stored unchanged (no default, so it may be nil)
#   * +:output+      - output path (String or Pathname); defaults to DEF_OUT_PATH
#   * +:connect_to+  - defaults to CONNECT_TO
#   * +:read_to+     - defaults to READ_TO
#   * +:max_retries+ - defaults to MAX_RETRIES
#   * +:max_threads+ - defaults to Etc.nprocessors
#   * +:debug+       - when truthy, log to STDOUT instead of a log file
def initialize(options)
  @file_paths = options[:file_paths]

  # Coerce so that a plain String path works as well as a Pathname;
  # Pathname() leaves an existing Pathname unchanged.
  @out_path = Pathname(options.fetch(:output, DEF_OUT_PATH))
  # Same file name with ".tmp" inserted before the extension
  # (e.g. "out.csv" -> "out.tmp.csv").
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")

  @connect_to = options.fetch(:connect_to, CONNECT_TO)
  @read_to = options.fetch(:read_to, READ_TO)
  @max_retries = options.fetch(:max_retries, MAX_RETRIES)
  @max_threads = options.fetch(:max_threads, Etc.nprocessors)

  logging_target = if options[:debug]
                     STDOUT
                   else
                     # Log file named after this source file; #basename drops
                     # the directory part, so it is opened relative to the
                     # current working directory, and "w" truncates it.
                     Pathname(__FILE__).sub_ext(".log").basename.open("w")
                   end
  @logger = Logger.new(logging_target)
end

Instance Attribute Details

#connect_to ⇒ `Object` (readonly)

Returns the value of attribute connect_to.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @connect_to, which #initialize sets from options[:connect_to],
# falling back to CONNECT_TO.
#
# @return [Object] the stored value
#   NOTE(review): presumably a connection timeout - confirm meaning and unit
#   against the HTTP helper that consumes it.
def connect_to
  @connect_to
end

#file_paths ⇒ `Object` (readonly)

Returns the value of attribute file_paths.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @file_paths, which #initialize copies from options[:file_paths]
# without a default.
#
# @return [Object] the stored value; #call iterates it with #each and calls
#   #each_line on every element
def file_paths
  @file_paths
end

#logger ⇒ `Object` (readonly)

Returns the value of attribute logger.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for the Logger created in #initialize. It writes to STDOUT when
# options[:debug] was truthy, otherwise to a ".log" file opened for writing.
#
# @return [Logger]
def logger
  @logger
end

#max_retries ⇒ `Object` (readonly)

Returns the value of attribute max_retries.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @max_retries, which #initialize sets from options[:max_retries],
# falling back to MAX_RETRIES.
#
# @return [Object] the stored value
#   NOTE(review): presumably a retry limit for HTTP requests - confirm against
#   the code that consumes it.
def max_retries
  @max_retries
end

#max_threads ⇒ `Object` (readonly)

Returns the value of attribute max_threads.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @max_threads, which #initialize sets from options[:max_threads],
# falling back to Etc.nprocessors.
#
# @return [Object] the stored value; #call starts at most this many worker
#   threads (fewer when there are fewer queued URLs)
def max_threads
  @max_threads
end

#out_path ⇒ `Object` (readonly)

Returns the value of attribute out_path.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @out_path, which #initialize sets from options[:output], falling
# back to DEF_OUT_PATH.
#
# @return [Object] the stored path; #call moves the temp file here once it
#   finishes with a non-empty temp file
def out_path
  @out_path
end

#read_to ⇒ `Object` (readonly)

Returns the value of attribute read_to.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @read_to, which #initialize sets from options[:read_to], falling
# back to READ_TO.
#
# @return [Object] the stored value
#   NOTE(review): presumably a read timeout - confirm meaning and unit against
#   the HTTP helper that consumes it.
def read_to
  @read_to
end

#tmp_path ⇒ `Object` (readonly)

Returns the value of attribute tmp_path.



42
43
44

# File 'lib/title_grabber.rb', line 42

# Reader for @tmp_path, which #initialize derives from the output path by
# replacing its extension with ".tmp" followed by the original extension.
#
# @return [Object] the temp-file path that #call writes the CSV to before
#   moving it to out_path
def tmp_path
  @tmp_path
end

Instance Method Details

#call ⇒ `Object`

# File 'lib/title_grabber.rb', line 66

# Scans every input file for URLs and writes one CSV row per URL to
# +tmp_path+, then moves the result to +out_path+.
#
# For each line, the first URL_RE match is taken as the URL. URLs found in
# +processed_urls+ are copied straight from that lookup; all others are queued
# and fetched by up to +max_threads+ worker threads.
#
# The +ensure+ clause runs even when an exception propagates: a non-empty temp
# file is moved over +out_path+, an empty one is deleted.
#
# @return [Object] unspecified; the method is called for its side effects
def call
  queue = Queue.new
  # CSV#<< is invoked from several worker threads; serialize those writes so
  # the shared writer is never used concurrently.
  csv_lock = Mutex.new

  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
    csv << HEADERS

    file_paths.each do |file_path|
      file_path.each_line do |line|
        md = line.match(URL_RE)
        next unless md

        url = md.to_s
        if (known = processed_urls[url])
          csv << [url, known[END_URL_HEAD], known[PAGE_TIT_HEAD], known[ART_TIT_HEAD]]
        else
          queue << url
        end
      end
    end

    # Nothing else will be enqueued. Once closed, a blocking #pop returns nil
    # as soon as the queue is drained, which ends each worker's loop without
    # relying on ThreadError for control flow.
    queue.close

    thr_cnt = [max_threads, queue.size].min
    workers = Array.new(thr_cnt) do |i|
      Thread.new do
        Thread.current.name = "Thread ##{i + 1}"

        while (url = queue.pop)
          row = title_row_for(url)
          csv_lock.synchronize { csv << row } if row
        end
      end
    end
    workers.each(&:join)
  end
ensure
  if tmp_path.size?
    FileUtils.mv(tmp_path, out_path)
  else
    tmp_path.unlink if tmp_path.exist?
  end
end

# Downloads +url+, parses the HTML and builds the CSV row for it.
#
# @param url [String] the URL to fetch
# @return [Array, nil] +[url, end_url, page_title, article_title]+, or nil
#   when nothing usable was downloaded or the HTML could not be parsed (the
#   parse failure is logged)
private def title_row_for(url)
  end_url, html = read_w_timeout(url, **http_opts)
  return unless end_url && html && !html.empty?

  doc = begin
          Oga.parse_html(html)
        rescue ArgumentError, LL::ParserError => err
          logger.error "[#{Thread.current.name}] Unable to parse HTML from URL '#{url}' - #{err.message}"
          nil
        end
  return unless doc

  # Prefer an end URL found in the document over the one from the HTTP fetch.
  end_url = parse_end_url_from(doc) || end_url

  # -"" is the shared frozen empty string, so clean-up is skipped when empty.
  page_title = doc.at_css('title')&.text || -""
  clean_up_whitespace(page_title) unless page_title.empty?

  # First selector yielding non-empty text wins; otherwise keep the last
  # result (possibly empty) or fall back to the empty string.
  article_title = nil
  ART_TIT_SEL.each do |selector|
    article_title = doc.at_css(selector)&.text
    break if article_title && !article_title.empty?
  end
  article_title ||= -""
  clean_up_whitespace(article_title) unless article_title.empty?

  [url, end_url, page_title, article_title]
end

Class: TitleGrabber::MultiThreadedGrabber

Constant Summary

Constants included from TextHelper

Constants included from HTTPHelper

Instance Attribute Summary

Instance Method Summary

Methods included from TextHelper

Methods included from HTTPHelper

Constructor Details

#initialize(options) ⇒ MultiThreadedGrabber

Instance Attribute Details

#connect_to ⇒ Object (readonly)

#file_paths ⇒ Object (readonly)

#logger ⇒ Object (readonly)

#max_retries ⇒ Object (readonly)

#max_threads ⇒ Object (readonly)

#out_path ⇒ Object (readonly)

#read_to ⇒ Object (readonly)

#tmp_path ⇒ Object (readonly)