Class: TitleGrabber::MultiThreadedGrabber

Inherits:
Object
Includes:
HTTPHelper, TextHelper
Defined in:
lib/title_grabber.rb

Constant Summary

Constants included from TextHelper

TextHelper::SINGLE_SPACE

Constants included from HTTPHelper

HTTPHelper::CONNECTION_ERRORS, HTTPHelper::HTTP_REDIR, HTTPHelper::INVALID_BYTE_SEQ, HTTPHelper::REDIR_FORBIDDEN, HTTPHelper::REST_INTERVAL, HTTPHelper::SRV_UNAVAILABLE

Instance Attribute Summary

Instance Method Summary

Methods included from TextHelper

#clean_up_whitespace, #utf8_encode

Methods included from HTTPHelper

#open_w_timeout, #read_w_timeout

Constructor Details

#initialize(options) ⇒ MultiThreadedGrabber

Returns a new instance of MultiThreadedGrabber.



# File 'lib/title_grabber.rb', line 45

def initialize(options)
  @file_paths = options[:file_paths]

  @out_path = options.fetch(:output, DEF_OUT_PATH)
  @tmp_path = @out_path.sub_ext(".tmp#{@out_path.extname}")

  @connect_to = options.fetch(:connect_to, CONNECT_TO)
  @read_to = options.fetch(:read_to, READ_TO)
  @max_retries = options.fetch(:max_retries, MAX_RETRIES)
  @max_threads = options.fetch(:max_threads, Etc.nprocessors)

  logging_target = if options[:debug]
                     STDOUT
                   else
                     log_file = Pathname(__FILE__).sub_ext(".log").
                                                   basename.
                                                   open("w")
                   end
  @logger = Logger.new(logging_target)
end
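
A minimal construction sketch follows. It assumes the library is loaded from lib/title_grabber.rb (the require name below is an assumption) and uses illustrative file names and values; each key corresponds to an option read by the constructor above, with the fallback constants noted in the comments.

require "pathname"
require "title_grabber"   # assumed entry point for lib/title_grabber.rb

grabber = TitleGrabber::MultiThreadedGrabber.new(
  file_paths:  [Pathname("urls_1.txt"), Pathname("urls_2.txt")], # inputs scanned for URLs
  output:      Pathname("titles.csv"),  # defaults to DEF_OUT_PATH when omitted
  connect_to:  5,                       # connect timeout; defaults to CONNECT_TO
  read_to:     10,                      # read timeout; defaults to READ_TO
  max_retries: 3,                       # defaults to MAX_RETRIES
  max_threads: 4,                       # defaults to Etc.nprocessors
  debug:       true                     # log to STDOUT instead of a generated .log file
)
grabber.call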

Instance Attribute Details

#connect_to ⇒ Object (readonly)

Returns the value of attribute connect_to.



# File 'lib/title_grabber.rb', line 42

def connect_to
  @connect_to
end

#file_paths ⇒ Object (readonly)

Returns the value of attribute file_paths.



# File 'lib/title_grabber.rb', line 42

def file_paths
  @file_paths
end

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



# File 'lib/title_grabber.rb', line 42

def logger
  @logger
end

#max_retries ⇒ Object (readonly)

Returns the value of attribute max_retries.



# File 'lib/title_grabber.rb', line 42

def max_retries
  @max_retries
end

#max_threads ⇒ Object (readonly)

Returns the value of attribute max_threads.



# File 'lib/title_grabber.rb', line 42

def max_threads
  @max_threads
end

#out_path ⇒ Object (readonly)

Returns the value of attribute out_path.



# File 'lib/title_grabber.rb', line 42

def out_path
  @out_path
end

#read_to ⇒ Object (readonly)

Returns the value of attribute read_to.



# File 'lib/title_grabber.rb', line 42

def read_to
  @read_to
end

#tmp_path ⇒ Object (readonly)

Returns the value of attribute tmp_path.



# File 'lib/title_grabber.rb', line 42

def tmp_path
  @tmp_path
end

Instance Method Details

#call ⇒ Object



# File 'lib/title_grabber.rb', line 66

def call
  queue = Queue.new
  CSV.open(tmp_path, "w", force_quotes: true) do |csv|
    csv << HEADERS

    file_paths.each do |file_path|
      file_path.each_line do |line|
        md = line.match(URL_RE)
        next unless md

        url = md.to_s
        if h = processed_urls[url]
          csv << [url, h[END_URL_HEAD], h[PAGE_TIT_HEAD], h[ART_TIT_HEAD]]
          next
        end

        queue << url
      end
    end

    thr_cnt = [max_threads, queue.size].min
    1.upto(thr_cnt).map.with_index { |_, i|
      Thread.new(i) do |j|
        Thread.current.name = "Thread ##{i + 1}"

        url = begin
                queue.pop(true)
              rescue ThreadError; end

        while url
          end_url, html = read_w_timeout(url, **http_opts)

          if end_url && html && !html.empty?
            doc = begin
                    Oga.parse_html(html)
                  rescue ArgumentError, LL::ParserError => err
                    logger.error "[#{Thread.current.name}] Unable to parse HTML from URL '#{url}' - #{err.message}"
                    nil
                  end

            if doc
              if e_url = parse_end_url_from(doc)
                end_url = e_url
              end

              page_title = doc.at_css('title')&.text || -""
              clean_up_whitespace(page_title) unless page_title.empty?

              article_title = nil
              ART_TIT_SEL.each do |selector|
                article_title = doc.at_css(selector)&.text
                break if article_title && !article_title.empty?
              end
              article_title ||= -""
              clean_up_whitespace(article_title) unless article_title.empty?

              csv << [url, end_url, page_title, article_title]
            end
          end

          url = begin
                  queue.pop(true)
                rescue ThreadError; end
        end
      end
    }.each(&:join)
  end
ensure
  if tmp_path.size?
    FileUtils.mv(tmp_path, out_path)
  else
    tmp_path.unlink if tmp_path.exist?
  end
end
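
When #call finishes, the temporary CSV is moved to out_path, or removed if nothing was written. The listing shows a Queue-based worker pool: URLs are enqueued up front, at most max_threads threads are started, and each worker pops URLs with the non-blocking Queue#pop(true), treating ThreadError (raised on an empty queue) as the signal to stop. The sketch below illustrates that pattern in isolation; the URL list, the fake per-URL work, and the output file name are placeholders rather than parts of the library, and rows are collected through a second queue instead of sharing the CSV writer across threads (the library writes to the shared CSV object directly).

require "csv"

urls  = ["https://example.com/a", "https://example.com/b"]  # placeholder input
queue = Queue.new
urls.each { |u| queue << u }

results = Queue.new   # thread-safe collection of finished rows
workers = Array.new([4, queue.size].min) do
  Thread.new do
    loop do
      url = begin
              queue.pop(true)       # non-blocking pop
            rescue ThreadError      # queue drained: this worker is done
              break
            end
      results << [url, url.length]  # placeholder for fetching and title parsing
    end
  end
end
workers.each(&:join)

CSV.open("out.csv", "w", force_quotes: true) do |csv|
  csv << %w[url length]
  csv << results.pop until results.empty?
end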