Module: NHKore::CLI::NewsCmd

Included in:
App
Defined in:
lib/nhkore/cli/news_cmd.rb

Constant Summary

DEFAULT_NEWS_SCRAPE = 1

Instance Method Summary

#build_news_cmd ⇒ Object
#run_news_cmd(type) ⇒ Object
#scrape_news_article(url, link:, new_articles:, news:) ⇒ Object
#scraped_news_article?(news, link) ⇒ Boolean

Instance Method Details

#build_news_cmd ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 27

def build_news_cmd
  app = self

  @news_cmd = @app_cmd.define_command do
    name    'news'
    usage   'news [OPTIONS] [COMMAND]...'
    aliases :n
    summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"

    description <<-DESC
      Scrape NHK News Web (Easy) articles &
      save to folder: #{News::DEFAULT_DIR}
    DESC

    option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
      date time to use as a fallback in cases when an article doesn't have one;
      format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
    DESC
      value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
      value = Util.jst_time(value)
      value
    }
    option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
      HTML file of article to read instead of URL (for offline testing and/or slow internet;
      see '--no-dict' option)
    DESC
      app.check_empty_opt(:in,value)
    }
    flag :L,:lenient,<<-DESC
      leniently (not strict) scrape articles:
      body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
      example URLs that need this flag:
      -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
      -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
    DESC
    option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
      text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
      text '00123' -- like '*00123*'
    DESC
      value = Util.strip_web_str(value).downcase
      value
    }
    option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' of article links to scrape (see '#{App::NAME} search';
      defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
    DESC
      app.check_empty_opt(:links,value)
    }
    flag :M,:missingno,<<-DESC
      very rarely an article will not have kana or kanji for a Ruby tag;
      to not raise an error, this will use previously scraped data to fill it in;
      example URL:
      -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    DESC
    flag :D,:'no-dict',<<-DESC
      do not try to parse the dictionary files for the articles; useful in case of errors trying to load
      the dictionaries (or for offline testing)
    DESC
    flag :H,:'no-sha256',<<-DESC
      do not check the SHA-256 of the content to see if an article has already been scraped;
      for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
      this is useful if 2 articles have the same SHA-256, but different content (unlikely)
    DESC
    option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' to save words to; if you only specify a directory or a file, it will attach
      the appropriate default directory/file name
      (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
    DESC
      app.check_empty_opt(:out,value)
    }
    flag :r,:redo,'scrape article links even if they have already been scraped'
    option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
        default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
          value = value.to_i
          value = 1 if value < 1
          value
        }
    option nil,:'show-dict',<<-DESC
      show dictionary URL and contents for the first article and exit;
      useful for debugging dictionary errors (see '--no-dict' option);
      implies '--dry-run' option
    DESC
    option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
      URL of article to scrape, instead of article links file (see '--links' option)
    DESC
      app.check_empty_opt(:url,value)
    }

    run do |opts,args,cmd|
      puts cmd.help
    end
  end

  @news_easy_cmd = @news_cmd.define_command do
    name    'easy'
    usage   'easy [OPTIONS] [COMMAND]...'
    aliases :e,:ez
    summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"

    description <<-DESC
      Scrape NHK News Web Easy (Yasashii) articles &
      save to file: #{YasashiiNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:yasashii)
    end
  end

  @news_regular_cmd = @news_cmd.define_command do
    name    'regular'
    usage   'regular [OPTIONS] [COMMAND]...'
    aliases :r,:reg
    summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"

    description <<-DESC
      Scrape NHK News Web Regular (Futsuu) articles &
      save to file: #{FutsuuNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:futsuu)
    end
  end
end
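
The '--datetime' option above takes a fallback timestamp in 'YYYY-mm-dd H:M' form and converts it to JST via the gem's own helpers (DatetimeParser.guess_year, Util.jst_time). A minimal sketch of just the parsing step, using only the standard library for illustration:

require 'time'

# Parse the same format the --datetime option documents.
value = '2020-03-30 15:30'
fallback = Time.strptime(value, '%Y-%m-%d %H:%M')

puts fallback.strftime('%Y-%m-%d %H:%M %Z')
# => "2020-03-30 15:30 ..." (local zone; the real transform shifts this to JST)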

#run_news_cmd(type) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 155

def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil

  build_in_file(:in)

  case type
  when :futsuu
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)

    news_name = 'Regular'
  when :yasashii
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)

    news_name = 'Easy'
  else
    raise ArgumentError,"invalid type[#{type}]"
  end

  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)

  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  lenient = @cmd_opts[:lenient]
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
  max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]

  # Favor in_file option over url option.
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
  url = nil if url.empty?

  if url.nil?
    # Then we must have a links file that exists.
    return unless check_in_file(:links,empty_ok: false)
  end

  start_spin("Scraping NHK News Web #{news_name} articles")

  is_file = !in_file.nil?
  link_count = -1
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0

  if File.exist?(out_file)
    news = (type == :yasashii) ?
      YasashiiNews.load_file(out_file,overwrite: no_sha256) :
      FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
    news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end

  @news_article_scraper_kargs = @scraper_kargs.merge({
    datetime: datetime,
    dict: dict,
    is_file: is_file,
    missingno: missingno ? Missingno.new(news) : nil,
    strict: !lenient,
  })
  @news_dict_scraper_kargs = @scraper_kargs.merge({
    is_file: is_file,
  })

  if url.nil?
    # Why store each() and iterate up to `links_len` instead of `links_len - 1`?
    #
    # If links contains 5 entries and you scrape all 5, iterating only to
    # `links_len - 1` would leave update_spin_detail() ending on 4. Looping one
    # extra time (and breaking) lets the final count read 5 while keeping
    # update_spin_detail() on a single line.

    links_each = links.links.values.each
    links_len = links.length

    0.upto(links_len) do |i|
      update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")

      break if i >= links_len || scrape_count >= max_scrapes

      link = links_each.next

      next if !like.nil? && !link.url.to_s.downcase.include?(like)
      next if !redo_scrapes && scraped_news_article?(news,link)

      url = link.url
      result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)

      if result == :scraped
        scrape_count += 1
      elsif result == :unscraped
        next
      else
        # --show-dict
        url = result
        scrape_count = max_scrapes # Break on next iteration for update_spin_detail().
      end

      # Break on next iteration for update_spin_detail().
      next if scrape_count >= max_scrapes
      sleep_scraper
    end
  else
    link = links[url]

    if link.nil?
      link = SearchLink.new(url)
      links.add_link(link)
    end

    result = scrape_news_article(url,link: link,new_articles: new_articles,news: news)
    scrape_count += 1 if result != :unscraped
  end

  stop_spin
  puts

  if scrape_count <= 0
    puts 'Nothing scraped!'

    if !dry_run && !show_dict
      puts
      start_spin('Saving updated links to file')

      links.save_file(links_file)

      stop_spin
      puts "> #{links_file}"
    end
  else
    puts 'Last URL scraped:'
    puts "> #{url}"
    puts

    if show_dict
      puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
    elsif dry_run
      if new_articles.length < 1
        raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
          ' internal code is broken'
      elsif new_articles.length == 1
        puts new_articles.first
      else
        # Don't show the words (mini), too verbose for more than 1.
        new_articles.each do |article|
          puts article.to_s(mini: true)
        end
      end
    else
      start_spin('Saving scraped data to files')

      links.save_file(links_file)
      news.save_file(out_file)

      stop_spin
      puts "> #{out_file}"
      puts "> #{links_file}"
    end
  end
end
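
The link loop above iterates 0.upto(links_len), one past the last index, with an external enumerator so the spinner detail is refreshed one final time after the last link. A self-contained sketch of that pattern, with puts standing in for update_spin_detail():

items = %w[a b c]
each_item = items.each
count = -1

0.upto(items.length) do |i|
  puts "considered=#{count += 1}"  # stand-in for update_spin_detail()
  break if i >= items.length
  each_item.next                   # process the next link here
end
# Prints considered=0 through considered=3, so the status line ends on the full total.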

#scrape_news_article(url, link:, new_articles:, news:) ⇒ Object



# File 'lib/nhkore/cli/news_cmd.rb', line 329

def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]

  if show_dict
    scraper = DictScraper.new(url,**@news_dict_scraper_kargs)

    @cmd_opts[:show_dict] = scraper.scrape.to_s

    return scraper.url
  end

  scraper = nil

  begin
    scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
  rescue Http404Error
    # - https://www3.nhk.or.jp/news/easy/k10014157491000/k10014157491000.html
    Util.warn("Ignoring URL with 404 error: #{url}.")
    return :unscraped
  end

  article = scraper.scrape
  # run_news_cmd() handles overwriting with --redo or not
  #   using scraped_news_article?().
  news.add_article(article,overwrite: true)

  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)

  new_articles << article

  return :scraped # No --show-dict
end
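
Http404Error is the gem's own exception class; the pattern is simply to warn and return a sentinel so the caller skips the link instead of aborting the whole run. A rough equivalent using only open-uri (the helper name fetch_or_skip is hypothetical):

require 'open-uri'

def fetch_or_skip(url)
  URI.open(url).read
rescue OpenURI::HTTPError => e
  warn "Ignoring URL with HTTP error (#{e.message}): #{url}."
  :unscraped  # same sentinel the scraper methods above return
end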

#scraped_news_article?(news, link) ⇒ Boolean

Returns:

  • (Boolean)


# File 'lib/nhkore/cli/news_cmd.rb', line 363

def scraped_news_article?(news,link)
  return true if link.scraped?

  no_sha256 = @cmd_opts[:no_sha256]

  article = news.article(link.url)

  if !no_sha256 && article.nil?
    if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
      article = news.article_with_sha256(link.sha256)
    end

    if article.nil?
      scraper = nil

      begin
        scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
      rescue Http404Error
        return false
      end

      sha256 = scraper.scrape_sha256_only
      article = news.article_with_sha256(sha256) if news.sha256?(sha256)
    end
  end

  if article
    news.update_article(article,link.url) # Favors https
    link.update_from_article(article)

    return true
  end

  return false
end
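
The SHA-256 check above exists so that the same article body reached through two different URLs (for example, an http and an https variant) is stored only once. A minimal sketch of that dedup idea with the standard library, where the seen hash stands in for the news data's SHA-256 index:

require 'digest'

seen = {}  # sha256 => url of the article already stored

def already_scraped?(seen, url, body)
  sha256 = Digest::SHA256.hexdigest(body)
  return true if seen.key?(sha256)

  seen[sha256] = url
  false
end

already_scraped?(seen, 'http://example.com/a', 'same body')   #=> false
already_scraped?(seen, 'https://example.com/a', 'same body')  #=> true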