Module: NHKore::CLI::NewsCmd

Included in:
App
Defined in:
lib/nhkore/cli/news_cmd.rb

Overview

Author:

  • Jonathan Bradley Whited

Since:

  • 0.2.0

Constant Summary

DEFAULT_NEWS_SCRAPE = 1

Since:

  • 0.2.0

Instance Method Summary

  • #build_news_cmd ⇒ Object
  • #run_news_cmd(type) ⇒ Object
  • #scrape_news_article(url, link:, new_articles:, news:) ⇒ Object
  • #scraped_news_article?(news, link) ⇒ Boolean

Instance Method Details

#build_news_cmd ⇒ Object

Since:

  • 0.2.0



# File 'lib/nhkore/cli/news_cmd.rb', line 31

def build_news_cmd
  app = self

  @news_cmd = @app_cmd.define_command do
    name    'news'
    usage   'news [OPTIONS] [COMMAND]...'
    aliases :n
    summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"

    description <<-DESC
      Scrape NHK News Web (Easy) articles &
      save to folder: #{News::DEFAULT_DIR}
    DESC

    option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
      date time to use as a fallback in cases when an article doesn't have one;
      format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
    DESC
      value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
      value = Util.jst_time(value)
      value
    }
    option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
      HTML file of article to read instead of URL (for offline testing and/or slow internet;
      see '--no-dict' option)
    DESC
      app.check_empty_opt(:in,value)
    }
    flag :L,:lenient,<<-DESC
      leniently (not strict) scrape articles:
      body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
      example URLs that need this flag:
      -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
      -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
    DESC
    option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
      text to fuzzy search links for; for example, "--like '00123'" will only scrape links containing
      text '00123' -- like '*00123*'
    DESC
      value = Util.strip_web_str(value).downcase
      value
    }
    option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' of article links to scrape (see '#{App::NAME} search';
      defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
    DESC
      app.check_empty_opt(:links,value)
    }
    flag :M,:missingno,<<-DESC
      very rarely an article will not have kana or kanji for a Ruby tag;
      to not raise an error, this will use previously scraped data to fill it in;
      example URL:
      -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    DESC
    flag :D,:'no-dict',<<-DESC
      do not try to parse the dictionary files for the articles; useful in case of errors trying to load
      the dictionaries (or for offline testing)
    DESC
    flag :H,'no-sha256',<<-DESC
      do not check the SHA-256 of the content to see if an article has already been scraped;
      for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https', will both be scraped;
      this is useful if 2 articles have the same SHA-256, but different content (unlikely)
    DESC
    option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' to save words to; if you only specify a directory or a file, it will attach
      the appropriate default directory/file name
      (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
    DESC
      app.check_empty_opt(:out,value)
    }
    flag :r,:redo,'scrape article links even if they have already been scraped'
    option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
        default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
          value = value.to_i
          value = 1 if value < 1
          value
        }
    option nil,:'show-dict',<<-DESC
      show dictionary URL and contents for the first article and exit;
      useful for debugging dictionary errors (see '--no-dict' option);
      implies '--dry-run' option
    DESC
    option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
      URL of article to scrape, instead of article links file (see '--links' option)
    DESC
      app.check_empty_opt(:url,value)
    }

    run do |opts,args,cmd|
      puts cmd.help
    end
  end

  @news_easy_cmd = @news_cmd.define_command do
    name    'easy'
    usage   'easy [OPTIONS] [COMMAND]...'
    aliases :e,:ez
    summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"

    description <<-DESC
      Search for NHK News Web Easy (Yasashii) links &
      save to file: #{YasashiiNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:yasashii)
    end
  end

  @news_regular_cmd = @news_cmd.define_command do
    name    'regular'
    usage   'regular [OPTIONS] [COMMAND]...'
    aliases :r,:reg
    summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"

    description <<-DESC
      Search for NHK News Web Regular (Futsuu) links &
      save to file: #{FutsuuNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:futsuu)
    end
  end
end
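
A minimal sketch of the '--datetime' parsing step defined above, using only the
Ruby stdlib (DatetimeParser.guess_year and Util.jst_time are NHKore internals
and are not reproduced here; the date string is just an example value):

require 'time'

# Parse the fallback value in the documented 'YYYY-mm-dd H:M' format,
# e.g. '2020-03-30 15:30'.
value = Time.strptime('2020-03-30 15:30','%Y-%m-%d %H:%M')

puts value  # Local-zone Time; NHKore then shifts it to JST via Util.jst_time().

On the command line, this option would be passed as something like
`nhkore news easy -d '2020-03-30 15:30'` (assuming the gem's `nhkore` executable).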

#run_news_cmd(type) ⇒ Object

Since:

  • 0.2.0



# File 'lib/nhkore/cli/news_cmd.rb', line 159

def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]
  news_name = nil

  build_in_file(:in)

  case type
  when :futsuu
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)

    news_name = 'Regular'
  when :yasashii
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)

    news_name = 'Easy'
  else
    raise ArgumentError,"invalid type[#{type}]"
  end

  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)

  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  lenient = @cmd_opts[:lenient]
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
  max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]

  # Favor in_file option over url option.
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
  url = nil if url.empty?

  if url.nil?
    # Then we must have a links file that exists.
    return unless check_in_file(:links,empty_ok: false)
  end

  start_spin("Scraping NHK News Web #{news_name} articles")

  is_file = !in_file.nil?
  link_count = -1
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0

  if File.exist?(out_file)
    news = (type == :yasashii) ?
      YasashiiNews.load_file(out_file,overwrite: no_sha256) :
      FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
    news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end

  @news_article_scraper_kargs = @scraper_kargs.merge({
    datetime: datetime,
    dict: dict,
    is_file: is_file,
    missingno: missingno ? Missingno.new(news) : nil,
    strict: !lenient,
  })
  @news_dict_scraper_kargs = @scraper_kargs.merge({
    is_file: is_file,
  })

  if url.nil?
    # Why store each() and do `links_len` instead of `links_len - 1`?
    #
    # If links contains 5 entries and you scrape all 5, then the output of
    # update_spin_detail() will end on 4, so all of this complexity is so
    # that update_spin_detail() only needs to be written/updated on one line.

    links_each = links.links.values.each
    links_len = links.length

    0.upto(links_len) do |i|
      update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")

      break if i >= links_len || scrape_count >= max_scrapes

      link = links_each.next

      next if !like.nil? && !link.url.to_s.downcase.include?(like)
      next if !redo_scrapes && scraped_news_article?(news,link)

      url = link.url

      if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
        # --show-dict
        url = new_url
        scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
      end

      # Break on next iteration for update_spin_detail().
      next if (scrape_count += 1) >= max_scrapes

      sleep_scraper
    end
  else
    link = links[url]

    if link.nil?
      link = SearchLink.new(url)
      links.add_link(link)
    end

    scrape_news_article(url,link: link,new_articles: new_articles,news: news)

    scrape_count += 1
  end

  stop_spin
  puts

  if scrape_count <= 0
    puts 'Nothing scraped!'

    if !dry_run && !show_dict
      puts
      start_spin('Saving updated links to file')

      links.save_file(links_file)

      stop_spin
      puts "> #{links_file}"
    end
  else
    puts 'Last URL scraped:'
    puts "> #{url}"
    puts

    if show_dict
      puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
    elsif dry_run
      if new_articles.length < 1
        raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
          ' internal code is broken'
      elsif new_articles.length == 1
        puts new_articles.first
      else
        # Don't show the words (mini), too verbose for more than 1.
        new_articles.each do |article|
          puts article.to_s(mini: true)
        end
      end
    else
      start_spin('Saving scraped data to files')

      links.save_file(links_file)
      news.save_file(out_file)

      stop_spin
      puts "> #{out_file}"
      puts "> #{links_file}"
    end
  end
end
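
The 0.upto(links_len) loop above deliberately runs one iteration past the last
link so that update_spin_detail() prints the final counts before breaking. A
standalone sketch of that pattern, with plain arrays and puts standing in for
the gem's links and spinner (all names here are illustrative only):

links = %w[url1 url2 url3 url4 url5]
max_scrapes = 3

links_each = links.each  # Enumerator, advanced manually with #next.
links_len = links.length
scrape_count = 0
link_count = -1

0.upto(links_len) do |i|
  # Print first, so the counts from the final pass are still shown.
  puts "scraped=#{scrape_count}, considered=#{link_count += 1}"

  break if i >= links_len || scrape_count >= max_scrapes

  link = links_each.next
  puts "scraping #{link}"
  scrape_count += 1
end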

#scrape_news_article(url, link:, new_articles:, news:) ⇒ Object

Since:

  • 0.2.0



# File 'lib/nhkore/cli/news_cmd.rb', line 330

def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]

  if show_dict
    scraper = DictScraper.new(url,**@news_dict_scraper_kargs)

    @cmd_opts[:show_dict] = scraper.scrape.to_s

    return scraper.url
  end

  scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
  article = scraper.scrape

  # run_news_cmd() handles overwriting with --redo or not
  #   using scraped_news_article?().
  news.add_article(article,overwrite: true)

  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)

  new_articles << article

  return false # No --show-dict
end

#scraped_news_article?(news, link) ⇒ Boolean

Returns:

  • (Boolean)

Since:

  • 0.2.0



# File 'lib/nhkore/cli/news_cmd.rb', line 356

def scraped_news_article?(news,link)
  return true if link.scraped?

  no_sha256 = @cmd_opts[:no_sha256]

  article = news.article(link.url)

  if !no_sha256 && article.nil?
    if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
      article = news.article_with_sha256(link.sha256)
    end

    if article.nil?
      scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)

      sha256 = scraper.scrape_sha256_only

      article = news.article_with_sha256(sha256) if news.sha256?(sha256)
    end
  end

  if article
    news.update_article(article,link.url) # Favors https
    link.update_from_article(article)

    return true
  end

  return false
end
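
The SHA-256 fallback above exists so that the same article served from two URLs
(e.g. one http and one https) is only stored once. A minimal sketch of that idea
using the Ruby stdlib (NHKore's real check hashes cleaned article content via
ArticleScraper#scrape_sha256_only, which is more involved; the strings below are
placeholders):

require 'digest'

http_body  = '<p>同じ記事です。</p>'
https_body = '<p>同じ記事です。</p>'  # Same content fetched from a different URL.

# Identical content produces identical digests, so the second URL can be
# skipped -- unless '--no-sha256' (-H) is given.
puts Digest::SHA256.hexdigest(http_body) == Digest::SHA256.hexdigest(https_body)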