Class: NHKore::Word

Inherits:
Object
  • Object
show all
Defined in:
lib/nhkore/word.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(defn: nil, eng: nil, freq: 1, kana: nil, kanji: nil, unknown: nil, word: nil, **kargs) ⇒ Word

Returns a new instance of Word.

Raises:

  • (ArgumentError)


27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/nhkore/word.rb', line 27

# Builds a Word from explicit fields, optionally filling gaps from a template
# +word+ and/or from an +unknown+ string that is sorted into kanji or kana.
#
# @param defn    definition; taken from +word.defn+ when nil and +word+ is given
# @param eng     English text; taken from +word.eng+ when nil and +word+ is given
# @param freq    frequency count, must be >= 1; when nil it is taken from
#                +word.freq+ (if +word+ is given) and otherwise falls back to 1
# @param kana    kana reading; taken from +word.kana+ when nil and +word+ is given
# @param kanji   kanji spelling; taken from +word.kanji+ when nil and +word+ is given
# @param unknown text of undetermined script; stored as kanji if
#                Util.kanji?() says it contains kanji, else stored as kana
# @param word    template object responding to defn/eng/freq/kana/kanji
# @param kargs   extra keywords are accepted and ignored
#
# @raise [ArgumentError] if freq < 1, if +unknown+ would overwrite a non-empty
#                        kanji/kana, or if both kanji and kana end up empty
def initialize(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
  super()

  # Anything not given explicitly is copied from the template word.
  if !word.nil?
    defn = word.defn if defn.nil?
    eng = word.eng if eng.nil?
    freq = word.freq if freq.nil?
    kana = word.kana if kana.nil?
    kanji = word.kanji if kanji.nil?
  end

  # An explicit nil freq (with no template value to copy) would otherwise
  # crash on `nil < 1` with a NoMethodError; use the documented default.
  freq = 1 if freq.nil?

  raise ArgumentError,"freq[#{freq}] cannot be < 1" if freq < 1

  if !unknown.nil?
    # kanji?() only tests if it contains kanji, so don't use kana?().
    if Util.kanji?(unknown)
      if !Util.empty_web_str?(kanji)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
      end

      kanji = unknown
    else
      if !Util.empty_web_str?(kana)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
      end

      kana = unknown
    end
  end

  # Normalize "empty" values to nil so the key below is built consistently.
  kana = nil if Util.empty_web_str?(kana)
  kanji = nil if Util.empty_web_str?(kanji)

  raise ArgumentError,'kanji and kana cannot both be empty' if kana.nil? && kanji.nil?

  @defn = defn
  @eng = eng
  @freq = freq
  @kana = kana
  @kanji = kanji
  @key = "#{kanji}=#{kana}" # nil.to_s() is ''
end

Instance Attribute Details

#defn ⇒ Object

Returns the value of attribute defn.



20
21
22
# File 'lib/nhkore/word.rb', line 20

# Reader for the definition stored in @defn.
def defn
  return @defn
end

#eng ⇒ Object

Returns the value of attribute eng.



21
22
23
# File 'lib/nhkore/word.rb', line 21

# Reader for the English text stored in @eng.
def eng
  return @eng
end

#freq ⇒ Object

Returns the value of attribute freq.



22
23
24
# File 'lib/nhkore/word.rb', line 22

# Reader for the frequency count stored in @freq.
def freq
  return @freq
end

#kana ⇒ Object (readonly)

Returns the value of attribute kana.



23
24
25
# File 'lib/nhkore/word.rb', line 23

# Reader for the kana stored in @kana.
def kana
  return @kana
end

#kanji ⇒ Object (readonly)

Returns the value of attribute kanji.



24
25
26
# File 'lib/nhkore/word.rb', line 24

# Reader for the kanji stored in @kanji.
def kanji
  return @kanji
end

#key ⇒ Object (readonly)

Returns the value of attribute key.



25
26
27
# File 'lib/nhkore/word.rb', line 25

# Reader for the lookup key stored in @key.
def key
  return @key
end

Class Method Details

.load_data(key, hash) ⇒ Object



81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/nhkore/word.rb', line 81

# Rebuilds a Word from one key/hash entry of previously saved data.
#
# @param key  the entry's key (Symbol or String); compared as a String
# @param hash the entry's fields, read via :defn, :eng, :kana, :kanji, :freq
# @return [Word] the rebuilt word
# @raise [ArgumentError] if +key+ differs from the key the new Word generates
def self.load_data(key,hash)
  key_str = key.to_s # Symbols become Strings for the comparison below.

  loaded = Word.new(defn: hash[:defn],eng: hash[:eng],kana: hash[:kana],kanji: hash[:kanji])

  unless key_str == loaded.key
    raise ArgumentError,"the key from the hash[#{key_str}] does not match the generated key[#{loaded.key}]"
  end

  # A missing freq becomes 0 via nil.to_i(), so the constructor's default is kept.
  count = hash[:freq].to_i
  loaded.freq = count if count.positive?

  return loaded
end

.scrape_ruby_tag(tag, missingno: nil, url: nil) ⇒ Array<Word>

Do not clean and/or strip spaces, as the raw text is important for Defn and ArticleScraper.

This originally only scraped 1 word, but multiple words were added after seeing this link for 産業能率大学, which is valid HTML:

https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html

Returns:

  • (Array<Word>)

Raises:

  • (ScrapeError)



109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'lib/nhkore/word.rb', line 109

# Scrapes every kanji/kana pair out of a <ruby> +tag+, one Word per pair.
#
# The raw text is used as-is (no cleaning/stripping).
#
# @param tag       element queried with css() and search(); presumably a
#                  Nokogiri node -- confirm against callers
# @param missingno optional fallback responding to kana_from_kanji() and
#                  kanji_from_kana(), used when one side of a pair is empty
# @param url       only interpolated into error messages
# @return [Array<Word>] the words, in the order the pairs were found
# @raise [ScrapeError] if no kanji or kana nodes are found, if their counts
#                      differ, or if a pair still has an empty kanji or kana
def self.scrape_ruby_tag(tag,missingno: nil,url: nil)
  # Kanji candidates in order of preference: <rb> tags, then bare text nodes,
  # then any non-<rt> child (in case of being wrapped in <span>, <b>, etc.).
  kanji_nodes = tag.css('rb')
  kanji_nodes = tag.search('./text()') if kanji_nodes.length < 1
  kanji_nodes = tag.search("./*[not(name()='rt')]") if kanji_nodes.length < 1

  kana_nodes = tag.css('rt')

  raise ScrapeError,"no kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length < 1
  raise ScrapeError,"no kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length < 1

  unless kanji_nodes.length == kana_nodes.length
    raise ScrapeError,"number of kanji & kana mismatch at URL[#{url}] in tag[#{tag}]"
  end

  scraped = Array.new(kanji_nodes.length) do |index|
    kanji = kanji_nodes[index].text
    kana = kana_nodes[index].text

    # Uncomment for debugging; really need a logger.
    #puts "Word[#{index}]: #{kanji} => #{kana}"

    unless missingno.nil?
      # Kana is checked first, since a missing kana is the typical scenario.
      # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
      # - '窓' in '(8)窓を開けて外の空気を入れましょう'
      if Util.empty_web_str?(kana)
        kana = missingno.kana_from_kanji(kanji)

        Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]") unless Util.empty_web_str?(kana)
      elsif Util.empty_web_str?(kanji)
        kanji = missingno.kanji_from_kana(kana)

        Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]") unless Util.empty_web_str?(kanji)
      end
    end

    raise ScrapeError,"empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
    raise ScrapeError,"empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)

    Word.new(kanji: kanji,kana: kana)
  end

  return scraped
end

.scrape_text_node(tag, url: nil) ⇒ Object

Do not clean and/or strip spaces, as the raw text is important for Defn and ArticleScraper.



165
166
167
168
169
170
171
172
173
174
# File 'lib/nhkore/word.rb', line 165

# Turns the raw text of +tag+ into a Word of unknown script.
#
# The text is passed along untouched (no cleaning/stripping).
#
# @param tag object responding to text()
# @param url accepted but not used by this method
# @return [Word,nil] nil when the text is empty per Util.empty_web_str?()
def self.scrape_text_node(tag,url: nil)
  raw = tag.text

  # Empty text is not an error (the node is not strictly kanji/kana only).
  return nil if Util.empty_web_str?(raw)

  return Word.new(unknown: raw)
end

Instance Method Details

#encode_with(coder) ⇒ Object



70
71
72
73
74
75
76
77
78
79
# File 'lib/nhkore/word.rb', line 70

# Writes this word's fields into +coder+ for serialization.
#
# @key is left out on purpose because it will be the key in the YAML/Hash.
#
# @param coder object accepting coder[name] = value
def encode_with(coder)
  # Insertion order of this Hash is the output order, and order matters.
  fields = {
    kanji: @kanji,
    kana: @kana,
    freq: @freq,
    defn: @defn,
    eng: @eng,
  }

  fields.each do |name,value|
    coder[name] = value
  end

  # Same value the method evaluates to when it ends on the :eng assignment.
  @eng
end

#kanji? ⇒ Boolean

Returns:

  • (Boolean)


176
177
178
# File 'lib/nhkore/word.rb', line 176

# Tells whether this word has kanji, i.e. @kanji is not empty per
# Util.empty_web_str?().
#
# @return [Boolean]
def kanji?
  without_kanji = Util.empty_web_str?(@kanji)

  return !without_kanji
end

#to_s ⇒ Object



184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/nhkore/word.rb', line 184

# Renders the word on a single line: the quoted key, then its fields in braces.
#
# Newlines in the definition are shown as a literal backslash-n so the
# result stays on one line.
#
# @return [String]
def to_s
  fields = [
    "kanji=>'#{@kanji}'",
    "kana=>'#{@kana}'",
    "freq=>#{@freq}",
    "defn=>'#{@defn.to_s.gsub("\n",'\\n')}'",
    "eng=>'#{@eng}'",
  ]

  return "'#{@key}': { #{fields.join(', ')} }"
end

#word ⇒ Object



180
181
182
# File 'lib/nhkore/word.rb', line 180

# Returns the preferred written form: the kanji when kanji?() is true,
# otherwise the kana.
def word
  return @kanji if kanji?

  return @kana
end