Module: Wp2txt

Included in:
Article, Runner, Splitter
Defined in:
lib/wp2txt.rb,
lib/wp2txt/regex.rb,
lib/wp2txt/utils.rb,
lib/wp2txt/article.rb,
lib/wp2txt/version.rb

Defined Under Namespace

Classes: Article, Runner, Splitter

Constant Summary collapse

HTML_DECODER =

Constants that hold precompiled regexps so they are not regenerated on every use. Names with a trailing number 1 match the opening tag/markup; names with a trailing number 2 match the closing tag/markup; names without a trailing number match both the opening and closing tags/markup.

HTMLEntities.new
ENTITIES =
['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
HTML_HASH =
HTML_REGEX =
Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
ML_TEMPLATE_ONSET_REGEX =
Regexp.new('^\{\{[^\}]*$')
ML_TEMPLATE_END_REGEX =
Regexp.new('\}\}\s*$')
Regexp.new('^\[\[[^\]]*$')
Regexp.new('\]\]\s*$')
ISOLATED_TEMPLATE_REGEX =
Regexp.new('^\s*\{\{.+\}\}\s*$')
ISOLATED_TAG_REGEX =
Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
Regexp.new('^\s*\[.*\]\s*$')
IN_INPUTBOX_REGEX =
Regexp.new('<inputbox>.*?<\/inputbox>')
IN_INPUTBOX_REGEX1 =
Regexp.new('<inputbox>')
IN_INPUTBOX_REGEX2 =
Regexp.new('<\/inputbox>')
IN_SOURCE_REGEX =
Regexp.new('<source.*?>.*?<\/source>')
IN_SOURCE_REGEX1 =
Regexp.new('<source.*?>')
IN_SOURCE_REGEX2 =
Regexp.new('<\/source>')
IN_MATH_REGEX =
Regexp.new('<math.*?>.*?<\/math>')
IN_MATH_REGEX1 =
Regexp.new('<math.*?>')
IN_MATH_REGEX2 =
Regexp.new('<\/math>')
IN_HEADING_REGEX =
Regexp.new('^=+.*?=+$')
IN_HTML_TABLE_REGEX =
Regexp.new("<table.*?><\/table>")
IN_HTML_TABLE_REGEX1 =
Regexp.new('<table\b')
IN_HTML_TABLE_REGEX2 =
Regexp.new('<\/\s*table>')
IN_TABLE_REGEX1 =
Regexp.new('^\s*\{\|')
IN_TABLE_REGEX2 =
Regexp.new('^\|\}.*?$')
IN_UNORDERED_REGEX =
Regexp.new('^\*')
IN_ORDERED_REGEX =
Regexp.new('^\#')
IN_PRE_REGEX =
Regexp.new('^ ')
IN_DEFINITION_REGEX =
Regexp.new('^[\;\:]')
BLANK_LINE_REGEX =
Regexp.new('^\s*$')
REDIRECT_REGEX =
Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
REMOVE_TAG_REGEX =
Regexp.new("\<[^\<\>]*\>")
REMOVE_DIRECTIVES_REGEX =
Regexp.new("\_\_[^\_]*\_\_")
REMOVE_EMPHASIS_REGEX =
Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
CHRREF_TO_UTF_REGEX =
Regexp.new('&#(x?)([0-9a-fA-F]+);')
MNDASH_REGEX =
Regexp.new('\{(mdash|ndash|–)\}')
REMOVE_HR_REGEX =
Regexp.new('^\s*\-+\s*$')
MAKE_REFERENCE_REGEX_A =
Regexp.new('<br ?\/>')
MAKE_REFERENCE_REGEX_B =
Regexp.new('<ref[^>]*\/>')
MAKE_REFERENCE_REGEX_C =
Regexp.new('<ref[^>]*>')
MAKE_REFERENCE_REGEX_D =
Regexp.new('<\/ref>')
FORMAT_REF_REGEX =
Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
HEADING_ONSET_REGEX =
Regexp.new('^(\=+)\s+')
HEADING_CODA_REGEX =
Regexp.new('\s+(\=+)$')
LIST_MARKS_REGEX =
Regexp.new('\A[\*\#\;\:\ ]+')
PRE_MARKS_REGEX =
Regexp.new('\A\^\ ')
DEF_MARKS_REGEX =
Regexp.new('\A[\;\:\ ]+')
ONSET_BAR_REGEX =
Regexp.new('\A[^\|]+\z')
CATEGORY_PATTERNS =
["Category", "Categoria"].join("|")
CATEGORY_REGEX =
Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
ESCAPE_NOWIKI_REGEX =
Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
UNESCAPE_NOWIKI_REGEX =
Regexp.new('<nowiki\-(\d+?)>')
REMOVE_ISOLATED_REGEX =
Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
REMOVE_INLINE_REGEX =
Regexp.new('\{\{(.*?)\}\}')
TYPE_CODE_REGEX =
Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
SINGLE_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
DOUBLE_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
SINGLE_CURLY_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
DOUBLE_CURLY_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
CURLY_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
COMPLEX_REGEX_01 =
Regexp.new('\<\<([^<>]++)\>\>\s?')
COMPLEX_REGEX_02 =
Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
COMPLEX_REGEX_03 =
Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
COMPLEX_REGEX_04 =
Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
COMPLEX_REGEX_05 =
Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
CLEANUP_REGEX_01 =
Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
CLEANUP_REGEX_02 =
Regexp.new('^File:.+$')
CLEANUP_REGEX_03 =
Regexp.new('^\|.*$')
CLEANUP_REGEX_04 =
Regexp.new('\{\{.*$')
CLEANUP_REGEX_05 =
Regexp.new('^.*\}\}')
CLEANUP_REGEX_06 =
Regexp.new('\{\|.*$')
CLEANUP_REGEX_07 =
Regexp.new('^.*\|\}')
CLEANUP_REGEX_08 =
Regexp.new('\n\n\n+', Regexp::MULTILINE)
VERSION =
"1.1.3"

Instance Method Summary collapse

Instance Method Details

#batch_file_mod(dir_path) ⇒ Object

modify files under a directory (recursive)



315
316
317
318
319
320
321
322
323
# File 'lib/wp2txt/utils.rb', line 315

# Yields each regular file located at or below +dir_path+.
#
# For a directory, every path returned by +collect_files+ that is a regular
# file is yielded in turn. For a regular file, that single path is yielded.
# Any other kind of path yields nothing.
#
# @param dir_path [String] directory or file path
# @yieldparam file [String] path of a regular file
# @return [Array<String>, Object, nil] the collected paths for a directory,
#   the block's value for a single file, nil otherwise
def batch_file_mod(dir_path)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |path|
    yield path if FileTest.file?(path)
  end
end

#chrref_to_utf(num_str) ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/wp2txt/utils.rb', line 200

# Replaces numeric character references (+&#NNN;+ decimal, +&#xHHH;+
# hexadecimal) with the UTF-8 character they denote.
#
# Each reference is converted on its own. A reference that does not name a
# valid Unicode scalar value (a surrogate, a value above U+10FFFF) or a
# decimal reference that contains hex letters is left in the text as written,
# so one bad reference no longer prevents the others from being converted.
#
# @param num_str [String] text that may contain numeric character references
# @return [String] converted copy of +num_str+; +num_str+ itself when the
#   substitution as a whole raises a StandardError
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do |reference|
    hex = Regexp.last_match(1) == "x"
    digits = Regexp.last_match(2)
    # The pattern accepts a-f after a decimal prefix; such a reference is malformed.
    next reference unless hex || /\A\d+\z/ =~ digits

    begin
      # Integer#chr with an explicit encoding covers the whole Unicode range
      # and raises RangeError for code points UTF-8 cannot represent.
      digits.to_i(hex ? 16 : 10).chr(Encoding::UTF_8)
    rescue RangeError
      reference
    end
  end
rescue StandardError
  num_str
end

#cleanup(text) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/wp2txt/utils.rb', line 45

# Removes leftover markup fragments from +text+ and normalizes spacing.
#
# Every match of CLEANUP_REGEX_01 through CLEANUP_REGEX_07 is deleted, in that
# order; matches of CLEANUP_REGEX_08 are replaced by two newlines; the result
# is stripped and terminated with two newlines.
#
# @param text [String]
# @return [String] a new string ending in "\n\n"
def cleanup(text)
  removals = [
    CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
    CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07
  ]
  pruned = removals.reduce(text) { |acc, pattern| acc.gsub(pattern, "") }
  tidy = pruned.gsub(CLEANUP_REGEX_08, "\n\n").strip
  tidy << "\n\n"
end

#collect_files(str, regex = nil) ⇒ Object

collect filenames recursively



289
290
291
292
293
294
295
296
# File 'lib/wp2txt/utils.rb', line 289

# Collects, recursively, every path under +str+ whose name matches +regex+.
#
# The starting path and any directories found are included when they match.
#
# @param str [String] path at which the traversal starts
# @param regex [Regexp, nil] filter applied to each path; nil matches all
# @return [Array<String>] matching paths in sorted order
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern =~ path }.sort
end

#convert_characters(text, has_retried = false) ⇒ Object



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/wp2txt/utils.rb', line 8

# Converts numeric character references and HTML entities in +text+ and
# returns the result re-encoded as UTF-8 with invalid byte sequences removed.
#
# The text is passed through +chrref_to_utf+, then +special_chr+, then a
# UTF-8 to UTF-8 +encode+ with +invalid: :replace, replace: ""+.
#
# If any of those steps raises a StandardError the method calls itself once
# more on a copy re-encoded as UTF-16. If that retry raises as well, it
# prints a message, writes the text to +error_log.txt+ in the current
# directory and terminates the process with +exit+.
#
# @param text [String] text to convert
# @param has_retried [Boolean] true when this call is the retry
# @return [String] the converted text
def convert_characters(text, has_retried = false)
  # Appends an empty string to the argument itself; the content is unchanged.
  # NOTE(review): purpose is not evident from this method — confirm intent.
  text << ""
  text = chrref_to_utf(text)
  text = special_chr(text)
  text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
rescue StandardError # detect invalid byte sequence in UTF-8
  if has_retried
    # Second failure: report, dump the offending text, and stop the program.
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write text
    end
    exit
  else
    # First failure: re-encode and try exactly once more.
    # NOTE(review): the two statements below are identical; the second looks
    # redundant — confirm before removing.
    text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
    text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
    convert_characters(text, true)
  end
end

#correct_inline_template(str) ⇒ Object



256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# File 'lib/wp2txt/utils.rb', line 256

# Replaces each +{{...}}+ template in +str+ with a plain-text value taken
# from its pipe-separated parts.
#
# The first part is dropped when it is +lang+, +fontsize+ or starts with
# +lang-+, or when the second part starts with +lang=+ (all case-insensitive).
# If exactly one part remains it is used as is. Otherwise the second part is
# used; when that part has the form +key=value+ only the value is kept. The
# chosen text is stripped of surrounding whitespace.
#
# @param str [String]
# @return [String]
def correct_inline_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    parts = contents.split("|")
    head = parts[0]
    drop_head = /\A(?:lang|fontsize)\z/i =~ head ||
                /\Alang-/i =~ head ||
                /\Alang=/i =~ parts[1]
    parts.shift if drop_head

    chosen =
      if parts.size == 1
        parts.first
      else
        second = parts[1]
        begin
          pair = second.split("=")
          pair.size > 1 ? pair[1] : (second || "")
        rescue StandardError
          # Reached when there is no second part (second is nil).
          second || ""
        end
      end
    chosen.strip
  end
end

#correct_separator(input) ⇒ Object

handle differences in path separators between environments



326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
# File 'lib/wp2txt/utils.rb', line 326

# Rewrites path separators to suit the running platform.
#
# When RUBY_PLATFORM contains "win32", forward slashes become backslashes;
# otherwise backslashes become forward slashes. Arrays are converted element
# by element (recursively).
#
# @param input [String, Array] a path or a list of paths
# @return [String, Array, nil] the converted value; nil for any other input
def correct_separator(input)
  case input
  when Array
    input.map { |item| correct_separator(item) }
  when String
    from, to = RUBY_PLATFORM.index("win32") ? ["/", "\\"] : ["\\", "/"]
    input.gsub(from, to)
  end
end

#escape_nowiki(str) ⇒ Object

methods used from format_wiki ####################



104
105
106
107
108
109
110
111
112
113
114
115
116
# File 'lib/wp2txt/utils.rb', line 104

# Replaces every +<nowiki>...</nowiki>+ section in +str+ with a placeholder
# of the form +<nowiki-ID>+ and remembers the original content.
#
# The content is stored in the +@nowikis+ hash under ID, which is the
# +object_id+ of the captured string. The hash is emptied at the start of
# every call.
#
# @param str [String]
# @return [String] text with placeholders in place of nowiki sections
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(ESCAPE_NOWIKI_REGEX) do
    captured = Regexp.last_match(1)
    key = captured.object_id
    @nowikis[key] = captured
    "<nowiki-#{key}>"
  end
end

#file_mod(file_path, backup = false) ⇒ Object

modify a file using block/yield mechanism



299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/wp2txt/utils.rb', line 299

# Rewrites a file with the content produced by the given block.
#
# The file's current content is yielded; the block's return value becomes the
# new content. If the block returns nil the content is left as it was.
#
# The new content is first written to a temporary file next to the target
# (+file_path+ plus ".tmp"), the original is moved to +file_path+ plus ".bak",
# and the temporary file is then moved into place. Keeping the temporary file
# beside the target avoids clashing with an unrelated "temp" file in the
# working directory.
#
# @param file_path [String] file to modify
# @param backup [Boolean] keep the ".bak" copy of the original when true
# @yieldparam str [String] current content of the file
# @yieldreturn [String, nil] replacement content, or nil to keep it unchanged
# @return [Integer, nil] result of deleting the backup, or nil when it is kept
def file_mod(file_path, backup = false)
  original = File.open(file_path, "r", &:read)
  updated = yield(original)
  updated = original if updated.nil?

  temp_path = "#{file_path}.tmp"
  backup_path = "#{file_path}.bak"
  begin
    File.open(temp_path, "w") { |tf| tf.write(updated) }
    File.rename(file_path, backup_path)
    File.rename(temp_path, file_path)
  ensure
    # Only still present when one of the steps above failed.
    File.unlink(temp_path) if File.exist?(temp_path)
  end
  File.unlink(backup_path) unless backup
end

#format_wiki(text, config = {}) ⇒ Object



28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/wp2txt/utils.rb', line 28

# Runs +text+ through the wiki-markup formatting steps and returns the result.
#
# These steps always run, in this order: remove_complex, escape_nowiki,
# process_interwiki_links, process_external_links, unescape_nowiki,
# remove_directive, remove_emphasis, mndash, remove_hr, remove_tag.
# correct_inline_template and remove_templates then run unless
# +config[:inline]+ is truthy, and remove_table runs unless +config[:table]+
# is truthy.
#
# @param text [String]
# @param config [Hash] optional +:inline+ and +:table+ switches
# @return [String]
def format_wiki(text, config = {})
  unconditional_steps = %i[
    remove_complex escape_nowiki process_interwiki_links
    process_external_links unescape_nowiki remove_directive
    remove_emphasis mndash remove_hr remove_tag
  ]
  formatted = unconditional_steps.reduce(text) { |acc, step| send(step, acc) }

  unless config[:inline]
    formatted = correct_inline_template(formatted)
    formatted = remove_templates(formatted)
  end
  formatted = remove_table(formatted) unless config[:table]
  formatted
end

#make_reference(str) ⇒ Object



249
250
251
252
253
254
# File 'lib/wp2txt/utils.rb', line 249

# Converts reference-related HTML tags in +str+ to plain markers.
#
# Matches of MAKE_REFERENCE_REGEX_A become a newline, matches of
# MAKE_REFERENCE_REGEX_B are removed, matches of MAKE_REFERENCE_REGEX_C become
# "[ref]" and matches of MAKE_REFERENCE_REGEX_D become "[/ref]", applied in
# that order.
#
# @param str [String]
# @return [String]
def make_reference(str)
  str.gsub(MAKE_REFERENCE_REGEX_A, "\n")
     .gsub(MAKE_REFERENCE_REGEX_B, "")
     .gsub(MAKE_REFERENCE_REGEX_C, "[ref]")
     .gsub(MAKE_REFERENCE_REGEX_D, "[/ref]")
end

#mndash(str) ⇒ Object



216
217
218
# File 'lib/wp2txt/utils.rb', line 216

# Deletes every match of MNDASH_REGEX from +str+.
#
# @param str [String]
# @return [String]
def mndash(str)
  str.gsub(MNDASH_REGEX) { "" }
end


139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/wp2txt/utils.rb', line 139

# Replaces each single-bracket +[...]+ span in +str+ with display text.
#
# Content that both starts and ends with whitespace is turned into
# " (content) " with the content stripped. Otherwise the content is split at
# the first run of whitespace: the part after it is kept, or the whole
# content when there is no whitespace. Empty brackets yield an empty string.
#
# @param str [String]
# @return [String]
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    next " (#{contents.strip}) " if /\A\s.+\s\z/ =~ contents

    contents.split(" ", 2).last || ""
  end
end


125
126
127
128
129
130
131
132
133
134
135
136
137
# File 'lib/wp2txt/utils.rb', line 125

# Replaces each +[[...]]+ link in +str+ with its display text.
#
# The link content is split on "|". A link with a single part is replaced by
# that part; otherwise the first part is discarded and the remaining parts
# are joined with "|" again. Empty content yields an empty string.
#
# @param str [String]
# @return [String]
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    labels.empty? ? (target || "") : labels.join("|")
  end
end

#process_nested_structure(scanner, left, right, &block) ⇒ Object

parser for nested structure ####################



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/wp2txt/utils.rb', line 60

# Rewrites, innermost first, every +left+ ... +right+ pair in the scanner's
# string until no pair remains.
#
# On each pass the text is scanned for either delimiter. When a +right+
# delimiter is met after at least one +left+ has been seen, the text between
# the most recent +left+ and that +right+ is handed to the block, and the
# whole pair (delimiters included) is replaced by the block's return value.
# The scanner is then reset to the rewritten string and the next pass starts.
# Processing stops when a pass leaves the string unchanged.
#
# This is a loop rather than a recursive call per replacement, so text with
# very many pairs cannot exhaust the call stack. The right delimiter is
# trimmed using its own length, so delimiters of different lengths are
# handled correctly.
#
# The block must return a String and should not produce a new complete
# delimiter pair, otherwise the string never stops changing.
#
# @param scanner [StringScanner] scanner over the text; its string is replaced
#   after every pass
# @param left [String] opening delimiter
# @param right [String] closing delimiter
# @yieldparam contents [String] text between an innermost pair of delimiters
# @yieldreturn [String] replacement for the pair
# @return [String] the rewritten text; when a StandardError is raised, the
#   scanner's string as of the last completed pass
def process_nested_structure(scanner, left, right, &block)
  # Reuse the precompiled constants for the common delimiter pairs.
  regex = case [left, right]
          when ["[", "]"] then SINGLE_SQUARE_BRACKET_REGEX
          when ["[[", "]]"] then DOUBLE_SQUARE_BRACKET_REGEX
          when ["{", "}"] then SINGLE_CURLY_BRACKET_REGEX
          when ["{{", "}}"] then DOUBLE_CURLY_BRACKET_REGEX
          when ["{|", "|}"] then CURLY_SQUARE_BRACKET_REGEX
          else Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
          end

  loop do
    buffer = +""
    has_left = false
    while (str = scanner.scan_until(regex))
      case scanner[1]
      when left
        buffer << str
        has_left = true
      when right
        unless has_left
          # A closing delimiter with no opener before it is kept as text.
          buffer << str
          next
        end
        # Drop the opener just appended, then substitute the pair's contents.
        buffer = buffer[0...-left.size]
        buffer << block.call(str[0...-right.size])
        break
      end
    end
    buffer << scanner.rest

    return buffer if buffer == scanner.string

    scanner.string = buffer
  end
rescue StandardError
  scanner.string
end

#remove_complex(str) ⇒ Object



241
242
243
244
245
246
247
# File 'lib/wp2txt/utils.rb', line 241

# Strips complex markup from +str+.
#
# Matches of COMPLEX_REGEX_01 are replaced by their first capture group;
# matches of COMPLEX_REGEX_02 through COMPLEX_REGEX_05 are then deleted, in
# that order.
#
# @param str [String]
# @return [String]
def remove_complex(str)
  unwrapped = str.gsub(COMPLEX_REGEX_01) { Regexp.last_match(1).to_s }
  deletions = [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05]
  deletions.reduce(unwrapped) { |acc, pattern| acc.gsub(pattern, "") }
end

#remove_directive(str) ⇒ Object



190
191
192
# File 'lib/wp2txt/utils.rb', line 190

# Deletes every match of REMOVE_DIRECTIVES_REGEX from +str+.
#
# @param str [String]
# @return [String]
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end

#remove_emphasis(str) ⇒ Object



194
195
196
197
198
# File 'lib/wp2txt/utils.rb', line 194

# Replaces every match of REMOVE_EMPHASIS_REGEX in +str+ with the match's
# second capture group, i.e. the text between the emphasis quote marks.
#
# @param str [String]
# @return [String]
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
end

#remove_hr(str) ⇒ Object



220
221
222
# File 'lib/wp2txt/utils.rb', line 220

# Deletes every match of REMOVE_HR_REGEX from +str+.
#
# @param str [String]
# @return [String]
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end

#remove_html(str) ⇒ Object



228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/wp2txt/utils.rb', line 228

# Removes self-closing tags and selected HTML blocks from a copy of +str+.
#
# Tags of the form +<... />+ are deleted first. Then, for each of div,
# gallery, timeline and noinclude, everything from "<tag" up to and including
# the following "tag>" is removed via +process_nested_structure+.
#
# @param str [String] left unmodified
# @return [String]
def remove_html(str)
  without_self_closing = str.gsub(%r{<[^<>]+/>}, "")
  %w[div gallery timeline noinclude].reduce(without_self_closing) do |text, tag|
    process_nested_structure(StringScanner.new(text), "<#{tag}", "#{tag}>") { "" }
  end
end

#remove_inbetween(str, tagset = ["<", ">"]) ⇒ Object



180
181
182
183
184
# File 'lib/wp2txt/utils.rb', line 180

# Deletes every span of +str+ that starts with the first element of +tagset+,
# ends with the second, and contains neither of them in between.
#
# @param str [String]
# @param tagset [Array<String>] opening and closing delimiter
# @return [String]
def remove_inbetween(str, tagset = ["<", ">"])
  excluded = Regexp.quote(tagset.uniq.join(""))
  opener = Regexp.escape(tagset[0])
  closer = Regexp.escape(tagset[1])
  str.gsub(Regexp.new("#{opener}[^#{excluded}]*#{closer}"), "")
end

#remove_ref(str) ⇒ Object



224
225
226
# File 'lib/wp2txt/utils.rb', line 224

# Deletes every match of FORMAT_REF_REGEX from +str+.
#
# @param str [String]
# @return [String]
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end

#remove_table(str) ⇒ Object



169
170
171
172
173
174
# File 'lib/wp2txt/utils.rb', line 169

# Removes every +{| ... |}+ table from +str+, nested tables included, by
# replacing each pair with an empty string via +process_nested_structure+.
#
# @param str [String]
# @return [String]
def remove_table(str)
  process_nested_structure(StringScanner.new(str), "{|", "|}") { "" }
end

#remove_tag(str) ⇒ Object



186
187
188
# File 'lib/wp2txt/utils.rb', line 186

# Deletes every match of REMOVE_TAG_REGEX from +str+.
#
# @param str [String]
# @return [String]
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end

#remove_templates(str) ⇒ Object

methods used from format_article ####################



158
159
160
161
162
163
164
165
166
167
# File 'lib/wp2txt/utils.rb', line 158

# Removes brace-delimited templates from +str+ in two passes: first every
# +{{...}}+ pair, then every remaining +{...}+ pair. Both passes use
# +process_nested_structure+ with an empty replacement.
#
# @param str [String]
# @return [String]
def remove_templates(str)
  without_double = process_nested_structure(StringScanner.new(str), "{{", "}}") { "" }
  process_nested_structure(StringScanner.new(without_double), "{", "}") { "" }
end

#rename(files, ext = "txt") ⇒ Object



343
344
345
346
347
348
349
350
351
352
353
354
355
356
# File 'lib/wp2txt/utils.rb', line 343

# Renames files so that a trailing +-NUMBER+ suffix is zero-padded to a common
# width, and appends +.ext+ to every name.
#
# The width is the number of digits in the longest numeric suffix among all
# of +files+. It is determined before any file is renamed so that every file
# receives the same padding regardless of its position in the list. Names
# without a numeric suffix only gain the extension.
#
# @param files [Array<String>] paths of the files to rename
# @param ext [String] extension to append, without the leading dot
# @return [true]
def rename(files, ext = "txt")
  suffix = /-(\d+)\z/
  # num of digits necessary to name the last file generated
  maxwidth = files.map { |f| f.slice(suffix, 1).to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(suffix) do
      "-" + format("%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end

#sec_to_str(int) ⇒ Object

convert int of seconds to string in the format 00:00:00



359
360
361
362
363
364
365
366
367
368
# File 'lib/wp2txt/utils.rb', line 359

# Formats a number of seconds as "HH:MM:SS".
#
# The value is truncated to whole seconds first, so a fractional input (for
# example an elapsed time measured as a Float) gives the correct minutes
# field instead of always showing 00. Hours are not wrapped and may use more
# than two digits.
#
# @param int [Numeric, nil] number of seconds
# @return [String] formatted time, or "--:--:--" when +int+ is nil or false
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.to_i.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end

#special_chr(str) ⇒ Object



176
177
178
# File 'lib/wp2txt/utils.rb', line 176

# Passes +str+ to the +decode+ method of HTML_DECODER and returns the result.
#
# @param str [String]
# @return [Object] whatever HTML_DECODER.decode returns
def special_chr(str)
  decoder = HTML_DECODER
  decoder.decode(str)
end

#unescape_nowiki(str) ⇒ Object



118
119
120
121
122
123
# File 'lib/wp2txt/utils.rb', line 118

# Replaces each +<nowiki-ID>+ placeholder in +str+ with the text stored under
# the integer ID in +@nowikis+. A placeholder whose ID is not present in the
# hash is replaced by an empty string.
#
# @param str [String]
# @return [String]
def unescape_nowiki(str)
  str.gsub(UNESCAPE_NOWIKI_REGEX) { @nowikis[Regexp.last_match(1).to_i] }
end