Module: Wp2txt

Included in:: Article, Runner, Splitter

Defined in:: lib/wp2txt.rb,
lib/wp2txt/regex.rb,
lib/wp2txt/utils.rb,
lib/wp2txt/article.rb,
lib/wp2txt/version.rb

Defined Under Namespace

Constant Summary collapse

HTML_DECODER = Variables that save the cost of regenerating regexps. Names with a trailing number 1 represent the opening tag/markup; names with a trailing number 2 represent the closing tag/markup; names without a trailing number contain both the opening and closing tags/markups.

HTMLEntities.new

ENTITIES =

['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])

HTML_HASH =

HTML_REGEX =

Regexp.new("(" + HTML_HASH.keys.join("|") + ")")

ML_TEMPLATE_ONSET_REGEX =

Regexp.new('^\{\{[^\}]*$')

ML_TEMPLATE_END_REGEX =

Regexp.new('\}\}\s*$')

ML_LINK_ONSET_REGEX =

Regexp.new('^\[\[[^\]]*$')

ML_LINK_END_REGEX =

Regexp.new('\]\]\s*$')

ISOLATED_TEMPLATE_REGEX =

Regexp.new('^\s*\{\{.+\}\}\s*$')

ISOLATED_TAG_REGEX =

Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')

IN_LINK_REGEX =

Regexp.new('^\s*\[.*\]\s*$')

IN_INPUTBOX_REGEX =

Regexp.new('<inputbox>.*?<\/inputbox>')

IN_INPUTBOX_REGEX1 =

Regexp.new('<inputbox>')

IN_INPUTBOX_REGEX2 =

Regexp.new('<\/inputbox>')

IN_SOURCE_REGEX =

Regexp.new('<source.*?>.*?<\/source>')

IN_SOURCE_REGEX1 =

Regexp.new('<source.*?>')

IN_SOURCE_REGEX2 =

Regexp.new('<\/source>')

IN_MATH_REGEX =

Regexp.new('<math.*?>.*?<\/math>')

IN_MATH_REGEX1 =

Regexp.new('<math.*?>')

IN_MATH_REGEX2 =

Regexp.new('<\/math>')

IN_HEADING_REGEX =

Regexp.new('^=+.*?=+$')

IN_HTML_TABLE_REGEX =

Regexp.new("<table.*?><\/table>")

IN_HTML_TABLE_REGEX1 =

Regexp.new('<table\b')

IN_HTML_TABLE_REGEX2 =

Regexp.new('<\/\s*table>')

IN_TABLE_REGEX1 =

Regexp.new('^\s*\{\|')

IN_TABLE_REGEX2 =

Regexp.new('^\|\}.*?$')

IN_UNORDERED_REGEX =

Regexp.new('^\*')

IN_ORDERED_REGEX =

Regexp.new('^\#')

IN_PRE_REGEX =

Regexp.new('^ ')

IN_DEFINITION_REGEX =

Regexp.new('^[\;\:]')

BLANK_LINE_REGEX =

Regexp.new('^\s*$')

REDIRECT_REGEX =

Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)

REMOVE_TAG_REGEX =

Regexp.new("\<[^\<\>]*\>")

REMOVE_DIRECTIVES_REGEX =

Regexp.new("\_\_[^\_]*\_\_")

REMOVE_EMPHASIS_REGEX =

Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')

CHRREF_TO_UTF_REGEX =

Regexp.new('&#(x?)([0-9a-fA-F]+);')

MNDASH_REGEX =

Regexp.new('\{(mdash|ndash|–)\}')

REMOVE_HR_REGEX =

Regexp.new('^\s*\-+\s*$')

MAKE_REFERENCE_REGEX_A =

Regexp.new('<br ?\/>')

MAKE_REFERENCE_REGEX_B =

Regexp.new('<ref[^>]*\/>')

MAKE_REFERENCE_REGEX_C =

Regexp.new('<ref[^>]*>')

MAKE_REFERENCE_REGEX_D =

Regexp.new('<\/ref>')

FORMAT_REF_REGEX =

Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)

HEADING_ONSET_REGEX =

Regexp.new('^(\=+)\s+')

HEADING_CODA_REGEX =

Regexp.new('\s+(\=+)$')

LIST_MARKS_REGEX =

Regexp.new('\A[\*\#\;\:\ ]+')

PRE_MARKS_REGEX =

Regexp.new('\A\^\ ')

DEF_MARKS_REGEX =

Regexp.new('\A[\;\:\ ]+')

ONSET_BAR_REGEX =

Regexp.new('\A[^\|]+\z')

CATEGORY_PATTERNS =

["Category", "Categoria"].join("|")

CATEGORY_REGEX =

Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)

ESCAPE_NOWIKI_REGEX =

Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)

UNESCAPE_NOWIKI_REGEX =

Regexp.new('<nowiki\-(\d+?)>')

REMOVE_ISOLATED_REGEX =

Regexp.new('^\s*\{\{(.*?)\}\}\s*$')

REMOVE_INLINE_REGEX =

Regexp.new('\{\{(.*?)\}\}')

TYPE_CODE_REGEX =

Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)

SINGLE_SQUARE_BRACKET_REGEX =

Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)

DOUBLE_SQUARE_BRACKET_REGEX =

Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)

SINGLE_CURLY_BRACKET_REGEX =

Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)

DOUBLE_CURLY_BRACKET_REGEX =

Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)

CURLY_SQUARE_BRACKET_REGEX =

Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)

COMPLEX_REGEX_01 =

Regexp.new('\<\<([^<>]++)\>\>\s?')

COMPLEX_REGEX_02 =

Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)

COMPLEX_REGEX_03 =

Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)

COMPLEX_REGEX_04 =

Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)

COMPLEX_REGEX_05 =

Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)

CLEANUP_REGEX_01 =

Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)

CLEANUP_REGEX_02 =

Regexp.new('^File:.+$')

CLEANUP_REGEX_03 =

Regexp.new('^\|.*$')

CLEANUP_REGEX_04 =

Regexp.new('\{\{.*$')

CLEANUP_REGEX_05 =

Regexp.new('^.*\}\}')

CLEANUP_REGEX_06 =

Regexp.new('\{\|.*$')

CLEANUP_REGEX_07 =

Regexp.new('^.*\|\}')

CLEANUP_REGEX_08 =

Regexp.new('\n\n\n+', Regexp::MULTILINE)

VERSION =

"1.1.3"

Instance Method Summary collapse

#batch_file_mod(dir_path) ⇒ Object

Modify files under a directory (recursively).
#chrref_to_utf(num_str) ⇒ Object
#cleanup(text) ⇒ Object
#collect_files(str, regex = nil) ⇒ Object

collect filenames recursively.
#convert_characters(text, has_retried = false) ⇒ Object
#correct_inline_template(str) ⇒ Object
#correct_separator(input) ⇒ Object

take care of difference of separators among environments.
#escape_nowiki(str) ⇒ Object

methods used from format_wiki ####################.
#file_mod(file_path, backup = false) ⇒ Object

modify a file using block/yield mechanism.
#format_wiki(text, config = {}) ⇒ Object
#make_reference(str) ⇒ Object
#mndash(str) ⇒ Object
#process_external_links(str) ⇒ Object
#process_interwiki_links(str) ⇒ Object
#process_nested_structure(scanner, left, right, &block) ⇒ Object

parser for nested structure ####################.
#remove_complex(str) ⇒ Object
#remove_directive(str) ⇒ Object
#remove_emphasis(str) ⇒ Object
#remove_hr(str) ⇒ Object
#remove_html(str) ⇒ Object
#remove_inbetween(str, tagset = ["<", ">"]) ⇒ Object
#remove_ref(str) ⇒ Object
#remove_table(str) ⇒ Object
#remove_tag(str) ⇒ Object
#remove_templates(str) ⇒ Object

methods used from format_article ####################.
#rename(files, ext = "txt") ⇒ Object
#sec_to_str(int) ⇒ Object

convert int of seconds to string in the format 00:00:00.
#special_chr(str) ⇒ Object
#unescape_nowiki(str) ⇒ Object

Instance Method Details

#batch_file_mod(dir_path) ⇒ `Object`

Modify files under a directory (recursively).

# File 'lib/wp2txt/utils.rb', line 315

# Yields file paths found at +dir_path+ to the given block.
#
# * If +dir_path+ is a directory, every path returned by collect_files is
#   visited in order and yielded when it is a regular file.
# * If +dir_path+ is itself a regular file, it is yielded once.
# * Any other path yields nothing and the method returns nil.
def batch_file_mod(dir_path)
  unless FileTest.directory?(dir_path)
    return FileTest.file?(dir_path) ? yield(dir_path) : nil
  end

  collect_files(dir_path).each do |path|
    # The file check happens right before each yield, so changes made by the
    # block to later paths are taken into account.
    next unless FileTest.file?(path)

    yield path
  end
end

#chrref_to_utf(num_str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 200

# Replaces numeric character references (matched by CHRREF_TO_UTF_REGEX, e.g.
# "&#65;" or "&#x41;") in +num_str+ with the UTF-8 character they denote.
#
# The code point is converted with Integer#chr(Encoding::UTF_8), so code
# points above U+FFFF (emoji, rare CJK, ...) are decoded as well. A reference
# that does not denote a valid code point (surrogates, values beyond
# U+10FFFF) is left in the text unchanged instead of aborting the whole
# conversion.
#
# Returns the converted string. On any other StandardError the original
# +num_str+ is returned untouched.
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do
    match = Regexp.last_match
    # Group 1 is "x" for hexadecimal references and empty for decimal ones.
    codepoint = match[1] == "x" ? match[2].to_i(16) : match[2].to_i
    begin
      codepoint.chr(Encoding::UTF_8)
    rescue RangeError
      # Not a representable code point: keep the reference text as written.
      match[0]
    end
  end
rescue StandardError
  num_str
end

#cleanup(text) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 45

# Final tidy-up pass over converted text.
#
# Every match of CLEANUP_REGEX_01 .. CLEANUP_REGEX_07 is deleted (in that
# order), each match of CLEANUP_REGEX_08 is replaced by a single blank line,
# surrounding whitespace is stripped and the result is terminated with
# exactly two newlines.
def cleanup(text)
  deletions = [
    CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
    CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07
  ]
  pruned = deletions.reduce(text) { |acc, pattern| acc.gsub(pattern) { "" } }
  collapsed = pruned.gsub(CLEANUP_REGEX_08) { "\n\n" }
  collapsed.strip << "\n\n"
end

#collect_files(str, regex = nil) ⇒ `Object`

collect filenames recursively

# File 'lib/wp2txt/utils.rb', line 289

# Collects paths recursively below +str+ using Find.find.
#
# str   - starting path handed to Find.find.
# regex - optional pattern; only paths it matches are kept. When nil, an
#         empty pattern is used, which matches every path.
#
# Returns the matching paths as a sorted Array (the starting path itself is
# included when it matches).
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern =~ path }.sort
end

#convert_characters(text, has_retried = false) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 8

# Converts character references and special characters in +text+ and returns
# the result re-encoded as UTF-8 with invalid byte sequences removed.
#
# text        - the String to convert. It is appended to in place below.
# has_retried - true when this invocation is the retry issued from the
#               rescue branch; callers normally leave it at the default.
#
# On the first StandardError the text is re-encoded (UTF-16 to UTF-16,
# dropping invalid sequences) and the method calls itself once more. If the
# retry fails too, a message is printed, the text is written to
# "error_log.txt" in the current directory and the process exits.
def convert_characters(text, has_retried = false)
  # Appends an empty string in place.
  # NOTE(review): presumably meant to fail early on unusable input (e.g. a
  # frozen string) so the rescue path below takes over — confirm.
  text << ""
  text = chrref_to_utf(text)
  text = special_chr(text)
  text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "")
rescue StandardError # detect invalid byte sequence in UTF-8
  if has_retried
    # Second failure: report, dump the offending text and terminate.
    puts "invalid byte sequence detected"
    puts "******************************"
    File.open("error_log.txt", "w") do |f|
      f.write text
    end
    exit
  else
    # First failure: drop invalid sequences and retry exactly once.
    # The second encode call repeats the first one with the same arguments.
    text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
    text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "")
    convert_characters(text, true)
  end
end

#correct_inline_template(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 256

# Replaces each "{{...}}" template in +str+ with a plain-text value taken from
# its "|"-separated fields, using process_nested_structure to visit the
# templates.
#
# The leading field is dropped when it is "lang"/"fontsize", starts with
# "lang-", or when the second field starts with "lang=" (all case
# insensitive). After that:
# * a single remaining field is used as is;
# * otherwise the second field is used, or the part after its first "=" when
#   it has the form "key=value";
# * when there is no second field, the template is replaced by "".
# The chosen value is stripped of surrounding whitespace.
def correct_inline_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    fields = contents.split("|")
    head = fields[0]
    drop_head = /\A(?:lang|fontsize)\z/i =~ head ||
                /\Alang-/i =~ head ||
                /\Alang=/i =~ fields[1]
    fields.shift if drop_head

    replacement =
      if fields.size == 1
        fields[0]
      else
        begin
          pair = fields[1].split("=")
          pair.size > 1 ? pair[1] : (fields[1] || "")
        rescue StandardError
          # Reached when there is no second field (fields[1] is nil).
          fields[1] || ""
        end
      end
    replacement.strip
  end
end

#correct_separator(input) ⇒ `Object`

take care of difference of separators among environments

# File 'lib/wp2txt/utils.rb', line 326

# Normalizes path separators for the current platform.
#
# input - a String, or an Array (possibly nested) of such values.
#
# For a String, "/" becomes "\" when RUBY_PLATFORM contains "win32";
# otherwise "\" becomes "/". For an Array, a new Array with every element
# converted recursively is returned. Any other input yields nil.
def correct_separator(input)
  if input.is_a?(String)
    return input.gsub("/", "\\") if RUBY_PLATFORM.index("win32")

    input.gsub("\\", "/")
  elsif input.is_a?(Array)
    input.map { |item| correct_separator(item) }
  end
end

#escape_nowiki(str) ⇒ `Object`

methods used from format_wiki ####################

# File 'lib/wp2txt/utils.rb', line 104

# Replaces every section matched by ESCAPE_NOWIKI_REGEX in +str+ with a
# placeholder of the form "<nowiki-ID>".
#
# The captured text is remembered in the @nowikis hash under ID (the
# object_id of the captured string) so it can be looked up again later.
# @nowikis is emptied at the start of each call; the hash object is reused
# when it already exists.
#
# Returns the string with placeholders inserted.
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(ESCAPE_NOWIKI_REGEX) do
    raw = Regexp.last_match(1)
    key = raw.object_id
    @nowikis[key] = raw
    "<nowiki-#{key}>"
  end
end

#file_mod(file_path, backup = false) ⇒ `Object`

modify a file using block/yield mechanism

# File 'lib/wp2txt/utils.rb', line 299

# Rewrites a file with the value returned by the given block.
#
# file_path - path of the file to modify.
# backup    - when true, the original content is kept as "<file_path>.bak";
#             otherwise the backup is removed once the new file is in place.
#
# The block receives the current file content. Its return value becomes the
# new content; when it returns nil the content is left as it was.
#
# The new content is written to "<file_path>.tmp" next to the target, so the
# method does not depend on (or clobber a file named "temp" in) the current
# working directory.
def file_mod(file_path, backup = false)
  original = File.read(file_path)
  updated = yield(original)
  # A nil result from the block means "no change".
  updated = original if updated.nil?

  temp_path = "#{file_path}.tmp"
  backup_path = "#{file_path}.bak"
  File.write(temp_path, updated)

  # Swap the files: keep the original as .bak until the new one is in place.
  File.rename(file_path, backup_path)
  File.rename(temp_path, file_path)
  File.unlink(backup_path) unless backup
end

#format_wiki(text, config = {}) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 28

# Runs the wiki-markup clean-up pipeline over +text+ and returns the result.
#
# The steps are applied in a fixed order, each receiving the output of the
# previous one. Two groups of steps are optional:
# * config[:inline] truthy - skip correct_inline_template and
#   remove_templates;
# * config[:table] truthy  - skip remove_table.
def format_wiki(text, config = {})
  steps = %i[
    remove_complex
    escape_nowiki
    process_interwiki_links
    process_external_links
    unescape_nowiki
    remove_directive
    remove_emphasis
    mndash
    remove_hr
    remove_tag
  ]
  steps += %i[correct_inline_template remove_templates] unless config[:inline]
  steps << :remove_table unless config[:table]

  steps.reduce(text) { |current, step| send(step, current) }
end

#make_reference(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 249

# Rewrites reference-related markup in +str+ using the MAKE_REFERENCE_REGEX_*
# patterns, applied in order:
#   A -> newline, B -> removed, C -> "[ref]", D -> "[/ref]".
#
# Returns the rewritten string.
def make_reference(str)
  substitutions = [
    [MAKE_REFERENCE_REGEX_A, "\n"],
    [MAKE_REFERENCE_REGEX_B, ""],
    [MAKE_REFERENCE_REGEX_C, "[ref]"],
    [MAKE_REFERENCE_REGEX_D, "[/ref]"]
  ]
  substitutions.reduce(str) do |acc, (pattern, replacement)|
    acc.gsub(pattern) { replacement }
  end
end

#mndash(str) ⇒ `Object`



216
217
218

# File 'lib/wp2txt/utils.rb', line 216

# Replaces every match of MNDASH_REGEX in +str+ with an en dash and returns
# the resulting string.
def mndash(str)
  str.gsub(MNDASH_REGEX) { "–" }
end

#process_external_links(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 139

# Rewrites single-bracket "[...]" spans in +str+ via process_nested_structure.
#
# * Content that begins and ends with whitespace (with something in between)
#   is kept, stripped, as " (content) ".
# * Otherwise the content is split once on whitespace: with two parts the
#   second one is kept, with one part that part is kept, and empty content
#   becomes "".
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    next " (#{contents.strip}) " if /\A\s.+\s\z/ =~ contents

    pieces = contents.split(" ", 2)
    kept = pieces.size == 1 ? pieces.first : pieces.last
    kept || ""
  end
end

#process_interwiki_links(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 125

# Rewrites double-bracket "[[...]]" spans in +str+ via
# process_nested_structure.
#
# The content is split on "|". With a single field that field is kept;
# with several fields the first one is dropped and the rest are re-joined
# with "|"; empty content becomes "".
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    labels.empty? ? (target || "") : labels.join("|")
  end
end

#process_nested_structure(scanner, left, right, &block) ⇒ `Object`

parser for nested structure ####################

# File 'lib/wp2txt/utils.rb', line 60

# Repeatedly finds the first innermost +left+ ... +right+ pair in the
# scanner's string and replaces it (delimiters included) with the value the
# block returns for the enclosed text.
#
# scanner - a StringScanner over the text to process. Its string is
#           overwritten with the intermediate result after every replacement.
# left    - opening delimiter.
# right   - closing delimiter.
# block   - called with the text between one delimiter pair; must return the
#           replacement String.
#
# Processing stops as soon as a pass leaves the text unchanged, and that text
# is returned. If a StandardError is raised anywhere (including inside the
# block), the scanner's current string is returned instead.
#
# The passes run in a loop rather than through recursion, so the call stack
# does not grow with the number of replaced pairs. The content handed to the
# block is trimmed by the length of +right+, which keeps it correct when the
# two delimiters differ in length.
def process_nested_structure(scanner, left, right, &block)
  # Reuse the precompiled regexps for the common delimiter pairs.
  regex = case [left, right]
          when ["[", "]"] then SINGLE_SQUARE_BRACKET_REGEX
          when ["[[", "]]"] then DOUBLE_SQUARE_BRACKET_REGEX
          when ["{", "}"] then SINGLE_CURLY_BRACKET_REGEX
          when ["{{", "}}"] then DOUBLE_CURLY_BRACKET_REGEX
          when ["{|", "|}"] then CURLY_SQUARE_BRACKET_REGEX
          else Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})")
          end

  loop do
    buffer = +""
    has_left = false
    while (str = scanner.scan_until(regex))
      case scanner[1]
      when left
        buffer << str
        has_left = true
      when right
        if has_left
          # The most recent +left+ opens the innermost pair: drop it from the
          # buffer and substitute the block's result for the enclosed text.
          buffer = buffer[0...-left.size]
          buffer << block.call(str[0...-right.size])
          break
        else
          # Closing delimiter without a preceding opener: keep it verbatim.
          buffer << str
        end
      end
    end
    buffer << scanner.rest

    # No change in this pass means there is nothing left to replace.
    return buffer if buffer == scanner.string

    # Assigning the string also resets the scan position for the next pass.
    scanner.string = buffer
  end
rescue StandardError
  scanner.string
end

#remove_complex(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 241

# Applies the COMPLEX_REGEX_* substitutions to +str+ in order.
#
# Matches of COMPLEX_REGEX_01 are rewritten as their first capture wrapped in
# 《 》; matches of COMPLEX_REGEX_02 through COMPLEX_REGEX_05 are deleted.
#
# Returns the resulting string.
def remove_complex(str)
  rewrapped = str.gsub(COMPLEX_REGEX_01) { "《#{Regexp.last_match(1)}》" }
  removals = [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05]
  removals.inject(rewrapped) { |acc, pattern| acc.gsub(pattern) { "" } }
end

#remove_directive(str) ⇒ `Object`



190
191
192

# File 'lib/wp2txt/utils.rb', line 190

# Deletes every match of REMOVE_DIRECTIVES_REGEX from +str+ and returns the
# resulting string.
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end

#remove_emphasis(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 194

# Replaces every match of REMOVE_EMPHASIS_REGEX in +str+ with the match's
# second capture group, i.e. the text between the emphasis markers, and
# returns the resulting string.
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX) { Regexp.last_match(2) }
end

#remove_hr(str) ⇒ `Object`



220
221
222

# File 'lib/wp2txt/utils.rb', line 220

# Deletes every match of REMOVE_HR_REGEX from +str+ and returns the resulting
# string.
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end

#remove_html(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 228

# Strips selected HTML from a copy of +str+; the argument is not modified.
#
# First every self-closing tag ("<... />") is deleted. Then, for each of the
# tags div, gallery, timeline and noinclude, process_nested_structure is run
# with the delimiters "<tag" and "tag>" and an empty replacement.
#
# Returns the cleaned string.
def remove_html(str)
  without_self_closing = str.gsub(%r{<[^<>]+/>}) { "" }
  %w[div gallery timeline noinclude].reduce(without_self_closing) do |text, tag|
    process_nested_structure(StringScanner.new(text), "<#{tag}", "#{tag}>") { "" }
  end
end

#remove_inbetween(str, tagset = ["<", ">"]) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 180

# Deletes every span of +str+ that starts with tagset[0], ends with
# tagset[1] and contains none of the delimiter characters in between.
#
# str    - the String to clean.
# tagset - two-element Array with the opening and closing delimiter
#          (defaults to "<" and ">").
#
# Returns the cleaned string.
def remove_inbetween(str, tagset = ["<", ">"])
  excluded = Regexp.quote(tagset.uniq.join)
  opener = Regexp.escape(tagset[0])
  closer = Regexp.escape(tagset[1])
  pattern = Regexp.new("#{opener}[^#{excluded}]*#{closer}")
  str.gsub(pattern, "")
end

#remove_ref(str) ⇒ `Object`



224
225
226

# File 'lib/wp2txt/utils.rb', line 224

# Deletes every match of FORMAT_REF_REGEX from +str+ and returns the
# resulting string.
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end

#remove_table(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 169

# Removes "{| ... |}" spans from +str+ by running process_nested_structure
# with those delimiters and an empty replacement. Returns the result.
def remove_table(str)
  process_nested_structure(StringScanner.new(str), "{|", "|}") { "" }
end

#remove_tag(str) ⇒ `Object`



186
187
188

# File 'lib/wp2txt/utils.rb', line 186

# Deletes every match of REMOVE_TAG_REGEX from +str+ and returns the
# resulting string.
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end

#remove_templates(str) ⇒ `Object`

methods used from format_article ####################

# File 'lib/wp2txt/utils.rb', line 158

# Removes brace-delimited spans from +str+ in two passes of
# process_nested_structure, each with an empty replacement: first the
# "{{ ... }}" spans, then the "{ ... }" spans of what is left.
#
# Returns the result of the second pass.
def remove_templates(str)
  [["{{", "}}"], ["{", "}"]].reduce(str) do |text, (opener, closer)|
    process_nested_structure(StringScanner.new(text), opener, closer) { "" }
  end
end

#rename(files, ext = "txt") ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 343

# Renames numbered output files so their numeric suffixes are zero-padded to
# a common width, and appends the extension.
#
# files - Array of paths; a path ending in "-<digits>" has that number
#         padded, any other path only gets the extension.
# ext   - extension to append (without the dot), "txt" by default.
#
# The padding width is the number of digits of the longest numeric suffix
# among ALL given files. It is determined before any file is renamed so that
# every file is padded to the same width regardless of input order
# (e.g. "x-1", "x-10" -> "x-01.txt", "x-10.txt").
#
# Returns true.
def rename(files, ext = "txt")
  suffix = /-(\d+)\z/

  # num of digits necessary to name the last file generated
  maxwidth = files.map { |f| f.slice(suffix, 1).to_s.length }.max || 0

  files.each do |f|
    newname = f.sub(suffix) do
      "-" + format("%0#{maxwidth}d", Regexp.last_match(1).to_i)
    end
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end

#sec_to_str(int) ⇒ `Object`

convert int of seconds to string in the format 00:00:00

# File 'lib/wp2txt/utils.rb', line 359

# Formats a number of seconds as "HH:MM:SS".
#
# int - elapsed seconds. nil (or false) produces the placeholder "--:--:--".
#       Non-integer values are truncated to whole seconds first; without
#       that, a fractional hour count would cancel out the minutes.
#
# Hours are not capped, so values of 100 hours or more use more than two
# digits for the hour field.
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.to_i.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end

#special_chr(str) ⇒ `Object`



176
177
178

# File 'lib/wp2txt/utils.rb', line 176

# Returns the result of HTML_DECODER.decode for +str+.
# NOTE(review): HTML_DECODER looks like an HTMLEntities instance, so this
# presumably converts HTML entities into their characters — confirm.
def special_chr(str)
  HTML_DECODER.decode(str)
end

#unescape_nowiki(str) ⇒ `Object`

# File 'lib/wp2txt/utils.rb', line 118

# Replaces each placeholder matched by UNESCAPE_NOWIKI_REGEX in +str+ with
# the text stored in @nowikis under the placeholder's numeric id.
#
# A placeholder whose id is not present in @nowikis is replaced by nothing.
# @nowikis must already be a Hash when a placeholder is found.
#
# Returns the resulting string.
def unescape_nowiki(str)
  str.gsub(UNESCAPE_NOWIKI_REGEX) do
    key = Regexp.last_match(1).to_i
    @nowikis[key]
  end
end

Module: Wp2txt

Defined Under Namespace

Constant Summary collapse

Instance Method Summary collapse

Instance Method Details

#batch_file_mod(dir_path) ⇒ Object

#chrref_to_utf(num_str) ⇒ Object

#cleanup(text) ⇒ Object

#collect_files(str, regex = nil) ⇒ Object

#convert_characters(text, has_retried = false) ⇒ Object

#correct_inline_template(str) ⇒ Object

#correct_separator(input) ⇒ Object

#escape_nowiki(str) ⇒ Object

#file_mod(file_path, backup = false) ⇒ Object

#format_wiki(text, config = {}) ⇒ Object

#make_reference(str) ⇒ Object

#mndash(str) ⇒ Object

#process_external_links(str) ⇒ Object

#process_interwiki_links(str) ⇒ Object

#process_nested_structure(scanner, left, right, &block) ⇒ Object

#remove_complex(str) ⇒ Object

#remove_directive(str) ⇒ Object

#remove_emphasis(str) ⇒ Object

#remove_hr(str) ⇒ Object

#remove_html(str) ⇒ Object

#remove_inbetween(str, tagset = ["<", ">"]) ⇒ Object

#remove_ref(str) ⇒ Object

#remove_table(str) ⇒ Object

#remove_tag(str) ⇒ Object

#remove_templates(str) ⇒ Object

#rename(files, ext = "txt") ⇒ Object

#sec_to_str(int) ⇒ Object

#special_chr(str) ⇒ Object

#unescape_nowiki(str) ⇒ Object