Class: Wp2txt::Splitter

Inherits:

Object

Object
Wp2txt::Splitter

Includes:: Wp2txt

Defined in:: lib/wp2txt.rb

Constant Summary

Constants included from Wp2txt

BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION

Instance Method Summary collapse

#command_exist?(command) ⇒ Boolean

Checks whether a given command exists: returns its path if it does, false if not.
#file_size(file) ⇒ Object
#fill_buffer ⇒ Object

Reads text data from the input stream (decompressed bz2 or plain XML) in chunks of up to 10 megabytes (10,485,760 bytes).
#get_newline ⇒ Object
#initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false) ⇒ Splitter constructor

A new instance of Splitter.
#prepare ⇒ Object

Opens the input file (bz2-compressed or plain XML) for reading and creates the first output file; the input size is recorded only for plain (uncompressed) input.
#split_file ⇒ Object

Methods included from Wp2txt

#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki

Constructor Details

#initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false) ⇒ `Splitter`

Returns a new instance of Splitter.

# File 'lib/wp2txt.rb', line 10

# Sets up a splitter for +input_file+ and immediately opens the input and the
# first output file by calling #prepare.
#
# @param input_file [String] path of the dump to split
# @param output_dir [String] directory that receives the output files
# @param tfile_size [Numeric] size threshold per output file; #split_file
#   multiplies it by 1024 * 1024 before comparing it with byte counts
# @param bz2_gem [Boolean] when truthy, "bzip2-ruby" is required and #prepare
#   reads the input through Bzip2::Reader
def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
  # The bzip2 binding is loaded lazily, only when the caller asks for it.
  require "bzip2-ruby" if bz2_gem
  @bz2_gem = bz2_gem
  @tfile_size = tfile_size
  @output_dir = output_dir
  @input_file = input_file
  @fp = nil
  prepare
end

Instance Method Details

#command_exist?(command) ⇒ `Boolean`

Checks whether a given command exists: returns its path if it does, false if not.

Returns:

(Boolean)

# File 'lib/wp2txt.rb', line 47

# Looks a command up with the external `which` utility and reports the result
# on standard output.
#
# The command is tried exactly as given first and then by its basename, so a
# stale absolute path still resolves when the tool is on the PATH.
#
# @param command [String] command name or path
# @return [String, false] the path printed by `which`, or false when the
#   command cannot be found (or `which` itself cannot be run)
def command_exist?(command)
  basename = File.basename(command)
  print "Checking #{basename}: "
  [command, basename].uniq.each do |candidate|
    # Argv form: no shell is involved, so the argument is never interpreted as
    # shell syntax. It also avoids Kernel#open("| ..."), which is deprecated
    # since Ruby 3.3.
    first_line = IO.popen(["which", candidate], err: File::NULL, &:gets)
    # `which` prints nothing for an unknown command, so gets returns nil.
    path = first_line.to_s.strip
    next if path.empty?

    puts "detected [#{path}]"
    return path
  end
  puts "#{basename} not found"
  false
rescue StandardError
  # E.g. `which` is not installed: treat it the same as "not found".
  puts "#{basename} not found"
  false
end

#file_size(file) ⇒ `Object`

# File 'lib/wp2txt.rb', line 20

# Measures how many bytes can be read from +file+ by consuming it to the end
# in 10 MiB chunks.
#
# A read error is treated like end of input, so the bytes counted up to that
# point are returned.
#
# @param file [#read] an IO-like object positioned at the start of the data;
#   it is left at end of input afterwards
# @return [Integer] number of bytes read
def file_size(file)
  chunk_size = 10_485_760
  total = 0

  loop do
    chunk = begin
      file.read(chunk_size)
    rescue StandardError
      nil
    end
    break unless chunk

    # bytesize, not size: the result is a byte count whatever encoding the
    # reader tags its strings with.
    total += chunk.bytesize
  end
  total
end

#fill_buffer ⇒ `Object`

Reads text data from the input stream (decompressed bz2 or plain XML) in chunks of up to 10 megabytes (10,485,760 bytes).

# File 'lib/wp2txt.rb', line 100

# Reads from @file_pointer in chunks of up to 10 MiB (10_485_760 bytes) and
# appends the data to @buffer as lines, until @buffer holds at least one
# complete line.
#
# The last element of @buffer is always the line still being assembled: it is
# either a partial line without a trailing "\n" or an empty string. @buffer
# must already contain that element when this method is called.
#
# @return [true, nil] true when data was added; nil at end of input or when
#   reading fails (including when @file_pointer is nil)
def fill_buffer
  loop do
    chunk = begin
      @file_pointer.read(10_485_760)
    rescue StandardError
      nil
    end
    # An empty chunk is treated like end of input; there is nothing to append.
    return nil if chunk.nil? || chunk.empty?

    # Split after every "\n"; a final piece without "\n" is an unfinished line.
    pieces = chunk.lines("\n")
    # The first piece continues the unfinished line left by the previous chunk.
    @buffer.last << pieces.shift
    @buffer.concat(pieces)
    # Keep an unfinished-line slot at the end for the next chunk to extend.
    @buffer << +"" if @buffer.last.end_with?("\n")
    break if @buffer.size > 1
  end
  true
end

#get_newline ⇒ `Object`

# File 'lib/wp2txt.rb', line 125

# Returns the next line of input, refilling @buffer through #fill_buffer when
# only the unfinished trailing line is left.
#
# When the input ends without a final "\n", the unterminated last line is
# returned once instead of being dropped; after that every call returns nil.
#
# @return [String, nil] the next line (normally ending in "\n"), or nil at end
#   of input
def get_newline
  @buffer ||= [+""]
  if @buffer.size == 1 && !fill_buffer
    # No more input: hand out whatever is left of the unfinished line.
    leftover = @buffer.shift
    return leftover.empty? ? nil : leftover
  end
  # shift returns nil by itself once the buffer has been drained.
  @buffer.shift
end

#prepare ⇒ `Object`

Opens the input file (bz2-compressed or plain XML) for reading and creates the first output file; the input size is recorded only for plain (uncompressed) input.

# File 'lib/wp2txt.rb', line 69

# Opens the input for reading and creates the first output file.
#
# A file whose name ends in ".bz2" is read through Bzip2::Reader (when
# @bz2_gem is set), through bunzip2.exe on Windows, or through the first of
# lbzip2, pbzip2 or bzip2 that #command_exist? finds. Any other file is
# opened as plain text and its size is stored in @infile_size.
#
# Sets @file_pointer (input), @fp (current output file), @outfiles,
# @outfile_base, @file_index and @total_size.
#
# @return [true]
def prepare
  # If output_dir is not specified, output in the same directory
  # as the input file.
  @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

  # Escaped dot and \z: only a real ".bz2" extension counts as compressed.
  if /\.bz2\z/.match?(@input_file)
    if @bz2_gem
      file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
    elsif Gem.win_platform?
      # Argv form: the path is passed as one argument, even with spaces.
      file = IO.popen(["bunzip2.exe", "-c", @input_file])
    elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
      file = IO.popen([bzpath, "-c", "-d", @input_file])
    end
    # NOTE: when no decompressor is found, file stays nil here; fill_buffer
    # rescues the resulting read error and reports end of input.
  else # meaning that it is a text file
    @infile_size = File.size(@input_file)
    # File.open, not Kernel#open: a path starting with "|" must not be
    # executed as a command.
    file = File.open(@input_file)
  end

  # create basename of output file
  @outfile_base = "#{File.basename(@input_file, '.*')}-"
  @total_size = 0
  @file_index = 1
  outfilename = File.join(@output_dir, @outfile_base + @file_index.to_s)
  @outfiles = [outfilename]
  @fp = File.open(outfilename, "w")
  @file_pointer = file
  true
end

#split_file ⇒ `Object`

# File 'lib/wp2txt.rb', line 136

# Copies the input, line by line, into a sequence of output files.
#
# Lines come from #get_newline. Once more than @tfile_size * 1024 * 1024
# bytes have been collected for the current file, the file is finished at the
# next line containing "</page", so a page is never cut in half. The finished
# files are finally handed to rename(@outfiles, "xml") (defined outside this
# class body; presumably it adds the extension).
#
# @return [Object] whatever rename returns
def split_file
  pending = +""
  limit_reached = false
  rotated_path = nil

  while (line = get_newline)
    @count = (@count || 0) + 1
    @size_read = (@size_read || 0) + line.bytesize
    @total_size += line.bytesize
    pending << line
    limit_reached = true if @total_size > (@tfile_size * 1024 * 1024)

    # Only rotate at the end of a page, even when the limit was hit earlier.
    if limit_reached && line =~ %r{</page}
      @fp.puts(pending)
      @fp.close
      pending = +""
      limit_reached = false
      @total_size = 0

      @file_index += 1
      rotated_path = File.join(@output_dir, @outfile_base + @file_index.to_s)
      @outfiles << rotated_path
      @fp = File.open(rotated_path, "w")
    end
  end

  @fp.puts(pending) unless pending == ""
  @fp.close

  # rotated_path is nil when no rotation happened; in that case the first
  # output file is kept even if it is empty.
  if rotated_path && File.size(rotated_path).zero?
    File.delete(rotated_path)
    @outfiles.delete(rotated_path)
  end

  rename(@outfiles, "xml")
end

Class: Wp2txt::Splitter

Constant Summary

Constants included from Wp2txt

Instance Method Summary collapse

Methods included from Wp2txt

Constructor Details

#initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false) ⇒ Splitter

Instance Method Details

#command_exist?(command) ⇒ Boolean

#file_size(file) ⇒ Object

#fill_buffer ⇒ Object

#get_newline ⇒ Object

#prepare ⇒ Object