Class: Wp2txt::Splitter

Inherits:
Object
  • Object
show all
Includes:
Wp2txt
Defined in:
lib/wp2txt.rb

Constant Summary

Constants included from Wp2txt

BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION

Instance Method Summary collapse

Methods included from Wp2txt

#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki

Constructor Details

#initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false) ⇒ Splitter

Returns a new instance of Splitter.



10
11
12
13
14
15
16
17
18
# File 'lib/wp2txt.rb', line 10

# Build a splitter for one input dump and immediately open its files.
#
# @param input_file [String] path of the dump to split
# @param output_dir [String] directory that receives the split files
# @param tfile_size [Integer] multiplied by 1024 * 1024 elsewhere to get the
#   per-file byte threshold
# @param bz2_gem [Boolean] when truthy, load and use the bzip2-ruby gem
def initialize(input_file, output_dir = ".", tfile_size = 10, bz2_gem = false)
  # Lazy-load the optional gem first so a missing gem fails before any
  # state is set up or any file is opened.
  require "bzip2-ruby" if bz2_gem

  @bz2_gem = bz2_gem
  @tfile_size = tfile_size
  @output_dir = output_dir
  @input_file = input_file
  @fp = nil # handle of the output file currently being written
  prepare
end

Instance Method Details

#command_exist?(command) ⇒ Boolean

check if a given command exists: return the path if it does, return false if not

Returns:

  • (Boolean)


47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/wp2txt.rb', line 47

# Look up an external command with `which`.
#
# The command is tried exactly as given first and then, if that yields
# nothing, by its basename alone. Progress is reported on standard output.
#
# @param command [String] command name or path to look for
# @return [String, false] the path printed by `which`, or false when the
#   command is not found or `which` itself cannot be run
def command_exist?(command)
  basename = File.basename(command)
  print "Checking #{basename}: "
  [command, basename].uniq.each do |candidate|
    # Argument-vector form: no shell is started, so the candidate is never
    # interpreted as shell syntax. `gets` is nil when `which` prints nothing,
    # hence the to_s before strip.
    path = IO.popen(["which", candidate], err: File::NULL) { |io| io.gets.to_s.strip }
    next if path.empty?

    puts "detected [#{path}]"
    return path
  end
  puts "#{basename} not found"
  false
rescue StandardError
  # e.g. `which` is not installed on this system
  puts "#{basename} not found"
  false
end

#file_size(file) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/wp2txt.rb', line 20

# Count the bytes that can be read from +file+ by draining it in 10 MiB chunks.
#
# The stream is consumed in the process. A read error is treated like end of
# input, so the total accumulated so far is returned.
#
# @param file [#read] object responding to read(length)
# @return [Integer] number of bytes read
def file_size(file)
  chunk_size = 10_485_760
  size = 0

  loop do
    chunk = begin
      file.read(chunk_size)
    rescue StandardError
      nil
    end
    break unless chunk

    size += chunk.bytesize
  end
  size
end

#fill_buffer ⇒ Object

Reads text data from the input (bz2-compressed or plain XML) in chunks of 10 megabytes (10,485,760 bytes).



100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/wp2txt.rb', line 100

# Read from @file_pointer in 10 MiB chunks and append the lines to @buffer
# until the buffer holds at least one complete line.
#
# The last element of @buffer is always the line still being assembled: the
# first piece of each chunk is appended to it, and a fresh empty string is
# pushed whenever the chunk ends exactly on a newline.
#
# @return [true, nil] true once @buffer has more than one element; nil when
#   nothing more can be read (end of input or a read error)
def fill_buffer
  loop do
    chunk = begin
      @file_pointer.read(10_485_760)
    rescue StandardError
      nil
    end
    # An empty chunk is treated like end of input as well; appending its
    # (non-existent) first line below would otherwise raise.
    return nil if chunk.nil? || chunk.empty?

    # lines keeps the trailing "\n" on every complete line
    lines = chunk.lines("\n")
    @buffer.last << lines.shift
    @buffer.concat(lines)
    @buffer << +"" if @buffer.last.end_with?("\n")
    break if @buffer.size > 1
  end
  true
end

#get_newline ⇒ Object



125
126
127
128
129
130
131
132
133
134
# File 'lib/wp2txt.rb', line 125

# Return the next line of input, refilling @buffer through fill_buffer when
# only the line still being assembled is left.
#
# Once the input is exhausted, a non-empty leftover fragment (input that does
# not end with a newline) is returned once as the final line.
#
# @return [String, nil] the next line, or nil when no input remains
def get_newline
  @buffer ||= [+""]
  fill_buffer if @buffer.size == 1
  return @buffer.shift if @buffer.size > 1

  # Nothing more could be read: hand out what is left, if anything.
  rest = @buffer.shift
  rest unless rest.nil? || rest.empty?
end

#prepare ⇒ Object

Opens the input file (bz2 or plain XML) for reading and creates the first output file; the input size is recorded only for plain XML input.



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# File 'lib/wp2txt.rb', line 69

# Open the input for reading and create the first output file.
#
# Input whose name ends in ".bz2" is decompressed through the bzip2-ruby gem
# (when @bz2_gem is set), through bunzip2.exe on Windows, or through the first
# of lbzip2 / pbzip2 / bzip2 that command_exist? finds. Any other input is
# opened as a plain file and its size is stored in @infile_size.
#
# @return [true]
def prepare
  # if output_dir is not specified, output in the same directory
  # as the input file
  @output_dir = File.dirname(@input_file) if !@output_dir && @input_file

  if /\.bz2\z/ =~ @input_file
    if @bz2_gem
      file = Bzip2::Reader.new File.open(@input_file, "r:UTF-8")
    elsif Gem.win_platform?
      # argument-vector form keeps paths with spaces or shell characters intact
      file = IO.popen(["bunzip2.exe", "-c", @input_file])
    elsif (bzpath = command_exist?("lbzip2") || command_exist?("pbzip2") || command_exist?("bzip2"))
      file = IO.popen([bzpath, "-c", "-d", @input_file])
    end
    # NOTE(review): when no decompressor is found, file stays nil and is
    # stored in @file_pointer as is - confirm whether this should raise.
  else # meaning that it is a text file
    @infile_size = File.stat(@input_file).size
    file = File.open(@input_file)
  end

  # create basename of output file
  @outfile_base = "#{File.basename(@input_file, ".*")}-"
  @total_size = 0
  @file_index = 1
  outfilename = File.join(@output_dir, "#{@outfile_base}#{@file_index}")
  @outfiles = [outfilename]
  @fp = File.open(outfilename, "w")
  @file_pointer = file
  true
end

#split_file ⇒ Object



136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/wp2txt.rb', line 136

# Copy the input line by line into numbered output files.
#
# Text is collected in memory; once more than @tfile_size * 1024 * 1024 bytes
# have been collected, the current file is finished at the next line that
# contains "</page" and a new one is started. A trailing file left empty by
# the last switch is removed, then the produced files are handed to rename.
#
# @return [Object] whatever rename(@outfiles, "xml") returns
def split_file
  pending = +""
  limit_reached = false
  last_opened = nil

  while (line = get_newline)
    @count = (@count || 0) + 1
    @size_read = (@size_read || 0) + line.bytesize
    @total_size += line.bytesize
    pending << line
    limit_reached = true if @total_size > (@tfile_size * 1024 * 1024)

    # a file is only finished on a page boundary, even when over the limit
    if limit_reached && %r{</page} =~ line
      @fp.puts(pending)
      pending = +""
      @total_size = 0
      limit_reached = false
      @fp.close
      @file_index += 1
      last_opened = File.join(@output_dir, @outfile_base + @file_index.to_s)
      @outfiles << last_opened
      @fp = File.open(last_opened, "w")
    end
  end

  @fp.puts(pending) unless pending.empty?
  @fp.close

  # drop the file opened by the last switch if nothing was written to it
  if last_opened && File.size(last_opened).zero?
    File.delete(last_opened)
    @outfiles.delete(last_opened)
  end

  rename(@outfiles, "xml")
end