Class: Wp2txt::Runner

Inherits:
Object
  • Object
show all
Includes:
Wp2txt
Defined in:
lib/wp2txt.rb

Constant Summary

Constants included from Wp2txt

BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION

Instance Method Summary collapse

Methods included from Wp2txt

#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki

Constructor Details

#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ Runner

Returns a new instance of Runner.



175
176
177
178
179
180
181
182
# File 'lib/wp2txt.rb', line 175

# Builds a runner for a single input file and opens that file via #prepare.
#
# @param input_file [String] path of the file to read
# @param output_dir [String] directory the extracted text is written to
# @param strip_tmarker [Boolean] stored and handed to Article.new by extract_text
# @param del_interfile [Boolean] when true, extract_text deletes the input file once done
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
  # Where to read from and where to write to.
  @input_file = input_file
  @output_dir = output_dir

  # Behaviour switches consulted by extract_text.
  @strip_tmarker = strip_tmarker
  @del_interfile = del_interfile

  # Output handle; assigned when extract_text writes its result.
  @fp = nil

  prepare
end

Instance Method Details

#extract_text(&block) ⇒ Object



254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/wp2txt.rb', line 254

# Converts every page of the input file to text and writes the result to
# "<@output_dir>/<@outfile_base>.txt".
#
# All pages are first collected with get_page. Each page is wrapped in a
# <mediawiki> root element and parsed with Nokogiri. Pages without a <text>
# or <title> element, and pages whose title contains ":", are skipped.
# HTML comments are removed from the wikitext (their line breaks are kept so
# the line structure is unchanged) and the page is handed to the block as an
# Article. The block's return values are concatenated, passed through
# cleanup, and written out unless the result is empty.
#
# The input handle is closed even when processing fails; the input file
# itself is deleted only after a successful run and only when
# @del_interfile is set.
#
# @yieldparam article [Article] the article built from one page
# @yieldreturn [String] text to append to the output
# @return [nil]
def extract_text(&block)
  output_text = +""

  pages = []
  while (raw_page = get_page)
    pages << raw_page
  end

  # Root element the single pages are wrapped in before parsing; built once
  # because it is identical for every page.
  root_open = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"

  pages.each do |raw_page|
    doc = Nokogiri::XML(root_open + raw_page + "</mediawiki>", nil, 'UTF-8')
    text_node = doc.xpath("//xmlns:text").first
    next unless text_node # nothing to extract from a page without <text>

    title_node = text_node.parent.parent.at_css "title"
    next unless title_node

    title = title_node.content
    # Titles with a colon are skipped (presumably namespaced pages such as
    # "Category:..." -- confirm against the dump format).
    next if title.include?(":")

    wikitext = text_node.content
    # Drop HTML comments but keep as many newlines as each one spanned.
    wikitext.gsub!(/<!--(.*?)-->/m) { |comment| "\n" * comment.count("\n") }

    article = Article.new(wikitext, title, @strip_tmarker)
    output_text << block.call(article)
  end

  output_text = cleanup(output_text)
  unless output_text.empty?
    outfilename = File.join(@output_dir, @outfile_base + ".txt")
    # Block form closes the handle even if writing fails; @fp is still
    # assigned so it ends up referring to the (closed) output file as before.
    File.open(outfilename, "w") do |fp|
      @fp = fp
      fp.puts(output_text)
    end
  end

  # Close before deleting: an open handle can prevent deletion on some platforms.
  @file_pointer.close
  File.delete(@input_file) if @del_interfile
  nil
ensure
  # Release the input handle on the failure path too; the input file itself
  # is deliberately left in place when an error occurred.
  @file_pointer.close if @file_pointer && !@file_pointer.closed?
end

#fill_buffer ⇒ Object



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# File 'lib/wp2txt.rb', line 193

# Reads from @file_pointer until @buffer holds at least one complete line.
#
# Data is read in chunks of 10_485_760 bytes and split after every "\n".
# The first piece of a chunk is appended to the last buffer element (the
# line left unfinished by the previous chunk); the remaining pieces become
# new elements. When the data read so far ends with a newline, an empty
# string is pushed so that the last element is always the line still being
# assembled.
#
# @return [true, nil] true once the buffer has more than one element;
#   nil when nothing more could be read (end of input or a read error)
def fill_buffer
  @buffer = [+""] if @buffer.nil? || @buffer.empty?

  loop do
    begin
      chunk = @file_pointer.read(10_485_760)
    rescue StandardError
      # Best effort: a failed read is treated like end of input.
      return nil
    end
    return nil if chunk.nil? || chunk.empty?

    # String#lines keeps the "\n" terminators, so only the final piece can
    # be an unfinished line.
    first_piece, *remaining = chunk.lines
    @buffer.last << first_piece
    @buffer.concat(remaining)

    # Start a fresh pending line when the chunk ended exactly on a newline.
    @buffer << +"" if @buffer.last.end_with?("\n")
    break if @buffer.size > 1
  end
  true
end

#get_newline ⇒ Object



218
219
220
221
222
223
224
225
226
227
# File 'lib/wp2txt.rb', line 218

# Returns the next line of the input, reading more data when needed.
#
# @buffer holds the lines read so far. fill_buffer appends incoming data to
# the last element, so that element may still be incomplete; more data is
# therefore requested whenever it is the only one left.
#
# @return [String, nil] the next line (including its "\n" when present),
#   or nil once the input is exhausted
def get_newline
  @buffer ||= [+""]

  if @buffer.size == 1 && !fill_buffer
    # Nothing more to read. The remaining element is the final line when the
    # input does not end with a newline; return it once instead of dropping
    # it. An empty remainder means there was nothing after the last newline.
    leftover = @buffer.shift
    return leftover.empty? ? nil : leftover
  end

  # Array#shift yields nil on an empty buffer, i.e. after the input is used up.
  @buffer.shift
end

#get_page ⇒ Object



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/wp2txt.rb', line 229

# Reads lines until one complete <page>...</page> element has been collected.
#
# Lines before the opening tag are discarded. The line containing "<page>",
# every following line, and the line containing "</page>" are concatenated.
# A line is checked for the opening tag first, so a line holding both tags
# only opens the page.
#
# If reading fails part-way, whatever was collected so far is returned
# (best effort); if nothing was collected the result is false, so callers
# can rely on a falsy value meaning "no page".
#
# @return [String, false] the page text tagged as UTF-8, or false when no
#   page content could be read
def get_page
  inside_page = false
  page = +""

  begin
    while (line = get_newline)
      case line
      when /<page>/
        inside_page = true
        page << line
      when %r{</page>}
        page << line
        break
      else
        page << line if inside_page
      end
    end
  rescue StandardError
    # Best effort: keep the lines gathered before the failure.
  end

  page.empty? ? false : page.force_encoding("utf-8")
end

#prepare ⇒ Object



184
185
186
187
188
189
190
191
# File 'lib/wp2txt.rb', line 184

# Opens the input file and initialises the state used while reading it.
#
# Sets @infile_size (size of the input file in bytes), @file_pointer (read
# handle on the input file), @outfile_base (input file name without
# directory and extension) and @total_size (0).
#
# @return [true]
# @raise [SystemCallError] if the input file cannot be inspected or opened
def prepare
  @infile_size = File.size(@input_file)
  # File.open always treats its argument as a path; Kernel#open would run a
  # path that starts with "|" as a shell command.
  @file_pointer = File.open(@input_file)
  @outfile_base = File.basename(@input_file, ".*")
  @total_size = 0
  true
end