Class: Wp2txt::Runner

Inherits: Object

Inheritance chain: Object → Wp2txt::Runner

Includes: Wp2txt

Defined in: lib/wp2txt.rb

Constant Summary

Constants included from Wp2txt

BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION

Instance Method Summary

#extract_text(&block) ⇒ Object
#fill_buffer ⇒ Object
#get_newline ⇒ Object
#get_page ⇒ Object
#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ Runner (constructor)

Returns a new instance of Runner.
#prepare ⇒ Object

Methods included from Wp2txt

#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki

Constructor Details

#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ `Runner`

Returns a new instance of Runner.

# File 'lib/wp2txt.rb', line 175

# Stores the run configuration and opens the input file via #prepare.
#
# @param input_file [String] path of the file to read pages from
# @param output_dir [String] directory the resulting ".txt" file is written to
# @param strip_tmarker [Boolean] forwarded to Article.new by #extract_text
# @param del_interfile [Boolean] when true, #extract_text deletes input_file
#   after it has been processed
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
  # Where to read from and where to write to.
  @input_file, @output_dir = input_file, output_dir
  # Behaviour switches consulted later by #extract_text.
  @strip_tmarker, @del_interfile = strip_tmarker, del_interfile
  # Output file handle; assigned once #extract_text actually writes something.
  @fp = nil
  prepare
end

Instance Method Details

#extract_text(&block) ⇒ `Object`

# File 'lib/wp2txt.rb', line 254

# Reads every page available from #get_page, converts each main-namespace
# page into an Article, passes it to the given block and writes the
# concatenated block results to "<output_dir>/<outfile_base>.txt".
#
# Pages whose title contains ":" are skipped, as are pages for which no
# <text> or <title> node can be found (the original code raised
# NoMethodError on such pages). Nothing is written when the cleaned-up output
# is empty. The input file handle is always closed, even when processing
# raises; the input file itself is deleted only after a successful run and
# only when @del_interfile is set.
#
# @yieldparam article [Article] the article built from one page
# @yieldreturn [String] text to append to the output for that page
# @return [nil]
def extract_text(&block)
  # Loop-invariant wrapper that gives each bare <page> fragment a root element
  # and a default namespace so the "xmlns:" XPath prefix below resolves.
  xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"

  begin
    # Drain the input first; #get_page returns false once no page is left.
    pages = []
    while (new_page = get_page)
      pages << new_page
    end

    output_text = +""
    pages.each do |page|
      xml = xmlns + page + "</mediawiki>"

      input = Nokogiri::XML(xml, nil, "UTF-8")
      text_node = input.xpath("//xmlns:text").first
      # A truncated or malformed page may carry no <text> element at all.
      next unless text_node

      title_node = text_node.parent.parent.at_css "title"
      next unless title_node

      title = title_node.content
      next if title.include?(":")

      text = text_node.content
      # Drop HTML comments but keep as many newlines as they spanned, so the
      # line structure of the remaining markup is unchanged.
      text.gsub!(/<!--(.*?)-->/m) { |comment| "\n" * comment.count("\n") }

      article = Article.new(text, title, @strip_tmarker)
      output_text << block.call(article)
    end

    output_text = cleanup(output_text)
    unless output_text.empty?
      outfilename = File.join(@output_dir, "#{@outfile_base}.txt")
      # Block form guarantees the output handle is closed even if puts raises.
      File.open(outfilename, "w") do |fp|
        @fp = fp
        fp.puts(output_text)
      end
    end
  ensure
    # Release the input handle on every path; it must be closed before the
    # file is deleted below.
    @file_pointer.close
  end

  File.delete(@input_file) if @del_interfile
  nil
end

#fill_buffer ⇒ `Object`

# File 'lib/wp2txt.rb', line 193

# Reads further chunks from @file_pointer into @buffer until the buffer holds
# more than one entry.
#
# The first line of each chunk is appended to the current last buffer entry
# (which may be an unfinished line from the previous chunk); the remaining
# lines are added as new entries. Whenever the entry at the end of the buffer
# is newline-terminated, an empty string is pushed so that the last entry is
# always the place where the next chunk continues.
#
# @return [true, nil] true once the buffer has more than one entry; nil when
#   the read raised or returned nil (no more data)
def fill_buffer
  loop do
    chunk = begin
      @file_pointer.read(10_485_760)
    rescue StandardError
      nil
    end
    return nil unless chunk

    # Split into lines, each keeping its trailing "\n"; a final piece without
    # a newline stays as the last element.
    head, *remainder = chunk.lines("\n")

    @buffer.last << head
    @buffer << +"" if head.end_with?("\n")
    @buffer.concat(remainder)
    @buffer << +"" if @buffer.last.end_with?("\n")

    return true if @buffer.size > 1
  end
end

#get_newline ⇒ `Object`

# File 'lib/wp2txt.rb', line 218

# Returns the next buffered line, refilling the buffer through #fill_buffer
# when only the (possibly unfinished) last entry is left.
#
# When #fill_buffer reports that no more data is available, the remaining
# entry is the final line of the input without a trailing newline. It is
# returned once if it is non-empty; previously it was silently dropped, so an
# input not ending in "\n" lost its last line.
#
# @return [String, nil] the next line, or nil when the input is exhausted
def get_newline
  @buffer ||= [+""]
  # Already drained on an earlier call: stay at end-of-input without reading.
  return nil if @buffer.empty?
  return @buffer.shift if @buffer.size > 1 || fill_buffer

  # No more data: hand out the unterminated tail, if there is one.
  tail = @buffer.shift
  tail.empty? ? nil : tail
end

#get_page ⇒ `Object`

# File 'lib/wp2txt.rb', line 229

# Collects the lines of the next <page> ... </page> section from #get_newline.
#
# Lines before the opening tag are ignored. Reading stops at the first line
# containing "</page>" or when #get_newline returns nil, in which case
# whatever was collected so far is returned.
#
# @return [String, false] the page text tagged as UTF-8, or false when
#   nothing was collected; if a StandardError is raised along the way, the
#   text collected up to that point is returned as-is
def get_page
  collected = +""
  capturing = false

  loop do
    line = get_newline
    break unless line

    if line.match?(/<page>/)
      collected << line
      capturing = true
    elsif line.match?(%r{</page>})
      collected << line
      break
    elsif capturing
      collected << line
    end
  end

  collected.empty? ? false : collected.force_encoding("utf-8")
rescue StandardError
  collected
end

#prepare ⇒ `Object`

# File 'lib/wp2txt.rb', line 184

# Opens @input_file for reading and initialises the bookkeeping state:
# the input size in bytes, the base name used for the output file (input file
# name without its extension) and the running size counter.
#
# @return [true]
# @raise [SystemCallError] if the input file cannot be stat'ed or opened
def prepare
  @infile_size = File.stat(@input_file).size
  # File.open instead of Kernel#open: Kernel#open treats a path starting with
  # "|" as a command to spawn, which must never happen for a caller-supplied
  # file name.
  @file_pointer = File.open(@input_file)
  @outfile_base = File.basename(@input_file, ".*")
  @total_size = 0
  true
end

Class: Wp2txt::Runner

Constant Summary

Constants included from Wp2txt

Instance Method Summary collapse

Methods included from Wp2txt

Constructor Details

#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ Runner

Instance Method Details

#extract_text(&block) ⇒ Object

#fill_buffer ⇒ Object

#get_newline ⇒ Object

#get_page ⇒ Object

#prepare ⇒ Object

#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ `Runner`

#extract_text(&block) ⇒ `Object`

#fill_buffer ⇒ `Object`

#get_newline ⇒ `Object`

#get_page ⇒ `Object`

#prepare ⇒ `Object`