Class: Wp2txt::Runner
Constant Summary
Constants included
from Wp2txt
BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION
Instance Method Summary
collapse
Methods included from Wp2txt
#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki
Constructor Details
#initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true) ⇒ Runner
Returns a new instance of Runner.
175
176
177
178
179
180
181
182
|
# File 'lib/wp2txt.rb', line 175
# Builds a runner for a single input file.
#
# Stores the constructor arguments in instance variables, clears the
# output file handle (@fp) and then calls #prepare.
#
# @param input_file [String] path of the file to process
# @param output_dir [String] directory that receives the output file
# @param strip_tmarker [Boolean] stored and later handed to Article.new
# @param del_interfile [Boolean] when true the input file is deleted once
#   it has been processed
def initialize(input_file, output_dir = ".", strip_tmarker = false, del_interfile = true)
  @input_file, @output_dir = input_file, output_dir
  @strip_tmarker, @del_interfile = strip_tmarker, del_interfile
  @fp = nil
  prepare
end
|
Instance Method Details
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
|
# File 'lib/wp2txt.rb', line 254
# NOTE(review): the method name is missing from this listing (it reads
# "def (&block)"), which looks like an extraction artifact -- restore it from
# lib/wp2txt.rb line 254 before using this code.
#
# Collects every page that get_page can return, converts each one through the
# supplied block and writes the combined result to
# "<@output_dir>/<@outfile_base>.txt".
#
# The block receives an Article built from the page text and title; its return
# value is appended to the output text, so it must be something String#<<
# accepts.
#
# Side effects: closes @file_pointer and, when @del_interfile is truthy,
# deletes @input_file.
def (&block)
title = nil
output_text = +""
pages = []
data_empty = false
# Phase 1: keep pulling pages until get_page returns a falsy value.
until data_empty
new_page = get_page
if new_page
pages << new_page
else
data_empty = true
end
# Everything below runs only once, on the iteration where the input ran dry.
next unless data_empty
pages.each do |page|
# Each page fragment is wrapped in a <mediawiki> root element that declares
# the export-0.5 namespace so that it can be parsed on its own.
xmlns = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">' + "\n"
xml = xmlns + page + "</mediawiki>"
input = Nokogiri::XML(xml, nil, 'UTF-8')
# From here on `page` is the first <text> node, no longer the raw string.
# NOTE(review): if the fragment has no <text> element this is nil and the
# next line raises NoMethodError -- confirm whether that can happen.
page = input.xpath("//xmlns:text").first
pp_title = page.parent.parent.at_css "title"
title = pp_title.content
# Titles containing a colon are skipped (presumably namespaced pages such as
# "Category:..." -- TODO confirm).
next if /:/ =~ title
text = page.content
# Remove <!-- ... --> comments but keep as many newlines as each comment
# contained.
text.gsub!(/<!--(.*?)-->/m) do |content|
num_of_newlines = content.count("\n")
if num_of_newlines.zero?
+""
else
"\n" * num_of_newlines
end
end
article = Article.new(text, title, @strip_tmarker)
page_text = block.call(article)
output_text << page_text
end
# Phase 2: post-process the accumulated text and write it out when non-empty.
output_text = cleanup(output_text)
unless output_text.empty?
outfilename = File.join(@output_dir, @outfile_base + ".txt")
# NOTE(review): @fp is not closed if puts raises; a File.open block would
# guarantee the close.
@fp = File.open(outfilename, "w")
@fp.puts(output_text)
@fp.close
end
@file_pointer.close
File.delete(@input_file) if @del_interfile
output_text = +""
end
end
|
#fill_buffer ⇒ Object
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
|
# File 'lib/wp2txt.rb', line 193
# Reads more input from @file_pointer and appends it, split into lines, to
# @buffer.
#
# The last element of @buffer is always the line still being assembled: the
# first piece of every chunk is appended to it, and a fresh empty string is
# pushed whenever the preceding piece ended in "\n". Chunks of up to
# 10_485_760 bytes are read until @buffer holds more than one element, i.e.
# at least one finished line is available.
#
# @buffer must already be a non-empty Array of mutable Strings
# (get_newline initialises it to [+""]).
#
# @return [true, nil] true when @buffer now holds more than one element;
#   nil when the read raised a StandardError or hit end of input
def fill_buffer
  loop do
    begin
      chunk = @file_pointer.read(10_485_760)
    rescue StandardError
      return nil
    end
    return nil unless chunk

    # Split the chunk after every "\n"; a trailing piece without a newline is
    # kept as the final element.
    temp_buf = []
    ss = StringScanner.new(chunk)
    temp_buf << ss[0] while ss.scan(/.*?\n/m)
    temp_buf << ss.rest unless ss.eos?

    # The first piece continues whatever partial line the buffer ended with.
    new_first_line = temp_buf.shift
    @buffer.last << new_first_line
    @buffer << +"" if new_first_line[-1, 1] == "\n"
    @buffer += temp_buf unless temp_buf.empty?
    # Keep an open (possibly empty) partial line at the end of the buffer.
    @buffer << +"" if @buffer.last[-1, 1] == "\n"
    break if @buffer.size > 1
  end
  true
end
|
#get_newline ⇒ Object
218
219
220
221
222
223
224
225
226
227
|
# File 'lib/wp2txt.rb', line 218
# Hands out the next buffered line.
#
# @buffer is created on first use. When only one element is left,
# fill_buffer is asked for more input first; if that fails, nil is returned.
#
# @return [String, nil] the first element of @buffer, or nil when no more
#   input could be read or the buffer is empty
def get_newline
  @buffer ||= [+""]
  return nil if @buffer.size == 1 && !fill_buffer
  return nil if @buffer.empty?

  @buffer.shift
end
|
#get_page ⇒ Object
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
|
# File 'lib/wp2txt.rb', line 229
# Assembles the next page from the lines supplied by get_newline.
#
# Lines are accumulated starting at one matching /<page>/ and ending with one
# matching </page>; lines seen outside a page are dropped. The <page> test is
# made first, so a line containing both markers only opens a page.
#
# @return [String, false] the collected text tagged as UTF-8, or false when
#   nothing was collected. If a StandardError is raised, whatever had been
#   collected so far is returned as is.
def get_page
  collected = +""
  collecting = false
  while (line = get_newline)
    case line
    when /<page>/
      collected << line
      collecting = true
    when %r{</page>}
      collected << line
      break
    else
      collected << line if collecting
    end
  end
  collected.empty? ? false : collected.force_encoding("utf-8")
rescue StandardError
  collected
end
|
#prepare ⇒ Object
184
185
186
187
188
189
190
191
|
# File 'lib/wp2txt.rb', line 184
# Opens the input file and initialises the bookkeeping state.
#
# Sets @infile_size (size of @input_file in bytes), @file_pointer (the opened
# file), @outfile_base (the file's basename without its extension) and
# @total_size (0).
#
# File.open is used instead of Kernel#open because Kernel#open runs a path
# that starts with "|" as a shell command.
#
# @return [true]
# @raise [SystemCallError] if @input_file cannot be stat'ed or opened
def prepare
  @infile_size = File.size(@input_file)
  @file_pointer = File.open(@input_file)
  @outfile_base = File.basename(@input_file, ".*")
  @total_size = 0
  true
end
|