Module: Wp2txt
- Defined in:
- lib/wp2txt.rb,
lib/wp2txt/regex.rb,
lib/wp2txt/utils.rb,
lib/wp2txt/article.rb,
lib/wp2txt/version.rb
Defined Under Namespace
Classes: Article, Runner, Splitter
Constant Summary collapse
- HTML_DECODER =
Constants that hold precompiled regexps, to save the cost of regenerating them. Names with a trailing number 1 match an opening tag/markup, names with a trailing number 2 match a closing tag/markup, and names without a trailing number match both opening and closing tags/markups.
HTMLEntities.new
- ENTITIES =
['&nbsp;', '&lt;', '&gt;', '&amp;', '&quot;'].zip([' ', '<', '>', '&', '"'])
- HTML_HASH =
- HTML_REGEX =
Regexp.new("(" + HTML_HASH.keys.join("|") + ")")
- ML_TEMPLATE_ONSET_REGEX =
Regexp.new('^\{\{[^\}]*$')
- ML_TEMPLATE_END_REGEX =
Regexp.new('\}\}\s*$')
- ML_LINK_ONSET_REGEX =
Regexp.new('^\[\[[^\]]*$')
- ML_LINK_END_REGEX =
Regexp.new('\]\]\s*$')
- ISOLATED_TEMPLATE_REGEX =
Regexp.new('^\s*\{\{.+\}\}\s*$')
- ISOLATED_TAG_REGEX =
Regexp.new('^\s*\<[^\<\>]+\>.+\<[^\<\>]+\>\s*$')
- IN_LINK_REGEX =
Regexp.new('^\s*\[.*\]\s*$')
- IN_INPUTBOX_REGEX =
Regexp.new('<inputbox>.*?<\/inputbox>')
- IN_INPUTBOX_REGEX1 =
Regexp.new('<inputbox>')
- IN_INPUTBOX_REGEX2 =
Regexp.new('<\/inputbox>')
- IN_SOURCE_REGEX =
Regexp.new('<source.*?>.*?<\/source>')
- IN_SOURCE_REGEX1 =
Regexp.new('<source.*?>')
- IN_SOURCE_REGEX2 =
Regexp.new('<\/source>')
- IN_MATH_REGEX =
Regexp.new('<math.*?>.*?<\/math>')
- IN_MATH_REGEX1 =
Regexp.new('<math.*?>')
- IN_MATH_REGEX2 =
Regexp.new('<\/math>')
- IN_HEADING_REGEX =
Regexp.new('^=+.*?=+$')
- IN_HTML_TABLE_REGEX =
Regexp.new("<table.*?><\/table>")
- IN_HTML_TABLE_REGEX1 =
Regexp.new('<table\b')
- IN_HTML_TABLE_REGEX2 =
Regexp.new('<\/\s*table>')
- IN_TABLE_REGEX1 =
Regexp.new('^\s*\{\|')
- IN_TABLE_REGEX2 =
Regexp.new('^\|\}.*?$')
- IN_UNORDERED_REGEX =
Regexp.new('^\*')
- IN_ORDERED_REGEX =
Regexp.new('^\#')
- IN_PRE_REGEX =
Regexp.new('^ ')
- IN_DEFINITION_REGEX =
Regexp.new('^[\;\:]')
- BLANK_LINE_REGEX =
Regexp.new('^\s*$')
- REDIRECT_REGEX =
Regexp.new('#(?:REDIRECT|転送)\s+\[\[(.+)\]\]', Regexp::IGNORECASE)
- REMOVE_TAG_REGEX =
Regexp.new("\<[^\<\>]*\>")
- REMOVE_DIRECTIVES_REGEX =
Regexp.new("\_\_[^\_]*\_\_")
- REMOVE_EMPHASIS_REGEX =
Regexp.new('(' + Regexp.escape("''") + '+)(.+?)\1')
- CHRREF_TO_UTF_REGEX =
Regexp.new('&#(x?)([0-9a-fA-F]+);')
- MNDASH_REGEX =
Regexp.new('\{(mdash|ndash|–)\}')
- REMOVE_HR_REGEX =
Regexp.new('^\s*\-+\s*$')
- MAKE_REFERENCE_REGEX_A =
Regexp.new('<br ?\/>')
- MAKE_REFERENCE_REGEX_B =
Regexp.new('<ref[^>]*\/>')
- MAKE_REFERENCE_REGEX_C =
Regexp.new('<ref[^>]*>')
- MAKE_REFERENCE_REGEX_D =
Regexp.new('<\/ref>')
- FORMAT_REF_REGEX =
Regexp.new('\[ref\](.*?)\[\/ref\]', Regexp::MULTILINE)
- HEADING_ONSET_REGEX =
Regexp.new('^(\=+)\s+')
- HEADING_CODA_REGEX =
Regexp.new('\s+(\=+)$')
- LIST_MARKS_REGEX =
Regexp.new('\A[\*\#\;\:\ ]+')
- PRE_MARKS_REGEX =
Regexp.new('\A\^\ ')
- DEF_MARKS_REGEX =
Regexp.new('\A[\;\:\ ]+')
- ONSET_BAR_REGEX =
Regexp.new('\A[^\|]+\z')
- CATEGORY_PATTERNS =
["Category", "Categoria"].join("|")
- CATEGORY_REGEX =
Regexp.new('[\{\[\|\b](?:' + CATEGORY_PATTERNS + ')\:(.*?)[\}\]\|\b]', Regexp::IGNORECASE)
- ESCAPE_NOWIKI_REGEX =
Regexp.new('<nowiki>(.*?)<\/nowiki>', Regexp::MULTILINE)
- UNESCAPE_NOWIKI_REGEX =
Regexp.new('<nowiki\-(\d+?)>')
- REMOVE_ISOLATED_REGEX =
Regexp.new('^\s*\{\{(.*?)\}\}\s*$')
- REMOVE_INLINE_REGEX =
Regexp.new('\{\{(.*?)\}\}')
- TYPE_CODE_REGEX =
Regexp.new('\A(?:lang*|\AIPA|IEP|SEP|indent|audio|small|dmoz|pron|unicode|note label|nowrap|ArabDIN|trans|Nihongo|Polytonic)', Regexp::IGNORECASE)
- SINGLE_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("[")}|#{Regexp.escape("]")})", Regexp::MULTILINE)
- DOUBLE_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("[[")}|#{Regexp.escape("]]")})", Regexp::MULTILINE)
- SINGLE_CURLY_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{")}|#{Regexp.escape("}")})", Regexp::MULTILINE)
- DOUBLE_CURLY_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{{")}|#{Regexp.escape("}}")})", Regexp::MULTILINE)
- CURLY_SQUARE_BRACKET_REGEX =
Regexp.new("(#{Regexp.escape("{|")}|#{Regexp.escape("|}")})", Regexp::MULTILINE)
- COMPLEX_REGEX_01 =
Regexp.new('\<\<([^<>]++)\>\>\s?')
- COMPLEX_REGEX_02 =
Regexp.new('\[\[File\:((?:[^\[\]]++|\[\[\g<1>\]\])++)\]\]', Regexp::MULTILINE | Regexp::IGNORECASE)
- COMPLEX_REGEX_03 =
Regexp.new('^\[\[((?:[^\[\]]++|\[\[\g<1>\]\])++)^\]\]', Regexp::MULTILINE)
- COMPLEX_REGEX_04 =
Regexp.new('\{\{(?:infobox|efn|sfn|unreliable source|refn|reflist|col(?:umns)?\-list|div col|no col|bar box|formatnum\:|col\||see also\||r\||#)((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
- COMPLEX_REGEX_05 =
Regexp.new('\{\{[^{}]+?\n\|((?:[^{}]++|\{\{\g<1>\}\})++)\}\}', Regexp::MULTILINE | Regexp::IGNORECASE)
- CLEANUP_REGEX_01 =
Regexp.new('\[ref\]\s*\[\/ref\]', Regexp::MULTILINE)
- CLEANUP_REGEX_02 =
Regexp.new('^File:.+$')
- CLEANUP_REGEX_03 =
Regexp.new('^\|.*$')
- CLEANUP_REGEX_04 =
Regexp.new('\{\{.*$')
- CLEANUP_REGEX_05 =
Regexp.new('^.*\}\}')
- CLEANUP_REGEX_06 =
Regexp.new('\{\|.*$')
- CLEANUP_REGEX_07 =
Regexp.new('^.*\|\}')
- CLEANUP_REGEX_08 =
Regexp.new('\n\n\n+', Regexp::MULTILINE)
- VERSION =
"1.1.3"
Instance Method Summary collapse
-
#batch_file_mod(dir_path) ⇒ Object
modify files under a directory (recursive).
- #chrref_to_utf(num_str) ⇒ Object
- #cleanup(text) ⇒ Object
-
#collect_files(str, regex = nil) ⇒ Object
collect filenames recursively.
- #convert_characters(text, has_retried = false) ⇒ Object
- #correct_inline_template(str) ⇒ Object
-
#correct_separator(input) ⇒ Object
take care of difference of separators among environments.
-
#escape_nowiki(str) ⇒ Object
methods used from format_wiki ####################.
-
#file_mod(file_path, backup = false) ⇒ Object
modify a file using block/yield mechanism.
- #format_wiki(text, config = {}) ⇒ Object
- #make_reference(str) ⇒ Object
- #mndash(str) ⇒ Object
- #process_external_links(str) ⇒ Object
- #process_interwiki_links(str) ⇒ Object
-
#process_nested_structure(scanner, left, right, &block) ⇒ Object
parser for nested structure ####################.
- #remove_complex(str) ⇒ Object
- #remove_directive(str) ⇒ Object
- #remove_emphasis(str) ⇒ Object
- #remove_hr(str) ⇒ Object
- #remove_html(str) ⇒ Object
- #remove_inbetween(str, tagset = ["<", ">"]) ⇒ Object
- #remove_ref(str) ⇒ Object
- #remove_table(str) ⇒ Object
- #remove_tag(str) ⇒ Object
-
#remove_templates(str) ⇒ Object
methods used from format_article ####################.
- #rename(files, ext = "txt") ⇒ Object
-
#sec_to_str(int) ⇒ Object
convert int of seconds to string in the format 00:00:00.
- #special_chr(str) ⇒ Object
- #unescape_nowiki(str) ⇒ Object
Instance Method Details
#batch_file_mod(dir_path) ⇒ Object
modify files under a directory (recursive)
315 316 317 318 319 320 321 322 323 |
# File 'lib/wp2txt/utils.rb', line 315
# Yields each regular file reachable from dir_path.
#
# When dir_path is a regular file it is yielded directly; when it is a
# directory, every path returned by collect_files is checked and the regular
# files among them are yielded. Any other kind of path yields nothing.
#
# @param dir_path [String] a directory or a single file
# @yieldparam file [String] path of one regular file
def batch_file_mod(dir_path)
  if FileTest.file?(dir_path)
    yield dir_path
  elsif FileTest.directory?(dir_path)
    collect_files(dir_path).each { |path| yield path if FileTest.file?(path) }
  end
end
#chrref_to_utf(num_str) ⇒ Object
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
# File 'lib/wp2txt/utils.rb', line 200
# Replaces numeric character references (decimal "&#233;" and hexadecimal
# "&#xE9;") with the UTF-8 characters they denote.
#
# The code point is converted with Integer#chr, so characters above U+FFFF
# (e.g. "&#x1F600;") are decoded too; a hand-assembled two-byte UTF-16
# sequence cannot represent them. A reference that is not a valid code point
# (a surrogate, or a value above U+10FFFF) is left in the text as written so
# that the remaining references are still decoded.
#
# @param num_str [String] text that may contain numeric character references
# @return [String] the decoded text, or num_str itself if substitution raises
def chrref_to_utf(num_str)
  num_str.gsub(CHRREF_TO_UTF_REGEX) do
    match = Regexp.last_match
    # group 1 is "x" for hexadecimal references and empty for decimal ones
    code = match[1] == "x" ? match[2].to_i(16) : match[2].to_i
    begin
      code.chr(Encoding::UTF_8)
    rescue RangeError
      match[0] # not a valid code point: keep the reference untouched
    end
  end
rescue StandardError
  num_str
end
#cleanup(text) ⇒ Object
45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/wp2txt/utils.rb', line 45
# Removes whatever CLEANUP_REGEX_01..07 match, collapses what
# CLEANUP_REGEX_08 matches into a single blank line, strips the result and
# terminates it with exactly two newlines.
#
# @param text [String] formatted article text
# @return [String] cleaned text ending in "\n\n"
def cleanup(text)
  residue_patterns = [
    CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04,
    CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07
  ]
  # the patterns are applied one after another, in numeric order
  cleaned = residue_patterns.reduce(text) { |acc, pattern| acc.gsub(pattern) { "" } }
  cleaned = cleaned.gsub(CLEANUP_REGEX_08) { "\n\n" }
  cleaned.strip << "\n\n"
end
#collect_files(str, regex = nil) ⇒ Object
collect filenames recursively
289 290 291 292 293 294 295 296 |
# File 'lib/wp2txt/utils.rb', line 289
# Collects paths recursively, starting at str.
#
# Every path produced by Find.find (the start path, directories and files
# alike) is kept when it matches regex; the result is sorted.
#
# @param str [String] path at which the traversal starts
# @param regex [Regexp, nil] filter applied to each path; nil keeps everything
# @return [Array<String>] matching paths in sorted order
def collect_files(str, regex = nil)
  pattern = regex || //
  Find.find(str).select { |path| pattern.match?(path) }.sort
end
#convert_characters(text, has_retried = false) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/wp2txt/utils.rb', line 8 def convert_characters(text, has_retried = false) text << "" text = chrref_to_utf(text) text = special_chr(text) text = text.encode("UTF-8", "UTF-8", invalid: :replace, replace: "") rescue StandardError # detect invalid byte sequence in UTF-8 if has_retried puts "invalid byte sequence detected" puts "******************************" File.open("error_log.txt", "w") do |f| f.write text end exit else text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "") text = text.encode("UTF-16", "UTF-16", invalid: :replace, replace: "") convert_characters(text, true) end end |
#correct_inline_template(str) ⇒ Object
256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 |
# File 'lib/wp2txt/utils.rb', line 256
# Replaces each {{...}} template with a plain-text value taken from its
# "|"-separated fields.
#
# A leading "lang"/"fontsize"/"lang-xx" field, or the field in front of a
# "lang=..." field, is dropped first. A template with a single field yields
# that field; otherwise the second field is used, reduced to the part after
# "=" when it has the form key=value. The value is stripped of whitespace.
#
# @param str [String] wiki text
# @return [String] text with inline templates replaced
def correct_inline_template(str)
  process_nested_structure(StringScanner.new(str), "{{", "}}") do |contents|
    fields = contents.split("|")
    drop_first = /\A(?:lang|fontsize)\z/i =~ fields[0] ||
                 /\Alang-/i =~ fields[0] ||
                 /\Alang=/i =~ fields[1]
    fields.shift if drop_first

    value =
      if fields.size == 1
        fields.first
      else
        second = fields[1]
        begin
          pair = second.split("=")
          pair.size > 1 ? pair[1] : (second || "")
        rescue StandardError
          # second is nil when the template has no fields left
          second || ""
        end
      end
    value.strip
  end
end
#correct_separator(input) ⇒ Object
take care of difference of separators among environments
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 |
# File 'lib/wp2txt/utils.rb', line 326
# Normalizes path separators for the running platform.
#
# Windows is detected through File::ALT_SEPARATOR (set to "\\" there, nil
# elsewhere) rather than by searching RUBY_PLATFORM for "win32", which does
# not occur in platform strings such as "x64-mingw32".
#
# @param input [String, Array] a path, or a (possibly nested) array of paths
# @return [String, Array, nil] the converted path(s); nil for any other input
def correct_separator(input)
  case input
  when String
    if File::ALT_SEPARATOR
      input.gsub("/", "\\")
    else
      input.gsub("\\", "/")
    end
  when Array
    input.map { |item| correct_separator(item) }
  end
end
#escape_nowiki(str) ⇒ Object
methods used from format_wiki ####################
104 105 106 107 108 109 110 111 112 113 114 115 116 |
# File 'lib/wp2txt/utils.rb', line 104
# Swaps every match of ESCAPE_NOWIKI_REGEX for a placeholder of the form
# "<nowiki-ID>" and remembers the captured text in @nowikis under that ID
# (the object_id of the captured string), so unescape_nowiki can restore it.
# Any entries left from a previous call are discarded first.
#
# @param str [String] wiki text
# @return [String] text with nowiki sections replaced by placeholders
def escape_nowiki(str)
  (@nowikis ||= {}).clear
  str.gsub(ESCAPE_NOWIKI_REGEX) do
    protected_text = Regexp.last_match(1)
    key = protected_text.object_id
    @nowikis[key] = protected_text
    "<nowiki-#{key}>"
  end
end
#file_mod(file_path, backup = false) ⇒ Object
modify a file using block/yield mechanism
299 300 301 302 303 304 305 306 307 308 309 310 311 312 |
# File 'lib/wp2txt/utils.rb', line 299
# Rewrites a file with the value returned by the given block.
#
# The block receives the file's current contents. If it returns nil the
# contents are written back unchanged. The new contents go to a temporary
# file next to the target ("<file_path>.tmp"), the original is moved to
# "<file_path>.bak", and the temporary file takes its place.
#
# @param file_path [String] file to modify
# @param backup [Boolean] keep "<file_path>.bak" when true
# @yieldparam str [String] current contents of the file
# @yieldreturn [String, nil] new contents, or nil to keep them as they are
# @return [Integer, nil] result of File.unlink, or nil when the backup is kept
def file_mod(file_path, backup = false)
  original = File.read(file_path)
  updated = yield(original)
  updated = original if updated.nil?

  # keep the temporary file beside the target so the rename stays on one
  # filesystem and does not clash with an unrelated "temp" in the working dir
  temp_path = "#{file_path}.tmp"
  backup_path = "#{file_path}.bak"
  File.write(temp_path, updated)

  File.rename(file_path, backup_path)
  File.rename(temp_path, file_path)
  File.unlink(backup_path) unless backup
end
#format_wiki(text, config = {}) ⇒ Object
28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
# File 'lib/wp2txt/utils.rb', line 28
# Runs the wiki-markup formatting pipeline over text.
#
# The unconditional steps run in a fixed order. Inline-template correction
# and template removal are skipped when config[:inline] is truthy, and table
# removal is skipped when config[:table] is truthy.
#
# @param text [String] raw wiki text
# @param config [Hash] options; the keys read here are :inline and :table
# @return [String] formatted text
def format_wiki(text, config = {})
  unconditional_steps = %i[
    remove_complex escape_nowiki process_interwiki_links
    process_external_links unescape_nowiki remove_directive
    remove_emphasis mndash remove_hr remove_tag
  ]
  result = unconditional_steps.reduce(text) { |acc, step| __send__(step, acc) }

  unless config[:inline]
    result = correct_inline_template(result)
    result = remove_templates(result)
  end
  result = remove_table(result) unless config[:table]
  result
end
#make_reference(str) ⇒ Object
249 250 251 252 253 254 |
# File 'lib/wp2txt/utils.rb', line 249
# Rewrites reference-related tags: matches of MAKE_REFERENCE_REGEX_A become
# newlines, matches of _B are removed, and matches of _C and _D become the
# "[ref]" and "[/ref]" markers respectively.
#
# @param str [String] wiki text
# @return [String] text with reference tags rewritten
def make_reference(str)
  str.gsub(MAKE_REFERENCE_REGEX_A) { "\n" }
     .gsub(MAKE_REFERENCE_REGEX_B) { "" }
     .gsub(MAKE_REFERENCE_REGEX_C) { "[ref]" }
     .gsub(MAKE_REFERENCE_REGEX_D) { "[/ref]" }
end
#mndash(str) ⇒ Object
216 217 218 |
# File 'lib/wp2txt/utils.rb', line 216
# Replaces every match of MNDASH_REGEX with an en dash.
#
# @param str [String] wiki text
# @return [String] text with the dash markers replaced
def mndash(str)
  str.gsub(MNDASH_REGEX) { "–" }
end
#process_external_links(str) ⇒ Object
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/wp2txt/utils.rb', line 139
# Replaces each [...] span with plain text.
#
# Contents padded with whitespace on both sides are kept, stripped, inside
# " (...) ". Otherwise the contents are split once at whitespace and the
# part after the first word is kept (or the single word when there is only
# one, or an empty string when there is nothing).
#
# @param str [String] wiki text
# @return [String] text with single-bracket spans replaced
def process_external_links(str)
  process_nested_structure(StringScanner.new(str), "[", "]") do |contents|
    next " (#{contents.strip}) " if /\A\s.+\s\z/ =~ contents

    pieces = contents.split(" ", 2)
    chosen = pieces.size == 1 ? pieces.first : pieces.last
    chosen || ""
  end
end
#process_interwiki_links(str) ⇒ Object
125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/wp2txt/utils.rb', line 125
# Replaces each [[...]] span with plain text: a span with a single field
# yields that field; otherwise the first "|"-separated field is dropped and
# the remaining fields are kept, still joined by "|".
#
# @param str [String] wiki text
# @return [String] text with double-bracket spans replaced
def process_interwiki_links(str)
  process_nested_structure(StringScanner.new(str), "[[", "]]") do |contents|
    target, *labels = contents.split("|")
    if target && labels.empty?
      target
    else
      labels.join("|")
    end
  end
end
#process_nested_structure(scanner, left, right, &block) ⇒ Object
parser for nested structure ####################
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
# File 'lib/wp2txt/utils.rb', line 60 def process_nested_structure(scanner, left, right, &block) buffer = +"" begin regex = if left == "[" && right == "]" SINGLE_SQUARE_BRACKET_REGEX elsif left == "[[" && right == "]]" DOUBLE_SQUARE_BRACKET_REGEX elsif left == "{" && right == "}" SINGLE_CURLY_BRACKET_REGEX elsif left == "{{" && right == "}}" DOUBLE_CURLY_BRACKET_REGEX elsif left == "{|" && right == "|}" CURLY_SQUARE_BRACKET_REGEX else Regexp.new("(#{Regexp.escape(left)}|#{Regexp.escape(right)})") end while (str = scanner.scan_until(regex)) case scanner[1] when left buffer << str has_left = true when right if has_left buffer = buffer[0...-left.size] contents = block.call(str[0...-left.size]) buffer << contents break else buffer << str end end end buffer << scanner.rest return buffer if buffer == scanner.string scanner.string = buffer process_nested_structure(scanner, left, right, &block) || "" rescue StandardError scanner.string end end |
#remove_complex(str) ⇒ Object
241 242 243 244 245 246 247 |
# File 'lib/wp2txt/utils.rb', line 241
# Rewrites what COMPLEX_REGEX_01 matches as its first capture wrapped in
# 《 》, then deletes everything matched by COMPLEX_REGEX_02 through
# COMPLEX_REGEX_05, applied in that order.
#
# @param str [String] wiki text
# @return [String] text with the complex structures removed
def remove_complex(str)
  requoted = str.gsub(COMPLEX_REGEX_01) { "《#{Regexp.last_match(1)}》" }
  removable = [COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05]
  removable.reduce(requoted) { |acc, pattern| acc.gsub(pattern) { "" } }
end
#remove_directive(str) ⇒ Object
190 191 192 |
# File 'lib/wp2txt/utils.rb', line 190
# Deletes every match of REMOVE_DIRECTIVES_REGEX.
#
# @param str [String] wiki text
# @return [String] text without the matched directives
def remove_directive(str)
  str.gsub(REMOVE_DIRECTIVES_REGEX) { "" }
end
#remove_emphasis(str) ⇒ Object
194 195 196 197 198 |
# File 'lib/wp2txt/utils.rb', line 194
# Replaces each match of REMOVE_EMPHASIS_REGEX with its second capture
# group, i.e. keeps the emphasized text and drops the quote marks around it.
#
# @param str [String] wiki text
# @return [String] text without emphasis markup
def remove_emphasis(str)
  str.gsub(REMOVE_EMPHASIS_REGEX, '\2')
end
#remove_hr(str) ⇒ Object
220 221 222 |
# File 'lib/wp2txt/utils.rb', line 220
# Deletes every match of REMOVE_HR_REGEX.
#
# @param str [String] wiki text
# @return [String] text without the matched rule lines
def remove_hr(str)
  str.gsub(REMOVE_HR_REGEX) { "" }
end
#remove_html(str) ⇒ Object
228 229 230 231 232 233 234 235 236 237 238 239 |
# File 'lib/wp2txt/utils.rb', line 228
# Removes self-closing tags, then removes div, gallery, timeline and
# noinclude elements together with their contents.
#
# @param str [String] wiki text; it is not modified
# @return [String] a new string without those tags and elements
def remove_html(str)
  without_self_closing = str.gsub(%r{<[^<>]+/>}) { "" }
  %w[div gallery timeline noinclude].reduce(without_self_closing) do |text, tag|
    # "<tag" and "tag>" act as the opening and closing delimiters
    process_nested_structure(StringScanner.new(text), "<#{tag}", "#{tag}>") { "" }
  end
end
#remove_inbetween(str, tagset = ["<", ">"]) ⇒ Object
180 181 182 183 184 |
# File 'lib/wp2txt/utils.rb', line 180
# Removes every span that starts with the first element of tagset and ends
# with the second, provided neither delimiter occurs inside the span.
#
# @param str [String] text to process
# @param tagset [Array<String>] opening and closing delimiter
# @return [String] text with the delimited spans removed
def remove_inbetween(str, tagset = ["<", ">"])
  # characters that may not appear between the two delimiters
  excluded = Regexp.quote(tagset.uniq.join(""))
  regex = /#{Regexp.escape(tagset[0])}[^#{excluded}]*#{Regexp.escape(tagset[1])}/
  str.gsub(regex, "")
end
#remove_ref(str) ⇒ Object
224 225 226 |
# File 'lib/wp2txt/utils.rb', line 224
# Deletes every match of FORMAT_REF_REGEX, i.e. each "[ref]...[/ref]" span.
#
# @param str [String] text containing [ref] markers
# @return [String] text without the reference spans
def remove_ref(str)
  str.gsub(FORMAT_REF_REGEX, "")
end
#remove_table(str) ⇒ Object
169 170 171 172 173 174 |
# File 'lib/wp2txt/utils.rb', line 169
# Removes every {| ... |} span, nested ones included.
#
# @param str [String] wiki text
# @return [String] text without those spans
def remove_table(str)
  process_nested_structure(StringScanner.new(str), "{|", "|}") { "" }
end
#remove_tag(str) ⇒ Object
186 187 188 |
# File 'lib/wp2txt/utils.rb', line 186
# Deletes every match of REMOVE_TAG_REGEX.
#
# @param str [String] wiki text
# @return [String] text without the matched tags
def remove_tag(str)
  str.gsub(REMOVE_TAG_REGEX) { "" }
end
#remove_templates(str) ⇒ Object
methods used from format_article ####################
158 159 160 161 162 163 164 165 166 167 |
# File 'lib/wp2txt/utils.rb', line 158
# Removes every {{ ... }} span first, then every remaining { ... } span.
#
# @param str [String] wiki text
# @return [String] text without those spans
def remove_templates(str)
  without_double = process_nested_structure(StringScanner.new(str), "{{", "}}") { "" }
  process_nested_structure(StringScanner.new(without_double), "{", "}") { "" }
end
#rename(files, ext = "txt") ⇒ Object
343 344 345 346 347 348 349 350 351 352 353 354 355 356 |
# File 'lib/wp2txt/utils.rb', line 343
# Renames files so that their trailing "-<number>" is zero-padded to a common
# width, and appends ".<ext>" to each name.
#
# The width is the number of digits of the longest numeric suffix among all
# files. It is computed before any file is renamed, so every file receives
# the same padding regardless of the order of the list.
#
# @param files [Array<String>] paths, normally ending in "-<number>"
# @param ext [String] extension to append, without the dot
# @return [true]
def rename(files, ext = "txt")
  suffix = /-(\d+)\z/
  # num of digits necessary to name the last file generated
  maxwidth = files.map { |f| f.slice(suffix, 1).to_s.length }.max || 0

  files.each do |f|
    # names without a numeric suffix are left as they are, apart from the extension
    newname = f.sub(suffix) { "-#{format("%0#{maxwidth}d", Regexp.last_match(1).to_i)}" }
    File.rename(f, "#{newname}.#{ext}")
  end
  true
end
#sec_to_str(int) ⇒ Object
convert int of seconds to string in the format 00:00:00
359 360 361 362 363 364 365 366 367 368 |
# File 'lib/wp2txt/utils.rb', line 359
# Converts a number of seconds to a string in the format 00:00:00.
#
# Fractional seconds are truncated before the split into hours, minutes and
# seconds, so a Float duration gives the same result as its integer part.
#
# @param int [Numeric, nil] number of seconds; nil or false means "unknown"
# @return [String] "HH:MM:SS", or "--:--:--" when int is nil or false
def sec_to_str(int)
  return "--:--:--" unless int

  hours, remainder = int.to_i.divmod(3600)
  minutes, seconds = remainder.divmod(60)
  format("%02d:%02d:%02d", hours, minutes, seconds)
end
#special_chr(str) ⇒ Object
176 177 178 |
# File 'lib/wp2txt/utils.rb', line 176 def special_chr(str) HTML_DECODER.decode(str) end |
#unescape_nowiki(str) ⇒ Object
118 119 120 121 122 123 |
# File 'lib/wp2txt/utils.rb', line 118
# Replaces each placeholder matched by UNESCAPE_NOWIKI_REGEX with the text
# stored in @nowikis under the numeric ID captured from the placeholder.
#
# @param str [String] text containing "<nowiki-ID>" placeholders
# @return [String] text with the stored sections put back
def unescape_nowiki(str)
  str.gsub(UNESCAPE_NOWIKI_REGEX) { @nowikis[Regexp.last_match(1).to_i] }
end