Class: Wp2txt::Article
Overview
An article contains elements, each of which is a `[TYPE, string]` pair.
Constant Summary
Constants included from Wp2txt
BLANK_LINE_REGEX, CATEGORY_PATTERNS, CATEGORY_REGEX, CHRREF_TO_UTF_REGEX, CLEANUP_REGEX_01, CLEANUP_REGEX_02, CLEANUP_REGEX_03, CLEANUP_REGEX_04, CLEANUP_REGEX_05, CLEANUP_REGEX_06, CLEANUP_REGEX_07, CLEANUP_REGEX_08, COMPLEX_REGEX_01, COMPLEX_REGEX_02, COMPLEX_REGEX_03, COMPLEX_REGEX_04, COMPLEX_REGEX_05, CURLY_SQUARE_BRACKET_REGEX, DEF_MARKS_REGEX, DOUBLE_CURLY_BRACKET_REGEX, DOUBLE_SQUARE_BRACKET_REGEX, ENTITIES, ESCAPE_NOWIKI_REGEX, FORMAT_REF_REGEX, HEADING_CODA_REGEX, HEADING_ONSET_REGEX, HTML_DECODER, HTML_HASH, HTML_REGEX, IN_DEFINITION_REGEX, IN_HEADING_REGEX, IN_HTML_TABLE_REGEX, IN_HTML_TABLE_REGEX1, IN_HTML_TABLE_REGEX2, IN_INPUTBOX_REGEX, IN_INPUTBOX_REGEX1, IN_INPUTBOX_REGEX2, IN_LINK_REGEX, IN_MATH_REGEX, IN_MATH_REGEX1, IN_MATH_REGEX2, IN_ORDERED_REGEX, IN_PRE_REGEX, IN_SOURCE_REGEX, IN_SOURCE_REGEX1, IN_SOURCE_REGEX2, IN_TABLE_REGEX1, IN_TABLE_REGEX2, IN_UNORDERED_REGEX, ISOLATED_TAG_REGEX, ISOLATED_TEMPLATE_REGEX, LIST_MARKS_REGEX, MAKE_REFERENCE_REGEX_A, MAKE_REFERENCE_REGEX_B, MAKE_REFERENCE_REGEX_C, MAKE_REFERENCE_REGEX_D, ML_LINK_END_REGEX, ML_LINK_ONSET_REGEX, ML_TEMPLATE_END_REGEX, ML_TEMPLATE_ONSET_REGEX, MNDASH_REGEX, ONSET_BAR_REGEX, PRE_MARKS_REGEX, REDIRECT_REGEX, REMOVE_DIRECTIVES_REGEX, REMOVE_EMPHASIS_REGEX, REMOVE_HR_REGEX, REMOVE_INLINE_REGEX, REMOVE_ISOLATED_REGEX, REMOVE_TAG_REGEX, SINGLE_CURLY_BRACKET_REGEX, SINGLE_SQUARE_BRACKET_REGEX, TYPE_CODE_REGEX, UNESCAPE_NOWIKI_REGEX, VERSION
Instance Attribute Summary
-
#categories ⇒ Object
Returns the value of attribute categories.
-
#elements ⇒ Object
Returns the value of attribute elements.
-
#title ⇒ Object
Returns the value of attribute title.
Instance Method Summary
- #create_element(tpx, text) ⇒ Object
-
#initialize(text, title = "", strip_tmarker = false) ⇒ Article
constructor
A new instance of Article.
- #parse(source) ⇒ Object
Methods included from Wp2txt
#batch_file_mod, #chrref_to_utf, #cleanup, #collect_files, #convert_characters, #correct_inline_template, #correct_separator, #escape_nowiki, #file_mod, #format_wiki, #make_reference, #mndash, #process_external_links, #process_interwiki_links, #process_nested_structure, #remove_complex, #remove_directive, #remove_emphasis, #remove_hr, #remove_html, #remove_inbetween, #remove_ref, #remove_table, #remove_tag, #remove_templates, #rename, #sec_to_str, #special_chr, #unescape_nowiki
Constructor Details
#initialize(text, title = "", strip_tmarker = false) ⇒ Article
Returns a new instance of Article.
31 32 33 34 35 36 37 38 39 40 |
# File 'lib/wp2txt/article.rb', line 31
#
# Builds an article: stores the stripped title and the marker-stripping flag,
# runs the text through the Wp2txt clean-up helpers, then parses the result
# into elements.
#
# @param text [String] article source text
# @param title [String] article title; leading/trailing whitespace is stripped
# @param strip_tmarker [Boolean] when truthy, #parse removes list, pre and
#   definition markers from the text of those elements
def initialize(text, title = "", strip_tmarker = false)
  @title = title.strip
  @strip_tmarker = strip_tmarker

  text = convert_characters(text)
  # A "|" followed by two or more newlines is reduced to "|" plus one newline.
  text = text.gsub(/\|\n\n+/m) { "|\n" }
  text = remove_html(text)
  text = make_reference(text)
  text = remove_ref(text)
  parse text
end
Instance Attribute Details
#categories ⇒ Object
Returns the value of attribute categories.
29 30 31 |
# File 'lib/wp2txt/article.rb', line 29
#
# Returns the value of attribute categories.
#
# @return [Object] the current value of @categories
def categories
  @categories
end
#elements ⇒ Object
Returns the value of attribute elements.
29 30 31 |
# File 'lib/wp2txt/article.rb', line 29
#
# Returns the value of attribute elements.
#
# @return [Object] the current value of @elements
def elements
  @elements
end
#title ⇒ Object
Returns the value of attribute title.
29 30 31 |
# File 'lib/wp2txt/article.rb', line 29
#
# Returns the value of attribute title.
#
# @return [Object] the current value of @title
def title
  @title
end
Instance Method Details
#create_element(tpx, text) ⇒ Object
42 43 44 |
# File 'lib/wp2txt/article.rb', line 42
#
# Wraps a type tag and its text into a two-item element.
#
# The text object is stored as given (not copied), so later in-place appends
# to the element's text operate on that same String.
#
# @param tpx [Symbol] element type tag (e.g. :mw_paragraph)
# @param text [String] element text
# @return [Array] the pair [tpx, text]
def create_element(tpx, text)
  [tpx, text]
end
#parse(source) ⇒ Object
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/wp2txt/article.rb', line 46
#
# Splits +source+ into typed elements, one input line at a time, and collects
# CATEGORY_REGEX matches into @categories along the way.
#
# Each element is a [type, text] pair built by #create_element. A line that
# opens a multi-line construct (template, link, table, inputbox, source, math,
# HTML table) starts a new element and sets +mode+; every following line is
# appended to that element until the construct's closing pattern matches.
# The order of the +when+ branches below is significant: the first matching
# pattern decides the element type.
#
# @param source [String] text to parse (iterated with each_line)
# @return [Array<Array>] the element list, also stored in @elements
def parse(source)
  @elements = []
  @categories = []
  mode = nil

  # Multi-line modes that end when one regex matches the raw line.
  closing_patterns = {
    mw_table: IN_TABLE_REGEX2,
    mw_inputbox: IN_INPUTBOX_REGEX2,
    mw_source: IN_SOURCE_REGEX2,
    mw_math: IN_MATH_REGEX2,
    mw_htable: IN_HTML_TABLE_REGEX2
  }

  source.each_line do |line|
    # String#scan always returns an Array, so only emptiness needs checking;
    # Array#| appends the new matches and drops duplicates in one step.
    found = line.scan(CATEGORY_REGEX)
    @categories |= found unless found.empty?

    if mode
      closed =
        case mode
        when :mw_ml_template
          # NOTE(review): presumably removes balanced {{ }} pairs so that only
          # an unmatched closer is left to test — confirm against
          # Wp2txt#process_nested_structure.
          remainder = process_nested_structure(StringScanner.new(line), "{{", "}}") { "" }
          ML_TEMPLATE_END_REGEX =~ remainder
        when :mw_ml_link
          remainder = process_nested_structure(StringScanner.new(line), "[[", "]]") { "" }
          ML_LINK_END_REGEX =~ remainder
        else
          closing_patterns.fetch(mode) =~ line
        end
      mode = nil if closed
      # The line (including a closing one) is appended to the open element.
      @elements.last.last << line
      next
    end

    case line
    when ISOLATED_TEMPLATE_REGEX
      @elements << create_element(:mw_isolated_template, line)
    when ISOLATED_TAG_REGEX
      @elements << create_element(:mw_isolated_tag, line)
    when BLANK_LINE_REGEX
      @elements << create_element(:mw_blank, "\n")
    when REDIRECT_REGEX
      @elements << create_element(:mw_redirect, line)
    when IN_HEADING_REGEX
      # Replace the heading onset and coda with their first capture group.
      heading = line.sub(HEADING_ONSET_REGEX) { Regexp.last_match(1) }
                    .sub(HEADING_CODA_REGEX) { Regexp.last_match(1) }
      @elements << create_element(:mw_heading, "\n" + heading + "\n")
    when IN_INPUTBOX_REGEX
      @elements << create_element(:mw_inputbox, line)
    when ML_TEMPLATE_ONSET_REGEX
      @elements << create_element(:mw_ml_template, line)
      mode = :mw_ml_template
    when ML_LINK_ONSET_REGEX
      @elements << create_element(:mw_ml_link, line)
      mode = :mw_ml_link
    when IN_INPUTBOX_REGEX1
      mode = :mw_inputbox
      @elements << create_element(:mw_inputbox, line)
    when IN_SOURCE_REGEX
      @elements << create_element(:mw_source, line)
    when IN_SOURCE_REGEX1
      mode = :mw_source
      @elements << create_element(:mw_source, line)
    when IN_MATH_REGEX
      @elements << create_element(:mw_math, line)
    when IN_MATH_REGEX1
      mode = :mw_math
      @elements << create_element(:mw_math, line)
    when IN_HTML_TABLE_REGEX
      @elements << create_element(:mw_htable, line)
    when IN_HTML_TABLE_REGEX1
      mode = :mw_htable
      @elements << create_element(:mw_htable, line)
    when IN_TABLE_REGEX1
      mode = :mw_table
      @elements << create_element(:mw_table, line)
    when IN_UNORDERED_REGEX
      line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
      @elements << create_element(:mw_unordered, line)
    when IN_ORDERED_REGEX
      line = line.sub(LIST_MARKS_REGEX, "") if @strip_tmarker
      @elements << create_element(:mw_ordered, line)
    when IN_PRE_REGEX
      line = line.sub(PRE_MARKS_REGEX, "") if @strip_tmarker
      @elements << create_element(:mw_pre, line)
    when IN_DEFINITION_REGEX
      line = line.sub(DEF_MARKS_REGEX, "") if @strip_tmarker
      @elements << create_element(:mw_definition, line)
    when IN_LINK_REGEX
      @elements << create_element(:mw_link, line)
    else
      @elements << create_element(:mw_paragraph, "\n" + line)
    end
  end

  @elements
end