Class: ReverseAdoc::Cleaner
- Inherits:
-
Object
- Object
- ReverseAdoc::Cleaner
- Defined in:
- lib/reverse_adoc/cleaner.rb
Instance Method Summary collapse
-
#clean_headings(string) ⇒ Object
following added by me.
- #clean_punctuation_characters(string) ⇒ Object
-
#clean_tag_borders(string) ⇒ Object
Find non-asterisk content that is enclosed by two or more asterisks.
-
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it.
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
following added by me
82 83 84 85 86 87 88 89 |
# File 'lib/reverse_adoc/cleaner.rb', line 82
#
# Rewrites two LibreOffice heading quirks in +string+, modifying it in place.
#
# @param string [String] mutable HTML text
# @return [String] the same string object that was passed in
def clean_headings(string)
  # LibreOffice inserts empty headings for no known reason; each one is
  # replaced by a single space.
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  string.gsub!(empty_heading, " ")

  # LibreOffice also renders superscripts as headings styled with
  # "vertical-align: super"; those become <sup> elements instead.
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}
  string.gsub!(superscript_heading, "<sup>\\2</sup>")

  string
end
#clean_punctuation_characters(string) ⇒ Object
62 63 64 |
# File 'lib/reverse_adoc/cleaner.rb', line 62
#
# Removes the single whitespace character that sits between an emphasis
# marker (+**+, +~~+ or +__+) and a following punctuation character
# (one of . ! ? ' ").
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the whitespace removed
def clean_punctuation_characters(string)
  # The replacement is a plain back-reference template. The previous
  # `"\\1".strip` stripped the template text itself (a no-op), not the
  # captured marker, so it has been dropped.
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/, "\\1\\2")
end
#clean_tag_borders(string) ⇒ Object
Find non-asterisk content that is enclosed by two or more asterisks. Ensure that only one whitespace occurs in the border area. Same for underscores and brackets.
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/reverse_adoc/cleaner.rb', line 36
#
# Tidies the whitespace just inside emphasis markers and square brackets.
#
# Finds content enclosed by two or more asterisks, underscores or tildes,
# and content enclosed by square brackets. Each match is stripped, then the
# first space directly after an opening marker and the first space directly
# before a marker are removed. The outer border whitespace is delegated to
# +preserve_border_whitespaces+ (defined elsewhere; emphasis matches pass
# +ReverseAdoc.config.tag_border+ as the default border).
#
# @param string [String] text to clean; not modified
# @return [String] the cleaned text
def clean_tag_borders(string)
  # Each rule pairs the pattern for one emphasis style with its two-character marker.
  emphasis_rules = [
    [/\s?\*{2,}.*?\*{2,}\s?/, "**"],
    [/\s?_{2,}.*?_{2,}\s?/, "__"],
    [/\s?~{2,}.*?~{2,}\s?/, "~~"],
  ]

  tidied = emphasis_rules.inject(string) do |text, (pattern, marker)|
    text.gsub(pattern) do |span|
      preserve_border_whitespaces(span, default_border: ReverseAdoc.config.tag_border) do
        span.strip.sub("#{marker} ", marker).sub(" #{marker}", marker)
      end
    end
  end

  # Brackets get the same treatment but without a configured default border.
  tidied.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) do
      span.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it
67 68 69 |
# File 'lib/reverse_adoc/cleaner.rb', line 67
#
# Preprocesses HTML before conversion (rather than postprocessing the
# converted output). Works on a copy, so the caller's string is untouched.
#
# @param string [String] raw HTML
# @return [String] the copy after whitespace scrubbing and heading cleanup
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_inner_whitespaces(string) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/reverse_adoc/cleaner.rb', line 19
#
# Normalizes whitespace around +stem:[...]+ macros and collapses runs of
# spaces/tabs inside every line.
#
# The input is never modified, so frozen strings are accepted. +nil+ yields
# an empty string; previously the nil guard only covered the stem fix-ups
# and the method then failed on +nil.each_line+.
#
# @param string [String, nil] text to clean
# @return [String] a new, cleaned string ("" for nil input)
def remove_inner_whitespaces(string)
  return "" if string.nil?

  # Non-bang gsub first so every later in-place edit works on our own copy.
  text = string.gsub(/\n stem:\[/, "\nstem:[")
  # Join a stem macro with the following line when that line starts with text.
  text.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # Drop whitespace between a stem macro and a following "^" or "-".
  text.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  # Append into one buffer instead of re-allocating the accumulator per line.
  text.each_line.each_with_object("".dup) do |line, cleaned|
    cleaned << preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
  end
end
#remove_leading_newlines(string) ⇒ Object
15 16 17 |
# File 'lib/reverse_adoc/cleaner.rb', line 15
#
# Drops every newline at the very start of +string+.
#
# @param string [String] text to clean; not modified
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  # \A only matches at the start of the string, so one substitution suffices.
  string.sub(/\A\n+/, "")
end
#remove_newlines(string) ⇒ Object
11 12 13 |
# File 'lib/reverse_adoc/cleaner.rb', line 11
#
# Collapses every run of three or more newlines into exactly two.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with at most two consecutive newlines
def remove_newlines(string)
  blank_line = "\n\n"
  string.gsub(/\n{3,}/) { blank_line }
end
#scrub_whitespace(string) ⇒ Object
71 72 73 74 75 76 77 78 79 |
# File 'lib/reverse_adoc/cleaner.rb', line 71
#
# Normalizes whitespace in an HTML document, modifying +string+ in place.
#
# @param string [String] mutable HTML text
# @return [String] the same string object that was passed in
def scrub_whitespace(string)
  # HTML encoded spaces: the entity forms (any letter case) plus a literal
  # U+00A0. The entity alternatives had been decoded to bare whitespace in
  # this listing, which left the pattern matching ordinary spaces instead.
  # NOTE(review): the replacement shows as one whitespace character here; if
  # the original literal was itself an entity, restore it. TODO confirm
  # against the real source file.
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, " ")
  string.sub!(/\A[[:space:]]+/, "") # document leading whitespace
  string.sub!(/[[:space:]]+\z/, "") # document trailing whitespace
  # Line trailing whitespace: a run of spaces at a line end becomes one space.
  string.gsub!(/ +$/, " ")
  string.gsub!(/\n\n\n\n/, "\n\n") # quadruple line breaks
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
  string
end
#tidy(string) ⇒ Object
3 4 5 6 7 8 9 |
# File 'lib/reverse_adoc/cleaner.rb', line 3
#
# Runs the full cleanup pipeline over +string+: inner whitespace, newline
# runs, leading newlines, tag borders and finally punctuation spacing.
#
# @param string [String] text to tidy
# @return [String] the result of the last cleanup pass
def tidy(string)
  # Order matters: each pass receives the previous pass's output.
  later_passes = %i[
    remove_newlines
    remove_leading_newlines
    clean_tag_borders
    clean_punctuation_characters
  ]

  later_passes.inject(remove_inner_whitespaces(string)) do |text, pass|
    send(pass, text)
  end
end