Class: Coradoc::Input::HTML::Cleaner
- Inherits:
-
Object
- Object
- Coradoc::Input::HTML::Cleaner
- Defined in:
- lib/coradoc/input/html/cleaner.rb
Instance Method Summary collapse
-
#clean_headings(string) ⇒ Object
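Removes empty headings and converts superscript-styled headings to `<sup>` elements (a local addition that works around LibreOffice output).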
- #clean_punctuation_characters(string) ⇒ Object
-
#clean_tag_borders(string) ⇒ Object
Find non-asterisk content that is enclosed by two or more asterisks.
-
#preprocess_word_html(string) ⇒ Object
preprocesses HTML, rather than postprocessing it.
- #remove_block_leading_newlines(string) ⇒ Object
- #remove_inner_whitespaces(string) ⇒ Object
- #remove_leading_newlines(string) ⇒ Object
- #remove_newlines(string) ⇒ Object
- #remove_section_attribute_newlines(string) ⇒ Object
- #scrub_whitespace(string) ⇒ Object
- #tidy(string) ⇒ Object
Instance Method Details
#clean_headings(string) ⇒ Object
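Removes empty heading elements and converts headings styled with `vertical-align: super` into `<sup>` elements. This method is a local addition that works around markup produced by LibreOffice.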
108 109 110 111 112 113 114 115 |
# File 'lib/coradoc/input/html/cleaner.rb', line 108
#
# Repairs heading markup in place and returns the same string object.
#
# Two substitutions are applied, in this order:
# 1. An empty heading element (`<hN ...></hN>`, N in 1-9) becomes a single space.
# 2. A heading whose attributes contain `style="vertical-align: super;` is
#    replaced by `<sup>...</sup>` wrapped around the heading's content.
#
# @param string [String] HTML to clean; mutated through `gsub!`
# @return [String] the same (mutated) string
def clean_headings(string)
  [
    # I don't know why Libre Office is inserting them, but they need to go
    [%r{<h([1-9])[^>]*></h\1>}, " "],
    # I absolutely don't know why Libre Office is rendering superscripts as h1
    [%r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}, "<sup>\\2</sup>"],
  ].each { |pattern, replacement| string.gsub!(pattern, replacement) }

  string
end
#clean_punctuation_characters(string) ⇒ Object
89 90 91 |
# File 'lib/coradoc/input/html/cleaner.rb', line 89
#
# Drops the single whitespace character that sits between a formatting
# marker (`**`, `~~` or `__`) and a directly following punctuation
# character (one of `. ! ? ' "`).
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the whitespace removed
def clean_punctuation_characters(string)
  string.gsub(/(?<marker>\*\*|~~|__)\s(?<punctuation>[.!?'"])/) do
    found = Regexp.last_match
    "#{found[:marker]}#{found[:punctuation]}"
  end
end
#clean_tag_borders(string) ⇒ Object
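Find non-asterisk content that is enclosed by two or more asterisks. Ensure that only one whitespace occurs in the border area. Same for underscores and brackets. Note: in the listing below the asterisk and underscore passes are commented out; only tilde-delimited (`~~`) and square-bracketed spans are actually processed.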
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
# File 'lib/coradoc/input/html/cleaner.rb', line 62
#
# Tidies whitespace just inside delimiter pairs.
#
# Only two passes are active: spans enclosed by two or more tildes, then
# spans enclosed by square brackets. The equivalent passes for asterisks
# and underscores are kept below, commented out.
#
# Each matched span (including one optional whitespace character on either
# side) is handed to `preserve_border_whitespaces` together with a block
# that strips the span and removes one space after the opening delimiter
# and one before the closing delimiter.
# NOTE(review): `preserve_border_whitespaces` is defined outside this
# listing; presumably it re-attaches the outer whitespace — confirm.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with cleaned delimiter borders
def clean_tag_borders(string)
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("** ", "**").sub(" **", "**")
  #   end
  # end

  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  tilde_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    preserve_border_whitespaces(span, default_border: Coradoc::Input::HTML.config.tag_border) do
      span.strip.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  tilde_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) { span.strip.sub("[ ", "[").sub(" ]", "]") }
  end
end
#preprocess_word_html(string) ⇒ Object
Preprocesses the HTML input, rather than postprocessing the converted output.
94 95 96 |
# File 'lib/coradoc/input/html/cleaner.rb', line 94
#
# Preprocesses HTML (rather than postprocessing converted output): runs
# `scrub_whitespace` and then `clean_headings` on a duplicate of the input,
# so the caller's string is left untouched.
#
# @param string [String] raw HTML
# @return [String] the cleaned copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
#remove_block_leading_newlines(string) ⇒ Object
27 28 29 |
# File 'lib/coradoc/input/html/cleaner.rb', line 27
#
# Removes the blank line that follows a `****` line when that line comes
# directly after a line ending in `]`.
# NOTE(review): presumably `****` is an AsciiDoc block delimiter preceded
# by an attribute list — confirm.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the extra newline removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"
  string.gsub(padded_opening, tight_opening)
end
#remove_inner_whitespaces(string) ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
# File 'lib/coradoc/input/html/cleaner.rb', line 43
#
# Normalises whitespace around `stem:[...]` expressions and inside lines.
#
# The `stem:[...]` fix-ups mutate the argument in place (`gsub!`):
# * a `stem:[` preceded by a newline and one space is moved to line start;
# * a newline right after a closed `stem:[...]` and before a non-space
#   character becomes a single space;
# * whitespace between a closed `stem:[...]` and a following `^` or `-`
#   is removed.
#
# Afterwards every line is passed to `preserve_border_whitespaces` with a
# block that strips the line and squeezes runs of two or more spaces/tabs
# into one space; the pieces are concatenated into a new string.
# NOTE(review): `preserve_border_whitespaces` is defined outside this
# listing; presumably it restores the line's leading/trailing whitespace
# (including the newline) — confirm.
#
# A nil argument yields an empty string. The previous version guarded only
# the `gsub!` calls against nil and then raised NoMethodError on
# `each_line`.
#
# @param string [String, nil] text to clean; mutated when not nil
# @return [String] a new string holding the cleaned lines
def remove_inner_whitespaces(string)
  return +"" if string.nil?

  string.gsub!(/\n stem:\[/, "\nstem:[")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  result = +""
  string.each_line do |line|
    result << preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
  end
  result
end
#remove_leading_newlines(string) ⇒ Object
39 40 41 |
# File 'lib/coradoc/input/html/cleaner.rb', line 39
#
# Strips every newline character from the very beginning of the string.
#
# @param string [String] text to clean; not modified
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  # The pattern is anchored with \A, so it can match at most once.
  leading_newlines = /\A\n+/
  string.sub(leading_newlines, "")
end
#remove_newlines(string) ⇒ Object
35 36 37 |
# File 'lib/coradoc/input/html/cleaner.rb', line 35
#
# Collapses every run of three or more consecutive newlines into exactly
# two newlines.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the runs collapsed
def remove_newlines(string)
  excessive_newlines = /\n{3,}/
  string.gsub(excessive_newlines) { "\n\n" }
end
#remove_section_attribute_newlines(string) ⇒ Object
31 32 33 |
# File 'lib/coradoc/input/html/cleaner.rb', line 31
#
# Removes the blank line between a line ending in `]` and a following line
# that starts with `==`.
# NOTE(review): presumably this joins an attribute/anchor line to the
# section title below it — confirm.
#
# @param string [String] text to clean; not modified
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end
#scrub_whitespace(string) ⇒ Object
98 99 100 101 102 103 104 105 |
# File 'lib/coradoc/input/html/cleaner.rb', line 98
#
# Normalises whitespace in raw HTML before conversion.
#
# The first substitution mutates the argument in place. The value returned
# by `Coradoc.strip_unicode` is then used for the remaining in-place
# substitutions and is what this method returns.
#
# NOTE(review): in this listing the first two alternatives of the first
# pattern, and its replacement, appear as plain spaces, although the
# trailing comment says "HTML encoded spaces". They may originally have
# been HTML entities that were decoded when the page was rendered —
# confirm against lib/coradoc/input/html/cleaner.rb before relying on them.
#
# @param string [String] HTML to scrub; mutated by the first `gsub!`
# @return [String] the scrubbed string
def scrub_whitespace(string)
  string.gsub!(/ | |\u00a0/i, " ") # HTML encoded spaces
  stripped = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
  stripped.gsub!(/( +)$/, " ") # line trailing whitespace (a run of spaces becomes one space)
  stripped.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
  # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
  stripped
end
#tidy(string) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/coradoc/input/html/cleaner.rb', line 3
#
# Runs the full clean-up pipeline over converted text.
#
# A Hash argument is processed recursively: every value is tidied and a
# new Hash with the same keys is returned. Any other argument is copied
# with `String.new` and passed through these steps, in order:
# inner whitespace, repeated newlines, leading newlines, tag borders,
# punctuation, block leading newlines, section attribute newlines.
#
# The first five steps are wrapped in `HtmlConverter.track_time` with a
# label.
# NOTE(review): `track_time` is defined outside this listing; this code
# relies on it returning the value of the block it is given — confirm.
#
# @param string [String, Hash] text, or a Hash whose values are text/Hashes
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  without_inner_whitespace = HtmlConverter.track_time("Removing inner whitespace") do
    remove_inner_whitespaces(String.new(string))
  end
  without_extra_newlines = HtmlConverter.track_time("Removing newlines") do
    remove_newlines(without_inner_whitespace)
  end
  without_leading_newlines = HtmlConverter.track_time("Removing leading newlines") do
    remove_leading_newlines(without_extra_newlines)
  end
  with_clean_borders = HtmlConverter.track_time("Cleaning tag borders") do
    clean_tag_borders(without_leading_newlines)
  end
  with_clean_punctuation = HtmlConverter.track_time("Cleaning punctuation characters") do
    clean_punctuation_characters(with_clean_borders)
  end

  remove_section_attribute_newlines(remove_block_leading_newlines(with_clean_punctuation))
end