Class: Coradoc::Input::HTML::Cleaner

Inherits:
Object
  • Object
show all
Defined in:
lib/coradoc/input/html/cleaner.rb

Instance Method Summary collapse

Instance Method Details

#clean_headings(string) ⇒ Object

following added by me



108
109
110
111
112
113
114
115
# File 'lib/coradoc/input/html/cleaner.rb', line 108

# Cleans up heading markup in +string+. The string is modified in place
# (via +gsub!+) and the same object is returned.
#
# @param string [String] mutable HTML source
# @return [String] the same string object, after clean-up
def clean_headings(string)
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  # Headings with no content at all are replaced by a single space.
  # (Original author's note: unclear why Libre Office inserts them,
  # but they need to go.)
  string.gsub!(empty_heading, " ")

  # Headings styled "vertical-align: super" are rewritten as <sup> elements.
  # (Original author's note: unclear why Libre Office renders superscripts
  # as headings.)
  string.gsub!(superscript_heading, "<sup>\\2</sup>")

  string
end

#clean_punctuation_characters(string) ⇒ Object



89
90
91
# File 'lib/coradoc/input/html/cleaner.rb', line 89

# Removes the single whitespace character that sits between a `**`, `~~`
# or `__` marker and a directly following `.`, `!`, `?`, `'` or `"`.
#
# @param string [String] text to clean
# @return [String] a new string with the whitespace removed
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/) do
    marker = Regexp.last_match(1)
    punctuation = Regexp.last_match(2)
    "#{marker}#{punctuation}"
  end
end

#clean_tag_borders(string) ⇒ Object

Find content that is enclosed by two or more tildes and ensure that only one whitespace occurs in the border area. Same for square brackets. (The equivalent handling for asterisks and underscores is currently commented out in the method body.)



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/coradoc/input/html/cleaner.rb', line 62

# Tidies the borders of tilde-delimited spans and of square-bracketed spans.
#
# Each match is handed to +preserve_border_whitespaces+ together with a block
# that returns the match stripped, with the space just inside the opening and
# closing delimiter removed. Only the tilde pass supplies the configured
# +tag_border+ as +default_border+.
#
# @param string [String] text to clean
# @return [String] a new string with the borders tidied
def clean_tag_borders(string)
  # The asterisk and underscore variants below are disabled; they are kept
  # for reference only.
  #
  # result = string.gsub(/\s?\*{2,}.*?\*{2,}\s?/) do |match|
  # preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #   match.strip.sub("** ", "**").sub(" **", "**")
  # end
  # end
  #
  # result = string.gsub(/\s?_{2,}.*?_{2,}\s?/) do |match|
  #   preserve_border_whitespaces(match, default_border: Coradoc::Input::HTML.config.tag_border) do
  #     match.strip.sub("__ ", "__").sub(" __", "__")
  #   end
  # end

  # Pass 1: spans enclosed by two or more tildes.
  tildes_cleaned = string.gsub(/\s?~{2,}.*?~{2,}\s?/) do |span|
    border = Coradoc::Input::HTML.config.tag_border
    preserve_border_whitespaces(span, default_border: border) do
      trimmed = span.strip
      trimmed.sub("~~ ", "~~").sub(" ~~", "~~")
    end
  end

  # Pass 2: spans enclosed by square brackets.
  tildes_cleaned.gsub(/\s?\[.*?\]\s?/) do |span|
    preserve_border_whitespaces(span) do
      trimmed = span.strip
      trimmed.sub("[ ", "[").sub(" ]", "]")
    end
  end
end

#preprocess_word_html(string) ⇒ Object

Preprocesses the HTML input (scrubbing whitespace and cleaning headings on a copy of the string), rather than postprocessing it.



94
95
96
# File 'lib/coradoc/input/html/cleaner.rb', line 94

# Runs the HTML preprocessing steps on a copy of +string+: whitespace
# scrubbing first, then heading clean-up. The argument itself is not modified
# because the work is done on a +dup+.
#
# @param string [String] raw HTML
# @return [String] the preprocessed copy
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end

#remove_block_leading_newlines(string) ⇒ Object



27
28
29
# File 'lib/coradoc/input/html/cleaner.rb', line 27

# Drops the blank line that directly follows a `****` line when that line
# itself comes right after a line ending in `]`.
#
# @param string [String] text to clean
# @return [String] a new string with the extra newline removed
def remove_block_leading_newlines(string)
  padded_opening = "]\n****\n\n"
  tight_opening = "]\n****\n"
  string.gsub(padded_opening, tight_opening)
end

#remove_inner_whitespaces(string) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/coradoc/input/html/cleaner.rb', line 43

# Normalises whitespace around `stem:[...]` macros and inside every line.
#
# The three stem substitutions are applied in place (+gsub!+), so the
# argument is mutated. Afterwards each line is passed to
# +preserve_border_whitespaces+ with a block that strips the line and
# collapses every run of two or more spaces/tabs into one space; the results
# are concatenated into a new string.
#
# @param string [String, nil] text to clean; +nil+ yields an empty string
# @return [String] a newly built string
def remove_inner_whitespaces(string)
  # The original only guarded the gsub! calls against nil and then called
  # each_line unconditionally, which raised NoMethodError for nil input.
  return +"" if string.nil?

  # Remove the single space between a newline and a following `stem:[`.
  string.gsub!(/\n stem:\[/, "\nstem:[")
  # A stem macro followed by a newline and non-whitespace: join with a space.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  # Drop whitespace between a stem macro and a following `^` or `-`.
  string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  result = +""
  string.each_line do |line|
    result << preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
  end
  result
end

#remove_leading_newlines(string) ⇒ Object



39
40
41
# File 'lib/coradoc/input/html/cleaner.rb', line 39

# Strips every newline found at the very start of +string+.
#
# @param string [String] text to clean
# @return [String] a new string without leading newlines
def remove_leading_newlines(string)
  # \A anchors to the start of the string, so at most one match can occur.
  string.sub(/\A\n+/, "")
end

#remove_newlines(string) ⇒ Object



35
36
37
# File 'lib/coradoc/input/html/cleaner.rb', line 35

# Collapses every run of three or more consecutive newlines down to exactly
# two.
#
# @param string [String] text to clean
# @return [String] a new string with the runs collapsed
def remove_newlines(string)
  excess_newlines = /\n{3,}/
  string.gsub(excess_newlines) { "\n\n" }
end

#remove_section_attribute_newlines(string) ⇒ Object



31
32
33
# File 'lib/coradoc/input/html/cleaner.rb', line 31

# Removes the blank line between a line ending in `]` and a following line
# that starts with `==`.
#
# @param string [String] text to clean
# @return [String] a new string with the blank line removed
def remove_section_attribute_newlines(string)
  separated = "]\n\n=="
  adjacent = "]\n=="
  string.gsub(separated, adjacent)
end

#scrub_whitespace(string) ⇒ Object



98
99
100
101
102
103
104
105
# File 'lib/coradoc/input/html/cleaner.rb', line 98

# Normalises whitespace in an HTML string.
#
# The first substitution mutates the argument in place. The remaining ones
# operate on whatever +Coradoc.strip_unicode+ returns, and that object is
# the return value.
#
# @param string [String] mutable HTML source
# @return [String] the scrubbed string
def scrub_whitespace(string)
  # HTML encoded spaces: `&nbsp;`, `&#xA0;` (any letter case) and the literal
  # U+00A0 character are all rewritten as `&#xA0;`.
  string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;")

  # Per the original comment, this strips document-level leading and trailing
  # whitespace (helper defined outside this file section).
  scrubbed = Coradoc.strip_unicode(string)

  # Line-trailing spaces: each run is reduced to a single space.
  scrubbed.gsub!(/( +)$/, " ")
  # Quadruple line breaks become double line breaks.
  scrubbed.gsub!(/\n\n\n\n/, "\n\n")

  # Disabled in the original:
  # string.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs
  scrubbed
end

#tidy(string) ⇒ Object



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/coradoc/input/html/cleaner.rb', line 3

# Runs the full clean-up pipeline over +string+.
#
# A Hash argument is handled recursively: every value is tidied and a new
# Hash with the same keys is returned. Any other argument is copied with
# +String.new+ and passed through the individual cleaning steps in order;
# the first five steps are wrapped in +HtmlConverter.track_time+.
#
# @param string [String, Hash] text to tidy, or a Hash of such values
# @return [String, Hash] the tidied text, or a Hash of tidied values
def tidy(string)
  return string.transform_values { |value| tidy(value) } if string.is_a?(Hash)

  cleaned = HtmlConverter.track_time("Removing inner whitespace") do
    remove_inner_whitespaces(String.new(string))
  end
  cleaned = HtmlConverter.track_time("Removing newlines") do
    remove_newlines(cleaned)
  end
  cleaned = HtmlConverter.track_time("Removing leading newlines") do
    remove_leading_newlines(cleaned)
  end
  cleaned = HtmlConverter.track_time("Cleaning tag borders") do
    clean_tag_borders(cleaned)
  end
  cleaned = HtmlConverter.track_time("Cleaning punctuation characters") do
    clean_punctuation_characters(cleaned)
  end

  # The last two steps are not timed.
  cleaned = remove_block_leading_newlines(cleaned)
  remove_section_attribute_newlines(cleaned)
end