Class: SiteFuel::Processor::HTMLProcessor

Inherits:
AbstractStringBasedProcessor show all
Defined in:
lib/sitefuel/processors/HTMLProcessor.rb

Direct Known Subclasses

PHPProcessor, RHTMLProcessor

Constant Summary collapse

SINGLE_QUOTE_OPEN =

quotes

'‘'.freeze
SINGLE_QUOTE_CLOSE =
'’'.freeze
DOUBLE_QUOTE_OPEN =
'“'.freeze
DOUBLE_QUOTE_CLOSE =
'”'.freeze
EN_DASH =

dashes

'–'.freeze
EM_DASH =
'—'.freeze
ELLIPSIS =

signs

'…'.freeze
COPYRIGHT =
'©'.freeze
TRADEMARK =
'™'.freeze
REGISTERED =
'®'.freeze
ARROW_LEFTWARD =

arrows

'←'.freeze
ARROW_RIGHTWARD =
'→'.freeze
ARROW_LEFTRIGHT =
'↔'.freeze
ARROW_DOUBLE_LEFTWARD =
'⇐'.freeze
ARROW_DOUBLE_RIGHTWARD =
'⇒'.freeze
ARROW_DOUBLE_LEFTRIGHT =
'⇔'.freeze
MULTIPLICATION_SIGN =

math operators

'×'.freeze
TEXTUAL_TAGS =

list of tags which have proper text items inside them

['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'p', 'b', 'i', 'ul', 'a', 'li', 'td',
'th'].freeze
TEXTUAL_TAGS_FILTER =

filter for use with XPath searches

TEXTUAL_TAGS.join('|').freeze

Instance Attribute Summary

Attributes inherited from AbstractStringBasedProcessor

#document

Attributes inherited from AbstractProcessor

#execution_list, #original_size, #processed_size, #resource_name

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from AbstractStringBasedProcessor

filter_string, #generate, #generate_string, #open_file, #open_string, process_file, process_string, #processor_symbol, processor_type, #save

Methods inherited from AbstractProcessor

#add_filter, #add_filterset, #clear_filters, #create_file, #drop_filter, #execute, file_pattern_match?, #filter?, filter?, filters, filters_in_filterset, filterset?, filterset_ignore, filtersets, find_processors, #initialize, processes_file?, processor_name, #processor_symbol, processor_type, #run_filter, #run_filterset, #save

Methods included from ClassLogging

#debug, #error, #fatal, #info, #warn

Methods included from Configurable

#configuration_options, #configure, #ensure_configurable_option, #post_configuration, #pre_configuration, #set_configuration

Methods included from Logging

#debug, #error, #fatal, #info, #logger=, #warn

Constructor Details

This class inherits a constructor from SiteFuel::Processor::AbstractProcessor

Class Method Details

.default_filterset ⇒ Object



72
73
74
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 72

# Returns the name of the filterset this processor treats as its default.
#
# Returns the Symbol +:minify+.
def self.default_filterset
  :minify
end

.file_patterns ⇒ Object

gives the file patterns which this processor will match



65
66
67
68
69
70
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 65

# Gives the file patterns which this processor will match.
#
# Returns a new Array of file-name suffix Strings on every call.
def self.file_patterns
  # plain html
  %w[.html .htm]
end

.filterset_beautify ⇒ Object



80
81
82
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 80

# Lists, in order, the filters that make up the beautify filterset.
#
# Note that +:beautify_math+ is not part of this list.
#
# Returns an Array of filter-name Symbols.
def self.filterset_beautify
  [
    :beautify_quotes,
    :beautify_dashes,
    :beautify_arrows,
    :beautify_symbols
  ]
end

.filterset_minify ⇒ Object



76
77
78
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 76

# Lists, in order, the filters that make up the minify filterset.
#
# Returns an Array of filter-name Symbols.
def self.filterset_minify
  [
    :whitespace,
    :minify_javascript,
    :minify_styles
  ]
end

Instance Method Details

#filter_beautify_arrows ⇒ Object

convert basic arrow forms to unicode characters



180
181
182
183
184
185
186
187
188
189
190
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 180

# Converts basic ASCII arrow forms found in textual tags to their unicode
# characters:
#
#   -->  rightward arrow          ==>  double rightward arrow
#   <--  leftward arrow           <==  double leftward arrow
#   <->  left-right arrow         <=>  double left-right arrow
#
# An arrow is only converted when it is delimited on both sides by
# whitespace, a word boundary, or the start/end of the text node, so
# something like +a->b+ is left alone.
def filter_beautify_arrows
  # ASCII shorthand => replacement, applied one after another in this order
  arrows = [
    ['-->', ARROW_RIGHTWARD],
    ['<--', ARROW_LEFTWARD],
    ['<->', ARROW_LEFTRIGHT],
    ['==>', ARROW_DOUBLE_RIGHTWARD],
    ['<==', ARROW_DOUBLE_LEFTWARD],
    ['<=>', ARROW_DOUBLE_LEFTRIGHT]
  ]

  # Build every pattern once instead of once per text node. \A and \z are
  # accepted as delimiters so that an arrow sitting at the very beginning or
  # end of a text node (where neither \s nor \b can match) is converted too.
  rules = arrows.map do |ascii, glyph|
    [/(\A|\s|\b)#{Regexp.escape(ascii)}(\s|\b|\z)/, "\\1#{glyph}\\2"]
  end

  traverse do |_tag, txt|
    txt.content = rules.inject(txt.content) do |text, (pattern, replacement)|
      text.gsub(pattern, replacement)
    end
  end
end

#filter_beautify_dashes ⇒ Object

cleans up the various dash forms: <pre>12--13 => 12&#8211;13</pre> <pre>the car---it was red---was destroyed => ...&#8212;it was red&#8212;...</pre>



162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 162

# Cleans up the various ASCII dash forms found in textual tags:
#
#   12--13                 => 12(en dash)13
#   the car---it was red   => the car(em dash)it was red
#
# The three rules below are applied in order to the content of every text
# node yielded by +traverse+.
def filter_beautify_dashes
  traverse do |_tag, txt|
    text = txt.content

    # Between two numbers we have an en dash. The digit after the dashes is
    # only looked at (lookahead), not consumed, so that it can also serve as
    # the left-hand digit of the next range: 1--2--3 converts completely.
    text = text.gsub(/(\d)--(?=\d)/, "\\1#{EN_DASH}")

    # We can also have multiple en dashes: a run of 4, 6, 8, ... dashes
    # directly after a word becomes one en dash per pair of dashes.
    text = text.gsub(/\b(--(--)+)(\b|\z|\s)/) do
      run = Regexp.last_match
      EN_DASH * (run[1].length / 2) + run[3]
    end

    # Three dashes in general are an em dash. \A and \z allow the dashes to
    # sit at the very start or end of the text node.
    text = text.gsub(/(\A|\s|\b)---(\s|\b|\z)/, "\\1#{EM_DASH}\\2")

    txt.content = text
  end
end

#filter_beautify_math ⇒ Object

converts ‘x’ signs between numbers into the unicode symbol



193
194
195
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 193

# Converts 'x' signs between numbers into the unicode multiplication sign,
# e.g. +3x4+ or +1024 x 768+. Whitespace around the 'x' is preserved.
#
# Only a lowercase 'x' with a number on both sides is converted. A lone
# zero immediately followed by 'x' and a digit (+0x1F+) is left untouched
# because it looks like a hexadecimal literal rather than a product.
def filter_beautify_math
  traverse do |_tag, txt|
    # The right-hand digit is matched with a lookahead so it can act as the
    # left-hand operand of a following product (2x3x4).
    txt.content = txt.content.gsub(/(\d+)(\s*)x(\s*)(?=\d)/) do
      product = Regexp.last_match
      left, before, after = product[1], product[2], product[3]

      if left == '0' && before.empty? && after.empty?
        product[0] # hexadecimal prefix: keep as written
      else
        "#{left}#{before}#{MULTIPLICATION_SIGN}#{after}"
      end
    end
  end
end

#filter_beautify_quotes ⇒ Object

cleans up double and single quotes in textual objects <pre>"hello world" => &#8220;hello world&#8221;</pre>



144
145
146
147
148
149
150
151
152
153
154
155
156
157
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 144

# Cleans up double and single quotes in textual tags, replacing the straight
# ASCII quote characters with typographic ones:
#
#   "hello world"  => (open double quote)hello world(close double quote)
#   the dog's bone => the dog(close single quote)s bone
#
# The rules are applied in a deliberate order:
#
# 1. possessive/contraction apostrophes followed by an "s" (dog's),
# 2. paired double quotes,
# 3. paired single quotes,
# 4. plural-possessive apostrophes (dogs' ).
#
# Rule 4 runs after the single-quote pairing: if it ran first it would claim
# the closing quote of any quoted phrase ending in "s" ('yes' or 'no') and
# leave the opening quote unmatched.
def filter_beautify_quotes
  traverse do |_tag, txt|
    text = txt.content

    # apostrophes in front of an "s"
    text = text.gsub(/(\S)'(s)/i, "\\1#{SINGLE_QUOTE_CLOSE}\\2")

    # double quotes; the first alternative covers single-character
    # quotations such as "B", which \S.*?\S alone can never match
    text = text.gsub(/"(\S|\S.*?\S)"/, "#{DOUBLE_QUOTE_OPEN}\\1#{DOUBLE_QUOTE_CLOSE}")

    # single quotes
    text = text.gsub(/'(\S|\S.*?\S)'/, "#{SINGLE_QUOTE_OPEN}\\1#{SINGLE_QUOTE_CLOSE}")

    # plural possessives; only apostrophes not paired above are left
    text = text.gsub(/(\Ss)'(\s)/i, "\\1#{SINGLE_QUOTE_CLOSE}\\2")

    txt.content = text
  end
end

#filter_beautify_symbols ⇒ Object

convert a few shorthands like (c), (tm) to their unicode symbols



198
199
200
201
202
203
204
205
206
207
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 198

# Converts a few ASCII shorthands found in textual tags to their unicode
# symbols:
#
#   (tm)  trademark sign       (matched case-insensitively)
#   (c)   copyright sign       (matched case-insensitively)
#   (r)   registered sign      (matched case-insensitively)
#   ...   horizontal ellipsis
#
# A fourth dot directly after an ellipsis is kept as a full stop.
def filter_beautify_symbols
  traverse do |_tag, txt|
    text = txt.content

    text = text.gsub(/\(tm\)/i, TRADEMARK)
    text = text.gsub(/\(c\)/i,  COPYRIGHT)
    text = text.gsub(/\(r\)/i,  REGISTERED)

    # The dots must follow a word, whitespace, or the start of the text
    # node; \A and \s (rather than a literal space) make sure an ellipsis
    # that opens a text node or follows a tab/newline is converted as well.
    text = text.gsub(/(\A|\s|\b)\.\.\.(\.)?/, "\\1#{ELLIPSIS}\\2")

    txt.content = text
  end
end

#filter_minify_javascript ⇒ Object

minifies embedded JavaScript code using the JavaScriptProcessor



122
123
124
125
126
127
128
129
130
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 122

# Minifies embedded JavaScript: the content of every text node found under
# a +script+ tag is replaced by whatever JavaScriptProcessor.process_string
# returns for it. The processor is handed this resource's name with
# '<embedded_JS>' appended as its +:resource_name+ option.
def filter_minify_javascript
  # TODO check the language attribute to make sure it's javascript
  traverse('script') do |_tag, script_text|
    source  = script_text.content
    options = { :resource_name => resource_name + '<embedded_JS>' }

    script_text.content = JavaScriptProcessor.process_string(source, options)
  end
end

#filter_minify_styles ⇒ Object

minifies embedded CSS styles using the CSSProcessor



133
134
135
136
137
138
139
140
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 133

# Minifies embedded CSS: the content of every text node found under a
# +style+ tag is replaced by whatever CSSProcessor.process_string returns
# for it. The processor is handed this resource's name with
# '<embedded_CSS>' appended as its +:resource_name+ option.
def filter_minify_styles
  traverse('style') do |_tag, style_text|
    stylesheet = style_text.content
    label      = resource_name + '<embedded_CSS>'

    style_text.content = CSSProcessor.process_string(stylesheet, :resource_name => label)
  end
end

#filter_whitespace ⇒ Object

strips excess whitespace in most HTML tags. Notably, pre tags are left alone.



111
112
113
114
115
116
117
118
119
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 111

# Strips excess whitespace in most HTML tags. Notably, +pre+ tags are left
# alone.
#
# For every text node of the parsed document:
# * a node consisting only of whitespace is emptied;
# * otherwise every run of whitespace (including newlines) collapses into a
#   single space;
# * nodes that have a +pre+ element among their ancestors are skipped.
def filter_whitespace
  @htmlstruc.traverse_text do |txt|
    # whitespace inside <pre> is significant, so leave those nodes untouched
    next if text_inside_pre?(txt)

    if /\A\s+\z/ =~ txt.content
      txt.content = ''
    else
      txt.content = txt.content.gsub(/\s+/m, ' ')
    end
  end
end

# Tells whether +node+ has a +pre+ element among its ancestors.
#
# Walks up through +parent+ and compares each ancestor's +pathname+ (the
# same accessor #traverse uses to name a tag). Both accessors are probed
# with respond_to? so that a node lacking them simply counts as "not inside
# pre".
def text_inside_pre?(node)
  ancestor = node.respond_to?(:parent) ? node.parent : nil

  while ancestor
    if ancestor.respond_to?(:pathname) && ancestor.pathname.to_s.downcase == 'pre'
      return true
    end

    ancestor = ancestor.respond_to?(:parent) ? ancestor.parent : nil
  end

  false
end
private :text_inside_pre?

#finish_filters ⇒ Object

after all the filters are run dump the HTML as a string and do a tiny bit of post processing



96
97
98
99
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 96

# After all the filters have run, dumps the parsed HTML structure back into
# a string, does a tiny bit of post processing and stores the result in
# @document.
def finish_filters
  html = @htmlstruc.to_s

  # do a last minute, ugly +br+ cleanup
  @document = html.gsub('<br />', '<br>')
end

#setup_filters ⇒ Object

before any filters are run parse the document with hpricot



90
91
92
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 90

# Before any filters are run, parses the document with Hpricot and keeps the
# parsed structure in @htmlstruc for the filters to work on.
def setup_filters
  source = document
  @htmlstruc = Hpricot.parse(source)
end

#traverse(patterns = TEXTUAL_TAGS_FILTER, &block) ⇒ Object



101
102
103
104
105
106
107
# File 'lib/sitefuel/processors/HTMLProcessor.rb', line 101

# Searches the parsed document for +patterns+ and calls the block once for
# every text yielded by +traverse_text+ on each matching element.
#
# patterns - search expression handed to <tt>@htmlstruc./</tt>; defaults to
#            TEXTUAL_TAGS_FILTER.
# block    - called with the matching element's +pathname+ and the text
#            object.
def traverse(patterns = TEXTUAL_TAGS_FILTER, &block)
  matches = @htmlstruc / patterns

  matches.each do |element|
    element.traverse_text { |text_node| block.call(element.pathname, text_node) }
  end
end