Class: Owasp::Esapi::Codec::HtmlCodec
- Defined in:
- lib/codec/html_codec.rb
Constant Summary collapse
- REPLACEMENT_HEX =
Replacement const hex
"fffd"
- REPLACEMENT_CHAR =
Replacement const char
'\ufffd'
- ENTITY_MAP =
Map of entities to numeric codes
{ 'Aacute' => 193, 'aacute' => 225, 'Acirc' => 194, 'acirc' => 226, 'acute' => 180, 'AElig' => 198, 'aelig' => 230, 'Agrave' => 192, 'agrave' => 224, 'alefsym' => 8501, 'Alpha' => 913, 'alpha' => 945, 'amp' => 38, 'and' => 8743, 'ang' => 8736, 'Aring' => 197, 'aring' => 229, 'asymp' => 8776, 'Atilde' => 195, 'atilde' => 227, 'Auml' => 196, 'auml' => 228, 'bdquo' => 8222, 'Beta' => 914, 'beta' => 946, 'brvbar' => 166, 'bull' => 8226, 'cap' => 8745, 'Ccedil' => 199, 'ccedil' => 231, 'cedil' => 184, 'cent' => 162, 'Chi' => 935, 'chi' => 967, 'circ' => 710, 'clubs' => 9827, 'cong' => 8773, 'copy' => 169, 'crarr' => 8629, 'cup' => 8746, 'curren' => 164, 'Dagger' => 8225, 'dagger' => 8224, 'dArr' => 8659, 'darr' => 8595, 'deg' => 176, 'Delta' => 916, 'delta' => 948, 'diams' => 9830, 'divide' => 247, 'Eacute' => 201, 'eacute' => 233, 'Ecirc' => 202, 'ecirc' => 234, 'Egrave' => 200, 'egrave' => 232, 'empty' => 8709, 'emsp' => 8195, 'ensp' => 8194, 'Epsilon' => 917, 'epsilon' => 949, 'equiv' => 8801, 'Eta' => 919, 'eta' => 951, 'ETH' => 208, 'eth' => 240, 'Euml' => 203, 'euml' => 235, 'euro' => 8364, 'exist' => 8707, 'fnof' => 402, 'forall' => 8704, 'frac12' => 189, 'frac14' => 188, 'frac34' => 190, 'frasl' => 8260, 'Gamma' => 915, 'gamma' => 947, 'ge' => 8805, 'gt' => 62, 'hArr' => 8660, 'harr' => 8596, 'hearts' => 9829, 'hellip' => 8230, 'Iacute' => 205, 'iacute' => 237, 'Icirc' => 206, 'icirc' => 238, 'iexcl' => 161, 'Igrave' => 204, 'igrave' => 236, 'image' => 8465, 'infin' => 8734, 'int' => 8747, 'Iota' => 921, 'iota' => 953, 'iquest' => 191, 'isin' => 8712, 'Iuml' => 207, 'iuml' => 239, 'Kappa' => 922, 'kappa' => 954, 'Lambda' => 923, 'lambda' => 955, 'lang' => 9001, 'laquo' => 171, 'lArr' => 8656, 'larr' => 8592, 'lceil' => 8968, 'ldquo' => 8220, 'le' => 8804, 'lfloor' => 8970, 'lowast' => 8727, 'loz' => 9674, 'lrm' => 8206, 'lsaquo' => 8249, 'lsquo' => 8216, 'lt' => 60, 'macr' => 175, 'mdash' => 8212, 'micro' => 181, 'middot' => 183, 'minus' => 8722, 'Mu' => 924, 'mu' => 956, 'nabla' => 8711, 'nbsp' => 160, 'ndash' => 8211, 'ne' => 8800, 'ni' => 8715, 'not' => 172, 'notin' => 8713, 'nsub' => 8836, 'Ntilde' => 209, 'ntilde' => 241, 'Nu' => 925, 'nu' => 957, 'Oacute' => 211, 'oacute' => 243, 'Ocirc' => 212, 'ocirc' => 244, 'OElig' => 338, 'oelig' => 339, 'Ograve' => 210, 'ograve' => 242, 'oline' => 8254, 'Omega' => 937, 'omega' => 969, 'Omicron' => 927, 'omicron' => 959, 'oplus' => 8853, 'or' => 8744, 'ordf' => 170, 'ordm' => 186, 'Oslash' => 216, 'oslash' => 248, 'Otilde' => 213, 'otilde' => 245, 'otimes' => 8855, 'Ouml' => 214, 'ouml' => 246, 'para' => 182, 'part' => 8706, 'permil' => 8240, 'perp' => 8869, 'Phi' => 934, 'phi' => 966, 'Pi' => 928, 'pi' => 960, 'piv' => 982, 'plusmn' => 177, 'pound' => 163, 'Prime' => 8243, 'prime' => 8242, 'prod' => 8719, 'prop' => 8733, 'Psi' => 936, 'psi' => 968, 'quot' => 34, 'radic' => 8730, 'rang' => 9002, 'raquo' => 187, 'rArr' => 8658, 'rarr' => 8594, 'rceil' => 8969, 'rdquo' => 8221, 'real' => 8476, 'reg' => 174, 'rfloor' => 8971, 'Rho' => 929, 'rho' => 961, 'rlm' => 8207, 'rsaquo' => 8250, 'rsquo' => 8217, 'sbquo' => 8218, 'Scaron' => 352, 'scaron' => 353, 'sdot' => 8901, 'sect' => 167, 'shy' => 173, 'Sigma' => 931, 'sigma' => 963, 'sigmaf' => 962, 'sim' => 8764, 'spades' => 9824, 'sub' => 8834, 'sube' => 8838, 'sum' => 8721, 'sup' => 8835, 'sup1' => 185, 'sup2' => 178, 'sup3' => 179, 'supe' => 8839, 'szlig' => 223, 'Tau' => 932, 'tau' => 964, 'there4' => 8756, 'Theta' => 920, 'theta' => 952, 'thetasym' => 977, 'thinsp' => 8201, 'THORN' => 222, 'thorn' => 254, 'tilde' => 732, 'times' => 215, 'trade' => 8482, 'Uacute' => 218, 'uacute' => 250, 'uArr' => 8657, 'uarr' => 8593, 'Ucirc' => 219, 'ucirc' => 251, 'Ugrave' => 217, 'ugrave' => 249, 'uml' => 168, 'upsih' => 978, 'Upsilon' => 933, 'upsilon' => 965, 'Uuml' => 220, 'uuml' => 252, 'weierp' => 8472, 'Xi' => 926, 'xi' => 958, 'Yacute' => 221, 'yacute' => 253, 'yen' => 165, 'Yuml' => 376, 'yuml' => 255, 'Zeta' => 918, 'zeta' => 950, 'zwj' => 8205, 'zwnj' => 8204 }
Constants inherited from BaseCodec
BaseCodec::END_CODE_POINT, BaseCodec::START_CODE_POINT
Instance Method Summary collapse
-
#decode_char(input) ⇒ Object
Returns the decoded version of the character starting at index, or nil if no decoding is possible.
-
#encode_char(immune, input) ⇒ Object
Encodes a Character for safe use in an HTML entity field.
-
#initialize ⇒ HtmlCodec
constructor
A new instance of HtmlCodec.
-
#named_entity(input) ⇒ Object
check to see if the input is a named entity.
-
#numeric_entity(input) ⇒ Object
check to see if the input is a numeric entity.
-
#parse_hex(input) ⇒ Object
parse a hex value in the stream.
-
#parse_number(input) ⇒ Object
parse a number int he stream.
Methods inherited from BaseCodec
Constructor Details
#initialize ⇒ HtmlCodec
Returns a new instance of HtmlCodec.
6 7 8 9 10 11 12 13 14 15 |
# File 'lib/codec/html_codec.rb', line 6 def initialize @longest_key = 0 @lookup_map = {} ENTITY_MAP.each_key do |k| if k.size > @longest_key @longest_key += 1 end @lookup_map[k.downcase] = k end end |
Instance Method Details
#decode_char(input) ⇒ Object
Returns the decoded version of the character starting at index, or nil if no decoding is possible. Formats all are legal both with and without semi-colon, upper/lower case:
-
&#dddd;
-
&#xhhhh;
-
&name;
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/codec/html_codec.rb', line 43 def decode_char(input) # mark the input input.mark first = input.next if first.nil? input.reset return nil end # this isnt an encoded char if first != '&' input.reset return nil end # test for numeric encodings second = input.next if second.nil? input.reset return nil end if second == '#' c = numeric_entity(input) return c unless c.nil? elsif second =~ /[a-zA-Z]/ input.push(second) c = named_entity(input) return c unless c.nil? end input.reset return nil end |
#encode_char(immune, input) ⇒ Object
Encodes a Character for safe use in an HTML entity field.
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/codec/html_codec.rb', line 18 def encode_char(immune, input) c = input return input if immune.include?(input) # check for alpha numeric hex = hex(input) return input if hex.nil? # check to see if we need to replace an entity if ( c.ord <= 0x1f and c != '\t' and c != '\n' and c != '\r' ) || ( c.ord >= 0x7f and c.ord <= 0x9f ) hex = REPLACEMENT_HEX c = REPLACEMENT_CHAR end # find the entity name if its possible ENTITY_MAP.each_pair do |k,v| return "&#{k};" if v == c.ord end #encode as a hex value "&#x#{hex};" end |
#named_entity(input) ⇒ Object
check to see if the input is a named entity
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/codec/html_codec.rb', line 88 def named_entity(input)#:nodoc: possible = '' len = min(input.remainder.size,@longest_key) if input.peek?("&") input.next end found_key = false last_possible = '' for i in 0..len do possible << input.next if input.next? # we have to find the longest match # so we dont find sub values if @lookup_map[possible.downcase] last_possible = @lookup_map[possible.downcase] end end # no matches found return return nil if last_possible.empty? # reset the input and plow through input.reset for i in 0..last_possible.size input.next end possible = ENTITY_MAP[last_possible] input.next if input.peek?(';') possible.chr(Encoding::UTF_8) end |
#numeric_entity(input) ⇒ Object
check to see if the input is a numeric entity
77 78 79 80 81 82 83 84 85 |
# File 'lib/codec/html_codec.rb', line 77 def numeric_entity(input) #:nodoc: first = input.peek return nil if first.nil? if first.downcase.eql?("x") input.next return parse_hex(input) end return parse_number(input) end |
#parse_hex(input) ⇒ Object
parse a hex value in the stream
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/codec/html_codec.rb', line 139 def parse_hex(input)#:nodoc: result = '' while input.next? c = input.peek if "0123456789ABCDEFabcdef".include?(c) result << c input.next elsif c == ";" input.next break else break end end begin i = result.hex return i.chr(Encoding::UTF_8) if i >= START_CODE_POINT and i <= END_CODE_POINT rescue Exception => e end nil end |
#parse_number(input) ⇒ Object
parse a number int he stream
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/codec/html_codec.rb', line 116 def parse_number(input)#:nodoc: result = '' while input.next? c = input.peek if c =~ /\d/ result << c input.next elsif c == ';' input.next break; else break; end end begin i = result.to_i return i.chr(Encoding::UTF_8) if i >= START_CODE_POINT and i <= END_CODE_POINT rescue Exception => e end nil end |