Class: Mechanize::Util

Inherits:
Object
  • Object
show all
Defined in:
lib/mechanize/util.rb

Constant Summary collapse

# Charset names keyed by symbol. detect_charset looks a name up here after
# finding the NKF constant that matches NKF.guess's result (pre-1.9 branch).
CODE_DIC = {
  :JIS   => "ISO-2022-JP",
  :EUC   => "EUC-JP",
  :SJIS  => "SHIFT_JIS",
  :UTF8  => "UTF-8",
  :UTF16 => "UTF-16",
  :UTF32 => "UTF-32"
}
# Replacement charset names applied by detect_charset: when the detected
# encoding name is a key here, the mapped value is returned instead.
NKF_TO_ICONV = {
  'ASCII-8BIT' => 'CP1252',
  'SHIFT_JIS'  => 'CP932'
}

Class Method Summary collapse

Class Method Details

.build_query_string(parameters, enc = nil) ⇒ Object



17
18
19
20
21
22
# File 'lib/mechanize/util.rb', line 17

# Builds a URL query string from +parameters+.
#
# +parameters+ is iterated as key/value pairs (for example a Hash or an
# array of two-element arrays). Each key and value is converted with +to_s+,
# escaped with CGI.escape and joined with "="; the pairs are joined with "&".
# Pairs whose key is nil or false are skipped.
#
# +enc+ is accepted but is not used by this method.
def build_query_string(parameters, enc=nil)
  pairs = []
  parameters.each do |key, value|
    next unless key
    # CGI.escape is used here because WEBrick::HTTP.escape* has m17n
    # problems on ruby-1.9.*.
    pairs << [key, value].map { |part| CGI.escape(part.to_s) }.join("=")
  end
  pairs.join('&')
end

.detect_charset(src) ⇒ Object



57
58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/mechanize/util.rb', line 57

# Guesses the character encoding of +src+ with NKF.guess and returns its name.
#
# When +src+ is nil, the string "<html></html>" is passed to NKF.guess instead.
# On Ruby >= 1.9.0 the guess is converted with +to_s+ and upcased; on older
# Rubies the NKF constant equal to the guess is looked up and translated
# through CODE_DIC. A name listed in NKF_TO_ICONV is replaced by its mapped
# value, and "CP1252" is returned when no name was obtained.
def detect_charset(src)
  guessed = NKF.guess(src || "<html></html>")
  enc =
    if RUBY_VERSION >= "1.9.0"
      guessed.to_s.upcase
    else
      const_name = NKF.constants.find { |c| NKF.const_get(c) == guessed }
      CODE_DIC[const_name.intern]
    end
  NKF_TO_ICONV[enc] || enc || "CP1252"
end

.from_native_charset(s, code) ⇒ Object



34
35
36
37
38
39
40
41
# File 'lib/mechanize/util.rb', line 34

# Converts +s+ from UTF-8 to the charset named by +code+.
#
# The conversion only happens when Mechanize.html_parser is Nokogiri::HTML;
# with any other parser +s+ is returned unchanged. Returns nil when the
# Nokogiri parser is active and +s+ is nil or false.
def from_native_charset(s, code)
  return s unless Mechanize.html_parser == Nokogiri::HTML
  return nil unless s

  converted = Iconv.iconv(code, "UTF-8", s)
  converted.join("")
end

.html_unescape(s) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/mechanize/util.rb', line 43

# Decodes HTML character references in +s+ and returns the resulting string.
#
# Three forms are recognized:
# * named references (<tt>&amp;name;</tt>), resolved through
#   <tt>Mechanize.html_parser::NamedCharacters</tt>
# * decimal numeric references (<tt>&amp;#65;</tt>)
# * hexadecimal numeric references (<tt>&amp;#x41;</tt> / <tt>&amp;#X41;</tt>)
#
# A reference that cannot be resolved (unknown name, or a code point that
# cannot be packed) is left in the output exactly as it appeared.
# Returns +s+ itself when it is nil or false.
def html_unescape(s)
  return s unless s
  s.gsub(/&(\w+|#[0-9]+|#[xX][0-9a-fA-F]+);/) { |match|
    # +match+ is exactly one reference, so only one of these patterns applies.
    number = case match
             when /&#[xX]([0-9a-fA-F]+);/
               $1.to_i(16)
             when /&#([0-9]+);/
               $1.to_i
             when /&(\w+);/
               Mechanize.html_parser::NamedCharacters[$1]
             end

    if number
      begin
        # 'U' packs the code point as a UTF-8 encoded character.
        [number].pack('U')
      rescue StandardError
        # Keep the original text when the value cannot be packed.
        match
      end
    else
      match
    end
  }
end

.to_native_charset(s, code = nil) ⇒ Object



24
25
26
27
28
29
30
31
32
# File 'lib/mechanize/util.rb', line 24

# Converts +s+ from the charset named by +code+ to UTF-8.
#
# The conversion only happens when Mechanize.html_parser is Nokogiri::HTML;
# with any other parser +s+ is returned unchanged. When +code+ is not given,
# the charset is obtained from detect_charset(s). Returns nil when the
# Nokogiri parser is active and +s+ is nil or false.
def to_native_charset(s, code=nil)
  return s unless Mechanize.html_parser == Nokogiri::HTML
  return nil unless s

  source_charset = code || detect_charset(s)
  Iconv.iconv("UTF-8", source_charset, s).join("")
end