Class: Unicode::Data::Validate

Inherits:
Object
  • Object
show all
Defined in:
lib/unicode/data/validate.rb

Defined Under Namespace

Modules: Mode

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(logger: Logger.new(STDOUT), mode: ENV.fetch("MODE", "first")) ⇒ Validate

Returns a new instance of Validate.



33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/unicode/data/validate.rb', line 33

# Builds a validator and preloads the list of surrogate values that
# #validate must skip.
#
# @param logger [Logger] receives the progress messages written by #validate
# @param mode [String] one of "first", "sample", or "full"; defaults to the
#   MODE environment variable, or "first" when that is unset
# @raise [ArgumentError] if mode is not one of the recognized names
def initialize(logger: Logger.new(STDOUT), mode: ENV.fetch("MODE", "first"))
  @logger = logger
  @mode =
    case mode
    when "first"  then Mode::First.new
    when "sample" then Mode::Sample.new
    when "full"   then Mode::Full.new
    else
      raise ArgumentError, "invalid mode: #{mode}"
    end

  # Start from an empty list so that #surrogates is always an Array, even if
  # derived.txt has no General_Category=Surrogate entry. Without this the
  # attribute would be nil and #validate would fail on surrogates.include?.
  @surrogates = []

  # This is a list of all of the surrogate characters that exist so that
  # we can skip them when validating since they're not valid in UTF-8.
  File.foreach(File.join(__dir__, "derived.txt"), chomp: true) do |line|
    property, values = line.split(" ", 2)

    if property.start_with?("\\p{General_Category=Surrogate}")
      # Always expand with Mode::Full, regardless of the configured mode, so
      # that the complete surrogate list is collected.
      @surrogates = each_value(values, Mode::Full.new).to_a
      break
    end
  end
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



31
32
33
# File 'lib/unicode/data/validate.rb', line 31

# Returns the logger that was passed to the constructor. #validate writes
# its "Validating ..." and "Skipping ..." progress messages to it.
def logger
  @logger
end

#mode ⇒ Object (readonly)

Returns the value of attribute mode.



31
32
33
# File 'lib/unicode/data/validate.rb', line 31

# Returns the mode object selected in the constructor (an instance of
# Mode::First, Mode::Sample, or Mode::Full). #validate hands it to
# each_value for every property it checks.
def mode
  @mode
end

#surrogates ⇒ Object (readonly)

Returns the value of attribute surrogates.



31
32
33
# File 'lib/unicode/data/validate.rb', line 31

# Returns the values collected in the constructor from the
# General_Category=Surrogate entry of derived.txt. #validate skips any value
# that appears in this list.
def surrogates
  @surrogates
end

Class Method Details

.call ⇒ Object



101
102
103
# File 'lib/unicode/data/validate.rb', line 101

# Convenience entry point: builds an instance with the constructor's default
# logger and mode, then runs #validate on it and returns its result.
def self.call
  new.validate
end

Instance Method Details

#validate ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/unicode/data/validate.rb', line 56

# Walks every entry of derived.txt and checks that each value listed for a
# property is matched by the equivalent native Ruby regular expression.
#
# Properties that Ruby cannot compile natively are logged and skipped.
# Raises a RuntimeError for the first value that the native pattern rejects.
def validate
  derived_path = File.join(__dir__, "derived.txt")

  File.foreach(derived_path, chomp: true) do |entry|
    name, values = entry.split(/\s+/, 2)

    # Rewrite the property into the spelling Ruby understands:
    #   * General_Category= and Script= prefixes are dropped entirely,
    #   * Block=Name becomes In_Name, since Ruby has no Block= syntax,
    #   * boolean properties are queried by bare name, without =Yes etc.
    property =
      name
        .gsub(/(General_Category|Script)=/, "")
        .gsub(/Block=/, "In_")
        .gsub(/=(Yes|Y|True|T)/, "")

    # A nil pattern means "cannot be validated natively". That covers both
    # properties Ruby rejects (block aliases, script extensions, age
    # aliases, ...) and In_CJK_Symbols, which is skipped explicitly because
    # it fails on CI, probably due to a Ruby version mismatch.
    pattern =
      begin
        /#{property}/ unless property == "\\p{In_CJK_Symbols}"
      rescue RegexpError
        nil
      end

    if pattern.nil?
      logger.warn("Skipping   #{property}")
      next
    end

    logger.info("Validating #{property}")

    each_value(values, mode) do |codepoint|
      next if surrogates.include?(codepoint)
      next if pattern.match?([codepoint].pack("U"))

      raise "Expected #{codepoint} to match #{property}"
    end
  end
end