Class: SwissMatch::Location::DataFiles

Inherits:
Object
  • Object
show all
Defined in:
lib/swissmatch/location/datafiles.rb

Overview

TODO:

The current handling of the URLs is not clean. I don’t know yet how the URLs will change over iterations.

Deals with retrieving and updating the files provided by the Swiss postal service, and loading the data from them.

Constant Summary collapse

Expressions =

Regular expressions used to parse the different files.

{
  # Every entry is built as generate_expression(field count, field separator, line terminator).
  # The first three patterns are the ones handed to load_table (tab separated, \r\n terminated).
  :community    => generate_expression(4, '\t', '\r\n'),
  :zip_2        => generate_expression(6, '\t', '\r\n'),
  :zip_1        => generate_expression(13, '\t', '\r\n'),
  # The remaining two are applied to the districts_*.csv and communities_*.csv files
  # (comma separated, \n terminated).
  :districts    => generate_expression(3, ',', '\n'),
  :communities  => generate_expression(10, ',', '\n'),
}
URLZip1 =

The URL of the plz_p1 file

"https://match.post.ch/download?file=10001&tid=11&rol=0"
URLZip2 =

The URL of the plz_p2 file

"https://match.post.ch/download?file=10002&tid=14&rol=0"
URLCommunity =

The URL of the plz_c file

"https://match.post.ch/download?file=10003&tid=13&rol=0"
URLAll =

An array of all urls

[URLZip1, URLZip2, URLCommunity]
CantonData =

The data of all cantons

[
  # One row per canton. Columns, in the order load_cantons passes them to Canton.new:
  # tag, name, name_de, name_fr, name_it, name_rt
  ["AG", "Aargau",                    "Aargau",                   "Argovie",                      "Argovia",                  "Argovia"],
  ["AI", "Appenzell Innerrhoden",     "Appenzell Innerrhoden",    "Appenzell Rhodes-Intérieures", "Appenzello Interno",       "Appenzell Dadens"],
  ["AR", "Appenzell Ausserrhoden",    "Appenzell Ausserrhoden",   "Appenzell Rhodes-Extérieures", "Appenzello Esterno",       "Appenzell Dadora"],
  ["BE", "Bern",                      "Bern",                     "Berne",                        "Berna",                    "Berna"],
  ["BL", "Basel-Landschaft",          "Basel-Landschaft",         "Bâle-Campagne",                "Basilea Campagna",         "Basilea-Champagna"],
  ["BS", "Basel-Stadt",               "Basel-Stadt",              "Bâle-Ville",                   "Basilea Città",            "Basilea-Citad"],
  ["FR", "Freiburg",                  "Fribourg",                 "Fribourg",                     "Friburgo",                 "Friburg"],
  ["GE", "Genève",                    "Genf",                     "Genève",                       "Ginevra",                  "Genevra"],
  ["GL", "Glarus",                    "Glarus",                   "Glaris",                       "Glarona",                  "Glaruna"],
  ["GR", "Graubünden",                "Graubünden",               "Grisons",                      "Grigioni",                 "Grischun"],
  ["JU", "Jura",                      "Jura",                     "Jura",                         "Giura",                    "Giura"],
  ["LU", "Luzern",                    "Luzern",                   "Lucerne",                      "Lucerna",                  "Lucerna"],
  ["NE", "Neuchâtel",                 "Neuenburg",                "Neuchâtel",                    "Neuchâtel",                "Neuchâtel"],
  ["NW", "Nidwalden",                 "Nidwalden",                "Nidwald",                      "Nidvaldo",                 "Sutsilvania"],
  ["OW", "Obwalden",                  "Obwalden",                 "Obwald",                       "Obvaldo",                  "Sursilvania"],
  ["SG", "St. Gallen",                "St. Gallen",               "Saint-Gall",                   "San Gallo",                "Son Gagl"],
  ["SH", "Schaffhausen",              "Schaffhausen",             "Schaffhouse",                  "Sciaffusa",                "Schaffusa"],
  ["SO", "Solothurn",                 "Solothurn",                "Soleure",                      "Soletta",                  "Soloturn"],
  ["SZ", "Schwyz",                    "Schwyz",                   "Schwytz",                      "Svitto",                   "Sviz"],
  ["TG", "Thurgau",                   "Thurgau",                  "Thurgovie",                    "Turgovia",                 "Turgovia"],
  ["TI", "Ticino",                    "Tessin",                   "Tessin",                       "Ticino",                   "Tessin"],
  ["UR", "Uri",                       "Uri",                      "Uri",                          "Uri",                      "Uri"],
  ["VD", "Vaud",                      "Waadt",                    "Vaud",                         "Vaud",                     "Vad"],
  ["VS", "Valais",                    "Wallis",                   "Valais",                       "Vallese",                  "Vallais"],
  ["ZG", "Zug",                       "Zug",                      "Zoug",                         "Zugo",                     "Zug"],
  ["ZH", "Zürich",                    "Zürich",                   "Zurich",                       "Zurigo",                   "Turitg"],
  ["FL", "Fürstentum Liechtenstein",  "Fürstentum Liechtenstein", "Liechtenstein",                "Liechtenstein",            "Liechtenstein"],
  ["DE", "Deutschland",               "Deutschland",              "Allemagne",                    "Germania",                 "Germania"],
  ["IT", "Italien",                   "Italien",                  "Italie",                       "Italia",                   "Italia"],
]
LanguageCodes =

Used to convert numerical language codes to symbols

# Indexed by the integer language code read from the zip code files
# (load_zipcodes does LanguageCodes[field.to_i]); index 0 yields nil.
[nil, :de, :fr, :it, :rt]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(data_directory = nil) ⇒ DataFiles

Returns a new instance of DataFiles.

Parameters:

  • data_directory (nil, String) (defaults to: nil)

    The directory in which the post mat files reside



117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/swissmatch/location/datafiles.rb', line 117

# Creates a DataFiles instance and decides which directory the data files are
# read from. The first candidate that is set wins:
#
# 1. the +data_directory+ argument
# 2. the SWISSMATCH_DATA environment variable
# 3. data/swissmatch-location, resolved relative to this source file; when
#    that is not a directory and rubygems is loaded, the data directory
#    rubygems reports for 'swissmatch-location' is used instead
#
# The list of load errors is reset as part of construction.
#
# @param [nil, String] data_directory
#   The directory in which the data files reside
def initialize(data_directory=nil)
  reset_errors!
  @data_directory = data_directory || ENV['SWISSMATCH_DATA'] || begin
    bundled     = File.expand_path('../../../../data/swissmatch-location', __FILE__)
    use_gem_dir = defined?(Gem) && !File.directory?(bundled)
    use_gem_dir ? Gem.datadir('swissmatch-location') : bundled
  end
end

Instance Attribute Details

#cantonsSwissMatch::Cantons (readonly)

Returns The loaded swiss cantons.

Returns:



101
102
103
# File 'lib/swissmatch/location/datafiles.rb', line 101

# Reader for @cantons, which #load! fills from the first element returned by
# #load. Nil until #load! has been called.
#
# @return [SwissMatch::Cantons] The loaded swiss cantons.
def cantons
  @cantons
end

#communitiesSwissMatch::Communities (readonly)

Returns The loaded swiss communities.

Returns:



107
108
109
# File 'lib/swissmatch/location/datafiles.rb', line 107

# Reader for @communities, which #load! fills from the third element returned
# by #load. Nil until #load! has been called.
#
# @return [SwissMatch::Communities] The loaded swiss communities.
def communities
  @communities
end

#data_directoryObject

The directory in which the post mat files reside



98
99
100
# File 'lib/swissmatch/location/datafiles.rb', line 98

# Reader for @data_directory, the directory in which the load_* methods glob
# for their data files and into which #load_updates extracts downloads.
#
# @return [String] The directory in which the data files reside
def data_directory
  @data_directory
end

#districtsSwissMatch::Districts (readonly)

Returns The loaded swiss districts.

Returns:



104
105
106
# File 'lib/swissmatch/location/datafiles.rb', line 104

# Reader for @districts, which #load! fills from the second element returned
# by #load. Nil until #load! has been called.
#
# @return [SwissMatch::Districts] The loaded swiss districts.
def districts
  @districts
end

#errorsArray<LoadError> (readonly)

Returns Errors that occurred while loading the data.

Returns:

  • (Array<LoadError>)

    Errors that occurred while loading the data



113
114
115
# File 'lib/swissmatch/location/datafiles.rb', line 113

# Reader for @errors. The list is emptied by #reset_errors! (called from
# #initialize and #load) and appended to by #load_zipcodes when a zip code
# references a delivering zip code that could not be resolved.
#
# @return [Array<LoadError>] Errors that occurred while loading the data
def errors
  @errors
end

#zip_codesSwissMatch::ZipCodes (readonly)

Returns The loaded swiss zip codes.

Returns:



110
111
112
# File 'lib/swissmatch/location/datafiles.rb', line 110

# Reader for @zip_codes, which #load! fills from the fourth element returned
# by #load. Nil until #load! has been called.
#
# @return [SwissMatch::ZipCodes] The loaded swiss zip codes.
def zip_codes
  @zip_codes
end

Class Method Details

.generate_expression(size, separator, terminator) ⇒ Object

Used to generate the regular expressions used to parse the data files. Generates a regular expression that matches one record of `size` fields separated by `separator` and ended by `terminator` (for example, tab-separated fields terminated by \r\n).



33
34
35
# File 'lib/swissmatch/location/datafiles.rb', line 33

# Builds the regular expression that matches one record of a data file.
#
# The expression is anchored at the start of a line, captures +size+ fields
# (each a possibly empty run of characters other than the separator), expects
# +separator+ between the fields and +terminator+ after the last one.
#
# +separator+ and +terminator+ are inserted into the expression as regular
# expression source, so escape sequences have to be passed unprocessed
# (single quoted), e.g. '\t' and '\r\n'.
#
# The separator used to be run through +eval+ before joining the fields. For
# the separators used in this file that was an identity operation, but it
# raised a SyntaxError (or executed code) for separators containing a quote.
# The separator is now used as given.
#
# @param [Integer] size
#   The number of fields per record
# @param [String] separator
#   Regular expression source matching the field separator
# @param [String] terminator
#   Regular expression source matching the end of a record
#
# @return [Regexp]
def self.generate_expression(size, separator, terminator)
  field = "([^#{separator}]*)"
  /^#{Array.new(size, field).join(separator)}#{terminator}/
end

Instance Method Details

#http_get_zip_file(url, destination) ⇒ Array<String>

Performs an HTTP GET for the given url and extracts the downloaded zip archive into the destination directory.

Returns:

  • (Array<String>)

    An array with the absolute file paths of the extracted files.



152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# File 'lib/swissmatch/location/datafiles.rb', line 152

# Downloads the zip archive at +url+ and extracts its entries below
# +destination+. Entries whose target path already exists are not extracted
# again, but their path is still part of the returned list.
#
# The download goes through URI.open where it exists. Kernel#open only opens
# URLs on rubies older than 3.0, so it is used as the fallback for rubies whose
# open-uri does not provide URI.open yet.
#
# @param [String] url
#   The URL of the zip archive
# @param [String] destination
#   The directory the entries are extracted into
#
# @return [Array<String>]
#   The target paths (destination joined with the entry name) of all entries
#   in the archive.
def http_get_zip_file(url, destination)
  require 'open-uri'
  require 'swissmatch/zip' # patched zip/zip
  require 'fileutils'

  files   = []
  extract = lambda do |zip_buffer|
    Zip::ZipFile.open(zip_buffer) do |zip_file|
      zip_file.each do |f|
        target_path = File.join(destination, f.name)
        FileUtils.mkdir_p(File.dirname(target_path))
        zip_file.extract(f, target_path) unless File.exist?(target_path)
        files << target_path
      end
    end
  end

  if URI.respond_to?(:open) then
    URI.open(url, &extract)
  else
    open(url, &extract)
  end

  files
end

#loadArray

Returns an array of the form [SwissMatch::Cantons, SwissMatch::Districts, SwissMatch::Communities, SwissMatch::ZipCodes].

Returns:

  • (Array)

    Returns an array of the form [SwissMatch::Cantons, SwissMatch::Districts, SwissMatch::Communities, SwissMatch::ZipCodes].



200
201
202
203
204
205
206
207
208
209
# File 'lib/swissmatch/location/datafiles.rb', line 200

# Clears the error list and then loads cantons, districts, communities and zip
# codes, in that order. Districts and communities are loaded with the cantons;
# zip codes are loaded with the cantons and the communities.
#
# @return [Array]
#   An array of the form
#   [SwissMatch::Cantons, SwissMatch::Districts, SwissMatch::Communities, SwissMatch::ZipCodes].
def load
  reset_errors!

  loaded_cantons     = load_cantons
  loaded_districts   = load_districts(loaded_cantons)
  loaded_communities = load_communities(loaded_cantons)

  [
    loaded_cantons,
    loaded_districts,
    loaded_communities,
    load_zipcodes(loaded_cantons, loaded_communities),
  ]
end

#load!self

Loads the data into this DataFiles instance

Returns:

  • (self)

    Returns self.



192
193
194
195
# File 'lib/swissmatch/location/datafiles.rb', line 192

# Runs #load and stores its four results in @cantons, @districts,
# @communities and @zip_codes.
#
# @return [self]
def load!
  loaded = load
  @cantons, @districts, @communities, @zip_codes = loaded

  self
end

#load_cantonsSwissMatch::Cantons

Returns A SwissMatch::Cantons containing all cantons used by the swiss postal service.

Returns:

  • (SwissMatch::Cantons)

    A SwissMatch::Cantons containing all cantons used by the swiss postal service.



213
214
215
216
217
218
219
# File 'lib/swissmatch/location/datafiles.rb', line 213

# Builds one Canton per row of CantonData and wraps them in a Cantons
# collection.
#
# @return [SwissMatch::Cantons]
#   A SwissMatch::Cantons containing all cantons used by the swiss postal service.
def load_cantons
  # every CantonData row holds exactly the six values Canton.new is given:
  # tag, name, name_de, name_fr, name_it, name_rt
  canton_list = CantonData.map do |row|
    Canton.new(*row)
  end

  Cantons.new(canton_list)
end

#load_communities(cantons) ⇒ SwissMatch::Communities

Returns An instance of SwissMatch::Communities containing all communities defined by the files known to this DataFiles instance.

Returns:

  • (SwissMatch::Communities)

    An instance of SwissMatch::Communities containing all communities defined by the files known to this DataFiles instance.



238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# File 'lib/swissmatch/location/datafiles.rb', line 238

# Loads the communities from the plz_c_*.txt file in the data directory.
#
# Each record provides the community number (bfsnr), name, canton tag and the
# number of the agglomeration the community belongs to. Loading happens in two
# passes: communities that are their own agglomeration are created first
# (agglomeration :self), all others afterwards, so they can reference an
# already created community. A blank (or 0) agglomeration number results in a
# community without agglomeration (nil).
#
# Changes compared to the previous implementation:
# * The data file is picked with Dir.glob(...).max. Enumerator#last is not
#   part of core ruby and Dir.glob does not guarantee an order on older
#   rubies. NOTE(review): this assumes the file names sort chronologically -
#   confirm against the naming of the delivered files.
# * A missing agglomeration number is handled explicitly. It was tested with
#   nil? after to_i, which can never be true.
# * The "Incomplete community" check tests the looked up community. It tested
#   the agglomeration number and therefore never fired.
#
# @param [SwissMatch::Cantons] cantons
#   The cantons, used to resolve the canton tag of every community
#
# @return [SwissMatch::Communities]
#   An instance of SwissMatch::Communities containing all communities defined
#   by the files known to this DataFiles instance.
#
# @raise [RuntimeError]
#   If cantons is missing, or if a community references an agglomeration that
#   has not been created at the time the community is processed.
def load_communities(cantons)
  raise "Must load cantons first" unless cantons

  file     = Dir.glob("#{@data_directory}/plz_c_*.txt").max
  pending  = []
  complete = {}
  load_table(file, :community).each do |bfsnr, name, canton, agglomeration|
    bfsnr         = bfsnr.to_i
    agglomeration = agglomeration.to_i # a blank field becomes 0
    canton        = cantons.by_license_tag(canton)
    if agglomeration == bfsnr then
      complete[bfsnr] = Community.new(bfsnr, name, canton, :self)
    else
      pending << [bfsnr, name, canton, agglomeration]
    end
  end

  # processed after the first pass, which keeps the order of the resulting
  # collection: self-agglomerated communities first, all others after them
  pending.each do |bfsnr, name, canton, agglomeration|
    if agglomeration.zero? then
      community = nil
    else
      community = complete[agglomeration]
      raise "Incomplete community referenced by #{bfsnr}: #{agglomeration}" unless community
    end
    complete[bfsnr] = Community.new(bfsnr, name, canton, community)
  end

  Communities.new(complete.values)
end

#load_districts(cantons) ⇒ Object



221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/swissmatch/location/datafiles.rb', line 221

# Loads the districts from the districts_*.csv file in the data directory.
#
# The file is read as UTF-8 and parsed with Expressions[:districts]; the first
# record is the header line and is skipped. Every district is created with an
# empty SwissMatch::Communities collection.
#
# Changes compared to the previous implementation:
# * The data file is picked with Dir.glob(...).max. Enumerator#last is not
#   part of core ruby and Dir.glob does not guarantee an order on older
#   rubies. NOTE(review): this assumes the file names sort chronologically -
#   confirm against the naming of the delivered files.
# * The header is skipped with drop(1), which yields an empty list for a file
#   without records, where [1..-1] yielded nil and made the load crash.
#
# @param [SwissMatch::Cantons] cantons
#   The cantons, used to resolve the canton tag of every district
#
# @return [SwissMatch::Districts]
#
# @raise [ArgumentError]
#   If a district number is not a valid base 10 integer
def load_districts(cantons)
  # File format: GDEKT,GDEBZNR,GDEBZNA
  path      = Dir.glob("#{@data_directory}/districts_*.csv").max
  records   = File.read(path, encoding: Encoding::UTF_8.to_s).scan(Expressions[:districts])
  districts = records.drop(1).map { |canton_tag, district_number, district_name|
    district_number = Integer(district_number, 10)
    canton          = cantons.by_license_tag(canton_tag)

    District.new(district_number, district_name, canton, SwissMatch::Communities.new([]))
  }

  Districts.new(districts)
end

#load_table(path, pattern) ⇒ Array<Array<String>>

Reads a file and parses using the pattern of the given name.

Parameters:

  • path (String)

    The path of the file to parse

  • pattern (Symbol)

    The pattern-name used to parse the file (see Expressions)

Returns:

  • (Array<Array<String>>)

    A 2 dimensional array representing the tabular data contained in the given file.



380
381
382
383
384
# File 'lib/swissmatch/location/datafiles.rb', line 380

# Reads the file at +path+ as Windows-1252, converts its content to UTF-8 and
# splits it into records using the expression registered under +pattern+.
#
# @param [String] path
#   The path of the file to parse
# @param [Symbol] pattern
#   The pattern-name used to parse the file (see Expressions)
#
# @return [Array<Array<String>>]
#   A 2 dimensional array representing the tabular data contained in the given file.
def load_table(path, pattern)
  # the encoding is passed as a String because ruby 1.9.2 can't handle an
  # Encoding instance as argument
  raw_content  = File.read(path, :encoding => Encoding::Windows_1252.to_s)
  utf8_content = raw_content.encode(Encoding::UTF_8)

  utf8_content.scan(Expressions[pattern])
end

#load_updatesArray<String>

Load new files

Returns:

  • (Array<String>)

    An array with the absolute file paths of the extracted files.



141
142
143
144
145
# File 'lib/swissmatch/location/datafiles.rb', line 141

# Downloads and extracts the archive behind every URL in URLAll into the data
# directory.
#
# @return [Array<String>]
#   The file paths reported by #http_get_zip_file for all archives, in the
#   order of URLAll.
def load_updates
  per_archive = URLAll.map do |url|
    http_get_zip_file(url, @data_directory)
  end

  per_archive.flatten(1)
end

#load_zipcodes(cantons, communities) ⇒ SwissMatch::ZipCodes

TODO: load all files, not just the most recent.

TODO: calculate valid_until dates.

Returns:

  • (SwissMatch::ZipCodes)

    An instance of SwissMatch::ZipCodes containing all zip codes defined by the files known to this DataFiles instance.



271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
# File 'lib/swissmatch/location/datafiles.rb', line 271

# Loads the zip codes from the plz_p1_*.txt and plz_p2_*.txt files, and the
# zip code to community mapping from the communities_*.csv file.
#
# The method works in four steps:
# 1. communities_*.csv is read into a mapping of [zip code, add-on] to the
#    list of community numbers.
# 2. Every plz_p1 record is turned into the argument list for ZipCode.new and
#    remembered by its ordering number (onrp). Records which deliver
#    themselves are kept apart from those that reference another record.
# 3. plz_p2 records add alternative names (type "2") and additional region
#    names (type "3") to the argument lists collected in step 2.
# 4. The ZipCode instances are created, self delivering ones first, so the
#    other ones can reference them. A reference to a zip code that has not been
#    created yet is recorded in @errors and replaced by nil.
#
# Changes compared to the previous implementation:
# * The communities of a single zip code are kept in their own local variable.
#   They used to be assigned to +communities+, which overwrote the method
#   argument, so every record after the first one looked up its communities in
#   the previous record's communities only.
# * The data files are picked with Dir.glob(...).max. Enumerator#last is not
#   part of core ruby and Dir.glob does not guarantee an order on older
#   rubies. NOTE(review): this assumes the file names sort chronologically -
#   confirm against the naming of the delivered files.
# * The community mapping is built record by record, so a communities file
#   that only contains the header line no longer crashes the load.
#
# TODO: load all files, not just the most recent
# TODO: calculate valid_until dates
#
# @param [SwissMatch::Cantons] cantons
#   The cantons, used to resolve the canton tag of every zip code
# @param [SwissMatch::Communities] communities
#   All communities, used to resolve the community numbers of every zip code
#
# @return [SwissMatch::ZipCodes]
#   An instance of SwissMatch::ZipCodes containing all zip codes defined by the
#   files known to this DataFiles instance.
#
# @raise [RuntimeError]
#   If cantons or communities are missing, or if a zip code is delivered by an
#   ordering number that does not occur in the plz_p1 file.
def load_zipcodes(cantons, communities)
  raise "Must load cantons first" unless cantons
  raise "Must load communities first" unless communities

  community_mapping = Hash.new { |h,k| h[k] = [] }
  self_delivered    = []
  others            = []
  zip1_file         = Dir.glob("#{@data_directory}/plz_p1_*.txt").max
  zip2_file         = Dir.glob("#{@data_directory}/plz_p2_*.txt").max
  communities_file  = Dir.glob("#{@data_directory}/communities_*.csv").max

  # KTKZ,OHW,ORTNAME,GHW,GDENR,GDENAMK,PHW,PLZ4,PLZZ,PLZNAMK
  community_records = File.read(
    communities_file,
    encoding: Encoding::UTF_8.to_s
  ).scan(Expressions[:communities])
  # drop(1) skips the header line; [PLZ4, PLZZ] => [GDENR, ...]
  community_records.drop(1).each do |record|
    community_mapping[[record.at(7).to_i, record.at(8).to_i]] << record.at(4).to_i
  end

  temporary         = {}
  load_table(zip1_file, :zip_1).each do |row|
    onrp                      = row.at(0).to_i
    code                      = row.at(2).to_i
    addon                     = row.at(3).to_i
    delivery_by               = row.at(10).to_i
    delivery_by               = case delivery_by when 0 then nil; when onrp then :self; else delivery_by; end
    language                  = LanguageCodes[row.at(7).to_i]
    language_alternative      = LanguageCodes[row.at(8).to_i]
    name_short                = Name.new(row.at(4), language)
    name                      = Name.new(row.at(5), language)
    largest_community_number  = row.at(11).to_i
    # compact, because some communities already no longer exist, so by_community_numbers can
    # contain nils which must be removed
    community_numbers         = (community_mapping[[code, addon]]|[largest_community_number]).sort
    zip_communities           = Communities.new(communities.by_community_numbers(*community_numbers).compact)
    data                      = [
      onrp,                              # ordering_number
      row.at(1).to_i,                    # type
      code,
      addon,
      name,                              # name (official)
      [name],                            # names (official + alternative)
      name_short,                        # name_short (official)
      [name_short],                      # names_short (official + alternative)
      [],                                # PLZ2 type 3 short names (additional region names)
      [],                                # PLZ2 type 3 names (additional region names)
      cantons.by_license_tag(row.at(6)), # canton
      language,
      language_alternative,
      row.at(9) == "1",                  # sortfile_member
      delivery_by,                       # delivery_by
      communities.by_community_number(largest_community_number),  # community_number
      zip_communities,
      Date.civil(*row.at(12).match(/^(\d{4})(\d\d)(\d\d)$/).captures.map(&:to_i)) # valid_from
    ]
    temporary[onrp] = data
    if :self == delivery_by then
      self_delivered << data
    else
      others << data
    end
  end

  load_table(zip2_file, :zip_2).each do |onrp, rn, type, lang, short, name|
    onrp      = onrp.to_i
    language  = LanguageCodes[lang.to_i]
    entry     = temporary[onrp]
    if type == "2"
      entry[5] << Name.new(name, language, rn.to_i)
      entry[7] << Name.new(short, language, rn.to_i)
    elsif type == "3"
      entry[8] << Name.new(name, language, rn.to_i)
      entry[9] << Name.new(short, language, rn.to_i)
    end
  end

  self_delivered.each do |row|
    temporary[row.at(0)] = ZipCode.new(*row)
  end
  others.each do |row|
    if row.at(14) then
      delivery = temporary[row.at(14)]
      raise "Delivery not found:\n#{row.inspect}" unless delivery
      if delivery.kind_of?(Array) then
        # still an argument list, i.e. the referenced zip code has not been created yet
        @errors << LoadError.new("Invalid reference: onrp #{row.at(0)} delivery by #{row.at(14)}", row)
        row[14] = nil
      else
        row[14] = delivery
      end
    end
    temporary[row.at(0)] = ZipCode.new(*row)
  end

  ZipCodes.new(temporary.values)
end

#reset_errors!self

Resets the list of errors that were encountered during load

Returns:

  • (self)


132
133
134
135
# File 'lib/swissmatch/location/datafiles.rb', line 132

# Replaces the list of errors that were encountered during load with a new,
# empty list.
#
# @return [self]
def reset_errors!
  tap { @errors = [] }
end

#unzip_file(file, destination) ⇒ Object

Extracts the given zip file into the destination directory.



174
175
176
177
178
179
180
181
182
183
# File 'lib/swissmatch/location/datafiles.rb', line 174

# Extracts the entries of the zip archive +file+ below +destination+. Entries
# whose target path already exists are left untouched.
#
# fileutils is required here as well: FileUtils is used below, and this method
# must not depend on #http_get_zip_file having been called before to load it.
#
# @param file
#   The zip archive, handed to Zip::ZipFile.open as is
# @param [String] destination
#   The directory the entries are extracted into
def unzip_file(file, destination)
  require 'swissmatch/zip' # patched zip/zip
  require 'fileutils'

  Zip::ZipFile.open(file) do |zip_file|
    zip_file.each do |f|
      target_path = File.join(destination, f.name)
      FileUtils.mkdir_p(File.dirname(target_path))
      zip_file.extract(f, target_path) unless File.exist?(target_path)
    end
  end
end