Class: Fech::MapGenerator

Inherits:
Object
  • Object
show all
Defined in:
lib/fech/map_generator.rb

Overview

Helper class that generates mapping hashes from source CSV data. It is needed to rebuild rendered_maps.rb when the source data changes; it is not used by the main gem.

rake fech:maps

Constant Summary collapse

# Version labels, listed newest first, that convert_header_file_to_row_files
# iterates over; each one names a "<version>.csv" summary file.
FILING_VERSIONS = %w[
  8.0 7.0 6.4 6.3 6.2 6.1
  5.3 5.2 5.1 5.0 3
]
# Row type labels, in the order dump_row_maps_to_ruby writes them out.
# Each label is also a key of ROW_TYPE_MATCHERS.
BASE_ROW_TYPES = %w[
  HDR F1 F13 F132 F133 F1M F1S F2 F24 F3 F3L F3P F3P31 F3PS
  F3S F3X F4 F5 F56 F57 F6 F65 F7 F76 F9 F91 F92 F93
  F94 F99 H1 H2 H3 H4 H5 H6
  SchA SchB SchC SchC1 SchC2 SchD SchE SchF SchL TEXT
]
# Maps each row type label to the entry of FechUtils::ROW_TYPES stored under
# the paired symbol. The values are presumably Regexps (dump_row_maps_to_ruby
# calls #source on them) -- confirm against FechUtils.
# Note the schedule labels ("SchA", ...) use shortened keys (:sa, ...).
ROW_TYPE_MATCHERS = Hash[
  [
    ["HDR",   :hdr],
    ["F1",    :f1],
    ["F13",   :f13],
    ["F132",  :f132],
    ["F133",  :f133],
    ["F1M",   :f1m],
    ["F1S",   :f1s],
    ["F2",    :f2],
    ["F24",   :f24],
    ["F3",    :f3],
    ["F3L",   :f3l],
    ["F3P",   :f3p],
    ["F3S",   :f3s],
    ["F3P31", :f3p31],
    ["F3PS",  :f3ps],
    ["F3X",   :f3x],
    ["F4",    :f4],
    ["F5",    :f5],
    ["F56",   :f56],
    ["F57",   :f57],
    ["F6",    :f6],
    ["F65",   :f65],
    ["F7",    :f7],
    ["F76",   :f76],
    ["F9",    :f9],
    ["F91",   :f91],
    ["F92",   :f92],
    ["F93",   :f93],
    ["F94",   :f94],
    ["F99",   :f99],
    ["H1",    :h1],
    ["H2",    :h2],
    ["H3",    :h3],
    ["H4",    :h4],
    ["H5",    :h5],
    ["H6",    :h6],
    ["SchA",  :sa],
    ["SchB",  :sb],
    ["SchC",  :sc],
    ["SchC1", :sc1],
    ["SchC2", :sc2],
    ["SchD",  :sd],
    ["SchE",  :se],
    ["SchF",  :sf],
    ["SchL",  :sl],
    ["TEXT",  :text]
  ].map { |label, key| [label, FechUtils::ROW_TYPES[key]] }
]

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#mapObject

Returns the value of attribute map.



9
10
11
# File 'lib/fech/map_generator.rb', line 9

# Reader for the @map instance variable.
#
# @return [Object] the current value of @map (nil if it was never assigned)
def map
  @map
end

Class Method Details

.convert_header_file_to_row_files(source_dir) ⇒ Object

Goes through all version header summary files and generates row map files for each type of row inside them.



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/fech/map_generator.rb', line 67

# Reads every version summary file listed in FILING_VERSIONS and, for each row
# type found in them, rewrites that row type's map file (the path returned by
# write_row_map_file) with a template comparing the column lists of all
# versions. Row types whose map file does not already exist are skipped.
#
# Side effects: each version summary file is rewritten in place with
# unparseable bytes removed, and existing row map files are overwritten.
#
# @param source_dir [String] directory handed to the *_file path helpers
# @return [Hash] row type => { version key => array of column names }
def self.convert_header_file_to_row_files(source_dir)
  data = {}
  hybrid_data = {}

  # File.readlines closes the handle; a bare File.open(...).readlines leaks it.
  ignored_fields = File.readlines(ignored_fields_file(source_dir)).map { |l| l.strip }

  # Create a hash of data with an entry for each row type found in the source
  # version summary files. Each row has an entry for each version map that
  # exists for it. If maps for two different versions are identical, they
  # are combined.
  FILING_VERSIONS.each do |version|
    filepath = version_summary_file(source_dir, version)

    # Clean the source files by removing unparseable characters
    raw = File.read(filepath)
    if RUBY_VERSION < "1.9.3"
      require 'iconv'
      ic = Iconv.new('UTF-8//IGNORE', 'UTF-8')
      # Append and then strip a trailing space, as the original code did
      # around the Iconv call.
      valid_string = ic.iconv(raw << ' ')[0..-2]
    else
      # Round-trip through UTF-16 so that invalid UTF-8 bytes are dropped.
      valid_string = raw.encode('UTF-16', 'UTF-8', :invalid => :replace, :replace => '')
      valid_string = valid_string.encode('UTF-8', 'UTF-16')
    end
    # The block form closes (and therefore flushes) the file before it is
    # parsed below; an unclosed write handle may leave it empty or truncated.
    File.open(filepath, 'w') { |f| f.write(valid_string) }

    Fech::Csv.foreach(filepath) do |row|
      # Each row of a version summary file contains the ordered list of
      # column names.
      row_type = row.first
      # A blank line has no row type; there is nothing to map for it.
      next if row_type.nil?

      data[row_type] ||= {}
      hybrid_data[row_type] ||= {}
      row_version_data = remove_ignored_fields(row, ignored_fields)

      # Check the maps for this row type in already-processed versions.
      # If this map is identical to a previous map, tack this version on
      # to it instead of creating a new one.
      data[row_type][version] = row_version_data
      # Iterate over a snapshot because entries are deleted inside the loop.
      data[row_type].to_a.each do |k, v|
        # skip the row we just added
        next if k == version
        next unless v == row_version_data

        # Create the new hybrid entry
        hybrid_data[row_type]["#{k}|#{version}"] = row_version_data

        # k may itself be an earlier hybrid key. Drop it so the update below
        # does not merge the superseded entry back into data.
        hybrid_data[row_type].delete(k)

        # Delete the old entry, and the one for this version only
        data[row_type].delete(k)
        data[row_type].delete(version)
      end
      data[row_type].update(hybrid_data[row_type])
    end
  end

  # Go through each row type and create a base map management file that
  # will serve as a template for organizing which fields are the same
  # between versions. This file will need to then be arranged by hand to
  # clean up the data. Each row will represent a column across versions,
  # each column a unique map for that row for one or more versions.
  data.each do |row_type, row_data|
    file_path = write_row_map_file(source_dir, row_type)
    # File.exists? was removed in Ruby 3.2; File.exist? works everywhere.
    next unless File.exist?(file_path)
    File.open(file_path, 'w') do |f|
      # The first transposed row below starts with an empty cell, so this
      # literal becomes the first cell of the header line.
      f.write('canonical')

      to_transpose = []
      row_data.sort.reverse.each do |version, version_data|
        # One row of 1-based positions and one row of column names per version.
        to_transpose << ["^#{version}", (1..version_data.size).to_a].flatten
        to_transpose << [nil, version_data].flatten
      end

      # standardize row size (Array#transpose needs equal lengths)
      max_size = to_transpose.max { |r1, r2| r1.size <=> r2.size }.size
      to_transpose.each { |r| r[max_size - 1] ||= nil }
      transposed = to_transpose.transpose

      transposed.each do |transposed_data|
        transposed_data.collect! { |x| x.to_s.gsub(/\r/, ' ') }
        canonical = transposed_data[1] # first description
        if canonical
          # Derive a snake_case name: drop {...} groups, turn the characters
          # in the class below into "_", squeeze and trim underscores.
          # NOTE(review): " -\." inside the class is a range (space through
          # "."), which looks accidental -- left untouched to keep output stable.
          canonical = canonical.gsub(/\{.*\}/, "").gsub(/[ -\.\/\(\)]/, "_").gsub(/_+/, "_").gsub(/(_$)|(^_)/, "").downcase
          transposed_data = [canonical, transposed_data].flatten
        end
        f.write(transposed_data.join(','))
        f.write("\n")
      end
    end
  end
end

.dump_row_maps_to_ruby(source_dir, file_path) ⇒ Object

Generates the mapping for each row type in BASE_ROW_TYPES, writes them out to file for inclusion in the gem.



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# File 'lib/fech/map_generator.rb', line 158

# Writes a Ruby source file at +file_path+ that defines Fech::RENDERED_MAPS.
# For every label in BASE_ROW_TYPES it emits one entry keyed by the #source of
# the matching ROW_TYPE_MATCHERS value, holding the maps returned by
# generate_row_map_from_file in reverse key order.
#
# @param source_dir [String] passed through to generate_row_map_from_file
# @param file_path [String] file to (over)write
# @return [Object] whatever the File.open block returns
def self.dump_row_maps_to_ruby(source_dir, file_path)
  preamble = [
    "# Generated automatically by Fech::MapGenerator.\n\n",
    "# RENDERED_MAPS contains an entry for each supported row type, which in turn:\n",
    "#   contain an entry for each distinct map between a row's labels and the\n",
    "#   indexes where their values can be found.\n",
    "module Fech\n",
    "  RENDERED_MAPS = {\n"
  ]

  File.open(file_path, 'w') do |out|
    preamble.each { |chunk| out.write(chunk) }

    BASE_ROW_TYPES.each do |row_type|
      out.write("    \"#{ROW_TYPE_MATCHERS[row_type].source}\" => {\n")

      row_maps = generate_row_map_from_file(source_dir, row_type)
      row_maps.sort_by { |pair| pair.first }.reverse.each do |version_key, fields|
        rendered = fields.map do |field|
          # Strip a leading run of digits (and one underscore) from the name;
          # empty slots (nil.to_s is "") are written as a literal nil.
          label = field.to_s.gsub(/^\d+_?/, "")
          label == "" ? "nil" : ":#{label}"
        end
        out.write("      \'#{version_key}' => [#{rendered.join(', ')}],\n")
      end

      out.write("    },\n")
    end

    out.write("  }\n")
    out.write("end")
  end
end

.generate_row_map_from_file(source_dir, row_type) ⇒ Object

For a given row type, parses its source file and returns a mapping object for it.



180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/fech/map_generator.rb', line 180

# Parses the row map file for +row_type+ (located via row_map_file) and
# returns one array of canonical field names per version column.
#
# The file is comma-separated. A line whose first cell is "canonical"
# (case-insensitive) is the header: its other non-empty cells are the version
# keys. On every other line with a non-empty first cell, that cell is the
# canonical field name and the cell under each version key holds the field's
# 1-based position for that version (blank or 0 means "not present").
#
# @param source_dir [String] directory handed to row_map_file
# @param row_type [String] row type whose map file is read
# @return [Hash{String => Array}] version key => array with the canonical
#   name (as a Symbol) at each listed position and nil in unassigned slots
def self.generate_row_map_from_file(source_dir, row_type)
  versions = []
  version_indexes = []
  data = {}

  # File.read closes the handle; a bare open(...).read leaks it.
  text = File.read(row_map_file(source_dir, row_type))

  # Files containing a carriage return are split on it; any line feed left
  # over from a CRLF pair is stripped from the cells below.
  split_char = text.index(/\r/) ? /\r/ : /\n/

  text.split(split_char).each do |line|
    row = line.split(',').collect { |cell| cell.gsub("\n", "") }
    label = row.first

    # A blank line splits into an empty array. Skip it rather than dropping
    # into a debugger or calling a method on nil.
    next if label.nil?

    if label.downcase == "canonical"
      versions = row[1..-1].uniq.reject { |cell| cell.empty? }

      # Recompute the column positions of the non-empty header cells, then
      # drop the first one, which is the "canonical" column itself.
      version_indexes = []
      row.each_with_index { |cell, ind| version_indexes << ind unless cell.empty? }
      version_indexes.shift

      versions.each { |version| data[version] = [] }
    elsif !label.empty?
      canonical = label

      versions.zip(version_indexes).each do |version, row_index|
        index = row[row_index]
        # nil.to_i and "".to_i are 0, so missing cells are ignored here.
        data[version][index.to_i - 1] = canonical.to_sym if index.to_i > 0
      end
    end
  end

  data
end

.ignored_fields_file(source_dir) ⇒ Object



224
225
226
# File 'lib/fech/map_generator.rb', line 224

# Path of the ignore list, "headers/ignore.csv", beneath +source_dir+.
#
# @param source_dir [String] root directory of the source files
# @return [String] the joined path
def self.ignored_fields_file(source_dir)
  path_parts = [source_dir, 'headers', 'ignore.csv']
  File.join(*path_parts)
end

.remove_ignored_fields(row, ignore) ⇒ Object

Remove both the row type from the beginning of the row, and any fields marked as “ignore” in sources/headers/ignore.csv



215
216
217
218
# File 'lib/fech/map_generator.rb', line 215

# Returns the cells of +row+ after the first one (the row type), without nil
# cells and without any cell that appears in +ignore+.
#
# @param row [Array] a parsed row; its first element is discarded
# @param ignore [#include?] collection of cell values to drop
# @return [Array] the remaining cells, in their original order
def self.remove_ignored_fields(row, ignore)
  # row[1..-1] is nil for an empty row, so fall back to an empty list instead
  # of raising NoMethodError on nil.
  fields = (row[1..-1] || []).compact # strip off the row type
  fields.reject { |field| ignore.include?(field) }
end

.row_map_file(source_dir, row_type) ⇒ Object



220
221
222
# File 'lib/fech/map_generator.rb', line 220

# Path of the map file for +row_type+: "<row_type>.csv" directly inside
# +source_dir+.
#
# @param source_dir [String] directory containing the row map files
# @param row_type [String] row type label used as the file's base name
# @return [String] the joined path
def self.row_map_file(source_dir, row_type)
  file_name = row_type + '.csv'
  File.join(source_dir, file_name)
end

.version_summary_file(source_dir, version) ⇒ Object



228
229
230
# File 'lib/fech/map_generator.rb', line 228

# Path of the summary file for +version+: "headers/<version>.csv" beneath
# +source_dir+.
#
# @param source_dir [String] root directory of the source files
# @param version [String] version label used as the file's base name
# @return [String] the joined path
def self.version_summary_file(source_dir, version)
  file_name = version + '.csv'
  File.join(source_dir, 'headers', file_name)
end

.write_row_map_file(source_dir, row_type) ⇒ Object



232
233
234
# File 'lib/fech/map_generator.rb', line 232

# Path of the generated row map template for +row_type+:
# "rows/<row_type>.csv" beneath +source_dir+.
#
# @param source_dir [String] root directory of the source files
# @param row_type [String] row type label used as the file's base name
# @return [String] the joined path
def self.write_row_map_file(source_dir, row_type)
  file_name = row_type + '.csv'
  File.join(source_dir, 'rows', file_name)
end