data_miner

Programmatically import useful data into your ActiveRecord models.

(see wiki.github.com/seamusabshere/data_miner for more examples)

Quick start

You define data_miner blocks in your ActiveRecord models. For example, in app/models/country.rb:

class Country < ActiveRecord::Base
  set_primary_key :iso_3166_code

  data_miner do
    schema do
      string   'iso_3166_code'
      string   'name'
    end

    import 'the official ISO country list',
           :url => 'http://www.iso.org/iso/list-en1-semic-3.txt',
           :skip => 2,
           :headers => false,
           :delimiter => ';',
           :encoding => 'ISO-8859-1' do
      key   'iso_3166_code', :field_number => 1
      store 'name', :field_number => 0
    end
  end
end

Now you can run:

irb(main):001:0> Country.run_data_miner!
=> nil

Advanced usage

This is how we linked together (data.brighterplanet.com/aircraft) the FAA’s list of aircraft with the US Department of Transportations list of aircraft:

class Aircraft < ActiveRecord::Base
  # Tell ActiveRecord that we want to use a string primary key.
  # This makes it easier to repeatedly truncate and re-import this
  # table without breaking associations.
  set_primary_key :icao_code

  # A dictionary between BTS aircraft type codes and ICAO aircraft
  # codes that uses string similarity instead of exact matching.
  # This is preferable to typing everything out.
  def self.bts_name_dictionary
    # Sorry for documenting the LooseTightDictionary gem here, but it's useful
    @_bts_dictionary ||= LooseTightDictionary.new(
                                                  # The first argument is the source... the possible matches. Most Enumerables will do.
                                                  RemoteTable.new(:url => 'http://www.transtats.bts.gov/Download_Lookup.asp?Lookup=L_AIRCRAFT_TYPE', :select => lambda { |record| record['Code'].to_i.between?(1, 998) }),
                                                  # Tightenings optionally pull out what is important on both sides of a potential match
                                                  :tightenings  => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
                                                  # Identities optionally require a particular capture from both sides of a match to be equal
                                                  :identities   => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
                                                  # Blockings restrict comparisons to a subset where everything matches the blocking
                                                  :blockings    => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
                                                  # This means that lookups that don't match a blocking won't be compared to possible matches that **do** match a blocking.
                                                  # This is useful because we say /boeing/ and only boeings are matched against other boeings.
                                                  :blocking_only => true,
                                                  # Tell the dictionary how read things from the source.
                                                  :right_reader => lambda { |record| record['Description'] }
                                                 )
  end

  # A dictionary between what appear to be ICAO aircraft names and
  # objects of this class itself.
  # Warning: self-referential (it calls Aircraft.all) so it should be run after the first DataMiner step.
  def self.icao_name_dictionary
    @_icao_dictionary ||= LooseTightDictionary.new Aircraft.all,
                                                   :tightenings  => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=0&output=csv', :headers => false),
                                                   :identities   => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=3&output=csv', :headers => false),
                                                   :blockings    => RemoteTable.new(:url => 'http://spreadsheets.google.com/pub?key=tiS_6CCDDM_drNphpYwE_iw&single=true&gid=4&output=csv', :headers => false),
                                                   :right_reader => lambda { |record| record.manufacturer_name.to_s + ' ' + record.name.to_s }
  end

  # This responds to the "Matcher" interface as defined by DataMiner.
  # In other words, it takes Matcher#match(*args) and returns something.
  class BtsMatcher
    attr_reader :wants
    def initialize(wants)
      @wants = wants
    end
    def match(raw_faa_icao_record)
      @_match ||= Hash.new
      return @_match[raw_faa_icao_record] if @_match.has_key?(raw_faa_icao_record)
      faa_icao_record = [ raw_faa_icao_record['Manufacturer'] + ' ' + raw_faa_icao_record['Model'] ]
      bts_record = Aircraft.bts_name_dictionary.left_to_right faa_icao_record
      retval = case wants
      when :bts_aircraft_type_code
        bts_record['Code']
      when :bts_name
        bts_record['Description']
      end if bts_record
      @_match[raw_faa_icao_record] = retval
    end
  end

  # Another class that implements the "Matcher" interface as expected by DataMiner.
  class FuelUseMatcher
    def match(raw_fuel_use_record)
      @_match ||= Hash.new
      return @_match[raw_fuel_use_record] if @_match.has_key?(raw_fuel_use_record)
      # First try assuming we have an ICAO code
      aircraft_record = if raw_fuel_use_record['ICAO'] =~ /\A[0-9A-Z]+\z/
        Aircraft.find_by_icao_code raw_fuel_use_record['ICAO']
      end
      # No luck? then try a fuzzy match
      aircraft_record ||= if raw_fuel_use_record['Aircraft Name'].present?
        Aircraft.icao_name_dictionary.left_to_right [ raw_fuel_use_record['Aircraft Name'] ]
      end
      if aircraft_record
        @_match[raw_fuel_use_record] = aircraft_record.icao_code
      else
        # While we're developing the dictionary, we want it to blow up until we have 100% matchability
        raise "Didn't find a match for #{raw_fuel_use_record['Aircraft Name']} (#{raw_fuel_use_record['ICAO']}), which we found in the fuel use spreadsheet"
      end
    end
  end

  # This responds to the "Responder" interface as expected by Errata.
  # Basically it lets you say "Is a DC plane" in the errata file and
  # have it map to a Ruby method.
  class Guru
    def is_a_dc_plane?(row)
      row['Designator'] =~ /^DC\d/i
    end
    def is_a_g159?(row)
      row['Designator'] =~ /^G159$/
    end
    def is_a_galx?(row)
      row['Designator'] =~ /^GALX$/
    end
    def method_missing(method_id, *args, &block)
      if method_id.to_s =~ /\Ais_n?o?t?_?attributed_to_([^\?]+)/
        manufacturer_name = $1
        manufacturer_regexp = Regexp.new(manufacturer_name.gsub('_', ' ?'), Regexp::IGNORECASE)
        matches = manufacturer_regexp.match(args.first['Manufacturer']) # row['Manufacturer'] =~ /mcdonnell douglas/i
        method_id.to_s.include?('not_attributed') ? matches.nil? : !matches.nil?
      else
        super
      end
    end
  end

  data_miner do
    # In our app, we defined DataMiner::Run.allowed? to return false if a run
    # has taken place in the last hour (among other things).
    # By raising DataMiner::Skip, we skip this run but call it a success.
    process "Don't re-import too often" do
      raise DataMiner::Skip unless DataMiner::Run.allowed? Aircraft
    end

    # Define the database schema in-line.
    # It will destructively and automatically add/remove columns.
    # This is "OK" because you can always just re-run the import script to get the data back.
    # PS. if we were using DataMapper, we wouldn't need this.
    schema :options => 'ENGINE=InnoDB default charset=utf8' do
      string   'icao_code'
      string   'manufacturer_name'
      string   'name'
      string   'bts_name'
      string   'bts_aircraft_type_code'
      string   'brighter_planet_aircraft_class_code'
      string   'fuel_use_aircraft_name'
      float    'm3'
      string   'm3_units'
      float    'm2'
      string   'm2_units'
      float    'm1'
      string   'm1_units'
      float    'endpoint_fuel'
      string   'endpoint_fuel_units'
      float    'seats'
      float    'distance'
      string   'distance_units'
      float    'load_factor'
      float    'freight_share'
      float    'payload'
      float    'weighting'
      index    'bts_aircraft_type_code'
    end

    # The FAA publishes a document to help people identify aircraft by different names.
    ('A'..'Z').each do |letter|
      import( "ICAO aircraft codes starting with the letter #{letter} used by the FAA",
              # The master URL of the source file (one for every letter)
              :url => "http://www.faa.gov/air_traffic/publications/atpubs/CNT/5-2-#{letter}.htm",
              # The RFC-style errata... note that it will use the Guru class we defined above. See the Errata gem for more details.
              :errata => Errata.new(:url => 'http://spreadsheets.google.com/pub?key=tObVAGyqOkCBtGid0tJUZrw', :responder => Aircraft::Guru.new),
              # If it's not UTF-8, you should say what it is so that we can iconv it!
              :encoding => 'windows-1252',
              # Nokogiri is being used to grab each row starting from the second
              :row_xpath => '//table/tr[2]/td/table/tr',
              # ditto... XPath for Nokogiri
              :column_xpath => 'td' ) do
        # The code that they use is in fact the ICAO code!
        key 'icao_code', :field_name => 'Designator'
        # We get this for free
        store 'manufacturer_name', :field_name => 'Manufacturer'
        # ditto
        store 'name', :field_name => 'Model'
        # Use the loose-tight dictionary.
        # It gets the entire input row to play with before deciding on an output.
        store 'bts_aircraft_type_code', :matcher => Aircraft::BtsMatcher.new(:bts_aircraft_type_code)
        store 'bts_name', :matcher => Aircraft::BtsMatcher.new(:bts_name)
      end
    end

    # Pull in some data that might only be important to Brighter Planet
    import "Brighter Planet's aircraft class codes",
           :url => 'http://static.brighterplanet.com/science/data/transport/air/bts_aircraft_type/bts_aircraft_types-brighter_planet_aircraft_classes.csv' do
      key   'bts_aircraft_type_code', :field_name => 'bts_aircraft_type'
      store 'brighter_planet_aircraft_class_code'
    end

    # Pull in fuel use equation (y = m3*x^3 + m2*x^2 + m1*x + endpoint_fuel).
    # This data comes from the EEA.
    import "pre-calculated fuel use equation coefficients",
           :url => 'http://static.brighterplanet.com/science/data/transport/air/fuel_use/aircraft_fuel_use_formulae.ods',
           :select => lambda { |row| row['ICAO'].present? or row['Aircraft Name'].present? } do
      # We want to key on ICAO code, but since it's sometimes missing, use the loose-tight dictionary we defined above.
      key   'icao_code', :matcher => Aircraft::FuelUseMatcher.new
      # Keep the name for sanity checking. Yes, we have 3 different "name" fields... they should all refer to the same aircraft.
      store 'fuel_use_aircraft_name', :field_name => 'Aircraft Name'
      store 'm3'
      store 'm2'
      store 'm1'
      store 'endpoint_fuel', :field_name => 'b'
    end

    # Use arel and the weighted_average gem to do some crazy averaging.
    # This assumes that you're dealing with the BTS T-100 flight segment data.
    # See http://data.brighterplanet.com/flight_segments for a pre-sanitized version.
    process "Derive some average flight characteristics from flight segments" do
      FlightSegment.run_data_miner!
      aircraft = Aircraft.arel_table
      segments = FlightSegment.arel_table

      conditional_relation = aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])
      update_all "seats         = (#{FlightSegment.weighted_average_relation(:seats,         :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "distance      = (#{FlightSegment.weighted_average_relation(:distance,      :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "load_factor   = (#{FlightSegment.weighted_average_relation(:load_factor,   :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "freight_share = (#{FlightSegment.weighted_average_relation(:freight_share, :weighted_by => :passengers                                           ).where(conditional_relation).to_sql})"
      update_all "payload       = (#{FlightSegment.weighted_average_relation(:payload,       :weighted_by => :passengers, :disaggregate_by => :departures_performed).where(conditional_relation).to_sql})"

      update_all "weighting = (#{segments.project(segments[:passengers].sum).where(aircraft[:bts_aircraft_type_code].eq(segments[:bts_aircraft_type_code])).to_sql})"
    end

    # And finally re-run the import of resources that depend on this resource.
    # Don't worry about calling Aircraft.run_data_miner! at the top of AircraftManufacturer's data_miner block;
    # that's the right way to do dependencies. It won't get called twice in the same run.
    [ AircraftManufacturer ].each do |synthetic_resource|
      process "Synthesize #{synthetic_resource}" do
        synthetic_resource.run_data_miner!
      end
    end
  end
end

Authors

Copyright © 2010 Brighter Planet. See LICENSE for details.