Module: Datamancer
- Defined in:
- lib/datamancer.rb,
lib/datamancer/load.rb,
lib/datamancer/extract.rb,
lib/datamancer/version.rb,
lib/datamancer/transform.rb
Defined Under Namespace
Classes: ExistingField, MissingField
Constant Summary collapse
- VERSION =
'0.1.0'
Instance Method Summary collapse
- #add(left, right) ⇒ Object
- #aggregate(input) ⇒ Object
- #extract(args) ⇒ Object
- #field_actions(field, value, actions) ⇒ Object
- #join(left, right, attribute) ⇒ Object
- #keyvalue(file) ⇒ Object
- #load(input, args) ⇒ Object
- #raw(input, args) ⇒ Object
- #transform(input, args = {}) ⇒ Object
- #unique(input, attribute) ⇒ Object
- #where(input, attributes) ⇒ Object
Instance Method Details
#add(left, right) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/datamancer/transform.rb', line 3 def add left, right first_row = left.first.merge right.first keys = first_row.keys valores_por_defecto = {} keys.each do |key| valores_por_defecto[key] = case first_row[key] when String then '' when Numeric then 0 else nil end end output = [] (left + right).each do |input_row| output_row = {} keys.each do |key| output_row[key] = input_row[key] || valores_por_defecto[key] end output << output_row end output end |
#aggregate(input) ⇒ Object
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
# File 'lib/datamancer/transform.rb', line 195 def aggregate input define_singleton_method :dim do |name| name = name.to_sym @dimensions[name] = @row[name] end define_singleton_method :fact do |name| name = name.to_sym @facts[name] = @row[name] end aggregated_input = Hash.new { |hash, key| hash[key] = Hash.new } input.each_with_index do |row, row_number| @row = row @dimensions = {} @facts = {} yield if block_given? aggregated_input[@dimensions].merge!(@facts) { |_, fact, other_fact| fact + other_fact } end aggregated_input.map do |dimensions, facts| dimensions.merge(facts) end end |
#extract(args) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# File 'lib/datamancer/extract.rb', line 3 def extract args raise ArgumentError, 'Extract requires a source, i.e. extract(from: source)' unless args.is_a?(Hash) && args[:from] headers = case args[:from] when String # TODO: The first row (headers row) is dropped; to drop more initial rows should be an option. # TODO: Test the separator option. csv = CSV.read args[:from], col_sep: (args[:separator] || ',') csv.shift when Hash ::ActiveRecord::Base.establish_connection args[:from] db = ::ActiveRecord::Base.connection # TODO: Test this. table = args[:table] || args[:from][:table] raise ArgumentError, 'Extract requires a database table, i.e. extract(from: source, table: table_name)' unless table db.columns(table).map(&:name) end @fields = {} @actions = {} headers.each do |header| field = header.to_sym mapping = header @fields[field] = mapping end unless args[:exclude] # The reason behind default_actions is the possibility to # write reject_if: nil with the DSL. default_actions = {reject_if: :nil, reject_unless: :nil} define_singleton_method :field do |name, actions = {}, &block| actions[:type] ||= actions[:type_default] actions[:default] ||= actions[:empty_default] actions = default_actions.merge(actions) mapping = actions[:map] || name.to_s raise MissingField, "Required field '#{mapping}' was not found in '#{args[:from]}'" unless headers.include? mapping field = name.to_sym @fields.delete(actions[:map].to_sym) if actions[:map] @fields[field] = mapping @actions[field] = actions @actions[field][:block] = block end yield if block_given? output = case args[:from] when String @fields.each do |field, mapping| index = headers.find_index(mapping) @fields[field] = index end csv when Hash #TODO: Test for column names with spaces. #TODO: Implement all the SQL escaping cases. columns = @fields.map { |field, mapping| "[#{mapping}] AS [#{field}]" }.join(', ') @fields.keys.each_with_index do |field, index| @fields[field] = index end # TODO: Test top, distinct, where. # TODO: Top for CSV. # TODO: Top for support several databases. if args[:distinct] db.select_rows( "SELECT DISTINCT #{columns} FROM #{table}") elsif args[:top] db.select_rows( "SELECT TOP #{args[:top]} #{columns} FROM #{table} #{'WHERE ' + args[:where] if args[:where]}") else db.select_rows( "SELECT #{columns} FROM #{table} #{'WHERE ' + args[:where] if args[:where]}") end end output.map! do |array_row| hash_row = {} @fields.each do |field, index| value = array_row[index] hash_row[field] = field_actions field, value, @actions[field] end if hash_row.has_value?(:reject) nil else hash_row end end.compact! output end |
#field_actions(field, value, actions) ⇒ Object
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 |
# File 'lib/datamancer/extract.rb', line 116 def field_actions field, value, actions return value unless actions # TODO: Revisit the order of actions. ## Block-passing ## # TODO: Test this. if actions[:block] value = actions[:block].call(value) end ## Stripping ## # TODO: Test this. if actions[:strip] value.strip! if value.respond_to?(:strip!) end ## Casting ## # Indexes and :type_default are not good friends. # (Because of join while transforming.) # TODO: Test this. if value || actions[:type_default] # TODO: Better data types support. From Mongoid: # [ ] Array # [ ] BigDecimal # [ ] Boolean # [x] Float # [ ] Hash # [x] Integer # [ ] Range # [ ] Regexp # [x] String # [x] Symbol # [x] Date # [ ] DateTime # [ ] Time # [ ] TimeWithZone case actions[:type].to_s when 'Complex' value = value.to_c when 'Float' value = value.to_f when 'Integer' value = value.to_i when 'Rational' value = value.to_r when 'String' value = value.to_s when 'Symbol' value = value.to_sym when 'Date' # From "1900-01-01 00:00:00.000" to "19000101". value = value.gsub('-', '')[0..7] end end ## Default value ## # TODO: Test this. if value.nil? || (actions[:empty_default] && value.empty?) value = actions[:default] end ## Validation ## # TODO: Test this. Test to not reject nil by default. if actions[:reject_if].is_a? Array actions[:reject_if].each do |reject_value| value = :reject if reject_value == value end else value = :reject if actions[:reject_if] == value end if actions[:reject_unless].is_a? Array actions[:reject_unless].each do |reject_value| value = :reject if reject_value != value end else value = :reject if actions[:reject_unless] != :nil && actions[:reject_unless] != value end value end |
#join(left, right, attribute) ⇒ Object
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/datamancer/transform.rb', line 33 def join left, right, attribute attribute = attribute.to_sym left_groups = Hash.new { |hash, key| hash[key] = [] } right_groups = Hash.new { |hash, key| hash[key] = [] } left.each do |tuple| left_groups[tuple[attribute]] << tuple if tuple[attribute] end right.each do |tuple| right_groups[tuple[attribute]] << tuple if tuple[attribute] end output = Array.new left_groups.each do |key, left_group| if right_group = right_groups[key] left_group.each do |left_tuple| right_group.each do |right_tuple| output << left_tuple.merge(right_tuple) end end end end # TODO: Test this: raise StandardError, 'Sadness: null join.' if output.empty? output end |
#keyvalue(file) ⇒ Object
11 12 13 |
# File 'lib/datamancer.rb', line 11 def keyvalue file YAML::load_file file end |
#load(input, args) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# File 'lib/datamancer/load.rb', line 48 def load input, args raise ArgumentError, 'Load requires a destination, i.e. load(data, to: destination)' unless args.is_a?(Hash) && args[:to] ## COLUMNS ## # define_singleton_method :field do |name, options = {}| # @columns << (options[:map] || name) # end # @columns = [] # yield if block_given? ## INSERTS ## define_singleton_method :field do |name, = {}| name = name.to_sym raise MissingField, "Required field '#{name}' was not found in '#{args[:to]}'" unless @input_row.include? name @output_row[[:map] || name] = @input_row[name] @output_row.delete(name) if !args[:exclude] && [:map] end inserts = [] input.each do |row| @input_row = row @output_row = args[:exclude] ? {} : row.dup yield if block_given? inserts << @output_row.values end columns = @output_row.keys ## LOAD ## # TODO: Set 'w' or 'w+' for CSV writing. if args[:to].is_a?(String) mode = if args[:append] then 'a' else 'w' end CSV.open(args[:to], mode) do |csv| csv << columns inserts.each do |insert| csv << insert end end else ::ActiveRecord::Base.establish_connection(args[:to]) # TODO: Test this. table = args[:table] || args[:to][:table] raise ArgumentError, 'Load requires a database table, i.e. load(data, to: destination, table: table_name)' unless table ::ActiveRecord::Base.connection.delete("DELETE FROM #{table}") unless args[:append] batch_size = args[:batch] || 1000 pre_query = "INSERT INTO #{table} (#{columns.join(',')}) VALUES " # String values must be enclosed by single quotes. inserts.map! do |insert| insert.map! do |field| field.is_a?(String) ? "'#{field}'" : (field ? field : 'NULL') end "(#{insert.join(',')})" end until inserts.empty? query = pre_query + inserts.pop(batch_size).join(',') ::ActiveRecord::Base.connection.execute query end end end |
#raw(input, args) ⇒ Object
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/datamancer/load.rb', line 3 def raw input, args raise ArgumentError, 'Raw requires a destination, i.e. raw(data, to: destination)' unless args.is_a?(Hash) && args[:to] ::ActiveRecord::Base.establish_connection(args[:to]) # TODO: Test this. # table = args[:table] || args[:to][:table] # raise ArgumentError, # 'Raw requires a database table, i.e. raw(data, to: destination, table: table_name)' unless table # TODO: Method-overriding safeguard. input.first.each_key do |key| define_singleton_method key.downcase do # Some methods applied to fields might modify the original fields. # Fields could be duplicated in case this be a common problem. #@input_row[key].dup @input_row[key] end end define_singleton_method :db do ::ActiveRecord::Base.connection end define_singleton_method :query do |query| ::ActiveRecord::Base.connection.execute query end input.each do |row| @input_row = row yield if block_given? end end |
#transform(input, args = {}) ⇒ Object
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/datamancer/transform.rb', line 90 def transform input, args = {} # TODO: Mensajes que expliquen mejor los estos errores. if args[:join] raise ArgumentError unless args[:on] raise ArgumentError unless input.first.keys.include?(args[:on].to_sym) raise ArgumentError unless args[:join].first.keys.include?(args[:on].to_sym) input = join input, args[:join], args[:on] end if args[:add] input = add input, args[:add] end if args[:unique] input = unique input, args[:unique] end # TODO: Method-overriding safeguard. input.first.each_key do |key| define_singleton_method key.to_s.gsub(' ', '_').downcase do # Some methods applied to fields might modify the original fields. # Fields could be duplicated in case this be a common problem. #@input_row[key].dup @input_row[key] end end define_singleton_method :row_number do @row_number end define_singleton_method :row do @supplementary_row end define_singleton_method :count do @count += 1 end define_singleton_method :output do @output end define_singleton_method :switch do |slot| @slot = slot end define_singleton_method :field do |name, value = nil, *args| raise MissingField, "Required field '#{name}' was not found" unless @input_row.include?(name.to_sym) @output_row[name.to_sym] = if value.is_a?(Symbol) send(name.downcase).send *args.unshift(value) else value || send(name.downcase) end end define_singleton_method :del_field do |name| raise MissingField, "Filtered field '#{name}' was not found" unless @input_row.include?(name.to_sym) @output_row.delete(name.to_sym) end define_singleton_method :new_field do |name, value| raise ExistingField, "New field '#{name}' already exists" if @input_row.include?(name.to_sym) @output_row[name.to_sym] = value end # TODO: Test for count. @count = 0 # TODO: Test for slots. @output = Hash.new { |h, k| h[k] = [] } input.each_with_index do |row, row_number| # TODO: Test for row_number. # TODO: Test for (supplementary) row. @row_number = row_number @input_row = row @supplementary_row = @input_row.dup @output_row = args[:exclude] ? {} : @input_row.dup yield if block_given? @output[@slot] << @output_row end @output.length == 1? @output[nil] : @output end |
#unique(input, attribute) ⇒ Object
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# File 'lib/datamancer/transform.rb', line 74 def unique input, attribute attribute = attribute.to_sym output = Array.new unique_values = Array.new input.each do |row| unless unique_values.include?(row[attribute]) output << row unique_values << row[attribute] end end output end |
#where(input, attributes) ⇒ Object
70 71 72 |
# File 'lib/datamancer/transform.rb', line 70 def where input, attributes end |