Class: Tapsoob::DataStream
- Inherits:
-
Object
- Object
- Tapsoob::DataStream
- Defined in:
- lib/tapsoob/data_stream.rb
Direct Known Subclasses: DataStreamKeyed
Constant Summary collapse
- DEFAULT_CHUNKSIZE =
1000
Instance Attribute Summary collapse
-
#db ⇒ Object
readonly
Returns the value of attribute db.
-
#state ⇒ Object
readonly
Returns the value of attribute state.
Class Method Summary collapse
Instance Method Summary collapse
- #complete? ⇒ Boolean
- #encode_rows(rows) ⇒ Object
- #error ⇒ Object
- #error=(val) ⇒ Object
- #fetch(opts = {}) ⇒ Object
- #fetch_chunksize ⇒ Object
- #fetch_data_in_database(params) ⇒ Object
- #fetch_database ⇒ Object
- #fetch_file(dump_path) ⇒ Object
- #fetch_from_database ⇒ Object
-
#fetch_rows ⇒ Object
keep a record of the average chunksize within the first few hundred thousand records, after chunksize goes below 100 or maybe if offset is > 1000.
- #import_rows(rows) ⇒ Object
- #increment(row_count) ⇒ Object
-
#initialize(db, state) ⇒ DataStream
constructor
A new instance of DataStream.
- #log ⇒ Object
- #max_chunksize_training ⇒ Object
- #order_by(name = nil) ⇒ Object
- #parse_encoded_data(encoded_data, checksum) ⇒ Object
- #string_columns ⇒ Object
- #table ⇒ Object
- #table_name ⇒ Object
- #table_name_sql ⇒ Object
- #to_hash ⇒ Object
- #to_json ⇒ Object
- #update_chunksize_stats ⇒ Object
- #verify_stream ⇒ Object
Constructor Details
#initialize(db, state) ⇒ DataStream
Returns a new instance of DataStream.
11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/tapsoob/data_stream.rb', line 11

# Build a stream over +db+; +state+ overlays the default progress counters.
def initialize(db, state)
  @db = db
  # Seed the progress-tracking counters, then let the caller's state win.
  defaults = {
    :offset          => 0,
    :avg_chunksize   => 0,
    :num_chunksize   => 0,
    :total_chunksize => 0
  }
  @state = defaults.merge(state)
  @state[:chunksize] ||= DEFAULT_CHUNKSIZE
  @complete = false
end
Instance Attribute Details
#db ⇒ Object (readonly)
Returns the value of attribute db.
9 10 11 |
# File 'lib/tapsoob/data_stream.rb', line 9

# Reader for the underlying Sequel database handle.
def db
  @db
end
#state ⇒ Object (readonly)
Returns the value of attribute state.
9 10 11 |
# File 'lib/tapsoob/data_stream.rb', line 9

# Reader for the mutable progress-state hash.
def state
  @state
end
Class Method Details
.factory(db, state) ⇒ Object
239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
# File 'lib/tapsoob/data_stream.rb', line 239

# Build the appropriate stream for a table: DataStreamKeyed when the table
# has a single integer primary key, a plain DataStream otherwise. A state
# hash that carries a :klass entry (serialized from a previous run) is
# resolved back to that class.
def self.factory(db, state)
  if defined?(Sequel::MySQL) && Sequel::MySQL.respond_to?(:convert_invalid_date_time=)
    Sequel::MySQL.convert_invalid_date_time = :nil
  end

  # Resolve the serialized class name via const_get rather than eval —
  # same result for a constant name, but no arbitrary-code execution if
  # the state hash comes from an untrusted dump.
  if state.has_key?(:klass)
    return Object.const_get(state[:klass].to_s).new(db, state)
  end

  if Tapsoob::Utils.single_integer_primary_key(db, state[:table_name].to_sym)
    DataStreamKeyed.new(db, state)
  else
    DataStream.new(db, state)
  end
end
.parse_json(json) ⇒ Object
200 201 202 203 204 |
# File 'lib/tapsoob/data_stream.rb', line 200

# Parse a serialized stream description, symbolizing the top-level keys
# and the nested :state hash.
def self.parse_json(json)
  parsed = JSON.parse(json).symbolize_keys
  parsed[:state].symbolize_keys! if parsed.key?(:state)
  parsed
end
Instance Method Details
#complete? ⇒ Boolean
142 143 144 |
# File 'lib/tapsoob/data_stream.rb', line 142

# True once the stream has produced its final chunk.
def complete?
  @complete
end
#encode_rows(rows) ⇒ Object
118 119 120 |
# File 'lib/tapsoob/data_stream.rb', line 118

# Serialize a rows hash for transport: Marshal then base64.
def encode_rows(rows)
  marshalled = Marshal.dump(rows)
  Tapsoob::Utils.base64encode(marshalled)
end
#error ⇒ Object
32 33 34 |
# File 'lib/tapsoob/data_stream.rb', line 32

# The last recorded error flag, or false when none has been set.
def error
  current = state[:error]
  current || false
end
#error=(val) ⇒ Object
28 29 30 |
# File 'lib/tapsoob/data_stream.rb', line 28

# Record an error flag in the stream state.
def error=(val)
  state[:error] = val
end
#fetch(opts = {}) ⇒ Object
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
# File 'lib/tapsoob/data_stream.rb', line 122

# Pull the next chunk — from a dump file when opts[:type] == "file",
# otherwise from the database — and return
# [encoded_data, row_count, elapsed_seconds].
def fetch(opts = {})
  opts = { :type => "database", :source => db.uri } if opts.empty?

  log.debug "DataStream#fetch state -> #{state.inspect}"

  started_at = Time.now
  rows = opts[:type] == "file" ? fetch_file(opts[:source]) : fetch_rows
  encoded_data = encode_rows(rows)
  elapsed_time = Time.now - started_at

  # File streams signal completion with an empty :data array; database
  # streams with an empty rows hash.
  @complete =
    if opts[:type] == "file"
      rows[:data] == []
    else
      rows == {}
    end

  [encoded_data, @complete ? 0 : rows[:data].size, elapsed_time]
end
#fetch_chunksize ⇒ Object
103 104 105 106 107 108 109 |
# File 'lib/tapsoob/data_stream.rb', line 103

# The chunk size to use for the next fetch: the configured size while the
# average is still being trained (or after an error), otherwise the larger
# of the running average and the configured size.
def fetch_chunksize
  configured = state[:chunksize]
  return configured if state[:num_chunksize] < max_chunksize_training
  return configured if state[:avg_chunksize] == 0
  return configured if state[:error]

  [state[:avg_chunksize], configured].max
end
#fetch_data_in_database(params) ⇒ Object
185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
# File 'lib/tapsoob/data_stream.rb', line 185

# Decode a chunk received as params[:encoded_data] (verified against
# params[:checksum]) and import it; returns the number of rows imported.
def fetch_data_in_database(params)
  rows = parse_encoded_data(params[:encoded_data], params[:checksum])

  @complete = rows[:data] == []

  if @complete
    0
  else
    import_rows(rows)
    rows[:data].size
  end
end
#fetch_database ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# File 'lib/tapsoob/data_stream.rb', line 146

# Fetch and decode one chunk from the database, sync local state with the
# returned remote state, yield the rows to the caller's block, and return
# the number of rows fetched (0 when the stream is complete).
def fetch_database
  params = fetch_from_database
  json = params[:json]

  rows = parse_encoded_data(params[:encoded_data], json[:checksum])
  @complete = rows == {}

  # Adopt the remote state, but keep our own chunksize setting.
  state.merge!(json[:state].merge(:chunksize => state[:chunksize]))

  return 0 if @complete

  yield rows if block_given?
  state[:offset] += rows[:data].size
  rows[:data].size
end
#fetch_file(dump_path) ⇒ Object
86 87 88 89 90 91 92 93 94 95 96 97 |
# File 'lib/tapsoob/data_stream.rb', line 86

# Read the next chunk of rows for this table from a JSON dump under
# +dump_path+/data/<table>.json. Returns a rows hash with :table_name,
# :header and :data keys.
def fetch_file(dump_path)
  state[:chunksize] = fetch_chunksize
  ds = JSON.parse(File.read(File.join(dump_path, "data", "#{table_name}.json")))
  log.debug "DataStream#fetch_file"
  rows = {
    :table_name => ds["table_name"],
    :header     => ds["header"],
    # Array#[](start, length): take at most one chunk starting at the
    # current offset. The previous code passed (offset + chunksize) as the
    # *length*, so slices grew as the offset advanced.
    :data       => ds["data"][state[:offset], state[:chunksize]] || []
  }
  update_chunksize_stats
  rows
end
#fetch_from_database ⇒ Object
167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
# File 'lib/tapsoob/data_stream.rb', line 167

# Fetch one encoded chunk, letting Utils.calculate_chunksize shrink the
# chunk size on retry. Returns { :json => {checksum, state}, :encoded_data }.
def fetch_from_database
  result = nil

  log.debug "DataStream#fetch_from_database state -> #{state.inspect}"

  state[:chunksize] = Tapsoob::Utils.calculate_chunksize(state[:chunksize]) do |c|
    state[:chunksize] = c.to_i
    encoded_data = fetch.first

    checksum = Tapsoob::Utils.checksum(encoded_data).to_s

    result = {
      :json         => { :checksum => checksum, :state => to_hash },
      :encoded_data => encoded_data
    }
  end

  result
end
#fetch_rows ⇒ Object
keep a record of the average chunksize within the first few hundred thousand records, after chunksize goes below 100 or maybe if offset is > 1000
73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/tapsoob/data_stream.rb', line 73

# Query the next chunk of rows from the table (ordered, limited to the
# current chunksize at the current offset) and format them for transport.
def fetch_rows
  state[:chunksize] = fetch_chunksize
  dataset = table.order(*order_by).limit(state[:chunksize], state[:offset])
  log.debug "DataStream#fetch_rows SQL -> #{dataset.sql}"
  rows = Tapsoob::Utils.format_data(
    dataset.all,
    :string_columns => string_columns,
    :schema         => db.schema(table_name),
    :table          => table_name
  )
  update_chunksize_stats
  rows
end
#import_rows(rows) ⇒ Object
220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/tapsoob/data_stream.rb', line 220

# Bulk-import a decoded rows hash into the table and advance the offset.
# Translates integer-overflow failures (typical of SQLite sources) into
# Tapsoob::InvalidData.
def import_rows(rows)
  table.import(rows[:header], rows[:data], :commit_every => 100)
  state[:offset] += rows[:data].size
# NOTE(review): rescuing Exception also traps signals/SystemExit; consider
# narrowing to StandardError — kept as-is to preserve behavior.
rescue Exception => ex
  # The source read `case ex.` — the `.message` receiver was lost in
  # extraction; the regexp match only makes sense against the message text.
  case ex.message
  when /integer out of range/ then
    raise Tapsoob::InvalidData, <<-ERROR, []
\nDetected integer data that exceeds the maximum allowable size for an integer type. This generally occurs when importing from SQLite due to the fact that SQLite does not enforce maximum values on integer types.
    ERROR
  else raise ex
  end
end
#increment(row_count) ⇒ Object
67 68 69 |
# File 'lib/tapsoob/data_stream.rb', line 67

# Advance the stream offset by +row_count+ rows.
def increment(row_count)
  state[:offset] = state[:offset] + row_count
end
#log ⇒ Object
23 24 25 26 |
# File 'lib/tapsoob/data_stream.rb', line 23

# The shared Tapsoob logger, forced to DEBUG when state[:debug] is set.
def log
  # Level is (re)applied on every call so a state change takes effect
  # immediately.
  Tapsoob.log.level = Logger::DEBUG if state[:debug]
  Tapsoob.log
end
#max_chunksize_training ⇒ Object
99 100 101 |
# File 'lib/tapsoob/data_stream.rb', line 99

# Number of chunks over which the average chunk size is trained.
def max_chunksize_training
  20
end
#order_by(name = nil) ⇒ Object
60 61 62 63 64 65 |
# File 'lib/tapsoob/data_stream.rb', line 60

# Memoized ordering columns for the table (+name+ defaults to table_name).
def order_by(name = nil)
  @order_by ||= Tapsoob::Utils.order_by(db, name || table_name)
end
#parse_encoded_data(encoded_data, checksum) ⇒ Object
206 207 208 209 210 211 212 213 214 215 216 217 218 |
# File 'lib/tapsoob/data_stream.rb', line 206

# Verify +encoded_data+ against +checksum+ and unmarshal it. On a marshal
# failure the raw chunk is written to dump.<pid>.dat (unless suppressed via
# NO_DUMP_MARSHAL_ERRORS) and the error re-raised.
def parse_encoded_data(encoded_data, checksum)
  raise Tapsoob::CorruptedData.new("Checksum Failed") unless Tapsoob::Utils.valid_data?(encoded_data, checksum)

  begin
    # NOTE: Marshal.load on external data is only safe because the checksum
    # above ties it to data this tool produced.
    Marshal.load(Tapsoob::Utils.base64decode(encoded_data))
  rescue Object => e
    unless ENV['NO_DUMP_MARSHAL_ERRORS']
      puts "Error encountered loading data, wrote the data chunk to dump.#{Process.pid}.dat"
      File.open("dump.#{Process.pid}.dat", "w") { |f| f.write(encoded_data) }
    end
    raise e
  end
end
#string_columns ⇒ Object
52 53 54 |
# File 'lib/tapsoob/data_stream.rb', line 52

# Memoized list of columns whose blob typing needs correcting.
def string_columns
  @string_columns ||= Tapsoob::Utils.incorrect_blobs(db, table_name)
end
#table ⇒ Object
56 57 58 |
# File 'lib/tapsoob/data_stream.rb', line 56

# Memoized Sequel dataset for this stream's table.
def table
  @table ||= db[table_name_sql]
end
#table_name ⇒ Object
36 37 38 |
# File 'lib/tapsoob/data_stream.rb', line 36

# The table name from state, as a Symbol.
def table_name
  state[:table_name].to_sym
end
#table_name_sql ⇒ Object
40 41 42 |
# File 'lib/tapsoob/data_stream.rb', line 40

# The identifier used to address the table in SQL (same as table_name here).
def table_name_sql
  table_name
end
#to_hash ⇒ Object
44 45 46 |
# File 'lib/tapsoob/data_stream.rb', line 44

# State snapshot plus the concrete class name, for serialization.
def to_hash
  state.merge(:klass => self.class.to_s)
end
#to_json ⇒ Object
48 49 50 |
# File 'lib/tapsoob/data_stream.rb', line 48

# JSON form of the serializable state snapshot.
def to_json
  snapshot = to_hash
  JSON.generate(snapshot)
end
#update_chunksize_stats ⇒ Object
111 112 113 114 115 116 |
# File 'lib/tapsoob/data_stream.rb', line 111

# Fold the current chunksize into the running average, for the first
# max_chunksize_training chunks only.
def update_chunksize_stats
  return if state[:num_chunksize] >= max_chunksize_training
  state[:total_chunksize] += state[:chunksize]
  state[:num_chunksize] += 1
  # num_chunksize was just incremented so it is at least 1; guard explicitly
  # instead of the original bare inline `rescue`, which silently swallowed
  # *any* StandardError in the division.
  state[:avg_chunksize] =
    if state[:num_chunksize] > 0
      state[:total_chunksize] / state[:num_chunksize]
    else
      state[:chunksize]
    end
end
#verify_stream ⇒ Object
235 236 237 |
# File 'lib/tapsoob/data_stream.rb', line 235

# Reset the offset to the actual number of rows present in the table.
def verify_stream
  row_total = table.count
  state[:offset] = row_total
end