Module: DataMetaByteSer::Py

Includes:
DataMetaByteSer, DataMetaDom, DataMetaDom::PythonLexer
Defined in:
lib/dataMetaByteSer/python.rb

Overview

(De)Serialization for Python

Constant Summary collapse

TEXT_RW_METHODS =

HDFS Reader and Writer for textual Python types such as str.

DataMetaByteSer::RwHolder.new(
        # Reader: aggregates dispatch to the typed collection reader, scalars to readText.
        lambda{|ctx|
            if ctx.fld.aggr
                ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}String(di)")
            else
                ctx.rw.call('DataMetaHadoopUtil.readText(di)')
            end
        },
        # Writer: mirrors the reader; writeTextIfAny tolerates a missing value.
        lambda{|ctx|
            if ctx.fld.aggr
                "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}String(do, val.#{ctx.valGetter})"
            else
                "DataMetaHadoopUtil.writeTextIfAny(do, val.#{ctx.valGetter})"
            end
        }
)
INTEGRAL_RW_METHODS =

HDFS Reader and Writer for integral Python type.

# HDFS Reader and Writer for integral Python types: variable-length VInt for
# fields up to 4 bytes, VLong for fields up to 8 bytes; aggregates dispatch
# to the typed collection helpers on DataMetaHadoopUtil.
RwHolder.new(
lambda{ |ctx|
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    case
        when ctx.fType.length <= 4; ctx.fld.aggr ? ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}Integer(di)") :
                ctx.rw.call('WritableUtils.readVInt(di)')

        when ctx.fType.length <= 8; ctx.fld.aggr ? ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}Long(di)") : ctx.rw.call('WritableUtils.readVLong(di)')

        else; raise "Invalid integer field #{ctx.fld}"
    end
},
lambda{ |ctx|
    # Guard added to mirror the reader: maps must fail on the write path too,
    # otherwise a map field would render a read-time error but a bogus writer.
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    case
        when ctx.fType.length <= 4; ctx.fld.aggr ? "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}Integer(do, val.#{ctx.valGetter})" :
                "WritableUtils.writeVInt(do, val.#{ctx.valGetter})"

        when ctx.fType.length <= 8; ctx.fld.aggr ? "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}Long(do, val.#{ctx.valGetter})" : "WritableUtils.writeVLong(do, val.#{ctx.valGetter})"

        else; raise "Invalid integer field #{ctx.fld}"
    end
})
BOOLEAN_RW_METHODS =

HDFS Reader and Writer for Booleans.

RwHolder.new(
lambda{|ctx|
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    # Aggregates use the collection helper; scalars read straight off the stream.
    if ctx.fld.aggr
        ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}Boolean(di)")
    else
        ctx.rw.call('di.readBoolean()')
    end
},
lambda{|ctx|
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    if ctx.fld.aggr
        "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}Boolean(do, val.#{ctx.valGetter})"
    else
        "do.writeBoolean(val.#{ctx.valGetter})"
    end
})
PRIMITIVABLE_TYPES =

Python has no primitivable types

Set.new # intentionally empty: Python has no primitivable (unboxed) types
FLOAT_RW_METHODS =

HDFS Reader and Writer for floating point types.

RwHolder.new(
lambda{|ctx|
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    # Up to 4 bytes renders a single-precision read, up to 8 a double.
    byteLen = ctx.fType.length
    if byteLen <= 4
        ctx.fld.aggr ? ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}Float(di)") : ctx.rw.call('di.readFloat()')
    elsif byteLen <= 8
        ctx.fld.aggr ? ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}Double(di)") : ctx.rw.call('di.readDouble()')
    else
        raise "Invalid float field #{ctx.fld}"
    end
},
lambda{|ctx|
    mapsNotSupported(ctx.fld) if ctx.fld.trgType # map
    byteLen = ctx.fType.length
    if byteLen <= 4
        ctx.fld.aggr ? "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}Float(do, val.#{ctx.valGetter})" : "do.writeFloat(val.#{ctx.valGetter})"
    elsif byteLen <= 8
        ctx.fld.aggr ? "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}Double(do, val.#{ctx.valGetter})" : "do.writeDouble(val.#{ctx.valGetter})"
    else
        raise "Invalid float field #{ctx.fld}"
    end
})
DTTM_RW_METHODS =

HDFS Reader and Writer for the temporal type, the DateTime

RwHolder.new(
        lambda { |ctx|
            if ctx.fld.aggr
                ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}DateTime(di)")
            else
                ctx.rw.call('DataMetaHadoopUtil.readDttm(di)')
            end
        },
        lambda { |ctx|
            if ctx.fld.aggr
                "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}DateTime(do, val.#{ctx.valGetter})"
            else
                "DataMetaHadoopUtil.writeDttm(do, val.#{ctx.valGetter})"
            end
        }
)
NUMERIC_RW_METHODS =

HDFS Reader and Writer for the variable size Decimal data type.

# HDFS Reader and Writer for the variable size Decimal data type.
RwHolder.new(
        lambda{|ctx|
            ctx.fld.aggr ? ctx.rw.call("DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}BigDecimal(di)") : ctx.rw.call('DataMetaHadoopUtil.readBigDecimal(di)')
        },
        # Writer now mirrors the reader's aggregate handling: previously an
        # aggregated decimal field was read as a collection but written as a
        # scalar, which corrupts the stream for such fields.
        # NOTE(review): assumes DataMetaHadoopUtil provides write{List,Set,Deque}BigDecimal
        # counterparts to the read helpers used above — confirm against the Python runtime lib.
        lambda{|ctx|
            ctx.fld.aggr ? "DataMetaHadoopUtil.write#{aggrPyFull(ctx.fld.aggr)}BigDecimal(do, val.#{ctx.valGetter})" : "DataMetaHadoopUtil.writeBigDecimal(do, val.#{ctx.valGetter})"
        })
ENUM_RW_METHODS =

HDFS Reader and Writer for the Java-style Enums.

RwHolder.new(
        lambda{|ctx|
            aggrNotSupported(ctx.fld, 'Enums') if ctx.fld.aggr
            _, simpleName = DataMetaDom.splitNameSpace(ctx.fType.type)
            # The wire format stores 0-based ordinals (as Java/Scala do), but
            # Python enums start from 1, hence the +1 on read.
            "#{simpleName}(WritableUtils.readVInt(di) + 1)"
        },
        lambda { |ctx|
            aggrNotSupported(ctx.fld, 'Enums') if ctx.fld.aggr
            # Shift the 1-based Python enum value back to the 0-based wire ordinal.
            "WritableUtils.writeVInt(do, val.#{ctx.valGetter}.value - 1)"
        }
)
URL_RW_METHODS =

HDFS Reader and Writer for the URL.

RwHolder.new(
        # URLs travel as plain text on the wire; aggregates are rejected.
        lambda { |ctx|
            aggrNotSupported(ctx.fld, 'URLs') if ctx.fld.aggr
            'DataMetaHadoopUtil.readText(di)'
        },
        lambda { |ctx|
            aggrNotSupported(ctx.fld, 'URLs') if ctx.fld.aggr
            "DataMetaHadoopUtil.writeTextIfAny(do, val.#{ctx.valGetter})"
        }
)
NOT_IMPLEMENTED_METHODS =

Pseudo-implementers that just raise an error

# Both renderers unconditionally raise via aggrNotSupported.
RwHolder.new(
        lambda { |ctx| aggrNotSupported(ctx.fld, 'Serialization') },
        lambda { |ctx| aggrNotSupported(ctx.fld, 'Serialization') }
)
STD_RW_METHODS =

Read/write methods for the standard data types.

{
        # textual
        DataMetaDom::STRING => TEXT_RW_METHODS,
        DataMetaDom::CHAR => TEXT_RW_METHODS,
        DataMetaDom::URL => URL_RW_METHODS,
        # numeric
        DataMetaDom::INT => INTEGRAL_RW_METHODS,
        DataMetaDom::FLOAT => FLOAT_RW_METHODS,
        DataMetaDom::NUMERIC => NUMERIC_RW_METHODS,
        # other scalars
        DataMetaDom::BOOL => BOOLEAN_RW_METHODS,
        DataMetaDom::DATETIME => DTTM_RW_METHODS,
        # raw bytes are not implemented for Python yet
        DataMetaDom::RAW => NOT_IMPLEMENTED_METHODS
}
RECORD_RW_METHODS =

DataMeta DOM object renderer

RwHolder.new(
        lambda { |ctx|
            if ctx.fld.aggr
                if ctx.fld.trgType # map
                    mapsNotSupported(ctx.fld)
                else  # list, set or deque
                    "DataMetaHadoopUtil.read#{aggrPyFull(ctx.fld.aggr)}(di, #{
                    inOutablePy(ctx)}())"
                end
            else # scalar
                "#{inOutablePy(ctx)}().read(di)"
            end
        },
        lambda { |ctx|
            # Fixed: the condition was `ctx.fld.aggr && !ctx.fld.trgType`, which
            # made the inner map check unreachable and silently rendered aggregate
            # maps through the scalar write path. Now maps raise mapsNotSupported,
            # mirroring the reader above.
            if ctx.fld.aggr
                if ctx.fld.trgType # map
                    mapsNotSupported(ctx.fld)
                else  # list, set or deque
                    "DataMetaHadoopUtil.writeCollection(val.#{ctx.valGetter}, do, #{inOutablePy(ctx)}())"
                end
            else # scalar
                "#{inOutablePy(ctx)}().write(do, val.#{ctx.valGetter})"
            end
        }
)
MAP_RW_METHODS =

Read/write methods for the DataMeta DOM Maps, accidentally all the same as for the standard data types.

STD_RW_METHODS # maps reuse the standard-type renderers for their key type

Constants included from DataMetaByteSer

BITSET_RW_METHODS, BOOL_RW_METHODS, RAW_RW_METHODS, VERSION

Class Method Summary collapse

Methods included from DataMetaByteSer

aggrBaseName, aggrJavaFull, helpDataMetaBytesSerGen, inOutableClassName, #tmpVar

Class Method Details

.aggrNotSupported(fld, forWhat) ⇒ Object

Raises:

  • (ArgumentError)


35
36
37
# File 'lib/dataMetaByteSer/python.rb', line 35

# Raises ArgumentError reporting that aggregate types are not supported
# for the given feature (forWhat) on the named field in Byte Array format.
def aggrNotSupported(fld, forWhat)
    raise ArgumentError,
          "Field #{fld.name}: aggregate types are not supported for #{forWhat} for Byte Array format"
end

.aggrPyFull(aggr) ⇒ Object

Full name of a Py aggregate for the given DataMeta DOM aggregate



132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/dataMetaByteSer/python.rb', line 132

# Maps a DataMeta DOM aggregate kind to the name segment used in the generated
# Python helper calls (e.g. readListString); raises ArgumentError for any other kind.
def aggrPyFull(aggr)
    if aggr == DataMetaDom::Field::LIST
        'List'
    elsif aggr == DataMetaDom::Field::SET
        'Set'
    elsif aggr == DataMetaDom::Field::DEQUE
        'Deque' # note this is different from Java
    else
        raise ArgumentError, "Aggregate type #{aggr} not supported for Python serialization"
    end
end

.genWritable(model, wriOut, ioOut, record, pyPackage, baseName) ⇒ Object

Generates one InOutable, Writables here currently are not generated



252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/dataMetaByteSer/python.rb', line 252

# Generates one InOutable (the Python (de)serializer class) for the given record
# into ioOut. Writables are currently not generated: wriOut is accepted but unused.
# Fields are processed in sorted key order so the read and write sequences match.
# Optional fields are tracked in a null-flag bitarray written ahead of the data.
def genWritable(model, wriOut, ioOut, record, pyPackage, baseName)
    # NOTE(review): enumCount and recImports are computed but never referenced
    # below — presumably leftovers from an earlier version; confirm before removing.
    enumCount = model.enums.values.select{|e| e.kind_of?(DataMetaDom::Enum)}.size
    recImports = model.records.values.map{|r| # import all records
        p, b, pp = DataMetaDom::PojoLexer::assertNamespace(r.name)
        %|from #{DataMetaXtra::Str.downCaseFirst(b)} import #{b}|
    }.join("\n")
#             ioImports = model.records.values.reject{|r| r.name == record.name}.map{|r| # import all InOutables except of this one
#                 p, b, pp = DataMetaDom::PojoLexer::assertNamespace(r.name)
#                 # since one InOutable may import another which may import another, and Python can't handle this,
#                 # catch the error. It's harmless because if it really failed to import, we'll know
#                 %|
# try:
#     from #{inOutablePy(DataMetaXtra::Str.downCaseFirst(b))} import #{inOutablePy(b)}
# except ImportError:
#     pass|
#             }.join("\n")
    ctx = RendCtx.new.init(model, record, pyPackage, baseName)
    fields = record.fields
    wriName = nil # writableClassName(baseName)
    ioName = inOutablePy(baseName)
    hasOptional = fields.values.map{|f|
#      !model.records[f.dataType.type] &&
        !f.isRequired
    }.reduce(:|) # true if there is at least one optional field which isn't a record
    # sort the symbolic keys via their string form for a stable, predictable order
    keysInOrder = fields.each_key.map{|k| k.to_s}.sort.map{|k| k.to_sym}
    reads = ''
    writes = ''
    # null-mask bookkeeping is only emitted when the record has optional fields
    writeNullMaskHead = hasOptional ? "nullFlags = bitarray(#{fields.keys.size}); nullFlags.setall(False); fldIndex = -1" : ''
    readNullMaskHead = hasOptional ? 'nullFlags = DataMetaHadoopUtil.readBitArray(di); fldIndex = -1' : ''
    indent = "\n#{' ' * 8}" # 8 spaces: method-body depth inside the generated class
    # sorting provides predictable read/write order
    keysInOrder.each { |k|
        f = fields[k]
        ctx.fld = f
        rwRenderer = getRwRenderer(ctx)
        # optional fields advance fldIndex and consult the null mask before assigning
        reads <<  ( indent + (f.isRequired ? '' : "fldIndex += 1#{indent}") + "val.#{DataMetaDom.setterName(ctx.fld)}(" +
                (f.isRequired ? '' : ' None if nullFlags[fldIndex] else ')+ "#{rwRenderer.r.call(ctx)})"
        )
        # noinspection RubyNestedTernaryOperatorsInspection
        writes << (indent + (f.isRequired ?
                (PRIMITIVABLE_TYPES.member?(f.dataType.type) ? '' : ''):
#%Q<if(val.#{DataMetaDom::PojoLexer::getterName(ctx.fld)}() == null) throw noReqFld("#{f.name}"); >) :
                "if(val.#{DataMetaDom.getterName(ctx.fld)}() is not None): ") + "#{rwRenderer.w.call(ctx)}")
        unless f.isRequired
            writeNullMaskHead << (indent + "fldIndex += 1#{indent}if(val.#{DataMetaDom.getterName(ctx.fld)}() is None): nullFlags[fldIndex] = True")
        end
    }
    writeNullMaskHead << ( indent + 'DataMetaHadoopUtil.writeBitArray(do, nullFlags)') if hasOptional

    # Emit the InOutable class; the reads/writes snippets carry their own indentation.
    ioOut.puts <<IN_OUTABLE_CLASS

class #{ioName}(InOutable):

    def write(self, do, val):
val.verify()
#{writeNullMaskHead}
#{writes}

    def readVal(self, di, val):
#{readNullMaskHead}
#{reads}
return val

    def read(self, di):
return self.readVal(di, #{baseName}())

IN_OUTABLE_CLASS
end

.genWritables(model, outRoot) ⇒ Object

Generates all the writables for the given model. Parameters:

  • model - the model to generate Writables from.

  • outRoot - destination directory name.



327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
# File 'lib/dataMetaByteSer/python.rb', line 327

# Generates all the InOutables for the given model into a single serial.py file.
# Parameters:
#   * model - the model to generate Writables from.
#   * outRoot - destination directory name.
# The Python package name/path is derived from the first record's namespace.
def genWritables(model, outRoot)
    firstRecord = model.records.values.first
    pyPackage, base, packagePath = DataMetaDom::PojoLexer::assertNamespace(firstRecord.name)
    # Next: replace dots with underscores.The path also adjusted accordingly.
    #
    # Rationale for this, quoting PEP 8:
    #
    #    Package and Module Names
    #
    #    Modules should have short, all-lowercase names. Underscores can be used in the module name if it improves
    #    readability. Python packages should also have short, all-lowercase names, although the use of underscores
    #    is discouraged.
    #
    # Short and all-lowercase names, and improving readability if you have complex system and need long package names,
    # is "discouraged". Can't do this here, our system is more complicated for strictly religious, "pythonic" Python.
    # A tool must be enabling, and in this case, this irrational ruling gets in the way.
    # And dots are a no-no, Python can't find packages with complicated package structures and imports.
    #
    # Hence, we opt for long package names with underscores for distinctiveness and readability:
    pyPackage = pyPackage.gsub('.', '_')
    packagePath = packagePath.gsub('/', '_')
    destDir = File.join(outRoot, packagePath)
    FileUtils.mkdir_p destDir
    wriOut = nil # File.open(File.join(destDir, "#{writableClassName(base)}.py"), 'wb')
    serFile = File.join(destDir, 'serial.py')
    # start fresh: drop any previously generated file before reopening it
    FileUtils.rm serFile if File.file?(serFile)
    ioOut = File.open(serFile, 'wb') # one huge serialization file
    ioOut.puts %|# This file is generated by DataMeta DOM. Do not edit manually!
#package #{pyPackage}

from hadoop.io import WritableUtils, InputStream, OutputStream, Text
from ebay_datameta_core.base import DateTime
from decimal import *
from collections import *
from bitarray import bitarray
from ebay_datameta_hadoop.base import *
from model import *

|
    begin
        model.records.values.each { |e|
                _, base, _ = DataMetaDom::PojoLexer::assertNamespace(e.name)
                case
                    when e.kind_of?(DataMetaDom::Record)
                        genWritable model, wriOut, ioOut, e, pyPackage, base
                    else
                        raise "Unsupported Entity: #{e.inspect}"
                end
        }
    ensure
        begin
            ioOut.close
        ensure
            #wriOut.close
        end
    end
end

.getRwRenderer(ctx) ⇒ Object

Build the Read/Write operation renderer for the given context:



229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# File 'lib/dataMetaByteSer/python.rb', line 229

# Builds the Read/Write operation renderer for the given context: standard
# types resolve directly from STD_RW_METHODS; otherwise the referenced
# enum/record is looked up on the model and dispatched by its kind.
def getRwRenderer(ctx)
    dataType = ctx.fld.dataType
    ctx.refType = nil # reset to avoid misrendering primitives
    stdRenderer = STD_RW_METHODS[dataType.type]
    return stdRenderer if stdRenderer

    refKey = dataType.type
    ctx.refType = ctx.model.enums[refKey] || ctx.model.records[refKey]
    # case/when against classes uses ===, i.e. a kind_of? test; a nil refType
    # matches nothing and falls through to the error below.
    case ctx.refType
        when DataMetaDom::Record
            RECORD_RW_METHODS
        when DataMetaDom::Enum
            ENUM_RW_METHODS
        when DataMetaDom::BitSet
            NOT_IMPLEMENTED_METHODS
        when DataMetaDom::Mapping
            MAP_RW_METHODS[ctx.fType.type] || (raise ArgumentError, "No renderer found for the key type #{
            ctx.fType.type}, record #{ctx.rec}, field #{ctx.fld}")
        else
            raise "No renderer defined for field #{ctx.fld}"
    end
end

.inOutablePy(arg) ⇒ Object

Builds a class name for a InOutable.



20
21
22
23
24
25
26
27
28
29
# File 'lib/dataMetaByteSer/python.rb', line 20

# Builds the InOutable class name: a String argument is used verbatim as the
# base name; anything else is treated as a render context whose field type's
# simple (namespace-stripped) name is used.
def inOutablePy(arg)
    if arg.kind_of?(String)
        "#{arg}_InOutable"
    else
        _, simpleName = DataMetaDom.splitNameSpace(arg.fType.type)
        "#{simpleName}_InOutable"
    end
end

.mapsNotSupported(fld) ⇒ Object

Raises:

  • (ArgumentError)


31
32
33
# File 'lib/dataMetaByteSer/python.rb', line 31

# Raises ArgumentError for a field that uses a map type, which the
# Byte Array format does not currently support.
def mapsNotSupported(fld)
    raise ArgumentError,
          "Field #{fld.name}: maps are not currently supported for Byte Array format"
end

.writableClassName(baseName) ⇒ Object

Builds a class name for a Writable.



16
# File 'lib/dataMetaByteSer/python.rb', line 16

# Builds the Writable class name for the given base name.
def writableClassName(baseName)
    "#{baseName}_Writable"
end