Class: AmazonTRP::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/amazon-textract-parser-ruby.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(blocks, blockMap) ⇒ Page

Returns a new instance of Page.



435
436
437
438
439
440
441
442
443
444
# File 'lib/amazon-textract-parser-ruby.rb', line 435

# Builds a Page from its blocks and immediately parses them.
#
# @param blocks [Enumerable] block hashes for this page; stored as-is and
#   iterated by _parse
# @param blockMap [Object] forwarded untouched to _parse
def initialize(blocks, blockMap)
  @blocks = blocks

  # Empty accumulators; _parse fills these in while walking @blocks.
  @content = []
  @lines = []
  @tables = []
  @form = Form.new
  @text = ""

  _parse(blockMap)
end

Instance Attribute Details

#blocks ⇒ Object (readonly)

Returns the value of attribute blocks.



426
427
428
# File 'lib/amazon-textract-parser-ruby.rb', line 426

# Reader for @blocks, which initialize sets to its `blocks` argument.
def blocks
  @blocks
end

#content ⇒ Object (readonly)

Returns the value of attribute content.



431
432
433
# File 'lib/amazon-textract-parser-ruby.rb', line 431

# Reader for @content: the array that _parse appends every Line, Table and
# keyed Field to, in the order the blocks are visited.
def content
  @content
end

#form ⇒ Object (readonly)

Returns the value of attribute form.



429
430
431
# File 'lib/amazon-textract-parser-ruby.rb', line 429

# Reader for @form: the Form created in initialize, to which _parse adds
# fields via addField.
def form
  @form
end

#geometry ⇒ Object (readonly)

Returns the value of attribute geometry.



432
433
434
# File 'lib/amazon-textract-parser-ruby.rb', line 432

# Reader for @geometry: the Geometry that _parse builds from a "PAGE" block.
# It is only assigned when such a block is present, so it is nil otherwise.
def geometry
  @geometry
end

#id ⇒ Object (readonly)

Returns the value of attribute id.



433
434
435
# File 'lib/amazon-textract-parser-ruby.rb', line 433

# Reader for @id: the :id of the "PAGE" block seen by _parse. It is only
# assigned when such a block is present, so it is nil otherwise.
def id
  @id
end

#lines ⇒ Object (readonly)

Returns the value of attribute lines.



428
429
430
# File 'lib/amazon-textract-parser-ruby.rb', line 428

# Reader for @lines: the array of Line objects that _parse builds, one per
# "LINE" block.
def lines
  @lines
end

#tables ⇒ Object (readonly)

Returns the value of attribute tables.



430
431
432
# File 'lib/amazon-textract-parser-ruby.rb', line 430

# Reader for @tables: the array of Table objects that _parse builds, one per
# "TABLE" block.
def tables
  @tables
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



427
428
429
# File 'lib/amazon-textract-parser-ruby.rb', line 427

# Reader for @text: the string _parse accumulates from the text of each
# "LINE" block, in the order the blocks are visited.
def text
  @text
end

Instance Method Details

#_parse(blockMap) ⇒ Object



454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
# File 'lib/amazon-textract-parser-ruby.rb', line 454

# Walks @blocks once and fills in this page's state from each block's
# :block_type:
#
# * "PAGE"          - sets @geometry and @id
# * "LINE"          - appends a Line to @lines and @content, and its text
#                     plus a newline to @text
# * "TABLE"         - appends a Table to @tables and @content
# * "KEY_VALUE_SET" - for blocks whose :entity_types include 'KEY', builds a
#                     Field and, if it has a key, adds it to @form and @content
#
# Blocks with any other :block_type are ignored.
#
# @param blockMap [Object] handed unchanged to Line.new, Table.new and Field.new
# @return [Object] the result of @blocks.each
def _parse(blockMap)
  @blocks.each do |item|
    case item[:block_type]
    when "PAGE"
      @geometry = Geometry.new(item[:geometry])
      @id = item[:id]
    when "LINE"
      line = Line.new(item, blockMap)
      @lines.append(line)
      @content.append(line)
      # Must be double-quoted: a single-quoted '\n' is a literal backslash
      # followed by "n", not a newline.
      @text = @text + line.text + "\n"
    when "TABLE"
      table = Table.new(item, blockMap)
      @tables.append(table)
      @content.append(table)
    when "KEY_VALUE_SET"
      # Only the KEY half of a key/value pair produces a Field.
      next unless item[:entity_types].include?('KEY')

      field = Field.new(item, blockMap)
      next unless field.key

      @form.addField(field)
      @content.append(field)
    end
  end
end

#getLinesInBoundingBox(boundingBox) ⇒ Object



515
516
517
518
519
520
521
522
523
524
525
526
527
# File 'lib/amazon-textract-parser-ruby.rb', line 515

# Selects the lines whose bounding-box origin (left, top) falls inside
# +boundingBox+, edges included.
#
# Only the line's left/top corner is tested; its width and height are not
# considered.
#
# @param boundingBox [#left, #right, #top, #bottom] region to test against
# @return [Array] a new array of the matching entries of @lines, in order
def getLinesInBoundingBox(boundingBox)
  @lines.select do |candidate|
    box = candidate.geometry.boundingBox
    left = box.left
    top = box.top

    left >= boundingBox.left && left <= boundingBox.right &&
      top >= boundingBox.top && top <= boundingBox.bottom
  end
end

#getLinesInReadingOrder ⇒ Object



480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
# File 'lib/amazon-textract-parser-ruby.rb', line 480

# Groups @lines into columns and returns them column by column.
#
# Each line is assigned to the first known column for which either the
# line's horizontal centre lies strictly inside the column, or the column's
# centre lies strictly inside the line's left/right extent. A line that
# matches no column starts a new one spanning its own left..right; existing
# columns are never widened.
#
# @return [Object] the result of AmazonTRP.stable_sort_by over
#   {:column => Integer, :text => line text} hashes, keyed on :column
def getLinesInReadingOrder
  columns = []
  ordered = []

  @lines.each do |line|
    box = line.geometry.boundingBox

    column_index = columns.index do |column|
      box_left = box.left
      box_right = box.right
      box_centre = box.left + box.width/2
      column_centre = column[:left] + ((column[:right] - column[:left]) / 2)

      (box_centre > column[:left] && box_centre < column[:right]) ||
        (column_centre > box_left && column_centre < box_right)
    end

    if column_index.nil?
      # No overlap with any existing column: this line opens a new one.
      columns << { :left => box.left, :right => box.right }
      column_index = columns.count - 1
    end

    ordered << { :column => column_index, :text => line.text }
  end

  AmazonTRP.stable_sort_by(ordered) { |entry| entry[:column] }
end

#getTextInReadingOrder ⇒ Object



506
507
508
509
510
511
512
513
# File 'lib/amazon-textract-parser-ruby.rb', line 506

# Joins the text of every entry from getLinesInReadingOrder into one string,
# each entry followed by a newline.
#
# @return [String] the concatenated text; "" when there are no lines
def getTextInReadingOrder
  getLinesInReadingOrder.inject("") do |joined, entry|
    joined + entry[:text] + "\n"
  end
end

#to_s ⇒ Object



446
447
448
449
450
451
452
# File 'lib/amazon-textract-parser-ruby.rb', line 446

# Renders the page as text: a "Page:" header line followed by the to_s of
# every entry in @content, one per line.
#
# @return [String]
def to_s
  @content.inject("Page:\n") do |rendered, item|
    rendered + item.to_s + "\n"
  end
end