Class: AmazonTRP::Page
- Inherits:
- Object
- Inheritance hierarchy:
- Object
- AmazonTRP::Page
- Defined in:
- lib/amazon-textract-parser-ruby.rb
Instance Attribute Summary collapse
-
#blocks ⇒ Object
readonly
Returns the value of attribute blocks.
-
#content ⇒ Object
readonly
Returns the value of attribute content.
-
#form ⇒ Object
readonly
Returns the value of attribute form.
-
#geometry ⇒ Object
readonly
Returns the value of attribute geometry.
-
#id ⇒ Object
readonly
Returns the value of attribute id.
-
#lines ⇒ Object
readonly
Returns the value of attribute lines.
-
#tables ⇒ Object
readonly
Returns the value of attribute tables.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
Instance Method Summary collapse
- #_parse(blockMap) ⇒ Object
- #getLinesInBoundingBox(boundingBox) ⇒ Object
- #getLinesInReadingOrder ⇒ Object
- #getTextInReadingOrder ⇒ Object
-
#initialize(blocks, blockMap) ⇒ Page
constructor
A new instance of Page.
- #to_s ⇒ Object
Constructor Details
#initialize(blocks, blockMap) ⇒ Page
Returns a new instance of Page.
435 436 437 438 439 440 441 442 443 444 |
# File 'lib/amazon-textract-parser-ruby.rb', line 435

# Builds a Page from its raw blocks.
#
# Stores +blocks+, starts every collection empty, creates a fresh Form,
# and then hands +blockMap+ to #_parse, which fills the attributes in.
# @geometry and @id are not set here; #_parse assigns them only when it
# meets a "PAGE" block.
#
# @param blocks [Object] the blocks belonging to this page (iterated by #_parse)
# @param blockMap [Object] passed through unchanged to #_parse
def initialize(blocks, blockMap)
  @blocks  = blocks
  @text    = ""
  @lines   = []
  @form    = Form.new
  @tables  = []
  @content = []
  _parse(blockMap)
end
Instance Attribute Details
#blocks ⇒ Object (readonly)
Returns the value of attribute blocks.
426 427 428 |
# File 'lib/amazon-textract-parser-ruby.rb', line 426

# Read-only accessor.
#
# @return [Object] the current value of @blocks
def blocks
  @blocks
end
#content ⇒ Object (readonly)
Returns the value of attribute content.
431 432 433 |
# File 'lib/amazon-textract-parser-ruby.rb', line 431

# Read-only accessor.
#
# @return [Object] the current value of @content
def content
  @content
end
#form ⇒ Object (readonly)
Returns the value of attribute form.
429 430 431 |
# File 'lib/amazon-textract-parser-ruby.rb', line 429

# Read-only accessor.
#
# @return [Object] the current value of @form
def form
  @form
end
#geometry ⇒ Object (readonly)
Returns the value of attribute geometry.
432 433 434 |
# File 'lib/amazon-textract-parser-ruby.rb', line 432

# Read-only accessor.
#
# @return [Object] the current value of @geometry
def geometry
  @geometry
end
#id ⇒ Object (readonly)
Returns the value of attribute id.
433 434 435 |
# File 'lib/amazon-textract-parser-ruby.rb', line 433

# Read-only accessor.
#
# @return [Object] the current value of @id
def id
  @id
end
#lines ⇒ Object (readonly)
Returns the value of attribute lines.
428 429 430 |
# File 'lib/amazon-textract-parser-ruby.rb', line 428

# Read-only accessor.
#
# @return [Object] the current value of @lines
def lines
  @lines
end
#tables ⇒ Object (readonly)
Returns the value of attribute tables.
430 431 432 |
# File 'lib/amazon-textract-parser-ruby.rb', line 430

# Read-only accessor.
#
# @return [Object] the current value of @tables
def tables
  @tables
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
427 428 429 |
# File 'lib/amazon-textract-parser-ruby.rb', line 427

# Read-only accessor.
#
# @return [Object] the current value of @text
def text
  @text
end
Instance Method Details
#_parse(blockMap) ⇒ Object
454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 |
# File 'lib/amazon-textract-parser-ruby.rb', line 454

# Walks @blocks and fills in the page's attributes according to each
# block's :block_type:
#
# * "PAGE"          - sets @geometry (from item[:geometry]) and @id.
# * "LINE"          - wraps the block in a Line, records it in @lines and
#                     @content, and appends its text plus a newline to @text.
# * "TABLE"         - wraps the block in a Table, records it in @tables and
#                     @content.
# * "KEY_VALUE_SET" - only when item[:entity_types] includes 'KEY': wraps the
#                     block in a Field and, if the field has a key, adds it
#                     to @form and @content.
#
# Blocks of any other type are ignored.
#
# @param blockMap [Object] handed unchanged to the Line, Table and Field
#   constructors
# @return [Object] whatever @blocks.each returns
def _parse(blockMap)
  @blocks.each do |item|
    case item[:block_type]
    when "PAGE"
      @geometry = Geometry.new(item[:geometry])
      @id = item[:id]
    when "LINE"
      l = Line.new(item, blockMap)
      @lines << l
      @content << l
      # Must be double-quoted: '\n' in single quotes is a literal backslash
      # followed by "n", which left @text without any real line breaks.
      @text += l.text + "\n"
    when "TABLE"
      t = Table.new(item, blockMap)
      @tables << t
      @content << t
    when "KEY_VALUE_SET"
      # Only entries carrying the 'KEY' entity type are turned into fields.
      next unless item[:entity_types].include?('KEY')

      f = Field.new(item, blockMap)
      next unless f.key

      @form.addField(f)
      @content << f
    end
  end
end
#getLinesInBoundingBox(boundingBox) ⇒ Object
515 516 517 518 519 520 521 522 523 524 525 526 527 |
# File 'lib/amazon-textract-parser-ruby.rb', line 515

# Selects the lines whose bounding-box origin (left, top) falls inside
# +boundingBox+, edges included. Only the origin is tested; the line's own
# right/bottom extent is not considered.
#
# @param boundingBox [Object] must respond to #left, #right, #top and #bottom
# @return [Array] a new array with the matching entries of @lines, in
#   their original order
def getLinesInBoundingBox(boundingBox)
  @lines.select do |candidate|
    origin = candidate.geometry.boundingBox
    origin.left.between?(boundingBox.left, boundingBox.right) &&
      origin.top.between?(boundingBox.top, boundingBox.bottom)
  end
end
#getLinesInReadingOrder ⇒ Object
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 |
# File 'lib/amazon-textract-parser-ruby.rb', line 480

# Groups the page's lines into columns and returns them ordered by column.
#
# Each line is assigned to the first known column for which either the
# line's horizontal centre lies strictly inside the column, or the column's
# centre lies strictly inside the line's left/right extent. A line that
# matches no column opens a new one spanning its own left/right edges.
# Columns keep the extent of the line that created them.
#
# @return [Object] the result of AmazonTRP.stable_sort_by applied to an
#   array of {:column => Integer, :text => ...} hashes, keyed on :column
def getLinesInReadingOrder
  columns = []

  placed = @lines.map do |item|
    box = item.geometry.boundingBox
    box_left = box.left
    box_right = box.right

    slot = columns.find_index do |column|
      box_centre = box.left + box.width / 2
      column_centre = column[:left] + ((column[:right] - column[:left]) / 2)

      (box_centre > column[:left] && box_centre < column[:right]) ||
        (column_centre > box_left && column_centre < box_right)
    end

    if slot.nil?
      # No existing column fits: this line starts a new one.
      columns << { :left => box_left, :right => box_right }
      slot = columns.count - 1
    end

    { :column => slot, :text => item.text }
  end

  AmazonTRP.stable_sort_by(placed) { |entry| entry[:column] }
end
#getTextInReadingOrder ⇒ Object
506 507 508 509 510 511 512 513 |
# File 'lib/amazon-textract-parser-ruby.rb', line 506

# Joins the text of every entry returned by #getLinesInReadingOrder into one
# string, terminating each entry with a newline.
#
# @return [String] the concatenated text; "" when there are no lines
def getTextInReadingOrder
  getLinesInReadingOrder.inject("") do |joined, entry|
    joined + entry[:text] + "\n"
  end
end
#to_s ⇒ Object
446 447 448 449 450 451 452 |
# File 'lib/amazon-textract-parser-ruby.rb', line 446

# Renders the page as text: a "Page:" header line followed by the #to_s of
# every item in @content, one per line.
#
# @return [String] the rendered page
def to_s
  @content.inject("Page:\n") do |rendered, item|
    rendered + item.to_s + "\n"
  end
end