Module: Tabula::LSD

Extended by:
FFI::Library
Defined in:
lib/tabula/line_segment_detector.rb

Constant Summary collapse

DETECT_LINES_DEFAULTS =
{
  :scale_factor => nil,
  :image_size => 2048
}

Class Method Summary collapse

Class Method Details

.detect_lines(image, scale_factor = 1) ⇒ Object

image can be either a String (the path to an image file) or a Java::JavaAwtImage::BufferedImage. For the image-to-pixels conversion, see: stackoverflow.com/questions/6524196/java-get-pixel-array-from-image



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/tabula/line_segment_detector.rb', line 72

# Runs the native line segment detector over an image and converts every
# detected segment into a Tabula::Ruling.
#
# @param image [String, Java::JavaAwtImage::BufferedImage] path to an image
#   file, or an already loaded BufferedImage (subclasses are accepted).
# @param scale_factor [Numeric] multiplier applied to every coordinate and
#   extent handed to Tabula::Ruling.new.
# @return [Array<Tabula::Ruling>] one ruling per detected segment.
# @raise [ArgumentError] if image is neither a String nor a BufferedImage, or
#   if the file at the given path cannot be decoded as an image.
#
# NOTE: the BufferedImage is flushed before returning, including when it was
# supplied by the caller.
def LSD.detect_lines(image, scale_factor=1)

  bimage = if image.is_a?(Java::JavaAwtImage::BufferedImage)
             image
           elsif image.is_a?(String)
             ImageIO.read(java.io.File.new(image))
           else
             raise ArgumentError, 'image must be a string or a BufferedImage'
           end

  # ImageIO.read returns null when no registered reader can decode the file;
  # fail here with a clear message instead of deep inside the conversion.
  raise ArgumentError, "could not read an image from #{image}" if bimage.nil?

  pixels = LSD.image_to_image_float(bimage)

  # Out-parameter through which the native call reports the segment count.
  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)

  out = lsd(lines_found_ptr, pixels, bimage.getWidth, bimage.getHeight)

  rv = begin
    lines_found = lines_found_ptr.get_int

    lines_found.times.map do |i|
      # Each segment is read as 7 floats starting at offset 7*4*i
      # (presumably 7 four-byte floats per record -- confirm against the
      # native lsd output layout). Only the first four are used here.
      a = out[7*4*i].read_array_of_type(:float, 7)

      a_round = a[0..3].map(&:round)
      p1 = [a_round[0], a_round[1]]
      p2 = [a_round[2], a_round[3]]

      # NOTE(review): argument order looks like (top, left, width, height);
      # confirm against Tabula::Ruling#initialize.
      Tabula::Ruling.new(p1[1] * scale_factor,
                         p1[0] * scale_factor,
                         (p2[0] - p1[0]) * scale_factor,
                         (p2[1] - p1[1]) * scale_factor)
    end
  ensure
    # Release the natively allocated result buffer even if building the
    # rulings raised.
    free_values(out)
  end

  bimage.flush
  bimage.getGraphics.dispose

  rv
end

.detect_lines_in_pdf(pdf_path, options = {}) ⇒ Object



45
46
47
48
49
50
51
52
53
54
55
# File 'lib/tabula/line_segment_detector.rb', line 45

# Renders every page of a PDF to an image and runs detect_lines on each one.
#
# @param pdf_path [String] path of the PDF file to open.
# @param options [Hash] overrides merged on top of DETECT_LINES_DEFAULTS:
#   :image_size   - size handed to Tabula::Render.pageToBufferedImage.
#   :scale_factor - multiplier handed to detect_lines; when nil/false the
#                   page's crop box width divided by :image_size is used.
# @return [Array<Array<Tabula::Ruling>>] the detect_lines result for each
#   page, in page order.
def LSD.detect_lines_in_pdf(pdf_path, options={})
  options = DETECT_LINES_DEFAULTS.merge(options)

  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
  begin
    pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
      bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
      detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
    end
  ensure
    # Close the document even when rendering or detection raises, so the
    # underlying file handle is not leaked.
    pdf_file.close
  end
end

.detect_lines_in_pdf_page(pdf_path, page_number, options = {}) ⇒ Object

page_number is zero-indexed.



58
59
60
61
62
63
64
65
66
67
68
# File 'lib/tabula/line_segment_detector.rb', line 58

# Renders a single page of a PDF to an image and runs detect_lines on it.
#
# @param pdf_path [String] path of the PDF, as accepted by Extraction.openPDF.
# @param page_number [Integer] zero-indexed position of the page.
# @param options [Hash] overrides merged on top of DETECT_LINES_DEFAULTS:
#   :image_size   - size handed to Tabula::Render.pageToBufferedImage.
#   :scale_factor - multiplier handed to detect_lines; when nil/false the
#                   page's crop box width divided by :image_size is used.
# @return [Array<Tabula::Ruling>] the rulings detected on the page.
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
  options = DETECT_LINES_DEFAULTS.merge(options)

  pdf_file = Extraction.openPDF(pdf_path)
  begin
    page = pdf_file.getDocumentCatalog.getAllPages[page_number]
    bi = Tabula::Render.pageToBufferedImage(page,
                                            options[:image_size])
    # Read the crop box while the document is still open; the page object
    # should not be queried after its document has been closed.
    scale = options[:scale_factor] || (page.findCropBox.width / options[:image_size])
  ensure
    # Close the document even when page lookup or rendering raises.
    pdf_file.close
  end

  detect_lines(bi, scale)
end