Module: Tabula::LSD

Extended by:
FFI::Library
Defined in:
lib/tabula/line_segment_detector.rb

Constant Summary collapse

DETECT_LINES_DEFAULTS =
{
  :scale_factor => nil,
  :image_size => 2048
}

Class Method Summary collapse

Class Method Details

.detect_lines(image, scale_factor = 1) ⇒ Object

image can be either a String (path to an image file) or a Java::JavaAwtImage::BufferedImage. For background on converting an image to a pixel array, see: stackoverflow.com/questions/6524196/java-get-pixel-array-from-image



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/tabula/line_segment_detector.rb', line 67

# Runs the native line segment detector over an image and returns the
# detected segments as rulings.
#
# @param image [String, Java::JavaAwtImage::BufferedImage] path to an image
#   file, or an already loaded BufferedImage (subclasses are accepted)
# @param scale_factor [Numeric] multiplier applied to every coordinate and
#   extent of the detected segments before they are wrapped in a Ruling
# @return [Array<Tabula::Ruling>] one ruling per detected segment
# @raise [ArgumentError] if +image+ is neither a String nor a BufferedImage,
#   or if the file at the given path cannot be decoded as an image
def LSD.detect_lines(image, scale_factor=1)
  bimage = case image
           when Java::JavaAwtImage::BufferedImage
             image
           when String
             ImageIO.read(java.io.File.new(image))
           else
             raise ArgumentError, 'image must be a string or a BufferedImage'
           end

  # ImageIO.read yields null (nil here) when no registered reader can decode
  # the file; fail with a clear message instead of a later nil error.
  raise ArgumentError, "could not read an image from #{image}" if bimage.nil?

  # Keep the float buffer in its own local rather than clobbering the
  # +image+ parameter.
  image_float = LSD.image_to_image_float(bimage)

  lines_found_ptr = FFI::MemoryPointer.new(:int, 1)

  out = lsd(lines_found_ptr, image_float, bimage.getWidth, bimage.getHeight)

  begin
    lines_found = lines_found_ptr.get_int

    rv = lines_found.times.map do |i|
      # Each segment occupies 7 floats of 4 bytes each; only the first four
      # are used (presumably x1, y1, x2, y2 -- confirm against the LSD C API).
      a = out[7*4*i].read_array_of_type(:float, 7)

      x1, y1, x2, y2 = a[0..3].map(&:round)

      # NOTE(review): argument order looks like (top, left, width, height);
      # verify against Tabula::Ruling#initialize.
      Tabula::Ruling.new(y1 * scale_factor,
                         x1 * scale_factor,
                         (x2 - x1) * scale_factor,
                         (y2 - y1) * scale_factor)
    end
  ensure
    # Release the native result buffer even if decoding a segment raised.
    free_values(out)
  end

  bimage.flush
  bimage.getGraphics.dispose

  rv
end

.detect_lines_in_pdf(pdf_path, options = {}) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
# File 'lib/tabula/line_segment_detector.rb', line 40

# Detects line segments on every page of a PDF.
#
# Each page is rendered to a BufferedImage via Tabula::Render and handed to
# detect_lines.
#
# @param pdf_path [String] path to the PDF file
# @param options [Hash] overrides merged over DETECT_LINES_DEFAULTS
# @option options [Numeric, nil] :scale_factor factor passed to detect_lines;
#   when nil, the page's crop box width divided by :image_size is used
# @option options [Integer] :image_size size passed to
#   Tabula::Render.pageToBufferedImage
# @return [Array<Array<Tabula::Ruling>>] one array of rulings per page, in
#   page order
def LSD.detect_lines_in_pdf(pdf_path, options={})
  options = DETECT_LINES_DEFAULTS.merge(options)

  pdf_file = PDDocument.loadNonSeq(java.io.File.new(pdf_path), nil)
  begin
    pdf_file.getDocumentCatalog.getAllPages.to_a.map do |page|
      bi = Tabula::Render.pageToBufferedImage(page, options[:image_size])
      detect_lines(bi, options[:scale_factor] || (page.findCropBox.width / options[:image_size]))
    end
  ensure
    # Close the document even when rendering or detection fails on a page.
    pdf_file.close
  end
end

.detect_lines_in_pdf_page(pdf_path, page_number, options = {}) ⇒ Object

page_number is zero-indexed (0 refers to the first page).



53
54
55
56
57
58
59
60
61
62
63
# File 'lib/tabula/line_segment_detector.rb', line 53

# Detects line segments on a single page of a PDF.
#
# @param pdf_path [String] path to the PDF file
# @param page_number [Integer] zero-indexed page number
# @param options [Hash] overrides merged over DETECT_LINES_DEFAULTS
# @option options [Numeric, nil] :scale_factor factor passed to detect_lines;
#   when nil, the page's crop box width divided by :image_size is used
# @option options [Integer] :image_size size passed to
#   Tabula::Render.pageToBufferedImage
# @return [Array<Tabula::Ruling>] rulings detected on the requested page
def LSD.detect_lines_in_pdf_page(pdf_path, page_number, options={})
  options = DETECT_LINES_DEFAULTS.merge(options)

  pdf_file = Extraction.openPDF(pdf_path)
  begin
    page = pdf_file.getDocumentCatalog.getAllPages[page_number]
    bi = Tabula::Render.pageToBufferedImage(page,
                                            options[:image_size])
    # Read the crop box while the document is still open, so nothing touches
    # the page object after close.
    scale_factor = options[:scale_factor] || (page.findCropBox.width / options[:image_size])
  ensure
    # Close the document even if rendering raised. It is closed before
    # detection runs, as in the original ordering.
    pdf_file.close
  end

  detect_lines(bi, scale_factor)
end