Class: AIPP::PDF

Inherits:
Object
  • Object
show all
Defined in:
lib/aipp/pdf.rb

Overview

PDF to text reader with support for pages and fencing

Examples:

pdf = AIPP::PDF.new("/path/to/file.pdf")
pdf.file   # => #<Pathname:/path/to/file.pdf>
pdf.from(100).to(200).each_line do |line, page, last|
  line   # => line content (e.g. "first line")
  page   # => page number (e.g. 1)
  last   # => last line boolean (true for last line, false otherwise)
end

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file, cache: true) ⇒ PDF



16
17
18
19
20
21
# File 'lib/aipp/pdf.rb', line 16

# Build a reader for the given PDF file.
#
# @param file [Pathname, String] path to the PDF; anything that is not
#   already a Pathname is converted with Kernel#Pathname
# @param cache [Boolean] when true the content comes from +read_cache+,
#   otherwise from +read+ (both are defined outside this snippet;
#   presumably they return a [text, page_ranges] pair -- confirm there)
def initialize(file, cache: true)
  @file = Pathname === file ? file : Pathname(file)
  content = cache ? read_cache : read
  @text, @page_ranges = content
  # The fence initially spans the whole text: index 0 up to the last index.
  @from = 0
  @last = @text.length - 1
  @to = @last
end

Instance Attribute Details

#file ⇒ Object (readonly)

Returns the value of attribute file.



14
15
16
# File 'lib/aipp/pdf.rb', line 14

# Reader for the +@file+ instance variable, which the constructor sets to
# a Pathname built from the path it was given.
#
# @return [Pathname] path of the PDF file
def file
  @file
end

Instance Method Details

#each_line {|line, page, last| ... } ⇒ Enumerator Also known as: each

Executes the block for every line and passes the line content, page number and end of document boolean.

If no block is given, an enumerator is returned instead.

Yield Parameters:

  • line (String)

    content of the line

  • page (Integer)

    page number the line is found on within the PDF

  • last (Boolean)

    true for the last line, false otherwise



82
83
84
85
86
87
88
89
# File 'lib/aipp/pdf.rb', line 82

# Executes the block for every line of the fenced text and passes the line
# content, the page number and a "last line" boolean.
#
# If no block is given, an enumerator is returned instead.
#
# @yieldparam line [String] content of the line
# @yieldparam page [Integer] value returned by +page_for+ for the offset
#   at which the line starts
# @yieldparam last [Boolean] true for the last line, false otherwise
# @return [Enumerator] when called without a block
def each_line
  # +each+ is documented as an alias of this method.
  return enum_for(:each) unless block_given?
  # Split the text once and reuse the result: every call to +lines+
  # slices and regex-splits the fenced text again.
  all_lines = lines
  last_line_index = all_lines.size - 1
  # Offsets are absolute positions in the unfenced text, hence they start
  # at the lower fence rather than at zero.
  offset = @from
  all_lines.each_with_index do |line, line_index|
    yield(line, page_for(index: offset), line_index == last_line_index)
    offset += line.length
  end
end

#from(index) ⇒ self

Fence the PDF beginning with this index



33
34
35
36
37
38
# File 'lib/aipp/pdf.rb', line 33

# Fence the PDF beginning with this index.
#
# @param index [Integer, Symbol] lower fence; +:begin+ stands for index 0
# @return [self] to allow chaining
# @raise [ArgumentError] if the index lies outside of 0 up to the current
#   upper fence
def from(index)
  lower = index == :begin ? 0 : index
  raise ArgumentError unless (0..@to).cover?(lower)

  @from = lower
  self
end

#inspect ⇒ String



24
25
26
# File 'lib/aipp/pdf.rb', line 24

# Short description of the reader showing its class, file and fence.
#
# @return [String]
def inspect
  format('#<%s file=%s range=%s>', self.class, @file, range)
end

#lines ⇒ Array

Text split to individual lines



69
70
71
# File 'lib/aipp/pdf.rb', line 69

# Fenced text split to individual lines.
#
# @return [Array<String>] the lines, each keeping its terminator
def lines
  # Zero-width split right after every newline or form feed, so the
  # terminating character stays attached to its line.
  after_terminator = /(?<=[\n\f])/
  text.split(after_terminator)
end

#range ⇒ Range<Integer>

Get the current fencing range



55
56
57
# File 'lib/aipp/pdf.rb', line 55

# Get the current fencing range.
#
# @return [Range<Integer>] inclusive range from the lower to the upper fence
def range
  Range.new(@from, @to)
end

#text ⇒ String

Text string of the PDF with fencing applied



62
63
64
# File 'lib/aipp/pdf.rb', line 62

# Text of the PDF with fencing applied.
#
# @return [String] the part of the full text covered by the current fence
def text
  fence = range
  @text.slice(fence)
end

#to(index) ⇒ self

Fence the PDF ending with this index



45
46
47
48
49
50
# File 'lib/aipp/pdf.rb', line 45

# Fence the PDF ending with this index.
#
# @param index [Integer, Symbol] upper fence; +:end+ stands for the last
#   index of the text
# @return [self] to allow chaining
# @raise [ArgumentError] if the index lies outside of the current lower
#   fence up to the last index
def to(index)
  upper = index == :end ? @last : index
  raise ArgumentError unless (@from..@last).cover?(upper)

  @to = upper
  self
end