Class: Langchain::Processors::PDF

Inherits:
Base
  • Object
show all
Defined in:
lib/langchain/processors/pdf.rb

Constant Summary collapse

EXTENSIONS =
[".pdf"]
CONTENT_TYPES =
["application/pdf"]

Instance Method Summary collapse

Methods included from DependencyHelper

#depends_on

Constructor Details

#initialize(*) ⇒ PDF

Returns a new instance of PDF.



9
10
11
# File 'lib/langchain/processors/pdf.rb', line 9

# Build a PDF processor.
#
# Any positional arguments are accepted and discarded. The only work done here
# is the +depends_on+ call (provided by DependencyHelper) for the "pdf-reader" gem.
def initialize(*)
  depends_on("pdf-reader")
end

Instance Method Details

#parse(data) ⇒ String

Parse the document and return the text

Parameters:

  • data (File)

Returns:

  • (String)


16
17
18
19
20
21
22
# File 'lib/langchain/processors/pdf.rb', line 16

# Extract the text of every page of a PDF and return it as a single string.
#
# @param data [File] source of the PDF bytes; only +#read+ is called on it
# @return [String] the text of each page, in page order, joined by "\n\n"
def parse(data)
  # The full contents are read up front and wrapped in a StringIO, which is
  # what gets handed to the reader.
  buffer = StringIO.new(data.read)
  reader = ::PDF::Reader.new(buffer)
  page_texts = reader.pages.map { |page| page.text }
  page_texts.join("\n\n")
end