Class: ChupaText::Decomposers::PDF

Inherits:
Decomposer
  • Object
show all
Includes:
Loggable
Defined in:
lib/chupa-text/decomposers/pdf.rb

Instance Method Summary collapse

Instance Method Details

#decompose(data) {|text_data| ... } ⇒ Object

Yields:

  • (text_data)


40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/chupa-text/decomposers/pdf.rb', line 40

# Collects the text of every page of the document built from +data+ into a
# single TextData, copies document metadata onto it and yields it.
#
# Pages whose text is empty are skipped. Every non-empty page contributes
# its text followed by a newline (one is appended only when the page text
# does not already end with one).
#
# @param data the source data; it is handed to +create_document+, recorded
#   as the +:source_data+ of the resulting TextData and asked whether a
#   screenshot is needed via +need_screenshot?+.
# @yield [text_data] the TextData holding the concatenated page text.
# @return [Object] whatever the block returns, or +nil+ (without yielding)
#   when +create_document+ returns +nil+.
def decompose(data)
  document = create_document(data)
  return if document.nil?

  text = ""
  document.each do |page|
    content = page.get_text
    unless content.empty?
      text << content
      # Make sure each page ends its own line before the next one starts.
      text << "\n" unless content.end_with?("\n")
    end
  end

  text_data = TextData.new(text, :source_data => data)
  [:title, :author, :subject, :keywords, :creator, :producer].each do |name|
    add_attribute(text_data, document, name)
  end
  # NOTE(review): the extra :created_time argument is presumably the name the
  # creation date is stored under -- confirm against add_attribute.
  add_attribute(text_data, document, :creation_date, :created_time)
  text_data.screenshot = create_screenshot(data, document) if data.need_screenshot?
  yield(text_data)
end

#target?(data) ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
32
33
34
35
36
37
38
# File 'lib/chupa-text/decomposers/pdf.rb', line 29

# Tells whether +data+ should be handled as a PDF.
#
# Data whose MIME type is "application/pdf" is always accepted. Otherwise,
# data with no extension or with the "pdf" extension is accepted when its
# first bytes are a PDF header. Both the PDF 1.x ("%PDF-1") and the
# PDF 2.0 ("%PDF-2") headers are recognized.
#
# @param data an object responding to +mime_type+, +extension+ and
#   +peek_body+.
# @return [Boolean] +true+ when +data+ looks like a PDF, +false+ otherwise.
def target?(data)
  return true if data.mime_type == "application/pdf"

  case data.extension
  when nil, "pdf"
    # peek_body may return nil; treat that as "no header available".
    header = data.peek_body(6) || ""
    header.start_with?("%PDF-1", "%PDF-2")
  else
    false
  end
end