Module: PDF

Extended by:: PDF

Includes:: Utils

Included in:: PDF

Defined in:: lib/pdf.rb

Instance Method Summary collapse

#extract_illustrations(filename, options = {}) ⇒ Object

extract_illustrations 提取pdf文件中的插图 parameters: filename pdf文件 options 可选参数 dir 插图存放的目录，默认存放在当前目录下与filename同名的子目录下。
#extract_page_illustrations(illustrations, index) ⇒ Object
#extract_pdf_meta(filename) ⇒ Object

extract_pdf_meta 提取pdf元数据 parameters: filename pdf文件.
#extract_pdf_pages_text(filename) ⇒ Object

extract_pdf_pages_text 提取pdf中页文本内容 parameters: filename pdf文件.
#extract_sections(filename) ⇒ Object

extract_sections 提取pdf文件的大纲 parameters: filename pdf文件.
#fixed_break_of_cross_page(pages, length = 80) ⇒ Object
#fixed_break_with_pages_text(pages_text) ⇒ Object
#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object
#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object
#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object
#guess_footer_row_count(pages_text) ⇒ Object
#guess_header_line?(lines) ⇒ Boolean (also: #guess_footer_line?)

猜测是否是页眉/页脚行猜测规则： 1.
#guess_header_row_count(pages_text) ⇒ Object

猜测页眉/页脚的行数页眉页脚有一定的规律： 1.
#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ Object

sanitize_page_header_and_footer 清洗页眉页脚 parameters: pdf_pages_text pdf文件页文本内容集合 options 可选参数 :header_rows_count 指定页眉行数 :footer_rows_count 指定页脚行数.
#scan_pdf?(filename) ⇒ Boolean

scan_pdf? 检查指定的文件是否为扫描版pdf parameters: filename pdf文件.
#walk_index(indexer, sections) ⇒ Object
#work_index(child, sections) ⇒ Object

Methods included from Utils

#breaklines, #clean_text, #detect_sections_from_html, #detect_utf8, #end_mark?, #escape_html, #extract_keywords_from_path, #extract_text_from_file, #fixed_page_break, #guess_content_line_length, #line_closed?, #make_destination_dir, #merge_para_part, #scan_file_from_dir, #source_exists?, #text_similarity, #text_to_array, #timer, #to_utf8, #walk_dir, #wrapper_html, #write_file

Instance Method Details

#extract_illustrations(filename, options = {}) ⇒ `Object`

extract_illustrations

提取pdf文件中的插图

parameters:

+filename+   pdf文件
+options+    可选参数
   +dir+       插图存放的目录，默认存放在当前目录下与filename同名的子目录下。

# File 'lib/pdf.rb', line 86

# Extracts the images embedded in a PDF into a directory by running the
# external +pdfimages+ command, then asks +mogrify+ to convert the resulting
# ".ppm" files to PNG.
#
# parameters:
#
# +filename+   path of the PDF file
# +options+    optional settings
#    +dir+       directory that receives the images; defaults to a directory
#                named after +filename+ (without its ".pdf" suffix), relative
#                to the current working directory
#
# Returns the names of all "*.png" files found in that directory afterwards,
# relative to the directory itself. Failures of the external commands are not
# reported; the result is then simply whatever PNG files already exist there.
def extract_illustrations(filename,options={})
  tmp_dir = options[:dir] || File.basename(filename,'.pdf')
  # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the supported spelling.
  Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)
  # Arguments are passed as a list so that no shell parses them: a quote or
  # other metacharacter in a path can neither break nor inject into the command.
  system('pdfimages', '-p', filename, "#{tmp_dir}/")
  # The wildcard reaches mogrify unexpanded, exactly as the single-quoted
  # pattern did before (presumably mogrify expands it itself).
  system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")
  # Block form restores the previous working directory even if globbing raises.
  Dir.chdir(tmp_dir) { Dir.glob('*.png') }
end

#extract_page_illustrations(illustrations, index) ⇒ `Object`

# File 'lib/pdf.rb', line 116

# Picks the illustrations that belong to one page.
#
# +illustrations+  image file names; the page number is taken from the text
#                  between the first and the second "-" of each name
#                  (a name without such a part counts as page 0)
# +index+          page number to match
#
# Returns the matching names in their original order.
def extract_page_illustrations(illustrations,index)
  illustrations.select { |name| name.split("-")[1].to_i == index }
end

#extract_pdf_meta(filename) ⇒ `Object`

extract_pdf_meta

提取pdf元数据

parameters:

+filename+   pdf文件

# File 'lib/pdf.rb', line 58

# Reads metadata from a PDF via Poppler.
#
# +filename+   path of the PDF file
#
# Returns a hash with the keys :author and :title as reported by the
# Poppler document.
def extract_pdf_meta(filename)
  document = Poppler::Document.new(filename)
  { :author => document.author, :title => document.title }
end

#extract_pdf_pages_text(filename) ⇒ `Object`

extract_pdf_pages_text

提取pdf中页文本内容

parameters:

+filename+   pdf文件

# File 'lib/pdf.rb', line 25

# Extracts the text of every page of a PDF using PDF::Reader.
#
# +filename+   path of the PDF file
#
# Returns an array with one text entry per page, in page order.
def extract_pdf_pages_text(filename)
  PDF::Reader.new(filename).pages.map { |page| page.text }
end

#extract_sections(filename) ⇒ `Object`

extract_sections

提取pdf文件的大纲

parameters:

+filename+   pdf文件

# File 'lib/pdf.rb', line 70

# Extracts the outline (table of contents) of a PDF via Poppler.
#
# +filename+   path of the PDF file
#
# Returns the array filled in by walk_index. Any StandardError raised while
# opening the document or walking its index is swallowed, and whatever was
# collected up to that point (possibly nothing) is returned.
def extract_sections(filename)
  outline = []
  begin
    document = Poppler::Document.new(filename)
    walk_index(Poppler::IndexIter.new(document), outline)
  rescue
    # Best effort: fall through and return what has been gathered so far.
  end
  outline
end

#fixed_break_of_cross_page(pages, length = 80) ⇒ `Object`

# File 'lib/pdf.rb', line 175

# Re-joins paragraphs that were split across a page boundary.
#
# For each pair of consecutive pages where both have at least one line: when
# Utils.end_mark? rejects the last line of the earlier page, that line is
# replaced by Utils.merge_para_part(last line, first line of the next page)
# and the first line is removed from the next page.
#
# +pages+    array of page texts; modified in place
# +length+   accepted for compatibility; not used by this method
#
# Returns +pages+.
def fixed_break_of_cross_page(pages,length=80)
  (0...(pages.count - 1)).each do |page_no|
    current_lines = pages[page_no].split("\n")
    following_lines = pages[page_no + 1].split("\n")
    next unless current_lines.any? && following_lines.any?

    tail = current_lines.last
    next if Utils.end_mark?(tail)

    # shift both yields the next page's first line and drops it from that page
    current_lines[-1] = Utils.merge_para_part(tail, following_lines.shift)
    pages[page_no] = current_lines.join("\n")
    pages[page_no + 1] = following_lines.join("\n")
  end
  pages
end

#fixed_break_with_pages_text(pages_text) ⇒ `Object`

# File 'lib/pdf.rb', line 102

# Repairs line breaks inside each page and then across page boundaries.
#
# The reference line length is half of the largest value that
# Utils.guess_content_line_length reports for any page. It is passed as
# :length to Utils.fixed_page_break for every page and as the second argument
# to fixed_break_of_cross_page.
#
# +pages_text+   array of page texts
#
# Returns the repaired page texts. When no line length can be guessed (empty
# input, or every guess is nil) there is nothing to base a repair on, so
# +pages_text+ is returned unchanged instead of raising on nil.
def fixed_break_with_pages_text(pages_text)
  longest = pages_text.map{|text| Utils.guess_content_line_length(text)}.compact.max
  return pages_text if longest.nil?

  line_length = longest * 0.5
  repaired = pages_text.map{|page_text| Utils.fixed_page_break(page_text,:length=>line_length) }
  fixed_break_of_cross_page(repaired,line_length)
end

#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ `Object`

# File 'lib/pdf.rb', line 135

# Renders one page of text, followed by its illustrations, as an HTML fragment.
#
# Every non-blank line becomes an <h2> (when HeaderDetect.guess_header?
# accepts it) or a <p class='division'>; the line text goes through
# Utils.clean_text and Utils.escape_html. Each illustration is appended as an
# <img> wrapped in a <p class='division'>.
#
# +page_text+       text of the page, lines separated by "\n"
# +illustrations+   image paths used verbatim as <img> src values
# +options+         optional settings
#    :index           page identifier used for the div's name attribute and as
#                     the prefix of heading ids ("<index>_<line number>")
#
# Returns the HTML string for the page.
def gen_html_from_page_text(page_text,illustrations,options={})
  paragraphs = []
  page_text.split("\n").each_with_index do |line, line_no|
    next unless line.present?

    tag_open, tag_close =
      if HeaderDetect.guess_header?(line)
        ["<h2 id='#{options[:index]}_#{line_no}'>", "</h2>"]
      else
        ["<p class='division'>", "</p>"]
      end
    paragraphs << "#{tag_open}#{Utils.escape_html(Utils.clean_text(line))}#{tag_close}"
  end

  figures = illustrations.map { |image_path| "<p class='division'><img src='#{image_path}' /></p>" }
  "<div class='page' name='#{options[:index]}' >#{paragraphs.join('')}#{figures.join('')}</div>"
end

#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ `Object`

# File 'lib/pdf.rb', line 126

# Renders all pages as HTML and concatenates the fragments.
#
# For the page at position N (counted from 0) the illustrations are chosen
# with extract_page_illustrations(illustrations, N) and the page is rendered
# by gen_html_from_page_text with +options+ plus :index => N.
#
# NOTE(review): N starts at 0; if the page numbers embedded in the
# illustration names start at 1, images would land one page late — confirm.
#
# +page_texts+      array of page texts
# +illustrations+   image file names for the whole document
# +options+         extra options forwarded to gen_html_from_page_text
#
# Returns the combined HTML string ("" for no pages).
def gen_html_from_page_texts(page_texts,illustrations,options={})
  fragments = page_texts.each_with_index.map do |page_text, page_no|
    figures = extract_page_illustrations(illustrations, page_no)
    gen_html_from_page_text(page_text, figures, options.merge(:index => page_no))
  end
  fragments.join("")
end

#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ `Object`

# File 'lib/pdf.rb', line 108

# Renders the document as HTML.
#
# +sections+        outline of the document; currently ignored because the
#                   page numbers it carries are not accurate
# +page_texts+      array of page texts
# +illustrations+   image file names for the whole document
#
# Returns the result of gen_html_from_page_texts(page_texts, illustrations).
def gen_html_from_sections_and_page_texts(sections,page_texts,illustrations)
  # Both the "no outline" and the "has outline" case produced the same call,
  # so the branch (which only served to fail on a nil +sections+) is dropped.
  gen_html_from_page_texts(page_texts,illustrations)
end

#guess_footer_row_count(pages_text) ⇒ `Object`

# File 'lib/pdf.rb', line 213

# Guesses how many trailing lines of each page form the page footer.
#
# Starting from the last line of every page and moving upwards, the lines at
# the same position are handed to guess_footer_line? until it answers false.
#
# +pages_text+   array of page texts
# +max_rows+     largest footer height that is still believed (default 2)
#
# Returns the number of footer lines, or 0 when more than +max_rows+
# consecutive positions look like footer lines (the guess is then treated as
# unreliable).
def guess_footer_row_count(pages_text, max_rows = 2)
  count = 0
  # Probing stops once the limit is exceeded: the answer is 0 from then on,
  # and an unbounded loop would never end if the predicate kept saying true.
  while count <= max_rows
    candidates = pages_text.map{|page_text| page_text.split("\n")[-count - 1]}
    break unless guess_footer_line?(candidates)
    count += 1
  end
  count > max_rows ? 0 : count
end

#guess_header_line?(lines) ⇒ `Boolean` Also known as: guess_footer_line?

猜测是否是页眉/页脚行。猜测规则：

1. 相邻页的行匹配相似度，一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值（默认50%）时，猜测为页眉页脚行
2. 隔页的行匹配相似度，一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值（默认50%）时，猜测为页眉页脚行
3. 页码猜测，页的行是数值则加入相似集合。如果相似集合占总集合数的比例高于一定值（默认50%）时，猜测为页眉页脚行

Returns:

(Boolean)

# File 'lib/pdf.rb', line 231

# Guesses whether the given lines (one per page, all taken from the same
# position) are a page header or footer.
#
# Three rules are tried in order; each collects a set of line positions and
# succeeds when that set covers more than 50% of the lines:
#
# 1. a line and the line of the next page have a Utils.text_similarity
#    above 0.7 (both positions are collected);
# 2. the same comparison against the line two pages further on;
# 3. the line converts to an integer greater than 0 (page-number guess).
#
# Blank lines are treated as nil; all other lines are stripped before the
# comparison.
#
# +lines+   array of lines (entries may be nil)
#
# Returns true when any rule succeeds, false otherwise (also for empty input).
def guess_header_line?(lines)
  return false if lines.empty?

  lines = lines.map { |line| line.strip if line.present? }
  total = lines.count.to_f
  majority = lambda { |members| members.count.to_f / total > 0.5 }

  # stride 1 compares neighbouring pages, stride 2 every other page
  [1, 2].each do |stride|
    members = []
    lines.each_with_index do |line, index|
      if Utils.text_similarity(line, lines[index + stride]) > 0.7
        members.push(index, index + stride)
      end
    end
    return true if majority.call(members.uniq)
  end

  page_numbers = lines.each_index.select { |index| lines[index].to_i > 0 }
  majority.call(page_numbers)
end

#guess_header_row_count(pages_text) ⇒ `Object`

猜测页眉/页脚的行数。页眉页脚有一定的规律：

1. 页眉和页脚一般都在每页的固定位置出现或者对称出现(相邻两页左右位置对称)
2. 呈现的内容一般是书名、章节名、页码等。
3. 呈现的顺序一般有两种形式：逐页式，即每一页的页眉页脚大致相似；隔页式

# File 'lib/pdf.rb', line 201

# Guesses how many leading lines of each page form the page header.
#
# Starting from the first line of every page and moving downwards, the lines
# at the same position are handed to guess_header_line? until it answers
# false.
#
# +pages_text+   array of page texts
# +max_rows+     largest header height that is still believed (default 2)
#
# Returns the number of header lines, or 0 when more than +max_rows+
# consecutive positions look like header lines (the guess is then treated as
# unreliable).
def guess_header_row_count(pages_text, max_rows = 2)
  count = 0
  # Probing stops once the limit is exceeded: the answer is 0 from then on,
  # and an unbounded loop would never end if the predicate kept saying true.
  while count <= max_rows
    candidates = pages_text.map{|page_text| page_text.split("\n")[count]}
    break unless guess_header_line?(candidates)
    count += 1
  end
  count > max_rows ? 0 : count
end

#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ `Object`

sanitize_page_header_and_footer

清洗页眉页脚

parameters:

+pdf_pages_text+  pdf文件页文本内容集合
+options+    可选参数
 :header_rows_count 指定页眉行数
 :footer_rows_count 指定页脚行数

# File 'lib/pdf.rb', line 42

# Strips page headers and footers from every page.
#
# parameters:
#
# +pdf_pages_text+  array of page texts, lines separated by "\n"
# +options+         optional settings
#  :header_rows_count  number of header lines to drop; guessed with
#                      guess_header_row_count when absent
#  :footer_rows_count  number of footer lines to drop; guessed with
#                      guess_footer_row_count when absent
#
# Returns a new array of page texts. A page with fewer lines than the header
# height becomes "".
def sanitize_page_header_and_footer(pdf_pages_text,options={})
  top = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
  bottom = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)

  pdf_pages_text.map do |page_text|
    # The slice is nil when the page is shorter than the header height.
    kept = page_text.split("\n")[top..(-bottom - 1)]
    (kept || []).join("\n")
  end
end

#scan_pdf?(filename) ⇒ `Boolean`

scan_pdf?

检查指定的文件是否为扫描版pdf

parameters:

+filename+   pdf文件

Returns:

(Boolean)

# File 'lib/pdf.rb', line 12

# Checks whether a file looks like a scanned (image-only) PDF.
#
# The text of all pages is read through Poppler and joined; the file counts
# as scanned when the stripped text is shorter than 1000 characters.
#
# +filename+   path of the file; only names ending in ".pdf" (any letter
#              case) are examined
#
# Returns true or false for PDF files and nil for any other file name.
def scan_pdf?(filename)
  return unless File.extname(filename).downcase == '.pdf'

  min_text_length = 1000
  document = Poppler::Document.new(filename)
  extracted = document.map { |page| page.get_text }.join('')
  extracted.strip.length < min_text_length
end

#walk_index(indexer, sections) ⇒ `Object`

# File 'lib/pdf.rb', line 151

# Collects the outline entries reachable from +indexer+ into +sections+.
#
# Each entry is stored at its position as a hash with :title and :page_num,
# plus :sub_sections when the entry has children.
#
# +indexer+    enumerable of outline entries (each responding to +action+
#              and +child+)
# +sections+   array that receives the entries
#
# The traversal of the top level is the same as for any nested level, so it
# is delegated to work_index instead of repeating that code here.
def walk_index(indexer,sections)
  work_index(indexer,sections)
end

#work_index(child, sections) ⇒ `Object`

# File 'lib/pdf.rb', line 163

# Recursively collects outline entries into +sections+.
#
# The entry at position N of +child+ is stored as sections[N], a hash with
# :title (the action title passed through Utils.clean_text) and :page_num
# (the page number of the action's destination). When the entry has children
# they are collected the same way into a fresh array stored under
# :sub_sections.
#
# +child+      enumerable of outline entries (each responding to +action+
#              and +child+)
# +sections+   array that receives the entries
def work_index(child,sections)
  child.each_with_index do |node, position|
    entry = {:title=> Utils.clean_text(node.action.title),:page_num=>node.action.dest.page_num}
    sections[position] = entry

    nested = node.child
    next if nested.nil?

    entry[:sub_sections] = [].tap { |sub_sections| work_index(nested, sub_sections) }
  end
end

Module: PDF

Instance Method Summary collapse

Methods included from Utils

Instance Method Details

#extract_illustrations(filename, options = {}) ⇒ Object

#extract_page_illustrations(illustrations, index) ⇒ Object

#extract_pdf_meta(filename) ⇒ Object

#extract_pdf_pages_text(filename) ⇒ Object

#extract_sections(filename) ⇒ Object

#fixed_break_of_cross_page(pages, length = 80) ⇒ Object

#fixed_break_with_pages_text(pages_text) ⇒ Object

#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object

#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object

#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object

#guess_footer_row_count(pages_text) ⇒ Object

#guess_header_line?(lines) ⇒ Boolean Also known as: guess_footer_line?

#guess_header_row_count(pages_text) ⇒ Object

#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ Object

#scan_pdf?(filename) ⇒ Boolean

#walk_index(indexer, sections) ⇒ Object

#work_index(child, sections) ⇒ Object

#extract_illustrations(filename, options = {}) ⇒ `Object`

#extract_page_illustrations(illustrations, index) ⇒ `Object`

#extract_pdf_meta(filename) ⇒ `Object`

#extract_pdf_pages_text(filename) ⇒ `Object`

#extract_sections(filename) ⇒ `Object`

#fixed_break_of_cross_page(pages, length = 80) ⇒ `Object`

#fixed_break_with_pages_text(pages_text) ⇒ `Object`

#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ `Object`

#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ `Object`

#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ `Object`

#guess_footer_row_count(pages_text) ⇒ `Object`

#guess_header_line?(lines) ⇒ `Boolean` Also known as: guess_footer_line?

#guess_header_row_count(pages_text) ⇒ `Object`

#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ `Object`

#scan_pdf?(filename) ⇒ `Boolean`

#walk_index(indexer, sections) ⇒ `Object`

#work_index(child, sections) ⇒ `Object`