Module: EbookTools

Extended by:: EbookTools

Included in:: EbookTools

Defined in:: lib/ebook_tools.rb

Instance Method Summary collapse

#allow_extract_struct?(file) ⇒ Boolean
#batch_convert(source, destination, options = {}) ⇒ Object
#batch_extract_from_dir(source, destination, options = {}) ⇒ Object

batch_extract_from_dir: batch-extracts book structure from a directory. Parameters: source — source directory; destination — output directory; options — optional parameters.
#convert(filename, epub_file, options = {}) ⇒ Object
#extract_book_struct_to_file(source, destination, options = {}) ⇒ Object
#html2epub(filename, epub_file, options = {}) ⇒ Object

html2epub 将HTML格式转换成EPUB格式.
#pdf2epub(filename, epub_file, options = {}) ⇒ Object

pdf2epub 将PDF格式转换成EPUB格式.
#sanitize_for_epub_text(content) ⇒ Object

sanitize_for_epub_text.
#text_paras_repair(source_file, target_file, options = {}) ⇒ Object

text_paras_repair 对文本文件格式中的中断段落进行修复.
#txt2epub(filename, epub_file, options = {}) ⇒ Object

txt2epub 将文本格式转换成EPUB格式.
#write_doc_book(destination, docbook_xml) ⇒ Object

Instance Method Details

#allow_extract_struct?(file) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/ebook_tools.rb', line 198

# Tells whether book-structure extraction is supported for +file+.
#
# Only the file extension is inspected (case-insensitively); the file itself
# is never opened.
#
# @param file [String] path or file name
# @return [Boolean] true when the extension is .txt, .html or .epub
def allow_extract_struct?(file)
  suffix = File.extname(file).downcase
  %w[.txt .html .epub].any? { |allowed| allowed == suffix }
end

#batch_convert(source, destination, options = {}) ⇒ `Object`

# File 'lib/ebook_tools.rb', line 102

# Converts every file found under +source+ to EPUB and sorts the inputs into
# sub-directories of +destination+ according to the outcome. The path of each
# file relative to +source+ is preserved below every sub-directory:
#
#   epub/    - generated (or, for .epub inputs, copied) EPUB files
#   backup/  - source files that were handled successfully
#   scan/    - files for which PDF.scan_pdf? returned true
#   unknown/ - source files whose conversion raised an error
#
# Progress is appended to batch.log, success.log, error.log, scan.log and
# unknown.log in the current working directory.
#
# Files for which EbookTools has no "<ext>2epub" method are left in place;
# they are counted and listed in unknown.log.
#
# @param source [String] directory to scan
# @param destination [String] directory that receives the sub-directories
# @param options [Hash] :format is forwarded to Utils.scan_file_from_dir
# @return [nil]
def batch_convert(source,destination,options={})
  # Handles are registered one by one so that +ensure+ closes exactly the
  # ones that were opened, even when a later File.open fails.
  logs = {}
  [:batch, :success, :error, :scan, :unknown].each do |name|
    logs[name] = File.open("#{name}.log",'a')
  end
  log, success_log, error_log, scan_log, unknown_log = logs.values

  source_path = File.absolute_path(source)
  dest_root = File.absolute_path(destination)
  dest_path = File.join(dest_root,'epub')
  scan_path = File.join(dest_root,'scan')
  unknown_path = File.join(dest_root,'unknown')
  backup_path = File.join(dest_root,'backup')

  files = Utils.scan_file_from_dir(source_path,:format=>options[:format])

  total_count = files.count
  scan_count = 0
  success_count = 0
  error_count = 0
  unknown_count = 0

  puts "count: #{total_count} file "
  logs.each_value do |io|
    io.puts "****batch convert****** : #{Time.now}"
    io.puts "#{source_path}  =>  #{dest_path} "
  end
  log.puts "count: #{total_count} file "

  files.each do |file|
    # Strip only the leading source prefix; a global substitution would also
    # eat later path segments that happen to repeat it.
    relative_file = file.sub(source_path,'')
    dest_file = File.join(File.dirname(File.join(dest_path,relative_file)),"#{File.basename(file,File.extname(file))}.epub")

    # Moves the current source file below +root+, keeping its relative path.
    move_source_to = lambda do |root|
      target = File.join(root,relative_file)
      FileUtils.mkdir_p(File.dirname(target))
      FileUtils.mv(file,target,:force=>true)
    end

    keywords = Utils.extract_keywords_from_path(File.dirname(file).sub(source_path,''))
    puts "start convert #{file}"
    extname = File.extname(file).gsub('.','')
    method_name = "#{extname}2epub"

    if extname == 'epub'
      # Already an EPUB: copy it to the output tree. The directory of the
      # destination file is created (not just dest_path) so nested inputs work.
      FileUtils.mkdir_p(File.dirname(dest_file))
      FileUtils.cp(file,dest_file)
      move_source_to.call(backup_path)
      success_count += 1
      success_log.puts "success: #{file} conversion successfully!"
    elsif EbookTools.respond_to?(method_name)
      begin
        # NOTE(review): scan_pdf? is consulted for every convertible format,
        # not only for .pdf files - confirm that this is intended.
        if PDF.scan_pdf?(file)
          move_source_to.call(scan_path)
          scan_count += 1
          scan_log.puts "warning: #{file} is scan pdf."
        else
          EbookTools.send(method_name,file,dest_file,{:keywords=>keywords})
          move_source_to.call(backup_path)
          success_count += 1
          success_log.puts "success: #{file} conversion successfully!"
        end
      rescue StandardError => e
        # StandardError (not Exception) so Ctrl-C and exit still stop the batch.
        move_source_to.call(unknown_path)
        error_count += 1
        error_log.puts "error: #{file} (#{e.class}: #{e.message})\n#{Array(e.backtrace).join("\n")}"
      end
    else
      # No converter for this extension: leave the file where it is, but make
      # the skip visible instead of dropping it silently.
      unknown_count += 1
      unknown_log.puts "skipped: #{file} (no #{method_name} converter)"
    end
  end

  success_log.puts "count: #{success_count}   Time: #{Time.now} \n"
  scan_log.puts "count: #{scan_count}    Time: #{Time.now} \n"
  error_log.puts "count: #{error_count}    Time: #{Time.now} \n"
  unknown_log.puts "unknown: #{unknown_count}    Time: #{Time.now} \n"
  log.puts "success: #{success_count}   scan: #{scan_count}   error: #{error_count}    Time: #{Time.now} \n"
ensure
  logs.each_value { |io| io.close } if logs
end

#batch_extract_from_dir(source, destination, options = {}) ⇒ `Object`

batch_extract_from_dir

Batch-extracts book structure from a directory.

parameters:

+source+     source directory
+destination+   output directory
+options+        optional parameter.
   :format     指定需要提取结构的文件后缀名，例如要从所有txt文件中提取，通过:format=>'.txt'指定

# File 'lib/ebook_tools.rb', line 242

# Extracts the book structure of every file under +source+ and writes it as
# "<basename>.xml" below +destination+, keeping the relative directory layout.
#
# Files whose extension is rejected by allow_extract_struct? are reported and
# skipped. A file for which extract_book_struct_to_file returns a falsy value
# is renamed in place with an "[err]" prefix. Errors raised while processing
# one file are printed and do not stop the batch.
#
# @param source [String] directory to scan
# @param destination [String] output directory
# @param options [Hash] :format is forwarded to Utils.scan_file_from_dir;
#   the hash is only read, never modified
# @return [Object] whatever Utils.scan_file_from_dir returned
def batch_extract_from_dir(source,destination,options={})
  # Read (do not delete) the key so the caller's hash stays intact.
  format = options[:format]
  source_path = File.absolute_path(source)
  dest_path = File.absolute_path(destination)

  Utils.scan_file_from_dir(source_path,{:format=>format}).each do |file|
    unless allow_extract_struct?(file)
      puts "error: #{file}不是允许的文件格式: txt,html,epub"
      next
    end

    extname = File.extname(file)
    basename = File.basename(file,extname)
    # Strip only the leading source prefix, not every occurrence of it.
    relative_file = file.sub(source_path,'')
    dest_file = File.join(File.dirname(File.join(dest_path,relative_file)),"#{basename}.xml")

    puts "start extract #{file} ..."
    begin
      if extract_book_struct_to_file(file,dest_file)
        puts "success: extract book struct  successfully!"
      else
        # Mark the source so that it is easy to spot files without structure.
        new_file = File.join(File.dirname(file),"[err]#{basename}#{extname}")
        FileUtils.mv(file,new_file,:force=>true)
        puts "警告: 没有检测到书结构信息."
      end
    rescue StandardError => e
      # StandardError (not Exception) so Ctrl-C and exit still stop the batch.
      puts "error: #{file} (#{e.class}: #{e.message})\n#{Array(e.backtrace).join("\n")}"
    end
  end
end

#convert(filename, epub_file, options = {}) ⇒ `Object`

# File 'lib/ebook_tools.rb', line 10

# Dispatches +filename+ to the EbookTools converter named after its extension
# ("<ext>2epub", e.g. txt2epub for "book.txt").
#
# @param filename [String] file to convert
# @param epub_file [String] path of the EPUB to produce
# @param options [Hash] passed through to the converter unchanged
# @return [true, nil] true after the converter ran, nil when EbookTools does
#   not respond to a converter for this extension
def convert(filename,epub_file,options={})
  extension = File.extname(filename).delete('.')
  converter = "#{extension}2epub"
  return nil unless EbookTools.respond_to?(converter)

  EbookTools.send(converter,filename,epub_file,options)
  true
end

#extract_book_struct_to_file(source, destination, options = {}) ⇒ `Object`

# File 'lib/ebook_tools.rb', line 203

# Builds a DocBook document from +source+ and writes it to +destination+.
#
# .epub files are read through EpubBook; .html files are reduced to text via
# Utils.extract_text_from_file and, like .txt files, handed to TxtBook. The
# extension is matched case-insensitively, in line with allow_extract_struct?.
# The table of contents is printed on success.
#
# @param source [String] path of the .epub, .html or .txt file
# @param destination [String] path of the XML file to write
# @param options [Hash] forwarded to the book class; :title defaults to the
#   base name of +source+. The caller's hash is not modified.
# @return [true, nil] true when a DocBook document was written, nil when the
#   extension is unsupported or no document could be produced
def extract_book_struct_to_file(source,destination,options={})
  extname = File.extname(source)
  # Work on a copy: a hash reused across calls must not keep the first title.
  options = options.dup
  options[:title] ||= File.basename(source,extname)

  is_epub = false
  book = case extname.downcase
    when '.epub'
      is_epub = true
      EpubBook.new(source,options)
    when '.html'
      TxtBook.new(Utils.extract_text_from_file(source,'.html'),options)
    when '.txt'
      # File.read closes the handle; File.open(...).read left it to the GC.
      TxtBook.new(File.read(source),options)
    else
      # Unsupported format: there is no content to build a book from.
      return nil
  end

  docbook_xml = book.to_doc_book
  return nil unless docbook_xml

  write_doc_book(destination,docbook_xml)
  puts "目录结构:"
  puts book.toc_to_text
  puts "共修复#{book.breaklines_count}个断点." unless is_epub
  true
end

#html2epub(filename, epub_file, options = {}) ⇒ `Object`

html2epub 将HTML格式转换成EPUB格式

# File 'lib/ebook_tools.rb', line 49

# Converts an HTML file to EPUB.
#
# The HTML is copied into a working directory named after the file's base
# name (relative to the current working directory), sections are detected,
# a navigation file is generated and both are packed by EPUB.write_epub.
#
# NOTE(review): the working directory is removed unconditionally afterwards,
# including when it already existed before the call - confirm this is safe
# for the directories this is run from.
#
# @param filename [String] path of the .html file
# @param epub_file [String] path of the EPUB to produce
# @param options [Hash] merged with :files and passed to EPUB.write_epub
# @return [Object] result of EPUB.write_epub
def html2epub(filename,epub_file,options={})
  basename = File.basename(filename,'.html')
  temp_dir = "#{basename}"
  # File.exist? - the File.exists? alias no longer exists in Ruby 3.2+.
  FileUtils.mkdir(temp_dir) unless File.exist?(temp_dir)

  html = File.read(filename)

  html_file = File.join(temp_dir,"#{basename}.html")
  Utils.write_file(html,html_file)
  sections = Utils.detect_sections_from_html(html_file)

  nav_file = EPUB.gen_nav_file(html_file,sections)

  EPUB.write_epub(epub_file,options.merge(:files=>[nav_file,html_file]))
ensure
  # temp_dir is nil when the failure happened before it was assigned.
  FileUtils.remove_dir(temp_dir,true) if temp_dir
end

#pdf2epub(filename, epub_file, options = {}) ⇒ `Object`

pdf2epub 将PDF格式转换成EPUB格式

# File 'lib/ebook_tools.rb', line 69

# Converts a PDF file to EPUB.
#
# Page texts are extracted and cleaned through the PDF helpers, combined with
# the extracted sections and illustrations into one HTML file inside a working
# directory named after the file's base name (relative to the current working
# directory), and packed together with a navigation file and the PDF metadata
# by EPUB.write_epub.
#
# NOTE(review): the working directory is removed unconditionally afterwards,
# including when it already existed before the call - confirm this is safe
# for the directories this is run from.
#
# @param filename [String] path of the .pdf file
# @param epub_file [String] path of the EPUB to produce
# @param options [Hash] passed to PDF.sanitize_page_header_and_footer and,
#   merged with the PDF metadata and :files, to EPUB.write_epub
# @return [Object] result of EPUB.write_epub
def pdf2epub(filename,epub_file,options={})
  basename = File.basename(filename,'.pdf')
  temp_dir = "#{basename}"
  # File.exist? - the File.exists? alias no longer exists in Ruby 3.2+.
  FileUtils.mkdir(temp_dir) unless File.exist?(temp_dir)

  pages_text = PDF.extract_pdf_pages_text(filename)
  pages_text = PDF.sanitize_page_header_and_footer(pages_text,options)
  pages_text = PDF.fixed_break_with_pages_text(pages_text)

  sections = PDF.extract_sections(filename)

  illustrations = PDF.extract_illustrations(filename,{:dir=>temp_dir})

  html_content = PDF.gen_html_from_sections_and_page_texts(sections,pages_text,illustrations)
  html = Utils.wrapper_html(html_content)
  html_file = File.join(temp_dir,"#{basename}.html")
  Utils.write_file(html,html_file)

  # Illustration names are joined onto the working directory before packing.
  illustrations_path = illustrations.map{|image_path| File.join(temp_dir,image_path)}

  nav_file = EPUB.gen_nav_file(html_file,sections)

  files = [html_file,nav_file,illustrations_path].flatten

  meta = PDF.extract_pdf_meta(filename)
  epub_options = options.merge(meta).merge(:files=>files)

  EPUB.write_epub(epub_file,epub_options)
ensure
  # temp_dir is nil when the failure happened before it was assigned.
  FileUtils.remove_dir(temp_dir,true) if temp_dir
end

#sanitize_for_epub_text(content) ⇒ `Object`

sanitize_for_epub_text

# File 'lib/ebook_tools.rb', line 287

# Truncates +content+ at the first line that mentions "document outline"
# (compared case-insensitively); that line and everything after it is dropped.
#
# @param content [String, nil] text to clean
# @return [String, nil] the leading lines, or +content+ itself when it is blank
def sanitize_for_epub_text(content)
  return content if content.blank?

  kept = content.each_line.take_while do |text_line|
    !text_line.downcase.include?('document outline')
  end
  kept.join("")
end

#text_paras_repair(source_file, target_file, options = {}) ⇒ `Object`

text_paras_repair 对文本文件格式中的中断段落进行修复

# File 'lib/ebook_tools.rb', line 279

# Repairs broken paragraphs in a text file and writes the result to
# +target_file+.
#
# The content is passed through Utils.to_utf8 when Utils.detect_utf8 reports
# that it is not UTF-8, and then through Utils.fixed_page_break.
#
# @param source_file [String] text file to read
# @param target_file [String] file to write (overwritten if it exists)
# @param options [Hash] forwarded to Utils.fixed_page_break
# @return [Integer] value returned by File#write for the written content
def text_paras_repair(source_file,target_file,options={})
  # File.read closes the handle; File.open(...).read left it to the GC.
  content = File.read(source_file)
  # The content has to be handed to the converter; calling it without an
  # argument discarded the text that was just read.
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  content = Utils.fixed_page_break(content,options)
  File.open(target_file,'w'){|file| file.write content}
end

#txt2epub(filename, epub_file, options = {}) ⇒ `Object`

txt2epub 将文本格式转换成EPUB格式

# File 'lib/ebook_tools.rb', line 22

# Converts a plain-text file to EPUB.
#
# Title, outlines and content come from TXT.extract_book_part; the generated
# HTML is written into a working directory named after the file's base name
# (relative to the current working directory), sections are detected, a
# navigation file is generated and both are packed by EPUB.write_epub.
#
# NOTE(review): the working directory is removed unconditionally afterwards,
# including when it already existed before the call - confirm this is safe
# for the directories this is run from.
#
# @param filename [String] path of the .txt file
# @param epub_file [String] path of the EPUB to produce
# @param options [Hash] a truthy :fix runs the content through
#   Utils.fixed_page_break first; the hash, merged with :files, is passed to
#   EPUB.write_epub
# @return [Object] result of EPUB.write_epub
def txt2epub(filename,epub_file,options={})
  basename = File.basename(filename,'.txt')
  temp_dir = "#{basename}"
  # File.exist? - the File.exists? alias no longer exists in Ruby 3.2+.
  FileUtils.mkdir(temp_dir) unless File.exist?(temp_dir)

  title,outlines,content = TXT.extract_book_part(filename)
  content = Utils.fixed_page_break(content) if options[:fix]

  html_content = TXT.gen_html_from_txt_book(title,outlines,content)
  html = Utils.wrapper_html(html_content)

  html_file = File.join(temp_dir,"#{basename}.html")
  Utils.write_file(html,html_file)
  sections = Utils.detect_sections_from_html(html_file)

  nav_file = EPUB.gen_nav_file(html_file,sections)

  EPUB.write_epub(epub_file,options.merge(:files=>[nav_file,html_file]))
ensure
  # temp_dir is nil when the failure happened before it was assigned.
  FileUtils.remove_dir(temp_dir,true) if temp_dir
end

#write_doc_book(destination, docbook_xml) ⇒ `Object`

# File 'lib/ebook_tools.rb', line 272

# Writes +docbook_xml+ to +destination+ in binary mode, creating the parent
# directories first. An existing file is overwritten.
#
# @param destination [String] path of the file to write
# @param docbook_xml [String] document content
# @return [Integer] value returned by File#write
def write_doc_book(destination, docbook_xml)
  # mkdir_p is a no-op for existing directories, so no existence check is
  # needed (Dir.exists? no longer exists in Ruby 3.2+).
  FileUtils.mkdir_p(File.dirname(destination))
  File.open(destination,'wb'){|file| file.write docbook_xml}
end

Module: EbookTools

Instance Method Summary collapse

Instance Method Details

#allow_extract_struct?(file) ⇒ Boolean

#batch_convert(source, destination, options = {}) ⇒ Object

#batch_extract_from_dir(source, destination, options = {}) ⇒ Object

#convert(filename, epub_file, options = {}) ⇒ Object

#extract_book_struct_to_file(source, destination, options = {}) ⇒ Object

#html2epub(filename, epub_file, options = {}) ⇒ Object

#pdf2epub(filename, epub_file, options = {}) ⇒ Object

#sanitize_for_epub_text(content) ⇒ Object

#text_paras_repair(source_file, target_file, options = {}) ⇒ Object

#txt2epub(filename, epub_file, options = {}) ⇒ Object

#write_doc_book(destination, docbook_xml) ⇒ Object

#allow_extract_struct?(file) ⇒ `Boolean`

#batch_convert(source, destination, options = {}) ⇒ `Object`

#batch_extract_from_dir(source, destination, options = {}) ⇒ `Object`

#convert(filename, epub_file, options = {}) ⇒ `Object`

#extract_book_struct_to_file(source, destination, options = {}) ⇒ `Object`

#html2epub(filename, epub_file, options = {}) ⇒ `Object`

#pdf2epub(filename, epub_file, options = {}) ⇒ `Object`

#sanitize_for_epub_text(content) ⇒ `Object`

#text_paras_repair(source_file, target_file, options = {}) ⇒ `Object`

#txt2epub(filename, epub_file, options = {}) ⇒ `Object`

#write_doc_book(destination, docbook_xml) ⇒ `Object`