Module: EbookTools
Instance Method Summary collapse
- #allow_extract_struct?(file) ⇒ Boolean
- #batch_convert(source, destination, options = {}) ⇒ Object
-
#batch_extract_from_dir(source, destination, options = {}) ⇒ Object
batch_extract_from_dir: batch-extract book structure from a directory. Parameters:
source: source directory;
destination: output directory;
options: optional parameters.
- #convert(filename, epub_file, options = {}) ⇒ Object
- #extract_book_struct_to_file(source, destination, options = {}) ⇒ Object
-
#html2epub(filename, epub_file, options = {}) ⇒ Object
html2epub 将HTML格式转换成EPUB格式.
-
#pdf2epub(filename, epub_file, options = {}) ⇒ Object
pdf2epub 将PDF格式转换成EPUB格式.
-
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text.
-
#text_paras_repair(source_file, target_file, options = {}) ⇒ Object
text_paras_repair 对文本文件格式中的中断段落进行修复.
-
#txt2epub(filename, epub_file, options = {}) ⇒ Object
txt2epub 将文本格式转换成EPUB格式.
- #write_doc_book(destination, docbook_xml) ⇒ Object
Instance Method Details
#allow_extract_struct?(file) ⇒ Boolean
198 199 200 201 |
# File 'lib/ebook_tools.rb', line 198
# Tells whether +file+ carries an extension that book-structure extraction
# accepts: .txt, .html or .epub. The extension is compared without regard to
# letter case.
#
# +file+ path or file name to inspect.
#
# Returns true or false.
def allow_extract_struct?(file)
  suffix = File.extname(file).downcase
  %w[.txt .html .epub].include?(suffix)
end
#batch_convert(source, destination, options = {}) ⇒ Object
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# File 'lib/ebook_tools.rb', line 102
# Converts every file found under +source+ to EPUB and sorts the inputs into
# sub-directories of +destination+ according to the outcome:
#
#   epub/     converted books, or copies of inputs that already are EPUB
#   backup/   originals that were converted or copied successfully
#   scan/     inputs for which PDF.scan_pdf? returned true
#   unknown/  inputs whose conversion raised an error
#
# Progress is appended to batch.log, success.log, error.log, scan.log and
# unknown.log in the current working directory.
#
# +source+      root directory to scan.
# +destination+ root directory that receives the sub-directories above.
# +options+     Hash; only :format is read, and it is handed to
#               Utils.scan_file_from_dir.
#
# Files for which EbookTools has no "<ext>2epub" method are left where they are.
def batch_convert(source, destination, options = {})
  log = File.open('batch.log', 'a')
  success_log = File.open('success.log', 'a')
  error_log = File.open('error.log', 'a')
  scan_log = File.open('scan.log', 'a')
  unknown_log = File.open('unknown.log', 'a')

  source_path = File.absolute_path(source)
  output_root = File.absolute_path(destination)
  dest_path = File.join(output_root, 'epub')
  scan_path = File.join(output_root, 'scan')
  unknown_path = File.join(output_root, 'unknown')
  backup_path = File.join(output_root, 'backup')

  # Moves +file+ to the same relative location below +root+.
  # `sub` (not `gsub`) is used so only the leading source_path is stripped;
  # mkdir_p is idempotent, so no existence check is needed.
  relocate = lambda do |file, root|
    target = File.join(root, file.sub(source_path, ''))
    FileUtils.mkdir_p(File.dirname(target))
    FileUtils.mv(file, target, :force => true)
  end

  files = Utils.scan_file_from_dir(source_path, :format => options[:format])
  total_count = files.count
  scan_count = 0
  success_count = 0
  error_count = 0
  # NOTE(review): nothing increments unknown_count, so unknown.log always
  # reports 0 — confirm whether unconvertible files were meant to be counted.
  unknown_count = 0

  puts "count: #{total_count} file "
  log.puts "****batch convert****** : #{Time.now}"
  log.puts "#{source_path} => #{dest_path} "
  log.puts "count: #{total_count} file "
  [success_log, error_log, scan_log, unknown_log].each do |io|
    io.puts "****batch convert****** : #{Time.now}"
    io.puts "#{source_path} => #{dest_path} "
  end

  files.each do |file|
    relative = file.sub(source_path, '')
    dest_file = File.join(File.dirname(File.join(dest_path, relative)),
                          "#{File.basename(file, File.extname(file))}.epub")
    keywords = Utils.extract_keywords_from_path(File.dirname(file).sub(source_path, ''))
    puts "start convert #{file}"
    extname = File.extname(file).gsub('.', '')
    method_name = "#{extname}2epub"

    if extname == 'epub'
      # Create the directory the copy actually lands in; dest_file may sit in
      # a sub-directory of dest_path.
      FileUtils.mkdir_p(File.dirname(dest_file))
      FileUtils.cp(file, dest_file)
      relocate.call(file, backup_path)
      success_count += 1
      success_log.puts "success: #{file} conversion successfully!"
    elsif EbookTools.respond_to?(method_name)
      begin
        # NOTE(review): PDF.scan_pdf? is asked about every convertible file,
        # not only PDFs — presumably it returns false for other formats.
        if PDF.scan_pdf?(file)
          relocate.call(file, scan_path)
          scan_count += 1
          scan_log.puts "warning: #{file} is scan pdf."
        else
          EbookTools.send(method_name, file, dest_file, { :keywords => keywords })
          relocate.call(file, backup_path)
          success_count += 1
          success_log.puts "success: #{file} conversion successfully!"
        end
      rescue StandardError => e
        # StandardError rather than Exception: Ctrl-C and SystemExit must be
        # able to stop the batch instead of being logged as a failed book.
        relocate.call(file, unknown_path)
        error_count += 1
        error_log.puts "error: #{file} #{e.message}\n#{Array(e.backtrace).join("\n")}"
      end
    end
  end

  success_log.puts "count: #{success_count} Time: #{Time.now} \n"
  scan_log.puts "count: #{scan_count} Time: #{Time.now} \n"
  error_log.puts "count: #{error_count} Time: #{Time.now} \n"
  unknown_log.puts "unknown: #{unknown_count} Time: #{Time.now} \n"
  log.puts "success: #{success_count} scan: #{scan_count} error: #{error_count} Time: #{Time.now} \n"
ensure
  # A log that failed to open is still nil here; close only what was opened.
  [success_log, error_log, scan_log, unknown_log, log].each { |io| io.close if io }
end
#batch_extract_from_dir(source, destination, options = {}) ⇒ Object
batch_extract_from_dir
batch extract book structure from a directory
parameters:
+source+ source directory
+destination+ output directory
+options+ optional parameter.
:format 指定需要提取结构的文件后缀名,例如要从所有txt文件中提取,通过:format=>'.txt'指定
242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 |
# File 'lib/ebook_tools.rb', line 242
# Extracts the book structure of every file found under +source+ and writes
# it as "<basename>.xml" to the matching relative location under
# +destination+.
#
# +source+      source directory.
# +destination+ output directory.
# +options+     Hash; only :format is read, and it is handed to
#               Utils.scan_file_from_dir. The hash is not modified.
#
# A file in which no structure is detected is renamed in place with an
# "[err]" prefix. A file whose extraction raises is reported and left alone.
#
# Returns the list of scanned files.
def batch_extract_from_dir(source, destination, options = {})
  # Read the key instead of deleting it so the caller's hash stays intact.
  format = options[:format]
  source_path = File.absolute_path(source)
  dest_path = File.absolute_path(destination)

  files = Utils.scan_file_from_dir(source_path, { :format => format })
  files.each do |file|
    extname = File.extname(file)
    basename = File.basename(file, extname)
    # `sub` strips only the leading source_path; `gsub` would also remove any
    # later occurrence of the same text inside the relative path.
    relative = file.sub(source_path, '')
    dest_file = File.join(File.dirname(File.join(dest_path, relative)), "#{basename}.xml")

    unless allow_extract_struct?(file)
      puts "error: #{file}不是允许的文件格式: txt,html,epub"
      next
    end

    puts "start extract #{file} ..."
    begin
      if extract_book_struct_to_file(file, dest_file)
        puts "success: extract book struct successfully!"
      else
        new_file = File.join(File.dirname(file), "[err]#{basename}#{extname}")
        FileUtils.mv(file, new_file, :force => true)
        puts "警告: 没有检测到书结构信息."
      end
    rescue StandardError => e
      # StandardError rather than Exception so Ctrl-C still stops the batch.
      puts "error: #{file} #{e.message}\n#{Array(e.backtrace).join("\n")}"
    end
  end
end
#convert(filename, epub_file, options = {}) ⇒ Object
10 11 12 13 14 15 16 17 18 |
# File 'lib/ebook_tools.rb', line 10
# Converts +filename+ to EPUB by dispatching on its extension: a file ending
# in ".txt" is handled by EbookTools.txt2epub, ".pdf" by EbookTools.pdf2epub,
# and so on.
#
# +filename+  path of the input file.
# +epub_file+ path of the EPUB to produce.
# +options+   Hash passed through unchanged to the converter.
#
# Returns true after the converter has run, or nil when EbookTools has no
# "<ext>2epub" method for the file's extension.
def convert(filename, epub_file, options = {})
  method_name = "#{File.extname(filename).delete('.')}2epub"
  return nil unless EbookTools.respond_to?(method_name)

  EbookTools.send(method_name, filename, epub_file, options)
  true
end
#extract_book_struct_to_file(source, destination, options = {}) ⇒ Object
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 |
# File 'lib/ebook_tools.rb', line 203
# Extracts the structure of the book at +source+ and writes it as DocBook XML
# to +destination+.
#
# An ".epub" source is read through EpubBook. Any other source is read as
# text (".html" via Utils.extract_text_from_file, ".txt" directly) and handed
# to TxtBook. The extension is matched case-insensitively, in line with
# allow_extract_struct?.
#
# +source+      path of the input book.
# +destination+ path of the XML file to write.
# +options+     Hash handed to EpubBook.new / TxtBook.new. :title defaults to
#               the source's base name. The caller's hash is not modified.
#
# Returns true when a structure was produced and written, otherwise nil.
def extract_book_struct_to_file(source, destination, options = {})
  extname = File.extname(source)
  # Work on a copy: writing the default title into the caller's hash would
  # make a reused options hash carry the first book's title to later books.
  options = options.dup
  options[:title] ||= File.basename(source, extname)

  if extname.downcase == '.epub'
    book = EpubBook.new(source, options)
    text_source = false
  else
    content = case extname.downcase
              when '.html' then Utils.extract_text_from_file(source, '.html')
              when '.txt' then File.read(source) # File.read closes the handle
              end
    book = TxtBook.new(content, options)
    text_source = true
  end

  docbook_xml = book.to_doc_book
  return nil unless docbook_xml

  write_doc_book(destination, docbook_xml)
  puts "目录结构:"
  puts book.toc_to_text
  # Only the text-based path reports a repaired break count.
  puts "共修复#{book.breaklines_count}个断点." if text_source
  true
end
#html2epub(filename, epub_file, options = {}) ⇒ Object
html2epub 将HTML格式转换成EPUB格式
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# File 'lib/ebook_tools.rb', line 49
# html2epub: converts an HTML file to EPUB.
#
# The HTML is copied into a working directory named after the file's base
# name (relative to the current working directory), sections are detected
# with Utils.detect_sections_from_html, a navigation file is generated with
# EPUB.gen_nav_file, and both files are handed to EPUB.write_epub. The
# working directory is removed afterwards, whether or not conversion
# succeeded.
#
# +filename+  path of the HTML input.
# +epub_file+ path of the EPUB to produce.
# +options+   Hash merged with :files and passed to EPUB.write_epub.
#
# NOTE(review): a directory that already exists under the same name in the
# current working directory is reused and then deleted by the ensure clause.
def html2epub(filename, epub_file, options = {})
  basename = File.basename(filename, '.html')
  temp_dir = basename
  # mkdir_p is a no-op for an existing directory, so the File.exists? guard
  # (removed in Ruby 3.2) is not needed.
  FileUtils.mkdir_p(temp_dir)

  html = File.read(filename) # closes the handle, unlike File.open(...).read
  html_file = File.join(temp_dir, "#{basename}.html")
  Utils.write_file(html, html_file)

  sections = Utils.detect_sections_from_html(html_file)
  nav_file = EPUB.gen_nav_file(html_file, sections)

  EPUB.write_epub(epub_file, options.merge(:files => [nav_file, html_file]))
ensure
  # temp_dir is nil when File.basename itself raised.
  FileUtils.remove_dir(temp_dir, true) if temp_dir
end
#pdf2epub(filename, epub_file, options = {}) ⇒ Object
pdf2epub 将PDF格式转换成EPUB格式
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# pdf2epub: converts the PDF at +filename+ into an EPUB written to +epub_file+.
# Steps visible in the listing: page text is extracted with
# PDF.extract_pdf_pages_text and passed through PDF.fixed_break_with_pages_text;
# sections and illustrations are extracted (illustrations into a working
# directory named after the PDF's base name); HTML is generated, wrapped and
# written into that directory; a navigation file is generated; the collected
# files are handed to EPUB.write_epub. The working directory is removed in the
# ensure clause.
# NOTE(review): this listing is incomplete. Several identifiers are missing:
# the parameter name in `={}` (presumably `options`, per the documented
# signature — confirm), the method names in both `PDF.(...)` calls, the
# variables around `.merge()`, and the last argument of EPUB.write_epub.
# Restore them from lib/ebook_tools.rb before reusing this code.
# NOTE(review): File.exists? was removed in Ruby 3.2; File.exist? replaces it.
# File 'lib/ebook_tools.rb', line 69 def pdf2epub(filename,epub_file,={}) basename = File.basename(filename,'.pdf') temp_dir = "#{basename}" FileUtils.mkdir(temp_dir) unless File.exists?(temp_dir) pages_text = PDF.extract_pdf_pages_text(filename) pages_text = PDF.(pages_text,) pages_text = PDF.fixed_break_with_pages_text(pages_text) sections = PDF.extract_sections(filename) illustrations = PDF.extract_illustrations(filename,{:dir=>temp_dir}) html_content = PDF.gen_html_from_sections_and_page_texts(sections,pages_text,illustrations) html = Utils.wrapper_html(html_content) html_file = File.join([temp_dir,"#{basename}.html"].compact) Utils.write_file(html,html_file) illustrations_path = illustrations.map{|image_path| File.join(temp_dir,image_path)} nav_file = EPUB.gen_nav_file(html_file,sections) files = [html_file,nav_file,illustrations_path].flatten = PDF.(filename) = .merge().merge(:files=>files) EPUB.write_epub(epub_file,) ensure FileUtils.remove_dir(temp_dir,true) end |
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text
287 288 289 290 291 292 293 294 295 296 297 298 |
# File 'lib/ebook_tools.rb', line 287
# Returns +content+ cut off just before the first line that contains the
# phrase "document outline" (matched without regard to letter case). That
# line and everything after it are dropped.
#
# +content+ text to trim; blank content (per #blank?) is returned unchanged.
#
# Returns the retained lines joined back into one String.
def sanitize_for_epub_text(content)
  return content if content.blank?

  kept = content.each_line.take_while do |text_line|
    !text_line.downcase.include?('document outline')
  end
  kept.join
end
#text_paras_repair(source_file, target_file, options = {}) ⇒ Object
text_paras_repair 对文本文件格式中的中断段落进行修复
279 280 281 282 283 284 |
# File 'lib/ebook_tools.rb', line 279
# text_paras_repair: repairs broken paragraphs in a text file.
#
# Reads +source_file+, converts the text with Utils.to_utf8 when
# Utils.detect_utf8 rejects it, passes it through Utils.fixed_page_break and
# writes the result to +target_file+.
#
# +source_file+ path of the text file to read.
# +target_file+ path of the repaired file to write (overwritten).
# +options+     Hash handed to Utils.fixed_page_break.
#
# Returns the number of bytes written.
def text_paras_repair(source_file, target_file, options = {})
  content = File.read(source_file) # closes the handle, unlike File.open(...).read
  # NOTE(review): the listing calls Utils.to_utf8 with no argument, which
  # cannot convert anything; passing the content is presumed to be the
  # intent — confirm against Utils.to_utf8's signature.
  content = Utils.to_utf8(content) unless Utils.detect_utf8(content)
  content = Utils.fixed_page_break(content, options)
  File.write(target_file, content)
end
#txt2epub(filename, epub_file, options = {}) ⇒ Object
txt2epub 将文本格式转换成EPUB格式
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# File 'lib/ebook_tools.rb', line 22
# txt2epub: converts a plain-text file to EPUB.
#
# Title, outlines and content are obtained from TXT.extract_book_part, turned
# into HTML inside a working directory named after the file's base name
# (relative to the current working directory), and handed to EPUB.write_epub
# together with a generated navigation file. The working directory is removed
# afterwards, whether or not conversion succeeded.
#
# +filename+  path of the text input.
# +epub_file+ path of the EPUB to produce.
# +options+   Hash; when :fix is truthy the content is first passed through
#             Utils.fixed_page_break. The hash, merged with :files, is passed
#             to EPUB.write_epub.
#
# NOTE(review): a directory that already exists under the same name in the
# current working directory is reused and then deleted by the ensure clause.
def txt2epub(filename, epub_file, options = {})
  basename = File.basename(filename, '.txt')
  temp_dir = basename
  # mkdir_p is a no-op for an existing directory, so the File.exists? guard
  # (removed in Ruby 3.2) is not needed.
  FileUtils.mkdir_p(temp_dir)

  title, outlines, content = TXT.extract_book_part(filename)
  content = Utils.fixed_page_break(content) if options[:fix]

  html_content = TXT.gen_html_from_txt_book(title, outlines, content)
  html = Utils.wrapper_html(html_content)
  html_file = File.join(temp_dir, "#{basename}.html")
  Utils.write_file(html, html_file)

  sections = Utils.detect_sections_from_html(html_file)
  nav_file = EPUB.gen_nav_file(html_file, sections)

  EPUB.write_epub(epub_file, options.merge(:files => [nav_file, html_file]))
ensure
  # temp_dir is nil when File.basename itself raised.
  FileUtils.remove_dir(temp_dir, true) if temp_dir
end
#write_doc_book(destination, docbook_xml) ⇒ Object
272 273 274 275 |
# File 'lib/ebook_tools.rb', line 272
# Writes +docbook_xml+ to +destination+ in binary mode, creating any missing
# parent directories first. An existing file is overwritten.
#
# +destination+ path of the file to write.
# +docbook_xml+ the XML text to store.
#
# Returns the number of bytes written.
def write_doc_book(destination, docbook_xml)
  # mkdir_p succeeds when the directory already exists, so the Dir.exists?
  # guard (removed in Ruby 3.2) is not needed.
  FileUtils.mkdir_p(File.dirname(destination))
  File.binwrite(destination, docbook_xml)
end