Module: ExtractBookStruct
- Extended by:
- ExtractBookStruct
- Included in:
- ExtractBookStruct
- Defined in:
- lib/extract_book_struct.rb
Instance Method Summary
- #build_doc_book(struct, options = {}) ⇒ Object
- #build_struct(content) ⇒ Object
-
#clean_text(text) ⇒ Object
clean_text 获得干净的文本,去除两边的空格和回车.
- #closed_node(struct, stack) ⇒ Object
- #detect_struct_type(paras) ⇒ Object
- #detect_utf8(content) ⇒ Object
-
#escape_html(text) ⇒ Object
escape_html 文本转义,在txt文本转html时需要使用.
- #extract_book_struct(paras, options = {}) ⇒ Object
-
#extract_digital_book_struct(content, options = {}) ⇒ Object
从数字类型书中提取结构.
-
#extract_hybrid_book_struct(content, options = {}) ⇒ Object
从混合类型书中提取结构.
- #extract_paras(content) ⇒ Object
-
#extract_text_book_struct(content, options = {}) ⇒ Object
从text类型书中提取结构.
- #extract_text_from_file(filename, format) ⇒ Object
- #extract_toc_from_struct(struct) ⇒ Object
- #from_epub(filename, options = {}) ⇒ Object
- #from_html(filename, options = {}) ⇒ Object
- #from_txt(filename, options = {}) ⇒ Object
- #gen_docbook_content(struct) ⇒ Object
- #gen_docbook_toc(toc) ⇒ Object
- #gen_docbook_tocdiv(toc) ⇒ Object
- #guess_appendix?(text) ⇒ Boolean
- #guess_chapter?(text) ⇒ Boolean
- #guess_digital_head_line?(text) ⇒ Boolean
- #guess_digital_section?(text) ⇒ Boolean
- #guess_glossary?(text) ⇒ Boolean
- #guess_head_line?(text) ⇒ Boolean
- #guess_index?(text) ⇒ Boolean
- #guess_part?(text, options = {}) ⇒ Boolean
- #guess_preface?(text) ⇒ Boolean
- #guess_section?(text) ⇒ Boolean
- #guess_volume?(text, options = {}) ⇒ Boolean
- #hav_complete_sentence?(text) ⇒ Boolean
- #mark_digital_struct_info(content) ⇒ Object
- #mark_hybrid_struct_info(content) ⇒ Object
-
#mark_struct_info(content) ⇒ Object
标注结构信息 将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。.
-
#revise_struct(struct) ⇒ Object
修正结构 TODO.
-
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text.
- #to_utf8(text, encoding = 'GB2312') ⇒ Object
Instance Method Details
#build_doc_book(struct, options = {}) ⇒ Object
258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 |
# File 'lib/extract_book_struct.rb', line 258
# Render a parsed book structure as a DocBook 5 XML document.
#
# @param struct [Array] items as produced by build_struct; only Hash nodes
#   are rendered at the top level, bare text lines found there are skipped
# @param options [Hash] book metadata interpolated into <info>:
#   :title, :author, :pubdate, :publisher (missing keys render as empty)
# @return [String] the DocBook XML
#
# NOTE(review): the metadata values are interpolated as-is; callers must pass
# XML-safe strings.
def build_doc_book(struct, options = {})
  toc = extract_toc_from_struct(struct)
  doc_toc = gen_docbook_toc(toc)
  nodes = struct.select { |item| item.is_a?(Hash) }
  doc_content = gen_docbook_content(nodes)
  <<-EOS
<?xml version="1.0" encoding="utf-8"?>
<book xmlns="http://docbook.org/ns/docbook" version="5.0">
<info>
<title>#{options[:title]}</title>
<author>#{options[:author]}</author>
<pubdate>#{options[:pubdate]}</pubdate>
<publisher>#{options[:publisher]}</publisher>
</info>
#{doc_toc}
#{doc_content}
</book>
  EOS
end
#build_struct(content) ⇒ Object
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 |
# File 'lib/extract_book_struct.rb', line 386
# Turn a flat list of marked lines into a nested tree of nodes.
#
# Each heading (a Hash with :title and :type) opens a node at a fixed depth:
# volume(0) > part(1) > chapter/appendix/index/glossary/preface/afterword(2)
# > sect1(3) > sect2(4) > sect3(5) > sect4(6) > sect5(7). Opening a node first
# closes every open node at the same or a deeper level. Plain lines are
# appended to the deepest open node, or to the result when nothing is open.
#
# Fixes over the previous version:
# * a second :sect5 no longer discards its open sect4 parent (the live stack
#   used to be handed to the destructive closed_node);
# * :section headings are treated as :sect1 instead of vanishing;
# * headings of any other unknown type are kept as plain text instead of
#   being dropped.
#
# @param content [Array<String, Hash>] lines; Hashes carry :title and :type
# @return [Array] top-level nodes ({:title, :type, :children}) and any text
#   that preceded the first heading
def build_struct(content)
  levels = {
    :volume => 0, :part => 1,
    :chapter => 2, :appendix => 2, :index => 2,
    :glossary => 2, :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  # Chapter-level kinds whose sect1 headings are kept as plain text.
  flat_parents = %w[preface appendix index glossary]

  stack = Array.new(8)
  struct = []

  # Attach every open node at depth >= from to its nearest open ancestor
  # (or to struct when it has none), deepest first, and clear its slot.
  close_from = lambda do |from|
    7.downto(from) do |index|
      node = stack[index]
      next unless node
      parent = stack[0...index].compact.last
      (parent ? parent[:children] : struct) << node
      stack[index] = nil
    end
  end

  # Append a text line to the deepest open node, or to the result.
  append_text = lambda do |text|
    owner = stack.compact.last
    (owner ? owner[:children] : struct) << text
  end

  content.each do |line|
    unless line.is_a?(Hash)
      append_text.call(line)
      next
    end

    type = line[:type].to_s.to_sym
    type = :sect1 if type == :section
    level = levels[type]

    if level.nil?
      append_text.call(line[:title]) unless line[:title].nil?
    elsif type == :sect1 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
    else
      close_from.call(level)
      stack[level] = {:title => line[:title], :type => type.to_s, :children => []}
    end
  end

  close_from.call(0)
  struct
end
#clean_text(text) ⇒ Object
clean_text
获得干净的文本,去除两边的空格和回车
583 584 585 586 587 |
# File 'lib/extract_book_struct.rb', line 583
# Strip leading/trailing whitespace, then delete every remaining newline.
#
# @param text [String, nil]
# @return [String, nil] the cleaned copy; nil is passed through unchanged
def clean_text(text)
  text.nil? ? text : text.strip.delete("\n")
end
#closed_node(struct, stack) ⇒ Object
473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 |
# File 'lib/extract_book_struct.rb', line 473
# Close the node in the last slot of +stack+: append it to the :children of
# the nearest preceding non-nil entry, or to +struct+ when there is none.
# Does nothing when the last slot is empty.
#
# Unlike the previous version this does not pop from +stack+, so it is safe
# to pass the live stack rather than a copy.
#
# @param struct [Array] receives the node when it has no open ancestor
# @param stack [Array<Hash, nil>] open nodes by depth; the last entry is closed
# @return [Array, nil] +struct+ when the node was appended to it, else nil
def closed_node(struct, stack)
  node = stack.last
  return nil unless node

  parent = stack[0...-1].reverse.find { |entry| entry }
  if parent
    parent[:children] << node
    nil
  else
    struct << node
  end
end
#detect_struct_type(paras) ⇒ Object
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# File 'lib/extract_book_struct.rb', line 129
# Classify how a book marks its headings.
#
# @param paras [Array<String>] paragraphs to inspect
# @return [Symbol] :hybrid when both worded headings (volume/part/chapter/
#   section) and numbered headings occur, :text or :digital when only one
#   kind occurs, :unknown when neither does
def detect_struct_type(paras)
  worded = paras.any? do |para|
    guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
  end
  numbered = paras.any? { |para| guess_digital_head_line?(para) }

  return :hybrid if worded && numbered
  return :text if worded
  return :digital if numbered
  :unknown
end
#detect_utf8(content) ⇒ Object
561 562 563 564 565 566 |
# File 'lib/extract_book_struct.rb', line 561
# Report whether +content+ is well-formed UTF-8.
#
# The bytes are checked against UTF-8 explicitly, on a copy, instead of
# relying on String#strip raising and a blanket rescue.
#
# @param content [String]
# @return [Boolean] false for invalid UTF-8 and for non-String input
def detect_utf8(content)
  return false unless content.respond_to?(:force_encoding)
  content.dup.force_encoding('UTF-8').valid_encoding?
end
#escape_html(text) ⇒ Object
escape_html 文本转义,在txt文本转html时需要使用
591 592 593 |
# File 'lib/extract_book_struct.rb', line 591
# Escape HTML/XML special characters; used when turning plain text into markup.
#
# @param text [String]
# @return [String] the escaped copy
def escape_html(text)
  CGI.escapeHTML(text)
end
#extract_book_struct(paras, options = {}) ⇒ Object
95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# File 'lib/extract_book_struct.rb', line 95
# Build the DocBook document for a list of paragraphs, dispatching on the
# book's heading style.
#
# @param paras [Array<String>] cleaned paragraphs
# @param options [Hash] :format (:text, :digital or :hybrid, Symbol or
#   String) overrides auto-detection; the whole hash is forwarded to the
#   format-specific extractor
# @return [String, nil] DocBook XML, or nil (after printing a warning) when
#   no known structure type applies
def extract_book_struct(paras, options = {})
  # Determine the book type (text, digital, hybrid).
  format = options[:format] || detect_struct_type(paras)
  format = format.to_sym if format.respond_to?(:to_sym)
  case format
  when :text
    extract_text_book_struct(paras, options)
  when :digital
    extract_digital_book_struct(paras, options)
  when :hybrid
    extract_hybrid_book_struct(paras, options)
  else
    puts "警告: 没有检测到书结构信息."
    nil
  end
end
#extract_digital_book_struct(content, options = {}) ⇒ Object
从数字类型书中提取结构
169 170 171 172 173 174 175 176 177 178 179 180 |
# File 'lib/extract_book_struct.rb', line 169
# Extract the structure of a book whose headings are numbered ("1.2 Title")
# and render it as DocBook.
#
# @param content [Array<String>] cleaned paragraphs
# @param options [Hash] forwarded to build_doc_book
# @return [String] DocBook XML
def extract_digital_book_struct(content, options = {})
  marked_content = mark_digital_struct_info(content)
  # Build the book structure.
  struct = build_struct(marked_content)
  # Revise the structure.
  revised_struct = revise_struct(struct)
  # Generate the DocBook output.
  build_doc_book(revised_struct, options)
end
#extract_hybrid_book_struct(content, options = {}) ⇒ Object
从混合类型书中提取结构
183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/extract_book_struct.rb', line 183
# Extract the structure of a book that mixes worded headings with numbered
# section headings, and render it as DocBook.
#
# @param content [Array<String>] cleaned paragraphs
# @param options [Hash] forwarded to build_doc_book
# @return [String] DocBook XML
def extract_hybrid_book_struct(content, options = {})
  marked_content = mark_hybrid_struct_info(content)
  # Build the book structure.
  struct = build_struct(marked_content)
  # Revise the structure.
  revised_struct = revise_struct(struct)
  # Generate the DocBook output.
  build_doc_book(revised_struct, options)
end
#extract_paras(content) ⇒ Object
120 121 122 123 124 125 126 127 |
# File 'lib/extract_book_struct.rb', line 120
# Split raw text into cleaned, non-empty paragraphs (one per input line).
#
# @param content [String]
# @return [Array<String>] each line passed through clean_text, blanks removed
def extract_paras(content)
  content.each_line.map { |line| clean_text(line) }.reject { |text| text.length == 0 }
end
#extract_text_book_struct(content, options = {}) ⇒ Object
从text类型书中提取结构
154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/extract_book_struct.rb', line 154
# Extract the structure of a book whose headings are worded, and render it
# as DocBook.
#
# @param content [Array<String>] cleaned paragraphs
# @param options [Hash] forwarded to build_doc_book
# @return [String] DocBook XML
def extract_text_book_struct(content, options = {})
  # Mark the structural lines.
  marked_content = mark_struct_info(content)
  # Build the book structure.
  struct = build_struct(marked_content)
  # Revise the structure.
  revised_struct = revise_struct(struct)
  # Generate the DocBook output.
  build_doc_book(revised_struct, options)
end
#extract_text_from_file(filename, format) ⇒ Object
111 112 113 114 115 116 117 118 |
# File 'lib/extract_book_struct.rb', line 111
# Convert an e-book file to plain text with the external `ebook-convert`
# tool and return the sanitized text.
#
# The command is run from an argument array (no shell), so file names with
# spaces or shell metacharacters are passed through literally. The
# intermediate .txt file is created in the current directory and is removed
# even when conversion or reading fails.
#
# @param filename [String] path of the source file
# @param format [String] extension to strip from the basename, e.g. '.epub'
# @return [String] converted text after sanitize_for_epub_text
# @raise [Errno::ENOENT] when the tool is missing or produced no output file
def extract_text_from_file(filename, format)
  txt_path = "#{File.basename(filename, format)}.txt"
  begin
    # Read (and discard) the tool's stdout so it does not leak to ours.
    IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
    content = File.read(txt_path)
  ensure
    FileUtils.remove_file(txt_path, true)
  end
  sanitize_for_epub_text(content)
end
#extract_toc_from_struct(struct) ⇒ Object
496 497 498 499 500 501 502 503 504 505 506 507 508 509 |
# File 'lib/extract_book_struct.rb', line 496
# Derive a table-of-contents tree from a book structure: the same nesting,
# with every non-Hash (text) item removed.
#
# @param struct [Array] nodes ({:title, :type, :children}) mixed with text
# @return [Array<Hash>] nodes with :title, :type and recursively filtered
#   :children
def extract_toc_from_struct(struct)
  struct.select { |node| node.is_a?(Hash) }.map do |node|
    nested = node[:children]
    {
      :title => node[:title],
      :type => node[:type],
      :children => nested.any? ? extract_toc_from_struct(nested) : []
    }
  end
end
#from_epub(filename, options = {}) ⇒ Object
88 89 90 91 92 93 |
# File 'lib/extract_book_struct.rb', line 88
# Build a DocBook document from an EPUB file.
#
# @param filename [String] path to the .epub file
# @param options [Hash] forwarded to extract_book_struct
# @return [String, nil] whatever extract_book_struct returns
def from_epub(filename, options = {})
  content = extract_text_from_file(filename, '.epub')
  content = to_utf8(content) unless detect_utf8(content)
  paras = extract_paras(content)
  extract_book_struct(paras, options)
end
#from_html(filename, options = {}) ⇒ Object
81 82 83 84 85 86 |
# File 'lib/extract_book_struct.rb', line 81
# Build a DocBook document from an HTML file.
#
# @param filename [String] path to the .html file
# @param options [Hash] forwarded to extract_book_struct
# @return [String, nil] whatever extract_book_struct returns
def from_html(filename, options = {})
  content = extract_text_from_file(filename, '.html')
  content = to_utf8(content) unless detect_utf8(content)
  paras = extract_paras(content)
  extract_book_struct(paras, options)
end
#from_txt(filename, options = {}) ⇒ Object
71 72 73 74 75 76 77 78 79 |
# File 'lib/extract_book_struct.rb', line 71
# Build a DocBook document from a plain-text file.
#
# @param filename [String] path to the text file
# @param options [Hash] forwarded to extract_book_struct
# @return [String, nil] whatever extract_book_struct returns
def from_txt(filename, options = {})
  # File.read closes the handle; File.open(...).read left it open.
  content = File.read(filename)
  content = to_utf8(content) unless detect_utf8(content)
  content = sanitize_for_epub_text(content)
  paras = extract_paras(content)
  extract_book_struct(paras, options)
end
#gen_docbook_content(struct) ⇒ Object
527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 |
# File 'lib/extract_book_struct.rb', line 527
# Render a book structure as DocBook body markup.
#
# Hash nodes become <part> (for 'volume'/'part') or an element named after
# their :type (chapter, appendix, glossary, index, preface, sect1..sect5),
# each with a generated UUID attribute and its children rendered inside.
# Nodes of any other :type produce no output. Non-Hash items become <para>
# elements, skipped when empty after cleaning.
#
# Titles are now escaped like paragraph text, so a heading containing
# '&' or '<' no longer yields malformed XML.
#
# @param struct [Array] nodes ({:title, :type, :children}) mixed with text
# @return [String] markup fragments joined by newlines
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    if item.is_a?(Hash)
      children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
      title = escape_html(item[:title].to_s)
      case item[:type]
      when 'volume', 'part'
        content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
      when 'chapter', 'appendix', 'glossary', 'index', 'preface',
           'sect1', 'sect2', 'sect3', 'sect4', 'sect5'
        content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
      end
    else
      text = escape_html(clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
    end
  end
  content.join("\n")
end
#gen_docbook_toc(toc) ⇒ Object
511 512 513 |
# File 'lib/extract_book_struct.rb', line 511
# Wrap the rendered TOC entries in a DocBook <toc> element.
#
# @param toc [Array<Hash>] tree as returned by extract_toc_from_struct
# @return [String] the <toc> markup
def gen_docbook_toc(toc)
  entries = gen_docbook_tocdiv(toc)
  "<toc><title>Table of contents</title>#{entries}</toc>"
end
#gen_docbook_tocdiv(toc) ⇒ Object
515 516 517 518 519 520 521 522 523 524 525 |
# File 'lib/extract_book_struct.rb', line 515
# Render TOC entries as nested <tocdiv> elements.
#
# Titles are escaped so a heading containing '&' or '<' does not produce
# malformed XML.
#
# @param toc [Array<Hash>] entries with :title and :children
# @return [String] concatenated <tocdiv> markup ('' for an empty list)
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    children = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    "<tocdiv><title>#{escape_html(item[:title].to_s)}</title>#{children}</tocdiv>"
  end.join("")
end
#guess_appendix?(text) ⇒ Boolean
330 331 332 333 334 335 336 337 |
# File 'lib/extract_book_struct.rb', line 330
# Guess whether a line is an appendix heading: "附录" / "appendix" alone, or
# followed by a digit, Roman numeral or Latin letter (English match is
# case-insensitive). Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_appendix?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^附\s*录$/
  return true if text =~ /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
  lowered = text.downcase
  return true if lowered =~ /^appendix$/
  lowered =~ /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/ ? true : false
end
#guess_chapter?(text) ⇒ Boolean
296 297 298 299 300 301 |
# File 'lib/extract_book_struct.rb', line 296
# Guess whether a line is a chapter heading: "第…章" / "第…回" with 1-4
# characters in between, or "chapter" followed by a digit or Roman numeral
# (case-insensitive). Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_chapter?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^第.{1,4}[章回]/
  text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#guess_digital_head_line?(text) ⇒ Boolean
358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 |
# File 'lib/extract_book_struct.rb', line 358
# Guess the heading kind of a numbered line such as "3 Title" or "3.10 Title".
#
# The numbering must be followed by whitespace and a non-empty title, and
# its first component must not exceed 99. Each sub-level may now have more
# than one digit ("1.10"); the old pattern accepted only single digits.
#
# NOTE(review): hav_complete_sentence? is consulted first and applies its own
# numbering pattern, so it can still veto a line before this one runs.
#
# @param text [String]
# @return [Symbol, false, nil] :chapter for one numbering level,
#   :sectN for N+1 levels; false when rejected; nil when there is no
#   numbered prefix
def guess_digital_head_line?(text)
  return false if hav_complete_sentence?(text)
  matcher = text.match(/(^\d+(\.\d+)*\s)(.*)/)
  return nil unless matcher
  return false if matcher[3].length == 0

  levels = matcher[1].split(".")
  return false if levels[0].to_i > 99
  levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
end
#guess_digital_section?(text) ⇒ Boolean
348 349 350 351 352 353 354 355 356 |
# File 'lib/extract_book_struct.rb', line 348
# Guess the section level of a dotted-number heading such as "2.3 Title".
#
# The level is counted from the numbering alone (captured separately), and
# the last component may have several digits; previously the final component
# was limited to one digit and the dots were counted over the whole match,
# title included.
#
# NOTE(review): hav_complete_sentence? is consulted first and applies its own
# numbering pattern, so it can still veto a line before this one runs.
#
# @param text [String]
# @return [Symbol, false, nil] :sectN where N is the number of dots; false
#   when rejected (sentence-like or empty title); nil when there is no dotted
#   numbering
def guess_digital_section?(text)
  return false if hav_complete_sentence?(text)
  matcher = text.match(/^((?:\d+\.)+\d+)\s*(.*)/)
  return nil unless matcher
  return false if matcher[2].length == 0
  "sect#{matcher[1].split('.').count - 1}".to_sym
end
#guess_glossary?(text) ⇒ Boolean
339 340 341 342 343 344 345 346 |
# File 'lib/extract_book_struct.rb', line 339
# Guess whether a line is a glossary heading: "术语" / "glossary" alone, or
# followed by a digit or Roman numeral (English match is case-insensitive).
# Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_glossary?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^术\s*语$/
  return true if text =~ /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  lowered = text.downcase
  return true if lowered =~ /^glossary$/
  lowered =~ /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#guess_head_line?(text) ⇒ Boolean
374 375 376 377 378 379 380 381 382 383 |
# File 'lib/extract_book_struct.rb', line 374
# Classify a line as a worded heading by trying each predicate in a fixed
# order and returning the kind of the first one that accepts it.
#
# @param text [String]
# @return [Symbol, nil] :volume, :part, :chapter, :section, :preface,
#   :appendix, :index or :glossary; nil when no predicate matches
def guess_head_line?(text)
  checks = [
    [:volume, :guess_volume?],
    [:part, :guess_part?],
    [:chapter, :guess_chapter?],
    [:section, :guess_section?],
    [:preface, :guess_preface?],
    [:appendix, :guess_appendix?],
    [:index, :guess_index?],
    [:glossary, :guess_glossary?]
  ]
  hit = checks.find { |_kind, predicate| send(predicate, text) }
  hit ? hit.first : nil
end
#guess_index?(text) ⇒ Boolean
321 322 323 324 325 326 327 328 |
# File 'lib/extract_book_struct.rb', line 321
# Guess whether a line is an index heading: "索引" / "index" alone, or
# followed by a digit or Roman numeral (English match is case-insensitive).
# Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_index?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^索\s*引$/
  return true if text =~ /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  lowered = text.downcase
  return true if lowered =~ /^index$/
  lowered =~ /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#guess_part?(text, options = {}) ⇒ Boolean
289 290 291 292 293 294 |
# File 'lib/extract_book_struct.rb', line 289
# Guess whether a line is a part heading: "第…部" / "第…篇" with 1-3
# characters in between, or "part" followed by a digit or Roman numeral
# (case-insensitive). Lines that look like complete sentences never qualify.
#
# @param text [String]
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Boolean] always true or false (previously nil on no match)
def guess_part?(text, options = {})
  return false if hav_complete_sentence?(text)
  return true if text =~ /^第.{1,3}[部篇]/
  text.downcase =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#guess_preface?(text) ⇒ Boolean
308 309 310 311 312 313 314 315 316 317 318 319 |
# File 'lib/extract_book_struct.rb', line 308
# Guess whether a line is a preface heading: "前言", "序言" or "序" alone,
# "序言" followed by a digit or Roman numeral, or "preface" / "foreword"
# alone or followed by a digit or Roman numeral (case-insensitive).
# Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_preface?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^前\s*言$/
  return true if text =~ /^序\s*言$/
  return true if text =~ /^序$/
  return true if text =~ /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  lowered = text.downcase
  return true if lowered =~ /^preface$/
  return true if lowered =~ /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  return true if lowered =~ /^foreword$/
  lowered =~ /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#guess_section?(text) ⇒ Boolean
303 304 305 306 |
# File 'lib/extract_book_struct.rb', line 303
# Guess whether a line is a section heading: "第…节" with 1-3 characters in
# between. Lines that look like complete sentences never qualify.
#
# @param text [String]
# @return [Boolean] always true or false (previously nil on no match)
def guess_section?(text)
  return false if hav_complete_sentence?(text)
  text =~ /^第.{1,3}[节]/ ? true : false
end
#guess_volume?(text, options = {}) ⇒ Boolean
282 283 284 285 286 287 |
# File 'lib/extract_book_struct.rb', line 282
# Guess whether a line is a volume heading: "第…卷" with 1-3 characters in
# between, "卷" followed by a digit or Roman numeral, or "volume" followed by
# a digit or Roman numeral (case-insensitive). Lines that look like complete
# sentences never qualify.
#
# @param text [String]
# @param options [Hash] accepted for interface compatibility; not read here
# @return [Boolean] always true or false (previously nil on no match)
def guess_volume?(text, options = {})
  return false if hav_complete_sentence?(text)
  return true if text =~ /^第.{1,3}卷/
  return true if text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  text.downcase =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/ ? true : false
end
#hav_complete_sentence?(text) ⇒ Boolean
491 492 493 494 |
# File 'lib/extract_book_struct.rb', line 491
# Report whether a line contains sentence punctuation, which the heading
# detectors use to rule a line out as a title.
#
# A leading section number ("3 ", "3.2 ", "3.10 ") is removed first so its
# dots are not mistaken for full stops; sub-levels may now have more than
# one digit. Besides ASCII '.', '!' and '?' and the ideographic full stop,
# the full-width '！' and '？' are recognised.
#
# @param text [String]
# @return [Boolean] true when punctuation is found (previously the match
#   index or nil)
def hav_complete_sentence?(text)
  body = text.gsub(/^\d+(\.\d+)*\s/, '')
  body =~ /[\.。!\?！？]/ ? true : false
end
#mark_digital_struct_info(content) ⇒ Object
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/extract_book_struct.rb', line 233
# Tag heading lines of a numbered-style book. Each non-empty line is tried
# as a worded heading first and as a numbered heading second; matches become
# {:title, :type} hashes, everything else is kept as a plain string.
#
# @param content [Array<String>]
# @return [Array<String, Hash>] the non-empty lines, in order
def mark_digital_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    kind = guess_head_line?(text) || guess_digital_head_line?(text)
    kind ? {:title => text, :type => kind} : text
  end
end
#mark_hybrid_struct_info(content) ⇒ Object
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# File 'lib/extract_book_struct.rb', line 213
# Tag heading lines of a hybrid-style book. Each non-empty line is tried as
# a worded heading first and as a dotted-number section second; matches
# become {:title, :type} hashes, everything else is kept as a plain string.
#
# @param content [Array<String>]
# @return [Array<String, Hash>] the non-empty lines, in order
def mark_hybrid_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    kind = guess_head_line?(text) || guess_digital_section?(text)
    kind ? {:title => text, :type => kind} : text
  end
end
#mark_struct_info(content) ⇒ Object
标注结构信息
将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。
198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
# File 'lib/extract_book_struct.rb', line 198
# Tag structural lines. Every non-empty line is checked with
# guess_head_line?; headings become {:title, :type} hashes and all other
# lines are kept as plain strings, in their original order.
#
# @param content [Array<String>]
# @return [Array<String, Hash>]
def mark_struct_info(content)
  content.reject { |text| text.length == 0 }.map do |text|
    kind = guess_head_line?(text)
    kind ? {:title => text, :type => kind} : text
  end
end
#revise_struct(struct) ⇒ Object
修正结构 TODO
254 255 256 |
# File 'lib/extract_book_struct.rb', line 254
# Placeholder for a structure-correction pass.
# TODO: not implemented; the structure is returned untouched.
#
# @param struct [Array]
# @return [Array] the very same object that was passed in
def revise_struct(struct)
  struct
end
#sanitize_for_epub_text(content) ⇒ Object
sanitize_for_epub_text
569 570 571 572 573 574 575 576 577 578 579 |
# File 'lib/extract_book_struct.rb', line 569
# Keep the text up to, but not including, the first line that contains
# "document outline" (case-insensitive); everything from that line on is
# discarded.
#
# @param content [String]
# @return [String] the retained lines, joined unchanged
def sanitize_for_epub_text(content)
  kept = content.each_line.take_while do |line|
    !line.downcase.include?('document outline')
  end
  kept.join("")
end
#to_utf8(text, encoding = 'GB2312') ⇒ Object
553 554 555 556 557 558 559 |
# File 'lib/extract_book_struct.rb', line 553
# Convert text from a legacy encoding to UTF-8, dropping bytes that cannot
# be converted.
#
# Uses String#encode in place of Iconv, which is no longer part of the
# standard library; with the old blanket rescue a missing Iconv made this
# method silently return its input unconverted.
#
# @param text [String, nil] bytes in +encoding+ (the string's own encoding
#   tag is ignored)
# @param encoding [String] name of the source encoding
# @return [String, nil] a UTF-8 copy; the original +text+ when it is nil or
#   when +encoding+ is unknown or has no converter (best effort, as before)
def to_utf8(text, encoding = 'GB2312')
  return text if text.nil?
  text.dup.force_encoding(encoding).encode(
    'UTF-8', :invalid => :replace, :undef => :replace, :replace => ''
  )
rescue ArgumentError, EncodingError
  text
end