Module: ExtractBookStruct

Extended by:
ExtractBookStruct
Included in:
ExtractBookStruct
Defined in:
lib/extract_book_struct.rb

Instance Method Summary collapse

Instance Method Details

#build_doc_book(struct, options = {}) ⇒ Object



258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/extract_book_struct.rb', line 258

# Assembles the final DocBook 5 <book> document as a String.
#
# struct  - Array mixing Hash nodes (:title, :type, :children) and plain text.
# options - Hash; :title, :author, :pubdate and :publisher are interpolated
#           into the <info> header exactly as given.
#           NOTE(review): these values are not XML-escaped here — confirm
#           callers pass safe text.
#
# Returns the XML text.
def build_doc_book(struct,options={})
  # The table of contents is derived from the full structure.
  doc_toc = gen_docbook_toc(extract_toc_from_struct(struct))

  # Only Hash nodes at the top level are rendered into the body.
  nodes = struct.select { |entry| entry.is_a?(Hash) }
  doc_content = gen_docbook_content(nodes)

<<-EOS
<?xml version="1.0" encoding="utf-8"?>
  <book xmlns="http://docbook.org/ns/docbook" version="5.0">
  <info>
  <title>#{options[:title]}</title>
  <author>#{options[:author]}</author>
  <pubdate>#{options[:pubdate]}</pubdate>
  <publisher>#{options[:publisher]}</publisher>
  </info>
  #{doc_toc}
  #{doc_content}
  </book>
EOS
end

#build_struct(content) ⇒ Object



386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
# File 'lib/extract_book_struct.rb', line 386

# Turns the flat, marked-up line list into a nested tree.
#
# content - Array whose entries are either plain text or heading Hashes with
#           :type and :title.
#
# Each heading type owns one slot in an 8-slot stack of currently open nodes
# (volume=0, part=1, chapter-like=2, sect1..sect5=3..7). Opening a heading
# first closes every open node at the same or a deeper slot. Plain text is
# appended to the deepest open node, or to the top level when none is open.
#
# Returns an Array of top-level entries: node Hashes
# ({:title, :type (String), :children}) and any text seen before a heading.
def build_struct(content)
  levels = {
    :volume => 0, :part => 1,
    :chapter => 2, :appendix => 2, :index => 2, :glossary => 2,
    :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  # Under these chapter-level nodes a sect1 heading is kept as plain text.
  flat_parents = ['preface','appendix','index','glossary']
  deepest = 7
  stack = Array.new(deepest + 1)
  struct = []

  # Closes every open node from the deepest slot up to +level+.
  # closed_node pops from the array it is given, so it always receives a
  # copy (stack[0..index]); handing it the live stack would drop open
  # ancestors without attaching them to their parents.
  close_from = lambda do |level|
    deepest.downto(level) do |index|
      closed_node(struct,stack[0..index])
      stack[index] = nil
    end
  end

  content.each do |line|
    unless line.is_a?(Hash)
      open_node = stack.compact.last
      (open_node ? open_node[:children] : struct) << line
      next
    end

    type = line[:type].to_sym
    # guess_head_line? reports "第X节" headings as :section; they are
    # first-level sections.
    type = :sect1 if type == :section
    level = levels[type]
    next unless level # heading types without a slot are ignored

    if type == :sect1 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
      next
    end

    close_from.call(level)
    stack[level] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
  end

  close_from.call(0)
  struct
end

#clean_text(text) ⇒ Object

clean_text

获得干净的文本,去除两边的空格和回车


583
584
585
586
587
# File 'lib/extract_book_struct.rb', line 583

# Returns +text+ with surrounding whitespace stripped and every remaining
# newline removed. A nil argument is handed back as nil.
def clean_text(text)
  text.nil? ? text : text.strip.gsub("\n",'')
end

#closed_node(struct, stack) ⇒ Object



473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
# File 'lib/extract_book_struct.rb', line 473

# Closes the node at the end of +stack+.
#
# The last element is popped; if it is a node, further elements are popped
# until the nearest non-nil one is found, and the node is appended to that
# ancestor's :children. When no ancestor exists the node is appended to
# +struct+ instead. +stack+ is modified in place by the pops.
def closed_node(struct,stack)
  node = stack.pop
  return unless node

  parent = nil
  parent = stack.pop while !parent && stack.any?
  return struct << node unless parent

  parent[:children] << node
  nil
end

#detect_struct_type(paras) ⇒ Object



129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/extract_book_struct.rb', line 129

# Classifies the heading style used by +paras+.
#
# Returns :hybrid when both textual headings (volume/part/chapter/section)
# and numbered headings are present, :text or :digital when only one kind
# is, and :unknown when neither is found.
def detect_struct_type(paras)
  has_text = paras.any? do |para|
    guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
  end
  has_digital = paras.any? { |para| guess_digital_head_line?(para) }

  return :hybrid if has_text && has_digital
  return :text if has_text
  return :digital if has_digital

  :unknown
end

#detect_utf8(content) ⇒ Object



561
562
563
564
565
566
# File 'lib/extract_book_struct.rb', line 561

# Reports whether +content+ is validly encoded in the encoding it is tagged
# with (UTF-8 for text read under a UTF-8 default external encoding).
#
# The byte sequence is checked directly instead of stripping every line and
# waiting for an exception, so the result does not depend on which String
# methods happen to raise on malformed input.
#
# Returns true or false; anything that is not string-like (e.g. nil) yields
# false.
def detect_utf8(content)
  content.respond_to?(:valid_encoding?) && content.valid_encoding?
end

#escape_html(text) ⇒ Object

escape_html 文本转义,在txt文本转html时需要使用



591
592
593
# File 'lib/extract_book_struct.rb', line 591

# Escapes HTML/XML special characters in +text+ via CGI.escapeHTML; used
# when plain text is turned into markup.
def escape_html(text)
  CGI.escapeHTML(text)
end

#extract_book_struct(paras, options = {}) ⇒ Object



95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/extract_book_struct.rb', line 95

# Picks the extraction strategy for +paras+ and runs it.
#
# The book format is options[:format] when given, otherwise it is detected
# from the paragraphs. :text, :digital and :hybrid are supported; any other
# value prints a warning and yields nil.
def extract_book_struct(paras,options={})
  format = options[:format] || detect_struct_type(paras)
  extractor = {
    :text    => :extract_text_book_struct,
    :digital => :extract_digital_book_struct,
    :hybrid  => :extract_hybrid_book_struct
  }[format]
  return send(extractor,paras,options) if extractor

  puts "警告: 没有检测到书结构信息."
  nil
end

#extract_digital_book_struct(content, options = {}) ⇒ Object

从数字类型书中提取结构



169
170
171
172
173
174
175
176
177
178
179
180
# File 'lib/extract_book_struct.rb', line 169

# Extracts the structure of a book whose headings are numbered
# ("1 ...", "1.1 ...") and renders it as DocBook.
#
# Pipeline: mark headings -> build tree -> revise tree -> generate DocBook.
def extract_digital_book_struct(content,options={})
  tree = build_struct(mark_digital_struct_info(content))
  build_doc_book(revise_struct(tree),options)
end

#extract_hybrid_book_struct(content, options = {}) ⇒ Object

从混合类型书中提取结构



183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/extract_book_struct.rb', line 183

# Extracts the structure of a book that mixes textual headings with
# numbered sections and renders it as DocBook.
#
# Pipeline: mark headings -> build tree -> revise tree -> generate DocBook.
def extract_hybrid_book_struct(content,options={})
  tree = build_struct(mark_hybrid_struct_info(content))
  build_doc_book(revise_struct(tree),options)
end

#extract_paras(content) ⇒ Object



120
121
122
123
124
125
126
127
# File 'lib/extract_book_struct.rb', line 120

# Splits +content+ into lines, cleans each one with clean_text and returns
# the non-empty results as an Array of paragraphs.
def extract_paras(content)
  content.each_line.map { |line| clean_text(line) }.select { |text| text.length > 0 }
end

#extract_text_book_struct(content, options = {}) ⇒ Object

从text类型书中提取结构



154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/extract_book_struct.rb', line 154

# Extracts the structure of a book that uses textual headings and renders
# it as DocBook.
#
# Pipeline: mark headings -> build tree -> revise tree -> generate DocBook.
def extract_text_book_struct(content,options={})
  tree = build_struct(mark_struct_info(content))
  build_doc_book(revise_struct(tree),options)
end

#extract_text_from_file(filename, format) ⇒ Object



111
112
113
114
115
116
117
118
# File 'lib/extract_book_struct.rb', line 111

# Converts +filename+ to plain text with the external `ebook-convert` tool
# and returns the sanitized text.
#
# filename - path of the source document.
# format   - its extension (e.g. '.epub'); stripped from the base name to
#            build the temporary "<basename>.txt" in the working directory.
#
# The temporary file is removed afterwards, even when reading fails.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  begin
    # Argument-array form: no shell is involved, so spaces or shell
    # metacharacters in the path cannot break or hijack the command.
    # The tool's stdout is read and discarded, as before.
    IO.popen(['ebook-convert', filename.to_s, txt_path]) { |io| io.read }
    # File.read closes the handle itself.
    content = File.read(txt_path)
  ensure
    FileUtils.remove_file(txt_path,true)
  end
  sanitize_for_epub_text(content)
end

#extract_toc_from_struct(struct) ⇒ Object



496
497
498
499
500
501
502
503
504
505
506
507
508
509
# File 'lib/extract_book_struct.rb', line 496

# Derives the table-of-contents tree from +struct+.
#
# Only Hash nodes are kept (plain text entries are skipped); each becomes
# {:title, :type, :children}, with :children built the same way from the
# node's own children.
def extract_toc_from_struct(struct)
  struct.select { |entry| entry.is_a?(Hash) }.map do |node|
    kids = node[:children]
    nested = kids.any? ? extract_toc_from_struct(kids) : []
    {:title=>node[:title], :type=>node[:type], :children=>nested}
  end
end

#from_epub(filename, options = {}) ⇒ Object



88
89
90
91
92
93
# File 'lib/extract_book_struct.rb', line 88

# Builds the book structure from an EPUB file: the file is converted to
# text, re-encoded to UTF-8 when detect_utf8 rejects it, split into
# paragraphs and passed to extract_book_struct.
def from_epub(filename,options={})
  raw = extract_text_from_file(filename,'.epub')
  utf8 = detect_utf8(raw) ? raw : to_utf8(raw)
  extract_book_struct(extract_paras(utf8),options)
end

#from_html(filename, options = {}) ⇒ Object



81
82
83
84
85
86
# File 'lib/extract_book_struct.rb', line 81

# Builds the book structure from an HTML file: the file is converted to
# text, re-encoded to UTF-8 when detect_utf8 rejects it, split into
# paragraphs and passed to extract_book_struct.
def from_html(filename,options={})
  raw = extract_text_from_file(filename,'.html')
  utf8 = detect_utf8(raw) ? raw : to_utf8(raw)
  extract_book_struct(extract_paras(utf8),options)
end

#from_txt(filename, options = {}) ⇒ Object



71
72
73
74
75
76
77
78
79
# File 'lib/extract_book_struct.rb', line 71

# Builds the book structure from a plain-text file.
#
# The file is read, re-encoded to UTF-8 when detect_utf8 rejects it,
# sanitized, split into paragraphs and passed to extract_book_struct.
def from_txt(filename,options={})
  # File.read closes the handle itself; File.open(...).read left it open
  # until garbage collection.
  content = File.read(filename)
  content = to_utf8(content) unless detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))
  extract_book_struct(paras,options)
end

#gen_docbook_content(struct) ⇒ Object



527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
# File 'lib/extract_book_struct.rb', line 527

# Renders +struct+ as DocBook body markup.
#
# Hash nodes become <part> (types 'volume'/'part') or an element named after
# their type (chapter-like types and sect1..sect5), each wrapping its
# recursively rendered children; nodes of any other type are skipped.
# Plain text entries become <para> elements, empty ones being omitted.
#
# Titles are XML-escaped just like paragraph text, so a heading containing
# "&" or "<" no longer yields malformed XML.
#
# Returns the pieces joined with newlines.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    unless item.is_a?(Hash)
      text = escape_html(clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
      next
    end

    children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
    title = escape_html(item[:title].to_s)
    case item[:type]
    when 'volume','part'
      content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
    when 'chapter','appendix','glossary','index','preface',
         'sect1','sect2','sect3','sect4','sect5'
      tag = item[:type]
      content << "<#{tag} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{tag}>"
    end
  end
  content.join("\n")
end

#gen_docbook_toc(toc) ⇒ Object



511
512
513
# File 'lib/extract_book_struct.rb', line 511

# Wraps the rendered <tocdiv> entries for +toc+ in a <toc> element with a
# fixed title.
def gen_docbook_toc(toc)
  entries = gen_docbook_tocdiv(toc)
  "<toc><title>Table of contents</title>#{entries}</toc>"
end

#gen_docbook_tocdiv(toc) ⇒ Object



515
516
517
518
519
520
521
522
523
524
525
# File 'lib/extract_book_struct.rb', line 515

# Renders +toc+ (Array of {:title, :children} Hashes) as nested <tocdiv>
# elements.
#
# Titles are XML-escaped, so a heading containing "&" or "<" no longer
# yields malformed XML.
#
# Returns the concatenated markup ("" for an empty list).
def gen_docbook_tocdiv(toc)
  entries = toc.map do |item|
    nested = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    "<tocdiv><title>#{escape_html(item[:title].to_s)}</title>#{nested}</tocdiv>"
  end
  entries.join("")
end

#guess_appendix?(text) ⇒ Boolean

Returns:

  • (Boolean)


330
331
332
333
334
335
336
337
# File 'lib/extract_book_struct.rb', line 330

# Guesses whether +text+ is an appendix heading ("附录", "附录 A",
# "Appendix", "Appendix B", ...).
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_appendix?(text)
  return false if hav_complete_sentence?(text)
  return true if text =~ /^附\s*录$/ || text =~ /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/

  lowered = text.downcase
  return true if lowered =~ /^appendix$/ || lowered =~ /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
end

#guess_chapter?(text) ⇒ Boolean

Returns:

  • (Boolean)


296
297
298
299
300
301
# File 'lib/extract_book_struct.rb', line 296

# Guesses whether +text+ is a chapter heading ("第X章", "第X回",
# "Chapter 1", ...).
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_chapter?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,4}[章回]/
    true
  elsif text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#guess_digital_head_line?(text) ⇒ Boolean

Returns:

  • (Boolean)


358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
# File 'lib/extract_book_struct.rb', line 358

# Guesses whether +text+ is a numbered heading such as "3 Title",
# "1.2 Title" or "1.10.3 Title".
#
# Every level of the outline number may have several digits; previously only
# one digit was accepted after each dot, so "1.10 Title" was never
# recognised.
#
# Returns :chapter for a single-level number, :sect<N> for N extra levels,
# false for complete sentences, empty titles or a leading number above 99,
# and nil when +text+ does not start with an outline number.
def guess_digital_head_line?(text)
  return false if hav_complete_sentence?(text)

  matcher = text.match(/(^\d+(?:\.\d+)*\s)(.*)/)
  return nil unless matcher
  return false if matcher[2].length == 0

  levels = matcher[1].split(".")
  # Leading numbers above 99 are treated as ordinary text, not chapter numbers.
  return false if levels[0].to_i > 99

  levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
end

#guess_digital_section?(text) ⇒ Boolean

Returns:

  • (Boolean)


348
349
350
351
352
353
354
355
356
# File 'lib/extract_book_struct.rb', line 348

# Guesses whether +text+ is a numbered section heading with at least two
# levels ("1.2 Title", "1.10.3 Title").
#
# The whole outline number is matched before the title, so trailing digits
# of the last level are no longer mistaken for a title ("1.12" on its own is
# not a heading), and the depth is counted on the number alone so dots in
# the title cannot inflate it.
#
# Returns :sect<N> where N is the number of dots in the outline number,
# false for complete sentences or empty titles, nil when there is no match.
def guess_digital_section?(text)
  return false if hav_complete_sentence?(text)

  matcher = text.match(/^((?:\d+\.)+\d+)\s*(.*)/)
  return nil unless matcher
  return false if matcher[2].length == 0

  "sect#{matcher[1].count('.')}".to_sym
end

#guess_glossary?(text) ⇒ Boolean

Returns:

  • (Boolean)


339
340
341
342
343
344
345
346
# File 'lib/extract_book_struct.rb', line 339

# Guesses whether +text+ is a glossary heading ("术语", "Glossary",
# optionally followed by a number).
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_glossary?(text)
  return false if hav_complete_sentence?(text)

  chinese = [/^术\s*语$/, /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [/^glossary$/, /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_head_line?(text) ⇒ Boolean

Returns:

  • (Boolean)


374
375
376
377
378
379
380
381
382
383
# File 'lib/extract_book_struct.rb', line 374

# Returns the heading type of +text+, or nil when it is not a heading.
#
# The predicates are tried in a fixed order and the first hit wins:
# volume, part, chapter, section, preface, appendix, index, glossary.
def guess_head_line?(text)
  checks = [
    [:volume,   :guess_volume?],
    [:part,     :guess_part?],
    [:chapter,  :guess_chapter?],
    [:section,  :guess_section?],
    [:preface,  :guess_preface?],
    [:appendix, :guess_appendix?],
    [:index,    :guess_index?],
    [:glossary, :guess_glossary?]
  ]
  checks.each do |type, predicate|
    return type if send(predicate,text)
  end
  nil
end

#guess_index?(text) ⇒ Boolean

Returns:

  • (Boolean)


321
322
323
324
325
326
327
328
# File 'lib/extract_book_struct.rb', line 321

# Guesses whether +text+ is an index heading ("索引", "Index", optionally
# followed by a number).
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_index?(text)
  return false if hav_complete_sentence?(text)

  chinese = [/^索\s*引$/, /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [/^index$/, /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_part?(text, options = {}) ⇒ Boolean

Returns:

  • (Boolean)


289
290
291
292
293
294
# File 'lib/extract_book_struct.rb', line 289

# Guesses whether +text+ is a part heading ("第X部", "第X篇", "Part 1", ...).
# +options+ is accepted but not used.
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_part?(text,options={})
  return false if hav_complete_sentence?(text)

  matched = text =~ /^第.{1,3}[部篇]/ ||
            text.downcase =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  true if matched
end

#guess_preface?(text) ⇒ Boolean

Returns:

  • (Boolean)


308
309
310
311
312
313
314
315
316
317
318
319
# File 'lib/extract_book_struct.rb', line 308

# Guesses whether +text+ is a preface heading ("前言", "序言", "序",
# "Preface", "Foreword", optionally followed by a number).
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_preface?(text)
  return false if hav_complete_sentence?(text)

  case text
  when /^前\s*言$/, /^序\s*言$/, /^序$/, /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    return true
  end

  case text.downcase
  when /^preface$/, /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/,
       /^foreword$/, /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#guess_section?(text) ⇒ Boolean

Returns:

  • (Boolean)


303
304
305
306
# File 'lib/extract_book_struct.rb', line 303

# Guesses whether +text+ is a section heading of the form "第X节".
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_section?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}[节]/
    true
  end
end

#guess_volume?(text, options = {}) ⇒ Boolean

Returns:

  • (Boolean)


282
283
284
285
286
287
# File 'lib/extract_book_struct.rb', line 282

# Guesses whether +text+ is a volume heading ("第X卷", "卷1", "Volume 2", ...).
# +options+ is accepted but not used.
#
# Returns false for complete sentences, true on a match, nil otherwise.
def guess_volume?(text,options={})
  return false if hav_complete_sentence?(text)

  case text
  when /^第.{1,3}卷/, /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  else
    true if text.downcase =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  end
end

#hav_complete_sentence?(text) ⇒ Boolean

Returns:

  • (Boolean)


491
492
493
494
# File 'lib/extract_book_struct.rb', line 491

# Tells whether +text+ looks like a sentence rather than a heading, i.e.
# whether it contains sentence punctuation (. 。 ! ? and the full-width
# forms of ! and ?).
#
# A leading outline number ("3 ", "1.2 ", "1.10.3 ") is removed first so its
# dots are not taken for punctuation. Every level may have several digits;
# previously "1.10 Title" kept its prefix and was classed as a sentence.
#
# Returns the index of the first punctuation mark (truthy) or nil.
def hav_complete_sentence?(text)
  body = text.gsub(/^\d+(\.\d+)*\s/,'')
  body =~ /[\.。!\?!?]/
end

#mark_digital_struct_info(content) ⇒ Object



233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
# File 'lib/extract_book_struct.rb', line 233

# Marks heading lines in a numbered-heading book.
#
# Empty entries are dropped. A line recognised by guess_head_line? or,
# failing that, by guess_digital_head_line? becomes {:title, :type}; every
# other line is kept as plain text.
def mark_digital_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_head_line?(text) || guess_digital_head_line?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end

#mark_hybrid_struct_info(content) ⇒ Object



213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# File 'lib/extract_book_struct.rb', line 213

# Marks heading lines in a book mixing textual headings with numbered
# sections.
#
# Empty entries are dropped. A line recognised by guess_head_line? or,
# failing that, by guess_digital_section? becomes {:title, :type}; every
# other line is kept as plain text.
def mark_hybrid_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_head_line?(text) || guess_digital_section?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end

#mark_struct_info(content) ⇒ Object

标注结构信息

将内容以行分割顺序存放在数组中,并对行猜测是否为结构信息,将猜测的结果以哈希的形式保存在数组中。


198
199
200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/extract_book_struct.rb', line 198

# Marks heading lines in a book that uses textual headings.
#
# Walks the lines in order and drops empty ones. A line recognised by
# guess_head_line? becomes {:title, :type}; every other line is kept as
# plain text.
def mark_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    type = guess_head_line?(text)
    marked << (type ? {:title=>text,:type=>type} : text)
  end
end

#revise_struct(struct) ⇒ Object

修正结构 TODO



254
255
256
# File 'lib/extract_book_struct.rb', line 254

# Hook for correcting the built structure. Not implemented yet (TODO):
# +struct+ is handed back unchanged.
def revise_struct(struct)
  struct
end

#sanitize_for_epub_text(content) ⇒ Object

sanitize_for_epub_text



569
570
571
572
573
574
575
576
577
578
579
# File 'lib/extract_book_struct.rb', line 569

# Returns +content+ truncated just before the first line that contains
# "document outline" (case-insensitive); that line and everything after it
# are discarded.
def sanitize_for_epub_text(content)
  kept = content.each_line.take_while do |line|
    !line.downcase.include?('document outline')
  end
  kept.join("")
end

#to_utf8(text, encoding = 'GB2312') ⇒ Object



553
554
555
556
557
558
559
# File 'lib/extract_book_struct.rb', line 553

# Converts +text+ from +encoding+ (GB2312 by default) to UTF-8.
#
# The bytes are reinterpreted as +encoding+ and transcoded with
# String#encode; byte sequences that are invalid or have no UTF-8 mapping
# are dropped. This replaces Iconv, which is no longer part of the standard
# library: where it is missing, the old code silently returned the text
# unconverted.
#
# Returns a new UTF-8 String; +text+ itself is not modified. If conversion
# fails (e.g. unknown encoding name) +text+ is returned as is.
def to_utf8(text,encoding='GB2312')
  text.dup.force_encoding(encoding).encode('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
rescue StandardError
  text
end