Module: ExtractBookStruct

Extended by:: ExtractBookStruct

Included in:: ExtractBookStruct

Defined in:: lib/extract_book_struct.rb

Instance Method Summary collapse

#build_doc_book(struct, options = {}) ⇒ Object
#build_struct(content) ⇒ Object
#clean_text(text) ⇒ Object

clean_text 获得干净的文本，去除两边的空格和回车.
#closed_node(struct, stack) ⇒ Object
#detect_struct_type(paras) ⇒ Object
#detect_utf8(content) ⇒ Object
#escape_html(text) ⇒ Object

escape_html 文本转义，在txt文本转html时需要使用.
#extract_book_struct(paras, options = {}) ⇒ Object
#extract_digital_book_struct(content, options = {}) ⇒ Object

从数字类型书中提取结构.
#extract_hybrid_book_struct(content, options = {}) ⇒ Object

从混合类型书中提取结构.
#extract_paras(content) ⇒ Object
#extract_text_book_struct(content, options = {}) ⇒ Object

从text类型书中提取结构.
#extract_text_from_file(filename, format) ⇒ Object
#extract_toc_from_struct(struct) ⇒ Object
#from_epub(filename, options = {}) ⇒ Object
#from_html(filename, options = {}) ⇒ Object
#from_txt(filename, options = {}) ⇒ Object
#gen_docbook_content(struct) ⇒ Object
#gen_docbook_toc(toc) ⇒ Object
#gen_docbook_tocdiv(toc) ⇒ Object
#guess_appendix?(text) ⇒ Boolean
#guess_chapter?(text) ⇒ Boolean
#guess_digital_head_line?(text) ⇒ Boolean
#guess_digital_section?(text) ⇒ Boolean
#guess_glossary?(text) ⇒ Boolean
#guess_head_line?(text) ⇒ Boolean
#guess_index?(text) ⇒ Boolean
#guess_part?(text, options = {}) ⇒ Boolean
#guess_preface?(text) ⇒ Boolean
#guess_section?(text) ⇒ Boolean
#guess_volume?(text, options = {}) ⇒ Boolean
#hav_complete_sentence?(text) ⇒ Boolean
#mark_digital_struct_info(content) ⇒ Object
#mark_hybrid_struct_info(content) ⇒ Object
#mark_struct_info(content) ⇒ Object

标注结构信息：将内容按行分割后顺序存放在数组中，并猜测每一行是否为结构信息，将猜测的结果以哈希的形式保存在数组中。
#revise_struct(struct) ⇒ Object

修正结构 TODO.
#sanitize_for_epub_text(content) ⇒ Object

sanitize_for_epub_text.
#to_utf8(text, encoding = 'GB2312') ⇒ Object

Instance Method Details

#build_doc_book(struct, options = {}) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 258

# Renders a parsed book structure as a DocBook 5 XML document.
#
# struct  - Array of nodes; only Hash entries (structural nodes) are passed
#           to the content generator, top-level plain-text entries are
#           skipped.
# options - Hash of book metadata. Keys read here: :title, :author,
#           :pubdate, :publisher. Missing keys render as empty elements.
#
# Returns the XML document as a String.
def build_doc_book(struct,options={})
  toc = extract_toc_from_struct(struct)
  doc_toc = gen_docbook_toc(toc)

  nodes = struct.select { |item| item.is_a?(Hash) }
  doc_content = gen_docbook_content(nodes)

  # Metadata is free text supplied by the caller; escape it so characters
  # such as "&" or "<" cannot produce malformed XML. nil becomes "".
  title     = escape_html(options[:title].to_s)
  author    = escape_html(options[:author].to_s)
  pubdate   = escape_html(options[:pubdate].to_s)
  publisher = escape_html(options[:publisher].to_s)

<<-EOS
<?xml version="1.0" encoding="utf-8"?>
  <book xmlns="http://docbook.org/ns/docbook" version="5.0">
  <info>
  <title>#{title}</title>
  <author>#{author}</author>
  <pubdate>#{pubdate}</pubdate>
  <publisher>#{publisher}</publisher>
  </info>
  #{doc_toc}
  #{doc_content}
  </book>
EOS
end

#build_struct(content) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 386

# Turns a flat list of marked lines into a nested tree of book nodes.
#
# content - Array whose entries are either plain text lines or Hashes of the
#           form {:title => ..., :type => ...} describing a heading.
#
# Heading types map to a fixed depth in an 8-slot stack of currently open
# nodes: volume(0), part(1), chapter/appendix/index/glossary/preface/
# afterword(2), sect1(3) ... sect5(7). Opening a heading first closes every
# open node at the same or a deeper level, attaching each to its nearest
# open ancestor (or to the result when it has none).
#
# Returns an Array of top-level entries: text lines that appeared before any
# heading, and node Hashes {:title, :type (String), :children}.
def build_struct(content)
  depth_of = {
    :volume => 0,
    :part => 1,
    :chapter => 2, :appendix => 2, :index => 2,
    :glossary => 2, :preface => 2, :afterword => 2,
    :sect1 => 3, :sect2 => 4, :sect3 => 5, :sect4 => 6, :sect5 => 7
  }
  # Chapter-level nodes that keep sect1 headings as plain text.
  flat_parents = ['preface','appendix','index','glossary']

  stack = Array.new(8)
  struct = []

  # Closes every open node from the deepest slot down to +depth+.
  # closed_node pops from the array it is given, so it always receives a
  # copy (stack[0..index]); handing it the live stack would remove the
  # parent node from the stack and lose that subtree.
  close_from = lambda do |depth|
    7.downto(depth) do |index|
      closed_node(struct, stack[0..index])
      stack[index] = nil
    end
  end

  # Appends text to the deepest open node, or to the result if none is open.
  append_text = lambda do |text|
    innermost = stack.compact.last
    (innermost ? innermost[:children] : struct) << text
  end

  content.each do |line|
    unless line.is_a?(Hash)
      append_text.call(line)
      next
    end

    type = line[:type].to_sym
    # :section is treated as a first-level section so it is not dropped.
    type = :sect1 if type == :section
    depth = depth_of[type]

    if depth.nil?
      # Unknown heading type: keep the line as ordinary text rather than
      # silently discarding it.
      append_text.call(line[:title]) if line[:title]
    elsif depth == 3 && stack[2] && flat_parents.include?(stack[2][:type])
      stack[2][:children] << line[:title]
    else
      close_from.call(depth)
      stack[depth] = {:title=>line[:title],:type=>type.to_s,:children=>[]}
    end
  end

  close_from.call(0)
  struct
end

#clean_text(text) ⇒ `Object`

clean_text

获得干净的文本，去除两边的空格和回车

# File 'lib/extract_book_struct.rb', line 583

# Normalizes a line of text: strips leading/trailing whitespace and then
# removes every remaining "\n" character.
#
# text - a String, or nil.
#
# Returns the cleaned String; nil is passed through unchanged.
def clean_text(text)
  text.nil? ? text : text.strip.delete("\n")
end

#closed_node(struct, stack) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 473

# Closes the node at the top of +stack+ by attaching it to its nearest
# non-nil ancestor further down the stack, or to +struct+ when there is none.
#
# struct - Array collecting top-level nodes.
# stack  - Array of open nodes (Hashes with a :children Array) and nils.
#          It is modified: the top entry is popped, and further entries are
#          popped while searching for the parent.
#
# Returns +struct+ when the node was appended to it, nil otherwise.
def closed_node(struct,stack)
  node = stack.pop
  return nil unless node

  # Pop until a non-nil ancestor turns up or only nils remain.
  parent = nil
  parent = stack.pop until parent || stack.none?

  if parent
    parent[:children] << node
    nil
  else
    struct << node
  end
end

#detect_struct_type(paras) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 129

# Classifies how a book marks its headings.
#
# paras - Array of paragraph Strings.
#
# Returns :text when some paragraph looks like a volume/part/chapter/section
# heading, :digital when some paragraph looks like a numbered heading,
# :hybrid when both kinds occur, and :unknown when neither does.
def detect_struct_type(paras)
  has_text = paras.any? do |para|
    guess_volume?(para) || guess_part?(para) || guess_chapter?(para) || guess_section?(para)
  end
  has_digital = paras.any? { |para| guess_digital_head_line?(para) }

  return :hybrid if has_text && has_digital
  return :text if has_text
  return :digital if has_digital
  :unknown
end

#detect_utf8(content) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 561

# Probes whether +content+ can be processed line by line without error.
#
# Each line is stripped purely for the side effect: any StandardError raised
# along the way (including content that does not respond to each_line) makes
# the method answer false.
# NOTE(review): this presumably relies on String#strip raising for byte
# sequences that are invalid in the string's encoding - confirm for the
# Ruby versions in use.
#
# Returns true when every line could be stripped, false otherwise.
def detect_utf8(content)
  begin
    content.each_line(&:strip)
    true
  rescue StandardError
    false
  end
end

#escape_html(text) ⇒ `Object`

escape_html 文本转义，在txt文本转html时需要使用



591
592
593

# File 'lib/extract_book_struct.rb', line 591

# Escapes text for embedding in markup by delegating to CGI::escapeHTML.
# Needed when plain txt content is converted to markup.
#
# text - the String to escape.
#
# Returns the value produced by CGI::escapeHTML for +text+.
def escape_html(text)
  CGI::escapeHTML(text)
end

#extract_book_struct(paras, options = {}) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 95

# Extracts the book structure from paragraphs, dispatching on book format.
#
# paras   - Array of paragraph Strings.
# options - Hash; :format (:text, :digital or :hybrid) overrides detection,
#           and the whole Hash is forwarded to the chosen extractor.
#
# Returns the chosen extractor's result. For any other format a warning is
# printed to stdout and nil is returned.
def extract_book_struct(paras,options={})
  # Determine the book type (text, digital, hybrid).
  format = options[:format] || detect_struct_type(paras)

  extractors = {
    :text    => :extract_text_book_struct,
    :digital => :extract_digital_book_struct,
    :hybrid  => :extract_hybrid_book_struct
  }
  extractor = extractors[format]

  if extractor.nil?
    puts "警告: 没有检测到书结构信息."
    return nil
  end

  send(extractor, paras, options)
end

#extract_digital_book_struct(content, options = {}) ⇒ `Object`

从数字类型书中提取结构

# File 'lib/extract_book_struct.rb', line 169

# Extracts the structure of a book whose headings are numbered ("digital").
#
# content - Array of paragraph Strings.
# options - Hash forwarded to build_doc_book.
#
# Pipeline: mark headings -> build the tree -> revise it -> render DocBook.
#
# Returns the result of build_doc_book.
def extract_digital_book_struct(content,options={})
  struct = build_struct(mark_digital_struct_info(content))
  build_doc_book(revise_struct(struct), options)
end

#extract_hybrid_book_struct(content, options = {}) ⇒ `Object`

从混合类型书中提取结构

# File 'lib/extract_book_struct.rb', line 183

# Extracts the structure of a book that mixes textual and numbered headings.
#
# content - Array of paragraph Strings.
# options - Hash forwarded to build_doc_book.
#
# Pipeline: mark headings -> build the tree -> revise it -> render DocBook.
#
# Returns the result of build_doc_book.
def extract_hybrid_book_struct(content,options={})
  struct = build_struct(mark_hybrid_struct_info(content))
  build_doc_book(revise_struct(struct), options)
end

#extract_paras(content) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 120

# Splits content into cleaned, non-empty paragraphs.
#
# content - a String; each of its lines is passed through clean_text.
#
# Returns an Array of the cleaned lines, omitting those that end up empty.
def extract_paras(content)
  content.each_line.map { |line| clean_text(line) }.reject { |text| text.length.zero? }
end

#extract_text_book_struct(content, options = {}) ⇒ `Object`

从text类型书中提取结构

# File 'lib/extract_book_struct.rb', line 154

# Extracts the structure of a book whose headings are textual.
#
# content - Array of paragraph Strings.
# options - Hash forwarded to build_doc_book.
#
# Pipeline: mark headings -> build the tree -> revise it -> render DocBook.
#
# Returns the result of build_doc_book.
def extract_text_book_struct(content,options={})
  struct = build_struct(mark_struct_info(content))
  build_doc_book(revise_struct(struct), options)
end

#extract_text_from_file(filename, format) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 111

# Converts an e-book file to plain text with the external `ebook-convert`
# command and returns the sanitized text.
#
# filename - path of the source file.
# format   - the file's extension (e.g. '.epub'); it is stripped from the
#            base name to build the temporary "<basename>.txt" output file,
#            which is written to the current working directory.
#
# Returns the converted text after sanitize_for_epub_text.
# Raises whatever IO.popen / File.read raise, e.g. when the command is not
# installed or produced no output file.
def extract_text_from_file(filename,format)
  txt_path = "#{File.basename(filename,format)}.txt"
  begin
    # Array form runs the program directly, without a shell, so spaces or
    # shell metacharacters in the file name cannot break or alter the
    # command. The command's stdout is read and discarded.
    IO.popen(['ebook-convert', filename, txt_path]) { |io| io.read }
    # File.read closes the file handle for us.
    content = File.read(txt_path)
  ensure
    # Remove the temporary file even when conversion or reading failed.
    FileUtils.remove_file(txt_path,true)
  end
  sanitize_for_epub_text(content)
end

#extract_toc_from_struct(struct) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 496

# Builds a table-of-contents tree from a book structure.
#
# struct - Array of nodes; Hash entries are headings with :title, :type and
#          a :children Array, anything else is ignored.
#
# Returns an Array of Hashes {:title, :type, :children}, where :children
# holds the nested headings only.
def extract_toc_from_struct(struct)
  struct.each_with_object([]) do |node, toc|
    next unless node.is_a?(Hash)

    kids = node[:children]
    toc << {
      :title => node[:title],
      :type => node[:type],
      :children => kids.any? ? extract_toc_from_struct(kids) : []
    }
  end
end

#from_epub(filename, options = {}) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 88

# Extracts the book structure from an EPUB file.
#
# filename - path of the .epub file.
# options  - Hash forwarded to extract_book_struct.
#
# The file is converted to text, run through to_utf8 when detect_utf8
# rejects it, split into paragraphs and then analysed.
#
# Returns the result of extract_book_struct.
def from_epub(filename,options={})
  text = extract_text_from_file(filename,'.epub')
  text = to_utf8(text) unless detect_utf8(text)
  extract_book_struct(extract_paras(text), options)
end

#from_html(filename, options = {}) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 81

# Extracts the book structure from an HTML file.
#
# filename - path of the .html file.
# options  - Hash forwarded to extract_book_struct.
#
# The file is converted to text, run through to_utf8 when detect_utf8
# rejects it, split into paragraphs and then analysed.
#
# Returns the result of extract_book_struct.
def from_html(filename,options={})
  text = extract_text_from_file(filename,'.html')
  text = to_utf8(text) unless detect_utf8(text)
  extract_book_struct(extract_paras(text), options)
end

#from_txt(filename, options = {}) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 71

# Extracts the book structure from a plain-text file.
#
# filename - path of the text file.
# options  - Hash forwarded to extract_book_struct.
#
# The content is run through to_utf8 when detect_utf8 rejects it, then
# sanitized, split into paragraphs and analysed.
#
# Returns the result of extract_book_struct.
def from_txt(filename,options={})
  # File.read opens, reads and closes the file; File.open(...).read left the
  # handle open until garbage collection.
  content = File.read(filename)
  content = to_utf8(content) unless detect_utf8(content)
  paras = extract_paras(sanitize_for_epub_text(content))
  extract_book_struct(paras,options)
end

#gen_docbook_content(struct) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 527

# Renders a book structure as DocBook body markup.
#
# struct - Array of nodes. Hash entries are headings ({:title, :type,
#          :children}); every other entry is treated as paragraph text.
#
# 'volume' and 'part' nodes become <part>; chapter-level and sect1..sect5
# nodes become an element named after their type. Hash nodes of any other
# type produce no output. Each element gets a fresh UUID.
#
# Returns the markup as a String, elements joined by newlines.
def gen_docbook_content(struct)
  content = []
  struct.each do |item|
    unless item.is_a?(Hash)
      text = escape_html(clean_text(item))
      content << "<para id='#{UUID.generate}'>#{text}</para>" if text.length > 0
      next
    end

    children = item[:children].any? ? gen_docbook_content(item[:children]) : ""
    # Titles are raw text taken from the book, so they need the same
    # escaping as paragraph text; "&" or "<" would otherwise break the XML.
    title = escape_html(item[:title].to_s)

    case item[:type]
    when 'volume','part'
      content << "<part label='#{UUID.generate}'>\n<info><title>#{title}</title>\n</info>#{children}\n</part>"
    when 'chapter','appendix','glossary','index','preface',
         'sect1','sect2','sect3','sect4','sect5'
      content << "<#{item[:type]} id='#{UUID.generate}'>\n<info>\n<title>#{title}</title>\n</info>#{children}\n</#{item[:type]}>"
    end
  end
  content.join("\n")
end

#gen_docbook_toc(toc) ⇒ `Object`



511
512
513

# File 'lib/extract_book_struct.rb', line 511

# Wraps the rendered table-of-contents entries in a DocBook <toc> element
# with the fixed title "Table of contents".
#
# toc - the table-of-contents tree, passed unchanged to gen_docbook_tocdiv.
#
# Returns the <toc> markup as a String.
def gen_docbook_toc(toc)
  "<toc><title>Table of contents</title>#{gen_docbook_tocdiv(toc)}</toc>"
end

#gen_docbook_tocdiv(toc) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 515

# Renders table-of-contents entries as nested DocBook <tocdiv> elements.
#
# toc - Array of Hashes with :title and a :children Array of the same shape.
#
# Returns the concatenated markup as a String ("" for an empty Array).
def gen_docbook_tocdiv(toc)
  toc.map do |item|
    children = item[:children].any? ? gen_docbook_tocdiv(item[:children]) : ""
    # Titles are raw book text; escape them so "&" or "<" cannot produce
    # malformed XML.
    "<tocdiv><title>#{escape_html(item[:title].to_s)}</title>#{children}</tocdiv>"
  end.join("")
end

#guess_appendix?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 330

# Guesses whether a line is an appendix heading.
#
# text - the line to test.
#
# Matches "附录" (optionally followed by a number or letter) and,
# case-insensitively, "appendix" (likewise).
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_appendix?(text)
  return false if hav_complete_sentence?(text)

  chinese = [
    /^附\s*录$/,
    /^附\s*录\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
  ]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [
    /^appendix$/,
    /^appendix\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIiA-Za-z]/
  ]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_chapter?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 296

# Guesses whether a line is a chapter heading.
#
# text - the line to test.
#
# Matches "第…章" / "第…回" (1-4 characters in between) and,
# case-insensitively, "chapter" followed by a digit or roman numeral.
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_chapter?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,4}[章回]/
    true
  elsif text.downcase =~ /^chapter\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#guess_digital_head_line?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 358

# Guesses whether a line is a numbered heading such as "3 Title" or
# "3.10.2 Title" and derives its level from the number.
#
# text - the line to test.
#
# The numeric prefix must be followed by whitespace and a non-empty title.
# Each dotted component may have several digits (e.g. "1.10"); the original
# pattern accepted only one digit after each dot. A leading number above 99
# is rejected.
#
# Returns :chapter for a single number, :sect<N> for N dots, false when the
# line reads as a complete sentence, has no title or has a leading number
# above 99, and nil when the line has no numeric prefix.
def guess_digital_head_line?(text)
  return false if hav_complete_sentence?(text)

  matcher = text.match(/(^\d+(\.\d+)*\s)(.*)/)
  return nil unless matcher
  return false if matcher[3].length == 0

  levels = matcher[1].split(".")
  return false if levels[0].to_i > 99

  levels.count == 1 ? :chapter : "sect#{levels.count - 1}".to_sym
end

#guess_digital_section?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 348

# Guesses whether a line is a dotted-number section heading such as
# "1.2 Title" and derives its level.
#
# text - the line to test.
#
# The level is the number of "."-separated pieces in the matched text,
# minus one.
#
# Returns :sect<level> on a match, false when the line reads as a complete
# sentence or has nothing after the number, and nil when the pattern does
# not match.
def guess_digital_section?(text)
  return false if hav_complete_sentence?(text)

  matcher = text.match(/^(\d+\.)+[\d]\s*(.*)/)
  return nil if matcher.nil?
  return false if matcher[2].empty?

  dots = matcher[0].split(".").count - 1
  :"sect#{dots}"
end

#guess_glossary?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 339

# Guesses whether a line is a glossary heading.
#
# text - the line to test.
#
# Matches "术语" (optionally followed by a digit or roman numeral) and,
# case-insensitively, "glossary" (likewise).
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_glossary?(text)
  return false if hav_complete_sentence?(text)

  chinese = [
    /^术\s*语$/,
    /^术\s*语\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [
    /^glossary$/,
    /^glossary\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_head_line?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 374

# Guesses which kind of textual heading a line is.
#
# text - the line to test.
#
# The individual guessers are tried in a fixed order and the first one that
# answers truthy wins.
#
# Returns one of :volume, :part, :chapter, :section, :preface, :appendix,
# :index, :glossary, or nil when no guesser matches.
def guess_head_line?(text)
  if    guess_volume?(text)   then :volume
  elsif guess_part?(text)     then :part
  elsif guess_chapter?(text)  then :chapter
  elsif guess_section?(text)  then :section
  elsif guess_preface?(text)  then :preface
  elsif guess_appendix?(text) then :appendix
  elsif guess_index?(text)    then :index
  elsif guess_glossary?(text) then :glossary
  end
end

#guess_index?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 321

# Guesses whether a line is an index heading.
#
# text - the line to test.
#
# Matches "索引" (optionally followed by a digit or roman numeral) and,
# case-insensitively, "index" (likewise).
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_index?(text)
  return false if hav_complete_sentence?(text)

  chinese = [
    /^索\s*引$/,
    /^索\s*引\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [
    /^index$/,
    /^index\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_part?(text, options = {}) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 289

# Guesses whether a line is a part heading.
#
# text    - the line to test.
# options - accepted but not read by this method.
#
# Matches "第…部" / "第…篇" (1-3 characters in between) and,
# case-insensitively, "part" followed by a digit or roman numeral.
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_part?(text,options={})
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}[部篇]/
    true
  elsif text.downcase =~ /^part\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#guess_preface?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 308

# Guesses whether a line is a preface/foreword heading.
#
# text - the line to test.
#
# Matches "前言", "序言", "序", "序言" followed by a digit or roman numeral,
# and, case-insensitively, "preface" / "foreword" (alone or followed by a
# digit or roman numeral).
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_preface?(text)
  return false if hav_complete_sentence?(text)

  chinese = [
    /^前\s*言$/,
    /^序\s*言$/,
    /^序$/,
    /^序[言]\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  return true if chinese.any? { |pattern| text =~ pattern }

  lowered = text.downcase
  english = [
    /^preface$/,
    /^preface\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/,
    /^foreword$/,
    /^foreword\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
  ]
  true if english.any? { |pattern| lowered =~ pattern }
end

#guess_section?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 303

# Guesses whether a line is a section heading of the form "第…节"
# (1-3 characters in between).
#
# text - the line to test.
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_section?(text)
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}[节]/
    true
  end
end

#guess_volume?(text, options = {}) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 282

# Guesses whether a line is a volume heading.
#
# text    - the line to test.
# options - accepted but not read by this method.
#
# Matches "第…卷" (1-3 characters in between), "卷" followed by a digit or
# roman numeral, and, case-insensitively, "volume" followed by a digit or
# roman numeral.
#
# Returns false when the line reads as a complete sentence, true on a match,
# nil otherwise.
def guess_volume?(text,options={})
  if hav_complete_sentence?(text)
    false
  elsif text =~ /^第.{1,3}卷/ || text =~ /^卷\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  elsif text.downcase =~ /^volume\s*[\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹIi]/
    true
  end
end

#hav_complete_sentence?(text) ⇒ `Boolean`

Returns:

(Boolean)

# File 'lib/extract_book_struct.rb', line 491

# Tells whether a line contains sentence punctuation and therefore reads as
# running text rather than a heading.
#
# text - the line to test.
#
# A leading section number followed by whitespace ("3 ", "1.2 ", "1.10 ") is
# removed first so that its dots do not count as punctuation. Each dotted
# component may have several digits; the original pattern allowed only one
# digit after each dot, which made headings like "1.10 Title" look like
# sentences.
#
# Returns the index of the first ., 。, !, ?, ！ or ？ in the remaining text
# (truthy), or nil when there is none.
def hav_complete_sentence?(text)
  text = text.gsub(/^\d+(\.\d+)*\s/,'')
  text =~ /[\.。!\?！？]/
end

#mark_digital_struct_info(content) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 233

# Marks heading lines for books with numbered headings.
#
# content - Array of line Strings.
#
# Empty lines are dropped. A line recognised by guess_head_line?, or failing
# that by guess_digital_head_line?, is replaced by
# {:title => line, :type => guessed_type}; other lines are kept as they are.
#
# Returns the new Array.
def mark_digital_struct_info(content)
  content.reject { |text| text.length.zero? }.map do |text|
    kind = guess_head_line?(text) || guess_digital_head_line?(text)
    kind ? {:title=>text,:type=>kind} : text
  end
end

#mark_hybrid_struct_info(content) ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 213

# Marks heading lines for books mixing textual and numbered headings.
#
# content - Array of line Strings.
#
# Empty lines are dropped. A line recognised by guess_head_line?, or failing
# that by guess_digital_section?, is replaced by
# {:title => line, :type => guessed_type}; other lines are kept as they are.
#
# Returns the new Array.
def mark_hybrid_struct_info(content)
  content.each_with_object([]) do |text, marked|
    next if text.length == 0

    kind = guess_head_line?(text) || guess_digital_section?(text)
    marked << (kind ? {:title=>text,:type=>kind} : text)
  end
end

#mark_struct_info(content) ⇒ `Object`

标注结构信息

将内容以行分割顺序存放在数组中，并对行猜测是否为结构信息，将猜测的结果以哈希的形式保存在数组中。

# File 'lib/extract_book_struct.rb', line 198

# Marks structural information in a list of lines.
#
# content - Array of line Strings.
#
# Empty lines are dropped. A line recognised by guess_head_line? is replaced
# by {:title => line, :type => guessed_type}; other lines are kept as they
# are, in the original order.
#
# Returns the new Array.
def mark_struct_info(content)
  non_empty = content.select { |text| text.length > 0 }
  non_empty.map do |text|
    kind = guess_head_line?(text)
    kind ? {:title=>text,:type=>kind} : text
  end
end

#revise_struct(struct) ⇒ `Object`

修正结构 TODO



254
255
256

# File 'lib/extract_book_struct.rb', line 254

# Placeholder for a structure-correction pass (marked TODO in the original
# documentation). Currently performs no correction.
#
# struct - the book structure.
#
# Returns +struct+ unchanged.
def revise_struct(struct)
  struct
end

#sanitize_for_epub_text(content) ⇒ `Object`

sanitize_for_epub_text

# File 'lib/extract_book_struct.rb', line 569

# Truncates converted text at the first line mentioning "document outline"
# (case-insensitive).
#
# content - the text as a String.
#
# Returns a String with every line before that marker line; the marker line
# and everything after it are dropped. Without a marker the text is
# returned whole.
def sanitize_for_epub_text(content)
  kept = content.each_line.take_while do |line|
    !line.downcase.include?('document outline')
  end
  kept.join("")
end

#to_utf8(text, encoding = 'GB2312') ⇒ `Object`

# File 'lib/extract_book_struct.rb', line 553

# Converts text from a legacy encoding to UTF-8 on a best-effort basis.
#
# text     - the String to convert; its bytes are interpreted as +encoding+.
# encoding - name of the source encoding (default 'GB2312').
#
# Bytes that are invalid in the source encoding, or that have no UTF-8
# equivalent, are dropped.
#
# Returns the converted String. If conversion fails for any reason (unknown
# encoding name, nil input, ...) the original +text+ is returned unchanged.
def to_utf8(text,encoding='GB2312')
  unless text.respond_to?(:encode)
    # Rubies without String#encode: keep the original Iconv-based path.
    return Iconv.iconv('UTF-8//IGNORE',"#{encoding}//IGNORE",text).join("")
  end

  # Iconv is not part of the standard library in current Ruby, where the
  # NameError it caused was swallowed by the rescue below and the text came
  # back unconverted. String#encode does the same conversion natively.
  text.encode('UTF-8', encoding, :invalid => :replace, :undef => :replace, :replace => '')
rescue
  # Deliberate best effort: fall back to the untouched input.
  text
end

Module: ExtractBookStruct

Instance Method Summary collapse

Instance Method Details

#build_doc_book(struct, options = {}) ⇒ Object

#build_struct(content) ⇒ Object

#clean_text(text) ⇒ Object

#closed_node(struct, stack) ⇒ Object

#detect_struct_type(paras) ⇒ Object

#detect_utf8(content) ⇒ Object

#escape_html(text) ⇒ Object

#extract_book_struct(paras, options = {}) ⇒ Object

#extract_digital_book_struct(content, options = {}) ⇒ Object

#extract_hybrid_book_struct(content, options = {}) ⇒ Object

#extract_paras(content) ⇒ Object

#extract_text_book_struct(content, options = {}) ⇒ Object

#extract_text_from_file(filename, format) ⇒ Object

#extract_toc_from_struct(struct) ⇒ Object

#from_epub(filename, options = {}) ⇒ Object

#from_html(filename, options = {}) ⇒ Object

#from_txt(filename, options = {}) ⇒ Object

#gen_docbook_content(struct) ⇒ Object

#gen_docbook_toc(toc) ⇒ Object

#gen_docbook_tocdiv(toc) ⇒ Object

#guess_appendix?(text) ⇒ Boolean

#guess_chapter?(text) ⇒ Boolean

#guess_digital_head_line?(text) ⇒ Boolean

#guess_digital_section?(text) ⇒ Boolean

#guess_glossary?(text) ⇒ Boolean

#guess_head_line?(text) ⇒ Boolean

#guess_index?(text) ⇒ Boolean

#guess_part?(text, options = {}) ⇒ Boolean

#guess_preface?(text) ⇒ Boolean

#guess_section?(text) ⇒ Boolean

#guess_volume?(text, options = {}) ⇒ Boolean

#hav_complete_sentence?(text) ⇒ Boolean

#mark_digital_struct_info(content) ⇒ Object

#mark_hybrid_struct_info(content) ⇒ Object

#mark_struct_info(content) ⇒ Object

#revise_struct(struct) ⇒ Object

#sanitize_for_epub_text(content) ⇒ Object

#to_utf8(text, encoding = 'GB2312') ⇒ Object

#build_doc_book(struct, options = {}) ⇒ `Object`

#build_struct(content) ⇒ `Object`

#clean_text(text) ⇒ `Object`

#closed_node(struct, stack) ⇒ `Object`

#detect_struct_type(paras) ⇒ `Object`

#detect_utf8(content) ⇒ `Object`

#escape_html(text) ⇒ `Object`

#extract_book_struct(paras, options = {}) ⇒ `Object`

#extract_digital_book_struct(content, options = {}) ⇒ `Object`

#extract_hybrid_book_struct(content, options = {}) ⇒ `Object`

#extract_paras(content) ⇒ `Object`

#extract_text_book_struct(content, options = {}) ⇒ `Object`

#extract_text_from_file(filename, format) ⇒ `Object`

#extract_toc_from_struct(struct) ⇒ `Object`

#from_epub(filename, options = {}) ⇒ `Object`

#from_html(filename, options = {}) ⇒ `Object`

#from_txt(filename, options = {}) ⇒ `Object`

#gen_docbook_content(struct) ⇒ `Object`

#gen_docbook_toc(toc) ⇒ `Object`

#gen_docbook_tocdiv(toc) ⇒ `Object`

#guess_appendix?(text) ⇒ `Boolean`

#guess_chapter?(text) ⇒ `Boolean`

#guess_digital_head_line?(text) ⇒ `Boolean`

#guess_digital_section?(text) ⇒ `Boolean`

#guess_glossary?(text) ⇒ `Boolean`

#guess_head_line?(text) ⇒ `Boolean`

#guess_index?(text) ⇒ `Boolean`

#guess_part?(text, options = {}) ⇒ `Boolean`

#guess_preface?(text) ⇒ `Boolean`

#guess_section?(text) ⇒ `Boolean`

#guess_volume?(text, options = {}) ⇒ `Boolean`

#hav_complete_sentence?(text) ⇒ `Boolean`

#mark_digital_struct_info(content) ⇒ `Object`

#mark_hybrid_struct_info(content) ⇒ `Object`

#mark_struct_info(content) ⇒ `Object`

#revise_struct(struct) ⇒ `Object`

#sanitize_for_epub_text(content) ⇒ `Object`

#to_utf8(text, encoding = 'GB2312') ⇒ `Object`