Module: BatchExtract

Extended by:
BatchExtract
Included in:
BatchExtract
Defined in:
lib/batch_extract.rb

Instance Method Summary collapse

Instance Method Details

#batch_extract_from_dir(source, destination, options = {}) ⇒ Object

batch_extract_from_dir

Batch-extract the book structure from a directory.

parameters:

+source+     source directory
+destination+   output directory
+options+        optional parameter.
   :format     指定需要提取结构的文件后缀名,例如要从所有txt文件中提取,通过:format=>'.txt'指定


14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/batch_extract.rb', line 14

# Extracts the book structure of every file found under +source+ and writes
# each result to "<basename>.xml" under +destination+, mirroring the
# directory layout of +source+.
#
# +source+::      source directory
# +destination+:: output directory
# +options+::     optional parameters. :format (e.g. <tt>:format => '.txt'</tt>)
#                 restricts the scan to files with that extension and is
#                 consumed here; all remaining entries are forwarded to the
#                 ExtractBookStruct.from_* call.
#
# Only '.html', '.txt' and '.epub' files are converted; any other extension
# produces no output file. Returns the list of scanned files.
def batch_extract_from_dir(source, destination, options = {})
  # Work on a copy so the caller's hash keeps its :format entry.
  extract_options = options.dup
  format = extract_options.delete(:format)

  source_root = Pathname.new(source.to_s)
  dest_root = Pathname.new(destination.to_s)

  files = scan_file_from_dir(source, { :format => format })
  files.each do |file|
    extname = File.extname(file)
    basename = File.basename(file, extname)

    # Strip only the leading source directory. A plain gsub(source, '') would
    # also remove later occurrences of the source string inside the path.
    # NOTE(review): assumes scanned paths are rooted at +source+ (both
    # relative or both absolute), as walk_dir produces them.
    relative = Pathname.new(file).relative_path_from(source_root)
    dest_dir = dest_root.join(relative).dirname
    dest_file = dest_dir.join("#{basename}.xml").to_s

    puts "start extract #{file} ..."
    docbook_xml = case extname
                  when '.html' then ExtractBookStruct.from_html(file, extract_options)
                  when '.txt'  then ExtractBookStruct.from_txt(file, extract_options)
                  when '.epub' then ExtractBookStruct.from_epub(file, extract_options)
                  end
    next unless docbook_xml

    # The mirrored sub-directory may not exist yet under the destination.
    dest_dir.mkpath
    File.open(dest_file, 'wb') { |out| out.write docbook_xml }
    puts "success: extract book struct  successfully!"
  end
end

#scan_file_from_dir(dir, options = {}) ⇒ Object

scan_file_from_dir 遍历目录下的文件 parameters:

+dir+       需遍历的目录
+options+   可选参数
   :format     指定需要遍历的文件后缀名,例如要遍历所有pdf文件,通过:format=>'.pdf'指定


51
52
53
54
55
56
57
# File 'lib/batch_extract.rb', line 51

# Walks +dir+ via walk_dir and returns every yielded entry as a String path.
#
# +dir+::     directory to traverse
# +options+:: optional parameters, passed through to walk_dir unchanged.
#             :format selects a file extension to keep, e.g.
#             <tt>:format => '.pdf'</tt> to list only pdf files.
def scan_file_from_dir(dir, options = {})
  [].tap do |paths|
    walk_dir(dir, options) { |entry| paths.push(entry.to_s) }
  end
end

#walk_dir(path_str, options = {}) ⇒ Object



59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/batch_extract.rb', line 59

# Recursively walks the directory +path_str+ and yields a Pathname for each
# regular file found.
#
# +path_str+:: directory to traverse (String or Pathname)
# +options+::  optional parameters. When :format is set (e.g.
#              <tt>:format => '.txt'</tt>), only files whose extension equals
#              it are yielded; otherwise every file is yielded.
#
# The same +options+ are applied at every depth, so the :format filter also
# holds inside sub-directories.
def walk_dir(path_str, options = {}, &block)
  format = options[:format]
  Pathname.new(path_str).children.each do |entry|
    if entry.directory?
      # Forward options so the extension filter is not lost when recursing.
      walk_dir(entry, options, &block)
    elsif entry.file?
      yield entry if !format || entry.extname == format
    end
  end
end