Module: PDF
Instance Method Summary collapse
-
#extract_illustrations(filename, options = {}) ⇒ Object
extract_illustrations 提取pdf文件中的插图 parameters:
filename
pdf文件options
可选参数dir
插图存放的目录,默认存放在当前目录下与filename同名的子目录下。. - #extract_page_illustrations(illustrations, index) ⇒ Object
-
#extract_pdf_meta(filename) ⇒ Object
extract_pdf_meta 提取pdf元数据 parameters:
filename
pdf文件. -
#extract_pdf_pages_text(filename) ⇒ Object
extract_pdf_pages_text 提取pdf中页文本内容 parameters:
filename
pdf文件. -
#extract_sections(filename) ⇒ Object
extract_sections 提取pdf文件的大纲 parameters:
filename
pdf文件. - #fixed_break_of_cross_page(pages, length = 80) ⇒ Object
- #fixed_break_with_pages_text(pages_text) ⇒ Object
- #gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object
- #gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object
- #gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object
- #guess_footer_row_count(pages_text) ⇒ Object
-
#guess_header_line?(lines) ⇒ Boolean
(also: #guess_footer_line?)
猜测是否是页眉/页脚行 猜测规则: 1.
-
#guess_header_row_count(pages_text) ⇒ Object
猜测页眉/页脚的行数 页眉页脚有一定的规律: 1.
-
#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ Object
sanitize_page_header_and_footer 清洗页眉页脚 parameters:
pdf_pages_text
pdf文件页文本内容集合options
可选参数 :header_rows_count 指定页眉行数 :footer_rows_count 指定页脚行数. -
#scan_pdf?(filename) ⇒ Boolean
scan_pdf? 检查指定的文件是否为扫描版pdf parameters:
filename
pdf文件. - #walk_index(indexer, sections) ⇒ Object
- #work_index(child, sections) ⇒ Object
Methods included from Utils
#breaklines, #clean_text, #detect_sections_from_html, #detect_utf8, #end_mark?, #escape_html, #extract_keywords_from_path, #extract_text_from_file, #fixed_page_break, #guess_content_line_length, #line_closed?, #make_destination_dir, #merge_para_part, #scan_file_from_dir, #source_exists?, #text_similarity, #text_to_array, #timer, #to_utf8, #walk_dir, #wrapper_html, #write_file
Instance Method Details
#extract_illustrations(filename, options = {}) ⇒ Object
extract_illustrations
提取pdf文件中的插图
parameters:
+filename+ pdf文件
+options+ 可选参数
+dir+ 插图存放的目录,默认存放在当前目录下与filename同名的子目录下。
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
# File 'lib/pdf.rb', line 86
#
# extract_illustrations
# Extracts the illustrations of a PDF file by running the external
# `pdfimages` tool and converting its PPM output to PNG with `mogrify`.
# parameters:
# +filename+ path of the PDF file
# +options+ optional settings
#   +dir+ directory that receives the images; defaults to a directory named
#         after +filename+ (without ".pdf"), relative to the working directory
# returns:
# the names of all PNG files found in the target directory, relative to it
#
# NOTE(review): the rendered listing lost the +options+ identifier; it is
# restored here from the documented signature.
def extract_illustrations(filename, options = {})
  tmp_dir = options[:dir] || File.basename(filename, '.pdf')
  Dir.mkdir(tmp_dir) unless Dir.exist?(tmp_dir)

  # The argument-list form of `system` bypasses the shell, so quotes or other
  # metacharacters in the file name cannot break the command.
  system('pdfimages', '-p', filename, "#{tmp_dir}/")
  # The wildcard is handed to mogrify verbatim (the original quoted it as
  # well), leaving its expansion to mogrify itself.
  system('mogrify', '-format', 'png', "#{tmp_dir}/*.ppm")

  # The block form of chdir restores the previous working directory even when
  # globbing raises.
  Dir.chdir(tmp_dir) { Dir.glob('*.png') }
end
#extract_page_illustrations(illustrations, index) ⇒ Object
116 117 118 119 120 121 122 123 124 |
# File 'lib/pdf.rb', line 116
#
# extract_page_illustrations
# Picks the illustrations that belong to one page. The page number is read
# from the second "-"-separated field of each name; a name without such a
# field counts as page 0 because nil.to_i is 0.
# parameters:
# +illustrations+ list of image file names
# +index+ page number to match
# returns:
# the matching names, in their original order
def extract_page_illustrations(illustrations, index)
  illustrations.select do |image_path|
    image_path.split('-')[1].to_i == index
  end
end
#extract_pdf_meta(filename) ⇒ Object
extract_pdf_meta
提取pdf元数据
parameters:
+filename+ pdf文件
58 59 60 61 62 63 64 |
# File 'lib/pdf.rb', line 58
#
# extract_pdf_meta
# Reads the metadata of a PDF file through Poppler.
# parameters:
# +filename+ path of the PDF file
# returns:
# a Hash with the keys :author and :title
#
# NOTE(review): the rendered listing lost several identifiers (the method
# name, the local hash and the author accessor). They are reconstructed from
# the documented signature and the hash keys, and the hash is assumed to be
# the return value -- confirm against lib/pdf.rb.
def extract_pdf_meta(filename)
  pdf = Poppler::Document.new(filename)
  { :author => pdf.author, :title => pdf.title }
end
#extract_pdf_pages_text(filename) ⇒ Object
extract_pdf_pages_text
提取pdf中页文本内容
parameters:
+filename+ pdf文件
25 26 27 28 29 30 31 32 33 |
# File 'lib/pdf.rb', line 25
#
# extract_pdf_pages_text
# Extracts the text of every page of a PDF file with PDF::Reader.
# parameters:
# +filename+ path of the PDF file
# returns:
# an Array holding one text String per page, in page order
def extract_pdf_pages_text(filename)
  PDF::Reader.new(filename).pages.map(&:text)
end
#extract_sections(filename) ⇒ Object
extract_sections
提取pdf文件的大纲
parameters:
+filename+ pdf文件
70 71 72 73 74 75 76 77 78 |
# File 'lib/pdf.rb', line 70
#
# extract_sections
# Extracts the outline (table of contents) of a PDF file through Poppler.
# parameters:
# +filename+ path of the PDF file
# returns:
# an Array of section hashes as filled in by walk_index; whatever was
# collected before an error is still returned
def extract_sections(filename)
  outline = []
  begin
    document = Poppler::Document.new(filename)
    walk_index(Poppler::IndexIter.new(document), outline)
  rescue StandardError
    # Best effort: any failure while reading the outline is swallowed and the
    # sections gathered so far are returned.
  end
  outline
end
#fixed_break_of_cross_page(pages, length = 80) ⇒ Object
175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
# File 'lib/pdf.rb', line 175
#
# fixed_break_of_cross_page
# Repairs paragraphs that were cut by a page boundary: when the last line of
# a page does not end a paragraph (per Utils.end_mark?), the first line of the
# following page is merged into it with Utils.merge_para_part and removed
# from that following page.
# parameters:
# +pages+ list of page texts; modified in place
# +length+ accepted for interface compatibility, not used by this method
# returns:
# the same +pages+ array
def fixed_break_of_cross_page(pages, length = 80)
  # Adjacent pairs are visited by index rather than each_cons: a merge rewrites
  # pages[i + 1], and the next iteration must see that rewritten text.
  (pages.count - 1).times do |i|
    head_lines = pages[i].split("\n")
    tail_lines = pages[i + 1].split("\n")
    next if head_lines.empty? || tail_lines.empty?
    next if Utils.end_mark?(head_lines.last)

    head_lines[-1] = Utils.merge_para_part(head_lines.last, tail_lines.shift)
    pages[i] = head_lines.join("\n")
    pages[i + 1] = tail_lines.join("\n")
  end
  pages
end
#fixed_break_with_pages_text(pages_text) ⇒ Object
102 103 104 105 106 |
# File 'lib/pdf.rb', line 102
#
# fixed_break_with_pages_text
# Repairs line breaks inside every page (Utils.fixed_page_break) and then
# across page boundaries (fixed_break_of_cross_page). The length handed to
# both steps is half of the largest line length guessed for any page by
# Utils.guess_content_line_length.
# parameters:
# +pages_text+ list of page texts
# returns:
# the list of repaired page texts; +pages_text+ itself when no line length
# could be guessed
def fixed_break_with_pages_text(pages_text)
  longest = pages_text.map { |text| Utils.guess_content_line_length(text) }.compact.max
  # No pages, or no page yielded a length: there is nothing to base the repair
  # on, so hand the input back instead of failing on nil * 0.5.
  return pages_text if longest.nil?

  line_length = longest * 0.5
  repaired = pages_text.map do |page_text|
    Utils.fixed_page_break(page_text, :length => line_length)
  end
  fixed_break_of_cross_page(repaired, line_length)
end
#gen_html_from_page_text(page_text, illustrations, options = {}) ⇒ Object
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
# File 'lib/pdf.rb', line 135
#
# gen_html_from_page_text
# Renders one page as an HTML fragment: every non-blank line becomes an <h2>
# (when HeaderDetect.guess_header? accepts it) or a <p class='division'>,
# followed by one <img> paragraph per illustration, all wrapped in
# <div class='page'>.
# parameters:
# +page_text+ text of the page, lines separated by "\n"
# +illustrations+ image paths to append to the page
# +options+ optional settings
#   :index page index, used for the div name and as prefix of the h2 ids
# returns:
# the HTML String of the page
#
# NOTE(review): the rendered listing lost the +options+ identifier; it is
# restored from the documented signature. `present?` is not core Ruby --
# presumably provided by ActiveSupport.
def gen_html_from_page_text(page_text, illustrations, options = {})
  blocks = page_text.split("\n").each_with_index.map do |line, index|
    next unless line.present?

    if HeaderDetect.guess_header?(line)
      "<h2 id='#{options[:index]}_#{index}'>#{Utils.escape_html(Utils.clean_text(line))}</h2>"
    else
      "<p class='division'>#{Utils.escape_html(Utils.clean_text(line))}</p>"
    end
  end
  html = blocks.compact.join('')

  # NOTE(review): image paths are written into the src attribute unescaped.
  images = illustrations.map { |image_path| "<p class='division'><img src='#{image_path}' /></p>" }.join('')
  "<div class='page' name='#{options[:index]}' >#{html}#{images}</div>"
end
#gen_html_from_page_texts(page_texts, illustrations, options = {}) ⇒ Object
126 127 128 129 130 131 132 133 |
# File 'lib/pdf.rb', line 126
#
# gen_html_from_page_texts
# Renders every page with gen_html_from_page_text and concatenates the
# results.
# parameters:
# +page_texts+ list of page texts, in page order
# +illustrations+ image file names for the whole document
# +options+ optional settings, passed on to gen_html_from_page_text with
#           :index set to the 0-based page index
# returns:
# the concatenated HTML String
#
# NOTE(review): the rendered listing lost the +options+ identifier; it is
# restored from the documented signature.
def gen_html_from_page_texts(page_texts, illustrations, options = {})
  page_htmls = page_texts.each_with_index.map do |page_text, index|
    # `pdfimages -p` numbers pages from 1 in the image names, while
    # each_with_index counts from 0; without the offset every image lands on
    # the following page and those of the last page are dropped.
    page_illustrations = extract_page_illustrations(illustrations, index + 1)
    gen_html_from_page_text(page_text, page_illustrations, options.merge(:index => index))
  end
  page_htmls.join('')
end
#gen_html_from_sections_and_page_texts(sections, page_texts, illustrations) ⇒ Object
108 109 110 111 112 113 114 |
# File 'lib/pdf.rb', line 108
#
# gen_html_from_sections_and_page_texts
# Renders the document as HTML page by page.
# parameters:
# +sections+ outline of the document; currently ignored (see below)
# +page_texts+ list of page texts
# +illustrations+ image file names for the whole document
# returns:
# the HTML String produced by gen_html_from_page_texts
def gen_html_from_sections_and_page_texts(sections, page_texts, illustrations)
  # The page numbers recorded in the sections are not accurate, so the outline
  # is not used for now; both the empty and the non-empty case render the
  # pages the same way.
  gen_html_from_page_texts(page_texts, illustrations)
end
#guess_footer_row_count(pages_text) ⇒ Object
213 214 215 216 217 218 219 220 221 222 223 224 |
# File 'lib/pdf.rb', line 213
#
# guess_footer_row_count
# Guesses how many trailing lines of every page form the page footer by
# testing the last line, the second-to-last line, ... of all pages with
# guess_footer_line?.
# parameters:
# +pages_text+ list of page texts
# returns:
# 0, 1 or 2; a guess of more than two rows is discarded and reported as 0
#
# NOTE(review): the rendered listing lost the method name and the name of the
# predicate it calls; both are restored from the method summary
# (guess_footer_row_count, guess_footer_line?).
def guess_footer_row_count(pages_text)
  rows = 0
  # Any count above 2 is thrown away below, so probing stops once it is
  # exceeded. This also guarantees termination when the predicate keeps
  # accepting rows (for instance rows past the start of every page).
  while rows <= 2
    candidates = pages_text.map { |page_text| page_text.split("\n")[-rows - 1] }
    break unless guess_footer_line?(candidates)

    rows += 1
  end
  rows > 2 ? 0 : rows
end
#guess_header_line?(lines) ⇒ Boolean Also known as: guess_footer_line?
猜测是否是页眉/页脚行 猜测规则:
1. 相邻页的行匹配相似度,一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行
2. 隔页的行匹配相似度,一定相似比例(默认70%)以上加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行
3. 页码猜测,页的行是数值则加入相似集合。如果相似集合占总集合数的比例高于一定值(默认50%)时,猜测为页眉页脚行
231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 |
# File 'lib/pdf.rb', line 231
#
# guess_header_line?
# Guesses whether a row is a page header/footer row. +lines+ holds the same
# row taken from every page. Rules, tried in order:
# 1. Compare each line with the line of the next page; pairs whose
#    similarity is above 0.7 join the similar set. If the set covers more
#    than 50% of the lines, the row is guessed to be a header/footer.
# 2. The same comparison with the line two pages further on (headers that
#    alternate between pages).
# 3. Page numbers: lines whose integer value is positive join the set; the
#    same 50% rule applies.
# parameters:
# +lines+ one line per page; entries may be nil
# returns:
# true when one of the rules matches, false otherwise (also for an empty list)
#
# NOTE(review): `present?` is not core Ruby -- presumably ActiveSupport. The
# comparison deliberately keeps passing lines[index + offset] even when that
# is past the end (nil), exactly as the three original passes did.
def guess_header_line?(lines)
  return false if lines.empty?

  lines = lines.map { |line| line.strip if line.present? }
  total = lines.count.to_f
  majority = lambda { |indexes| indexes.count.to_f / total > 0.5 }

  # Indexes of all lines taking part in a similar pair `offset` pages apart.
  similar_at = lambda do |offset|
    pairs = []
    lines.each_with_index do |line, index|
      pairs << [index, index + offset] if Utils.text_similarity(line, lines[index + offset]) > 0.7
    end
    pairs.flatten.uniq
  end

  return true if majority.call(similar_at.call(1))
  return true if majority.call(similar_at.call(2))

  numeric = lines.each_index.select { |index| lines[index].to_i > 0 }
  majority.call(numeric)
end
#guess_header_row_count(pages_text) ⇒ Object
猜测页眉/页脚的行数 页眉页脚有一定的规律:
1. 页眉和页脚一般都在每页的固定位置出现或者对称出现(相邻两页左右位置对称)
2. 呈现的内容一般是书名、章节名、页码等。
3. 呈现的顺序一般有两种形式:逐页式,即每一页的页眉页脚大致相似;隔页式
201 202 203 204 205 206 207 208 209 210 211 |
# File 'lib/pdf.rb', line 201
#
# guess_header_row_count
# Guesses how many leading lines of every page form the page header by
# testing the first line, the second line, ... of all pages with
# guess_header_line?.
# parameters:
# +pages_text+ list of page texts
# returns:
# 0, 1 or 2; a guess of more than two rows is discarded and reported as 0
def guess_header_row_count(pages_text)
  rows = 0
  # Any count above 2 is thrown away below, so probing stops once it is
  # exceeded. This also guarantees termination when the predicate keeps
  # accepting rows (for instance rows past the end of every page).
  while rows <= 2
    candidates = pages_text.map { |page_text| page_text.split("\n")[rows] }
    break unless guess_header_line?(candidates)

    rows += 1
  end
  rows > 2 ? 0 : rows
end
#sanitize_page_header_and_footer(pdf_pages_text, options = {}) ⇒ Object
sanitize_page_header_and_footer
清洗页眉页脚
parameters:
+pdf_pages_text+ pdf文件页文本内容集合
+options+ 可选参数
:header_rows_count 指定页眉行数
:footer_rows_count 指定页脚行数
42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/pdf.rb', line 42
#
# sanitize_page_header_and_footer
# Strips the page header and page footer rows from every page.
# parameters:
# +pdf_pages_text+ list of page texts
# +options+ optional settings
#   :header_rows_count number of header rows; guessed with
#                      guess_header_row_count when absent
#   :footer_rows_count number of footer rows; guessed with
#                      guess_footer_row_count when absent
# returns:
# a new Array of page texts without the header/footer rows; a page with too
# few lines becomes an empty String
#
# NOTE(review): the rendered listing lost the method name, +options+ and the
# footer identifiers; they are restored from the documented signature and
# options -- confirm against lib/pdf.rb.
def sanitize_page_header_and_footer(pdf_pages_text, options = {})
  header_rows_count = options[:header_rows_count] || guess_header_row_count(pdf_pages_text)
  footer_rows_count = options[:footer_rows_count] || guess_footer_row_count(pdf_pages_text)

  pdf_pages_text.map do |page_text|
    # The slice is nil when the header alone is longer than the page.
    body_lines = page_text.split("\n")[header_rows_count..(-footer_rows_count - 1)] || []
    body_lines.join("\n")
  end
end
#scan_pdf?(filename) ⇒ Boolean
scan_pdf?
检查指定的文件是否为扫描版pdf
parameters:
+filename+ pdf文件
12 13 14 15 16 17 18 19 |
# File 'lib/pdf.rb', line 12
#
# scan_pdf?
# Checks whether the given file is a scanned PDF, i.e. a ".pdf" file whose
# pages together contain fewer than +threshold+ characters of text (after
# stripping surrounding whitespace).
# parameters:
# +filename+ path of the file
# +threshold+ minimum text length of a non-scanned PDF, 1000 by default
# returns:
# true for a PDF with less text than the threshold, false otherwise --
# including files whose extension is not ".pdf" (compared case-insensitively)
def scan_pdf?(filename, threshold = 1000)
  return false unless File.extname(filename).downcase == '.pdf'

  pdf = Poppler::Document.new(filename)
  content = pdf.map { |page| page.get_text }.join('')
  content.strip.length < threshold
end
#walk_index(indexer, sections) ⇒ Object
151 152 153 154 155 156 157 158 159 160 161 |
# File 'lib/pdf.rb', line 151
#
# walk_index
# Fills +sections+ with the outline entries reachable from +indexer+.
# The traversal of the top level is identical to the one work_index performs
# for nested levels, so it is delegated instead of duplicated.
# parameters:
# +indexer+ outline iterator (a Poppler::IndexIter when called from
#           extract_sections)
# +sections+ Array that receives one hash per entry
# returns:
# whatever work_index returns
def walk_index(indexer, sections)
  work_index(indexer, sections)
end
#work_index(child, sections) ⇒ Object
163 164 165 166 167 168 169 170 171 172 173 |
# File 'lib/pdf.rb', line 163
#
# work_index
# Recursively converts outline entries into section hashes.
# parameters:
# +child+ enumerable of outline entries; each entry responds to +action+
#         (with +title+ and +dest.page_num+) and +child+
# +sections+ Array that receives, per entry, a hash with :title (passed
#            through Utils.clean_text), :page_num and -- only when the entry
#            has children -- :sub_sections
# returns:
# the result of iterating +child+ with each_with_index
def work_index(child, sections)
  child.each_with_index do |entry, index|
    sections[index] = {
      :title => Utils.clean_text(entry.action.title),
      :page_num => entry.action.dest.page_num
    }

    nested = entry.child
    next if nested.nil?

    # The entry is stored before descending and the nested list is attached
    # only after the recursion has finished, as in the original.
    sub_sections = []
    work_index(nested, sub_sections)
    sections[index][:sub_sections] = sub_sections
  end
end