26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
|
# File 'lib/model/chunks.rb', line 26
# Registers the paged :chunks spatial parser on +pdf+. The parser depends on
# the :characters spatials and assembles them into text chunks.
#
# Per page the registered callbacks do the following:
# * before  - discard the rows collected so far.
# * objects - file a copy of every character under its exact :y value,
#             keeping each row ordered by ascending :x.
# * after   - merge neighbouring characters of a row (using the :char_slop
#             and :word_slop settings), then merge neighbouring chunks that
#             overlap vertically by at least :overlap_slop, and evaluate to
#             the chunks whose content is not blank.
#
# @param pdf [Object] must respond to +spatials+ (yielding a parser that
#   accepts +before+, +objects+ and +after+ blocks) and to +settings+
#   (indexable by :char_slop, :word_slop and :overlap_slop).
# @return whatever +pdf.spatials+ returns.
def self.include_in(pdf)
  pdf.spatials :chunks, :paged => true, :depends_on => [:characters] do |parser|
    rows = {}

    parser.before do
      rows = {}
    end

    parser.objects :characters do |chars|
      row = (rows[chars[:y]] ||= [])
      # Insert before the first character that is not left of this one;
      # append when there is no such character.
      idx = row.index { |obj| chars[:x] <= obj[:x] } || row.length
      row.insert idx, chars.dup
    end

    parser.after do
      # A page without any characters yields no chunks. Without this guard a
      # nil placeholder would reach the blank-content filter below and raise
      # NoMethodError.
      next [] if rows.empty?

      char_slop = pdf.settings[:char_slop]
      word_slop = pdf.settings[:word_slop]
      overlap_slop = pdf.settings[:overlap_slop]

      # Pass 1: walk every row left to right, growing the current chunk while
      # the next character starts within the allowed gap.
      text_chunks = []
      rows.each_value do |row|
        current = row.first
        char_width = current[:width]
        row.drop(1).each do |right|
          reach = current[:x] + current[:width]
          if reach + (char_width * char_slop) >= right[:x]
            # Close enough to belong to the same word.
            current = Spatial.merge current, right
            # The reference width only follows characters whose stripped
            # content has no non-alphanumeric character.
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          elsif reach + (char_width * word_slop) >= right[:x]
            # Close enough to be the next word of the same chunk.
            current = Spatial.merge current, right, :separator => ' '
            char_width = right[:width] unless right[:content].strip =~ /[^A-Za-z0-9]/
          else
            # Gap too wide: finish the current chunk and start a new one.
            text_chunks << current
            current = right
            char_width = right[:width]
          end
        end
        text_chunks << current
      end

      # Pass 2: order all chunks by :x and merge each chunk into its
      # predecessor when their vertical overlap is large enough.
      text_chunks.sort_by! { |obj| obj[:x] }
      merged_text_chunks = []
      current = text_chunks.first
      text_chunks.drop(1).each do |right|
        min_height = [current[:height], right[:height]].min
        # NOTE(review): this is integer division when heights are Integers;
        # presumably heights are Floats - confirm against the producer of
        # the :characters spatials.
        overlap = (min_height - (current[:y] - right[:y]).abs) / min_height
        if overlap >= overlap_slop
          current = Spatial.merge current, right
        else
          merged_text_chunks << current
          current = right
        end
      end
      merged_text_chunks << current

      merged_text_chunks.reject { |chunk| chunk[:content].strip == "" }
    end
  end
end
|