Class: PDF::Reader::TextReceiver
- Inherits:
-
Object
- Object
- PDF::Reader::TextReceiver
- Defined in:
- lib/pdf/reader/text_receiver.rb
Overview
An example receiver class that processes all text found in a PDF file. All text that is found will be printed to the IO object specified in the constructor.
Usage:
receiver = PDF::Reader::TextReceiver.new($stdout)
PDF::Reader.file("somefile.pdf", receiver)
DEPRECATED: this class was deprecated in version 0.11.0 and will
eventually be removed
Instance Method Summary
-
#begin_document(root) ⇒ Object
Called when the document parsing begins.
-
#begin_page(info) ⇒ Object
Called when new page parsing begins.
- #begin_page_container(page) ⇒ Object
-
#begin_text_object ⇒ Object
PDF operator BT.
- #calculate_line_and_location(new_loc) ⇒ Object
-
#end_document ⇒ Object
Called when the document parsing ends.
-
#end_page ⇒ Object
Called when page parsing ends.
- #end_page_container ⇒ Object
-
#end_text_object ⇒ Object
PDF operator ET.
-
#initialize(main_receiver) ⇒ TextReceiver
constructor
Initialize with the library user’s receiver.
- #media_box_check(dict) ⇒ Object
-
#move_text_position(tx, ty) ⇒ Object
PDF operator Td.
-
#move_text_position_and_set_leading(tx, ty) ⇒ Object
PDF operator TD.
-
#move_to_next_line_and_show_text(string) ⇒ Object
PDF operator ' (single quote).
-
#move_to_start_of_next_line ⇒ Object
PDF operator T*.
-
#set_character_spacing(n) ⇒ Object
PDF operator Tc.
-
#set_horizontal_text_scaling(n) ⇒ Object
PDF operator Tz.
-
#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object
PDF operator " (double quote).
-
#set_text_leading(n) ⇒ Object
PDF operator TL.
-
#set_text_matrix_and_text_line_matrix(*args) ⇒ Object
PDF operator Tm.
-
#set_word_spacing(n) ⇒ Object
PDF operator Tw.
-
#show_text(string) ⇒ Object
PDF operator Tj.
-
#show_text_with_positioning(params) ⇒ Object
PDF operator TJ.
- #super_show_text(string) ⇒ Object
Constructor Details
#initialize(main_receiver) ⇒ TextReceiver
Initialize with the library user’s receiver
40 41 42 43 |
# File 'lib/pdf/reader/text_receiver.rb', line 40
# Build a receiver that hands extracted page text to +main_receiver+.
#
# main_receiver - destination for the page text; #end_page appends to it
#                 with <<.
def initialize(main_receiver)
  # Stack of page-size hashes (:urx / :ury); starts out empty.
  @upper_corners = []
  @main_receiver = main_receiver
end
Instance Method Details
#begin_document(root) ⇒ Object
Called when the document parsing begins
46 47 48 |
# File 'lib/pdf/reader/text_receiver.rb', line 46
# Callback fired when parsing of a document starts. Replaces the stack of
# page-size hashes with a fresh, empty one.
#
# root - supplied by the caller; not used by this method.
def begin_document(root)
  @upper_corners = []
end
#begin_page(info) ⇒ Object
Called when new page parsing begins
64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
# File 'lib/pdf/reader/text_receiver.rb', line 64
# Callback fired when parsing of a new page starts. Resets every piece of
# per-page bookkeeping.
#
# info - the page dictionary; it is remembered in @page and passed to
#        #media_box_check to work out the page size.
def begin_page(info)
  @page = info

  # Bottom entry of the text-state stack; BT pushes copies on top of it.
  initial_state = {
    :char_spacing  => 0,
    :word_spacing  => 0,
    :hori_scaling  => 100,
    :leading       => 0,
    :tj_adjustment => 0,
  }
  @state = [initial_state]

  @upper_corners << media_box_check(info)

  @output       = []   # one string per output line
  @line         = 0    # index into @output currently being written
  @location     = 0    # current vertical position
  @displacement = {}   # vertical position => index into @output

  # Until something lower is seen, the lowest known y is the page height.
  @smallest_y_loc = @upper_corners.last[:ury]
  @written_to = false
end
#begin_page_container(page) ⇒ Object
55 56 57 |
# File 'lib/pdf/reader/text_receiver.rb', line 55
# Callback fired when a page container is entered. Pushes the size that
# applies inside it (its own :MediaBox if present, otherwise a copy of the
# enclosing one) onto the @upper_corners stack.
#
# page - the container's dictionary.
def begin_page_container(page)
  @upper_corners << media_box_check(page)
end
#begin_text_object ⇒ Object
PDF operator BT
92 93 94 |
# File 'lib/pdf/reader/text_receiver.rb', line 92
# PDF operator BT. Pushes a (shallow) copy of the current text state so that
# settings changed inside the text object are dropped again when
# #end_text_object pops it.
def begin_text_object
  snapshot = @state.last.dup
  @state << snapshot
end
#calculate_line_and_location(new_loc) ⇒ Object
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 |
# File 'lib/pdf/reader/text_receiver.rb', line 234
# Update the current output line (@line) and vertical position (@location)
# for a new y coordinate.
#
# @displacement maps every y coordinate seen on the page to an index into
# @output. Until text has been written on the page, every coordinate maps to
# line 0. Afterwards a coordinate that has not been seen before is handled
# like this:
#
# * below the current location, or below the lowest y seen so far:
#   it starts the line after the current one;
# * otherwise: it is snapped to the closest previously seen coordinate
#   beneath it and re-uses that coordinate's line.
#
# new_loc - numeric y coordinate of the new text position.
#
# Returns the new value of @line.
def calculate_line_and_location(new_loc)
  key = new_loc

  if @written_to
    unless @displacement.has_key?(key)
      if key < @location || key < @smallest_y_loc
        @displacement[key] = @line + 1
      else
        nearest_below = @displacement.keys.select { |seen| seen < key }.max
        # When nothing has been recorded beneath this coordinate (e.g. text
        # was shown before any positioning operator), keep the coordinate
        # itself and put it on line 0. Previously key became nil here and
        # the comparison further down raised NoMethodError.
        key = nearest_below unless nearest_below.nil?
        @displacement[key] = 0 unless @displacement.has_key?(key)
      end
    end
  else
    @displacement[key] = 0
  end

  @smallest_y_loc = key if key < @smallest_y_loc
  @location = key
  @line = @displacement[key]
end
#end_document ⇒ Object
Called when the document parsing ends
51 52 53 |
# File 'lib/pdf/reader/text_receiver.rb', line 51
# Callback fired when parsing of the document has finished. Empties the
# text-state stack.
#
# @state is only created by #begin_page, so for a document in which no page
# was ever started there is nothing to clear and the call is a no-op.
def end_document
  @state.clear if @state
end
#end_page ⇒ Object
Called when page parsing ends
86 87 88 89 |
# File 'lib/pdf/reader/text_receiver.rb', line 86
# Callback fired when parsing of a page has finished. Joins the collected
# lines with newlines, appends the result to the user's receiver and drops
# this page's entry from the @upper_corners stack.
def end_page
  page_text = @output.join("\n")
  @main_receiver << page_text
  @upper_corners.pop
end
#end_page_container ⇒ Object
59 60 61 |
# File 'lib/pdf/reader/text_receiver.rb', line 59
# Callback fired when a page container is left. Removes the entry that
# #begin_page_container pushed onto the @upper_corners stack.
#
# Returns the removed entry (nil if the stack was already empty).
def end_page_container
  @upper_corners.pop
end
#end_text_object ⇒ Object
PDF operator ET
97 98 99 |
# File 'lib/pdf/reader/text_receiver.rb', line 97
# PDF operator ET. Discards the text state that #begin_text_object pushed,
# restoring the one that was active before the text object began.
#
# Returns the discarded state hash.
def end_text_object
  @state.pop
end
#media_box_check(dict) ⇒ Object
222 223 224 225 226 227 228 229 230 231 232 |
# File 'lib/pdf/reader/text_receiver.rb', line 222
# Work out the page size that applies to +dict+.
#
# Starts from a copy of the innermost entry on the @upper_corners stack (or
# 0 x 0 when the stack is empty) and, if +dict+ carries its own :MediaBox,
# overrides it with that box's width and height.
#
# dict - a dictionary that may contain :MediaBox, indexable as
#        [llx, lly, urx, ury].
#
# Returns a new Hash with the keys :urx and :ury.
def media_box_check(dict)
  inherited = @upper_corners.last || { :urx => 0, :ury => 0 }
  corners = inherited.dup
  return corners unless dict.has_key?(:MediaBox)

  box = dict[:MediaBox]
  corners[:urx] = box[2] - box[0]
  corners[:ury] = box[3] - box[1]
  corners
end
#move_text_position(tx, ty) ⇒ Object
PDF operator Td
134 135 136 137 |
# File 'lib/pdf/reader/text_receiver.rb', line 134
# PDF operator Td. Moves the current vertical position by +ty+ and lets
# #calculate_line_and_location decide which output line that lands on.
#
# tx - horizontal offset; ignored by this receiver.
# ty - vertical offset, added to the current @location.
def move_text_position(tx, ty)
  new_y = @location + ty
  calculate_line_and_location(new_y)
end
#move_text_position_and_set_leading(tx, ty) ⇒ Object
PDF operator TD
140 141 142 143 |
# File 'lib/pdf/reader/text_receiver.rb', line 140
# PDF operator TD. Records +ty+ as the text leading, then moves the text
# position exactly like Td.
#
# The leading is stored as +ty+ itself; an earlier multiplication by -1 is
# commented out in the original source.
# NOTE(review): the PDF specification defines TD as setting the leading to
# -ty; the un-negated value appears to be paired with T* moving by +leading
# in this class -- confirm before changing either side.
#
# tx - horizontal offset, passed through to #move_text_position.
# ty - vertical offset, used both as the leading and as the move.
def move_text_position_and_set_leading(tx, ty)
  set_text_leading(ty)
  move_text_position(tx, ty)
end
#move_to_next_line_and_show_text(string) ⇒ Object
PDF operator ' (single quote)
210 211 212 213 |
# File 'lib/pdf/reader/text_receiver.rb', line 210
# PDF operator ' (single quote). Equivalent to T* followed by Tj: advance
# to the start of the next line, then show +string+ there.
#
# string - the text to show.
def move_to_next_line_and_show_text(string)
  move_to_start_of_next_line
  show_text(string)
end
#move_to_start_of_next_line ⇒ Object
PDF operator T*
129 130 131 |
# File 'lib/pdf/reader/text_receiver.rb', line 129
# PDF operator T*. Moves the text position vertically by the leading stored
# in the current text state (no horizontal movement).
def move_to_start_of_next_line
  leading = @state.last[:leading]
  move_text_position(0, leading)
end
#set_character_spacing(n) ⇒ Object
PDF operator Tc
109 110 111 |
# File 'lib/pdf/reader/text_receiver.rb', line 109
# PDF operator Tc. Stores the character spacing in the current text state.
#
# n - the new character spacing.
def set_character_spacing(n)
  current = @state.last
  current[:char_spacing] = n
end
#set_horizontal_text_scaling(n) ⇒ Object
PDF operator Tz
119 120 121 |
# File 'lib/pdf/reader/text_receiver.rb', line 119
# PDF operator Tz. Stores the horizontal scaling in the current text state
# as a ratio (the operand is a percentage, so 50 is stored as 0.5).
#
# The division is done in floating point: with an Integer operand a plain
# n/100 truncates, turning 50 into 0 and 150 into 1.
#
# NOTE(review): #begin_page seeds :hori_scaling with 100 (a percentage)
# while this method stores a ratio -- confirm which unit readers of
# :hori_scaling expect.
#
# n - horizontal scaling as a percentage of normal width.
def set_horizontal_text_scaling(n)
  @state.last[:hori_scaling] = n / 100.0
end
#set_spacing_next_line_show_text(aw, ac, string) ⇒ Object
PDF operator " (double quote)
216 217 218 219 220 |
# File 'lib/pdf/reader/text_receiver.rb', line 216
# PDF operator " (double quote). Sets the word and character spacing, then
# behaves like the ' operator: move to the next line and show +string+.
#
# aw     - word spacing to set.
# ac     - character spacing to set.
# string - the text to show.
def set_spacing_next_line_show_text(aw, ac, string)
  set_word_spacing(aw)
  set_character_spacing(ac)
  move_to_next_line_and_show_text(string)
end
#set_text_leading(n) ⇒ Object
PDF operator TL
124 125 126 |
# File 'lib/pdf/reader/text_receiver.rb', line 124
# PDF operator TL. Stores the text leading in the current text state; T*
# later moves the text position by this amount.
#
# n - the new leading.
def set_text_leading(n)
  current = @state.last
  current[:leading] = n
end
#set_text_matrix_and_text_line_matrix(*args) ⇒ Object
PDF operator Tm
102 103 104 105 106 |
# File 'lib/pdf/reader/text_receiver.rb', line 102
# PDF operator Tm. The operands are the six matrix values the PDF spec
# calls a, b, c, d, e and f; this receiver only looks at the last one, f,
# and treats it as the new vertical position.
#
# args - the operands of Tm, in order.
def set_text_matrix_and_text_line_matrix(*args)
  vertical_position = args[5] # "f" in the spec's naming
  calculate_line_and_location(vertical_position)
end
#set_word_spacing(n) ⇒ Object
PDF operator Tw
114 115 116 |
# File 'lib/pdf/reader/text_receiver.rb', line 114
# PDF operator Tw. Stores the word spacing in the current text state.
#
# n - the new word spacing.
def set_word_spacing(n)
  current = @state.last
  current[:word_spacing] = n
end
#show_text(string) ⇒ Object
PDF operator Tj
146 147 148 149 150 151 152 153 154 155 156 157 |
# File 'lib/pdf/reader/text_receiver.rb', line 146
# PDF operator Tj. Appends +string+ to the output line currently selected by
# @line, creating that line if nothing has been written to it yet.
#
# If the pending TJ adjustment is below -1000, spaces are inserted first:
# one for every 900 units of the adjustment's magnitude.
#
# string - the text to append.
def show_text(string)
  place = (@output[@line] ||= "")

  adjustment = @state.last[:tj_adjustment]
  if adjustment < -1000
    place << " " * (adjustment.abs / 900)
  end

  place << string
  # Lets #calculate_line_and_location know the page now carries text.
  @written_to = true
end
#show_text_with_positioning(params) ⇒ Object
PDF operator TJ
194 195 196 197 198 199 200 201 202 203 204 205 206 207 |
# File 'lib/pdf/reader/text_receiver.rb', line 194
# PDF operator TJ. +params+ mixes strings with numeric position
# adjustments. Each number becomes the pending :tj_adjustment for the
# strings that follow it; each string is passed to #show_text. The
# adjustment that was in force before the call is restored afterwards.
#
# Numbers are recognised with Numeric rather than the former
# "Float, Fixnum": the Fixnum constant no longer exists on current Rubies
# (referencing it raises NameError), and it never covered Bignum.
#
# params - Array of strings and numbers, in the order given to TJ.
def show_text_with_positioning(params)
  saved_adjustment = @state.last[:tj_adjustment]

  params.each do |param|
    case param
    when Numeric
      @state.last[:tj_adjustment] = param
    else
      show_text(param)
    end
  end

  @state.last[:tj_adjustment] = saved_adjustment
end
#super_show_text(string) ⇒ Object
158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
# File 'lib/pdf/reader/text_receiver.rb', line 158 def super_show_text (string) urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i #puts "rendering '#{string}' to #{x}x#{y}" place = (@output[y] ||= (" " * urx.to_i)) #puts "#{urx} #{place.size} #{string.size} #{x}" return if x+string.size >= urx string.split(//).each do |c| chars = 1 case c when " " chars += @state.last[:word_spacing].to_i place[x-1, chars] = (" " * chars) else chars += @state.last[:char_spacing].to_i chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment] chars = 1 if chars < 1 place[x-1] = c place[x, chars-1] = (" " * (chars-1)) if chars > 1 end x += chars end @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]]) end |