Module: Swordfish::DOCX::Parser
- Included in:
- Document
- Defined in:
- lib/swordfish/formats/docx/parser.rb
Instance Method Summary collapse
-
#_node_parse_list(node) ⇒ Object
Parse a list.
-
#_node_parse_paragraph(node) ⇒ Object
Parse a paragraph.
-
#_node_parse_runs(node, context = nil) ⇒ Object
Parse one or more runs.
-
#_node_parse_table(node) ⇒ Object
Parse a table.
-
#_node_parse_table_cell(node) ⇒ Object
Parse a table cell.
-
#_node_parse_table_row(node) ⇒ Object
Parse a table row.
Instance Method Details
#_node_parse_list(node) ⇒ Object
Parse a list
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
# File 'lib/swordfish/formats/docx/parser.rb', line 118
#
# Parse a list paragraph and fold it into the list held in the build buffer.
#
# In Office OpenXML, a list is not a distinct element type, but rather a
# specialized paragraph that references an abstract numbering scheme
# and includes an indentation level. As a result, the build buffer
# (@buffer) must be used to assemble the Swordfish::Node representation of
# the list, since the only way to tell the list has been fully parsed is to
# encounter a non-list element.
#
# Reads @numbering (numbering id => level => style name) and reads/writes
# @buffer and @buffer_initial_value.
#
# @param node the paragraph element; only needs to respond to #xpath here
#   (and to whatever _node_parse_runs requires of it)
# @return the result of the final #append call, or nil for a continuation
#   paragraph; callers presumably rely on @buffer rather than on this value
def _node_parse_list(node)
  level_nodes = node.xpath(".//w:numPr/w:ilvl")

  # Handle paragraphs with no level, which represent multi-paragraph list items.
  # NOTE(review): this path dereferences @buffer without a nil check, so it
  # assumes a list is already in progress -- confirm against the caller.
  if level_nodes.length.zero?
    para = Swordfish::Node::Paragraph.new
    _node_parse_runs(node).each { |r| para.append(r) }
    @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Inline, Swordfish::Node::Paragraph)
    @buffer.last_list_item(:recurse => true).append para
    return
  end

  # Get the list item's abstract numbering and level
  list_item = Swordfish::Node::ListItem.new
  _node_parse_runs(node).each { |r| list_item.append(r) }
  level = level_nodes[0]['w:val'].to_i
  numbering_scheme = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i

  # Resolve the list style once, defaulting to bullet in case of a bad
  # numbering reference. The same lookup is used for top-level and nested
  # lists so that a bad reference never raises for only one of them.
  list_style = @numbering.fetch(numbering_scheme, {}).fetch(level, "bullet").to_sym

  # If the build buffer is empty, this is a new list
  unless @buffer
    @buffer = Swordfish::Node::List.new
    @buffer.stylize list_style
    @buffer_initial_value = level # Lists may have an arbitrary initial level
  end

  # Compare the level of this list item to the bottommost node in
  # the build buffer to determine where in the hierarchy to add
  # this node (i.e., are we dealing with list nesting or not?)
  if @buffer.depth_of_final_node >= level || @buffer.children.empty?
    # Add sibling to existing list: walk down to the list at this level
    target = @buffer
    (level - @buffer_initial_value).times do
      target = target.last_list_item.nested_list
    end
    target.append list_item
  elsif @buffer.depth_of_final_node < level
    # Add new nested list: walk down to the parent of the new level
    target = @buffer
    (level - @buffer_initial_value - 1).times do
      target = target.last_list_item.nested_list
    end
    list = Swordfish::Node::List.new
    list.append list_item
    list.stylize list_style
    target.last_list_item.append list
  end
end
#_node_parse_paragraph(node) ⇒ Object
Parse a paragraph
107 108 109 110 111 112 113 114 115 |
# File 'lib/swordfish/formats/docx/parser.rb', line 107
#
# Build a Swordfish::Node::Paragraph from a paragraph element.
#
# Every node produced by _node_parse_runs is appended to the paragraph. If
# the element carries a ./w:pPr/w:pStyle child whose w:val is a key in
# @styles, the corresponding entry is assigned as the paragraph's style.
#
# @param node the paragraph element (must respond to #xpath)
# @return [Swordfish::Node::Paragraph]
def _node_parse_paragraph(node)
  paragraph = Swordfish::Node::Paragraph.new
  _node_parse_runs(node).each { |run| paragraph.append(run) }

  style_refs = node.xpath("./w:pPr/w:pStyle")
  unless style_refs.length.zero?
    style_id = style_refs[0]['w:val'].to_sym
    known_style = @styles[style_id]
    # Unknown style ids are ignored; the paragraph keeps its default style
    paragraph.style = known_style if known_style
  end

  paragraph
end
#_node_parse_runs(node, context = nil) ⇒ Object
Parse one or more runs
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/swordfish/formats/docx/parser.rb', line 10
#
# Parse one or more runs into an array of Swordfish nodes.
#
# The 'run' is the basic unit of text in Office OpenXML. A paragraph, table
# cell, or other block element may contain one or more runs, and each run has
# an associated set of styles.
#
# @param node either a parent element (its #children are walked) or an Array
#   of already-selected child elements
# @param context [Object, nil] when given, hyperlink relationship ids are
#   looked up in @relationships[context] instead of directly in @relationships
# @return [Array] the parsed nodes (text, linebreaks, images, hyperlinks,
#   footnote/endnote entries), with adjacent identically-styled text merged.
#   When called recursively from inside a complex field, the nodes collected
#   up to the field's 'end' marker are returned without merging.
def _node_parse_runs(node, context = nil)
  texts = []
  # A complex field is a special type of node spanning multiple runs, where most of the runs
  # designate a special control flow rather than normal text. While a field is open this holds
  # either `true` (field begun, instruction not yet seen) or the node being built for the field.
  complex_field = nil
  nodes = node.is_a?(Array) ? node : node.children

  nodes.each_with_index do |run_xml, idx|
    case run_xml.name
    when 'r'
      if run_xml.xpath('./w:br').length > 0
        # This run contains a linebreak. It may also contain other elements, so this isn't exclusive.
        texts << Swordfish::Node::Linebreak.new
      end

      has_text = run_xml.xpath('./w:t').length > 0

      if has_text && complex_field.nil?
        # A true run node.
        # Only examine the run if it includes text codes. The run may also include
        # things like comment nodes, which should be ignored.
        text = Swordfish::Node::Text.new
        text.content = run_xml.xpath('./w:t')[0].content
        get_styles_for_node(run_xml.xpath('./w:rPr')[0], text)
        texts << text
      elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
        # An image run. A picture without a blip is skipped; an explicit nil
        # check is used instead of a blanket `rescue nil` so that unrelated
        # errors are not silently swallowed.
        blip = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]
        relationship_id = blip && blip['r:embed']
        if relationship_id
          image = Swordfish::Node::Image.new
          image.original_name = @relationships[relationship_id].split('/').last
          @swordfish_doc.images[image.original_name] = read_image(image.original_name)
          texts << image
        end
      elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
        # A complex field
        fld_char = run_xml.xpath('./w:fldChar')[0]
        fld_type = fld_char && fld_char['w:fldCharType']

        case
        when fld_type == 'begin'
          # Start the complex field
          complex_field = true
        when run_xml.xpath('./w:instrText').length > 0
          # An instruction run, defining the complex field's behavior
          instruction = run_xml.xpath('./w:instrText')[0].content
          link_match = instruction.match(/^\s*HYPERLINK (?:"" )?(?:\\l )?"([^"]+)"/)
          if link_match
            # A hyperlink
            complex_field = Swordfish::Node::Hyperlink.new
            complex_field.href = link_match.captures[0]
          else
            # Anything else, including a HYPERLINK instruction whose target
            # cannot be extracted (previously a NoMethodError on nil).
            complex_field = Swordfish::Node::Text.new
          end
        when has_text && complex_field.respond_to?(:children) && complex_field.children.length.zero?
          # The textual content. Only the first text run triggers this: the
          # recursive call consumes every run up to the field's 'end' marker.
          # Text seen before any instruction (complex_field is still `true`)
          # is skipped rather than raising.
          complex_field.append(_node_parse_runs(nodes.to_a[idx..-1], context))
        when fld_type == 'end'
          # End the complex field
          if complex_field
            texts << complex_field
            complex_field = nil
          else
            # Handle the case where _node_parse_runs gets called from within a complex field
            return texts
          end
        end
      elsif run_xml.xpath('./w:footnoteReference').length > 0
        # A footnote reference
        id = run_xml.xpath('./w:footnoteReference')[0]['w:id'].to_i
        texts << @footnotes[id] if @footnotes[id]
      elsif run_xml.xpath('./w:endnoteReference').length > 0
        # An endnote reference
        id = run_xml.xpath('./w:endnoteReference')[0]['w:id'].to_i
        texts << @endnotes[id] if @endnotes[id]
      end
    when 'hyperlink'
      # Hyperlink nodes are placed amongst other run nodes, but
      # they themselves also contain runs. Hyperlinks include
      # a relationship ID attribute defining their reference.
      link = Swordfish::Node::Hyperlink.new
      link.href = context ? @relationships[context][run_xml['r:id']] : @relationships[run_xml['r:id']]
      _node_parse_runs(run_xml, context).each { |r| link.append(r) }
      texts << link
    end
  end

  # Clean up runs by merging them if they have identical styles
  texts.reduce([]) do |memo, run|
    if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
      memo.last.content += run.content
    else
      memo << run
    end
    memo
  end
end
#_node_parse_table(node) ⇒ Object
Parse a table
173 174 175 176 177 178 179 |
# File 'lib/swordfish/formats/docx/parser.rb', line 173
#
# Build a Swordfish::Node::Table from a table element by parsing each of its
# direct ./w:tr children with _node_parse_table_row and appending the result,
# in document order.
#
# @param node the table element (must respond to #xpath)
# @return [Swordfish::Node::Table]
def _node_parse_table(node)
  node.xpath("./w:tr").each_with_object(Swordfish::Node::Table.new) do |row_xml, table|
    table.append(_node_parse_table_row(row_xml))
  end
end
#_node_parse_table_cell(node) ⇒ Object
Parse a table cell
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# File 'lib/swordfish/formats/docx/parser.rb', line 191
#
# Parse a table cell.
#
# In a Swordfish::Node::Table object, the number of table cells must equal the
# total number of rows times the total number of columns; that is, even if
# two cells are merged together, there must be a Swordfish::Node::TableCell for
# each one. Merges are defined using the "merge_up" and "merge_left" properties.
#
# @param node the cell element (must respond to #xpath)
# @return [Swordfish::Node::TableCell, Array<Swordfish::Node::TableCell>]
#   the single cell, or, for a cell spanning several columns, the cell
#   followed by one placeholder cell (merge_left = true) per extra column
def _node_parse_table_cell(node)
  cell = Swordfish::Node::TableCell.new

  # Get the inner content of the cell
  node.xpath("./w:p").each do |paragraph|
    cell.append _node_parse_paragraph(paragraph)
  end

  # Determine whether this cell spans multiple rows. In Office OpenXML,
  # a table cell is defined in every row, even if the cell is vertically-merged.
  # The representation of the merged cell within each row is given a vMerge
  # property. The topmost one has a vMerge value of "restart"; the others
  # either omit the value or state it explicitly as "continue", and both
  # forms mark a continuation of the cell above.
  v_merge = node.xpath("./w:tcPr/w:vMerge").first
  if v_merge && [nil, 'continue'].include?(v_merge['w:val'])
    cell.merge_up = true
  end

  # Determine whether this cell spans multiple columns. Unlike with vertical merges,
  # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
  # property defining the number of columns it spans. Since Swordfish requires a cell for each
  # column, generate the additional cells and set their merge_left values appropriately.
  extra_cells = []
  grid_span = node.xpath("./w:tcPr/w:gridSpan").first
  if grid_span
    # A missing or non-numeric w:val yields a non-positive count, i.e. no extra cells
    (grid_span['w:val'].to_i - 1).times do
      filler = Swordfish::Node::TableCell.new
      filler.merge_left = true
      extra_cells << filler
    end
  end

  # Return the generated cell or cells
  extra_cells.empty? ? cell : [cell] + extra_cells
end