Module: Konjac::Office::XML

Defined in:
lib/konjac/office/xml.rb,
lib/konjac/office/xml/shared.rb

Overview

Specialized XML handling for the Office Open XML-based formats in Office 2007+

Instance Method Summary collapse

Instance Method Details

#export_tags(files, opts = {}) ⇒ Object

Exports the text content of Microsoft Office documents



58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/konjac/office/xml/shared.rb', line 58

# Exports the text content of Word, PowerPoint and Excel documents by handing
# each file to the matching export script in the +applescripts+ directory.
#
# files - passed to Utils.parse_files to obtain the list of paths to process.
# opts  - options hash; when both :from_given and :to_given are truthy, the
#         :from / :to languages are looked up and, if both resolve, the
#         translation dictionary is loaded.
#
# Returns nil when there are no files or when not running on a Mac; otherwise
# the result of iterating over the parsed file list.
def export_tags(files, opts = {})
  # Load the dictionary only when both languages were given and both resolve
  if opts[:from_given] && opts[:to_given]
    from_lang = Language.find(opts[:from])
    to_lang   = Language.find(opts[:to])
    unless from_lang.nil? || to_lang.nil?
      Translator.load_dictionary from_lang, to_lang, opts
    end
  end

  sub_files = Utils.parse_files(files)
  return if sub_files.empty?

  # File extension => name of the export script that handles it
  exporters = {
    ".doc"  => "konjac_word_export",
    ".docx" => "konjac_word_export",
    ".ppt"  => "konjac_powerpoint_export",
    ".pptx" => "konjac_powerpoint_export",
    ".xls"  => "konjac_excel_export",
    ".xlsx" => "konjac_excel_export"
  }
  scripts_dir = File.join(File.dirname(__FILE__), "..", "applescripts")

  sub_files.each do |sub_file|
    script = exporters[File.extname(sub_file)]

    if script.nil?
      puts I18n.t(:unknown) % sub_file
      next
    end

    # The export scripts only work on a Mac; give up on the whole batch
    return if OS.not_a_mac

    # Declining to overwrite one .diff skips that file only; the remaining
    # files are still processed
    next unless Utils.user_allows_overwrite?(sub_file + ".diff")

    # Multi-argument form: no shell is involved, so the path needs no quoting
    system File.join(scripts_dir, script), sub_file
  end
end

#export_xml(files, opts = {}) ⇒ Object

Exports the Word document in XML then extracts the tags and condenses like paragraphs

I might deprecate this, but it exports XML. It’s much faster, but supporting two methods might not be a great idea.



99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/konjac/office/xml/shared.rb', line 99

# Extracts +word/document.xml+ from each DOCX, merges adjacent runs that
# compare_nodes reports as matching, and writes a tags file
# (<name>.docx.diff) with one entry per <w:t> element. The cleaned-up XML is
# also written to <name>.xml for inspection.
#
# files - passed to Utils.parse_files to obtain the list of paths to process.
# opts  - options hash; when both :from_given and :to_given are truthy and
#         the :from / :to languages resolve, the dictionary is loaded and
#         the "+" line of each tag is pre-filled via
#         Translator.translate_content.
#
# Returns nil when there are no files; otherwise the result of iterating
# over the parsed file list.
def export_xml(files, opts = {})
  # Determine whether to attempt translating
  attempting_to_translate = false
  if opts[:from_given] && opts[:to_given]
    from_lang = Language.find(opts[:from])
    to_lang   = Language.find(opts[:to])
    unless from_lang.nil? || to_lang.nil?
      Translator.load_dictionary from_lang, to_lang, opts
      attempting_to_translate = true
    end
  end

  sub_files = Utils.parse_files(files)
  return if sub_files.empty?

  sub_files.each do |sub_file|
    case File.extname(sub_file)
    when ".docx"
      # Build a list of all the paths we're working with
      dirname    = File.dirname(sub_file)
      basename   = File.basename(sub_file, ".*")
      orig_docx  = "#{dirname}/#{basename}.docx"
      xml_path   = "#{dirname}/#{basename}_orig.xml"
      clean_path = "#{dirname}/#{basename}.xml"
      tags_path  = "#{dirname}/#{basename}.docx.diff"

      # Declining to overwrite one tags file skips that document only
      next unless Utils.user_allows_overwrite?(tags_path)

      # Unzip the DOCX's word/document.xml file and redirect the output into
      # an XML with the same base name as the DOCX. Arguments are passed as
      # a vector (no shell), so paths containing spaces or shell
      # metacharacters are handled safely.
      system("unzip", "-p", orig_docx, "word/document.xml", out: xml_path)

      # Read in the XML file and extract the content from each <w:t> tag
      cleaner = Nokogiri::XML(File.read(xml_path))
      File.open(tags_path, "w") do |tags_file|
        # Remove all grammar and spellcheck tags
        cleaner.xpath("//w:proofErr").remove

        # Walk consecutive pairs of runs; when two are direct siblings and
        # compare_nodes accepts them, fold the earlier run's text into the
        # later one and drop the earlier run.
        cleaner.xpath("//w:r").each_cons(2) do |prev, node|
          next unless (prev.next_sibling == node) && compare_nodes(prev, node)

          prev_text = prev.at_xpath("w:t")
          node_text = node.at_xpath("w:t")
          # Runs without a <w:t> child are left untouched
          next if prev_text.nil? || node_text.nil?

          node_text.content = prev_text.content + node_text.content
          prev.remove
        end

        # Write the tags file
        tags_file.puts "---" + orig_docx
        tags_file.puts "+++" + orig_docx
        cleaner.xpath("//w:t").each_with_index do |node, index|
          # NOTE(review): the format string has a single %i, so the value of
          # additional_info(node) is not written to the header — confirm
          # whether it was meant to be included.
          tags_file.puts "@@ %i @@" % [index, additional_info(node)]
          tags_file.puts "-" + node.content
          if attempting_to_translate
            tags_file.puts "+" + Translator.translate_content(node.content)
          else
            tags_file.puts "+" + node.content
          end
        end
      end

      # Write the cleaned-up XML to a file for inspection
      File.open(clean_path, "w") do |xml|
        xml.puts cleaner.to_xml
      end
    else
      puts I18n.t(:unknown) % sub_file
    end
  end
end

#import_xml(files, opts = {}) ⇒ Object

Imports the text content of a tag file into a Word 2003+ document, utilizing a cleaned-up version of the document’s original XML structure



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/konjac/office/xml/shared.rb', line 9

# Applies the translated entries of each document's tags file
# (<name>.docx.diff) to the cleaned-up XML (<name>.xml), writes the result to
# word/document.xml next to the document, and adds it to a copy of the
# original named <name>_imported.docx.
#
# files - passed to Utils.parse_files to obtain the list of paths to process.
# opts  - accepted for interface compatibility; not read by this method.
#
# Returns nil when there are no files; otherwise the result of iterating
# over the parsed file list.
def import_xml(files, opts = {})
  sub_files = Utils.parse_files(files)
  return if sub_files.empty?

  sub_files.each do |sub_file|
    case File.extname(sub_file)
    when ".docx"
      # Build the list of paths we need to work with
      dirname   = File.dirname(sub_file)
      basename  = File.basename(sub_file, ".*")
      new_name  = "#{basename}_imported.docx"
      orig_docx = "#{dirname}/#{basename}.docx"
      new_path  = "#{dirname}/#{new_name}"
      xml_path  = "#{dirname}/#{basename}.xml"
      tags_path = "#{dirname}/#{basename}.docx.diff"
      word_dir  = "#{dirname}/word"
      out_path  = "#{word_dir}/document.xml"

      # Open the original XML file and the updated tags
      writer = Nokogiri::XML(File.read(xml_path))
      nodes  = writer.xpath("//w:t")
      tags   = TagManager.new(tags_path)

      # Overwrite each <w:t> tag's content with the new tag
      tags.all.each do |tag|
        nodes[tag.index].content = tag.translated if tag.translated?
      end

      # Create a directory for word/document.xml if necessary
      FileUtils.mkdir_p word_dir

      # Write the modified XML to a file: strip newlines and the indentation
      # that follows them, then restore the single line break after the XML
      # declaration
      File.open(out_path, "w") do |file|
        file.write writer.to_xml.gsub(/\n\s*/, "").sub(/\?></, "?>\n<")
      end

      # Copy the original file
      FileUtils.cp orig_docx, new_path

      # Add the new document XML to the copied file. zip runs inside the
      # document's directory so the entry is stored as word/document.xml;
      # the archive is therefore named relative to that directory. Arguments
      # are passed as a vector (no shell), so paths with spaces or shell
      # metacharacters are handled safely.
      system("zip", "-q", new_name, "word/document.xml", chdir: dirname)
    else
      puts I18n.t(:unknown) % sub_file
    end
  end
end