Class: DocParser

Inherits:
Object
  • Object
show all
Defined in:
lib/almirah/doc_parser.rb

Overview

rubocop:disable Metrics/ClassLength,Style/Documentation

Class Method Summary collapse

Class Method Details

.parse(doc, text_lines) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
# File 'lib/almirah/doc_parser.rb', line 41

# Parses the Markdown lines of one document and fills +doc+ with the items found.
#
# Every non-blank line is classified by the first matching rule, in this order:
# heading (+#+), Pandoc title (+%+), controlled paragraph (+[ID] text+),
# image, unordered list row, ordered list row, grid-table border (ignored),
# table separator / table row, blockquote, code fence, +TODO:+ block and,
# finally, regular paragraph. A blank line only terminates a list being collected.
#
# Side effects visible here: section numbering is reset, frontmatter (if any)
# is extracted from +text_lines+, and +doc.items+, +doc.headings+, +doc.title+,
# +doc.dictionary+, +doc.controlled_items+, +doc.todo_blocks+, the uplink and
# duplicate-id statistics and the last-used-id counters are updated.
#
# NOTE(review): code block content is only collected in the "regular paragraph"
# branch, so a line inside a fenced block that matches an earlier rule (for
# example a "# comment") is classified by that rule, and blank lines inside a
# block are skipped. Left unchanged here; confirm whether this is intended.
#
# @param doc [Object] document being populated (specification, protocol, ...)
# @param text_lines [Array<String>] raw lines of the Markdown file; the
#   frontmatter lines are removed from this array in place
# @return [Object] result of appending the DocFooter to +doc.items+
#   (presumably +doc.items+ itself when it is an Array)
def self.parse(doc, text_lines)
  temp_md_table = nil
  temp_md_list = nil
  temp_code_block = nil

  # Appends the list being collected (if any) to the document and forgets it.
  flush_list = lambda do
    if temp_md_list
      doc.items.append temp_md_list
      temp_md_list = nil
    end
  end

  # Creates the level 0 heading named after the document file and uses it as title.
  add_dummy_title = lambda do
    dummy_title = "#{doc.id}.md"
    root_heading = Heading.new(doc, dummy_title, 0)
    doc.items.append(root_heading)
    doc.headings.append(root_heading)
    doc.title = dummy_title
  end

  # restart section numbering for each new document
  Heading.reset_global_section_number
  # try to get frontmatter first
  text_lines = try_to_extract_frontmatter(doc, text_lines)
  # There is no document without heading
  add_dummy_title.call
  # replace dummy title with extracted from frontmatter
  if doc.frontmatter && (doc.frontmatter.parameters.key? 'title')
    doc.title = doc.frontmatter.parameters['title']
    doc.headings[0].text = doc.frontmatter.parameters['title']
  end

  # main loop
  text_lines.each do |s|
    if s.lstrip == ''
      flush_list.call # lists are separated by empty line from each other
      next
    end

    if res = /^(\#{1,})\s(.*)/.match(s) # Heading

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      level = res[1].length
      value = res[2]

      item = Heading.new(doc, value, level)
      doc.items.append(item)
      doc.headings.append(item)

    elsif res = /^%\s(.*)/.match(s) # Pandoc Document Title

      title = res[1]

      # Rewrite
      doc.title = title
      doc.headings[0].text = title

    elsif res = /^\[(\S*)\]\s+(.*)/.match(s) # Controlled Paragraph

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      id = res[1].upcase
      text = res[2]
      up_links = nil

      # check if it contains the uplink (one or many)
      # TODO: check this regular expression
      first_pos = text.length # position of the first uplink found in the text
      found_links = text.scan(/(>\[(?>[^\[\]]|\g<0>)*\])/) # >[SRS-001], >[SYS-002]
      unless found_links.empty?
        up_links = []
        found_links.each do |ul|
          lnk = ul[0]
          # do not add links for the self document
          doc_id = /([a-zA-Z]+)-\d+/.match(lnk) # SRS
          up_links << lnk.upcase if doc_id && (doc_id[1].downcase != doc.id.downcase)
          # try to find the real end of text
          pos = text.index(lnk)
          first_pos = pos if pos < first_pos
        end
        # Keep only the text that precedes the first uplink; this drops the
        # uplinks together with the commas and spaces between them. The
        # exclusive range also works when the text starts with an uplink
        # (first_pos == 0), where the result must be an empty string.
        text = text[0...first_pos].strip if text.length > first_pos
      end

      # since we already know id and text
      item = ControlledParagraph.new(doc, text, id)

      if up_links
        up_links.uniq! # remove duplicates
        doc.items_with_uplinks_number += 1 # for statistics
        up_links.each do |ul|
          next unless link_match = />\[(\S*)\]$/.match(ul) # >[SRS-001]

          up_link_id = link_match[1]

          item.up_link_ids ||= []
          item.up_link_ids.append(up_link_id)

          if prefix_match = /^([a-zA-Z]+)-\d+/.match(up_link_id) # SRS
            # multiple documents could be up-linked
            doc.up_link_docs[prefix_match[1].downcase.to_s] = prefix_match[1].downcase
          end
        end
      end

      doc.items.append(item)
      # for statistics
      if doc.dictionary.key?(id.to_s)
        doc.duplicated_ids_number += 1
        doc.duplicates_list.append(item)
      else
        doc.dictionary[id.to_s] = item # for fast search
      end
      doc.controlled_items.append(item) # for fast search

      # for statistics; an id without digits counts as 0 instead of raising
      n = id[/\d+/].to_i
      if n > doc.last_used_id_number
        doc.last_used_id = id
        doc.last_used_id_number = n
      end

    elsif res = /^!\[(.*)\]\((.*)\)/.match(s) # Image

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      img_text = res[1]
      img_path = res[2]

      item = Image.new(img_text, img_path)
      item.parent_doc = doc
      item.parent_heading = doc.headings[-1]

      doc.items.append(item)

    elsif /^(\*\s+)(.*)/.match(s) # check if unordered list start

      # dummy section if root is not a Document Title (level 0)
      add_dummy_title.call if doc.title == ''

      temp_md_table = process_temp_table(doc, temp_md_table)

      if temp_md_list
        temp_md_list.add_row(s)
      else
        item = MarkdownList.new(doc, false)
        item.add_row(s)
        temp_md_list = item
      end

    elsif /^\d[.]\s(.*)/.match(s) # check if ordered list start

      temp_md_table = process_temp_table(doc, temp_md_table)

      if temp_md_list
        temp_md_list.add_row(s)
      else
        item = MarkdownList.new(doc, true)
        item.add_row(s)
        temp_md_list = item
      end

    elsif /^[+](-*[+])/.match(s) # try to ignore Grid Table borders

      # nothing to do: the border line carries no content

    elsif (s[0] == '|') || (s[0] == '+') # check if table

      # dummy section if root is not a Document Title (level 0)
      add_dummy_title.call if doc.title == ''

      flush_list.call

      # check if it is a separator first
      if /^[|]\s?(:?)(-{3,})(:?)\s?[|]/.match(s) || /^[+]\s?(:?)(={3,})(:?)\s?[+]/.match(s)

        if temp_md_table
          # separator is found after heading
          temp_md_table.is_separator_detected = true
          temp_md_table.add_separator(s)
        else
          # separator out of table scope consider it just as a regular paragraph
          item = Paragraph.new(doc, s)
          doc.items.append(item)
        end

      elsif res = /^[|](.*[|])/.match(s) # check if it looks as a table row

        row = res[1]

        if temp_md_table
          if temp_md_table.is_separator_detected # if there is a separator
            # check if parent doc is a Protocol
            if doc.instance_of? Protocol
              # check if it is a controlled table
              tmp = /(.*)\s+>\[(\S*)\]/.match(row)
              if tmp && (temp_md_table.instance_of? MarkdownTable)
                # this is not a regular Markdown table
                # so the table type shall be changed and this row shall be passed one more time
                temp_md_table = ControlledTable.new(doc, temp_md_table)
              end
            end
            temp_md_table.add_row(row)
          else
            # replace table heading with regular paragraph
            item = Paragraph.new(doc, temp_md_table.heading_row)
            doc.items.append(item)
            # and current row
            item = Paragraph.new(doc, s)
            doc.items.append(item)
            temp_md_table = nil
          end
        else
          # start table from heading
          temp_md_table = MarkdownTable.new(doc, s)
        end
      end

    elsif res = /^>(.*)/.match(s) # check if blockquote

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      item = Blockquote.new(res[1])
      item.parent_doc = doc
      item.parent_heading = doc.headings[-1]
      doc.items.append(item)

    elsif res = /^```(\w*)/.match(s) # check if code block

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      # the capture is always present; it is an empty string for a bare fence
      suggested_format = res[1]

      if temp_code_block
        # close already opened block
        doc.items.append(temp_code_block)
        temp_code_block = nil
      else
        # start code block
        temp_code_block = CodeBlock.new(suggested_format)
        temp_code_block.parent_doc = doc
        temp_code_block.parent_heading = doc.headings[-1]
      end

    elsif res = /^TODO:(.*)/.match(s) # check if TODO block

      temp_md_table = process_temp_table(doc, temp_md_table)
      flush_list.call

      text = "**TODO**: #{res[1]}"

      item = TodoBlock.new(text)
      item.parent_doc = doc
      item.parent_heading = doc.headings[-1]
      doc.items.append(item)
      doc.todo_blocks.append(item)

    else # Regular Paragraph

      temp_md_table = process_temp_table(doc, temp_md_table)
      # a line that continues the current list stays in the list
      if temp_md_list && (MarkdownList.unordered_list_item?(s) || MarkdownList.ordered_list_item?(s))
        temp_md_list.add_row(s)
        next
      end
      flush_list.call

      if temp_code_block
        temp_code_block.code_lines.append(s)
      else
        item = Paragraph.new(doc, s)
        doc.items.append(item)
      end
    end
  end

  # Finalize non-closed elements
  process_temp_table(doc, temp_md_table)
  flush_list.call
  doc.items.append temp_code_block if temp_code_block

  # Add footer to close opened tables if any
  item = DocFooter.new
  item.parent_doc = doc
  doc.items.append(item)
end

.process_temp_table(doc, temp_md_table) ⇒ Object



369
370
371
372
373
374
375
376
377
378
379
380
381
# File 'lib/almirah/doc_parser.rb', line 369

# Flushes a table that is still being collected into the document.
#
# A table whose separator row was detected is appended to +doc.items+ as is.
# Without a separator the candidate was not a real table, so its heading row
# is appended as a regular Paragraph instead.
#
# @param doc [Object] document whose +items+ collection receives the result
# @param temp_md_table [Object, nil] table being collected, or nil when there is none
# @return [nil, false] always a falsy value, so the caller can assign the
#   result back to its "current table" variable
def self.process_temp_table(doc, temp_md_table)
  # nothing is being collected: hand the falsy value straight back
  return temp_md_table unless temp_md_table

  flushed =
    if temp_md_table.is_separator_detected
      temp_md_table
    else
      # no separator: replace the table heading with a regular paragraph
      Paragraph.new(doc, temp_md_table.heading_row)
    end
  doc.items.append(flushed)
  nil
end

.try_to_extract_frontmatter(doc, text_lines) ⇒ Object

rubocop:disable Metrics/MethodLength



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/almirah/doc_parser.rb', line 17

# Extracts a leading frontmatter block from the document lines.
#
# Frontmatter is recognized only when the very first line starts with three or
# more dashes and a later line starting with three or more dashes closes it.
# The lines between the two delimiters are concatenated, passed to
# Frontmatter.new and stored in +doc.frontmatter+; the delimiters and the
# frontmatter lines are then removed from +text_lines+ in place.
#
# When the opening delimiter is never closed, nothing is extracted and
# +text_lines+ is left untouched, so the document content is not discarded.
#
# @param doc [Object] document that receives the +frontmatter+ attribute
# @param text_lines [Array<String>] raw lines of the document (mutated in place)
# @return [Array<String>] the same +text_lines+ array without the frontmatter
def self.try_to_extract_frontmatter(doc, text_lines)
  delimiter = /^(-{3,})/
  # Regexp#match returns nil for nil, so an empty document falls through here
  return text_lines unless delimiter.match(text_lines[0])

  # index of the closing delimiter, searched after the opening line
  closing_index = (1...text_lines.length).find { |i| delimiter.match(text_lines[i]) }
  return text_lines unless closing_index

  doc.frontmatter = Frontmatter.new(text_lines[1...closing_index].join)
  # drop the opening delimiter, the frontmatter body and the closing delimiter
  text_lines.shift(closing_index + 1)
  text_lines
end