Class: DocParser
- Inherits:
-
Object
- Object
- DocParser
- Defined in:
- lib/almirah/doc_parser.rb
Overview
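Parses the raw text lines of a Markdown document into document items: headings, controlled paragraphs, images, lists, tables, blockquotes, code blocks and TODO blocks. (The source comment at this position is only a RuboCop directive: rubocop:disable Metrics/ClassLength,Style/Documentation.)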
Class Method Summary collapse
- .parse(doc, text_lines) ⇒ Object
- .process_temp_table(doc, temp_md_table) ⇒ Object
-
.try_to_extract_frontmatter(doc, text_lines) ⇒ Object
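Extracts a leading frontmatter block delimited by `---` lines, stores it on the document and returns the remaining lines. (The source comment at this position is only a RuboCop directive: rubocop:disable Metrics/MethodLength.)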
Class Method Details
.parse(doc, text_lines) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 |
# File 'lib/almirah/doc_parser.rb', line 41
#
# Walks over the lines of one document and appends the recognised items
# (headings, controlled paragraphs, images, lists, tables, blockquotes,
# code blocks, TODO blocks and plain paragraphs) to +doc.items+, updating
# the document's lookup and statistics fields on the way.
#
# A level-0 heading named "<doc.id>.md" is always created first; its text
# is replaced by the frontmatter 'title' parameter or by a Pandoc title
# line ("% Title") when one is present. A DocFooter is appended last.
#
# @param doc [Object] document being populated (project type)
# @param text_lines [Array<String>] raw lines of the document
# @return [Array] +doc.items+, i.e. the result of the final +append+
def self.parse(doc, text_lines)
  temp_md_table = nil
  temp_md_list = nil
  temp_code_block = nil

  # restart section numbering for each new document
  Heading.reset_global_section_number
  # try to get frontmatter first
  text_lines = try_to_extract_frontmatter(doc, text_lines)
  # there is no document without a heading
  append_dummy_heading(doc)
  # replace the dummy title with the one extracted from frontmatter
  if doc.frontmatter && doc.frontmatter.parameters.key?('title')
    doc.title = doc.frontmatter.parameters['title']
    doc.headings[0].text = doc.frontmatter.parameters['title']
  end

  # main loop
  text_lines.each do |s|
    if s.lstrip == ''
      # lists are separated by an empty line from each other
      temp_md_list = flush_temp_list(doc, temp_md_list)
      next
    end

    # Inside an open code fence every line except the closing fence is raw
    # code; without this guard lines such as "# comment" or "* item" were
    # turned into headings and lists.
    # NOTE(review): blank lines inside a fence are still skipped by the
    # check above, exactly as before - confirm whether they should be kept.
    if temp_code_block && !/^```/.match?(s)
      temp_code_block.code_lines.append(s)
      next
    end

    if (res = /^(\#{1,})\s(.*)/.match(s)) # Heading
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      item = Heading.new(doc, res[2], res[1].length)
      doc.items.append(item)
      doc.headings.append(item)
    elsif (res = /^%\s(.*)/.match(s)) # Pandoc Document Title
      # rewrite the title of the document and of its root heading
      doc.title = res[1]
      doc.headings[0].text = res[1]
    elsif (res = /^\[(\S*)\]\s+(.*)/.match(s)) # Controlled Paragraph
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      append_controlled_paragraph(doc, res[1].upcase, res[2])
    elsif (res = /^!\[(.*)\]\((.*)\)/.match(s)) # Image
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      append_under_last_heading(doc, Image.new(res[1], res[2]))
    elsif /^(\*\s+)(.*)/.match?(s) # unordered list item
      # dummy section if root is not a Document Title (level 0)
      append_dummy_heading(doc) if doc.title == ''
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list ||= MarkdownList.new(doc, false)
      temp_md_list.add_row(s)
    elsif /^\d[.]\s(.*)/.match?(s) # ordered list item
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list ||= MarkdownList.new(doc, true)
      temp_md_list.add_row(s)
    elsif /^[+](-*[+])/.match?(s)
      # Grid Table borders are ignored
    elsif (s[0] == '|') || (s[0] == '+') # table line
      # dummy section if root is not a Document Title (level 0)
      append_dummy_heading(doc) if doc.title == ''
      temp_md_list = flush_temp_list(doc, temp_md_list)
      temp_md_table = parse_table_line(doc, s, temp_md_table)
    elsif (res = /^>(.*)/.match(s)) # Blockquote
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      append_under_last_heading(doc, Blockquote.new(res[1]))
    elsif (res = /^```(\w*)/.match(s)) # Code block fence
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      if temp_code_block
        # close the already opened block
        doc.items.append(temp_code_block)
        temp_code_block = nil
      else
        # start a code block; res[1] is the (possibly empty) format name
        temp_code_block = CodeBlock.new(res[1])
        temp_code_block.parent_doc = doc
        temp_code_block.parent_heading = doc.headings[-1]
      end
    elsif (res = /^TODO:(.*)/.match(s)) # TODO block
      temp_md_table = process_temp_table(doc, temp_md_table)
      temp_md_list = flush_temp_list(doc, temp_md_list)
      item = append_under_last_heading(doc, TodoBlock.new("**TODO**: #{res[1]}"))
      doc.todo_blocks.append(item)
    else # Regular Paragraph
      temp_md_table = process_temp_table(doc, temp_md_table)
      if temp_md_list
        # a line that MarkdownList still recognises as a list item extends the list
        if MarkdownList.unordered_list_item?(s) || MarkdownList.ordered_list_item?(s)
          temp_md_list.add_row(s)
          next
        end
        temp_md_list = flush_temp_list(doc, temp_md_list)
      end
      doc.items.append(Paragraph.new(doc, s))
    end
  end

  # finalize non-closed elements
  process_temp_table(doc, temp_md_table)
  flush_temp_list(doc, temp_md_list)
  doc.items.append(temp_code_block) if temp_code_block

  # add footer to close opened tables if any
  footer = DocFooter.new
  footer.parent_doc = doc
  doc.items.append(footer)
end

# Helper of .parse: appends the level-0 heading "<doc.id>.md" and makes it the document title.
def self.append_dummy_heading(doc)
  title = "#{doc.id}.md"
  item = Heading.new(doc, title, 0)
  doc.items.append(item)
  doc.headings.append(item)
  doc.title = title
end

# Helper of .parse: appends a pending list (if any) to the document; always returns nil
# so the caller can reset its list variable with the result.
def self.flush_temp_list(doc, temp_md_list)
  doc.items.append(temp_md_list) if temp_md_list
  nil
end

# Helper of .parse: links +item+ to the document and its latest heading, appends it and returns it.
def self.append_under_last_heading(doc, item)
  item.parent_doc = doc
  item.parent_heading = doc.headings[-1]
  doc.items.append(item)
  item
end

# Helper of .parse: builds a ControlledParagraph from "[ID] text >[UP-LINK], ...",
# registers it in the document and updates the up-link / duplicate / last-id statistics.
def self.append_controlled_paragraph(doc, id, text)
  up_links = nil
  # check if it contains the uplink (one or many)
  # TODO: check this regular expression
  found = text.scan(/(>\[(?>[^\[\]]|\g<0>)*\])/) # >[SRS-001], >[SYS-002]
  unless found.empty?
    up_links = []
    first_pos = text.length
    found.each do |ul|
      lnk = ul[0]
      # do not add links for the self document
      doc_id = /([a-zA-Z]+)-\d+/.match(lnk) # SRS
      up_links << lnk.upcase if doc_id && (doc_id[1].downcase != doc.id.downcase)
      # try to find the real end of text
      pos = text.index(lnk)
      first_pos = pos if pos < first_pos
    end
    # Cut everything from the first up-link on. The exclusive range matters:
    # with an inclusive "0..first_pos - 1" a link at position 0 produced
    # "0..-1", i.e. the whole text, and the link was never removed.
    text = text[0...first_pos].strip
  end

  # since we already know id and text
  item = ControlledParagraph.new(doc, text, id)
  if up_links
    up_links.uniq! # remove duplicates
    doc.items_with_uplinks_number += 1 # for statistics
    up_links.each do |ul|
      next unless (link = />\[(\S*)\]$/.match(ul)) # >[SRS-001]

      up_link_id = link[1]
      item.up_link_ids ||= []
      item.up_link_ids.append(up_link_id)
      # multiple documents could be up-linked
      if (prefix = /^([a-zA-Z]+)-\d+/.match(up_link_id)) # SRS
        doc.up_link_docs[prefix[1].downcase] = prefix[1].downcase
      end
    end
  end
  doc.items.append(item)

  # for statistics
  if doc.dictionary.key?(id)
    doc.duplicated_ids_number += 1
    doc.duplicates_list.append(item)
  else
    doc.dictionary[id] = item # for fast search
  end
  doc.controlled_items.append(item) # for fast search

  # for statistics; an id without digits (e.g. "[NOTE] ...") has no number
  # to compare and used to raise NoMethodError on nil here
  digits = id[/\d+/]
  return item unless digits

  n = digits.to_i
  if n > doc.last_used_id_number
    doc.last_used_id = id
    doc.last_used_id_number = n
  end
  item
end

# Helper of .parse: handles one line starting with '|' or '+' and returns the
# pending table that the caller has to keep (nil when no table is pending).
def self.parse_table_line(doc, s, temp_md_table)
  # check if it is a separator first
  if /^[|]\s?(:?)(-{3,})(:?)\s?[|]/.match(s) || /^[+]\s?(:?)(={3,})(:?)\s?[+]/.match(s)
    if temp_md_table
      # separator is found after heading
      temp_md_table.is_separator_detected = true
      temp_md_table.add_separator(s)
    else
      # separator out of table scope is considered a regular paragraph
      doc.items.append(Paragraph.new(doc, s))
    end
  elsif (res = /^[|](.*[|])/.match(s)) # looks like a table row
    row = res[1]
    if !temp_md_table
      # start table from heading
      temp_md_table = MarkdownTable.new(doc, s)
    elsif temp_md_table.is_separator_detected
      # In a Protocol document a row ending with an up-link turns a plain
      # Markdown table into a controlled one before the row is added.
      if doc.instance_of?(Protocol) && /(.*)\s+>\[(\S*)\]/.match(row) &&
         temp_md_table.instance_of?(MarkdownTable)
        temp_md_table = ControlledTable.new(doc, temp_md_table)
      end
      temp_md_table.add_row(row)
    else
      # no separator after the heading row: replace the table heading
      # with a regular paragraph, and the current row as well
      doc.items.append(Paragraph.new(doc, temp_md_table.heading_row))
      doc.items.append(Paragraph.new(doc, s))
      temp_md_table = nil
    end
  end
  temp_md_table
end
.process_temp_table(doc, temp_md_table) ⇒ Object
369 370 371 372 373 374 375 376 377 378 379 380 381 |
# File 'lib/almirah/doc_parser.rb', line 369
#
# Flushes a pending Markdown table into the document.
#
# A table whose separator row was detected is appended as is; otherwise only
# its heading row is appended, wrapped in a regular Paragraph.
#
# @param doc [Object] document whose +items+ receive the flushed element
# @param temp_md_table [Object, nil] pending table, or nil when none is open
# @return [nil] after a flush; a falsy argument is handed back unchanged
def self.process_temp_table(doc, temp_md_table)
  # nothing pending: nothing to flush
  return temp_md_table unless temp_md_table

  flushed = if temp_md_table.is_separator_detected
              temp_md_table
            else
              # no separator: replace the table heading with a regular paragraph
              Paragraph.new(doc, temp_md_table.heading_row)
            end
  doc.items.append(flushed)
  nil
end
.try_to_extract_frontmatter(doc, text_lines) ⇒ Object
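Extracts a leading frontmatter block delimited by `---` lines, stores it on the document and returns the remaining lines. (The source comment at this position is only a RuboCop directive: rubocop:disable Metrics/MethodLength.)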
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/almirah/doc_parser.rb', line 17
#
# Detects a frontmatter block at the very top of the document: a line
# starting with three or more dashes, any number of content lines, and a
# second such line. When the block is complete its content is stored as
# +doc.frontmatter+ and the block (delimiters included) is removed from
# +text_lines+ in place.
#
# When the opening delimiter is never closed nothing is removed and no
# frontmatter is set. Previously every line was counted for removal in that
# case, so a document that merely started with "---" lost all its content.
#
# @param doc [Object] document that receives the Frontmatter instance
# @param text_lines [Array<String>] raw lines; mutated when a block is found
# @return [Array<String>] the same +text_lines+ array
def self.try_to_extract_frontmatter(doc, text_lines)
  delimiter = /^(-{3,})/
  return text_lines if text_lines.empty? || !delimiter.match?(text_lines[0])

  # offset of the closing delimiter, counted from the line after the opening one
  closing_offset = text_lines.drop(1).index { |line| delimiter.match?(line) }
  return text_lines unless closing_offset

  doc.frontmatter = Frontmatter.new(text_lines[1, closing_offset].join)
  # opening delimiter + content lines + closing delimiter
  text_lines.shift(closing_offset + 2)
  text_lines
end