Module: MaRuKu::In::Markdown::BlockLevelParser
- Includes:
- Helpers, SpanLevelParser, Strings
- Included in:
- MDDocument
- Defined in:
- lib/maruku.rb,
lib/maruku/input/parse_doc.rb,
lib/maruku/input/linesource.rb,
lib/maruku/input/parse_block.rb
Defined Under Namespace
Classes: LineSource
Constant Summary
Constants included from SpanLevelParser
SpanLevelParser::CharSource, SpanLevelParser::EscapedCharInInlineCode, SpanLevelParser::EscapedCharInQuotes, SpanLevelParser::EscapedCharInText, SpanLevelParser::R_REF_ID, SpanLevelParser::SPACE
Constants included from Strings
Strings::Abbreviation, Strings::AttributeDefinitionList, Strings::Definition, Strings::EMailAddress, Strings::FootnoteText, Strings::HeaderWithAttributes, Strings::HeaderWithId, Strings::IncompleteLink, Strings::LinkRegex, Strings::MightBeTableHeader, Strings::Sep, Strings::TabSize, Strings::TableSeparator, Strings::URL
Instance Method Summary collapse
-
#eventually_comes_a_def_list(src) ⇒ Object
If the current line is text, a definition list is coming when the following lines match the pattern: text, empty, *, definition.
-
#expand_attribute_list(al, result) ⇒ Object
Expands an attribute list into a Hash.
- #parse_blocks(src) ⇒ Object
- #parse_doc(s) ⇒ Object
-
#parse_text_as_markdown(text) ⇒ Object
Splits the string and calls parse_lines_as_markdown.
- #read_abbreviation(src) ⇒ Object
- #read_ald(src) ⇒ Object
- #read_code(src) ⇒ Object
- #read_definition(src) ⇒ Object
- #read_footnote_text(src) ⇒ Object
-
#read_header12(src) ⇒ Object
Reads a header (underlined with ----- or ========).
-
#read_header3(src) ⇒ Object
reads a header like ‘#### header ####’.
-
#read_indented_content(src, indentation, break_list, item_type) ⇒ Object
This is the only ugly function in the code base.
-
#read_list_item(src) ⇒ Object
Reads one list item, either ordered or unordered.
-
#read_metadata(src) ⇒ Object
Reads a series of metadata lines with empty lines in between.
- #read_paragraph(src) ⇒ Object
- #read_quote(src) ⇒ Object
- #read_raw_html(src) ⇒ Object
- #read_ref_definition(src) ⇒ Object
- #read_table(src) ⇒ Object
- #search_abbreviations ⇒ Object
- #split_cells(s) ⇒ Object
-
#substitute_markdown_inside_raw_html ⇒ Object
(PHP Markdown extra) Search for elements that have markdown=1 or markdown=block defined.
Methods included from SpanLevelParser
#describe_pos, #is_ial, #md_al, #parse_lines_as_span, #parse_span_better, #read_attribute_list, #read_em, #read_email_el, #read_emstrong, #read_footnote_ref, #read_image, #read_inline_code, #read_inline_html, #read_link, #read_quoted, #read_quoted_or_unquoted, #read_ref_id, #read_server_directive, #read_simple, #read_span, #read_strong, #read_url, #read_url_el, #unit_tests_for_attribute_lists
Methods included from Helpers
#md_abbr, #md_abbr_def, #md_ald, #md_br, #md_code, #md_codeblock, #md_el, #md_em, #md_email, #md_emstrong, #md_entity, #md_foot_ref, #md_footnote, #md_header, #md_hrule, #md_html, #md_ial, #md_im_image, #md_im_link, #md_image, #md_li, #md_link, #md_par, #md_quote, #md_ref_def, #md_server, #md_strong, #md_url
Methods included from Strings
#add_tabs, #dbg_describe_ary, #force_linebreak?, #line_md_type, #normalize_key_and_value, #num_leading_hashes, #number_of_leading_spaces, #parse_email_headers, #spaces_before_first_char, #split_lines, #strip_hashes, #strip_indent, #unquote
Instance Method Details
#eventually_comes_a_def_list(src) ⇒ Object
If the current line is text, a definition list is coming when the following lines match the pattern: text, empty, *, definition.
503 504 505 506 507 508 |
# File 'lib/maruku/input/parse_block.rb', line 503
#
# Tells whether a definition list is about to start.
#
# Asks +src+ for its look-ahead string and matches it against
# <tt>^t+e?d</tt>. Presumably each character encodes the type of one
# upcoming line (t = text, e = empty, d = definition) -- confirm against
# LineSource#tell_me_the_future.
#
# @param src [#tell_me_the_future] the line source being parsed
# @return [Integer, nil] the match position, or nil when nothing matches
def eventually_comes_a_def_list(src)
  upcoming = src.tell_me_the_future
  # puts "future: #{upcoming}"
  upcoming =~ %r{^t+e?d}x
end
#expand_attribute_list(al, result) ⇒ Object
Expands an attribute list into a Hash
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
# File 'lib/maruku/input/parse_doc.rb', line 79
#
# Expands the attribute list +al+ into the Hash +result+, mutating +result+.
#
# Handling per key:
# * +:class+ -- values are accumulated, separated by a single space.
# * +:id+    -- the last value wins.
# * +:ref+   -- if <tt>self.ald[v]</tt> exists, that list is expanded
#   recursively (each reference at most once; a repeat is reported through
#   +maruku_error+ as a circular reference). Otherwise the name is appended
#   to +:unresolved_references+ and also set as a boolean flag
#   (<tt>result[v.to_sym] = true</tt>).
# * anything else -- stored under <tt>k.to_sym</tt>.
#
# @param al [Enumerable] pairs of (key, value)
# @param result [Hash] destination hash, updated in place
# @return [Object] whatever <tt>al.each</tt> returns
def expand_attribute_list(al, result)
  al.each do |k, v|
    case k
    when :class
      if not result[:class]
        result[:class] = v
      else
        result[:class] += " " + v
      end
    when :id
      result[:id] = v
    when :ref
      if self.ald[v]
        already = (result[:expanded_references] ||= [])
        if not already.include?(v)
          already.push v
          expand_attribute_list(self.ald[v], result)
        else
          maruku_error "Circular reference: #{v} already seen\n"+
            already.inspect
        end
      else
        if not result[:unresolved_references]
          # Store a copy: the `<<` below mutates this string in place, and
          # without the dup it would silently rewrite the value held in +al+.
          result[:unresolved_references] = v.dup
        else
          result[:unresolved_references] << " #{v}"
        end
        result[v.to_sym] = true
      end
    else
      result[k.to_sym]=v
    end
  end
end
#parse_blocks(src) ⇒ Object
35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
# File 'lib/maruku/input/parse_block.rb', line 35
#
# Parses the lines of +src+ into a list of block-level elements.
#
# The loop dispatches on <tt>src.cur_line.md_type</tt> and delegates to the
# matching +read_*+ method, which consumes lines from +src+. After the loop,
# each list or definition list whose children all report a falsy
# +want_my_paragraph+ has its wrapping first child removed.
#
# @param src the line source (must respond to cur_line, next_line,
#   shift_line and ignore_line)
# @return [Array] the elements that were read, in document order
def parse_blocks(src)
  output = [];

  # run state machine
  while src.cur_line

    # Prints detected type (useful for debugging)
    #puts "#{src.cur_line.md_type}|#{src.cur_line}"
    case src.cur_line.md_type
    when :empty;
      src.ignore_line
    when :ial
      # An inline attribute list applies to the element read just before it.
      src.shift_line =~ /\s*\{([^\}]*)\}\s*/
      al = $1
      al = read_attribute_list(CharSource.new(al), context=nil, break_on=[nil])
      if not output.empty?
        output.last.al = al
      else
        maruku_error "An attribute list at beginning of context {#{al.to_md}}"
        tell_user "I will ignore this AL: {#{al.to_md}}"
      end
    when :ald
      output << read_ald(src)
    when :text
      # NOTE(review): src.next_line.md_type is called below without a nil
      # check; presumably nil responds to md_type elsewhere -- confirm.
      if src.cur_line =~ MightBeTableHeader and
        (src.next_line && src.next_line =~ TableSeparator)
        output << read_table(src)
      elsif [:header1,:header2].include? src.next_line.md_type
        output << read_header12(src)
      elsif eventually_comes_a_def_list(src)
        definition = read_definition(src)
        # Consecutive definitions are merged into one definition list.
        if output.last && output.last.node_type == :definition_list
          output.last.children << definition
        else
          output << md_el(:definition_list, [definition])
        end
      else # Start of a paragraph
        output << read_paragraph(src)
      end
    when :header2, :hrule
      # hrule
      src.shift_line
      output << md_hrule()
    when :header3
      output << read_header3(src)
    when :ulist, :olist
      list_type = src.cur_line.md_type == :ulist ? :ul : :ol
      li = read_list_item(src)
      # append to current list if we have one
      if output.last && output.last.node_type == list_type
        output.last.children << li
      else
        output << md_el(list_type, [li])
      end
    when :quote; output << read_quote(src)
    when :code; e = read_code(src); output << e if e
    when :raw_html; e = read_raw_html(src); output << e if e
    when :footnote_text; output << read_footnote_text(src)
    when :ref_definition; output << read_ref_definition(src)
    when :abbreviation; output << read_abbreviation(src)
    #
    # these do not produce output
    when :metadata;
      maruku_error "Please use the new meta-data syntax: \n"+
        " http://maruku.rubyforge.org/proposal.html\n", src
      src.ignore_line
    # warn if we forgot something
    else
      md_type = src.cur_line.md_type
      line = src.cur_line
      maruku_error "Ignoring line '#{line}' type = #{md_type}", src
      src.shift_line
    end
    # FIXME
    # if current_metadata and output.last
    #   output.last.meta.merge! current_metadata
    #   current_metadata = nil
    #   puts "meta for #{output.last.node_type}\n #{output.last.meta.inspect}"
    # end
    # current_metadata = just_read_metadata
    # just_read_metadata = nil
  end
  # See for each list if we can omit the paragraphs and use li_span
  # TODO: do this after
  output.each do |c|
    # Remove paragraphs that we can get rid of
    if [:ul,:ol].include? c.node_type
      if c.children.all? {|li| !li.want_my_paragraph} then
        c.children.each do |d|
          # Retag the item and replace its children with those of its first child.
          d.node_type = :li_span
          d.children = d.children[0].children
        end
      end
    end
    if c.node_type == :definition_list
      if c.children.all?{|defi| !defi.want_my_paragraph} then
        c.children.each do |definition|
          definition.definitions.each do |dd|
            dd.children = dd.children[0].children
          end
        end
      end
    end
  end
  output
end
#parse_doc(s) ⇒ Object
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/maruku/input/parse_doc.rb', line 26
#
# Parses the whole document text +s+ and populates this document.
#
# Steps, in order:
# 1. +parse_email_headers+ splits +s+ into header attributes and the body
#    (stored under +:data+); the headers are merged into +attributes+.
# 2. If an +:encoding+ attribute other than utf-8 is given, the body is
#    converted to UTF-8 with Iconv.
# 3. The body is parsed into <tt>@children</tt>, then abbreviations and
#    markdown inside raw HTML are processed.
# 4. A table of contents is created and stored; its header element supplies
#    the +:title+ attribute when none was set.
# 5. For every element, the default attribute list registered under its
#    node type (if any) and then its own attribute list are expanded into
#    the element's attributes.
#
# NOTE(review): the name of the local holding the parsed headers and the
# name of the method called in step 5 were missing from this listing. They
# are restored here as +meta+ and +expand_attribute_list+ (the method in
# this module taking an attribute list and a result hash) -- confirm against
# lib/maruku/input/parse_doc.rb.
#
# @param s [String] the raw document text
def parse_doc(s)
  meta = parse_email_headers(s)
  data = meta[:data]
  meta.delete :data

  self.attributes.merge! meta

  enc = self.attributes[:encoding]
  self.attributes.delete :encoding
  if enc && enc.downcase != 'utf-8'
    # puts "Converting from #{enc} to UTF-8."
    converted = Iconv.new('utf-8', enc).iconv(data)
    # puts "Data: #{data.inspect}: #{data}"
    # puts "Conv: #{converted.inspect}: #{converted}"
    data = converted
  end

  @children = parse_text_as_markdown(data)

  if true #markdown_extra?
    self.search_abbreviations
    self.substitute_markdown_inside_raw_html
  end

  toc = create_toc

  # use title if not set
  if not self.attributes[:title] and toc.header_element
    title = toc.header_element.to_s
    self.attributes[:title] = title
    # puts "Set document title to #{title}"
  end

  # save for later use
  self.toc = toc

  # Now do the attributes magic
  each_element do |e|
    # default attribute list
    if default = self.ald[e.node_type.to_s]
      expand_attribute_list(default, e.attributes)
    end
    expand_attribute_list(e.al, e.attributes)
    # puts "#{e.node_type}: #{e.attributes.inspect}"
  end

  # puts self.inspect
end
#parse_text_as_markdown(text) ⇒ Object
Splits the string and calls parse_lines_as_markdown
29 30 31 32 33 |
# File 'lib/maruku/input/parse_block.rb', line 29
#
# Splits +text+ into lines, wraps them in a LineSource and parses the
# result with +parse_blocks+.
#
# @param text [String] markdown text
# @return whatever +parse_blocks+ returns for the wrapped lines
def parse_text_as_markdown(text)
  line_source = LineSource.new(split_lines(text))
  parse_blocks(line_source)
end
#read_abbreviation(src) ⇒ Object
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 |
# File 'lib/maruku/input/parse_block.rb', line 260
#
# Consumes one line from +src+, matches it against +Abbreviation+ and
# registers the captured abbreviation/description pair in
# <tt>self.abbreviations</tt>.
#
# A line that fails to match, or an empty abbreviation, is reported through
# +maruku_error+; processing then continues with whatever was captured.
#
# @param src the line source
# @return the element built by +md_abbr_def+
def read_abbreviation(src)
  line = src.shift_line
  unless line =~ Abbreviation
    maruku_error "Bug: it's Andrea's fault. Tell him.\n#{line.inspect}"
  end

  # Capture groups of the match above (both nil when it failed).
  abbr, desc = $1, $2

  if !abbr || abbr.size == 0
    maruku_error "Bad abbrev. abbr=#{abbr.inspect} desc=#{desc.inspect}"
  end

  self.abbreviations[abbr] = desc
  md_abbr_def(abbr, desc)
end
#read_ald(src) ⇒ Object
146 147 148 149 150 151 152 153 154 155 156 |
# File 'lib/maruku/input/parse_block.rb', line 146
#
# Consumes one attribute-definition-list line from +src+, parses its
# attribute list and stores it in <tt>self.ald</tt> under the captured id.
#
# @param src the line source
# @return the element built by +md_ald+, or nil (after reporting through
#   +maruku_error+) when the line does not match +AttributeDefinitionList+
def read_ald(src)
  line = src.shift_line
  unless line =~ AttributeDefinitionList
    maruku_error "Bug Bug:\n#{line.inspect}"
    return nil
  end

  id, raw_list = $1, $2
  parsed = read_attribute_list(CharSource.new(raw_list), nil, [nil])
  self.ald[id] = parsed
  md_ald(id, parsed)
end
#read_code(src) ⇒ Object
383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 |
# File 'lib/maruku/input/parse_block.rb', line 383
#
# Reads a code block: consumes consecutive +:code+ and +:empty+ lines,
# removing 4 columns of indentation from each, then drops blank lines at
# both ends.
#
# @param src the line source
# @return the element built by +md_codeblock+, or nil if nothing but blank
#   lines was collected
def read_code(src)
  collected = []

  # collect all indented lines
  while src.cur_line && [:code, :empty].include?(src.cur_line.md_type)
    collected << strip_indent(src.shift_line, 4)
  end

  # trim blank lines at the end, then at the start
  collected.pop while collected.last && collected.last.strip.size == 0
  collected.shift while collected.first && collected.first.strip.size == 0

  return nil if collected.empty?

  # dbg_describe_ary(collected, 'CODE')
  md_codeblock(collected.join("\n"))
end
#read_definition(src) ⇒ Object
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 |
# File 'lib/maruku/input/parse_block.rb', line 511 def read_definition(src) # Read one or more terms terms = [] while src.cur_line && src.cur_line.md_type == :text terms << md_el(:definition_term, parse_lines_as_span([src.shift_line])) end # dbg_describe_ary(terms, 'DT') want_my_paragraph = false raise "Chunky Bacon!" if not src.cur_line # one optional empty if src.cur_line.md_type == :empty want_my_paragraph = true src.shift_line end raise "Chunky Bacon!" if src.cur_line.md_type != :definition # Read one or more definitions definitions = [] while src.cur_line && src.cur_line.md_type == :definition parent_offset = src.cur_index first = src.shift_line first =~ Definition first = $1 # I know, it's ugly!!! lines, w_m_p = read_indented_content(src,4, [:definition], :definition) want_my_paragraph ||= w_m_p lines.unshift first # dbg_describe_ary(lines, 'DD') src2 = LineSource.new(lines, src, parent_offset) children = parse_blocks(src2) definitions << md_el(:definition_data, children) end return md_el(:definition, terms+definitions, { :terms => terms, :definitions => definitions, :want_my_paragraph => want_my_paragraph}) end |
#read_footnote_text(src) ⇒ Object
277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
# File 'lib/maruku/input/parse_block.rb', line 277
#
# Reads the text of one footnote: the marker line plus its indented
# continuation (read with a fixed indentation of 4), parses the collected
# lines as blocks and registers the result in <tt>self.footnotes</tt> under
# the footnote id.
#
# @param src the line source
# @return the element built by +md_footnote+
def read_footnote_text(src)
  parent_offset = src.cur_index
  marker_line = src.shift_line

  unless marker_line =~ FootnoteText
    maruku_error "Bug (it's Andrea's fault)"
  end
  id, text = $1, $2

  # Ugly things going on inside `read_indented_content`
  body_lines, _want_my_paragraph =
    read_indented_content(src, 4, [:footnote_text], :footnote_text)

  # add first line, unless it is blank
  body_lines.unshift(text) if text && text.strip != ""

  # dbg_describe_ary(body_lines, 'FOOTNOTE')
  nested = LineSource.new(body_lines, src, parent_offset)
  footnote = md_footnote(id, parse_blocks(nested))
  self.footnotes[id] = footnote
  footnote
end
#read_header12(src) ⇒ Object
Reads a header (underlined with ----- or ========)
159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# File 'lib/maruku/input/parse_block.rb', line 159
#
# Reads a two-line header: a text line followed by an underline line.
# The level is 2 when the underline line has type +:header2+, otherwise 1.
# A trailing <tt>{...}</tt> on the text line is parsed as an attribute list
# and removed from the header text.
#
# NOTE(review): in this listing the left operand of +and+ was missing
# (<tt>if and line =~ ...</tt>), which is not valid Ruby. It is restored
# here as +new_meta_data?+ on the assumption that this matches the upstream
# file -- confirm against lib/maruku/input/parse_block.rb.
#
# @param src the line source, positioned on the header text line
# @return the element built by +md_header+
def read_header12(src)
  line = src.shift_line.strip
  al = nil
  # Check if there is an IAL
  if new_meta_data? and line =~ /^(.*)\{(.*)\}\s*$/
    line = $1.strip
    ial = $2
    al = read_attribute_list(CharSource.new(ial), context=nil, break_on=[nil])
  end
  text = parse_lines_as_span [ line ]
  # The current line is now the underline; its type decides the level.
  level = src.cur_line.md_type == :header2 ? 2 : 1;
  src.shift_line
  return md_header(level, text, al)
end
#read_header3(src) ⇒ Object
reads a header like ‘#### header ####’
175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/maruku/input/parse_block.rb', line 175
#
# Reads a one-line header written with leading hashes. The level comes from
# +num_leading_hashes+ and the text from +strip_hashes+. A trailing
# <tt>{...}</tt> is parsed as an attribute list and removed first.
#
# NOTE(review): in this listing the left operand of +and+ was missing
# (<tt>if and line =~ ...</tt>), which is not valid Ruby. It is restored
# here as +new_meta_data?+ on the assumption that this matches the upstream
# file -- confirm against lib/maruku/input/parse_block.rb.
#
# @param src the line source, positioned on the header line
# @return the element built by +md_header+
def read_header3(src)
  line = src.shift_line.strip
  al = nil
  # Check if there is an IAL
  if new_meta_data? and line =~ /^(.*)\{(.*)\}\s*$/
    line = $1.strip
    ial = $2
    al = read_attribute_list(CharSource.new(ial), context=nil, break_on=[nil])
  end
  level = num_leading_hashes(line)
  text = parse_lines_as_span [strip_hashes(line)]
  return md_header(level, text, al)
end
#read_indented_content(src, indentation, break_list, item_type) ⇒ Object
This is the only ugly function in the code base. It is used to read list items, descriptions, footnote text
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 |
# File 'lib/maruku/input/parse_block.rb', line 314 def read_indented_content(src, indentation, break_list, item_type) lines =[] # collect all indented lines saw_empty = false; saw_anything_after = false while src.cur_line #puts "#{src.cur_line.md_type} #{src.cur_line.inspect}" if src.cur_line.md_type == :empty saw_empty = true lines << src.shift_line next end # after a white line if saw_empty # we expect things to be properly aligned if (ns=number_of_leading_spaces(src.cur_line)) < indentation #puts "breaking for spaces, only #{ns}: #{src.cur_line}" break end saw_anything_after = true else break if break_list.include? src.cur_line.md_type # break if src.cur_line.md_type != :text end stripped = strip_indent(src.shift_line, indentation) lines << stripped #puts "Accepted as #{stripped.inspect}" # You are only required to indent the first line of # a child paragraph. if stripped.md_type == :text while src.cur_line && (src.cur_line.md_type == :text) lines << strip_indent(src.shift_line, indentation) end end end want_my_paragraph = saw_anything_after || (saw_empty && (src.cur_line && (src.cur_line.md_type == item_type))) # dbg_describe_ary(lines, 'LI') # create a new context while lines.last && (lines.last.md_type == :empty) lines.pop end return lines, want_my_paragraph end |
#read_list_item(src) ⇒ Object
Reads one list item, either ordered or unordered.
234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 |
# File 'lib/maruku/input/parse_block.rb', line 234
#
# Reads one list item, either ordered or unordered.
#
# The indentation of the item body is taken from the first line via
# +spaces_before_first_char+. Continuation lines are gathered with
# +read_indented_content+, and the whole item is parsed as nested blocks.
#
# @param src the line source, positioned on the item's first line
# @return the element built by +md_li+; its second argument is truthy when
#   the content asked for a paragraph or the item has more than one child
def read_list_item(src)
  parent_offset = src.cur_index
  item_type = src.cur_line.md_type
  first_line = src.shift_line

  # Ugly things going on inside `read_indented_content`
  indent = spaces_before_first_char(first_line)
  body, want_my_paragraph =
    read_indented_content(src, indent, [:ulist, :olist, :ial], item_type)

  # add first line, with the list marker and its indentation removed
  body.unshift(first_line[indent, first_line.size-1])

  #dbg_describe_ary(body, 'LIST ITEM ')
  children = parse_blocks(LineSource.new(body, src, parent_offset))
  md_li(children, want_my_paragraph || (children.size>1))
end
#read_metadata(src) ⇒ Object
Reads a series of metadata lines with empty lines in between
409 410 411 412 413 414 415 416 417 418 419 |
# File 'lib/maruku/input/parse_block.rb', line 409
#
# Reads a series of metadata lines with empty lines in between, stopping at
# the first line of any other type (which is left in +src+).
#
# NOTE(review): this listing had lost the method name in the +def+ line
# (restored from the section heading). It also appears to have lost the
# name of a helper wrapped around <tt>src.shift_line</tt> in the +merge!+
# call. That helper cannot be identified from this page, so the call is
# kept exactly as shown; as written it merges the raw line. Confirm against
# lib/maruku/input/parse_block.rb.
#
# @param src the line source
# @return [Hash] the accumulated metadata
def read_metadata(src)
  hash = {}
  while src.cur_line
    case src.cur_line.md_type
    when :empty; src.shift_line
    when :metadata; hash.merge!(src.shift_line)
    else break
    end
  end
  hash
end
#read_paragraph(src) ⇒ Object
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 |
# File 'lib/maruku/input/parse_block.rb', line 208
#
# Reads a paragraph: consumes lines until one of the stop conditions below
# holds, then parses the collected lines as span-level content.
#
# @param src the line source, positioned on the paragraph's first line
# @return the element built by +md_par+
def read_paragraph(src)
  collected = []
  while (current = src.cur_line)
    kind = current.md_type

    # These line types always end the paragraph.
    break if [:quote, :header3, :empty, :raw_html, :ref_definition, :ial].include?(kind)

    # A list line ends it only when the next line is a list line of the
    # same kind; a lone one is kept as paragraph text.
    break if [:olist, :ulist].include?(kind) && src.next_line.md_type == kind

    break if current.strip.size == 0

    # The next line is a header underline, so this line is the header text.
    break if [:header1, :header2].include?(src.next_line.md_type)

    collected << src.shift_line
  end
  # dbg_describe_ary(collected, 'PAR')
  md_par(parse_lines_as_span(collected))
end
#read_quote(src) ⇒ Object
368 369 370 371 372 373 374 375 376 377 378 379 380 381 |
# File 'lib/maruku/input/parse_block.rb', line 368
#
# Reads a block quote: consumes consecutive +:quote+ lines, passes each
# through +unquote+, and parses the result as nested blocks.
#
# @param src the line source, positioned on the first quoted line
# @return the element built by +md_quote+
def read_quote(src)
  parent_offset = src.cur_index

  # collect all quoted lines
  unquoted = []
  unquoted << unquote(src.shift_line) while src.cur_line && src.cur_line.md_type == :quote

  # dbg_describe_ary(unquoted, 'QUOTE')
  nested = LineSource.new(unquoted, src, parent_offset)
  md_quote(parse_blocks(nested))
end
#read_raw_html(src) ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
# File 'lib/maruku/input/parse_block.rb', line 190
#
# Reads a block of raw HTML: feeds lines from +src+ to an HTMLHelper until
# it reports that it is finished or the source runs out.
#
# Any exception raised while feeding is reported through +maruku_error+
# together with its backtrace; whatever the helper has read so far is still
# returned.
#
# @param src the line source, positioned on the first HTML line
# @return the element built by +md_html+
def read_raw_html(src)
  helper = HTMLHelper.new
  begin
    helper.eat_this(src.shift_line)
    # puts "\nBLOCK:\nhtml -> ..."
    until src.cur_line.nil? || src.cur_line == false || helper.is_finished?
      line = src.shift_line
      # puts "html -> #{line.inspect}"
      helper.eat_this("\n" + line)
    end
  rescue Exception => e
    ex = e.inspect + e.backtrace.join("\n")
    maruku_error "Bad block-level HTML:\n#{add_tabs(ex,1,'|')}\n", src
  end

  md_html(helper.stuff_you_read)
end
#read_ref_definition(src) ⇒ Object
422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 |
# File 'lib/maruku/input/parse_block.rb', line 422
#
# Reads a link reference definition and registers it in <tt>self.refs</tt>
# under the stripped, downcased id.
#
# If the following line is not itself a reference definition and is
# indented by 1 to 3 spaces, it is treated as a continuation and appended.
# Trailing <tt>key=value</tt> pairs (capture group 4 of +LinkRegex+) are
# stored in the same hash as +:url+ and +:title+. A value that starts with
# a double quote has its first and last character removed.
#
# NOTE(review): the last argument of +md_ref_def+ was garbled in this
# listing (<tt>={:title=>title}</tt>); it is restored as a plain hash.
# NOTE(review): when +LinkRegex+ does not match, +error+ is called and the
# code then goes on to index the nil match. Unless +error+ raises, this
# fails with NoMethodError -- consider reporting and returning early once
# the callers can cope with a nil result.
#
# @param src the line source, positioned on the definition line
# @return the element built by +md_ref_def+
def read_ref_definition(src)
  line = src.shift_line

  # if link is incomplete, shift next line
  if src.cur_line && (src.cur_line.md_type != :ref_definition) &&
     ([1,2,3].include? number_of_leading_spaces(src.cur_line) )
    line += " "+ src.shift_line
  end

  # puts "total= #{line}"

  match = LinkRegex.match(line)
  if not match
    error "Link does not respect format: '#{line}'"
  end

  id = match[1]; url = match[2]; title = match[3];
  id = id.strip.downcase

  hash = self.refs[id] = {:url=>url,:title=>title}

  stuff=match[4]

  if stuff
    stuff.split.each do |couple|
      # puts "found #{couple}"
      k, v = couple.split('=')
      v ||= ""
      if v[0,1]=='"' then v = v[1, v.size-2] end
      # puts "key:_#{k}_ value=_#{v}_"
      hash[k.to_sym] = v
    end
  end
  # puts hash.inspect

  return md_ref_def(id, url, {:title=>title})
end
#read_table(src) ⇒ Object
460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 |
# File 'lib/maruku/input/parse_block.rb', line 460
#
# Reads a table: a header line, a separator line that fixes the number of
# columns and their alignment, then every following line containing a pipe.
#
# Alignment per column comes from matching the separator cell against
# +Sep+: both capture groups set gives +:center+, only the second gives
# +:right+, anything else +:left+.
#
# If the header or any row does not have exactly as many cells as the
# separator line, the problem is reported through +maruku_error+ and the
# table is replaced by the result of +md_br+.
#
# Fix: the row check used to compare <tt>head.size</tt> (already known to
# be correct at that point) instead of <tt>row.size</tt>, so malformed rows
# were never detected.
#
# @param src the line source, positioned on the table header line
# @return a +:table+ element whose children are the header cells followed
#   by all row cells, with the alignments under +:align+; or the result of
#   +md_br+ on a column-count mismatch
def read_table(src)

  # Splits a table line on pipes, dropping blank cells and trimming the rest.
  def split_cells(s)
    s.strip.split('|').select{|x|x.strip.size>0}.map{|x|x.strip}
  end

  head = split_cells(src.shift_line).map{|s| md_el(:head_cell, parse_lines_as_span([s])) }

  separator=split_cells(src.shift_line)

  align = separator.map { |s| s =~ Sep
    if $1 and $2 then :center elsif $2 then :right else :left end }

  num_columns = align.size

  if head.size != num_columns
    maruku_error "Table head does not have #{num_columns} columns: \n#{head.inspect}"
    tell_user "I will ignore this table."
    # XXX try to recover
    return md_br()
  end

  rows = []

  while src.cur_line && src.cur_line =~ /\|/
    row = split_cells(src.shift_line).map{|s|
      md_el(:cell, parse_lines_as_span([s]))}
    # Validate the row that was just read, not the header.
    if row.size != num_columns
      maruku_error "Row does not have #{num_columns} columns: \n#{row.inspect}"
      tell_user "I will ignore this table."
      # XXX try to recover
      return md_br()
    end
    rows << row
  end

  children = (head+rows).flatten
  return md_el(:table, children, {:align => align})
end
#search_abbreviations ⇒ Object
114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/maruku/input/parse_doc.rb', line 114
#
# For every registered abbreviation, walks the document's strings with
# +replace_each_string+ and splits each string at the first literal
# occurrence of the abbreviation into: text before, an element built by
# +md_abbr+, and text after. Strings without an occurrence are returned
# unchanged.
def search_abbreviations
  self.abbreviations.each do |abbrev, title|
    # Escaped so the abbreviation is matched literally.
    pattern = Regexp.new(Regexp.escape(abbrev))
    self.replace_each_string do |str|
      found = pattern.match(str)
      next str unless found

      abbr_el = md_abbr(abbrev.dup, title ? title.dup : nil)
      [found.pre_match, abbr_el, found.post_match]
    end
  end
end
#split_cells(s) ⇒ Object
462 463 464 |
# File 'lib/maruku/input/parse_block.rb', line 462
#
# Splits one table line into its cells.
#
# The line is split on pipes; each cell is trimmed and cells that are empty
# after trimming are dropped.
#
# @param s [String] a table line
# @return [Array<String>] the non-blank, trimmed cells
def split_cells(s)
  trimmed = s.strip.split('|').map { |cell| cell.strip }
  trimmed.reject { |cell| cell.size == 0 }
end
#substitute_markdown_inside_raw_html ⇒ Object
(PHP Markdown extra) Search for elements that have markdown=1 or markdown=block defined
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
# File 'lib/maruku/input/parse_doc.rb', line 130
#
# (PHP Markdown extra) Searches the parsed HTML of every +:raw_html+
# element for nodes that carry a +markdown+ attribute and replaces their
# text children with rendered markdown.
#
# A node is parsed at block level when its +markdown+ attribute equals
# 'block' or its tag name is in +block_tags+; otherwise at span level. The
# +markdown+ attribute is removed from the node either way.
#
# NOTE(review): the name of the local holding the tag list was missing from
# this listing; it is restored as +block_tags+ -- confirm against
# lib/maruku/input/parse_doc.rb. The inner block parameter and the
# block-level flag were renamed so they no longer shadow the outer +e+ and
# the +parse_blocks+ method.
def substitute_markdown_inside_raw_html
  self.each_element(:raw_html) do |e|
    doc = e.instance_variable_get :@parsed_html
    if doc # valid html
      # parse block-level markdown elements in these HTML tags
      block_tags = ['div']
      # use xpath to find elements with 'markdown' attribute
      doc.elements.to_a( "//*[attribute::markdown]" ).each do |node|
        # should we parse block-level or span-level?
        block_level = (node.attributes['markdown'] == 'block') ||
          block_tags.include?(node.name)
        # remove 'markdown' attribute
        node.delete_attribute 'markdown'
        # Select all text elements of node
        node.texts.each do |original_text|
          # puts "block_level = #{block_level} found = #{original_text} "
          s = original_text.to_s.strip # XXX
          el = md_el(:dummy,
            block_level ? parse_text_as_markdown(s)
                        : parse_lines_as_span([s]) )
          # Insert the rendered pieces before the text node, then drop it.
          el.children_to_html.each do |x|
            node.insert_before(original_text, x)
          end
          node.delete(original_text)
        end
      end
    end
  end
end