Module: MaRuKu::In::Markdown::BlockLevelParser

Includes:: Helpers, SpanLevelParser, Strings

Included in:: MDDocument

Defined in:: lib/maruku.rb,
lib/maruku/input/parse_doc.rb,
lib/maruku/input/linesource.rb,
lib/maruku/input/parse_block.rb

Defined Under Namespace

Constant Summary

Constants included from SpanLevelParser

SpanLevelParser::CharSource, SpanLevelParser::EscapedCharInInlineCode, SpanLevelParser::EscapedCharInQuotes, SpanLevelParser::EscapedCharInText, SpanLevelParser::R_REF_ID, SpanLevelParser::SPACE

Constants included from Strings

Strings::Abbreviation, Strings::AttributeDefinitionList, Strings::Definition, Strings::EMailAddress, Strings::FootnoteText, Strings::HeaderWithAttributes, Strings::HeaderWithId, Strings::IncompleteLink, Strings::LinkRegex, Strings::MightBeTableHeader, Strings::Sep, Strings::TabSize, Strings::TableSeparator, Strings::URL

Instance Method Summary collapse

#eventually_comes_a_def_list(src) ⇒ Object

If the current line is text, a definition list is coming when the upcoming lines are: one or more text lines, an optional empty line, then a definition.
#expand_attribute_list(al, result) ⇒ Object

Expands an attribute list into a Hash.
#parse_blocks(src) ⇒ Object
#parse_doc(s) ⇒ Object
#parse_text_as_markdown(text) ⇒ Object

Splits the text into lines and calls parse_blocks.
#read_abbreviation(src) ⇒ Object
#read_ald(src) ⇒ Object
#read_code(src) ⇒ Object
#read_definition(src) ⇒ Object
#read_footnote_text(src) ⇒ Object
#read_header12(src) ⇒ Object

Reads a header underlined with ----- or =====.
#read_header3(src) ⇒ Object

Reads a header like ‘#### header ####’.
#read_indented_content(src, indentation, break_list, item_type) ⇒ Object

This is the only ugly function in the code base.
#read_list_item(src) ⇒ Object

Reads one list item, either ordered or unordered.
#read_metadata(src) ⇒ Object

Reads a series of metadata lines with empty lines in between.
#read_paragraph(src) ⇒ Object
#read_quote(src) ⇒ Object
#read_raw_html(src) ⇒ Object
#read_ref_definition(src) ⇒ Object
#read_table(src) ⇒ Object
#search_abbreviations ⇒ Object
#split_cells(s) ⇒ Object
#substitute_markdown_inside_raw_html ⇒ Object

(PHP Markdown extra) Search for elements that have markdown=1 or markdown=block defined.

Instance Method Details

#eventually_comes_a_def_list(src) ⇒ `Object`

If the current line is text, a definition list is coming when the upcoming lines are: one or more text lines, an optional empty line, then a definition.

# File 'lib/maruku/input/parse_block.rb', line 503

# Tells whether a definition list starts at the current line.
#
# The string returned by src.tell_me_the_future is matched against
# ^t+e?d. Presumably each character encodes the type of one upcoming
# line (t = text, e = empty, d = definition) -- confirm against
# LineSource#tell_me_the_future.
#
# Returns the match offset (truthy) on a match, nil otherwise.
def eventually_comes_a_def_list(src)
	src.tell_me_the_future =~ %r{^t+e?d}x
end

#expand_attribute_list(al, result) ⇒ `Object`

Expands an attribute list into a Hash

# File 'lib/maruku/input/parse_doc.rb', line 79

# Expands the attribute list +al+ (an enumerable of [key, value] pairs)
# into the Hash +result+.
#
# * :class values are accumulated, separated by a single space.
# * :id overwrites any previous id.
# * :ref values that name an entry of self.ald are expanded recursively;
#   the names already expanded are remembered in
#   result[:expanded_references] and a repeated name is reported as a
#   circular reference.
# * :ref values that are not in self.ald are appended to
#   result[:unresolved_references] (space separated) and are also set as
#   a boolean attribute (result[name.to_sym] = true).
# * any other key is stored under its symbol form.
#
# The strings held by +al+ are never modified.
def expand_attribute_list(al, result)
	al.each do |k, v|
		case k
		when :class
			if result[:class]
				result[:class] += " " + v
			else
				result[:class] = v
			end
		when :id
			result[:id] = v
		when :ref
			if self.ald[v]
				already = (result[:expanded_references] ||= [])
				if already.include?(v)
					maruku_error "Circular reference: #{v} already seen\n"+
						already.inspect
				else
					already.push v
					expand_attribute_list(self.ald[v], result)
				end
			else
				if result[:unresolved_references]
					# Build a new string: the stored value may be the very
					# object held by the attribute list, and appending in
					# place would corrupt the caller's data.
					result[:unresolved_references] += " #{v}"
				else
					result[:unresolved_references] = v
				end

				result[v.to_sym] = true
			end
		else
			result[k.to_sym] = v
		end
	end
end

#parse_blocks(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 35

# Block-level state machine: consumes lines from +src+ until it is
# exhausted, dispatching on the md_type of the current line, and returns
# the array of elements that were produced.
#
# After the main loop a second pass looks at every list and definition
# list in the output: when none of its items reports want_my_paragraph,
# the children of each item (or of each definition) are replaced by the
# children of its first child, and list items become :li_span.
def parse_blocks(src)
	output = [];
	
	# run state machine
	while src.cur_line
#  Prints detected type (useful for debugging)
		#puts "#{src.cur_line.md_type}|#{src.cur_line}"
		case src.cur_line.md_type
			when :empty; 
				src.ignore_line
			when :ial
				# the text between the braces is parsed as an attribute list
				# and attached to the element produced just before it
				src.shift_line =~ /\s*\{([^\}]*)\}\s*/ 
				al = $1
				al = read_attribute_list(CharSource.new(al), context=nil, break_on=[nil])
				if not output.empty? 
					output.last.al = al
				else
					maruku_error "An attribute list at beginning of context {#{al.to_md}}"
					tell_user "I will ignore this AL: {#{al.to_md}}"
				end
			when :ald
				output << read_ald(src)
			when :text
				# a text line can open a table, an underlined header, a
				# definition (appended to the preceding definition list, if
				# any) or a plain paragraph; the checks run in this order
				if src.cur_line =~ MightBeTableHeader and 
					(src.next_line && src.next_line =~ TableSeparator)
					output << read_table(src)
				elsif [:header1,:header2].include? src.next_line.md_type
					output << read_header12(src)
				elsif eventually_comes_a_def_list(src)
				 	definition = read_definition(src)
					if output.last && output.last.node_type == :definition_list
						output.last.children << definition
					else
						output << md_el(:definition_list, [definition])
					end
				else # Start of a paragraph
					output << read_paragraph(src)
				end
			when :header2, :hrule
				# hrule: both line types are emitted as a horizontal rule here
				src.shift_line
				output << md_hrule()
			when :header3
				output << read_header3(src)
			when :ulist, :olist
				list_type = src.cur_line.md_type == :ulist ? :ul : :ol
				li = read_list_item(src)
				# append to current list if we have one
				if output.last && output.last.node_type == list_type
					output.last.children << li
				else
					output << md_el(list_type, [li])
				end
			when :quote;    output << read_quote(src)
			# read_code and read_raw_html results are only appended when non-nil
			when :code;     e = read_code(src); output << e if e
			when :raw_html; e = read_raw_html(src); output << e if e

			when :footnote_text;   output << read_footnote_text(src)
			when :ref_definition;  output << read_ref_definition(src)
			when :abbreviation;    output << read_abbreviation(src)

#				# these do not produce output
			when :metadata;        
				maruku_error "Please use the new meta-data syntax: \n"+
				"  http://maruku.rubyforge.org/proposal.html\n", src
				src.ignore_line
			# warn if we forgot something
			else
				md_type = src.cur_line.md_type
				line = src.cur_line
				maruku_error "Ignoring line '#{line}' type = #{md_type}", src
				src.shift_line
		end

# FIXME			
#			if current_metadata and output.last
#				output.last.meta.merge! current_metadata
#				current_metadata = nil
#				puts "meta for #{output.last.node_type}\n #{output.last.meta.inspect}"
#			end
#			current_metadata = just_read_metadata
#			just_read_metadata = nil
	end
	
	# See for each list if we can omit the paragraphs and use li_span
	# TODO: do this after
	output.each do |c| 
		# Remove paragraphs that we can get rid of
		if [:ul,:ol].include? c.node_type 
			if c.children.all? {|li| !li.want_my_paragraph} then
				c.children.each do |d|
					d.node_type = :li_span
					# replace the item's children with those of its first child
					d.children = d.children[0].children 
				end
			end
		end 
		if c.node_type == :definition_list
			if c.children.all?{|defi| !defi.want_my_paragraph} then
				c.children.each do |definition| 
					definition.definitions.each do |dd|
						# same replacement, applied to every definition body
						dd.children = dd.children[0].children 
					end
				end
			end
		end 
	end
	
	output
end

#parse_doc(s) ⇒ `Object`

# File 'lib/maruku/input/parse_doc.rb', line 26

# Parses the whole document source +s+ and fills in this document.
#
# Steps, in order:
# 1. parse_email_headers splits +s+ into header attributes and the body
#    (returned under :data); the attributes are merged into
#    self.attributes.
# 2. If an :encoding attribute other than utf-8 was given, the body is
#    transcoded to UTF-8. The attribute itself is always removed.
# 3. The body is parsed into @children; abbreviations are looked up and
#    markdown inside raw HTML is substituted.
# 4. The table of contents is created and stored in self.toc; when no
#    :title attribute is set, the TOC's header element (if any) provides it.
# 5. For every element, the default attribute list registered in self.ald
#    under the element's node type (if any) and then the element's own
#    attribute list are expanded into its attributes.
#
# Returns whatever each_element returns.
def parse_doc(s)
	meta2 = parse_email_headers(s)
	data = meta2[:data]
	meta2.delete :data

	self.attributes.merge! meta2

	enc = self.attributes[:encoding]
	self.attributes.delete :encoding
	if enc && enc.downcase != 'utf-8'
		# Iconv is not available on modern Rubies (removed from the
		# standard library in 2.0); prefer String#encode and keep Iconv
		# only as a fallback for interpreters without it.
		data = if data.respond_to?(:encode)
			data.encode('utf-8', enc)
		else
			Iconv.new('utf-8', enc).iconv(data)
		end
	end

	@children = parse_text_as_markdown(data)

	# PHP Markdown Extra features (always enabled)
	self.search_abbreviations
	self.substitute_markdown_inside_raw_html

	toc = create_toc

	# use the first header as title if none was set
	if not self.attributes[:title] and toc.header_element
		self.attributes[:title] = toc.header_element.to_s
	end

	# save for later use
	self.toc = toc

	# Now do the attributes magic
	each_element do |e|
		# default attribute list for this node type first, so that the
		# element's own list is applied after it
		default = self.ald[e.node_type.to_s]
		expand_attribute_list(default, e.attributes) if default
		expand_attribute_list(e.al, e.attributes)
	end
end

#parse_text_as_markdown(text) ⇒ `Object`

Splits the text into lines and calls parse_blocks

# File 'lib/maruku/input/parse_block.rb', line 29

# Splits +text+ into lines (split_lines), wraps them in a LineSource and
# returns the result of running parse_blocks over it.
def parse_text_as_markdown(text)
	parse_blocks(LineSource.new(split_lines(text)))
end

#read_abbreviation(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 260

# Consumes one abbreviation-definition line from +src+.
#
# The line is matched against Abbreviation; capture 1 is the abbreviation
# and capture 2 its description. A valid abbreviation is recorded in
# self.abbreviations. When the line does not match, or the abbreviation is
# missing/empty, an error is reported through maruku_error and nothing is
# recorded, so that no nil or empty key ends up in the table that
# search_abbreviations later turns into regular expressions.
#
# Returns md_abbr_def(abbr, desc) in every case.
def read_abbreviation(src)
	l = src.shift_line
	unless l =~ Abbreviation
		maruku_error "Bug: it's Andrea's fault. Tell him.\n#{l.inspect}"
	end

	# nil when the match above failed
	abbr = $1
	desc = $2

	if abbr.nil? || abbr.empty?
		maruku_error "Bad abbrev. abbr=#{abbr.inspect} desc=#{desc.inspect}"
	else
		self.abbreviations[abbr] = desc
	end

	return md_abbr_def(abbr, desc)
end

#read_ald(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 146

# Consumes one attribute-definition-list line from +src+.
#
# On a match against AttributeDefinitionList, capture 1 is the id and
# capture 2 the raw attribute text; the parsed list is stored in
# self.ald[id] and md_ald(id, list) is returned. Otherwise an error is
# reported and nil is returned.
def read_ald(src)
	line = src.shift_line
	unless line =~ AttributeDefinitionList
		maruku_error "Bug Bug:\n#{line.inspect}"
		return nil
	end

	id, raw = $1, $2
	parsed = read_attribute_list(CharSource.new(raw), nil, [nil])
	self.ald[id] = parsed
	md_ald(id, parsed)
end

#read_code(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 383

# Reads an indented code block: consumes consecutive :code and :empty
# lines, passing each through strip_indent(line, 4), then drops blank
# lines at both ends.
#
# Returns md_codeblock(source) with the lines joined by "\n", or nil when
# nothing but blank lines was collected.
def read_code(src)
	collected = []
	while (line = src.cur_line) && [:code, :empty].include?(line.md_type)
		collected << strip_indent(src.shift_line, 4)
	end

	# trim blank lines at the end, then at the beginning
	collected.pop   while collected.last  && collected.last.strip.empty?
	collected.shift while collected.first && collected.first.strip.empty?

	return nil if collected.empty?

	md_codeblock(collected.join("\n"))
end

#read_definition(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 511

# Reads one definition: one or more term lines (:text), an optional empty
# line, then one or more :definition bodies.
#
# Each body is the first capture of Definition on its opening line plus
# the lines gathered by read_indented_content; it is parsed recursively
# with parse_blocks through a nested LineSource.
#
# Raises "Chunky Bacon!" when the input ends after the terms or when no
# :definition line follows them.
#
# Returns a :definition element whose children are terms + definitions,
# with :terms, :definitions and :want_my_paragraph as extra attributes.
def read_definition(src)
	# Read one or more terms
	terms = []
	while (line = src.cur_line) && line.md_type == :text
		terms << md_el(:definition_term, parse_lines_as_span([src.shift_line]))
	end

	raise "Chunky Bacon!" unless src.cur_line

	# one optional empty line; its presence asks for paragraphs
	wants_paragraph = false
	if src.cur_line.md_type == :empty
		wants_paragraph = true
		src.shift_line
	end

	raise "Chunky Bacon!" unless src.cur_line.md_type == :definition

	# Read one or more definitions
	definitions = []
	while (line = src.cur_line) && line.md_type == :definition
		start_index = src.cur_index

		# text of the opening line, without the definition marker
		src.shift_line =~ Definition
		opening = $1

		body, hint = read_indented_content(src, 4, [:definition], :definition)
		wants_paragraph ||= hint
		body.unshift opening

		nested = LineSource.new(body, src, start_index)
		definitions << md_el(:definition_data, parse_blocks(nested))
	end

	md_el(:definition, terms + definitions, {
		:terms => terms,
		:definitions => definitions,
		:want_my_paragraph => wants_paragraph})
end

#read_footnote_text(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 277

# Reads the text of one footnote.
#
# The opening line is matched against FootnoteText (capture 1: id,
# capture 2: text on the same line); a failed match is reported through
# maruku_error. The following lines are gathered by read_indented_content
# with a fixed indentation of 4, the same-line text is prepended when it
# is not blank, and the result is parsed recursively with parse_blocks.
#
# The footnote element is stored in self.footnotes[id] and returned.
def read_footnote_text(src)
	start_index = src.cur_index
	opening = src.shift_line

	maruku_error "Bug (it's Andrea's fault)" unless opening =~ FootnoteText
	id, text = $1, $2

	# the paragraph hint returned alongside the lines is not used here
	body, _paragraph_hint =
		read_indented_content(src, 4, [:footnote_text], :footnote_text)

	# add first line
	body.unshift text if text && text.strip != ""

	nested = LineSource.new(body, src, start_index)
	note = md_footnote(id, parse_blocks(nested))
	self.footnotes[id] = note
	note
end

#read_header12(src) ⇒ `Object`

Reads a header underlined with ----- or =====

# File 'lib/maruku/input/parse_block.rb', line 159

# Reads a two-line header: a title line followed by an underline line.
#
# When new_meta_data? is true and the title ends with "{...}", the text in
# braces is parsed as an inline attribute list and removed from the title.
# The level is 2 when the underline line is of type :header2, 1 otherwise.
#
# Both lines are consumed; returns md_header(level, spans, attribute_list).
def read_header12(src)
	title = src.shift_line.strip
	attrs = nil
	# Check if there is an IAL
	if new_meta_data? && title =~ /^(.*)\{(.*)\}\s*$/
		raw_ial = $2
		title = $1.strip
		attrs = read_attribute_list(CharSource.new(raw_ial), nil, [nil])
	end
	spans = parse_lines_as_span([title])
	level = (src.cur_line.md_type == :header2) ? 2 : 1
	src.shift_line
	md_header(level, spans, attrs)
end

#read_header3(src) ⇒ `Object`

Reads a header like ‘#### header ####’

# File 'lib/maruku/input/parse_block.rb', line 175

# Reads a one-line header written with leading hashes.
#
# When new_meta_data? is true and the line ends with "{...}", the text in
# braces is parsed as an inline attribute list and removed from the line.
# The level comes from num_leading_hashes and the title from strip_hashes.
#
# Returns md_header(level, spans, attribute_list).
def read_header3(src)
	heading = src.shift_line.strip
	attrs = nil
	# Check if there is an IAL
	if new_meta_data? && heading =~ /^(.*)\{(.*)\}\s*$/
		raw_ial = $2
		heading = $1.strip
		attrs = read_attribute_list(CharSource.new(raw_ial), nil, [nil])
	end
	level = num_leading_hashes(heading)
	md_header(level, parse_lines_as_span([strip_hashes(heading)]), attrs)
end

#read_indented_content(src, indentation, break_list, item_type) ⇒ `Object`

This is the only ugly function in the code base. It is used to read list items, definitions, and footnote text.

# File 'lib/maruku/input/parse_block.rb', line 314

# Collects the lines that make up the body of an indented item. In this
# file it is called for list items, definitions and footnote text.
#
# src::         line source; accepted lines are consumed with shift_line
# indentation:: amount passed to strip_indent for every accepted line, and
#               the minimum number of leading spaces required of a line
#               that comes after an empty line
# break_list::  md_types that end the item when they are met before any
#               empty line has been seen
# item_type::   md_type of a sibling item, used only to compute the
#               paragraph flag
#
# Returns [lines, want_my_paragraph]. Trailing empty lines are removed
# from +lines+. want_my_paragraph is true when a line was accepted after
# an empty line, or when an empty line was seen and the line the reader
# stopped at is of type +item_type+.
def read_indented_content(src, indentation, break_list, item_type)
	lines =[]
	# collect all indented lines
	saw_empty = false; saw_anything_after = false
	while src.cur_line 
		#puts "#{src.cur_line.md_type} #{src.cur_line.inspect}"
		# empty lines are always accepted; they are trimmed at the end if
		# nothing follows them
		if src.cur_line.md_type == :empty
			saw_empty = true
			lines << src.shift_line
			next
		end
	
		# after a white line
		if saw_empty
			# we expect things to be properly aligned
			if (ns=number_of_leading_spaces(src.cur_line)) < indentation
				#puts "breaking for spaces, only #{ns}: #{src.cur_line}"
				break
			end
			saw_anything_after = true
		else
			# no empty line yet: only the types in break_list end the item
			break if break_list.include? src.cur_line.md_type
#				break if src.cur_line.md_type != :text
		end
	

		stripped = strip_indent(src.shift_line, indentation)
		lines << stripped

		#puts "Accepted as #{stripped.inspect}"
	
		# You are only required to indent the first line of 
		# a child paragraph.
		if stripped.md_type == :text
			while src.cur_line && (src.cur_line.md_type == :text)
				lines << strip_indent(src.shift_line, indentation)
			end
		end
	end

	want_my_paragraph = saw_anything_after || 
		(saw_empty && (src.cur_line  && (src.cur_line.md_type == item_type))) 

#		dbg_describe_ary(lines, 'LI')
	# create a new context 

	# drop the empty lines that were accepted at the end of the item
	while lines.last && (lines.last.md_type == :empty)
		lines.pop
	end
	
	return lines, want_my_paragraph
end

#read_list_item(src) ⇒ `Object`

Reads one list item, either ordered or unordered.

# File 'lib/maruku/input/parse_block.rb', line 234

# Reads one list item, either ordered or unordered.
#
# The indentation of the item body is spaces_before_first_char of the
# opening line; the rest of the item is gathered by read_indented_content
# (which stops at :ulist, :olist and :ial lines). The opening line, minus
# its first +indentation+ characters, is prepended and the whole body is
# parsed recursively with parse_blocks.
#
# Returns md_li(children, with_paragraph), where with_paragraph is true
# when read_indented_content asked for it or the item has more than one
# child.
def read_list_item(src)
	start_index = src.cur_index

	marker_type = src.cur_line.md_type
	opening = src.shift_line

	# Ugly things going on inside `read_indented_content`
	indent = spaces_before_first_char(opening)
	body, paragraph_hint =
		read_indented_content(src, indent, [:ulist, :olist, :ial], marker_type)

	# add the first line, without its leading marker columns
	body.unshift opening[indent, opening.size-1]

	children = parse_blocks(LineSource.new(body, src, start_index))
	md_li(children, paragraph_hint || (children.size>1))
end

#read_metadata(src) ⇒ `Object`

Reads a series of metadata lines with empty lines in between

# File 'lib/maruku/input/parse_block.rb', line 409

# Reads a run of :metadata lines, skipping the empty lines between them,
# and merges the result of parse_metadata for each one into a single Hash.
# Stops, without consuming it, at the first line of any other type.
def read_metadata(src)
	collected = {}
	while (line = src.cur_line)
		kind = line.md_type
		if kind == :empty
			src.shift_line
		elsif kind == :metadata
			collected.merge! parse_metadata(src.shift_line)
		else
			break
		end
	end
	collected
end

#read_paragraph(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 208

# Reads the lines of one paragraph and returns md_par with their spans.
#
# Reading stops, without consuming the current line, when that line
# * is a :quote, :header3, :empty, :raw_html, :ref_definition or :ial,
# * is a list line (:olist/:ulist) followed by a line of the same type,
# * is blank once stripped, or
# * is followed by a :header1/:header2 underline (the line then belongs
#   to a header, not to this paragraph).
def read_paragraph(src)
	stoppers = [:quote, :header3, :empty, :raw_html, :ref_definition, :ial]
	list_kinds = [:olist, :ulist]

	lines = []
	while (current = src.cur_line)
		kind = current.md_type
		break if stoppers.include?(kind)
		# a single list-like line does not break the paragraph; two in a
		# row do
		break if list_kinds.include?(kind) && src.next_line.md_type == kind
		break if current.strip.size == 0
		break if [:header1, :header2].include?(src.next_line.md_type)

		lines << src.shift_line
	end

	md_par(parse_lines_as_span(lines))
end

#read_quote(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 368

# Reads a block quote: consumes consecutive :quote lines, passes each
# through unquote, and parses the result recursively with parse_blocks
# through a nested LineSource anchored at the quote's starting index.
#
# Returns md_quote(children).
def read_quote(src)
	start_index = src.cur_index

	quoted = []
	quoted << unquote(src.shift_line) while src.cur_line && src.cur_line.md_type == :quote

	md_quote(parse_blocks(LineSource.new(quoted, src, start_index)))
end

#read_raw_html(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 190

# Reads a block of raw HTML.
#
# Lines are fed to an HTMLHelper (joined with "\n") until it reports
# is_finished? or the input ends. Any exception raised while feeding is
# reported through maruku_error together with its backtrace, and whatever
# the helper has read so far is still used.
#
# NOTE(review): the rescue deliberately stays as broad as the original
# (Exception); confirm which exception classes HTMLHelper raises before
# narrowing it.
#
# Returns md_html(raw_html).
def read_raw_html(src)
	helper = HTMLHelper.new
	begin
		helper.eat_this(src.shift_line)
		while src.cur_line && !helper.is_finished?
			helper.eat_this "\n"+src.shift_line
		end
	rescue Exception => e
		details = e.inspect + e.backtrace.join("\n")
		maruku_error "Bad block-level HTML:\n#{add_tabs(details,1,'|')}\n", src
	end
	md_html(helper.stuff_you_read)
end

#read_ref_definition(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 422

# Reads a link reference definition.
#
# When the next line is not itself a reference definition and is indented
# by 1 to 3 spaces, it is treated as the continuation of an incomplete
# definition and joined to the first line with a space.
#
# The joined line is matched against LinkRegex: capture 1 is the id
# (stripped and downcased), 2 the URL, 3 the title and 4 an optional run
# of extra key=value attributes. The reference is recorded in
# self.refs[id] as {:url, :title} plus one entry per extra attribute.
# Attribute values may themselves contain '='; a value starting with a
# double quote has its first and last characters removed.
#
# A line that does not match is reported through +error+.
#
# Returns md_ref_def(id, url, {:title => title}).
def read_ref_definition(src)
	line = src.shift_line

	# if link is incomplete, shift next line
	if src.cur_line && (src.cur_line.md_type != :ref_definition) &&
		([1,2,3].include? number_of_leading_spaces(src.cur_line) )
		line += " "+ src.shift_line
	end

	match = LinkRegex.match(line)
	if not match
		error "Link does not respect format: '#{line}'"
	end

	id = match[1]; url = match[2]; title = match[3];
	id = id.strip.downcase

	hash = self.refs[id] = {:url=>url,:title=>title}

	stuff = match[4]

	if stuff
		stuff.split.each do |couple|
			# split on the first '=' only, so that values such as
			# "a=b" are not truncated
			k, v = couple.split('=', 2)
			v ||= ""
			if v[0,1]=='"' then v = v[1, v.size-2] end
			hash[k.to_sym] = v
		end
	end

	return md_ref_def(id, url, {:title=>title})
end

#read_table(src) ⇒ `Object`

# File 'lib/maruku/input/parse_block.rb', line 460

# Reads a table: a header row, a separator row, then every following line
# that contains '|' as a body row.
#
# The alignment of each column is derived from its separator cell through
# Sep: both captures set means :center, only the second means :right,
# anything else :left.
#
# Returns a :table element whose children are the head cells followed by
# the body cells (flattened), with the alignments stored under :align.
# When the header, or any body row, does not have as many cells as the
# separator row, an error is reported and md_br() is returned instead.
def read_table(src)

	# NOTE: defining split_cells here means it is (re)defined every time
	# read_table runs; kept because this nested def is what makes
	# split_cells available as a method.
	def split_cells(s)
		s.strip.split('|').select{|x|x.strip.size>0}.map{|x|x.strip}
	end

	head = split_cells(src.shift_line).map{|s| md_el(:head_cell, parse_lines_as_span([s])) }

	separator = split_cells(src.shift_line)

	align = separator.map do |s|
		s =~ Sep
		if $1 and $2 then :center elsif $2 then :right else :left end
	end

	num_columns = align.size

	if head.size != num_columns
		maruku_error "Table head does not have #{num_columns} columns: \n#{head.inspect}"
		tell_user "I will ignore this table."
		# XXX try to recover
		return md_br()
	end

	rows = []

	while src.cur_line && src.cur_line =~ /\|/
		row = split_cells(src.shift_line).map{|s|
			md_el(:cell, parse_lines_as_span([s]))}
		# check the row that was just read (the head was validated above)
		if row.size != num_columns
			maruku_error  "Row does not have #{num_columns} columns: \n#{row.inspect}"
			tell_user "I will ignore this table."
			# XXX try to recover
			return md_br()
		end
		rows << row
	end

	children = (head+rows).flatten
	return md_el(:table, children, {:align => align})
end

#search_abbreviations ⇒ `Object`

# File 'lib/maruku/input/parse_doc.rb', line 114

# For every registered abbreviation, walks the document's strings
# (replace_each_string) and, in each string containing the abbreviation,
# replaces its first occurrence with an md_abbr element: the string
# becomes [text before, element, text after]. Strings without a match are
# returned unchanged.
def search_abbreviations
	self.abbreviations.each do |abbrev, title|
		# literal match: the abbreviation is escaped before use
		pattern = /#{Regexp.escape(abbrev)}/
		self.replace_each_string do |s|
			found = pattern.match(s)
			if found
				# fresh copies, so elements do not share strings with the table
				title_copy = title ? title.dup : nil
				[found.pre_match, md_abbr(abbrev.dup, title_copy), found.post_match]
			else
				s
			end
		end
	end
end

#split_cells(s) ⇒ `Object`



462
463
464

# File 'lib/maruku/input/parse_block.rb', line 462

# Splits a table row on '|' and returns its cells with surrounding
# whitespace removed; cells that are empty after stripping are dropped.
def split_cells(s)
	s.strip.split('|').map { |cell| cell.strip }.reject { |cell| cell.empty? }
end

#substitute_markdown_inside_raw_html ⇒ `Object`

(PHP Markdown extra) Search for elements that have markdown=1 or markdown=block defined

# File 'lib/maruku/input/parse_doc.rb', line 130

# (PHP Markdown Extra) Processes markdown embedded in raw HTML.
#
# For every :raw_html element whose @parsed_html is set, each HTML node
# carrying a "markdown" attribute has that attribute removed and each of
# its text nodes replaced by the HTML rendering of the parsed text. The
# text is parsed at block level when the attribute value is 'block' or
# the tag is listed in block_tags ('div'); otherwise at span level.
def substitute_markdown_inside_raw_html
	self.each_element(:raw_html) do |html_el|
		doc = html_el.instance_variable_get :@parsed_html
		# nothing to do when the HTML could not be parsed
		next unless doc

		# parse block-level markdown elements in these HTML tags
		block_tags = ['div']
		# use xpath to find elements with 'markdown' attribute
		doc.elements.to_a( "//*[attribute::markdown]" ).each do |node|
			# should we parse block-level or span-level?
			as_blocks = (node.attributes['markdown'] == 'block') ||
			            block_tags.include?(node.name)
			# remove 'markdown' attribute
			node.delete_attribute 'markdown'
			# replace every text child of the node
			node.texts.each do |original_text|
				s = original_text.to_s.strip # XXX
				parsed = as_blocks ? parse_text_as_markdown(s) : parse_lines_as_span([s])
				md_el(:dummy, parsed).children_to_html.each do |x|
					node.insert_before(original_text, x)
				end
				node.delete(original_text)
			end
		end
	end
end

Module: MaRuKu::In::Markdown::BlockLevelParser

Defined Under Namespace

Constant Summary

Constants included from SpanLevelParser

Constants included from Strings

Instance Method Summary collapse

Methods included from SpanLevelParser

Methods included from Helpers

Methods included from Strings

Instance Method Details

#eventually_comes_a_def_list(src) ⇒ Object

#expand_attribute_list(al, result) ⇒ Object

#parse_blocks(src) ⇒ Object

#parse_doc(s) ⇒ Object

#parse_text_as_markdown(text) ⇒ Object

#read_abbreviation(src) ⇒ Object

#read_ald(src) ⇒ Object

#read_code(src) ⇒ Object

#read_definition(src) ⇒ Object

#read_footnote_text(src) ⇒ Object

#read_header12(src) ⇒ Object

#read_header3(src) ⇒ Object

#read_indented_content(src, indentation, break_list, item_type) ⇒ Object

#read_list_item(src) ⇒ Object

#read_metadata(src) ⇒ Object

#read_paragraph(src) ⇒ Object

#read_quote(src) ⇒ Object

#read_raw_html(src) ⇒ Object

#read_ref_definition(src) ⇒ Object

#read_table(src) ⇒ Object

#search_abbreviations ⇒ Object

#split_cells(s) ⇒ Object

#substitute_markdown_inside_raw_html ⇒ Object