diff --git a/src/Pluf/Text/MarkDown.php b/src/Pluf/Text/MarkDown.php index 0477193..8128f2e 100644 --- a/src/Pluf/Text/MarkDown.php +++ b/src/Pluf/Text/MarkDown.php @@ -3,8 +3,8 @@ # Markdown Extra - A text-to-HTML conversion tool for web writers # # PHP Markdown & Extra -# Copyright (c) 2004-2008 Michel Fortin -# +# Copyright (c) 2004-2009 Michel Fortin +# # # Original Markdown # Copyright (c) 2004-2006 John Gruber @@ -12,8 +12,8 @@ # -define( 'MARKDOWN_VERSION', "1.0.1l" ); # Sun 11 May 2008 -define( 'MARKDOWNEXTRA_VERSION', "1.2.1" ); # Tue 27 May 2008 +define( 'MARKDOWN_VERSION', "1.0.1n" ); # Sat 10 Oct 2009 +define( 'MARKDOWNEXTRA_VERSION', "1.2.4" ); # Sat 10 Oct 2009 # @@ -35,6 +35,15 @@ define( 'MARKDOWNEXTRA_VERSION', "1.2.1" ); # Tue 27 May 2008 @define( 'MARKDOWN_FN_BACKLINK_CLASS', "" ); +# +# WordPress settings: +# + +# Change to false to remove Markdown from posts and/or comments. +@define( 'MARKDOWN_WP_POSTS', true ); +@define( 'MARKDOWN_WP_COMMENTS', true ); + + ### Standard Function Interface ### @@ -56,22 +65,10 @@ function Markdown($text) { } function Pluf_Text_MarkDown_parse($text) { -# -# Initialize the parser and return the result of its transform method. -# - # Setup static parser variable. - static $parser; - if (!isset($parser)) { - $parser_class = MARKDOWN_PARSER_CLASS; - $parser = new $parser_class; - } - - # Transform text using parser. - return $parser->transform($text); + return Markdown($text); } - # # Markdown Parser Class # @@ -81,16 +78,14 @@ class Markdown_Parser { # Regex to match balanced [brackets]. # Needed to insert a maximum bracked depth while converting to PHP. var $nested_brackets_depth = 6; - var $nested_brackets; + var $nested_brackets_re; var $nested_url_parenthesis_depth = 4; - var $nested_url_parenthesis; + var $nested_url_parenthesis_re; # Table of hash values for escaped characters: var $escape_chars = '\`*_{}[]()>#+-.!'; - - # Regular expression to catch extra attributes. - var $attr_regex; + var $escape_chars_re; # Change to ">" for HTML output. var $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX; @@ -110,15 +105,18 @@ class Markdown_Parser { # Constructor function. Initialize appropriate member variables. # $this->_initDetab(); + $this->prepareItalicsAndBold(); - $this->nested_brackets = + $this->nested_brackets_re = str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth). str_repeat('\])*', $this->nested_brackets_depth); - $this->nested_url_parenthesis = + $this->nested_url_parenthesis_re = str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth). str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth); + $this->escape_chars_re = '['.preg_quote($this->escape_chars).']'; + # Sort document, block, and span gamut in ascendent priority order. asort($this->document_gamut); asort($this->block_gamut); @@ -166,8 +164,8 @@ class Markdown_Parser { # $this->setup(); - # Remove UTF-8 BOM, if present. - $text = preg_replace('{^\xEF\xBB\xBF}', '', $text); + # Remove UTF-8 BOM and marker character in input, if present. + $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text); # Standardize line endings: # DOS to Unix and Mac to Unix @@ -219,14 +217,18 @@ class Markdown_Parser { [ ]* \n? # maybe *one* newline [ ]* - ? # url = $2 + (?: + <(.+?)> # url = $2 + | + (\S+?) # url = $3 + ) [ ]* \n? # maybe one newline [ ]* (?: (?<=\s) # lookbehind for whitespace ["(] - (.*?) # title = $3 + (.*?) # title = $4 [")] [ ]* )? # title is optional @@ -238,9 +240,9 @@ class Markdown_Parser { } function _stripLinkDefinitions_callback($matches) { $link_id = strtolower($matches[1]); - $this->urls[$link_id] = $this->encodeAttribute($matches[2]); - if (isset($matches[3])) - $this->titles[$link_id] = $this->encodeAttribute($matches[3]); + $url = $matches[2] == '' ? $matches[3] : $matches[2]; + $this->urls[$link_id] = $url; + $this->titles[$link_id] =& $matches[4]; return ''; # String that will replace the block } @@ -263,9 +265,9 @@ class Markdown_Parser { # inline later. # * List "b" is made of tags which are always block-level; # - $block_tags_a = 'ins|del'; - $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'. - 'script|noscript|form|fieldset|iframe|math'; + $block_tags_a_re = 'ins|del'; + $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'. + 'script|noscript|form|fieldset|iframe|math'; # Regular expression for the content of a block tag. $nested_tags_level = 4; @@ -328,7 +330,7 @@ class Markdown_Parser { # in between. [ ]{0,'.$less_than_tab.'} - <('.$block_tags_b.')# start tag = $2 + <('.$block_tags_b_re.')# start tag = $2 '.$attr.'> # attributes followed by > and \n '.$content.' # content, support nesting # the matching end tag @@ -338,7 +340,7 @@ class Markdown_Parser { | # Special version for tags of group a. [ ]{0,'.$less_than_tab.'} - <('.$block_tags_a.')# start tag = $3 + <('.$block_tags_a_re.')# start tag = $3 '.$attr.'>[ ]*\n # attributes followed by > '.$content2.' # content, support nesting # the matching end tag @@ -541,7 +543,7 @@ class Markdown_Parser { $text = preg_replace_callback('{ ( # wrap whole match in $1 \[ - ('.$this->nested_brackets.') # link text = $2 + ('.$this->nested_brackets_re.') # link text = $2 \] [ ]? # one optional space @@ -560,40 +562,40 @@ class Markdown_Parser { $text = preg_replace_callback('{ ( # wrap whole match in $1 \[ - ('.$this->nested_brackets.') # link text = $2 + ('.$this->nested_brackets_re.') # link text = $2 \] \( # literal paren - [ ]* + [ \n]* (?: - <(\S*)> # href = $3 + <(.+?)> # href = $3 | - ('.$this->nested_url_parenthesis.') # href = $4 + ('.$this->nested_url_parenthesis_re.') # href = $4 ) - [ ]* + [ \n]* ( # $5 ([\'"]) # quote char = $6 (.*?) # Title = $7 \6 # matching quote - [ ]* # ignore any spaces/tabs between closing quote and ) + [ \n]* # ignore any spaces/tabs between closing quote and ) )? # title is optional \) ) }xs', - array(&$this, '_DoAnchors_inline_callback'), $text); + array(&$this, '_doAnchors_inline_callback'), $text); # # Last, handle reference-style shortcuts: [link text] - # These must come last in case you've also got [link test][1] - # or [link test](/foo) + # These must come last in case you've also got [link text][1] + # or [link text](/foo) # -// $text = preg_replace_callback('{ -// ( # wrap whole match in $1 -// \[ -// ([^\[\]]+) # link text = $2; can\'t contain [ or ] -// \] -// ) -// }xs', -// array(&$this, '_doAnchors_reference_callback'), $text); + $text = preg_replace_callback('{ + ( # wrap whole match in $1 + \[ + ([^\[\]]+) # link text = $2; can\'t contain [ or ] + \] + ) + }xs', + array(&$this, '_doAnchors_reference_callback'), $text); $this->in_anchor = false; return $text; @@ -663,7 +665,7 @@ class Markdown_Parser { $text = preg_replace_callback('{ ( # wrap whole match in $1 !\[ - ('.$this->nested_brackets.') # alt text = $2 + ('.$this->nested_brackets_re.') # alt text = $2 \] [ ]? # one optional space @@ -684,22 +686,22 @@ class Markdown_Parser { $text = preg_replace_callback('{ ( # wrap whole match in $1 !\[ - ('.$this->nested_brackets.') # alt text = $2 + ('.$this->nested_brackets_re.') # alt text = $2 \] \s? # One optional whitespace character \( # literal paren - [ ]* + [ \n]* (?: <(\S*)> # src url = $3 | - ('.$this->nested_url_parenthesis.') # src url = $4 + ('.$this->nested_url_parenthesis_re.') # src url = $4 ) - [ ]* + [ \n]* ( # $5 ([\'"]) # quote char = $6 (.*?) # title = $7 \6 # matching quote - [ ]* + [ \n]* )? # title is optional \) ) @@ -719,10 +721,11 @@ class Markdown_Parser { $alt_text = $this->encodeAttribute($alt_text); if (isset($this->urls[$link_id])) { - $url = $this->urls[$link_id]; + $url = $this->encodeAttribute($this->urls[$link_id]); $result = "\"$alt_text\"";titles[$link_id])) { $title = $this->titles[$link_id]; + $title = $this->encodeAttribute($title); $result .= " title=\"$title\""; } $result .= $this->empty_element_suffix; @@ -785,6 +788,10 @@ class Markdown_Parser { return $text; } function _doHeaders_callback_setext($matches) { + # Terrible hack to check we haven't found an empty list item. + if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1])) + return $matches[0]; + $level = $matches[2]{0} == '=' ? 1 : 2; $block = "".$this->runSpanGamut($matches[1]).""; return "\n" . $this->hashBlock($block) . "\n\n"; @@ -803,30 +810,39 @@ class Markdown_Parser { $less_than_tab = $this->tab_width - 1; # Re-usable patterns to match list item bullets and number markers: - $marker_ul = '[*+-]'; - $marker_ol = '\d+[.]'; - $marker_any = "(?:$marker_ul|$marker_ol)"; + $marker_ul_re = '[*+-]'; + $marker_ol_re = '\d+[.]'; + $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)"; - $markers = array($marker_ul, $marker_ol); + $markers_relist = array( + $marker_ul_re => $marker_ol_re, + $marker_ol_re => $marker_ul_re, + ); - foreach ($markers as $marker) { + foreach ($markers_relist as $marker_re => $other_marker_re) { # Re-usable pattern to match any entirel ul or ol list: - $whole_list = ' + $whole_list_re = ' ( # $1 = whole list ( # $2 - [ ]{0,'.$less_than_tab.'} - ('.$marker.') # $3 = first list item marker + ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces + ('.$marker_re.') # $4 = first list item marker [ ]+ ) (?s:.+?) - ( # $4 + ( # $5 \z | \n{2,} (?=\S) (?! # Negative lookahead for another list item marker [ ]* - '.$marker.'[ ]+ + '.$marker_re.'[ ]+ + ) + | + (?= # Lookahead for another kind of list + \n + \3 # Must have the same indentation + '.$other_marker_re.'[ ]+ ) ) ) @@ -838,14 +854,14 @@ class Markdown_Parser { if ($this->list_level) { $text = preg_replace_callback('{ ^ - '.$whole_list.' + '.$whole_list_re.' }mx', array(&$this, '_doLists_callback'), $text); } else { $text = preg_replace_callback('{ (?:(?<=\n)\n|\A\n?) # Must eat the newline - '.$whole_list.' + '.$whole_list_re.' }mx', array(&$this, '_doLists_callback'), $text); } @@ -855,17 +871,17 @@ class Markdown_Parser { } function _doLists_callback($matches) { # Re-usable patterns to match list item bullets and number markers: - $marker_ul = '[*+-]'; - $marker_ol = '\d+[.]'; - $marker_any = "(?:$marker_ul|$marker_ol)"; + $marker_ul_re = '[*+-]'; + $marker_ol_re = '\d+[.]'; + $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)"; $list = $matches[1]; - $list_type = preg_match("/$marker_ul/", $matches[3]) ? "ul" : "ol"; + $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol"; - $marker_any = ( $list_type == "ul" ? $marker_ul : $marker_ol ); + $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re ); $list .= "\n"; - $result = $this->processListItems($list, $marker_any); + $result = $this->processListItems($list, $marker_any_re); $result = $this->hashBlock("<$list_type>\n" . $result . ""); return "\n". $result ."\n\n"; @@ -873,7 +889,7 @@ class Markdown_Parser { var $list_level = 0; - function processListItems($list_str, $marker_any) { + function processListItems($list_str, $marker_any_re) { # # Process the contents of a single ordered or unordered list, splitting it # into individual list items. @@ -906,11 +922,13 @@ class Markdown_Parser { $list_str = preg_replace_callback('{ (\n)? # leading line = $1 - (^[ ]*) # leading whitespace = $2 - ('.$marker_any.' [ ]+) # list marker and space = $3 - ((?s:.+?)) # list item text = $4 + (^[ ]*) # leading whitespace = $2 + ('.$marker_any_re.' # list marker and space = $3 + (?:[ ]+|(?=\n)) # space only required if item is not empty + ) + ((?s:.*?)) # list item text = $4 (?:(\n+(?=\n))|\n) # tailing blank line = $5 - (?= \n* (\z | \2 ('.$marker_any.') [ ]+)) + (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n)))) }xm', array(&$this, '_processListItems_callback'), $list_str); @@ -983,45 +1001,165 @@ class Markdown_Parser { } - function doItalicsAndBold($text) { - # must go first: - $text = preg_replace_callback('{ - ( # $1: Marker - (? - [^*_]+? # Anthing not em markers. - | - # Balence any regular emphasis inside. - \1 (?=\S) .+? (?<=\S) \1 - | - . # Allow unbalenced * and _. - )+? - ) - (?<=\S) \1\1 # End mark not preceded by whitespace. - }sx', - array(&$this, '_doItalicAndBold_strong_callback'), $text); - # Then : - $text = preg_replace_callback( - '{ ( (?runSpanGamut($text); - return $this->hashPart("$text"); + var $em_relist = array( + '' => '(?:(? '(?<=\S|^)(? '(?<=\S|^)(? '(?:(? '(?<=\S|^)(? '(?<=\S|^)(? '(?:(? '(?<=\S|^)(? '(?<=\S|^)(?em_relist as $em => $em_re) { + foreach ($this->strong_relist as $strong => $strong_re) { + # Construct list of allowed token expressions. + $token_relist = array(); + if (isset($this->em_strong_relist["$em$strong"])) { + $token_relist[] = $this->em_strong_relist["$em$strong"]; + } + $token_relist[] = $em_re; + $token_relist[] = $strong_re; + + # Construct master expression from list. + $token_re = '{('. implode('|', $token_relist) .')}'; + $this->em_strong_prepared_relist["$em$strong"] = $token_re; + } + } } - function _doItalicAndBold_strong_callback($matches) { - $text = $matches[2]; - $text = $this->runSpanGamut($text); - return $this->hashPart("$text"); + + function doItalicsAndBold($text) { + $token_stack = array(''); + $text_stack = array(''); + $em = ''; + $strong = ''; + $tree_char_em = false; + + while (1) { + # + # Get prepared regular expression for seraching emphasis tokens + # in current context. + # + $token_re = $this->em_strong_prepared_relist["$em$strong"]; + + # + # Each loop iteration search for the next emphasis token. + # Each token is then passed to handleSpanToken. + # + $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE); + $text_stack[0] .= $parts[0]; + $token =& $parts[1]; + $text =& $parts[2]; + + if (empty($token)) { + # Reached end of text span: empty stack without emitting. + # any more emphasis. + while ($token_stack[0]) { + $text_stack[1] .= array_shift($token_stack); + $text_stack[0] .= array_shift($text_stack); + } + break; + } + + $token_len = strlen($token); + if ($tree_char_em) { + # Reached closing marker while inside a three-char emphasis. + if ($token_len == 3) { + # Three-char closing marker, close em and strong. + array_shift($token_stack); + $span = array_shift($text_stack); + $span = $this->runSpanGamut($span); + $span = "$span"; + $text_stack[0] .= $this->hashPart($span); + $em = ''; + $strong = ''; + } else { + # Other closing marker: close one em or strong and + # change current token state to match the other + $token_stack[0] = str_repeat($token{0}, 3-$token_len); + $tag = $token_len == 2 ? "strong" : "em"; + $span = $text_stack[0]; + $span = $this->runSpanGamut($span); + $span = "<$tag>$span"; + $text_stack[0] = $this->hashPart($span); + $$tag = ''; # $$tag stands for $em or $strong + } + $tree_char_em = false; + } else if ($token_len == 3) { + if ($em) { + # Reached closing marker for both em and strong. + # Closing strong marker: + for ($i = 0; $i < 2; ++$i) { + $shifted_token = array_shift($token_stack); + $tag = strlen($shifted_token) == 2 ? "strong" : "em"; + $span = array_shift($text_stack); + $span = $this->runSpanGamut($span); + $span = "<$tag>$span"; + $text_stack[0] .= $this->hashPart($span); + $$tag = ''; # $$tag stands for $em or $strong + } + } else { + # Reached opening three-char emphasis marker. Push on token + # stack; will be handled by the special condition above. + $em = $token{0}; + $strong = "$em$em"; + array_unshift($token_stack, $token); + array_unshift($text_stack, ''); + $tree_char_em = true; + } + } else if ($token_len == 2) { + if ($strong) { + # Unwind any dangling emphasis marker: + if (strlen($token_stack[0]) == 1) { + $text_stack[1] .= array_shift($token_stack); + $text_stack[0] .= array_shift($text_stack); + } + # Closing strong marker: + array_shift($token_stack); + $span = array_shift($text_stack); + $span = $this->runSpanGamut($span); + $span = "$span"; + $text_stack[0] .= $this->hashPart($span); + $strong = ''; + } else { + array_unshift($token_stack, $token); + array_unshift($text_stack, ''); + $strong = $token; + } + } else { + # Here $token_len == 1 + if ($em) { + if (strlen($token_stack[0]) == 1) { + # Closing emphasis marker: + array_shift($token_stack); + $span = array_shift($text_stack); + $span = $this->runSpanGamut($span); + $span = "$span"; + $text_stack[0] .= $this->hashPart($span); + $em = ''; + } else { + $text_stack[0] .= $token; + } + } else { + array_unshift($token_stack, $token); + array_unshift($text_stack, ''); + $em = $token; + } + } + } + return $text_stack[0]; } @@ -1050,7 +1188,7 @@ class Markdown_Parser { # These leading spaces cause problem with
 content, 
 		# so we need to fix that:
 		$bq = preg_replace_callback('{(\s*
.+?
)}sx', - array(&$this, '_DoBlockQuotes_callback2'), $bq); + array(&$this, '_doBlockQuotes_callback2'), $bq); return "\n". $this->hashBlock("
\n$bq\n
")."\n\n"; } @@ -1133,7 +1271,8 @@ class Markdown_Parser { function encodeAttribute($text) { # - # Encode text for a double-quoted HTML attribute. + # Encode text for a double-quoted HTML attribute. This function + # is *not* suitable for attributes enclosed in single quotes. # $text = $this->encodeAmpsAndAngles($text); $text = str_replace('"', '"', $text); @@ -1142,20 +1281,21 @@ class Markdown_Parser { function encodeAmpsAndAngles($text) { - # Smart processing for ampersands and angle brackets that need to be encoded. + # + # Smart processing for ampersands and angle brackets that need to + # be encoded. Valid character entities are left alone unless the + # no-entities mode is set. + # if ($this->no_entities) { $text = str_replace('&', '&', $text); - $text = str_replace('<', '<', $text); - return $text; + } else { + # Ampersand-encoding based entirely on Nat Irons's Amputator + # MT plugin: + $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', + '&', $text);; } - - # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: - # http://bumppo.net/projects/amputator/ - $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', - '&', $text);; - - # Encode naked <'s - $text = preg_replace('{<(?![a-z/?\$!%])}i', '<', $text); + # Encode remaining <'s + $text = str_replace('<', '<', $text); return $text; } @@ -1170,9 +1310,17 @@ class Markdown_Parser { < (?:mailto:)? ( - [-.\w\x80-\xFF]+ + (?: + [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+ + | + ".*?" + ) \@ - [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+ + (?: + [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+ + | + \[[\d.a-fA-F:]+\] # IPv4 & IPv6 + ) ) > }xi', @@ -1240,9 +1388,9 @@ class Markdown_Parser { # $output = ''; - $regex = '{ + $span_re = '{ ( - \\\\['.preg_quote($this->escape_chars).'] + \\\\'.$this->escape_chars_re.' | (? | <%.*?%> # processing instruction | - <[/!$]?[-a-zA-Z0-9:]+ # regular tags + <[/!$]?[-a-zA-Z0-9:_]+ # regular tags (?> \s (?>[^"\'>]+|"[^"]*"|\'[^\']*\')* @@ -1268,7 +1416,7 @@ class Markdown_Parser { # openning code span marker, or the next escaped character. # Each token is then passed to handleSpanToken. # - $parts = preg_split($regex, $str, 2, PREG_SPLIT_DELIM_CAPTURE); + $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE); # Create token from text preceding tag. if ($parts[0] != "") { @@ -1299,7 +1447,7 @@ class Markdown_Parser { return $this->hashPart("&#". ord($token{1}). ";"); case "`": # Search for end marker in remaining text. - if (preg_match('/^(.*?[^`])'.$token.'(?!`)(.*)$/sm', + if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm', $str, $matches)) { $str = $matches[2]; @@ -1415,6 +1563,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # Insert extra document, block, and span transformations. # Parent constructor will do the sorting. $this->document_gamut += array( + "doFencedCodeBlocks" => 5, "stripFootnotes" => 15, "stripAbbreviations" => 25, "appendFootnotes" => 50, @@ -1433,11 +1582,11 @@ class MarkdownExtra_Parser extends Markdown_Parser { } - # Extra hashes used during extra transformations. + # Extra variables used during extra transformations. var $footnotes = array(); var $footnotes_ordered = array(); var $abbr_desciptions = array(); - var $abbr_matches = array(); + var $abbr_word_re = ''; # Give the current footnote number. var $footnote_counter = 1; @@ -1452,11 +1601,13 @@ class MarkdownExtra_Parser extends Markdown_Parser { $this->footnotes = array(); $this->footnotes_ordered = array(); $this->abbr_desciptions = array(); - $this->abbr_matches = array(); + $this->abbr_word_re = ''; $this->footnote_counter = 1; foreach ($this->predef_abbr as $abbr_word => $abbr_desc) { - $this->abbr_matches[] = preg_quote($abbr_word); + if ($this->abbr_word_re) + $this->abbr_word_re .= '|'; + $this->abbr_word_re .= preg_quote($abbr_word); $this->abbr_desciptions[$abbr_word] = trim($abbr_desc); } } @@ -1468,7 +1619,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { $this->footnotes = array(); $this->footnotes_ordered = array(); $this->abbr_desciptions = array(); - $this->abbr_matches = array(); + $this->abbr_word_re = ''; parent::teardown(); } @@ -1477,20 +1628,20 @@ class MarkdownExtra_Parser extends Markdown_Parser { ### HTML Block Parser ### # Tags that are always treated as block tags: - var $block_tags = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|form|fieldset|iframe|hr|legend'; + var $block_tags_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|form|fieldset|iframe|hr|legend'; # Tags treated as block tags only if the opening tag is alone on it's line: - var $context_block_tags = 'script|noscript|math|ins|del'; + var $context_block_tags_re = 'script|noscript|math|ins|del'; # Tags where markdown="1" default to span mode: - var $contain_span_tags = 'p|h[1-6]|li|dd|dt|td|th|legend|address'; + var $contain_span_tags_re = 'p|h[1-6]|li|dd|dt|td|th|legend|address'; # Tags which must not have their contents modified, no matter where # they appear: - var $clean_tags = 'script|math'; + var $clean_tags_re = 'script|math'; # Tags that do not need to be closed. - var $auto_close_tags = 'hr|img'; + var $auto_close_tags_re = 'hr|img'; function hashHTMLBlocks($text) { @@ -1517,7 +1668,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { return $text; } function _hashHTMLBlocks_inMarkdown($text, $indent = 0, - $enclosing_tag = '', $span = false) + $enclosing_tag_re = '', $span = false) { # # Parse markdown text, calling _HashHTMLBlocks_InHTML for block tags. @@ -1535,7 +1686,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # If you don't like this, just don't indent the tag on which # you apply the markdown="1" attribute. # - # * If $enclosing_tag is not empty, stops at the first unmatched closing + # * If $enclosing_tag_re is not empty, stops at the first unmatched closing # tag with that name. Nested tags supported. # # * If $span is true, text inside must treated as span. So any double @@ -1547,8 +1698,8 @@ class MarkdownExtra_Parser extends Markdown_Parser { if ($text === '') return array('', ''); # Regex to check for the presense of newlines around a block tag. - $newline_match_before = '/(?:^\n?|\n\n)*$/'; - $newline_match_after = + $newline_before_re = '/(?:^\n?|\n\n)*$/'; + $newline_after_re = '{ ^ # Start of text following the tag. (?>[ ]*)? # Optional comment. @@ -1556,18 +1707,18 @@ class MarkdownExtra_Parser extends Markdown_Parser { }xs'; # Regex to match any tag. - $block_tag_match = + $block_tag_re = '{ ( # $2: Capture hole tag. # Tag name. - '.$this->block_tags.' | - '.$this->context_block_tags.' | - '.$this->clean_tags.' | - (?!\s)'.$enclosing_tag.' + '.$this->block_tags_re.' | + '.$this->context_block_tags_re.' | + '.$this->clean_tags_re.' | + (?!\s)'.$enclosing_tag_re.' ) (?: - (?=[\s"\'/]) # Allowed characters after tag name. + (?=[\s"\'/a-zA-Z0-9]) # Allowed characters after tag name. (?> ".*?" | # Double quotes (can contain `>`) \'.*?\' | # Single quotes (can contain `>`) @@ -1587,7 +1738,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { '. ( !$span ? ' # If not in span. | # Indented code block - (?> ^[ ]*\n? | \n[ ]*\n ) + (?: ^[ ]*\n | ^ | \n[ ]*\n ) [ ]{'.($indent+4).'}[^\n]* \n (?> (?: [ ]{'.($indent+4).'}[^\n]* | [ ]* ) \n @@ -1615,7 +1766,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # pattern will be at the end, and between will be any catches made # by the pattern. # - $parts = preg_split($block_tag_match, $text, 2, + $parts = preg_split($block_tag_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE); # If in Markdown span mode, add a empty-string span-level hash @@ -1636,13 +1787,15 @@ class MarkdownExtra_Parser extends Markdown_Parser { $tag = $parts[1]; # Tag to handle. $text = $parts[2]; # Remaining text after current tag. + $tag_re = preg_quote($tag); # For use in a regular expression. # # Check for: Code span marker # if ($tag{0} == "`") { # Find corresponding end marker. - if (preg_match('{^(?>.+?|\n(?!\n))*?(?.+?|\n(?!\n))*?(?.*\n)+?'.$tag_re.' *\n}', $text, + $matches)) + { + # End marker found: pass text unchanged until marker. + $parsed .= $tag . $matches[0]; + $text = substr($text, strlen($matches[0])); } else { - # Fenced code block marker: find matching end marker. - if (preg_match('{^(?>.*\n)+?'.trim($tag).' *\n}', $text, - $matches)) - { - # End marker found: pass text unchanged until marker. - $parsed .= $tag . $matches[0]; - $text = substr($text, strlen($matches[0])); - } - else { - # No end marker: just skip it. - $parsed .= $tag; - } + # No end marker: just skip it. + $parsed .= $tag; } } # # Check for: Opening Block level tag or - # Opening Content Block tag (like ins and del) + # Opening Context Block tag (like ins and del) # used as a block tag (tag is alone on it's line). # - else if (preg_match("{^<(?:$this->block_tags)\b}", $tag) || - ( preg_match("{^<(?:$this->context_block_tags)\b}", $tag) && - preg_match($newline_match_before, $parsed) && - preg_match($newline_match_after, $text) ) + else if (preg_match('{^<(?:'.$this->block_tags_re.')\b}', $tag) || + ( preg_match('{^<(?:'.$this->context_block_tags_re.')\b}', $tag) && + preg_match($newline_before_re, $parsed) && + preg_match($newline_after_re, $text) ) ) { # Need to parse tag and following text using the HTML parser. @@ -1700,7 +1855,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # Check for: Clean tag (like script, math) # HTML Comments, processing instructions. # - else if (preg_match("{^<(?:$this->clean_tags)\b}", $tag) || + else if (preg_match('{^<(?:'.$this->clean_tags_re.')\b}', $tag) || $tag{1} == '!' || $tag{1} == '?') { # Need to parse tag and following text using the HTML parser. @@ -1713,9 +1868,9 @@ class MarkdownExtra_Parser extends Markdown_Parser { # # Check for: Tag with same name as enclosing tag. # - else if ($enclosing_tag !== '' && + else if ($enclosing_tag_re !== '' && # Same name as enclosing tag. - preg_match("{^ ".*?" | # Double quotes (can contain `>`) \'.*?\' | # Single quotes (can contain `>`) @@ -1801,9 +1956,10 @@ class MarkdownExtra_Parser extends Markdown_Parser { # # Get the name of the starting tag. + # (This pattern makes $base_tag_name_re safe without quoting.) # - if (preg_match("/^<([\w:$]*)\b/", $text, $matches)) - $base_tag_name = $matches[1]; + if (preg_match('/^<([\w:$]*)\b/', $text, $matches)) + $base_tag_name_re = $matches[1]; # # Loop through every tag until we find the corresponding closing tag. @@ -1815,7 +1971,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # pattern will be at the end, and between will be any catches made # by the pattern. # - $parts = preg_split($tag_match, $text, 2, PREG_SPLIT_DELIM_CAPTURE); + $parts = preg_split($tag_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE); if (count($parts) < 3) { # @@ -1835,7 +1991,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # Check for: Auto-close tag (like
) # Comments and Processing Instructions. # - if (preg_match("{^auto_close_tags)\b}", $tag) || + if (preg_match('{^auto_close_tags_re.')\b}', $tag) || $tag{1} == '!' || $tag{1} == '?') { # Just add the tag to the block as if it was text. @@ -1846,7 +2002,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { # Increase/decrease nested tag count. Only do so if # the tag's name match base tag's. # - if (preg_match("{^mode = $attr_m[2] . $attr_m[3]; $span_mode = $this->mode == 'span' || $this->mode != 'block' && - preg_match("{^<(?:$this->contain_span_tags)\b}", $tag); + preg_match('{^<(?:'.$this->contain_span_tags_re.')\b}', $tag); # Calculate indent before tag. if (preg_match('/(?:^|\n)( *?)(?! ).*?$/', $block_text, $matches)) { @@ -1879,13 +2035,14 @@ class MarkdownExtra_Parser extends Markdown_Parser { $parsed .= $this->$hash_method($block_text); # Get enclosing tag name for the ParseMarkdown function. + # (This pattern makes $tag_name_re safe without quoting.) preg_match('/^<([\w:$]*)\b/', $tag, $matches); - $tag_name = $matches[1]; + $tag_name_re = $matches[1]; # Parse the content using the HTML-in-Markdown parser. list ($block_text, $text) = $this->_hashHTMLBlocks_inMarkdown($text, $indent, - $tag_name, $span_mode); + $tag_name_re, $span_mode); # Outdent markdown text. if ($indent > 0) { @@ -1969,6 +2126,8 @@ class MarkdownExtra_Parser extends Markdown_Parser { return " id=\"$attr\""; } function _doHeaders_callback_setext($matches) { + if ($matches[3] == '-' && preg_match('{^- }', $matches[1])) + return $matches[0]; $level = $matches[3]{0} == '=' ? 1 : 2; $attr = $this->_doHeaders_attr($id =& $matches[2]); $block = "".$this->runSpanGamut($matches[1]).""; @@ -2119,7 +2278,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { $less_than_tab = $this->tab_width - 1; # Re-usable pattern to match any entire dl list: - $whole_list = '(?> + $whole_list_re = '(?> ( # $1 = whole list ( # $2 [ ]{0,'.$less_than_tab.'} @@ -2148,7 +2307,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { $text = preg_replace_callback('{ (?>\A\n?|(?<=\n\n)) - '.$whole_list.' + '.$whole_list_re.' }mx', array(&$this, '_doDefLists_callback'), $text); @@ -2285,63 +2444,25 @@ class MarkdownExtra_Parser extends Markdown_Parser { } - function doItalicsAndBold($text) { # - # Redefined to change emphasis by underscore behaviour so that it does not + # Redefining emphasis markers so that emphasis by underscore does not # work in the middle of a word. # - # must go first: - $text = preg_replace_callback(array( - '{ - ( # $1: Marker - (? - [^_]+? # Anthing not em markers. - | - # Balence any regular _ emphasis inside. - (? - [^*]+? # Anthing not em markers. - | - # Balence any regular * emphasis inside. - \* (?=\S) (.+?) (?<=\S) \* - | - \* # Allow unbalenced as last resort. - )+? - ) - (?<=\S) \*\* # End mark not preceded by whitespace. - }sx', - ), - array(&$this, '_doItalicAndBold_strong_callback'), $text); - # Then : - $text = preg_replace_callback(array( - '{ ( (? '(?:(? '(?<=\S|^)(? '(?<=\S|^)(? '(?:(? '(?<=\S|^)(? '(?<=\S|^)(? '(?:(? '(?<=\S|^)(? '(?<=\S|^)(?footnotes_ordered)) { $text .= "\n\n"; $text .= "
\n"; - $text .= "empty_element_suffix ."\n"; $text .= "
    \n\n"; $attr = " rev=\"footnote\""; @@ -2464,10 +2585,11 @@ class MarkdownExtra_Parser extends Markdown_Parser { $footnote = preg_replace_callback('{F\x1Afn:(.*?)\x1A:}', array(&$this, '_appendFootnotes_callback'), $footnote); - $attr2 = str_replace("%%", ++$num, $attr); + $attr = str_replace("%%", ++$num, $attr); + $note_id = $this->encodeAttribute($note_id); # Add backlink to last paragraph; create new paragraph if needed. - $backlink = ""; + $backlink = ""; if (preg_match('{

    $}', $footnote)) { $footnote = substr($footnote, 0, -4) . " $backlink

    "; } else { @@ -2506,7 +2628,9 @@ class MarkdownExtra_Parser extends Markdown_Parser { $title = $this->encodeAttribute($title); $attr .= " title=\"$title\""; } + $attr = str_replace("%%", $num, $attr); + $node_id = $this->encodeAttribute($node_id); return "". @@ -2538,7 +2662,9 @@ class MarkdownExtra_Parser extends Markdown_Parser { function _stripAbbreviations_callback($matches) { $abbr_word = $matches[1]; $abbr_desc = $matches[2]; - $this->abbr_matches[] = preg_quote($abbr_word); + if ($this->abbr_word_re) + $this->abbr_word_re .= '|'; + $this->abbr_word_re .= preg_quote($abbr_word); $this->abbr_desciptions[$abbr_word] = trim($abbr_desc); return ''; # String that will replace the block } @@ -2548,12 +2674,12 @@ class MarkdownExtra_Parser extends Markdown_Parser { # # Find defined abbreviations in text and wrap them in elements. # - if ($this->abbr_matches) { - // cannot use the /x modifier because abbr_matches may - // contain spaces: + if ($this->abbr_word_re) { + // cannot use the /x modifier because abbr_word_re may + // contain significant spaces: $text = preg_replace_callback('{'. '(?abbr_matches) .')'. + '(?:'.$this->abbr_word_re.')'. '(?![\w\x1A])'. '}', array(&$this, '_doAbbreviations_callback'), $text); @@ -2567,7 +2693,7 @@ class MarkdownExtra_Parser extends Markdown_Parser { if (empty($desc)) { return $this->hashPart("$abbr"); } else { - $desc = htmlspecialchars($desc, ENT_NOQUOTES); + $desc = $this->encodeAttribute($desc); return $this->hashPart("$abbr"); } } else { @@ -2626,9 +2752,9 @@ See the readme file for detailed release notes for this version. Copyright and License --------------------- -PHP Markdown & Extra -Copyright (c) 2004-2008 Michel Fortin - +PHP Markdown & Extra +Copyright (c) 2004-2009 Michel Fortin + All rights reserved. Based on Markdown