marked/src/rules.js
2020-09-07 21:43:28 -05:00

340 lines
13 KiB
JavaScript

const {
noopTest,
edit,
merge
} = require('./helpers.js');
/**
* Block-Level Grammar
*/
const block = {
newline: /^\n+/,
code: /^( {4}[^\n]+\n*)+/,
fences: /^ {0,3}(`{3,}(?=[^`\n]*\n)|~{3,})([^\n]*)\n(?:|([\s\S]*?)\n)(?: {0,3}\1[~`]* *(?:\n+|$)|$)/,
hr: /^ {0,3}((?:- *){3,}|(?:_ *){3,}|(?:\* *){3,})(?:\n+|$)/,
heading: /^ {0,3}(#{1,6}) +([^\n]*?)(?: +#+)? *(?:\n+|$)/,
blockquote: /^( {0,3}> ?(paragraph|[^\n]*)(?:\n|$))+/,
list: /^( {0,3})(bull) [\s\S]+?(?:hr|def|\n{2,}(?! )(?!\1bull )\n*|\s*$)/,
html: '^ {0,3}(?:' // optional indentation
+ '<(script|pre|style)[\\s>][\\s\\S]*?(?:</\\1>[^\\n]*\\n+|$)' // (1)
+ '|comment[^\\n]*(\\n+|$)' // (2)
+ '|<\\?[\\s\\S]*?(?:\\?>\\n*|$)' // (3)
+ '|<![A-Z][\\s\\S]*?(?:>\\n*|$)' // (4)
+ '|<!\\[CDATA\\[[\\s\\S]*?(?:\\]\\]>\\n*|$)' // (5)
+ '|</?(tag)(?: +|\\n|/?>)[\\s\\S]*?(?:\\n{2,}|$)' // (6)
+ '|<(?!script|pre|style)([a-z][\\w-]*)(?:attribute)*? */?>(?=[ \\t]*(?:\\n|$))[\\s\\S]*?(?:\\n{2,}|$)' // (7) open tag
+ '|</(?!script|pre|style)[a-z][\\w-]*\\s*>(?=[ \\t]*(?:\\n|$))[\\s\\S]*?(?:\\n{2,}|$)' // (7) closing tag
+ ')',
def: /^ {0,3}\[(label)\]: *\n? *<?([^\s>]+)>?(?:(?: +\n? *| *\n *)(title))? *(?:\n+|$)/,
nptable: noopTest,
table: noopTest,
lheading: /^([^\n]+)\n {0,3}(=+|-+) *(?:\n+|$)/,
// regex template, placeholders will be replaced according to different paragraph
// interruption rules of commonmark and the original markdown spec:
_paragraph: /^([^\n]+(?:\n(?!hr|heading|lheading|blockquote|fences|list|html)[^\n]+)*)/,
text: /^[^\n]+/
};
block._label = /(?!\s*\])(?:\\[\[\]]|[^\[\]])+/;
block._title = /(?:"(?:\\"?|[^"\\])*"|'[^'\n]*(?:\n[^'\n]+)*\n?'|\([^()]*\))/;
block.def = edit(block.def)
.replace('label', block._label)
.replace('title', block._title)
.getRegex();
block.bullet = /(?:[*+-]|\d{1,9}[.)])/;
block.item = /^( *)(bull) ?[^\n]*(?:\n(?!\1bull ?)[^\n]*)*/;
block.item = edit(block.item, 'gm')
.replace(/bull/g, block.bullet)
.getRegex();
block.list = edit(block.list)
.replace(/bull/g, block.bullet)
.replace('hr', '\\n+(?=\\1?(?:(?:- *){3,}|(?:_ *){3,}|(?:\\* *){3,})(?:\\n+|$))')
.replace('def', '\\n+(?=' + block.def.source + ')')
.getRegex();
block._tag = 'address|article|aside|base|basefont|blockquote|body|caption'
+ '|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|fieldset|figcaption'
+ '|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|iframe'
+ '|legend|li|link|main|menu|menuitem|meta|nav|noframes|ol|optgroup|option'
+ '|p|param|section|source|summary|table|tbody|td|tfoot|th|thead|title|tr'
+ '|track|ul';
block._comment = /<!--(?!-?>)[\s\S]*?(?:-->|$)/;
block.html = edit(block.html, 'i')
.replace('comment', block._comment)
.replace('tag', block._tag)
.replace('attribute', / +[a-zA-Z:_][\w.:-]*(?: *= *"[^"\n]*"| *= *'[^'\n]*'| *= *[^\s"'=<>`]+)?/)
.getRegex();
block.paragraph = edit(block._paragraph)
.replace('hr', block.hr)
.replace('heading', ' {0,3}#{1,6} ')
.replace('|lheading', '') // setex headings don't interrupt commonmark paragraphs
.replace('blockquote', ' {0,3}>')
.replace('fences', ' {0,3}(?:`{3,}(?=[^`\\n]*\\n)|~{3,})[^\\n]*\\n')
.replace('list', ' {0,3}(?:[*+-]|1[.)]) ') // only lists starting from 1 can interrupt
.replace('html', '</?(?:tag)(?: +|\\n|/?>)|<(?:script|pre|style|!--)')
.replace('tag', block._tag) // pars can be interrupted by type (6) html blocks
.getRegex();
block.blockquote = edit(block.blockquote)
.replace('paragraph', block.paragraph)
.getRegex();
/**
* Normal Block Grammar
*/
block.normal = merge({}, block);
/**
* GFM Block Grammar
*/
block.gfm = merge({}, block.normal, {
nptable: '^ *([^|\\n ].*\\|.*)\\n' // Header
+ ' {0,3}([-:]+ *\\|[-| :]*)' // Align
+ '(?:\\n((?:(?!\\n|hr|heading|blockquote|code|fences|list|html).*(?:\\n|$))*)\\n*|$)', // Cells
table: '^ *\\|(.+)\\n' // Header
+ ' {0,3}\\|?( *[-:]+[-| :]*)' // Align
+ '(?:\\n *((?:(?!\\n|hr|heading|blockquote|code|fences|list|html).*(?:\\n|$))*)\\n*|$)' // Cells
});
block.gfm.nptable = edit(block.gfm.nptable)
.replace('hr', block.hr)
.replace('heading', ' {0,3}#{1,6} ')
.replace('blockquote', ' {0,3}>')
.replace('code', ' {4}[^\\n]')
.replace('fences', ' {0,3}(?:`{3,}(?=[^`\\n]*\\n)|~{3,})[^\\n]*\\n')
.replace('list', ' {0,3}(?:[*+-]|1[.)]) ') // only lists starting from 1 can interrupt
.replace('html', '</?(?:tag)(?: +|\\n|/?>)|<(?:script|pre|style|!--)')
.replace('tag', block._tag) // tables can be interrupted by type (6) html blocks
.getRegex();
block.gfm.table = edit(block.gfm.table)
.replace('hr', block.hr)
.replace('heading', ' {0,3}#{1,6} ')
.replace('blockquote', ' {0,3}>')
.replace('code', ' {4}[^\\n]')
.replace('fences', ' {0,3}(?:`{3,}(?=[^`\\n]*\\n)|~{3,})[^\\n]*\\n')
.replace('list', ' {0,3}(?:[*+-]|1[.)]) ') // only lists starting from 1 can interrupt
.replace('html', '</?(?:tag)(?: +|\\n|/?>)|<(?:script|pre|style|!--)')
.replace('tag', block._tag) // tables can be interrupted by type (6) html blocks
.getRegex();
/**
* Pedantic grammar (original John Gruber's loose markdown specification)
*/
block.pedantic = merge({}, block.normal, {
html: edit(
'^ *(?:comment *(?:\\n|\\s*$)'
+ '|<(tag)[\\s\\S]+?</\\1> *(?:\\n{2,}|\\s*$)' // closed tag
+ '|<tag(?:"[^"]*"|\'[^\']*\'|\\s[^\'"/>\\s]*)*?/?> *(?:\\n{2,}|\\s*$))')
.replace('comment', block._comment)
.replace(/tag/g, '(?!(?:'
+ 'a|em|strong|small|s|cite|q|dfn|abbr|data|time|code|var|samp|kbd|sub'
+ '|sup|i|b|u|mark|ruby|rt|rp|bdi|bdo|span|br|wbr|ins|del|img)'
+ '\\b)\\w+(?!:|[^\\w\\s@]*@)\\b')
.getRegex(),
def: /^ *\[([^\]]+)\]: *<?([^\s>]+)>?(?: +(["(][^\n]+[")]))? *(?:\n+|$)/,
heading: /^ *(#{1,6}) *([^\n]+?) *(?:#+ *)?(?:\n+|$)/,
fences: noopTest, // fences not supported
paragraph: edit(block.normal._paragraph)
.replace('hr', block.hr)
.replace('heading', ' *#{1,6} *[^\n]')
.replace('lheading', block.lheading)
.replace('blockquote', ' {0,3}>')
.replace('|fences', '')
.replace('|list', '')
.replace('|html', '')
.getRegex()
});
/**
* Inline-Level Grammar
*/
const inline = {
escape: /^\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/,
autolink: /^<(scheme:[^\s\x00-\x1f<>]*|email)>/,
url: noopTest,
tag: '^comment'
+ '|^</[a-zA-Z][\\w:-]*\\s*>' // self-closing tag
+ '|^<[a-zA-Z][\\w-]*(?:attribute)*?\\s*/?>' // open tag
+ '|^<\\?[\\s\\S]*?\\?>' // processing instruction, e.g. <?php ?>
+ '|^<![a-zA-Z]+\\s[\\s\\S]*?>' // declaration, e.g. <!DOCTYPE html>
+ '|^<!\\[CDATA\\[[\\s\\S]*?\\]\\]>', // CDATA section
link: /^!?\[(label)\]\(\s*(href)(?:\s+(title))?\s*\)/,
reflink: /^!?\[(label)\]\[(?!\s*\])((?:\\[\[\]]?|[^\[\]\\])+)\]/,
nolink: /^!?\[(?!\s*\])((?:\[[^\[\]]*\]|\\[\[\]]|[^\[\]])*)\](?:\[\])?/,
reflinkSearch: 'reflink|nolink(?!\\()',
strong: {
start: /^(?:(\*\*(?=[*punctuation]))|\*\*)(?![\s])|__/, // (1) returns if starts w/ punctuation
middle: /^\*\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*\*$|^__(?![\s])((?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?)__$/,
endAst: /[^punctuation\s]\*\*(?!\*)|[punctuation]\*\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]__(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
em: {
start: /^(?:(\*(?=[punctuation]))|\*)(?![*\s])|_/, // (1) returns if starts w/ punctuation
middle: /^\*(?:(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)|\*(?:(?!overlapSkip)(?:[^*]|\\\*)|overlapSkip)*?\*)+?\*$|^_(?![_\s])(?:(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)|_(?:(?!overlapSkip)(?:[^_]|\\_)|overlapSkip)*?_)+?_$/,
endAst: /[^punctuation\s]\*(?!\*)|[punctuation]\*(?!\*)(?:(?=[punctuation_\s]|$))/, // last char can't be punct, or final * must also be followed by punct (or endline)
endUnd: /[^\s]_(?!_)(?:(?=[punctuation*\s])|$)/ // last char can't be a space, and final _ must preceed punct or \s (or endline)
},
code: /^(`+)([^`]|[^`][\s\S]*?[^`])\1(?!`)/,
br: /^( {2,}|\\)\n(?!\s*$)/,
del: noopTest,
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*]|\b_|$)|[^ ](?= {2,}\n)))/,
punctuation: /^([\s*punctuation])/
};
// list of punctuation marks from common mark spec
// without * and _ to workaround cases with double emphasis
inline._punctuation = '!"#$%&\'()+\\-.,/:;<=>?@\\[\\]`^{|}~';
inline.punctuation = edit(inline.punctuation).replace(/punctuation/g, inline._punctuation).getRegex();
// sequences em should skip over [title](link), `code`, <html>
inline._blockSkip = '\\[[^\\]]*?\\]\\([^\\)]*?\\)|`[^`]*?`|<[^>]*?>';
inline._overlapSkip = '__[^_]*?__|\\*\\*\\[^\\*\\]*?\\*\\*';
inline._comment = edit(block._comment).replace('(?:-->|$)', '-->').getRegex();
inline.em.start = edit(inline.em.start)
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.em.middle = edit(inline.em.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/overlapSkip/g, inline._overlapSkip)
.getRegex();
inline.em.endAst = edit(inline.em.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.em.endUnd = edit(inline.em.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.strong.start = edit(inline.strong.start)
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.strong.middle = edit(inline.strong.middle)
.replace(/punctuation/g, inline._punctuation)
.replace(/blockSkip/g, inline._blockSkip)
.getRegex();
inline.strong.endAst = edit(inline.strong.endAst, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.strong.endUnd = edit(inline.strong.endUnd, 'g')
.replace(/punctuation/g, inline._punctuation)
.getRegex();
inline.blockSkip = edit(inline._blockSkip, 'g')
.getRegex();
inline.overlapSkip = edit(inline._overlapSkip, 'g')
.getRegex();
inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/;
inline._email = /[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+(@)[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+(?![-_])/;
inline.autolink = edit(inline.autolink)
.replace('scheme', inline._scheme)
.replace('email', inline._email)
.getRegex();
inline._attribute = /\s+[a-zA-Z:_][\w.:-]*(?:\s*=\s*"[^"]*"|\s*=\s*'[^']*'|\s*=\s*[^\s"'=<>`]+)?/;
inline.tag = edit(inline.tag)
.replace('comment', inline._comment)
.replace('attribute', inline._attribute)
.getRegex();
inline._label = /(?:\[(?:\\.|[^\[\]\\])*\]|\\.|`[^`]*`|[^\[\]\\`])*?/;
inline._href = /<(?:\\[<>]?|[^\s<>\\])*>|[^\s\x00-\x1f]*/;
inline._title = /"(?:\\"?|[^"\\])*"|'(?:\\'?|[^'\\])*'|\((?:\\\)?|[^)\\])*\)/;
inline.link = edit(inline.link)
.replace('label', inline._label)
.replace('href', inline._href)
.replace('title', inline._title)
.getRegex();
inline.reflink = edit(inline.reflink)
.replace('label', inline._label)
.getRegex();
inline.reflinkSearch = edit(inline.reflinkSearch, 'g')
.replace('reflink', inline.reflink)
.replace('nolink', inline.nolink)
.getRegex();
/**
* Normal Inline Grammar
*/
inline.normal = merge({}, inline);
/**
* Pedantic Inline Grammar
*/
inline.pedantic = merge({}, inline.normal, {
strong: {
start: /^__|\*\*/,
middle: /^__(?=\S)([\s\S]*?\S)__(?!_)|^\*\*(?=\S)([\s\S]*?\S)\*\*(?!\*)/,
endAst: /\*\*(?!\*)/g,
endUnd: /__(?!_)/g
},
em: {
start: /^_|\*/,
middle: /^()\*(?=\S)([\s\S]*?\S)\*(?!\*)|^_(?=\S)([\s\S]*?\S)_(?!_)/,
endAst: /\*(?!\*)/g,
endUnd: /_(?!_)/g
},
link: edit(/^!?\[(label)\]\((.*?)\)/)
.replace('label', inline._label)
.getRegex(),
reflink: edit(/^!?\[(label)\]\s*\[([^\]]*)\]/)
.replace('label', inline._label)
.getRegex()
});
/**
* GFM Inline Grammar
*/
inline.gfm = merge({}, inline.normal, {
escape: edit(inline.escape).replace('])', '~|])').getRegex(),
_extended_email: /[A-Za-z0-9._+-]+(@)[a-zA-Z0-9-_]+(?:\.[a-zA-Z0-9-_]*[a-zA-Z0-9])+(?![-_])/,
url: /^((?:ftp|https?):\/\/|www\.)(?:[a-zA-Z0-9\-]+\.?)+[^\s<]*|^email/,
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
del: /^~+(?=\S)([\s\S]*?\S)~+/,
text: /^(`+|[^`])(?:(?= {2,}\n)|[\s\S]*?(?:(?=[\\<!\[`*~]|\b_|https?:\/\/|ftp:\/\/|www\.|$)|[^ ](?= {2,}\n)|[^a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-](?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))|(?=[a-zA-Z0-9.!#$%&'*+\/=?_`{\|}~-]+@))/
});
inline.gfm.url = edit(inline.gfm.url, 'i')
.replace('email', inline.gfm._extended_email)
.getRegex();
/**
* GFM + Line Breaks Inline Grammar
*/
inline.breaks = merge({}, inline.gfm, {
br: edit(inline.br).replace('{2,}', '*').getRegex(),
text: edit(inline.gfm.text)
.replace('\\b_', '\\b_| {2,}\\n')
.replace(/\{2,\}/g, '*')
.getRegex()
});
module.exports = {
block,
inline
};