marked/src/Tokenizer.js
2020-04-08 13:04:38 -05:00

664 lines
16 KiB
JavaScript

const { defaults } = require('./defaults.js');
const {
rtrim,
splitCells,
escape,
findClosingBracket
} = require('./helpers.js');
const { block, inline } = require('./rules.js');
/**
 * Build a `link` or `image` token from a capture array and a link target.
 *
 * @param {Array} cap - Capture array; `cap[0]` is the matched text (a leading
 *   `!` selects the image form) and `cap[1]` is the link text / alt text.
 * @param {Object} link - Object supplying `href` and an optional `title`.
 * @param {Array} tokens - Not used by this function.
 * @param {string} raw - Raw source text stored on the returned token.
 * @param {Object} lexer - Supplies `inlineTokens()` for tokenizing link text.
 * @returns {Object} An `image` token (escaped alt text) or a `link` token
 *   (inline child tokens). `title` is escaped, or `null` when absent.
 */
function outputLink(cap, link, tokens, raw, lexer) {
  const { href } = link;
  const title = link.title ? escape(link.title) : null;
  const isImage = cap[0].charAt(0) === '!';

  if (isImage) {
    return {
      type: 'image',
      raw,
      text: escape(cap[1]),
      href,
      title
    };
  }

  return {
    type: 'link',
    raw,
    href,
    title,
    tokens: lexer.inlineTokens(cap[1])
  };
}
/**
 * Tokenizer: runs the block and inline rules against markdown source and
 * returns the matching token objects
 */
module.exports = class Tokenizer {
constructor(options) {
this.options = options || defaults;
this.initialize();
}
initialize() {
this.inLink = false;
this.inRawBlock = false;
this.rules = {
block: block.normal,
inline: inline.normal
};
if (this.options.pedantic) {
this.rules.block = block.pedantic;
this.rules.inline = inline.pedantic;
} else if (this.options.gfm) {
this.rules.block = block.gfm;
if (this.options.breaks) {
this.rules.inline = inline.breaks;
} else {
this.rules.inline = inline.gfm;
}
}
}
/**
* Expose Block Rules
*/
static get rules() {
return {
block,
inline
};
}
space(lexer, src, tokens, top) {
const cap = this.rules.block.newline.exec(src);
if (cap) {
if (cap[0].length > 1) {
return {
type: 'space',
raw: cap[0]
};
}
return { raw: '\n' };
}
}
code(lexer, src, tokens, top) {
const cap = this.rules.block.code.exec(src);
if (cap) {
const lastToken = tokens[tokens.length - 1];
// An indented code block cannot interrupt a paragraph.
if (lastToken && lastToken.type === 'paragraph') {
tokens.pop();
lastToken.text += '\n' + cap[0].trimRight();
lastToken.raw += '\n' + cap[0];
return lastToken;
} else {
const text = cap[0].replace(/^ {4}/gm, '');
return {
type: 'code',
raw: cap[0],
codeBlockStyle: 'indented',
text: !this.options.pedantic
? rtrim(text, '\n')
: text
};
}
}
}
fences(lexer, src, tokens, top) {
const cap = this.rules.block.fences.exec(src);
if (cap) {
return {
type: 'code',
raw: cap[0],
lang: cap[2] ? cap[2].trim() : cap[2],
text: cap[3] || ''
};
}
}
heading(lexer, src, tokens, top) {
const cap = this.rules.block.heading.exec(src);
if (cap) {
return {
type: 'heading',
raw: cap[0],
depth: cap[1].length,
text: cap[2]
};
}
}
nptable(lexer, src, tokens, top) {
const cap = this.rules.block.nptable.exec(src);
if (cap) {
const item = {
type: 'table',
header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')),
align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : [],
raw: cap[0]
};
if (item.header.length === item.align.length) {
let l = item.align.length;
let i;
for (i = 0; i < l; i++) {
if (/^ *-+: *$/.test(item.align[i])) {
item.align[i] = 'right';
} else if (/^ *:-+: *$/.test(item.align[i])) {
item.align[i] = 'center';
} else if (/^ *:-+ *$/.test(item.align[i])) {
item.align[i] = 'left';
} else {
item.align[i] = null;
}
}
l = item.cells.length;
for (i = 0; i < l; i++) {
item.cells[i] = splitCells(item.cells[i], item.header.length);
}
return item;
}
}
}
hr(lexer, src, tokens, top) {
const cap = this.rules.block.hr.exec(src);
if (cap) {
return {
type: 'hr',
raw: cap[0]
};
}
}
blockquote(lexer, src, tokens, top) {
const cap = this.rules.block.blockquote.exec(src);
if (cap) {
const text = cap[0].replace(/^ *> ?/gm, '');
return {
type: 'blockquote',
raw: cap[0],
tokens: lexer.blockTokens(text, [], top)
};
}
}
/**
 * Apply the block `list` rule to `src` and split the match into items.
 *
 * @param {Object} lexer - Supplies `blockTokens()` for each item's content.
 * @param {string} src - Remaining source text.
 * @param {Array} tokens - Not used by this method.
 * @param {*} top - Not used by this method (items are lexed with `false`).
 * @returns {Object|undefined} A `list` token with `ordered`, `start`,
 *   `loose` and `items`, or `undefined` when the rule does not match.
 */
list(lexer, src, tokens, top) {
const cap = this.rules.block.list.exec(src);
if (cap) {
let raw = cap[0];
const bull = cap[2];
// A bullet longer than one character is treated as an ordered marker.
const isordered = bull.length > 1;
const list = {
type: 'list',
raw,
ordered: isordered,
// Unary plus coerces the ordered bullet to a number; '' for unordered.
start: isordered ? +bull : '',
loose: false,
items: []
};
// Get each top-level item.
// NOTE(review): presumably `block.item` is a global regex so match()
// yields every item - confirm in rules.js.
const itemMatch = cap[0].match(this.rules.block.item);
let next = false,
item,
space,
b,
addBack,
loose,
istask,
ischecked;
const l = itemMatch.length;
for (let i = 0; i < l; i++) {
item = itemMatch[i];
// `raw` now holds this item's untouched text; it is stored on the item
// token below (list.raw was already captured above).
raw = item;
// Remove the list item's bullet
// so it is seen as the next token.
space = item.length;
item = item.replace(/^ *([*+-]|\d+\.) */, '');
// Outdent whatever the
// list item contains. Hacky.
// Only runs when the item contains a newline followed by a space.
if (~item.indexOf('\n ')) {
// After this subtraction `space` is the number of characters the
// bullet-stripping replace removed.
space -= item.length;
item = !this.options.pedantic
? item.replace(new RegExp('^ {1,' + space + '}', 'gm'), '')
: item.replace(/^ {1,4}/gm, '');
}
// Determine whether the next list item belongs here.
// Backpedal if it does not belong in this list.
if (i !== l - 1) {
b = this.rules.block.bullet.exec(itemMatch[i + 1])[0];
// Mismatch: an ordered list followed by a one-character bullet, or an
// unordered list followed by a longer bullet (or, with smartLists, by
// a different bullet string).
if (bull.length > 1 ? b.length === 1
: (b.length > 1 || (this.options.smartLists && b !== bull))) {
// Cut the remaining items off the end of list.raw and make this the
// final loop iteration.
addBack = itemMatch.slice(i + 1).join('\n');
list.raw = list.raw.substring(0, list.raw.length - addBack.length);
i = l - 1;
}
}
// Determine whether item is loose or not.
// Use: /(^|\n)(?! )[^\n]+\n\n(?!\s*$)/
// for discount behavior.
// `next` carries over from the previous item: it is true when that
// item ended with a newline.
loose = next || /\n\n(?!\s*$)/.test(item);
if (i !== l - 1) {
next = item.charAt(item.length - 1) === '\n';
if (!loose) loose = next;
}
// One loose item marks the whole list as loose.
if (loose) {
list.loose = true;
}
// Check for task list items
// A task item starts with `[ ]`, `[x]` or `[X]` followed by a space;
// it is checked when the character inside the brackets is not a space.
istask = /^\[[ xX]\] /.test(item);
ischecked = undefined;
if (istask) {
ischecked = item[1] !== ' ';
item = item.replace(/^\[[ xX]\] +/, '');
}
list.items.push({
raw,
task: istask,
checked: ischecked,
loose: loose,
tokens: lexer.blockTokens(item, [], false)
});
}
return list;
}
}
html(lexer, src, tokens, top) {
const cap = this.rules.block.html.exec(src);
if (cap) {
return {
type: this.options.sanitize
? 'paragraph'
: 'html',
raw: cap[0],
pre: !this.options.sanitizer
&& (cap[1] === 'pre' || cap[1] === 'script' || cap[1] === 'style'),
text: this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0]
};
}
}
def(lexer, src, tokens, top) {
const cap = this.rules.block.def.exec(src);
if (cap) {
if (cap[3]) cap[3] = cap[3].substring(1, cap[3].length - 1);
const tag = cap[1].toLowerCase().replace(/\s+/g, ' ');
return {
tag,
raw: cap[0],
href: cap[2],
title: cap[3]
};
}
}
table(lexer, src, tokens, top) {
const cap = this.rules.block.table.exec(src);
if (cap) {
const item = {
type: 'table',
header: splitCells(cap[1].replace(/^ *| *\| *$/g, '')),
align: cap[2].replace(/^ *|\| *$/g, '').split(/ *\| */),
cells: cap[3] ? cap[3].replace(/\n$/, '').split('\n') : []
};
if (item.header.length === item.align.length) {
item.raw = cap[0];
let l = item.align.length;
let i;
for (i = 0; i < l; i++) {
if (/^ *-+: *$/.test(item.align[i])) {
item.align[i] = 'right';
} else if (/^ *:-+: *$/.test(item.align[i])) {
item.align[i] = 'center';
} else if (/^ *:-+ *$/.test(item.align[i])) {
item.align[i] = 'left';
} else {
item.align[i] = null;
}
}
l = item.cells.length;
for (i = 0; i < l; i++) {
item.cells[i] = splitCells(
item.cells[i].replace(/^ *\| *| *\| *$/g, ''),
item.header.length);
}
return item;
}
}
}
lheading(lexer, src, tokens, top) {
const cap = this.rules.block.lheading.exec(src);
if (cap) {
return {
type: 'heading',
raw: cap[0],
depth: cap[2].charAt(0) === '=' ? 1 : 2,
text: cap[1]
};
}
}
paragraph(lexer, src, tokens, top) {
const cap = this.rules.block.paragraph.exec(src);
if (cap) {
return {
type: 'paragraph',
raw: cap[0],
text: cap[1].charAt(cap[1].length - 1) === '\n'
? cap[1].slice(0, -1)
: cap[1]
};
}
}
text(lexer, src, tokens, top) {
const cap = this.rules.block.text.exec(src);
if (cap) {
return {
type: 'text',
raw: cap[0],
text: cap[0]
};
}
}
escape(lexer, src, tokens) {
const cap = this.rules.inline.escape.exec(src);
if (cap) {
return {
type: 'escape',
raw: cap[0],
text: escape(cap[1])
};
}
}
tag(lexer, src, tokens) {
const cap = this.rules.inline.tag.exec(src);
if (cap) {
if (!this.inLink && /^<a /i.test(cap[0])) {
this.inLink = true;
} else if (this.inLink && /^<\/a>/i.test(cap[0])) {
this.inLink = false;
}
if (!this.inRawBlock && /^<(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
this.inRawBlock = true;
} else if (this.inRawBlock && /^<\/(pre|code|kbd|script)(\s|>)/i.test(cap[0])) {
this.inRawBlock = false;
}
return {
type: this.options.sanitize
? 'text'
: 'html',
raw: cap[0],
text: this.options.sanitize
? (this.options.sanitizer
? this.options.sanitizer(cap[0])
: escape(cap[0]))
: cap[0]
};
}
}
link(lexer, src, tokens) {
const cap = this.rules.inline.link.exec(src);
if (cap) {
const lastParenIndex = findClosingBracket(cap[2], '()');
if (lastParenIndex > -1) {
const start = cap[0].indexOf('!') === 0 ? 5 : 4;
const linkLen = start + cap[1].length + lastParenIndex;
cap[2] = cap[2].substring(0, lastParenIndex);
cap[0] = cap[0].substring(0, linkLen).trim();
cap[3] = '';
}
this.inLink = true;
let href = cap[2];
let title = '';
if (this.options.pedantic) {
const link = /^([^'"]*[^\s])\s+(['"])(.*)\2/.exec(href);
if (link) {
href = link[1];
title = link[3];
} else {
title = '';
}
} else {
title = cap[3] ? cap[3].slice(1, -1) : '';
}
href = href.trim().replace(/^<([\s\S]*)>$/, '$1');
const token = outputLink(cap, {
href: href ? href.replace(this.rules.inline._escapes, '$1') : href,
title: title ? title.replace(this.rules.inline._escapes, '$1') : title
}, tokens, cap[0], lexer);
this.inLink = false;
return token;
}
}
reflink(lexer, src, tokens) {
let cap;
if ((cap = this.rules.inline.reflink.exec(src))
|| (cap = this.rules.inline.nolink.exec(src))) {
let link = (cap[2] || cap[1]).replace(/\s+/g, ' ');
link = lexer.tokens.links[link.toLowerCase()];
if (!link || !link.href) {
const text = cap[0].charAt(0);
return {
type: 'text',
raw: text,
text
};
}
this.inLink = true;
const token = outputLink(cap, link, tokens, cap[0], lexer);
this.inLink = false;
return token;
}
}
strong(lexer, src, tokens) {
const cap = this.rules.inline.strong.exec(src);
if (cap) {
return {
type: 'strong',
raw: cap[0],
tokens: lexer.inlineTokens(cap[4] || cap[3] || cap[2] || cap[1])
};
}
}
em(lexer, src, tokens) {
const cap = this.rules.inline.em.exec(src);
if (cap) {
return {
type: 'em',
raw: cap[0],
tokens: lexer.inlineTokens(cap[6] || cap[5] || cap[4] || cap[3] || cap[2] || cap[1])
};
}
}
codespan(lexer, src, tokens) {
const cap = this.rules.inline.code.exec(src);
if (cap) {
return {
type: 'codespan',
raw: cap[0],
text: escape(cap[2].trim(), true)
};
}
}
br(lexer, src, tokens) {
const cap = this.rules.inline.br.exec(src);
if (cap) {
return {
type: 'br',
raw: cap[0]
};
}
}
del(lexer, src, tokens) {
const cap = this.rules.inline.del.exec(src);
if (cap) {
return {
type: 'del',
raw: cap[0],
tokens: lexer.inlineTokens(cap[1])
};
}
}
autolink(lexer, src, tokens) {
const cap = this.rules.inline.autolink.exec(src);
if (cap) {
let text, href;
if (cap[2] === '@') {
text = escape(this.options.mangle ? this.mangle(cap[1]) : cap[1]);
href = 'mailto:' + text;
} else {
text = escape(cap[1]);
href = text;
}
return {
type: 'link',
raw: cap[0],
text,
href,
tokens: [
{
type: 'text',
raw: text,
text
}
]
};
}
}
url(lexer, src, tokens) {
let cap;
if (!this.inLink && (cap = this.rules.inline.url.exec(src))) {
let text, href;
if (cap[2] === '@') {
text = escape(this.options.mangle ? this.mangle(cap[0]) : cap[0]);
href = 'mailto:' + text;
} else {
// do extended autolink path validation
let prevCapZero;
do {
prevCapZero = cap[0];
cap[0] = this.rules.inline._backpedal.exec(cap[0])[0];
} while (prevCapZero !== cap[0]);
text = escape(cap[0]);
if (cap[1] === 'www.') {
href = 'http://' + text;
} else {
href = text;
}
}
return {
type: 'link',
raw: cap[0],
text,
href,
tokens: [
{
type: 'text',
raw: text,
text
}
]
};
}
}
inlineText(lexer, src, tokens) {
const cap = this.rules.inline.text.exec(src);
if (cap) {
let text;
if (this.inRawBlock) {
text = this.options.sanitize ? (this.options.sanitizer ? this.options.sanitizer(cap[0]) : escape(cap[0])) : cap[0];
} else {
text = escape(this.options.smartypants ? this.smartypants(cap[0]) : cap[0]);
}
return {
type: 'text',
raw: cap[0],
text
};
}
}
/**
* Smartypants Transformations
*/
smartypants(text) {
return text
// em-dashes
.replace(/---/g, '\u2014')
// en-dashes
.replace(/--/g, '\u2013')
// opening singles
.replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018')
// closing singles & apostrophes
.replace(/'/g, '\u2019')
// opening doubles
.replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c')
// closing doubles
.replace(/"/g, '\u201d')
// ellipses
.replace(/\.{3}/g, '\u2026');
}
/**
* Mangle Links
*/
mangle(text) {
let out = '',
i,
ch;
const l = text.length;
for (i = 0; i < l; i++) {
ch = text.charCodeAt(i);
if (Math.random() > 0.5) {
ch = 'x' + ch.toString(16);
}
out += '&#' + ch + ';';
}
return out;
}
};