// Lexer.js
const Tokenizer = require('./Tokenizer.js');
const { defaults } = require('./defaults.js');
const { block, inline } = require('./rules.js');
const { repeatString } = require('./helpers.js');
/**
 * smartypants text replacement
 */
function smartypants(text) {
  return text
    // em-dashes
    .replace(/---/g, '\u2014')
    // en-dashes
    .replace(/--/g, '\u2013')
    // opening singles
    .replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018')
    // closing singles & apostrophes
    .replace(/'/g, '\u2019')
    // opening doubles
    .replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c')
    // closing doubles
    .replace(/"/g, '\u201d')
    // ellipses
    .replace(/\.{3}/g, '\u2026');
}
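
// Illustrative example, derived from the replacements above:
//   smartypants('"hello" -- world...')
//   => '\u201chello\u201d \u2013 world\u2026'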

/**
 * mangle email addresses
 */
function mangle(text) {
  let out = '',
    i,
    ch;
  const l = text.length;
  for (i = 0; i < l; i++) {
    ch = text.charCodeAt(i);
    if (Math.random() > 0.5) {
      ch = 'x' + ch.toString(16);
    }
    out += '&#' + ch + ';';
  }
  return out;
}
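
// Illustrative example: each character becomes a decimal or hex HTML entity
// at random, so output differs between runs; mangle('a@b') might yield
// '&#97;&#x40;&#98;' on one run and '&#x61;&#64;&#x62;' on another.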

/**
 * Block Lexer
 */
module.exports = class Lexer {
  constructor(options) {
    this.tokens = [];
    this.tokens.links = Object.create(null);
    this.options = options || defaults;
    this.options.tokenizer = this.options.tokenizer || new Tokenizer();
    this.tokenizer = this.options.tokenizer;
    this.tokenizer.options = this.options;

    const rules = {
      block: block.normal,
      inline: inline.normal
    };

    if (this.options.pedantic) {
      rules.block = block.pedantic;
      rules.inline = inline.pedantic;
    } else if (this.options.gfm) {
      rules.block = block.gfm;
      if (this.options.breaks) {
        rules.inline = inline.breaks;
      } else {
        rules.inline = inline.gfm;
      }
    }

    this.tokenizer.rules = rules;
  }

  /**
   * Expose Rules
   */
  static get rules() {
    return {
      block,
      inline
    };
  }

  /**
   * Static Lex Method
   */
  static lex(src, options) {
    const lexer = new Lexer(options);
    return lexer.lex(src);
  }

  /**
   * Static Lex Inline Method
   */
  static lexInline(src, options) {
    const lexer = new Lexer(options);
    return lexer.inlineTokens(src);
  }
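
  // Illustrative usage of the static entry points (a sketch; option names
  // follow the defaults module required above):
  //   const blockAndInline = Lexer.lex('# Hi\n\n*text*', { gfm: true });
  //   const inlineOnly = Lexer.lexInline('*text*');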

  /**
   * Preprocessing
   */
  lex(src) {
    src = src
      .replace(/\r\n|\r/g, '\n')
      .replace(/\t/g, '    '); // expand tabs to four spaces

    this.blockTokens(src, this.tokens, true);

    this.inline(this.tokens);

    return this.tokens;
  }

  /**
   * Lexing
   */
  blockTokens(src, tokens = [], top = true) {
    if (this.options.pedantic) {
      src = src.replace(/^ +$/gm, '');
    }
    let token, i, l, lastToken;

    while (src) {
      // newline
      if (token = this.tokenizer.space(src)) {
        src = src.substring(token.raw.length);
        if (token.type) {
          tokens.push(token);
        }
        continue;
      }

      // code
      if (token = this.tokenizer.code(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        // An indented code block cannot interrupt a paragraph.
        if (lastToken && lastToken.type === 'paragraph') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // fences
      if (token = this.tokenizer.fences(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // heading
      if (token = this.tokenizer.heading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // table no leading pipe (gfm)
      if (token = this.tokenizer.nptable(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // hr
      if (token = this.tokenizer.hr(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // blockquote
      if (token = this.tokenizer.blockquote(src)) {
        src = src.substring(token.raw.length);
        token.tokens = this.blockTokens(token.text, [], top);
        tokens.push(token);
        continue;
      }

      // list
      if (token = this.tokenizer.list(src)) {
        src = src.substring(token.raw.length);
        l = token.items.length;
        for (i = 0; i < l; i++) {
          token.items[i].tokens = this.blockTokens(token.items[i].text, [], false);
        }
        tokens.push(token);
        continue;
      }

      // html
      if (token = this.tokenizer.html(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // def
      if (top && (token = this.tokenizer.def(src))) {
        src = src.substring(token.raw.length);
        if (!this.tokens.links[token.tag]) {
          this.tokens.links[token.tag] = {
            href: token.href,
            title: token.title
          };
        }
        continue;
      }

      // table (gfm)
      if (token = this.tokenizer.table(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // lheading
      if (token = this.tokenizer.lheading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // top-level paragraph
      if (top && (token = this.tokenizer.paragraph(src))) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // text
      if (token = this.tokenizer.text(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    return tokens;
  }
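
  // Illustrative token shapes (a sketch; the exact fields are produced by the
  // Tokenizer methods called above, so values here are approximate):
  //   blockTokens('# Hi\n\ntext') =>
  //     [{ type: 'heading', raw: '# Hi\n\n', depth: 1, text: 'Hi' },
  //      { type: 'paragraph', raw: 'text', text: 'text' }]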

  inline(tokens) {
    let i,
      j,
      k,
      l2,
      row,
      token;

    const l = tokens.length;
    for (i = 0; i < l; i++) {
      token = tokens[i];
      switch (token.type) {
        case 'paragraph':
        case 'text':
        case 'heading': {
          token.tokens = [];
          this.inlineTokens(token.text, token.tokens);
          break;
        }
        case 'table': {
          token.tokens = {
            header: [],
            cells: []
          };

          // header
          l2 = token.header.length;
          for (j = 0; j < l2; j++) {
            token.tokens.header[j] = [];
            this.inlineTokens(token.header[j], token.tokens.header[j]);
          }

          // cells
          l2 = token.cells.length;
          for (j = 0; j < l2; j++) {
            row = token.cells[j];
            token.tokens.cells[j] = [];
            for (k = 0; k < row.length; k++) {
              token.tokens.cells[j][k] = [];
              this.inlineTokens(row[k], token.tokens.cells[j][k]);
            }
          }
          break;
        }
        case 'blockquote': {
          this.inline(token.tokens);
          break;
        }
        case 'list': {
          l2 = token.items.length;
          for (j = 0; j < l2; j++) {
            this.inline(token.items[j].tokens);
          }
          break;
        }
        default: {
          // do nothing
        }
      }
    }

    return tokens;
  }

  /**
   * Lexing/Compiling
   */
  inlineTokens(src, tokens = [], inLink = false, inRawBlock = false) {
    let token, lastToken;

    // String with links masked to avoid interference with em and strong
    let maskedSrc = src;
    let match;
    let keepPrevChar, prevChar;

    // Mask out reflinks
    if (this.tokens.links) {
      const links = Object.keys(this.tokens.links);
      if (links.length > 0) {
        while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
          if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
            maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
          }
        }
      }
    }

    // Mask out other blocks
    while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '[' + repeatString('a', match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
    }

    // Mask out escaped em & strong delimiters
    while ((match = this.tokenizer.rules.inline.escapedEmSt.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.escapedEmSt.lastIndex);
    }
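
    // Illustrative effect of the masking above (assuming a link definition
    // for "foo" exists): '*see [foo]*' becomes '*see [aaa]*' in maskedSrc,
    // so the em/strong scan still sees the delimiters but cannot falsely
    // match text inside the brackets.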

    while (src) {
      if (!keepPrevChar) {
        prevChar = '';
      }
      keepPrevChar = false;

      // escape
      if (token = this.tokenizer.escape(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // tag
      if (token = this.tokenizer.tag(src, inLink, inRawBlock)) {
        src = src.substring(token.raw.length);
        inLink = token.inLink;
        inRawBlock = token.inRawBlock;
        const lastToken = tokens[tokens.length - 1];
        if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // link
      if (token = this.tokenizer.link(src)) {
        src = src.substring(token.raw.length);
        if (token.type === 'link') {
          token.tokens = this.inlineTokens(token.text, [], true, inRawBlock);
        }
        tokens.push(token);
        continue;
      }

      // reflink, nolink
      if (token = this.tokenizer.reflink(src, this.tokens.links)) {
        src = src.substring(token.raw.length);
        const lastToken = tokens[tokens.length - 1];
        if (token.type === 'link') {
          token.tokens = this.inlineTokens(token.text, [], true, inRawBlock);
          tokens.push(token);
        } else if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // em & strong
      if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
        src = src.substring(token.raw.length);
        token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
        tokens.push(token);
        continue;
      }

      // code
      if (token = this.tokenizer.codespan(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // br
      if (token = this.tokenizer.br(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // del (gfm)
      if (token = this.tokenizer.del(src)) {
        src = src.substring(token.raw.length);
        token.tokens = this.inlineTokens(token.text, [], inLink, inRawBlock);
        tokens.push(token);
        continue;
      }

      // autolink
      if (token = this.tokenizer.autolink(src, mangle)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // url (gfm)
      if (!inLink && (token = this.tokenizer.url(src, mangle))) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // text
      if (token = this.tokenizer.inlineText(src, inRawBlock, smartypants)) {
        src = src.substring(token.raw.length);
        if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
          prevChar = token.raw.slice(-1);
        }
        keepPrevChar = true;
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    return tokens;
  }
};
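
// Illustrative usage (a sketch; assumes the sibling modules required at the
// top exist as in marked's source tree):
//   const Lexer = require('./Lexer.js');
//   const tokens = Lexer.lex('# Hello\n\nSome *emphasis*.');
//   // `tokens` is an array of block tokens; paragraph and heading tokens
//   // carry nested inline tokens, and `tokens.links` holds any
//   // reference-link definitions keyed by tag.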