From ccf83953a881f09c5387b890c6820078f45ace31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Sko=C5=99epa?= <jakub@skorepa.info> Date: Sat, 17 Sep 2016 17:38:34 +0200 Subject: [PATCH] Created markdown parser based on kramed --- lib/index.js | 30 ++-- lib/markdownToJSON/escape.js | 21 +++ lib/markdownToJSON/index.js | 11 ++ lib/markdownToJSON/inline.js | 291 +++++++++++++++++++++++++++++++++ lib/markdownToJSON/parser.js | 197 ++++++++++++++++++++++ lib/markdownToJSON/renderer.js | 171 +++++++++++++++++++ lib/parser.js | 5 + package.json | 1 + 8 files changed, 707 insertions(+), 20 deletions(-) create mode 100644 lib/markdownToJSON/escape.js create mode 100644 lib/markdownToJSON/index.js create mode 100644 lib/markdownToJSON/inline.js create mode 100644 lib/markdownToJSON/parser.js create mode 100644 lib/markdownToJSON/renderer.js diff --git a/lib/index.js b/lib/index.js index dd00375..00b1067 100644 --- a/lib/index.js +++ b/lib/index.js @@ -1,29 +1,19 @@ -/* -definitions: - workUnit: { metadata: {}, content: string } - promisedWorkUnit: () => Promise( workUnit ) - -Functions: - init - read data (from file, database, ...) 
- () => [ promisedWorkUnit ] - parse - workUnit => workUnit - collect - [workUnit] => [workUnit] - parse - workUnit => workUnit - terminate - workUnit => () - -parse, collect and render should be pure - -*/ +import _ from 'lodash/fp' +import fs from 'fs' import {readFiles} from './readFiles' import parser from './parser' +import markdownToJSON from './markdownToJSON' readFiles({ filter: file => !/\.git$/.exec(file) })(change => console.log()) .then(files => files.map(f => parser(f))) +.then(files => files.filter(f => f)) +.then(files => files.map(f => ({ metadata: f.metadata, content: markdownToJSON(f.content) }))) +.then(files => files.map(f => { + console.log(_.repeat(80,'=')) + console.log(f.metadata) + console.log(f.content) +})) .catch(e => console.log(e)) diff --git a/lib/markdownToJSON/escape.js b/lib/markdownToJSON/escape.js new file mode 100644 index 0000000..6f78d70 --- /dev/null +++ b/lib/markdownToJSON/escape.js @@ -0,0 +1,21 @@ +export const escape = (html, encode) => { + return html + .replace(!encode ? /&(?!#?\w+;)/g : /&/g, '&amp;') + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;') + .replace(/"/g, '&quot;') + .replace(/'/g, '&#39;'); +} + +export const unescape = (html) => { + return html.replace(/&([#\w]+);/g, function(_, n) { + n = n.toLowerCase(); + if (n === 'colon') return ':'; + if (n.charAt(0) === '#') { + return n.charAt(1) === 'x' + ? 
String.fromCharCode(parseInt(n.substring(2), 16)) + : String.fromCharCode(+n.substring(1)); + } + return ''; + }); +} diff --git a/lib/markdownToJSON/index.js b/lib/markdownToJSON/index.js new file mode 100644 index 0000000..adcaa54 --- /dev/null +++ b/lib/markdownToJSON/index.js @@ -0,0 +1,11 @@ +import _ from 'lodash/fp' + +import {toStr} from './renderer' +import Parser from './parser' +import kramed from 'kramed' + +const mtj = new Parser() +export default _.flow( + a => kramed.lexer(a), + a => mtj.parse(a) +) diff --git a/lib/markdownToJSON/inline.js b/lib/markdownToJSON/inline.js new file mode 100644 index 0000000..7de75c5 --- /dev/null +++ b/lib/markdownToJSON/inline.js @@ -0,0 +1,291 @@ +import { escape } from './escape' +import Renderer from './renderer' +var noop = () => {}; + +var inline = require('kramed/lib/rules/inline'); +var defaultOptions = require('kramed/lib/lex/options'); +var isHTMLBlock = require('kramed/lib/lex/html_blocks'); + +/** + * Inline Lexer & Compiler + */ + +function InlineLexer(links, options, renderer) { + this.options = options || defaultOptions; + this.links = links; + this.rules = inline.normal; + this.renderer = renderer + + if (!this.links) { + throw new + Error('Tokens array requires a `links` property.'); + } + + if (this.options.gfm) { + if (this.options.breaks) { + this.rules = inline.breaks; + } else { + this.rules = inline.gfm; + } + } else if (this.options.pedantic) { + this.rules = inline.pedantic; + } + + // Is mathjax disabled ? 
+ if (!this.options.mathjax) { + this.rules.math = noop; + } +} + +/** + * Expose Inline Rules + */ + +InlineLexer.rules = inline; + +/** + * Static Lexing/Compiling Method + */ + +InlineLexer.output = function(src, links, options) { + var inline = new InlineLexer(links, options, new Renderer()); + return inline.output(src); +}; + +InlineLexer.prototype.escape = function(html, encode) { + // Handle escaping being turned off + if(this.options && this.options.escape === false) { + return html; + } + return escape(html, encode); +}; + +/** + * Lexing/Compiling + */ + +InlineLexer.prototype.output = function(src) { + var out = [] + , link + , text + , href + , cap; + + while (src) { + // escape + if (cap = this.rules.escape.exec(src)) { + src = src.substring(cap[0].length); + out.push(cap[1]); + continue; + } + + // autolink + if (cap = this.rules.autolink.exec(src)) { + src = src.substring(cap[0].length); + if (cap[2] === '@') { + text = cap[1].charAt(6) === ':' + ? this.mangle(cap[1].substring(7)) + : this.mangle(cap[1]); + href = this.mangle('mailto:') + text; + } else { + text = this.escape(cap[1]); + href = text; + } + out.push(this.renderer.link(href, null, text)); + continue; + } + + // url (gfm) + if (!this.inLink && (cap = this.rules.url.exec(src))) { + src = src.substring(cap[0].length); + text = this.escape(cap[1]); + href = text; + out.push(this.renderer.link(href, null, text)); + continue; + } + + // html + if (cap = this.rules.html.exec(src)) { + // Found a link + if(cap[1] === 'a' && cap[2] && !this.inLink) { + // Opening tag + out.push(cap[0].substring(0, cap[0].indexOf(cap[2]))); + this.inLink = true; + // In between the tag + out.push(this.output(cap[2])); + this.inLink = false; + // Outer tag + out.push(cap[0].substring(cap[0].indexOf(cap[2])+cap[2].length)); + // Advance parser + src = src.substring(cap[0].length); + continue; + } + + // Found HTML that we should parse + if(cap[1] && !isHTMLBlock(cap[1]) && cap[2]) { + // Opening tag + 
out.push(cap[0].substring(0, cap[0].indexOf(cap[2]))); + // In between the tag + out.push(this.output(cap[2])); + // Outer tag + out.push(cap[0].substring(cap[0].indexOf(cap[2])+cap[2].length)); + // Advance parser + src = src.substring(cap[0].length); + continue; + } + + // Any other HTML + src = src.substring(cap[0].length); + out.push(cap[0]); + continue; + } + + // link + if (cap = this.rules.link.exec(src)) { + src = src.substring(cap[0].length); + this.inLink = true; + out.push(this.outputLink(cap, { + href: cap[2], + title: cap[3] + })); + this.inLink = false; + continue; + } + + // reffn + if ((cap = this.rules.reffn.exec(src))) { + src = src.substring(cap[0].length); + out.push(this.renderer.reffn(cap[1])); + continue; + } + + // reflink, nolink + if ((cap = this.rules.reflink.exec(src)) + || (cap = this.rules.nolink.exec(src))) { + src = src.substring(cap[0].length); + link = (cap[2] || cap[1]).replace(/\s+/g, ' '); + link = this.links[link.toLowerCase()]; + if (!link || !link.href) { + out.push(cap[0].charAt(0)); + src = cap[0].substring(1) + src; + continue; + } + this.inLink = true; + out.push(this.outputLink(cap, link)); + this.inLink = false; + continue; + } + + // strong + if (cap = this.rules.strong.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.renderer.strong(this.output(cap[2] || cap[1]))); + continue; + } + + // em + if (cap = this.rules.em.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.renderer.em(this.output(cap[2] || cap[1]))); + continue; + } + + // code + if (cap = this.rules.code.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.renderer.codespan(this.escape(cap[2], true))); + continue; + } + + // math + if (cap = this.rules.math.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.renderer.math(cap[1], 'math/tex', false)); //FIXME: filter <script> & </script> + continue; + } + + // br + if (cap = this.rules.br.exec(src)) { + src = src.substring(cap[0].length); + 
out.push(this.renderer.br()); + continue; + } + + // del (gfm) + if (cap = this.rules.del.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.renderer.del(this.output(cap[1]))); + continue; + } + + // text + if (cap = this.rules.text.exec(src)) { + src = src.substring(cap[0].length); + out.push(this.escape(this.smartypants(cap[0]))); + continue; + } + + if (src) { + throw new + Error('Infinite loop on byte: ' + src.charCodeAt(0)); + } + } + return out; +}; + +/** + * Compile Link + */ + +InlineLexer.prototype.outputLink = function(cap, link) { + var href = this.escape(link.href) + , title = link.title ? this.escape(link.title) : null; + + return cap[0].charAt(0) !== '!' + ? this.renderer.link(href, title, this.output(cap[1])) + : this.renderer.image(href, title, this.escape(cap[1])); +}; + +/** + * Smartypants Transformations + */ + +InlineLexer.prototype.smartypants = function(text) { + if (!this.options.smartypants) return text; + return text + // em-dashes + .replace(/--/g, '\u2014') + // opening singles + .replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018') + // closing singles & apostrophes + .replace(/'/g, '\u2019') + // opening doubles + .replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c') + // closing doubles + .replace(/"/g, '\u201d') + // ellipses + .replace(/\.{3}/g, '\u2026'); +}; + +/** + * Mangle Links + */ + +InlineLexer.prototype.mangle = function(text) { + var out = '' + , l = text.length + , i = 0 + , ch; + + for (; i < l; i++) { + ch = text.charCodeAt(i); + if (Math.random() > 0.5) { + ch = 'x' + ch.toString(16); + } + out += '&#' + ch + ';'; + } + + return out; +}; + +module.exports = InlineLexer; diff --git a/lib/markdownToJSON/parser.js b/lib/markdownToJSON/parser.js new file mode 100644 index 0000000..ae60668 --- /dev/null +++ b/lib/markdownToJSON/parser.js @@ -0,0 +1,197 @@ +var Renderer = require('./renderer'); +var InlineLexer = require('./inline'); + +/** + * Parsing & Compiling + */ + +function Parser(options, renderer) { + 
this.tokens = []; + this.token = null; + this.options = options ? options : null; + this.renderer = renderer || (this.options && this.options.renderer) || new Renderer(this.options); +} + +/** + * Static Parse Method + */ + +Parser.parse = function(src, options, renderer) { + var parser = new Parser(options, renderer); + return parser.parse(src); +}; + +/** + * Parse Loop + */ + +Parser.prototype.parse = function(src) { + this.inline = new InlineLexer(src.links, this.options, this.renderer); + this.tokens = src.reverse(); + + var out = []; + while (this.next()) { + const token = this.tok(); + if(token) out.push(token) + } + + return out; +}; + +/** + * Next Token + */ + +Parser.prototype.next = function() { + return this.token = this.tokens.pop(); +}; + +/** + * Preview Next Token + */ + +Parser.prototype.peek = function() { + return this.tokens[this.tokens.length - 1] || 0; +}; + +/** + * Parse Text Tokens + */ + +Parser.prototype.parseText = function() { + let body = this.token.text + + while (this.peek().type === 'text') { + body+= this.next().text + '\n'; + } + + return this.inline.output(body); +}; + +/** + * Parse Current Token + */ + +Parser.prototype.tok = function() { + if(typeof this.token === 'undefined' || !this.token.hasOwnProperty('type')) { + return undefined; + } + switch (this.token.type) { + case 'space': { + return undefined; + } + case 'hr': { + return this.renderer.hr(); + } + case 'heading': { + let id = /({#)(.+)(})/g.exec(this.token.text); + id = id ? 
id[2] : null; + + return this.renderer.heading( + this.inline.output(this.token.text.replace(/{#.+}/g, '')), + this.token.depth, + this.token.text, + id); + } + case 'footnote': { + return this.renderer.footnote( + this.token.refname, + this.inline.output(this.token.text)); + } + case 'code': { + return this.renderer.code(this.token.text, + this.token.lang, + this.token.escaped); + } + case 'math': { + return this.renderer.math(this.token.text, 'math/tex', true); + } + case 'table': { + var header = [] + , body = [] + , i + , row + , cell + , flags + , j; + + // header + cell = []; + for (i = 0; i < this.token.header.length; i++) { + flags = { header: true, align: this.token.align[i] }; + cell.push(this.renderer.tablecell( + this.inline.output(this.token.header[i]), + { header: true, align: this.token.align[i] } + )); + } + header.push(this.renderer.tablerow(cell)); + + for (i = 0; i < this.token.cells.length; i++) { + row = this.token.cells[i]; + + cell = []; + for (j = 0; j < row.length; j++) { + cell.push(this.renderer.tablecell( + this.inline.output(row[j]), + { header: false, align: this.token.align[j] } + )); + } + + body.push(this.renderer.tablerow(cell)); + } + return this.renderer.table(header, body); + } + case 'blockquote_start': { + var body = []; + + while (this.next().type !== 'blockquote_end') { + body.push(this.tok()); + } + + return this.renderer.blockquote(body); + } + case 'list_start': { + var body = [] + , ordered = this.token.ordered; + + while (this.next().type !== 'list_end') { + body.push(this.tok()); + } + + return this.renderer.list(body, ordered); + } + case 'list_item_start': { + var body = []; + + while (this.next().type !== 'list_item_end') { + const token = this.token.type === 'text' + ? 
this.parseText() + : this.tok() + if(token) body.push(token) + } + + return this.renderer.listitem(body); + } + case 'loose_item_start': { + var body = []; + + while (this.next().type !== 'list_item_end') { + body.push(this.tok()); + } + + return this.renderer.listitem(body); + } + case 'html': { + return this.renderer.html(this.token.text); + } + case 'paragraph': { + const out = this.inline.output(this.token.text) + return this.renderer.paragraph(out); + } + case 'text': { + return this.renderer.paragraph(this.parseText()); + } + } +}; + +module.exports = Parser; diff --git a/lib/markdownToJSON/renderer.js b/lib/markdownToJSON/renderer.js new file mode 100644 index 0000000..927c8d9 --- /dev/null +++ b/lib/markdownToJSON/renderer.js @@ -0,0 +1,171 @@ +import {escape, unescape} from './escape' +import _ from 'lodash' + +/** + * Renderer + */ + +var defaultOptions = { + langPrefix: 'lang-', + smartypants: false, + headerPrefix: '', + headerAutoId: true, + xhtml: false, +}; + +function Renderer(options) { + this.options = options || defaultOptions; +} + +const flatten = (a) => { + if(!Array.isArray(a)) { + return a + } + const b = _.flattenDeep(a) + if(b.length < 2) return b[0] + return b +} + +const toStr = (obj, indent = '') => { + let children + if(typeof obj.children === 'string') { + children = JSON.stringify(obj.children)+',' + } else if(!obj.children) { + children = ',' + } else if(Array.isArray(obj.children)) { + const func = a => { + if(typeof a === 'string') return indent+' '+JSON.stringify(a) + else return toStr(a, indent+' ') + } + const list = obj.children.reduce((l, c) => { + if(!l) return func(c) + return l+',\n'+func(c) + }, false) + children = `[\n${list}\n${indent}],` + } else { + children = '\n'+toStr(obj.children, indent+' ')+',' + } + const opts = obj.opts ? JSON.stringify(obj.opts) : '' + return `${indent}{${obj.name},${children}${opts}}` +} + +const fmt = (name, children_, opts) => { + const children = flatten(children_) + return opts ? 
(children ? {name, children, opts} : {name, opts}) : (children ? {name, children} : {name}) +} + +Renderer.prototype.code = function(code, lang, escaped) { + if (this.options.highlight) { + var out = this.options.highlight(code, lang); + if (out != null && out !== code) { + escaped = true; + code = out; + } + } + if(lang) + return fmt( + 'code', + (escaped ? code : escape(code, true)), + { lang: this.options.langPrefix + escape(lang, true) }, + ) + else + return fmt( + 'code', + (escaped ? code : escape(code, true)), + ) +}; + +const simple = name => children => fmt(name, children) +Renderer.prototype.blockquote = simple('blockquote') +Renderer.prototype.hr = () => fmt('hr') +Renderer.prototype.listitem = simple('li') +Renderer.prototype.paragraph = simple('p') +Renderer.prototype.tablerow = simple('tr') + +Renderer.prototype.table = (header, body) => fmt('table', body, {header}) + +Renderer.prototype.list = function(body, ordered) { + var type = ordered ? 'ol' : 'ul'; + return fmt(type, body) +}; + +Renderer.prototype.html = html => html + +Renderer.prototype._createId = function(str) { + // replace " " and all punctuation characters to "-" + str = str.toLowerCase().replace(/[\s\]\[\!\"\#\$\%\&\'\(\)\*\+\,\.\/\:\;\<\=\>\?\@\\\^\_\`\{\|\}\~\-]+/g, '-'); + try { + str = encodeURIComponent(str); + } catch (e) { + str = str.replace(/[^\w]+/g, '-'); + } + return str.replace(/-$/, ''); +}; + +Renderer.prototype.heading = function(text, level, raw, _id) { + var id = _id; + + if (!id && this.options.headerAutoId !== false) id = this._createId(raw) + + return fmt('h', + text, + { id, level }) +}; + +Renderer.prototype.tablecell = function(content, {align, header}) { + var type = header ? 'th' : 'td'; + return fmt(type, content, align ? {align} : undefined) +}; + +Renderer.prototype.math = function(content, language, display) { + return fmt('math', content, {language, display}) + const mode = display ? 
'; mode=display' : ''; + return '<script type="' + language + mode + '">' + content + '</script>'; +} + +// span level renderer +Renderer.prototype.strong = simple('strong') +Renderer.prototype.em = simple('em') +Renderer.prototype.codespan = simple('code') +Renderer.prototype.del = simple('del') + +Renderer.prototype.br = () => 'br' + +Renderer.prototype.reffn = function(refname) { + return fmt('ref', undefined, {refname}) + //return '<sup><a href="#fn_' + refname + '" id="reffn_' + refname + '">' + refname + '</a></sup>' +} + +Renderer.prototype.footnote = function(refname, text) { + return fmt('footnote', text, {refname}) + /*return '<blockquote id="fn_' + refname + '">\n' + + '<sup>' + refname + '</sup>. ' + + text + + '<a href="#reffn_' + refname + '" title="Jump back to footnote [' + refname + '] in the text."> ↩</a>\n' + + '</blockquote>\n';*/ +} + +Renderer.prototype.link = function(href, title, text) { + if (this.options.sanitize) { + try { + var prot = decodeURIComponent(unescape(href)) + .replace(/[^\w:]/g, '') + .toLowerCase(); + } catch (e) { + return undefined; + } + if (prot.indexOf('javascript:') === 0) { + return undefined; + } + } + const out = fmt('a', text, {href, title}) + return out; +}; + +Renderer.prototype.image = function(href, title, text) { + return fmt('img', undefined, {href, alt: text, title}) +}; + +module.exports = Renderer; +module.exports.fmt = fmt +module.exports.toStr = toStr diff --git a/lib/parser.js b/lib/parser.js index 34055aa..7110471 100644 --- a/lib/parser.js +++ b/lib/parser.js @@ -29,6 +29,11 @@ const parser = page => { console.log(metadata._error) return undefined } + return { + ...page, + metadata: {...page.metadata, ...metadata}, + content: body, + } } export default parser diff --git a/package.json b/package.json index 1435153..b97d1cb 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "babel-plugin-transform-object-rest-spread": "^6.8.0", "babel-preset-es2017": "^6.14.0", "fb-watchman": "^1.9.0", + 
"kramed": "^0.5.6", "lodash": "^4.15.0", "moment": "^2.15.0", "sane": "^1.4.1", -- GitLab