From ccf83953a881f09c5387b890c6820078f45ace31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Sko=C5=99epa?= <jakub@skorepa.info>
Date: Sat, 17 Sep 2016 17:38:34 +0200
Subject: [PATCH] Created markdown parser based on kramed

---
 lib/index.js                   |  30 ++--
 lib/markdownToJSON/escape.js   |  21 +++
 lib/markdownToJSON/index.js    |  11 ++
 lib/markdownToJSON/inline.js   | 291 +++++++++++++++++++++++++++++++++
 lib/markdownToJSON/parser.js   | 197 ++++++++++++++++++++++
 lib/markdownToJSON/renderer.js | 171 +++++++++++++++++++
 lib/parser.js                  |   5 +
 package.json                   |   1 +
 8 files changed, 707 insertions(+), 20 deletions(-)
 create mode 100644 lib/markdownToJSON/escape.js
 create mode 100644 lib/markdownToJSON/index.js
 create mode 100644 lib/markdownToJSON/inline.js
 create mode 100644 lib/markdownToJSON/parser.js
 create mode 100644 lib/markdownToJSON/renderer.js

diff --git a/lib/index.js b/lib/index.js
index dd00375..00b1067 100644
--- a/lib/index.js
+++ b/lib/index.js
@@ -1,29 +1,19 @@
-/*
-definitions:
-  workUnit: { metadata: {}, content: string }
-  promisedWorkUnit: () => Promise( workUnit )
-
-Functions:
-  init - read data (from file, database, ...)
-    () => [ promisedWorkUnit ]
-  parse
-    workUnit => workUnit
-  collect
-    [workUnit] => [workUnit]
-  parse
-    workUnit => workUnit
-  terminate
-    workUnit => ()
-
-parse, collect and render should be pure
-
-*/
+import _ from 'lodash/fp'
+import fs from 'fs'
 
 import {readFiles} from './readFiles'
 import parser from './parser'
+import markdownToJSON from './markdownToJSON'
 
 readFiles({
   filter: file => !/\.git$/.exec(file)
 })(change => console.log())
 .then(files => files.map(f => parser(f)))
+.then(files => files.filter(f => f)) // keep only truthy entries (parser() returns undefined for a page it rejects)
+.then(files => files.map(f => ({ metadata: f.metadata, content: markdownToJSON(f.content) }))) // content: markdown string -> parsed node array
+.then(files => files.map(f => { // dump every parsed file to stdout
+  console.log(_.repeat(80,'=')) // visual separator between files
+  console.log(f.metadata)
+  console.log(f.content)
+}))
 .catch(e => console.log(e))
diff --git a/lib/markdownToJSON/escape.js b/lib/markdownToJSON/escape.js
new file mode 100644
index 0000000..6f78d70
--- /dev/null
+++ b/lib/markdownToJSON/escape.js
@@ -0,0 +1,21 @@
+// Escapes &, <, >, " and ' as HTML entities; with a falsy `encode`, an `&` that already starts an entity is left alone.
+export const escape = (html, encode) => {
+  const ampersand = encode ? /&/g : /&(?!#?\w+;)/g;
+  const withoutTags = html
+    .replace(ampersand, '&amp;')
+    .replace(/</g, '&lt;').replace(/>/g, '&gt;');
+  return withoutTags.replace(/"/g, '&quot;').replace(/'/g, '&#39;');
+}
+
+export const unescape = (html) => { // decodes numeric character references and `&colon;`; every other named entity is removed
+  return html.replace(/&([#\w]+);/g, function(_, n) {
+    n = n.toLowerCase();
+    if (n === 'colon') return ':';
+    if (n.charAt(0) === '#') {
+      return n.charAt(1) === 'x'
+        ? String.fromCharCode(parseInt(n.substring(2), 16)) // hexadecimal form: &#x..;
+        : String.fromCharCode(+n.substring(1)); // decimal form: &#..;
+    }
+    return ''; // any other named entity is dropped
+  });
+}
diff --git a/lib/markdownToJSON/index.js b/lib/markdownToJSON/index.js
new file mode 100644
index 0000000..adcaa54
--- /dev/null
+++ b/lib/markdownToJSON/index.js
@@ -0,0 +1,11 @@
+import _ from 'lodash/fp'
+
+import {toStr} from './renderer'
+import Parser from './parser'
+import kramed from 'kramed'
+
+const mtj = new Parser() // one Parser instance shared by every call
+export default _.flow(
+  a => kramed.lexer(a), // markdown source -> kramed block tokens
+  a => mtj.parse(a) // block tokens -> array of nodes built by the renderer
+)
diff --git a/lib/markdownToJSON/inline.js b/lib/markdownToJSON/inline.js
new file mode 100644
index 0000000..7de75c5
--- /dev/null
+++ b/lib/markdownToJSON/inline.js
@@ -0,0 +1,291 @@
+import { escape } from './escape'
+import Renderer from './renderer'
+var noop = () => {}; noop.exec = noop; // rules are always invoked as rule.exec(src), so the stub needs an exec() that matches nothing
+
+var inline = require('kramed/lib/rules/inline');
+var defaultOptions = require('kramed/lib/lex/options');
+var isHTMLBlock =  require('kramed/lib/lex/html_blocks');
+
+/**
+ * Inline Lexer & Compiler
+ */
+
+function InlineLexer(links, options, renderer) {
+  // Fall back to kramed's default lexer options when none are supplied.
+  this.options = options || defaultOptions;
+  this.links = links;
+  this.renderer = renderer;
+
+  if (!links) {
+    throw new Error('Tokens array requires a `links` property.');
+  }
+
+  // Choose the rule set: gfm (optionally with breaks) wins over pedantic,
+  // and the normal rules are used when neither flag is set.
+  const opts = this.options;
+  if (opts.gfm) {
+    this.rules = opts.breaks ? inline.breaks : inline.gfm;
+  } else if (opts.pedantic) {
+    this.rules = inline.pedantic;
+  } else {
+    this.rules = inline.normal;
+  }
+
+  // With mathjax off, the math rule is swapped for the no-op stub.
+  if (!opts.mathjax) {
+    this.rules.math = noop;
+  }
+}
+
+/**
+ * Expose Inline Rules
+ */
+
+InlineLexer.rules = inline;
+
+/**
+ * Static Lexing/Compiling Method
+ */
+
+InlineLexer.output = function(src, links, options) {
+  // One-shot helper: lex `src` with a throwaway lexer backed by a fresh Renderer.
+  return new InlineLexer(links, options, new Renderer()).output(src);
+};
+
+InlineLexer.prototype.escape = function(html, encode) {
+  // Escaping is skipped only when options.escape is explicitly `false`.
+  const escapingDisabled = Boolean(this.options) && this.options.escape === false;
+  if (escapingDisabled) return html;
+
+  return escape(html, encode);
+};
+
+/**
+ * Lexing/Compiling
+ */
+
+InlineLexer.prototype.output = function(src) { // returns an array of strings, renderer results and nested arrays from recursive calls
+  var out = []
+    , link
+    , text
+    , href
+    , cap;
+
+  while (src) { // each pass consumes the prefix of src matched by the first rule that applies
+    // escape
+    if (cap = this.rules.escape.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(cap[1]);
+      continue;
+    }
+
+    // autolink
+    if (cap = this.rules.autolink.exec(src)) {
+      src = src.substring(cap[0].length);
+      if (cap[2] === '@') { // e-mail autolink: the address is obfuscated with mangle()
+        text = cap[1].charAt(6) === ':'
+          ? this.mangle(cap[1].substring(7))
+          : this.mangle(cap[1]);
+        href = this.mangle('mailto:') + text;
+      } else {
+        text = this.escape(cap[1]);
+        href = text;
+      }
+      out.push(this.renderer.link(href, null, text));
+      continue;
+    }
+
+    // url (gfm)
+    if (!this.inLink && (cap = this.rules.url.exec(src))) { // bare URLs are not linked while already inside a link
+      src = src.substring(cap[0].length);
+      text = this.escape(cap[1]);
+      href = text;
+      out.push(this.renderer.link(href, null, text));
+      continue;
+    }
+
+    // html
+    if (cap = this.rules.html.exec(src)) {
+      // Found a link
+      if(cap[1] === 'a' && cap[2] && !this.inLink) {
+        // Opening tag
+        out.push(cap[0].substring(0, cap[0].indexOf(cap[2])));
+        this.inLink = true;
+        // In between the tag
+        out.push(this.output(cap[2]));
+        this.inLink = false;
+        // Outer tag
+        out.push(cap[0].substring(cap[0].indexOf(cap[2])+cap[2].length));
+        // Advance parser
+        src = src.substring(cap[0].length);
+        continue;
+      }
+
+      // Found HTML that we should parse
+      if(cap[1] && !isHTMLBlock(cap[1]) && cap[2]) {
+        // Opening tag
+        out.push(cap[0].substring(0, cap[0].indexOf(cap[2])));
+        // In between the tag
+        out.push(this.output(cap[2]));
+        // Outer tag
+        out.push(cap[0].substring(cap[0].indexOf(cap[2])+cap[2].length));
+        // Advance parser
+        src = src.substring(cap[0].length);
+        continue;
+      }
+
+      // Any other HTML
+      src = src.substring(cap[0].length);
+      out.push(cap[0]);
+      continue;
+    }
+
+    // link
+    if (cap = this.rules.link.exec(src)) {
+      src = src.substring(cap[0].length);
+      this.inLink = true;
+      out.push(this.outputLink(cap, {
+        href: cap[2],
+        title: cap[3]
+      }));
+      this.inLink = false;
+      continue;
+    }
+
+    // reffn
+    if ((cap = this.rules.reffn.exec(src))) {
+        src = src.substring(cap[0].length);
+        out.push(this.renderer.reffn(cap[1]));
+        continue;
+    }
+
+    // reflink, nolink
+    if ((cap = this.rules.reflink.exec(src))
+        || (cap = this.rules.nolink.exec(src))) {
+      src = src.substring(cap[0].length);
+      link = (cap[2] || cap[1]).replace(/\s+/g, ' ');
+      link = this.links[link.toLowerCase()];
+      if (!link || !link.href) { // unknown reference: emit the first character literally and re-lex the rest
+        out.push(cap[0].charAt(0));
+        src = cap[0].substring(1) + src;
+        continue;
+      }
+      this.inLink = true;
+      out.push(this.outputLink(cap, link));
+      this.inLink = false;
+      continue;
+    }
+
+    // strong
+    if (cap = this.rules.strong.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.strong(this.output(cap[2] || cap[1])));
+      continue;
+    }
+
+    // em
+    if (cap = this.rules.em.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.em(this.output(cap[2] || cap[1])));
+      continue;
+    }
+
+    // code
+    if (cap = this.rules.code.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.codespan(this.escape(cap[2], true)));
+      continue;
+    }
+
+    // math
+    if (cap = this.rules.math.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.math(cap[1], 'math/tex', false)); //FIXME: filter <script> & </script>
+      continue;
+    }
+
+    // br
+    if (cap = this.rules.br.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.br());
+      continue;
+    }
+
+    // del (gfm)
+    if (cap = this.rules.del.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.renderer.del(this.output(cap[1])));
+      continue;
+    }
+
+    // text
+    if (cap = this.rules.text.exec(src)) {
+      src = src.substring(cap[0].length);
+      out.push(this.escape(this.smartypants(cap[0])));
+      continue;
+    }
+
+    if (src) { // no rule consumed any input: abort instead of looping forever
+      throw new
+        Error('Infinite loop on byte: ' + src.charCodeAt(0));
+    }
+  }
+  return out;
+};
+
+/**
+ * Compile Link
+ */
+
+InlineLexer.prototype.outputLink = function(cap, link) {
+  const href = this.escape(link.href);
+  const title = link.title ? this.escape(link.title) : null;
+  if (cap[0].charAt(0) === '!') { // a leading "!" marks an image, whose text is escaped rather than lexed
+    return this.renderer.image(href, title, this.escape(cap[1]));
+  }
+  return this.renderer.link(href, title, this.output(cap[1]));
+};
+
+/**
+ * Smartypants Transformations
+ */
+
+InlineLexer.prototype.smartypants = function(text) {
+  if (!this.options.smartypants) return text; // opt-in: text is returned untouched unless options.smartypants is set
+  return text
+    // "--" becomes an em dash
+    .replace(/--/g, '\u2014')
+    // a single quote at the start or after an opening character/whitespace becomes an opening quote
+    .replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018')
+    // every remaining single quote is a closing quote or apostrophe
+    .replace(/'/g, '\u2019')
+    // a double quote at the start or after an opening character/whitespace becomes an opening quote
+    .replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c')
+    // every remaining double quote is a closing quote
+    .replace(/"/g, '\u201d')
+    // three dots become an ellipsis character
+    .replace(/\.{3}/g, '\u2026');
+};
+
+/**
+ * Mangle Links
+ */
+
+InlineLexer.prototype.mangle = function(text) {
+  // Encode every UTF-16 code unit as a numeric character reference,
+  // picking hexadecimal or decimal notation at random for each one.
+  let mangled = '';
+
+  for (let index = 0; index < text.length; index += 1) {
+    const code = text.charCodeAt(index);
+    const useHex = Math.random() > 0.5;
+    const digits = useHex
+      ? `x${code.toString(16)}`
+      : String(code);
+    mangled += `&#${digits};`;
+  }
+
+  return mangled;
+};
+
+module.exports = InlineLexer;
diff --git a/lib/markdownToJSON/parser.js b/lib/markdownToJSON/parser.js
new file mode 100644
index 0000000..ae60668
--- /dev/null
+++ b/lib/markdownToJSON/parser.js
@@ -0,0 +1,197 @@
+var Renderer = require('./renderer');
+var InlineLexer = require('./inline');
+
+/**
+ * Parsing & Compiling
+ */
+
+function Parser(options, renderer) {
+  this.tokens = [];
+  this.token = null;
+  this.options = options || null;
+  this.renderer = renderer || (options ? options.renderer : undefined) || new Renderer(this.options); // precedence: argument, options.renderer, default Renderer
+}
+
+/**
+ * Static Parse Method
+ */
+
+Parser.parse = function(src, options, renderer) {
+  // Convenience wrapper: parse `src` with a single-use Parser instance.
+  return new Parser(options, renderer).parse(src);
+};
+
+/**
+ * Parse Loop
+ */
+
+Parser.prototype.parse = function(src) {
+  // `src` is a token array whose `links` property is handed to the inline lexer; returns the array of rendered nodes.
+  this.inline = new InlineLexer(src.links, this.options, this.renderer);
+  // Work on a reversed copy so next() can pop() tokens without reversing or draining the caller's array.
+  this.tokens = src.slice().reverse();
+  const out = [];
+  while (this.next()) {
+    const node = this.tok();
+    if (node) out.push(node); // tok() yields undefined for tokens that produce no node
+  }
+  return out;
+};
+
+// Advance the cursor: the token popped off the end of the token queue becomes
+// the current one and is also returned; undefined once none remain.
+Parser.prototype.next = function() {
+  const upcoming = this.tokens.pop();
+  this.token = upcoming;
+  return upcoming;
+};
+
+// Look at the upcoming token without consuming it. Yields 0 when the queue
+// is empty, so callers can still read `.type` off the result safely.
+Parser.prototype.peek = function() {
+  const queue = this.tokens;
+  const upcoming = queue[queue.length - 1];
+  return upcoming || 0;
+};
+
+/**
+ * Parse Text Tokens
+ */
+
+Parser.prototype.parseText = function() {
+  // Join the current text token with every directly following text token,
+  // one per line, and return the inline lexer's output for the joined text.
+  let body = this.token.text;
+  while (this.peek().type === 'text') {
+    body += '\n' + this.next().text; // separator goes BETWEEN fragments so adjacent lines are never glued together
+  }
+  return this.inline.output(body);
+};
+
+/**
+ * Parse Current Token
+ */
+
+Parser.prototype.tok = function() {
+  // A missing token, or one without a `type`, renders to nothing.
+  if(typeof this.token === 'undefined' || !this.token.hasOwnProperty('type')) {
+      return undefined;
+  }
+  switch (this.token.type) {
+    case 'space': {
+      return undefined;
+    }
+    case 'hr': {
+      return this.renderer.hr();
+    }
+    case 'heading': {
+      // An explicit id can be written as `{#some-id}` inside the heading text.
+      let id = /({#)(.+)(})/g.exec(this.token.text);
+      id = id ? id[2] : null;
+
+      return this.renderer.heading(
+        this.inline.output(this.token.text.replace(/{#.+}/g, '')),
+        this.token.depth,
+        this.token.text,
+        id);
+    }
+    case 'footnote': {
+      return this.renderer.footnote(
+        this.token.refname,
+        this.inline.output(this.token.text));
+    }
+    case 'code': {
+      return this.renderer.code(this.token.text,
+        this.token.lang,
+        this.token.escaped);
+    }
+    case 'math': {
+      return this.renderer.math(this.token.text, 'math/tex', true);
+    }
+    case 'table': {
+      var header = []
+        , body = []
+        , i
+        , row
+        , cell
+        , j;
+
+      // header
+      cell = [];
+      for (i = 0; i < this.token.header.length; i++) {
+        cell.push(this.renderer.tablecell(
+          this.inline.output(this.token.header[i]),
+          { header: true, align: this.token.align[i] }
+        ));
+      }
+      header.push(this.renderer.tablerow(cell));
+
+      for (i = 0; i < this.token.cells.length; i++) {
+        row = this.token.cells[i];
+
+        cell = [];
+        for (j = 0; j < row.length; j++) {
+          cell.push(this.renderer.tablecell(
+            this.inline.output(row[j]),
+            { header: false, align: this.token.align[j] }
+          ));
+        }
+
+        body.push(this.renderer.tablerow(cell));
+      }
+      return this.renderer.table(header, body);
+    }
+    case 'blockquote_start': {
+      return this.renderer.blockquote(collectUntil(this, 'blockquote_end'));
+    }
+    case 'list_start': {
+      // Read `ordered` up front: collecting the items moves this.token on.
+      const ordered = this.token.ordered;
+      return this.renderer.list(collectUntil(this, 'list_end'), ordered);
+    }
+    case 'list_item_start': {
+      var body = [];
+
+      // Runs of bare text tokens are merged and inline-lexed by parseText().
+      while (this.next().type !== 'list_item_end') {
+        const token = this.token.type === 'text'
+          ? this.parseText()
+          : this.tok()
+        if(token) body.push(token)
+      }
+
+      return this.renderer.listitem(body);
+    }
+    case 'loose_item_start': {
+      return this.renderer.listitem(collectUntil(this, 'list_item_end'));
+    }
+    case 'html': {
+      return this.renderer.html(this.token.text);
+    }
+    case 'paragraph': {
+      const out = this.inline.output(this.token.text)
+      return this.renderer.paragraph(out);
+    }
+    case 'text': {
+      // Text that reaches tok() directly is wrapped in a paragraph.
+      return this.renderer.paragraph(this.parseText());
+    }
+  }
+};
+
+/**
+ * Collect the child nodes of a container token: advance until a token of
+ * type `endType` is current, rendering every token on the way. Tokens that
+ * render to nothing (tok() returns undefined, e.g. `space`) are left out so
+ * containers never receive `undefined` children.
+ */
+function collectUntil(parser, endType) {
+  const nodes = [];
+  while (parser.next().type !== endType) {
+    const node = parser.tok();
+    if (node) nodes.push(node);
+  }
+  return nodes;
+}
+
+module.exports = Parser;
diff --git a/lib/markdownToJSON/renderer.js b/lib/markdownToJSON/renderer.js
new file mode 100644
index 0000000..927c8d9
--- /dev/null
+++ b/lib/markdownToJSON/renderer.js
@@ -0,0 +1,171 @@
+import {escape, unescape} from './escape'
+import _ from 'lodash'
+
+/**
+ * Renderer
+ */
+
+var defaultOptions = {
+  langPrefix: 'lang-', // prepended to the language name of a code block (see code())
+  smartypants: false,
+  headerPrefix: '',
+  headerAutoId: true, // headings without an explicit id get one generated by _createId()
+  xhtml: false,
+};
+
+function Renderer(options) {
+  this.options = options || defaultOptions; // all-or-nothing: a supplied options object is not merged with the defaults
+}
+
+// Normalise children: non-arrays pass through, arrays are deeply flattened,
+// and a result with fewer than two entries collapses to its only entry
+// (undefined for an empty array).
+const flatten = (a) => {
+  if (!Array.isArray(a)) return a
+  const flat = _.flattenDeep(a)
+  return flat.length >= 2 ? flat : flat[0]
+}
+
+const toStr = (obj, indent = '') => { // renders a node tree as an indented, JSON-like string; `indent` prefixes the current nesting level
+  let children
+  if(typeof obj.children === 'string') {
+    children = JSON.stringify(obj.children)+','
+  } else if(!obj.children) {
+    children = ',' // no children: only the separator is emitted
+  } else if(Array.isArray(obj.children)) {
+    const func = a => { // string children are quoted, node children are rendered recursively one level deeper
+      if(typeof a === 'string') return indent+'  '+JSON.stringify(a)
+      else return toStr(a, indent+'  ')
+    }
+    const list = obj.children.reduce((l, c) => {
+      if(!l) return func(c)
+      return l+',\n'+func(c)
+    }, false) // `false` marks "nothing accumulated yet", so the first child gets no leading separator
+    children = `[\n${list}\n${indent}],`
+  } else {
+    children = '\n'+toStr(obj.children, indent+'  ')+',' // a single node child goes on its own line
+  }
+  const opts = obj.opts ? JSON.stringify(obj.opts) : ''
+  return `${indent}{${obj.name},${children}${opts}}`
+}
+
+const fmt = (name, children_, opts) => {
+  const children = flatten(children_) // `children` and `opts` keys are only present when their value is truthy
+  return { name, ...(children ? { children } : {}), ...(opts ? { opts } : {}) }
+}
+
+Renderer.prototype.code = function(code, lang, escaped) {
+  let source = code;
+  let alreadyEscaped = escaped;
+
+  // A configured highlighter may replace the source; its output is then
+  // treated as already escaped.
+  if (this.options.highlight) {
+    const highlighted = this.options.highlight(code, lang);
+    if (highlighted != null && highlighted !== code) {
+      alreadyEscaped = true;
+      source = highlighted;
+    }
+  }
+
+  const children = alreadyEscaped ? source : escape(source, true);
+  // Only blocks with a language carry opts: the prefixed, escaped language name.
+  const opts = lang ? { lang: this.options.langPrefix + escape(lang, true) } : undefined;
+
+  return fmt('code', children, opts);
+};
+
+const simple = name => children => fmt(name, children) // factory for renderers that only wrap their children in a named node
+Renderer.prototype.blockquote = simple('blockquote')
+Renderer.prototype.hr = () => fmt('hr')
+Renderer.prototype.listitem = simple('li')
+Renderer.prototype.paragraph = simple('p')
+Renderer.prototype.tablerow = simple('tr')
+
+Renderer.prototype.table = (header, body) => fmt('table', body, {header}) // body rows are the children; header rows travel in opts.header
+
+Renderer.prototype.list = function(body, ordered) {
+  // Ordered lists become `ol` nodes, everything else `ul`.
+  return fmt(ordered ? 'ol' : 'ul', body)
+};
+
+Renderer.prototype.html = html => html // raw HTML is returned unchanged as a string, not wrapped in a node
+
+Renderer.prototype._createId = function(str) {
+  // lower-case, then collapse each run of whitespace/punctuation into a single "-"
+  str = str.toLowerCase().replace(/[\s\]\[\!\"\#\$\%\&\'\(\)\*\+\,\.\/\:\;\<\=\>\?\@\\\^\_\`\{\|\}\~\-]+/g, '-');
+  try {
+    str = encodeURIComponent(str);
+  } catch (e) { // encoding failed: fall back to replacing every run of non-word characters with "-"
+    str = str.replace(/[^\w]+/g, '-');
+  }
+  return str.replace(/-$/, ''); // drop a single trailing "-"
+};
+
+Renderer.prototype.heading = function(text, level, raw, _id) {
+  // An explicit id wins; otherwise one is derived from the raw heading text
+  // unless options.headerAutoId is exactly `false`.
+  const needsAutoId = !_id && this.options.headerAutoId !== false;
+  const id = needsAutoId ? this._createId(raw) : _id;
+
+  const opts = { id, level };
+  return fmt('h', text, opts)
+};
+
+Renderer.prototype.tablecell = function(content, {align, header}) {
+  // Header cells are `th`, body cells `td`; opts exist only when an alignment is set.
+  return fmt(header ? 'th' : 'td', content, align ? {align} : undefined)
+};
+
+Renderer.prototype.math = function(content, language, display) {
+  // Math is emitted as a node, never as an HTML <script> tag: `content` is kept
+  // verbatim and opts records the language and the caller's `display` flag.
+  return fmt('math', content, {language, display})
+}
+
+// span level renderer
+Renderer.prototype.strong = simple('strong')
+Renderer.prototype.em = simple('em')
+Renderer.prototype.codespan = simple('code') // inline code shares the 'code' node name with code blocks (see code())
+Renderer.prototype.del = simple('del')
+
+Renderer.prototype.br = () => fmt('br') // a childless node like hr(); a bare 'br' string would be indistinguishable from the literal text "br"
+
+Renderer.prototype.reffn = function(refname) {
+  // Inline footnote reference: a childless `ref` node carrying only the footnote name.
+  return fmt('ref', undefined, { refname })
+}
+
+Renderer.prototype.footnote = function(refname, text) {
+  // Footnote definition: `text` becomes the node's children and `refname`
+  // is stored in opts, the same key the `ref` nodes use.
+  const name = 'footnote'
+  const opts = { refname }
+
+  return fmt(name, text, opts)
+}
+
+Renderer.prototype.link = function(href, title, text) {
+  // With options.sanitize set, undecodable and javascript: hrefs yield no node at all.
+  if (this.options.sanitize) {
+    let protocol;
+    try {
+      protocol = decodeURIComponent(unescape(href))
+        .replace(/[^\w:]/g, '')
+        .toLowerCase();
+    } catch (e) {
+      return undefined; // href cannot be decoded
+    }
+    const isScriptUrl = protocol.indexOf('javascript:') === 0;
+    if (isScriptUrl) return undefined;
+  }
+  return fmt('a', text, {href, title})
+};
+
+Renderer.prototype.image = function(href, title, text) {
+  return fmt('img', undefined, {href, alt: text, title}) // childless node: the text argument is stored as opts.alt
+};
+
+module.exports = Renderer;
+module.exports.fmt = fmt
+module.exports.toStr = toStr
diff --git a/lib/parser.js b/lib/parser.js
index 34055aa..7110471 100644
--- a/lib/parser.js
+++ b/lib/parser.js
@@ -29,6 +29,11 @@ const parser = page => {
     console.log(metadata._error)
     return undefined
   }
+  return {
+    ...page,
+    metadata: {...page.metadata, ...metadata},
+    content: body,
+  }
 }
 
 export default parser
diff --git a/package.json b/package.json
index 1435153..b97d1cb 100644
--- a/package.json
+++ b/package.json
@@ -30,6 +30,7 @@
     "babel-plugin-transform-object-rest-spread": "^6.8.0",
     "babel-preset-es2017": "^6.14.0",
     "fb-watchman": "^1.9.0",
+    "kramed": "^0.5.6",
     "lodash": "^4.15.0",
     "moment": "^2.15.0",
     "sane": "^1.4.1",
-- 
GitLab