From a8f374dd436ac58ac6101cc39ecabb453f844a4c Mon Sep 17 00:00:00 2001 From: ginnyTheCat Date: Fri, 16 Sep 2022 17:51:53 +0200 Subject: [PATCH] Parsing HTML to Markdown AST (#847) * Force mentions to have a space after the # * Use types for rendering * Parse HTML * Add code block support * Add table support * Allow starting heading without a space * Escape relevant plaintext areas * Resolve many crashes * Use better matrix id regex * Don't match . after id * Don't parse mentions as links * Add emote support * Only emit HTML link if necessary * Implement review changes --- src/app/molecules/message/Message.jsx | 5 +- src/client/state/Notifications.js | 13 +- src/util/common.js | 24 +++ src/util/markdown.js | 269 +++++++++++++++++++++----- 4 files changed, 265 insertions(+), 46 deletions(-) diff --git a/src/app/molecules/message/Message.jsx b/src/app/molecules/message/Message.jsx index 02a5562c..6becae1c 100644 --- a/src/app/molecules/message/Message.jsx +++ b/src/app/molecules/message/Message.jsx @@ -40,6 +40,7 @@ import BinIC from '../../../../public/res/ic/outlined/bin.svg'; import { confirmDialog } from '../confirm-dialog/ConfirmDialog'; import { getBlobSafeMimeType } from '../../../util/mimetypes'; +import { html, plain } from '../../../util/markdown'; function PlaceholderMessage() { return ( @@ -802,7 +803,9 @@ function Message({ )} {isEdit && ( { if (newBody !== body) { initMatrix.roomsInput.sendEditedMessage(roomId, mEvent, newBody); diff --git a/src/client/state/Notifications.js b/src/client/state/Notifications.js index da4521dd..db4610a3 100644 --- a/src/client/state/Notifications.js +++ b/src/client/state/Notifications.js @@ -10,6 +10,7 @@ import { setFavicon } from '../../util/common'; import LogoSVG from '../../../public/res/svg/cinny.svg'; import LogoUnreadSVG from '../../../public/res/svg/cinny-unread.svg'; import LogoHighlightSVG from '../../../public/res/svg/cinny-highlight.svg'; +import { html, plain } from '../../util/markdown'; function 
isNotifEvent(mEvent) { const eType = mEvent.getType(); @@ -257,8 +258,18 @@ class Notifications extends EventEmitter { scale: 8, }); + const content = mEvent.getContent(); + + const state = { kind: 'notification', onlyPlain: true }; + let body; + if (content.format === 'org.matrix.custom.html') { + body = html(content.formatted_body, state); + } else { + body = plain(content.body, state); + } + const noti = new window.Notification(title, { - body: mEvent.getContent().body, + body: body.plain, icon, tag: mEvent.getId(), silent: settings.isNotificationSounds, diff --git a/src/util/common.js b/src/util/common.js index 1de2bf0f..2affe27d 100644 --- a/src/util/common.js +++ b/src/util/common.js @@ -204,3 +204,27 @@ export function scaleDownImage(imageFile, width, height) { img.src = imgURL; }); } + +/** + * @param {string} sigil sigil to search for (for example '@', '#' or '$') + * @param {string} flags regex flags + * @param {string} prefix prefix prepended to the regex + * @returns {RegExp} + */ +export function idRegex(sigil, flags, prefix) { + const servername = '(?:[a-zA-Z0-9-.]*[a-zA-Z0-9]+|\\[\\S+?\\])(?::\\d+)?'; + return new RegExp(`${prefix}(${sigil}\\S+:${servername})`, flags); +} + +const matrixToRegex = /^https?:\/\/matrix.to\/#\/(\S+:\S+)/; +/** + * Parses a matrix.to URL into a matrix id.
+ * This function can later be extended to support matrix: URIs + * @param {string} uri The URI to parse + * @returns {string|null} The id or null if the URI does not match + */ +export function parseIdUri(uri) { + const res = decodeURIComponent(uri).match(matrixToRegex); + if (!res) return null; + return res[1]; +} diff --git a/src/util/markdown.js b/src/util/markdown.js index 324a12b5..bc83cb33 100644 --- a/src/util/markdown.js +++ b/src/util/markdown.js @@ -1,4 +1,6 @@ +/* eslint-disable no-use-before-define */ import SimpleMarkdown from '@khanacademy/simple-markdown'; +import { idRegex, parseIdUri } from './common'; const { defaultRules, parserFor, outputFor, anyScopeRegex, blockRegex, inlineRegex, @@ -31,25 +33,24 @@ const emojiRegex = /^:([\w-]+):/; const plainRules = { Array: { ...defaultRules.Array, - plain: (arr, output, state) => arr.map((node) => output(node, state)).join(''), + plain: defaultRules.Array.html, }, userMention: { order: defaultRules.em.order - 0.9, - match: inlineRegex(/^(@\S+:\S+)/), + match: inlineRegex(idRegex('@', undefined, '^')), parse: (capture, _, state) => ({ + type: 'mention', content: state.userNames[capture[1]] ? `@${state.userNames[capture[1]]}` : capture[1], id: capture[1], }), - plain: (node) => node.content, - html: (node) => htmlTag('a', sanitizeText(node.content), { - href: `https://matrix.to/#/${encodeURIComponent(node.id)}`, - }), }, roomMention: { order: defaultRules.em.order - 0.8, - match: inlineRegex(/^(#\S+:\S+)/), // TODO: Handle line beginning with roomMention (instead of heading) - parse: (capture) => ({ content: capture[1], id: capture[1] }), - plain: (node) => node.content, + match: inlineRegex(idRegex('#', undefined, '^')), + parse: (capture) => ({ type: 'mention', content: capture[1], id: capture[1] }), + }, + mention: { + plain: (node, _, state) => (state.kind === 'edit' ? 
node.id : node.content), html: (node) => htmlTag('a', sanitizeText(node.content), { href: `https://matrix.to/#/${encodeURIComponent(node.id)}`, }), @@ -95,7 +96,7 @@ const plainRules = { text: { ...defaultRules.text, match: anyScopeRegex(/^[\s\S]+?(?=[^0-9A-Za-z\s\u00c0-\uffff]| *\n|\w+:\S|$)/), - plain: (node) => node.content, + plain: (node) => node.content.replace(/(\*|_|!\[|\[|\|\||\$\$?)/g, '\\$1'), }, }; @@ -104,12 +105,13 @@ const markdownRules = { ...plainRules, heading: { ...defaultRules.heading, + match: blockRegex(/^ *(#{1,6})([^\n:]*?(?: [^\n]*?)?)#* *(?:\n *)+\n/), plain: (node, output, state) => { const out = output(node.content, state); - if (node.level <= 2) { - return `${out}\n${(node.level === 1 ? '=' : '-').repeat(out.length)}\n\n`; + if (state.kind === 'edit' || state.kind === 'notification' || node.level > 2) { + return `${'#'.repeat(node.level)} ${out}\n\n`; } - return `${'#'.repeat(node.level)} ${out}\n\n`; + return `${out}\n${(node.level === 1 ? '=' : '-').repeat(out.length)}\n\n`; }, }, hr: { @@ -119,6 +121,9 @@ const markdownRules = { codeBlock: { ...defaultRules.codeBlock, plain: (node) => `\`\`\`${node.lang || ''}\n${node.content}\n\`\`\``, + html: (node) => htmlTag('pre', htmlTag('code', sanitizeText(node.content), { + class: node.lang ? `language-${node.lang}` : undefined, + })), }, fence: { ...defaultRules.fence, @@ -131,7 +136,7 @@ const markdownRules = { list: { ...defaultRules.list, plain: (node, output, state) => `${node.items.map((item, i) => { - const prefix = node.ordered ? `${node.start + i + 1}. ` : '* '; + const prefix = node.ordered ? `${node.start + i}. 
` : '* '; return prefix + output(item, state).replace(/\n/g, `\n${' '.repeat(prefix.length)}`); }).join('\n')}\n`, }, @@ -141,8 +146,8 @@ const markdownRules = { plain: (node, output, state) => { const header = node.header.map((content) => output(content, state)); - function lineWidth(i) { - switch (node.align[i]) { + const colWidth = node.align.map((align) => { + switch (align) { case 'left': case 'right': return 2; @@ -151,12 +156,14 @@ const markdownRules = { default: return 1; } - } - const colWidth = header.map((s, i) => Math.max(s.length, lineWidth(i))); + }); + header.forEach((s, i) => { + if (s.length > colWidth[i])colWidth[i] = s.length; + }); const cells = node.cells.map((row) => row.map((content, i) => { const s = output(content, state); - if (s.length > colWidth[i]) { + if (colWidth[i] === undefined || s.length > colWidth[i]) { colWidth[i] = s.length; } return s; @@ -228,10 +235,17 @@ const markdownRules = { } return out; }, - html: (node, output, state) => htmlTag('a', output(node.content, state), { - href: sanitizeUrl(node.target) || '', - title: node.title, - }), + html: (node, output, state) => { + const out = output(node.content, state); + const target = sanitizeUrl(node.target) || ''; + if (out !== target || node.title) { + return htmlTag('a', out, { + href: target, + title: node.title, + }); + } + return target; + }, }, image: { ...defaultRules.image, @@ -271,7 +285,17 @@ const markdownRules = { content: parse(capture[1], state), reason: capture[2], }), - plain: (node, output, state) => `[spoiler${node.reason ? `: ${node.reason}` : ''}](${output(node.content, state)})`, + plain: (node, output, state) => { + const warning = `spoiler${node.reason ? `: ${node.reason}` : ''}`; + switch (state.kind) { + case 'edit': + return `||${output(node.content, state)}||${node.reason ? 
`(${node.reason})` : ''}`; + case 'notification': + return `<${warning}>`; + default: + return `[${warning}](${output(node.content, state)})`; + } + }, html: (node, output, state) => htmlTag( 'span', output(node.content, state), @@ -287,32 +311,189 @@ const markdownRules = { }, }; -function genOut(rules) { - const parser = parserFor(rules); +function mapElement(el) { + switch (el.tagName) { + case 'MX-REPLY': + return []; - const plainOut = outputFor(rules, 'plain'); - const htmlOut = outputFor(rules, 'html'); + case 'P': + return [{ type: 'paragraph', content: mapChildren(el) }]; + case 'BR': + return [{ type: 'br' }]; - return (source, state) => { - let content = parser(source, state); - - if (content.length === 1 && content[0].type === 'paragraph') { - content = content[0].content; + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + return [{ type: 'heading', level: Number(el.tagName[1]), content: mapChildren(el) }]; + case 'HR': + return [{ type: 'hr' }]; + case 'PRE': { + let lang; + if (el.firstChild) { + Array.from(el.firstChild.classList).some((c) => { + const langPrefix = 'language-'; + if (c.startsWith(langPrefix)) { + lang = c.slice(langPrefix.length); + return true; + } + return false; + }); + } + return [{ type: 'codeBlock', lang, content: el.innerText }]; } + case 'BLOCKQUOTE': + return [{ type: 'blockQuote', content: mapChildren(el) }]; + case 'UL': + return [{ type: 'list', items: mapChildren(el) }]; + case 'OL': + return [{ + type: 'list', + ordered: true, + start: Number(el.getAttribute('start')), + items: mapChildren(el), + }]; + case 'TABLE': { + const headerEl = Array.from(el.querySelector('thead > tr').childNodes); + const align = headerEl.map((childE) => childE.style['text-align']); + return [{ + type: 'table', + header: headerEl.map(mapChildren), + align, + cells: Array.from(el.querySelectorAll('tbody > tr')).map((rowEl) => Array.from(rowEl.childNodes).map((childEl, i) => { + if (align[i] === undefined) align[i] = 
childEl.style['text-align']; + return mapChildren(childEl); + })), + }]; + } + case 'A': { + const href = el.getAttribute('href'); - const plain = plainOut(content, state).trim(); - const html = htmlOut(content, state); + const id = parseIdUri(href); + if (id) return [{ type: 'mention', content: el.innerText, id }]; - const plainHtml = html.replace(/
<br>/g, '\n').replace(/<\/p><p>
/g, '\n\n').replace(/<\/?p>/g, ''); - const onlyPlain = sanitizeText(plain) === plainHtml; + return [{ + type: 'link', + target: el.getAttribute('href'), + title: el.getAttribute('title'), + content: mapChildren(el), + }]; + } + case 'IMG': { + const src = el.getAttribute('src'); + let title = el.getAttribute('title'); + if (el.hasAttribute('data-mx-emoticon')) { + if (title.length > 2 && title.startsWith(':') && title.endsWith(':')) { + title = title.slice(1, -1); + } + return [{ + type: 'emoji', + content: title, + emoji: { + mxc: src, + shortcode: title, + }, + }]; + } - return { - onlyPlain, - plain, - html, - }; + return [{ + type: 'image', + alt: el.getAttribute('alt'), + target: src, + title, + }]; + } + case 'EM': + case 'I': + return [{ type: 'em', content: mapChildren(el) }]; + case 'STRONG': + case 'B': + return [{ type: 'strong', content: mapChildren(el) }]; + case 'U': + return [{ type: 'u', content: mapChildren(el) }]; + case 'DEL': + case 'STRIKE': + return [{ type: 'del', content: mapChildren(el) }]; + case 'CODE': + return [{ type: 'inlineCode', content: el.innerText }]; + + case 'DIV': + if (el.hasAttribute('data-mx-maths')) { + return [{ type: 'displayMath', content: el.getAttribute('data-mx-maths') }]; + } + return mapChildren(el); + case 'SPAN': + if (el.hasAttribute('data-mx-spoiler')) { + return [{ type: 'spoiler', reason: el.getAttribute('data-mx-spoiler'), content: mapChildren(el) }]; + } + if (el.hasAttribute('data-mx-maths')) { + return [{ type: 'inlineMath', content: el.getAttribute('data-mx-maths') }]; + } + return mapChildren(el); + default: + return mapChildren(el); + } +} + +function mapNode(n) { + switch (n.nodeType) { + case Node.TEXT_NODE: + return [{ type: 'text', content: n.textContent }]; + case Node.ELEMENT_NODE: + return mapElement(n); + default: + return []; + } +} + +function mapChildren(n) { + return Array.from(n.childNodes).reduce((ast, childN) => { + ast.push(...mapNode(childN)); + return ast; + }, []); +} + +function 
render(content, state, plainOut, htmlOut) { + let c = content; + if (content.length === 1 && content[0].type === 'paragraph') { + c = c[0].content; + } + + const plainStr = plainOut(c, state).trim(); + if (state.onlyPlain) return { plain: plainStr }; + + const htmlStr = htmlOut(c, state); + + const plainHtml = htmlStr.replace(/
<br>/g, '\n').replace(/<\/p><p>
/g, '\n\n').replace(/<\/?p>/g, ''); + const onlyPlain = sanitizeText(plainStr) === plainHtml; + + return { + onlyPlain, + plain: plainStr, + html: htmlStr, }; } -export const plain = genOut(plainRules); -export const markdown = genOut(markdownRules); +const plainParser = parserFor(plainRules); +const plainPlainOut = outputFor(plainRules, 'plain'); +const plainHtmlOut = outputFor(plainRules, 'html'); + +const mdParser = parserFor(markdownRules); +const mdPlainOut = outputFor(markdownRules, 'plain'); +const mdHtmlOut = outputFor(markdownRules, 'html'); + +export function plain(source, state) { + return render(plainParser(source, state), state, plainPlainOut, plainHtmlOut); +} + +export function markdown(source, state) { + return render(mdParser(source, state), state, mdPlainOut, mdHtmlOut); +} + +export function html(source, state) { + const el = document.createElement('template'); + el.innerHTML = source; + return render(mapChildren(el.content), state, mdPlainOut, mdHtmlOut); +}