vector-im-hydrogen-web/src/domain/session/room/timeline/deserialize.js

import { MessageBody, HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart, ImagePart } from "./MessageBody.js"
import sanitizeHtml from "../../../../../lib/sanitize-html/index.js"

/* At the time of writing (Jul 1 2021), Matrix Spec recommends
 * allowing the following HTML tags:
 *     font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
 *     strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
 */

/**
 * Nodes that don't have any properties to them other than their tag.
 * While <a> has `href`, and <img> has `src`, these have... themselves.
 *
 * Each name is used verbatim as a key of the handler map created by
 * buildNodeMap(), and is also the format passed to the FormatPart
 * that basicWrapper() produces for it.
 */
const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ]

/**
 * Create the parse function for a tag that carries no data besides its name.
 *
 * @param {string} tag format name given to every FormatPart the builder creates.
 * @returns {function} builder with the `(options, node, children)` handler
 *     signature; only `children` is used.
 */
function basicWrapper(tag) {
    return function buildFormatPart(_options, _node, children) {
        return new FormatPart(tag, children);
    };
}

/**
 * Create the parse function for a header of a fixed level.
 *
 * @param {number} level header level handed to every HeaderBlock the builder creates.
 * @returns {function} builder with the `(options, node, children)` handler
 *     signature; only `children` is used.
 */
function headerWrapper(level) {
    return function buildHeaderBlock(_options, _node, children) {
        return new HeaderBlock(level, children);
    };
}

/**
 * Handler for <a>: wraps the already-parsed children in a LinkPart.
 *
 * @param {object} options parse options; `options.result` is the parse-result adapter.
 * @param {object} node the <a> node.
 * @param {Array} children parts produced from the node's children.
 * @returns {LinkPart} link whose target is the raw `href` attribute value.
 */
function parseLink(options, node, children) {
    const { result } = options;
    // TODO Not equivalent to `node.href`!
    // Add another HTMLParseResult method?
    return new LinkPart(result.getAttributeValue(node, "href"), children);
}

/**
 * Handler for <ul> and <ol>: turns every direct <li> child into a list item.
 *
 * Children that are not <li> elements (text between items, stray tags)
 * are skipped.
 *
 * @param {object} options parse options; `options.result` is the parse-result adapter.
 * @param {object} node the <ul> or <ol> node.
 * @returns {ListBlock} block whose start number is `null` for <ul>, and for
 *     <ol> the decimal value of the `start` attribute (1 when the attribute
 *     is missing or does not begin with an integer).
 */
function parseList(options, node) {
    const { result } = options;
    let start = null;
    if (result.getNodeElementName(node) === "OL") {
        // Parse as decimal and only fall back when nothing could be parsed,
        // so that legitimate values such as start="0" are kept.
        // Will still return 1 for, say, '1A', which may not be intended?
        const parsedStart = Number.parseInt(result.getAttributeValue(node, "start"), 10);
        start = Number.isNaN(parsedStart) ? 1 : parsedStart;
    }
    const items = [];
    for (const child of result.getChildNodes(node)) {
        if (result.getNodeElementName(child) !== "LI") {
            continue;
        }
        items.push(parseNodes(options, result.getChildNodes(child)));
    }
    return new ListBlock(start, items);
}

/**
 * Handler for <pre>: recognises the `<pre><code>…</code></pre>` shape.
 *
 * Only the first child of the <pre> is inspected. When it is not a <code>
 * element the whole <pre> is dropped by returning `null`.
 *
 * The language is taken from the first `language-xyz` class on the <code>
 * element; classes starting with `language-_` are skipped. It is the empty
 * string when no such class exists.
 *
 * @param {object} options parse options; `options.result` is the parse-result adapter.
 * @param {object} node the <pre> node.
 * @returns {CodeBlock|null} the code block, or `null` when <pre> does not start with <code>.
 */
function parseCodeBlock(options, node) {
    const { result } = options;
    // Destructuring pulls just the first child (undefined for an empty <pre>).
    const [codeNode] = result.getChildNodes(node);
    if (!(codeNode && result.getNodeElementName(codeNode) === "CODE")) {
        return null;
    }
    let language = "";
    const cl = result.getAttributeValue(codeNode, "class") || "";
    // Class names may be separated by any whitespace, not only single spaces.
    for (const clname of cl.split(/\s+/)) {
        if (clname.startsWith("language-") && !clname.startsWith("language-_")) {
            language = clname.substring("language-".length);
            break;
        }
    }
    // Go through the adapter like every other node access, rather than
    // reading a property straight off the underlying node.
    return new CodeBlock(language, result.getNodeText(codeNode));
}

/**
 * Handler for <img>: resolves the `src` attribute through the media
 * repository and builds an ImagePart from the result.
 *
 * @param {object} options parse options; uses `options.result` (parse-result
 *     adapter) and `options.mediaRepository` (must provide `mxcUrl(src)`).
 * @param {object} node the <img> node.
 * @returns {ImagePart|null} the image, or `null` when `mxcUrl` yields nothing
 *     for the given `src`.
 */
function parseImage(options, node) {
    const { result, mediaRepository } = options;
    const attribute = name => result.getAttributeValue(node, name);

    const url = mediaRepository.mxcUrl(attribute("src") || "");
    // We just ignore non-mxc `src` attributes.
    if (!url) {
        return null;
    }
    return new ImagePart(url, {
        width: attribute("width"),
        height: attribute("height"),
        alt: attribute("alt"),
        title: attribute("title"),
    });
}

/**
 * Build the tag-name -> handler table used by parseNode().
 *
 * Every key is an upper-case element name, matching the other tag
 * comparisons in this file ("OL", "LI", "CODE") and the entries of
 * `basicNodes`.
 *
 * @returns {object} map from element name to `{ descend, parsefn }`.
 */
function buildNodeMap() {
    const map = {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage }
    };
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        // Must be "H1".."H6": a lower-case key would never be hit by the
        // upper-case names the rest of the map is keyed on, and headers
        // would be dropped as unknown tags.
        map[`H${level}`] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}

/**
 * Handlers for various nodes, keyed by element name.
 *
 * Each handler has two properties: `descend` and `parsefn`.
 * If `descend` is true, the node's children are parsed just like
 * any other node and passed as the third argument to `parsefn`
 * (after the parse options and the node itself). If not, `parsefn`
 * receives `null` for the children, which are either to be ignored
 * (as in <pre>) or processed specially (as in <ul>).
 *
 * The `parsefn` combines a node's data and its children into
 * an internal representation node.
 */
const nodes = buildNodeMap();

/**
 * Convert a single parsed HTML node into its internal representation.
 *
 * Text nodes become a TextPart. Element nodes are dispatched to the
 * handler registered for their element name; when the handler asks to
 * `descend`, the children are parsed first and handed to it.
 *
 * @param {object} options parse options; `options.result` is the parse-result adapter.
 * @param {object} node node obtained from the parse result.
 * @returns {object|null} the resulting part, or `null` for nodes that are
 *     neither text nor an element with a registered handler (a handler may
 *     itself return `null`).
 */
function parseNode(options, node) {
    const { result } = options;
    if (result.isTextNode(node)) {
        return new TextPart(result.getNodeText(node));
    }
    if (!result.isElementNode(node)) {
        return null;
    }
    const name = result.getNodeElementName(node);
    // Own-property check so that names like "constructor" don't resolve
    // to something inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(nodes, name)) {
        return null;
    }
    const handler = nodes[name];
    // Children are fetched through the adapter, like everywhere else,
    // instead of reading `childNodes` off the underlying node.
    const children = handler.descend ? parseNodes(options, result.getChildNodes(node)) : null;
    return handler.parsefn(options, node, children);
}

/**
 * Convert a sequence of parsed HTML nodes into internal parts.
 *
 * @param {object} options parse options, forwarded unchanged to parseNode().
 * @param {Iterable} nodes nodes to convert, in document order.
 * @returns {Array} the parts for which parseNode() returned something truthy,
 *     in the same order as the input.
 */
function parseNodes(options, nodes) {
    return [...nodes]
        .map(htmlNode => parseNode(options, htmlNode))
        // Just ignore invalid / unknown tags.
        .filter(part => Boolean(part));
}

/**
 * Configuration handed to `sanitizeHtml` in parseHTMLBody() before the
 * HTML is parsed. The option names are those of the sanitize-html library
 * imported at the top of this file; see that library for their exact
 * semantics.
 */
const sanitizeConfig = {
    // Same list as the Matrix Spec recommendation quoted at the top of this
    // file. Not every tag listed here has a handler in buildNodeMap(); tags
    // without one are dropped later by parseNode().
    allowedTags: [
        "font", "del", "h1", "h2", "h3", "h4", "h5", "h6",
        "blockquote", "p", "a", "ul", "ol", "sup", "sub", "li",
        "b", "i", "u", "strong", "em", "strike", "code", "hr",
        "br", "div", "table", "thead", "tbody", "tr", "th", "td",
        "caption", "pre", "span", "img"
    ],
    // Per-tag attribute lists. They include the attributes the handlers in
    // this file read: `href` (parseLink), `start` (parseList),
    // `class` (parseCodeBlock) and the <img> attributes (parseImage).
    allowedAttributes: {
        "font": ["data-mx-bg-color", "data-mx-color"],
        "span": ["data-mx-bg-color", "data-mx-color"],
        "a": ["name", "target", "href"],
        "img": ["width", "height", "alt", "title", "src"],
        "ol": ["start"],
        "code": ["class"]
    },
    // "mxc" is listed in addition to the usual web schemes; parseImage()
    // passes <img> `src` values to mediaRepository.mxcUrl().
    allowedSchemes: [ "http", "https", "ftp", "mailto", "tel", "mxc" ]
};

/**
 * Turn an HTML message body into a MessageBody.
 *
 * The HTML is first run through `sanitizeHtml` with `sanitizeConfig`, then
 * parsed by the platform, and the resulting root nodes are converted with
 * parseNodes().
 *
 * @param {object} deps `mediaRepository` (forwarded to the node handlers) and
 *     `platform` (must provide `parseHTML(html)` returning a parse result
 *     with `rootNodes`).
 * @param {string} html the HTML to parse.
 * @returns {MessageBody} body built from the original, unsanitized `html`
 *     string and the parsed parts.
 */
export function parseHTMLBody({ mediaRepository, platform }, html) {
    const cleanHtml = sanitizeHtml(html, sanitizeConfig);
    const result = platform.parseHTML(cleanHtml);
    const parts = parseNodes({ result, mediaRepository }, result.rootNodes);
    return new MessageBody(html, parts);
}

import parse from '../../../../../lib/node-html-parser/index.js';

/**
 * Parse-result adapter over a tree of nodes that expose `childNodes`,
 * `tagName`, `text`, `getAttribute()` and `getAttributeNames()`.
 *
 * A node counts as an element when it has a truthy `tagName`, and as a
 * text node otherwise.
 */
export class HTMLParseResult {
    /**
     * @param {object} bodyNode root of the parsed tree; its children are the root nodes.
     */
    constructor(bodyNode) {
        this._bodyNode = bodyNode;
    }

    /** Children of the root node. */
    get rootNodes() {
        const { childNodes } = this._bodyNode;
        return childNodes;
    }

    /** Children of `node`. */
    getChildNodes(node) {
        const { childNodes } = node;
        return childNodes;
    }

    /** Whatever `node.getAttributeNames()` reports. */
    getAttributeNames(node) {
        const names = node.getAttributeNames();
        return names;
    }

    /** Whatever `node.getAttribute(attr)` reports. */
    getAttributeValue(node, attr) {
        const value = node.getAttribute(attr);
        return value;
    }

    /** True when `node` has no (truthy) tag name. */
    isTextNode(node) {
        const { tagName } = node;
        return !tagName;
    }

    /** The node's `text` property. */
    getNodeText(node) {
        const { text } = node;
        return text;
    }

    /** True when `node` has a (truthy) tag name. */
    isElementNode(node) {
        return Boolean(node.tagName);
    }

    /** The node's `tagName` property. */
    getNodeElementName(node) {
        const { tagName } = node;
        return tagName;
    }
}

/**
 * Stand-in platform object handed to parseHTMLBody() by tests(): parses
 * HTML with the imported `parse` function and wraps the result in an
 * HTMLParseResult.
 */
const platform = {
    parseHTML(html) {
        const bodyNode = parse(html);
        return new HTMLParseResult(bodyNode);
    }
};

/**
 * Unit tests for parseHTMLBody().
 *
 * @returns {object} map from test name to a function taking the `assert`
 *     object; each test parses an HTML snippet with the local `platform`
 *     stand-in and compares the outcome with a hand-built MessageBody.
 */
export function tests() {
    // Parse `html` (without a media repository) and compare against the
    // MessageBody expected for the given parts.
    const expectParts = (assert, html, parts) => {
        const actual = parseHTMLBody({ mediaRepository: null, platform }, html);
        assert.deepEqual(actual, new MessageBody(html, parts));
    };

    // All list fixtures below contain these same two items.
    const loremIpsumItems = () => [
        [ new TextPart("Lorem") ],
        [ new TextPart("Ipsum") ]
    ];

    return {
        "Text only": assert => {
            const sentence = "This is a sentence";
            expectParts(assert, sentence, [new TextPart(sentence)]);
        },
        "Text with inline code format": assert => {
            expectParts(assert, "Here's <em>some</em> <code>code</code>!", [
                new TextPart("Here's "),
                new FormatPart("em", [new TextPart("some")]),
                new TextPart(" "),
                new FormatPart("code", [new TextPart("code")]),
                new TextPart("!")
            ]);
        },
        "Text with ordered list with no attributes": assert => {
            expectParts(assert, "<ol><li>Lorem</li><li>Ipsum</li></ol>", [
                new ListBlock(1, loremIpsumItems())
            ]);
        },
        "Text with ordered list starting at 3": assert => {
            expectParts(assert, '<ol start="3"><li>Lorem</li><li>Ipsum</li></ol>', [
                new ListBlock(3, loremIpsumItems())
            ]);
        },
        "Text with unordered list": assert => {
            // `start` on a <ul> must be ignored.
            expectParts(assert, '<ul start="3"><li>Lorem</li><li>Ipsum</li></ul>', [
                new ListBlock(null, loremIpsumItems())
            ]);
        },
        /* Doesn't work: HTML library doesn't handle <pre><code> properly.
        "Text with code block": assert => {
            const code = 'main :: IO ()\nmain = putStrLn "Hello"'
            const input = `<pre><code>${code}</code></pre>`;
            const output = [
                new CodeBlock(null, code)
            ];
            test(assert, input, output);
        }
        */
    };
}
Start working on images. 2021-07-08 08:40:16 +02:00			`import { MessageBody, HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart, ImagePart } from "./MessageBody.js"`
Add HTML sanitization to domain layer. 2021-07-09 22:06:24 +02:00			`import sanitizeHtml from "../../../../../lib/sanitize-html/index.js"`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00
			`/* At the time of writing (Jul 1 2021), Matrix Spec recommends`
			`* allowing the following HTML tags:`
			`* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,`
			`* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img`
			`*/`

Add some comments. 2021-07-02 09:23:59 +02:00			`/**`
			`* Nodes that don't have any properties to them other than their tag.`
			* While <a> has `href`, and <img> has `src`, these have... themselves.
			`*/`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ]`

Add some comments. 2021-07-02 09:23:59 +02:00			`/**`
			`* Return a builder function for a particular tag.`
			`*/`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`function basicWrapper(tag) {`
Display images in messages 2021-07-08 09:16:58 +02:00			`return (_, node, children) => new FormatPart(tag, children);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`

Add some comments. 2021-07-02 09:23:59 +02:00			`/**`
			`* Return a builder function for a particular header level.`
			`*/`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`function headerWrapper(level) {`
Display images in messages 2021-07-08 09:16:58 +02:00			`return (_, node, children) => new HeaderBlock(level, children);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseLink(options, node, children) {`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			// TODO Not equivalent to `node.href`!
			`// Add another HTMLParseResult method?`
Display images in messages 2021-07-08 09:16:58 +02:00			`let href = options.result.getAttributeValue(node, "href");`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`return new LinkPart(href, children);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseList(options, node) {`
			`const { result } = options;`
Better handle list start attribute. 2021-07-07 01:14:35 +02:00			`let start = null;`
Display lists. 2021-07-07 01:48:31 +02:00			`if (result.getNodeElementName(node) === "OL") {`
Better handle list start attribute. 2021-07-07 01:14:35 +02:00			`// Will return 1 for, say, '1A', which may not be intended?`
			`start = parseInt(result.getAttributeValue(node, "start")) \|\| 1;`
			`}`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`const nodes = [];`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`for (const child of result.getChildNodes(node)) {`
			`if (result.getNodeElementName(child) !== "LI") {`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`continue;`
			`}`
Display images in messages 2021-07-08 09:16:58 +02:00			`const item = parseNodes(options, result.getChildNodes(child));`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`nodes.push(item);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`
			`return new ListBlock(start, nodes);`
			`}`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseCodeBlock(options, node) {`
			`const { result } = options;`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`let codeNode;`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`for (const child of result.getChildNodes(node)) {`
			`codeNode = child;`
			`break;`
			`}`
			`if (!(codeNode && result.getNodeElementName(codeNode) === "CODE")) {`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`return null;`
			`}`
			`let language = "";`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`const cl = result.getAttributeValue(codeNode, "class") \|\| ""`
			`for (const clname of cl.split(" ")) {`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`if (clname.startsWith("language-") && !clname.startsWith("language-_")) {`
			`language = clname.substring(9) // "language-".length`
			`break;`
			`}`
			`}`
			`return new CodeBlock(language, codeNode.textContent);`
			`}`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseImage(options, node) {`
			`const { result, mediaRepository } = options;`
Start working on images. 2021-07-08 08:40:16 +02:00			`const src = result.getAttributeValue(node, "src") \|\| "";`
Display images in messages 2021-07-08 09:16:58 +02:00			`const url = mediaRepository.mxcUrl(src);`
Start working on images. 2021-07-08 08:40:16 +02:00			// We just ignore non-mxc `src` attributes.
Display images in messages 2021-07-08 09:16:58 +02:00			`if (!url) {`
Start working on images. 2021-07-08 08:40:16 +02:00			`return null;`
			`}`
			`const width = result.getAttributeValue(node, "width");`
			`const height = result.getAttributeValue(node, "height");`
			`const alt = result.getAttributeValue(node, "alt");`
			`const title = result.getAttributeValue(node, "title");`
Display images in messages 2021-07-08 09:16:58 +02:00			`return new ImagePart(url, { width, height, alt, title });`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`

			`function buildNodeMap() {`
			`let map = {`
			`A: { descend: true, parsefn: parseLink },`
			`UL: { descend: false, parsefn: parseList },`
			`OL: { descend: false, parsefn: parseList },`
			`PRE: { descend: false, parsefn: parseCodeBlock },`
			`BR: { descend: false, parsefn: () => new NewLinePart() },`
			`HR: { descend: false, parsefn: () => new RulePart() },`
			`IMG: { descend: false, parsefn: parseImage }`
			`}`
			`for (const tag of basicNodes) {`
			`map[tag] = { descend: true, parsefn: basicWrapper(tag) }`
			`}`
			`for (let level = 1; level <= 6; level++) {`
			`const tag = "h" + level;`
			`map[tag] = { descend: true, parsefn: headerWrapper(level) }`
			`}`
			`return map;`
			`}`

Add some comments. 2021-07-02 09:23:59 +02:00			`/**`
			`* Handlers for various nodes.`
			`*`
			* Each handler has two properties: `descend` and `parsefn`.
			* If `descend` is true, the node's children should be
			`* parsed just like any other node, and fed as a second argument`
			* to `parsefn`. If not, the node's children are either to be ignored
			`* (as in <pre>) or processed specially (as in <ul>).`
			`*`
			* The `parsefn` combines a node's data and its children into
			`* an internal representation node.`
			`*/`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`const nodes = buildNodeMap();`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseNode(options, node) {`
			`const { result } = options;`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`if (result.isTextNode(node)) {`
			`return new TextPart(result.getNodeText(node));`
			`} else if (result.isElementNode(node)) {`
			`const f = nodes[result.getNodeElementName(node)];`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`if (!f) {`
			`return null;`
			`}`
Display images in messages 2021-07-08 09:16:58 +02:00			`const children = f.descend ? parseNodes(options, node.childNodes) : null;`
			`return f.parsefn(options, node, children);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`
			`return null;`
			`}`

Display images in messages 2021-07-08 09:16:58 +02:00			`function parseNodes(options, nodes) {`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`const parsed = [];`
Move HTML parsing into platform. 2021-07-03 00:05:50 +02:00			`for (const htmlNode of nodes) {`
Display images in messages 2021-07-08 09:16:58 +02:00			`let node = parseNode(options, htmlNode);`
Add some comments. 2021-07-02 09:23:59 +02:00			`// Just ignore invalid / unknown tags.`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`if (node) {`
			`parsed.push(node);`
			`}`
			`}`
			`return parsed;`
			`}`

Add HTML sanitization to domain layer. 2021-07-09 22:06:24 +02:00			`const sanitizeConfig = {`
			`allowedTags: [`
			`"font", "del", "h1", "h2", "h3", "h4", "h5", "h6",`
			`"blockquote", "p", "a", "ul", "ol", "sup", "sub", "li",`
			`"b", "i", "u", "strong", "em", "strike", "code", "hr",`
			`"br", "div", "table", "thead", "tbody", "tr", "th", "td",`
			`"caption", "pre", "span", "img"`
			`],`
			`allowedAttributes: {`
			`"font": ["data-mx-bg-color", "data-mx-color"],`
			`"span": ["data-mx-bg-color", "data-mx-color"],`
			`"a": ["name", "target", "href"],`
			`"img": ["width", "height", "alt", "title", "src"],`
			`"ol": ["start"],`
			`"code": ["class"]`
			`},`
			`allowedSchemes: [ "http", "https", "ftp", "mailto", "tel", "mxc" ]`
			`};`

Start working on images. 2021-07-08 08:40:16 +02:00			`export function parseHTMLBody({ mediaRepository, platform }, html) {`
Add HTML sanitization to domain layer. 2021-07-09 22:06:24 +02:00			`const parseResult = platform.parseHTML(sanitizeHtml(html, sanitizeConfig));`
Display images in messages 2021-07-08 09:16:58 +02:00			`const options = { result: parseResult, mediaRepository };`
			`const parts = parseNodes(options, parseResult.rootNodes);`
Make `parse` a function that returns a MessageBody 2021-07-03 03:20:07 +02:00			`return new MessageBody(html, parts);`
Begin a parser implementation from HTML into an internal representation. 2021-07-02 09:18:37 +02:00			`}`
Add some deserialization tests. 2021-07-07 04:11:05 +02:00
Move deserialization into domain and fix tests. 2021-07-07 21:21:10 +02:00			`import parse from '../../../../../lib/node-html-parser/index.js';`
Add some deserialization tests. 2021-07-07 04:11:05 +02:00
			`export class HTMLParseResult {`
			`constructor(bodyNode) {`
			`this._bodyNode = bodyNode;`
			`}`

			`get rootNodes() {`
			`return this._bodyNode.childNodes;`
			`}`

			`getChildNodes(node) {`
			`return node.childNodes;`
			`}`

			`getAttributeNames(node) {`
			`return node.getAttributeNames();`
			`}`

			`getAttributeValue(node, attr) {`
			`return node.getAttribute(attr);`
			`}`

			`isTextNode(node) {`
			`return !node.tagName;`
			`}`

			`getNodeText(node) {`
			`return node.text;`
			`}`

			`isElementNode(node) {`
			`return !!node.tagName;`
			`}`

			`getNodeElementName(node) {`
			`return node.tagName;`
			`}`
			`}`

			`const platform = {`
			`parseHTML: (html) => new HTMLParseResult(parse(html))`
			`};`

			`export function tests() {`
			`function test(assert, input, output) {`
Start working on images. 2021-07-08 08:40:16 +02:00			`assert.deepEqual(parseHTMLBody({ mediaRepository: null, platform }, input), new MessageBody(input, output));`
Add some deserialization tests. 2021-07-07 04:11:05 +02:00			`}`

			`return {`
			`"Text only": assert => {`
			`const input = "This is a sentence";`
			`const output = [new TextPart(input)];`
			`test(assert, input, output);`
			`},`
			`"Text with inline code format": assert => {`
			`const input = "Here's <em>some</em> <code>code</code>!";`
			`const output = [`
			`new TextPart("Here's "),`
			`new FormatPart("em", [new TextPart("some")]),`
			`new TextPart(" "),`
			`new FormatPart("code", [new TextPart("code")]),`
			`new TextPart("!")`
			`];`
			`test(assert, input, output);`
			`},`
			`"Text with ordered list with no attributes": assert => {`
			`const input = "<ol><li>Lorem</li><li>Ipsum</li></ol>";`
			`const output = [`
			`new ListBlock(1, [`
			`[ new TextPart("Lorem") ],`
			`[ new TextPart("Ipsum") ]`
			`])`
			`];`
			`test(assert, input, output);`
			`},`
			`"Text with ordered list starting at 3": assert => {`
			`const input = '<ol start="3"><li>Lorem</li><li>Ipsum</li></ol>';`
			`const output = [`
			`new ListBlock(3, [`
			`[ new TextPart("Lorem") ],`
			`[ new TextPart("Ipsum") ]`
			`])`
			`];`
			`test(assert, input, output);`
			`},`
			`"Text with unordered list": assert => {`
			`const input = '<ul start="3"><li>Lorem</li><li>Ipsum</li></ul>';`
			`const output = [`
			`new ListBlock(null, [`
			`[ new TextPart("Lorem") ],`
			`[ new TextPart("Ipsum") ]`
			`])`
			`];`
			`test(assert, input, output);`
			`},`
			`/* Doesnt work: HTML library doesn't handle <pre><code> properly.`
			`"Text with code block": assert => {`
			`const code = 'main :: IO ()\nmain = putStrLn "Hello"'`
			const input = `<pre><code>${code}</code></pre>`;
			`const output = [`
			`new CodeBlock(null, code)`
			`];`
			`test(assert, input, output);`
			`}`
			`*/`
			`};`
			`}`