vector-im-hydrogen-web/src/domain/session/room/timeline/deserialize.js

286 lines
9.1 KiB
JavaScript
Raw Normal View History

2021-07-08 08:40:16 +02:00
import { MessageBody, HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart, ImagePart } from "./MessageBody.js"
2021-07-09 22:06:24 +02:00
import sanitizeHtml from "../../../../../lib/sanitize-html/index.js"
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
* allowing the following HTML tags:
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
*/
2021-07-02 09:23:59 +02:00
/**
 * Element names whose only relevant property is the tag itself.
 * Contrast with <a> (which carries `href`) or <img> (which carries `src`).
 */
const basicNodes = [
    "EM",
    "STRONG",
    "CODE",
    "DEL",
    "P",
    "DIV",
    "SPAN",
];
2021-07-02 09:23:59 +02:00
/**
 * Return a builder function for a particular tag.
 *
 * The builder takes the usual `(options, node, children)` handler
 * arguments but only uses `children`, which it wraps in a FormatPart
 * labelled with `tag`.
 *
 * @param {string} tag element name handed to FormatPart
 * @returns {function} builder producing a FormatPart
 */
function basicWrapper(tag) {
    return (_options, _node, children) => new FormatPart(tag, children);
}
2021-07-02 09:23:59 +02:00
/**
 * Return a builder function for a particular header level.
 *
 * The builder takes the usual `(options, node, children)` handler
 * arguments but only uses `children`, which it wraps in a HeaderBlock
 * of the given `level`.
 *
 * @param {number} level header level handed to HeaderBlock
 * @returns {function} builder producing a HeaderBlock
 */
function headerWrapper(level) {
    return (_options, _node, children) => new HeaderBlock(level, children);
}
2021-07-08 09:16:58 +02:00
/**
 * Build a LinkPart from a link element.
 *
 * @param {object} options parser options; `options.result` is used to read attributes
 * @param {object} node the link element
 * @param {Array} children already-parsed children of the link
 * @returns {LinkPart} link carrying the raw `href` attribute value and the children
 */
function parseLink(options, node, children) {
    // TODO Not equivalent to `node.href`!
    // Add another HTMLParseResult method?
    const href = options.result.getAttributeValue(node, "href");
    return new LinkPart(href, children);
}
2021-07-08 09:16:58 +02:00
/**
 * Build a ListBlock from an ordered or unordered list element.
 *
 * Only direct `LI` children become items; any other child is skipped.
 * Each item is the array returned by parseNodes for that `LI`'s children.
 *
 * @param {object} options parser options; `options.result` gives access to the parsed tree
 * @param {object} node the list element
 * @returns {ListBlock} list whose start is a number for `OL` and null otherwise
 */
function parseList(options, node) {
    const { result } = options;
    let start = null;
    if (result.getNodeElementName(node) === "OL") {
        // Always parse as decimal so that e.g. "0x10" is not read as hex 16.
        // Will return 1 for, say, '1A', which may not be intended?
        // Missing, unparsable and zero values all fall back to 1.
        start = parseInt(result.getAttributeValue(node, "start"), 10) || 1;
    }
    const items = [];
    for (const child of result.getChildNodes(node)) {
        if (result.getNodeElementName(child) !== "LI") {
            continue;
        }
        items.push(parseNodes(options, result.getChildNodes(child)));
    }
    return new ListBlock(start, items);
}
2021-07-08 09:16:58 +02:00
/**
 * Build a CodeBlock from a preformatted element.
 *
 * Only the first child is inspected; unless it is a `CODE` element the
 * whole block is rejected. The language is taken from the first class
 * name that starts with "language-" but not with "language-_", and
 * defaults to the empty string.
 *
 * @param {object} options parser options; `options.result` gives access to the parsed tree
 * @param {object} node the preformatted element
 * @returns {CodeBlock|null} the code block, or null when the first child is not `CODE`
 */
function parseCodeBlock(options, node) {
    const { result } = options;
    const [codeNode] = result.getChildNodes(node);
    if (!codeNode || result.getNodeElementName(codeNode) !== "CODE") {
        return null;
    }
    const prefix = "language-";
    const classNames = (result.getAttributeValue(codeNode, "class") || "").split(" ");
    const languageClass = classNames.find(
        (name) => name.startsWith(prefix) && !name.startsWith("language-_")
    );
    const language = languageClass ? languageClass.substring(prefix.length) : "";
    // NOTE(review): `textContent` is read straight off the node rather than
    // through a parse-result accessor - confirm every parser backend provides it.
    return new CodeBlock(language, codeNode.textContent);
}
2021-07-08 09:16:58 +02:00
/**
 * Build an ImagePart from an image element.
 *
 * The `src` attribute (or "" when absent) is passed to
 * `mediaRepository.mxcUrl`; when that yields a falsy value the image is
 * dropped.
 *
 * @param {object} options parser options; uses `options.result` and `options.mediaRepository`
 * @param {object} node the image element
 * @returns {ImagePart|null} the image, or null when no url could be resolved
 */
function parseImage(options, node) {
    const { result, mediaRepository } = options;
    const src = result.getAttributeValue(node, "src") || "";
    const url = mediaRepository.mxcUrl(src);
    // We just ignore non-mxc `src` attributes.
    if (!url) {
        return null;
    }
    const width = result.getAttributeValue(node, "width");
    const height = result.getAttributeValue(node, "height");
    const alt = result.getAttributeValue(node, "alt");
    const title = result.getAttributeValue(node, "title");
    return new ImagePart(url, { width, height, alt, title });
}
/**
 * Build the table of element handlers, keyed by upper-case element name.
 *
 * Every entry is `{ descend, parsefn }`. Entries are registered for the
 * fixed set of special elements below, for each name in `basicNodes`
 * and for the six header levels.
 *
 * @returns {object} map from element name to its handler
 */
function buildNodeMap() {
    const map = {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage },
    };
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        // Upper-case like every other key; a lower-case "h1" key would
        // never be found when looking up an upper-case element name.
        const tag = `H${level}`;
        map[tag] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}
2021-07-02 09:23:59 +02:00
/**
 * Handlers for various nodes, keyed by element name.
 *
 * Each handler has two properties: `descend` and `parsefn`.
 * If `descend` is true, the node's children should be
 * parsed just like any other node, and fed as the third argument
 * (after the options and the node itself) to `parsefn`. If not, the
 * node's children are either to be ignored (as in <br>) or processed
 * specially by `parsefn` (as in <ul> and <pre>).
 *
 * The `parsefn` combines a node's data and its children into
 * an internal representation node.
 */
const nodes = buildNodeMap();
2021-07-08 09:16:58 +02:00
/**
 * Convert a single parsed HTML node into its internal representation.
 *
 * Text nodes become a TextPart. Element nodes are dispatched to the
 * handler registered in `nodes` under their element name; when the
 * handler asks to descend, the children are parsed first and handed to
 * it. Anything else (unknown elements, other node kinds) yields null.
 *
 * @param {object} options parser options; `options.result` gives access to the parsed tree
 * @param {object} node the node to convert
 * @returns {object|null} the converted node, or null when it is not handled
 */
function parseNode(options, node) {
    const { result } = options;
    if (result.isTextNode(node)) {
        return new TextPart(result.getNodeText(node));
    }
    if (!result.isElementNode(node)) {
        return null;
    }
    const handler = nodes[result.getNodeElementName(node)];
    if (!handler) {
        return null;
    }
    // Children are read through the parse result, like everywhere else in
    // this file, rather than straight off the node.
    const children = handler.descend
        ? parseNodes(options, result.getChildNodes(node))
        : null;
    return handler.parsefn(options, node, children);
}
2021-07-08 09:16:58 +02:00
/**
 * Convert a sequence of parsed HTML nodes, dropping those parseNode rejects.
 *
 * @param {object} options parser options, passed through to parseNode
 * @param {Iterable} nodes the nodes to convert
 * @returns {Array} converted nodes, in input order
 */
function parseNodes(options, nodes) {
    const parsed = [];
    for (const htmlNode of nodes) {
        const part = parseNode(options, htmlNode);
        // Just ignore invalid / unknown tags.
        if (part) {
            parsed.push(part);
        }
    }
    return parsed;
}
2021-07-09 22:06:24 +02:00
/**
 * Sanitizer configuration: which tags, which attributes per tag, and
 * which URL schemes are allowed.
 */
const sanitizeConfig = {
    allowedTags: [
        "font", "del",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "blockquote", "p", "a",
        "ul", "ol",
        "sup", "sub",
        "li",
        "b", "i", "u", "strong", "em", "strike", "code",
        "hr", "br", "div",
        "table", "thead", "tbody", "tr", "th", "td", "caption",
        "pre", "span", "img",
    ],
    allowedAttributes: {
        font: ["data-mx-bg-color", "data-mx-color"],
        span: ["data-mx-bg-color", "data-mx-color"],
        a: ["name", "target", "href"],
        img: ["width", "height", "alt", "title", "src"],
        ol: ["start"],
        code: ["class"],
    },
    allowedSchemes: ["http", "https", "ftp", "mailto", "tel", "mxc"],
};
2021-07-08 08:40:16 +02:00
/**
 * Parse an HTML message body into a MessageBody.
 *
 * The html is first run through `sanitizeHtml` with `sanitizeConfig`,
 * then parsed with `platform.parseHTML`, and the resulting root nodes
 * are converted with parseNodes.
 *
 * @param {object} deps `mediaRepository` (forwarded to the node handlers) and `platform` (provides `parseHTML`)
 * @param {string} html the html to parse
 * @returns {MessageBody} body built from the original `html` and the parsed parts
 */
export function parseHTMLBody({ mediaRepository, platform }, html) {
    const sanitized = sanitizeHtml(html, sanitizeConfig);
    const parseResult = platform.parseHTML(sanitized);
    const options = { result: parseResult, mediaRepository };
    const parts = parseNodes(options, parseResult.rootNodes);
    // The original, unsanitized html is what gets stored alongside the parts.
    return new MessageBody(html, parts);
}
2021-07-07 04:11:05 +02:00
import parse from '../../../../../lib/node-html-parser/index.js';
2021-07-07 04:11:05 +02:00
/**
 * Thin accessor layer over a parsed HTML tree.
 *
 * Every method simply forwards to a property or method of the node it
 * is given; a node counts as an element when it has a truthy `tagName`
 * and as text otherwise.
 */
export class HTMLParseResult {
    constructor(bodyNode) {
        this._bodyNode = bodyNode;
    }

    /** Children of the body node this result was created with. */
    get rootNodes() {
        const { childNodes } = this._bodyNode;
        return childNodes;
    }

    getChildNodes({ childNodes }) {
        return childNodes;
    }

    getAttributeNames(node) {
        const names = node.getAttributeNames();
        return names;
    }

    getAttributeValue(node, attr) {
        const value = node.getAttribute(attr);
        return value;
    }

    isTextNode({ tagName }) {
        return !tagName;
    }

    getNodeText({ text }) {
        return text;
    }

    isElementNode({ tagName }) {
        return Boolean(tagName);
    }

    getNodeElementName({ tagName }) {
        return tagName;
    }
}
/**
 * Minimal stand-in exposing only `parseHTML`, which runs the imported
 * `parse` function and wraps its output in an HTMLParseResult.
 */
const platform = {
    parseHTML(html) {
        const root = parse(html);
        return new HTMLParseResult(root);
    },
};
/**
 * Unit tests for parseHTMLBody.
 *
 * @returns {object} map from test name to a function taking an `assert` object
 */
export function tests() {
    /**
     * Parse `input` (with no media repository and the local `platform`)
     * and compare against a MessageBody built from `input` and `output`.
     */
    function test(assert, input, output) {
        assert.deepEqual(parseHTMLBody({ mediaRepository: null, platform }, input), new MessageBody(input, output));
    }

    return {
        "Text only": assert => {
            const input = "This is a sentence";
            const output = [new TextPart(input)];
            test(assert, input, output);
        },
        "Text with inline code format": assert => {
            const input = "Here's <em>some</em> <code>code</code>!";
            const output = [
                new TextPart("Here's "),
                new FormatPart("em", [new TextPart("some")]),
                new TextPart(" "),
                new FormatPart("code", [new TextPart("code")]),
                new TextPart("!")
            ];
            test(assert, input, output);
        },
        "Text with ordered list with no attributes": assert => {
            const input = "<ol><li>Lorem</li><li>Ipsum</li></ol>";
            const output = [
                new ListBlock(1, [
                    [ new TextPart("Lorem") ],
                    [ new TextPart("Ipsum") ]
                ])
            ];
            test(assert, input, output);
        },
        "Text with ordered list starting at 3": assert => {
            const input = '<ol start="3"><li>Lorem</li><li>Ipsum</li></ol>';
            const output = [
                new ListBlock(3, [
                    [ new TextPart("Lorem") ],
                    [ new TextPart("Ipsum") ]
                ])
            ];
            test(assert, input, output);
        },
        "Text with unordered list": assert => {
            // The `start` attribute is only honoured on ordered lists.
            const input = '<ul start="3"><li>Lorem</li><li>Ipsum</li></ul>';
            const output = [
                new ListBlock(null, [
                    [ new TextPart("Lorem") ],
                    [ new TextPart("Ipsum") ]
                ])
            ];
            test(assert, input, output);
        },
        /* Doesn't work: HTML library doesn't handle <pre><code> properly.
           NOTE(review): parseCodeBlock starts from "" (not null) as the
           language, so the expected CodeBlock below probably needs
           updating before this is re-enabled - confirm.
        "Text with code block": assert => {
            const code = 'main :: IO ()\nmain = putStrLn "Hello"'
            const input = `<pre><code>${code}</code></pre>`;
            const output = [
                new CodeBlock(null, code)
            ];
            test(assert, input, output);
        }
        */
    };
}