/*
Copyright 2021 The Matrix.org Foundation C.I.C.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import { MessageBody, HeaderBlock, TableBlock, ListBlock, CodeBlock, PillPart, FormatPart, NewLinePart, RulePart, TextPart, LinkPart, ImagePart } from "./MessageBody.js"
import { linkify } from "./linkify/linkify.js";
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
* allowing the following HTML tags:
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
*/
/**
* Nodes that don't have any properties to them other than their tag.
* While has `href`, and has `src`, these have... themselves.
*/
const basicInline = ["EM", "STRONG", "CODE", "DEL", "SPAN" ];
const basicBlock = ["DIV", "BLOCKQUOTE"];
const safeSchemas = ["https", "http", "ftp", "mailto", "magnet"].map(name => `${name}://`);
const baseUrl = 'https://matrix.to';
const linkPrefix = `${baseUrl}/#/`;
class Deserializer {
constructor(result, mediaRepository) {
this.result = result;
this.mediaRepository = mediaRepository;
}
parsePillLink(link) {
if (!link.startsWith(linkPrefix)) {
return null;
}
const contents = link.substring(linkPrefix.length);
if (contents[0] === '@') {
return contents;
}
return null;
}
parseLink(node, children) {
const href = this.result.getAttributeValue(node, "href");
const lcUrl = href?.toLowerCase();
// urls should be absolute and with a safe schema, as listed in the spec
if (!lcUrl || !safeSchemas.some(schema => lcUrl.startsWith(schema))) {
return new FormatPart("span", children);
}
const pillId = this.parsePillLink(href);
if (pillId) {
return new PillPart(pillId, href, children);
}
return new LinkPart(href, children);
}
parseList(node) {
const result = this.result;
let start = null;
if (result.getNodeElementName(node) === "OL") {
// Will return 1 for, say, '1A', which may not be intended?
start = parseInt(result.getAttributeValue(node, "start")) || 1;
}
const items = [];
for (const child of result.getChildNodes(node)) {
if (result.getNodeElementName(child) !== "LI") {
continue;
}
const item = this.parseAnyNodes(result.getChildNodes(child));
items.push(item);
}
return new ListBlock(start, items);
}
_ensureElement(node, tag) {
return node &&
this.result.isElementNode(node) &&
this.result.getNodeElementName(node) === tag;
}
parseCodeBlock(node) {
const result = this.result;
let codeNode;
for (const child of result.getChildNodes(node)) {
codeNode = child;
break;
}
let language = null;
if (!this._ensureElement(codeNode, "CODE")) {
return new CodeBlock(language, this.result.getNodeText(node));
}
const cl = result.getAttributeValue(codeNode, "class") || ""
for (const clname of cl.split(" ")) {
if (clname.startsWith("language-") && !clname.startsWith("language-_")) {
language = clname.substring(9) // "language-".length
break;
}
}
return new CodeBlock(language, this.result.getNodeText(codeNode));
}
parseImage(node) {
const result = this.result;
const src = result.getAttributeValue(node, "src") || "";
const url = this.mediaRepository.mxcUrl(src);
// We just ignore non-mxc `src` attributes.
if (!url) {
return null;
}
const width = parseInt(result.getAttributeValue(node, "width")) || null;
const height = parseInt(result.getAttributeValue(node, "height")) || null;
const alt = result.getAttributeValue(node, "alt");
const title = result.getAttributeValue(node, "title");
return new ImagePart(url, width, height, alt, title);
}
parseTableRow(row, tag) {
const cells = [];
for (const node of this.result.getChildNodes(row)) {
if(!this._ensureElement(node, tag)) {
continue;
}
const children = this.result.getChildNodes(node);
const inlines = this.parseInlineNodes(children);
cells.push(inlines);
}
return cells;
}
parseTableHead(head) {
let headRow = null;
for (const node of this.result.getChildNodes(head)) {
headRow = node;
break;
}
if (this._ensureElement(headRow, "TR")) {
return this.parseTableRow(headRow, "TH");
}
return null;
}
parseTableBody(body) {
const rows = [];
for (const node of this.result.getChildNodes(body)) {
if(!this._ensureElement(node, "TR")) {
continue;
}
rows.push(this.parseTableRow(node, "TD"));
}
return rows;
}
parseTable(node) {
// We are only assuming iterable, so convert to arrary for indexing.
const children = Array.from(this.result.getChildNodes(node));
let head, body;
if (this._ensureElement(children[0], "THEAD") && this._ensureElement(children[1], "TBODY")) {
head = this.parseTableHead(children[0]);
body = this.parseTableBody(children[1]);
} else if (this._ensureElement(children[0], "TBODY")) {
head = null;
body = this.parseTableBody(children[0]);
}
return new TableBlock(head, body);
}
/** Once a node is known to be an element,
* attempt to interpret it as an inline element.
*
* @returns the inline message part, or null if the element
* is not inline or not allowed.
*/
parseInlineElement(node) {
const result = this.result;
const tag = result.getNodeElementName(node);
const children = result.getChildNodes(node);
switch (tag) {
case "A": {
const inlines = this.parseInlineNodes(children);
return this.parseLink(node, inlines);
}
case "BR":
return new NewLinePart();
default: {
if (!basicInline.includes(tag)) {
return null;
}
const inlines = this.parseInlineNodes(children);
return new FormatPart(tag, inlines);
}
}
}
/** Attempt to interpret a node as inline.
*
* @returns the inline message part, or null if the
* element is not inline or not allowed.
*/
parseInlineNode(node) {
if (this.result.isElementNode(node)) {
return this.parseInlineElement(node);
}
return null;
}
/** Once a node is known to be an element, attempt
* to interpret it as a block element.
*
* @returns the block message part, or null of the
* element is not a block or not allowed.
*/
parseBlockElement(node) {
const result = this.result;
const tag = result.getNodeElementName(node);
const children = result.getChildNodes(node);
switch (tag) {
case "H1":
case "H2":
case "H3":
case "H4":
case "H5":
case "H6": {
const inlines = this.parseInlineNodes(children);
return new HeaderBlock(parseInt(tag[1]), inlines)
}
case "UL":
case "OL":
return this.parseList(node);
case "PRE":
return this.parseCodeBlock(node);
case "HR":
return new RulePart();
case "IMG":
return this.parseImage(node);
case "P": {
const inlines = this.parseInlineNodes(children);
return new FormatPart(tag, inlines);
}
case "TABLE":
return this.parseTable(node);
default: {
if (!basicBlock.includes(tag)) {
return null;
}
const blocks = this.parseAnyNodes(children);
return new FormatPart(tag, blocks);
}
}
}
/** Attempt to parse a node as a block.
*
* @return the block message part, or null if the node
* is not a block element.
*/
parseBlockNode(node) {
if (this.result.isElementNode(node)) {
return this.parseBlockElement(node);
}
return null;
}
_parseTextParts(node, into) {
if(!this.result.isTextNode(node)) {
return false;
}
// XXX pretty much identical to `MessageBody`'s.
const linkifyCallback = (text, isLink) => {
if (isLink) {
into.push(new LinkPart(text, [new TextPart(text)]));
} else {
into.push(new TextPart(text));
}
};
linkify(this.result.getNodeText(node), linkifyCallback);
return true;
}
_parseInlineNodes(nodes, into) {
for (const htmlNode of nodes) {
if (this._parseTextParts(htmlNode, into)) {
// This was a text node, and we already
// dumped its parts into our list.
continue;
}
const node = this.parseInlineNode(htmlNode);
if (node) {
into.push(node);
continue;
}
// Node is either block or unrecognized. In
// both cases, just move on to its children.
this._parseInlineNodes(this.result.getChildNodes(htmlNode), into);
}
}
parseInlineNodes(nodes) {
const into = [];
this._parseInlineNodes(nodes, into);
return into;
}
// XXX very similar to `_parseInlineNodes`.
_parseAnyNodes(nodes, into) {
for (const htmlNode of nodes) {
if (this._parseTextParts(htmlNode, into)) {
// This was a text node, and we already
// dumped its parts into our list.
continue;
}
const node = this.parseInlineNode(htmlNode) || this.parseBlockNode(htmlNode);
if (node) {
into.push(node);
continue;
}
// Node is unrecognized. Just move on to its children.
this._parseAnyNodes(this.result.getChildNodes(htmlNode), into);
}
}
parseAnyNodes(nodes) {
const into = [];
this._parseAnyNodes(nodes, into);
return into;
}
}
export function parseHTMLBody(platform, mediaRepository, html) {
const parseResult = platform.parseHTML(html);
const deserializer = new Deserializer(parseResult, mediaRepository);
const parts = deserializer.parseAnyNodes(parseResult.rootNodes);
return new MessageBody(html, parts);
}
import parse from '../../../../../lib/node-html-parser/index.js';
export function tests() {
class HTMLParseResult {
constructor(bodyNode) {
this._bodyNode = bodyNode;
}
get rootNodes() {
return this._bodyNode.childNodes;
}
getChildNodes(node) {
return node.childNodes;
}
getAttributeNames(node) {
return node.getAttributeNames();
}
getAttributeValue(node, attr) {
return node.getAttribute(attr);
}
isTextNode(node) {
return !node.tagName;
}
getNodeText(node) {
return node.text;
}
isElementNode(node) {
return !!node.tagName;
}
getNodeElementName(node) {
return node.tagName;
}
}
const platform = {
parseHTML: (html) => new HTMLParseResult(parse(html))
};
function test(assert, input, output) {
assert.deepEqual(parseHTMLBody(platform, null, input), new MessageBody(input, output));
}
return {
"Text only": assert => {
const input = "This is a sentence";
const output = [new TextPart(input)];
test(assert, input, output);
},
"Text with inline code format": assert => {
const input = "Here's some hello worldcode
!";
const output = [
new TextPart("Here's "),
new FormatPart("em", [new TextPart("some")]),
new TextPart(" "),
new FormatPart("code", [new TextPart("code")]),
new TextPart("!")
];
test(assert, input, output);
},
"Text with ordered list with no attributes": assert => {
const input = "
";
const output = [
new ListBlock(1, [
[ new TextPart("Lorem") ],
[ new TextPart("Ipsum") ]
])
];
test(assert, input, output);
},
"Text with ordered list starting at 3": assert => {
const input = '
';
const output = [
new ListBlock(3, [
[ new TextPart("Lorem") ],
[ new TextPart("Ipsum") ]
])
];
test(assert, input, output);
},
"Text with unordered list": assert => {
const input = '
';
const output = [
new ListBlock(null, [
[ new TextPart("Lorem") ],
[ new TextPart("Ipsum") ]
])
];
test(assert, input, output);
},
"Auto-closed tags": assert => {
const input = '
Hello
Hello
';
const output = [
new FormatPart("span", [
new FormatPart("code", [new TextPart("Hello")]),
new FormatPart("em", [new TextPart("World")])
])
];
test(assert, input, output);
},
"Unknown and invalid attributes are stripped": assert => {
const input = 'Hello';
const output = [
new FormatPart("em", [new TextPart("Hello")])
];
test(assert, input, output);
},
"Text with code block but no tag": assert => {
const code = 'main :: IO ()\nmain = putStrLn "Hello"'
const input = `${code}
`;
const output = [
new CodeBlock(null, code)
];
test(assert, input, output);
},
"Text with code block and 'unsupported' tag": assert => {
const code = 'Hello, world'
const input = `${code}
`;
const output = [
new CodeBlock(null, code)
];
test(assert, input, output);
}
/* Doesnt work: HTML library doesn't handle properly.
"Text with code block": assert => {
const code = 'main :: IO ()\nmain = putStrLn "Hello"'
const input = `${code}
`;
const output = [
new CodeBlock(null, code)
];
test(assert, input, output);
}
*/
};
}