2022-05-13 10:32:11 -04:00
|
|
|
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Deque;
|
|
|
|
import java.util.LinkedList;
|
|
|
|
import java.util.Locale;
|
|
|
|
import java.text.BreakIterator;
|
|
|
|
import cafe.biskuteri.hinoki.Tree;
|
|
|
|
|
|
|
|
interface
|
|
|
|
BasicHTMLParser {
|
|
|
|
|
|
|
|
public static Tree<String>
|
|
|
|
parse(String html)
|
|
|
|
{
|
|
|
|
List<String> segments;
|
|
|
|
segments = distinguishTagsFromPcdata(html);
|
2022-05-17 03:14:16 -04:00
|
|
|
|
2022-05-13 10:32:11 -04:00
|
|
|
Tree<String> document;
|
|
|
|
document = toNodes(segments);
|
2022-05-16 12:52:00 -04:00
|
|
|
document = splitText(document);
|
2022-05-14 18:04:46 -04:00
|
|
|
document = evaluateHtmlEscapes(document);
|
2022-05-13 10:32:11 -04:00
|
|
|
document = hierarchise(document);
|
|
|
|
|
|
|
|
return document;
|
|
|
|
}
|
|
|
|
|
|
|
|
// - -%- -
|
|
|
|
|
|
|
|
private static List<String>
|
|
|
|
distinguishTagsFromPcdata(String html)
|
|
|
|
{
|
|
|
|
List<String> returnee = new ArrayList<>();
|
|
|
|
StringBuilder segment = new StringBuilder();
|
|
|
|
boolean inTag = false;
|
|
|
|
for (char c: html.toCharArray())
|
|
|
|
{
|
|
|
|
if (c == '<')
|
|
|
|
{
|
|
|
|
String addee = empty(segment);
|
|
|
|
if (!addee.isEmpty()) returnee.add(addee);
|
|
|
|
inTag = true;
|
|
|
|
segment.append(c);
|
|
|
|
}
|
|
|
|
else if (c == '>')
|
|
|
|
{
|
|
|
|
assert inTag;
|
|
|
|
assert segment.length() > 0;
|
|
|
|
segment.append(c);
|
|
|
|
returnee.add(empty(segment));
|
|
|
|
inTag = false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
segment.append(c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String addee = empty(segment);
|
|
|
|
if (!addee.isEmpty()) returnee.add(addee);
|
|
|
|
|
|
|
|
return returnee;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static Tree<String>
|
|
|
|
toNodes(List<String> segments)
|
|
|
|
{
|
|
|
|
Tree<String> returnee = new Tree<String>();
|
|
|
|
|
|
|
|
for (String segment: segments)
|
|
|
|
{
|
|
|
|
boolean isTag = segment.startsWith("<");
|
|
|
|
Tree<String> node = new Tree<String>();
|
|
|
|
|
|
|
|
if (!isTag)
|
|
|
|
{
|
|
|
|
node.key = "text";
|
|
|
|
node.value = segment;
|
|
|
|
returnee.add(node);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
node.key = "tag";
|
|
|
|
|
|
|
|
String key = null, value = null;
|
|
|
|
StringBuilder b = new StringBuilder();
|
|
|
|
boolean inQuotes = false, inValue = false;
|
|
|
|
char[] chars = segment.toCharArray();
|
|
|
|
for (int o = 1; o < chars.length - 1; ++o)
|
|
|
|
{
|
|
|
|
char c = chars[o];
|
|
|
|
if (c == '"')
|
|
|
|
{
|
|
|
|
inQuotes = !inQuotes;
|
|
|
|
}
|
|
|
|
else if (inQuotes)
|
|
|
|
{
|
|
|
|
b.append(c);
|
|
|
|
}
|
|
|
|
else if (c == '=')
|
|
|
|
{
|
|
|
|
assert b.length() > 0;
|
|
|
|
key = empty(b);
|
|
|
|
inValue = true;
|
|
|
|
}
|
|
|
|
else if (Character.isWhitespace(c))
|
|
|
|
{
|
|
|
|
if (b.length() > 0)
|
|
|
|
{
|
|
|
|
if (inValue) value = empty(b);
|
|
|
|
else key = empty(b);
|
|
|
|
Tree<String> attr = new Tree<String>();
|
|
|
|
attr.key = key;
|
|
|
|
attr.value = value;
|
|
|
|
node.add(attr);
|
|
|
|
}
|
|
|
|
inValue = false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
b.append(c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (b.length() > 0)
|
|
|
|
{
|
|
|
|
if (inValue) value = empty(b);
|
|
|
|
else key = empty(b);
|
|
|
|
Tree<String> attr = new Tree<String>();
|
|
|
|
attr.key = key;
|
|
|
|
attr.value = value;
|
|
|
|
node.add(attr);
|
|
|
|
}
|
|
|
|
|
|
|
|
returnee.add(node);
|
|
|
|
}
|
|
|
|
|
|
|
|
return returnee;
|
|
|
|
}
|
|
|
|
|
2022-05-14 18:04:46 -04:00
|
|
|
private static Tree<String>
|
2022-05-16 12:52:00 -04:00
|
|
|
splitText(Tree<String> nodes)
|
2022-05-14 18:04:46 -04:00
|
|
|
{
|
2022-05-16 12:52:00 -04:00
|
|
|
Tree<String> returnee = new Tree<>();
|
2022-05-13 10:32:11 -04:00
|
|
|
|
|
|
|
for (Tree<String> node: nodes)
|
|
|
|
{
|
2022-05-16 12:52:00 -04:00
|
|
|
if (node.key.equals("tag"))
|
2022-05-13 10:32:11 -04:00
|
|
|
{
|
|
|
|
returnee.add(node);
|
|
|
|
continue;
|
|
|
|
}
|
2022-05-16 12:52:00 -04:00
|
|
|
assert node.key.equals("text");
|
2022-05-13 10:32:11 -04:00
|
|
|
|
|
|
|
StringBuilder b = new StringBuilder();
|
2022-05-17 03:14:16 -04:00
|
|
|
boolean alnum = false, calnum;
|
2022-05-16 12:52:00 -04:00
|
|
|
boolean space = false, cspace;
|
|
|
|
boolean emoji = false;
|
|
|
|
for (char c: node.value.toCharArray())
|
2022-05-13 10:32:11 -04:00
|
|
|
{
|
2022-05-17 03:14:16 -04:00
|
|
|
calnum = isMastodonAlnum(c);
|
2022-05-16 12:52:00 -04:00
|
|
|
cspace = Character.isWhitespace(c);
|
|
|
|
|
2022-05-17 03:14:16 -04:00
|
|
|
if (c == ':' && !emoji)
|
2022-05-16 12:52:00 -04:00
|
|
|
{
|
2022-05-17 03:14:16 -04:00
|
|
|
// See note on #isMastodonAlnum.
|
|
|
|
|
2022-05-16 12:52:00 -04:00
|
|
|
if (b.length() > 0)
|
|
|
|
{
|
|
|
|
Tree<String> addee = new Tree<>();
|
|
|
|
addee.key = space ? "space" : "text";
|
|
|
|
addee.value = empty(b);
|
|
|
|
returnee.add(addee);
|
|
|
|
}
|
|
|
|
emoji = true;
|
|
|
|
b.append(c);
|
|
|
|
}
|
|
|
|
else if (c == ':' && emoji)
|
|
|
|
{
|
2022-05-17 03:14:16 -04:00
|
|
|
assert !space;
|
2022-05-16 12:52:00 -04:00
|
|
|
b.append(c);
|
|
|
|
Tree<String> addee = new Tree<>();
|
|
|
|
addee.key = "emoji";
|
|
|
|
addee.value = empty(b);
|
|
|
|
returnee.add(addee);
|
|
|
|
/*
|
|
|
|
* Technically, addee.value.length()
|
|
|
|
* could be zero, which probably means
|
|
|
|
* someone just put two colons in a row,
|
|
|
|
* maybe for Haskell source code. I'd
|
|
|
|
* be surprised if Mastodon didn't escape
|
|
|
|
* it. (If they did, the next step will
|
|
|
|
* handle them.) Anyways treating it as
|
|
|
|
* an empty emoji is the correct action.
|
|
|
|
*/
|
|
|
|
emoji = false;
|
2022-05-17 03:14:16 -04:00
|
|
|
calnum = false;
|
2022-05-16 12:52:00 -04:00
|
|
|
}
|
2022-05-17 03:14:16 -04:00
|
|
|
else if (cspace != space)
|
2022-05-16 12:52:00 -04:00
|
|
|
{
|
2022-05-17 03:14:16 -04:00
|
|
|
if (b.length() > 0)
|
|
|
|
{
|
|
|
|
Tree<String> addee = new Tree<>();
|
|
|
|
addee.key = space ? "space" : "text";
|
|
|
|
addee.value = empty(b);
|
|
|
|
returnee.add(addee);
|
|
|
|
}
|
2022-05-16 12:52:00 -04:00
|
|
|
b.append(c);
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-05-16 12:52:00 -04:00
|
|
|
b.append(c);
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
2022-05-16 12:52:00 -04:00
|
|
|
/*
|
|
|
|
* We can specially handle special
|
|
|
|
* characters like \n, but I'll opt not to.
|
|
|
|
*/
|
|
|
|
|
2022-05-17 03:14:16 -04:00
|
|
|
alnum = calnum;
|
|
|
|
space = cspace;
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
if (b.length() > 0)
|
|
|
|
{
|
2022-05-16 12:52:00 -04:00
|
|
|
Tree<String> addee = new Tree<>();
|
|
|
|
addee.key = space ? "space" : "text";
|
|
|
|
addee.value = empty(b);
|
|
|
|
returnee.add(addee);
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return returnee;
|
|
|
|
}
|
|
|
|
|
2022-05-16 12:52:00 -04:00
|
|
|
private static Tree<String>
|
|
|
|
evaluateHtmlEscapes(Tree<String> nodes)
|
|
|
|
{
|
|
|
|
for (Tree<String> node: nodes)
|
|
|
|
{
|
|
|
|
node.value = evaluateHtmlEscapes(node.value);
|
|
|
|
for (Tree<String> attr: node)
|
|
|
|
{
|
|
|
|
attr.key = evaluateHtmlEscapes(attr.key);
|
|
|
|
attr.value = evaluateHtmlEscapes(attr.value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nodes;
|
|
|
|
}
|
|
|
|
|
2022-05-13 10:32:11 -04:00
|
|
|
private static Tree<String>
|
|
|
|
hierarchise(Tree<String> nodes)
|
|
|
|
{
|
|
|
|
Tree<String> root = new Tree<String>();
|
2022-05-14 18:04:46 -04:00
|
|
|
root.key = "tag";
|
|
|
|
root.add(new Tree<>("html", null));
|
2022-05-13 10:32:11 -04:00
|
|
|
root.add(new Tree<>("children", null));
|
2022-05-17 03:14:16 -04:00
|
|
|
|
2022-05-13 10:32:11 -04:00
|
|
|
Deque<Tree<String>> parents = new LinkedList<>();
|
|
|
|
parents.push(root);
|
|
|
|
for (Tree<String> node: nodes)
|
|
|
|
{
|
|
|
|
if (node.key.equals("tag"))
|
|
|
|
{
|
|
|
|
assert node.size() > 0;
|
|
|
|
String tagName = node.get(0).key;
|
|
|
|
|
2022-05-14 18:04:46 -04:00
|
|
|
assert node.get("children") == null;
|
|
|
|
node.add(new Tree<>("children", null));
|
|
|
|
|
2022-05-13 10:32:11 -04:00
|
|
|
boolean isClosing, selfClosing;
|
|
|
|
isClosing = tagName.startsWith("/");
|
|
|
|
selfClosing = node.get("/") != null;
|
|
|
|
selfClosing |= tagName.equals("br");
|
|
|
|
if (isClosing)
|
|
|
|
{
|
|
|
|
assert parents.size() > 1;
|
|
|
|
|
|
|
|
Tree<String> parent, grandparent;
|
|
|
|
parent = parents.pop();
|
|
|
|
grandparent = parents.peek();
|
|
|
|
|
2022-05-14 18:04:46 -04:00
|
|
|
String pTagName = parent.get(0).key;
|
|
|
|
assert tagName.equals("/" + pTagName);
|
2022-05-13 10:32:11 -04:00
|
|
|
|
|
|
|
grandparent.get("children").add(parent);
|
|
|
|
}
|
|
|
|
else if (selfClosing)
|
|
|
|
{
|
2022-05-14 18:04:46 -04:00
|
|
|
parents.peek().get("children").add(node);
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-05-14 18:04:46 -04:00
|
|
|
parents.push(node);
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
parents.peek().get("children").add(node);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
assert parents.size() == 1;
|
|
|
|
return parents.pop();
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String
|
|
|
|
empty(StringBuilder b)
|
|
|
|
{
|
|
|
|
String s = b.toString();
|
|
|
|
b.delete(0, b.length());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-05-16 12:52:00 -04:00
|
|
|
private static boolean
|
2022-05-17 03:14:16 -04:00
|
|
|
isMastodonAlnum(char c)
|
2022-05-16 12:52:00 -04:00
|
|
|
{
|
2022-05-17 03:14:16 -04:00
|
|
|
return Character.isLetterOrDigit(c);
|
|
|
|
/*
|
|
|
|
* Not joking. Mastodon is using the POSIX :alnum: regex
|
|
|
|
* character class here (/app/lib/emoji_formatter.rb;
|
|
|
|
* ruby-doc§Regexp). It prevents emojis preceeded by
|
|
|
|
* Japanese like さ too, but not punctuation like tildes
|
|
|
|
* or full stops. This is server-enforced, the web client
|
|
|
|
* does string substitution and supports anything.
|
|
|
|
* (To see this, make a post with an emoji preceeded
|
|
|
|
* by text, then try again with the same emoji also
|
|
|
|
* present elsewhere in the post at a valid position.)
|
|
|
|
*/
|
2022-05-13 10:32:11 -04:00
|
|
|
}
|
|
|
|
|
2022-05-14 18:04:46 -04:00
|
|
|
private static String
|
|
|
|
evaluateHtmlEscapes(String string)
|
|
|
|
{
|
|
|
|
if (string == null) return string;
|
|
|
|
|
|
|
|
StringBuilder whole = new StringBuilder();
|
|
|
|
StringBuilder part = new StringBuilder();
|
|
|
|
boolean inEscape = false;
|
|
|
|
for (char c: string.toCharArray())
|
|
|
|
{
|
|
|
|
if (inEscape && c == ';')
|
|
|
|
{
|
|
|
|
part.append(c);
|
|
|
|
inEscape = false;
|
|
|
|
String v = empty(part);
|
|
|
|
if (v.equals("<")) part.append('<');
|
|
|
|
if (v.equals(">")) part.append('>');
|
|
|
|
if (v.equals("&")) part.append('&');
|
|
|
|
if (v.equals(""")) part.append('"');
|
|
|
|
if (v.equals("'")) part.append('\'');
|
|
|
|
if (v.equals("'")) part.append('\'');
|
|
|
|
}
|
|
|
|
else if (!inEscape && c == '&')
|
|
|
|
{
|
|
|
|
String v = empty(part);
|
|
|
|
if (!v.isEmpty()) whole.append(v);
|
|
|
|
part.append(c);
|
|
|
|
inEscape = true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
part.append(c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String v = empty(part);
|
|
|
|
if (!v.isEmpty()) whole.append(v);
|
|
|
|
return whole.toString();
|
|
|
|
}
|
2022-05-17 03:14:16 -04:00
|
|
|
}
|