biskuteri-cafe-JKomasto2/BasicHTMLParser.java
Snowyfox e6fea4c061 Fixed bug when redraft makes no changes
(Before this, JKomasto and sometimes the Mastodon web client would get '411 Record Not Found' when submitting the same text after deleting and redrafting. Presumably the Mastodon server caches both whether an idempotency key was fulfilled and which post it leads to, and for some reason it looks up the second and fails.)
2022-05-31 03:39:56 -04:00

368 lines
7.8 KiB
Java

import java.util.List;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedList;
import java.util.Locale;
import java.text.BreakIterator;
import cafe.biskuteri.hinoki.Tree;
/**
 * A small hand-written parser for the HTML fragments found in post
 * content. It is a pipeline of passes over a flat list of nodes that
 * ends by nesting the nodes into a tree; see {@link #parse(String)}.
 */
interface
BasicHTMLParser {

    /**
     * Parses an HTML fragment into a tree.
     *
     * The returned root has key "tag" and two children: one keyed
     * "html" (standing in for the tag name) and one keyed "children"
     * holding the parsed nodes. Every tag node has the same layout:
     * its first child is keyed by the tag name, further children are
     * its attributes (key = name, value = value or null), and a child
     * keyed "children" holds nested nodes. Character data becomes
     * nodes keyed "text", "space" or "emoji" whose value is the text.
     *
     * @param html the HTML fragment; must not be null.
     * @return the root of the parsed document.
     */
    public static Tree<String>
    parse(String html)
    {
        List<String> segments = distinguishTagsFromPcdata(html);
        Tree<String> document = toNodes(segments);
        document = splitText(document);
        document = evaluateHtmlEscapes(document);
        document = hierarchise(document);
        return document;
    }

//   -  -%-  -

    /**
     * Cuts the input into alternating segments: whole tags (from '<'
     * up to and including '>') and the character data between them.
     * Empty character data segments are not emitted.
     */
    private static List<String>
    distinguishTagsFromPcdata(String html)
    {
        List<String> returnee = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        boolean inTag = false;
        for (char c: html.toCharArray())
        {
            if (c == '<')
            {
                // Whatever was collected so far ends here.
                String addee = empty(segment);
                if (!addee.isEmpty()) returnee.add(addee);
                inTag = true;
                segment.append(c);
            }
            else if (c == '>')
            {
                assert inTag;
                assert segment.length() > 0;
                segment.append(c);
                returnee.add(empty(segment));
                inTag = false;
            }
            else
            {
                segment.append(c);
            }
        }
        String addee = empty(segment);
        if (!addee.isEmpty()) returnee.add(addee);
        return returnee;
    }

    /**
     * Turns each segment into a node. Segments starting with '<'
     * become nodes keyed "tag" whose children are the tag name and
     * attributes, in source order; all other segments become nodes
     * keyed "text" with the segment as value.
     *
     * Only double quotes are recognised as value delimiters.
     */
    private static Tree<String>
    toNodes(List<String> segments)
    {
        Tree<String> returnee = new Tree<String>();
        for (String segment: segments)
        {
            Tree<String> node = new Tree<String>();
            if (!segment.startsWith("<"))
            {
                node.key = "text";
                node.value = segment;
                returnee.add(node);
                continue;
            }

            node.key = "tag";
            String key = null;
            StringBuilder b = new StringBuilder();
            boolean inQuotes = false, inValue = false;
            // Whether the part currently being read contained quotes.
            // Needed to tell alt="" (empty value) from no value at all.
            boolean sawQuote = false;
            char[] chars = segment.toCharArray();
            // Skip the leading '<' and the final character.
            for (int o = 1; o < chars.length - 1; ++o)
            {
                char c = chars[o];
                if (c == '"')
                {
                    inQuotes = !inQuotes;
                    sawQuote = true;
                }
                else if (inQuotes)
                {
                    b.append(c);
                }
                else if (c == '=')
                {
                    assert b.length() > 0;
                    key = empty(b);
                    inValue = true;
                    sawQuote = false;
                }
                else if (Character.isWhitespace(c))
                {
                    addAttribute(node, key, b, inValue, sawQuote);
                    // Start the next attribute from a clean slate, so
                    // that it cannot inherit this one's key or value.
                    key = null;
                    inValue = false;
                    sawQuote = false;
                }
                else
                {
                    b.append(c);
                }
            }
            addAttribute(node, key, b, inValue, sawQuote);
            returnee.add(node);
        }
        return returnee;
    }

    /**
     * Appends the attribute that was just read to the given tag node
     * and empties the buffer. Does nothing if no attribute was read.
     *
     * @param node the tag node receiving the attribute.
     * @param key the attribute name, if an '=' was seen.
     * @param b the buffer holding the value (if inValue) or the name.
     * @param inValue whether an '=' was seen for this attribute.
     * @param sawQuote whether the value was written in quotes.
     */
    private static void
    addAttribute(
        Tree<String> node, String key, StringBuilder b,
        boolean inValue, boolean sawQuote)
    {
        Tree<String> attr = new Tree<String>();
        if (inValue)
        {
            // A quoted value may legitimately be empty.
            if (b.length() == 0 && !sawQuote) return;
            attr.key = key;
            attr.value = empty(b);
        }
        else
        {
            if (b.length() == 0) return;
            attr.key = empty(b);
            attr.value = null;
        }
        node.add(attr);
    }

    /**
     * Splits every "text" node into runs keyed "text" (non-whitespace),
     * "space" (whitespace) and "emoji" (a colon-delimited shortcode,
     * colons included). Tag nodes are passed through unchanged.
     *
     * Mastodon also refuses shortcodes directly preceded by an
     * alphanumeric (see the note on #isMastodonAlnum); that rule is
     * not enforced here.
     */
    private static Tree<String>
    splitText(Tree<String> nodes)
    {
        Tree<String> returnee = new Tree<>();
        for (Tree<String> node: nodes)
        {
            if (node.key.equals("tag"))
            {
                returnee.add(node);
                continue;
            }
            assert node.key.equals("text");

            StringBuilder b = new StringBuilder();
            boolean space = false;
            boolean emoji = false;
            for (char c: node.value.toCharArray())
            {
                boolean cspace = Character.isWhitespace(c);
                if (c == ':' && !emoji)
                {
                    // Possible start of a shortcode.
                    addRun(returnee, b, space);
                    emoji = true;
                    b.append(c);
                }
                else if (c == ':')
                {
                    // Closing colon. Whitespace cancels a pending
                    // shortcode, so we cannot be in a space run.
                    assert !space;
                    b.append(c);
                    Tree<String> addee = new Tree<>();
                    addee.key = "emoji";
                    addee.value = empty(b);
                    returnee.add(addee);
                    /*
                    * Technically the shortcode could be empty,
                    * which probably means someone just put two
                    * colons in a row, maybe for Haskell source
                    * code. I'd be surprised if Mastodon didn't
                    * escape it. (If they did, the next step will
                    * handle them.) Anyways treating it as an
                    * empty emoji is the correct action.
                    */
                    emoji = false;
                }
                else
                {
                    if (cspace != space) addRun(returnee, b, space);
                    // Shortcodes cannot contain whitespace, so the
                    // colon we saw was an ordinary one ("a: b").
                    if (cspace) emoji = false;
                    b.append(c);
                }
                /*
                * We can specially handle special
                * characters like \n, but I'll opt not to.
                */
                space = cspace;
            }
            addRun(returnee, b, space);
        }
        return returnee;
    }

    /**
     * Appends the buffered run, if any, as a node keyed "space" or
     * "text", and empties the buffer.
     */
    private static void
    addRun(Tree<String> returnee, StringBuilder b, boolean space)
    {
        if (b.length() == 0) return;
        Tree<String> addee = new Tree<>();
        addee.key = space ? "space" : "text";
        addee.value = empty(b);
        returnee.add(addee);
    }

    /**
     * Decodes HTML escapes in place, in the value of every node and
     * in the key and value of each of its direct children.
     *
     * @return the same tree that was passed in.
     */
    private static Tree<String>
    evaluateHtmlEscapes(Tree<String> nodes)
    {
        for (Tree<String> node: nodes)
        {
            node.value = evaluateHtmlEscapes(node.value);
            for (Tree<String> attr: node)
            {
                attr.key = evaluateHtmlEscapes(attr.key);
                attr.value = evaluateHtmlEscapes(attr.value);
            }
        }
        return nodes;
    }

    /**
     * Nests the flat node list according to its opening and closing
     * tags. Tags named "br" and tags carrying a "/" attribute are
     * treated as self-closing.
     *
     * Malformed nesting trips the assertions below. When assertions
     * are disabled, stray closing tags are skipped and elements still
     * open at the end are closed implicitly.
     *
     * @return a synthetic root node; see #parse for its layout.
     */
    private static Tree<String>
    hierarchise(Tree<String> nodes)
    {
        Tree<String> root = new Tree<String>();
        root.key = "tag";
        root.add(new Tree<>("html", null));
        root.add(new Tree<>("children", null));

        // Stack of currently open elements, innermost on top.
        Deque<Tree<String>> parents = new LinkedList<>();
        parents.push(root);
        for (Tree<String> node: nodes)
        {
            if (!node.key.equals("tag"))
            {
                parents.peek().get("children").add(node);
                continue;
            }

            assert node.size() > 0;
            String tagName = node.get(0).key;
            assert node.get("children") == null;
            node.add(new Tree<>("children", null));

            boolean isClosing = tagName.startsWith("/");
            boolean selfClosing = node.get("/") != null;
            selfClosing |= tagName.equals("br");

            if (isClosing)
            {
                assert parents.size() > 1;
                // Never pop the root for a stray closing tag.
                if (parents.size() < 2) continue;

                Tree<String> parent = parents.pop();
                String pTagName = parent.get(0).key;
                assert tagName.equals("/" + pTagName);
                parents.peek().get("children").add(parent);
            }
            else if (selfClosing)
            {
                parents.peek().get("children").add(node);
            }
            else
            {
                parents.push(node);
            }
        }

        assert parents.size() == 1;
        // Attach whatever was left open, so that we always return
        // the root and not some inner element.
        while (parents.size() > 1)
        {
            Tree<String> unclosed = parents.pop();
            parents.peek().get("children").add(unclosed);
        }
        return parents.pop();
    }

    /**
     * Returns the contents of the buffer and clears it.
     */
    private static String
    empty(StringBuilder b)
    {
        String s = b.toString();
        b.delete(0, b.length());
        return s;
    }

    /**
     * Whether the character is a letter or digit.
     */
    private static boolean
    isMastodonAlnum(char c)
    {
        return Character.isLetterOrDigit(c);
        /*
        * Not joking. Mastodon is using the POSIX :alnum: regex
        * character class here (/app/lib/emoji_formatter.rb;
        * ruby-doc§Regexp). It prevents emojis preceded by
        * Japanese like さ too, but not punctuation like tildes
        * or full stops. This is server-enforced, the web client
        * does string substitution and supports anything.
        * (To see this, make a post with an emoji preceded
        * by text, then try again with the same emoji also
        * present elsewhere in the post at a valid position.)
        */
    }

    /**
     * Replaces HTML escapes in the string by the characters they
     * stand for. Escapes that are not recognised are kept as written.
     *
     * @param string the text to decode; may be null.
     * @return the decoded text, or null if the input was null.
     */
    private static String
    evaluateHtmlEscapes(String string)
    {
        if (string == null) return string;

        StringBuilder whole = new StringBuilder();
        StringBuilder part = new StringBuilder();
        boolean inEscape = false;
        for (char c: string.toCharArray())
        {
            if (c == '&')
            {
                // Anything pending, including an escape that was
                // never terminated, is kept literally.
                whole.append(empty(part));
                part.append(c);
                inEscape = true;
            }
            else if (inEscape && c == ';')
            {
                part.append(c);
                whole.append(decodeEscape(empty(part)));
                inEscape = false;
            }
            else
            {
                part.append(c);
            }
        }
        whole.append(empty(part));
        return whole.toString();
    }

    /**
     * Decodes a single escape.
     *
     * @param escape the escape, from its '&' up to and including
     *               its ';'.
     * @return the character(s) it stands for, or the escape itself
     *         if it is not one we know how to decode.
     */
    private static String
    decodeEscape(String escape)
    {
        switch (escape)
        {
            case "&lt;": return "<";
            case "&gt;": return ">";
            case "&amp;": return "&";
            case "&quot;": return "\"";
            case "&apos;": return "'";
            default: break;
        }
        if (!escape.startsWith("&#")) return escape;

        // Numeric reference: &#DDD; or &#xHHH;
        String digits = escape.substring(2, escape.length() - 1);
        int radix = 10;
        if (digits.startsWith("x") || digits.startsWith("X"))
        {
            radix = 16;
            digits = digits.substring(1);
        }
        if (digits.isEmpty()) return escape;

        int codePoint = 0;
        for (char d: digits.toCharArray())
        {
            int digit = Character.digit(d, radix);
            if (digit < 0) return escape;
            codePoint = codePoint * radix + digit;
            // Checked every round, so this cannot overflow.
            if (codePoint > Character.MAX_CODE_POINT) return escape;
        }
        boolean surrogate =
            codePoint >= Character.MIN_SURROGATE
            && codePoint <= Character.MAX_SURROGATE;
        if (codePoint == 0 || surrogate) return escape;
        return new String(Character.toChars(codePoint));
    }

}