// Source: biskuteri-cafe-JKomasto2/BasicHTMLParser.java (368 lines, 7.8 KiB, Java)

import java.util.List;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedList;
import java.util.Locale;
import java.text.BreakIterator;
import cafe.biskuteri.hinoki.Tree;
interface
BasicHTMLParser {
public static Tree<String>
parse(String html)
{
List<String> segments;
segments = distinguishTagsFromPcdata(html);
2022-05-17 03:14:16 -04:00
Tree<String> document;
document = toNodes(segments);
2022-05-16 12:52:00 -04:00
document = splitText(document);
document = evaluateHtmlEscapes(document);
document = hierarchise(document);
return document;
}
// - -%- -
/**
 * Cuts the markup into consecutive segments: each segment is either a
 * complete tag (from '<' up to and including the next '>') or the run
 * of characters found between tags. Empty runs are not reported.
 *
 * @param html the markup to cut up
 * @return the segments, in document order
 */
private static List<String>
distinguishTagsFromPcdata(String html)
{
    List<String> pieces = new ArrayList<>();
    StringBuilder current = new StringBuilder();
    boolean insideTag = false;

    for (int o = 0; o < html.length(); ++o)
    {
        char c = html.charAt(o);
        switch (c)
        {
        case '<':
            // Whatever was collected so far ends here.
            if (current.length() > 0) pieces.add(empty(current));
            insideTag = true;
            current.append(c);
            break;
        case '>':
            assert insideTag;
            assert current.length() > 0;
            current.append(c);
            pieces.add(empty(current));
            insideTag = false;
            break;
        default:
            current.append(c);
            break;
        }
    }

    // Trailing characters after the last tag (or an unterminated tag).
    if (current.length() > 0) pieces.add(empty(current));
    return pieces;
}
/**
 * Turns each segment into a flat node.
 *
 * A segment that does not start with '<' becomes a node with key
 * "text" whose value is the segment. Any other segment becomes a node
 * with key "tag" and one child per whitespace-separated token between
 * the angle brackets: the first child's key is the tag name, and
 * later children are attributes (key = name, value = text after '=',
 * or null when the attribute has no value). Double quotes group
 * characters, including whitespace, into a single token.
 *
 * @param segments output of the segmenting pass
 * @return a tree whose children are the flat nodes, in order
 */
private static Tree<String>
toNodes(List<String> segments)
{
    Tree<String> returnee = new Tree<String>();
    for (String segment: segments)
    {
        Tree<String> node = new Tree<String>();
        if (!segment.startsWith("<"))
        {
            node.key = "text";
            node.value = segment;
            returnee.add(node);
            continue;
        }

        node.key = "tag";
        String key = null;
        StringBuilder b = new StringBuilder();
        boolean inQuotes = false, inValue = false, sawQuote = false;
        char[] chars = segment.toCharArray();
        // Skip the leading '<' and the final character (the '>').
        for (int o = 1; o < chars.length - 1; ++o)
        {
            char c = chars[o];
            if (c == '"')
            {
                inQuotes = !inQuotes;
                sawQuote = true;
            }
            else if (inQuotes)
            {
                b.append(c);
            }
            else if (c == '=')
            {
                assert b.length() > 0;
                key = empty(b);
                inValue = true;
            }
            else if (Character.isWhitespace(c))
            {
                flushAttribute(node, key, b, inValue, sawQuote);
                inValue = false;
                sawQuote = false;
            }
            else
            {
                b.append(c);
            }
        }
        // The last token is not followed by whitespace.
        flushAttribute(node, key, b, inValue, sawQuote);
        returnee.add(node);
    }
    return returnee;
}

/**
 * Adds the token collected in b to node as one attribute and clears b.
 * Nothing is added for an empty token, except for an explicitly
 * quoted empty value (name=""), which is kept with value "".
 * A token that is not a value is a bare name and always gets a null
 * value, so it can never inherit the value of an earlier attribute.
 */
private static void
flushAttribute(
    Tree<String> node, String key, StringBuilder b,
    boolean inValue, boolean sawQuote)
{
    boolean emptyQuotedValue = inValue && sawQuote;
    if (b.length() == 0 && !emptyQuotedValue) return;

    String token = empty(b);
    Tree<String> attr = new Tree<String>();
    attr.key = inValue ? key : token;
    attr.value = inValue ? token : null;
    node.add(attr);
}
private static Tree<String>
2022-05-16 12:52:00 -04:00
splitText(Tree<String> nodes)
{
2022-05-16 12:52:00 -04:00
Tree<String> returnee = new Tree<>();
for (Tree<String> node: nodes)
{
2022-05-16 12:52:00 -04:00
if (node.key.equals("tag"))
{
returnee.add(node);
continue;
}
2022-05-16 12:52:00 -04:00
assert node.key.equals("text");
StringBuilder b = new StringBuilder();
2022-05-17 03:14:16 -04:00
boolean alnum = false, calnum;
2022-05-16 12:52:00 -04:00
boolean space = false, cspace;
boolean emoji = false;
for (char c: node.value.toCharArray())
{
2022-05-17 03:14:16 -04:00
calnum = isMastodonAlnum(c);
2022-05-16 12:52:00 -04:00
cspace = Character.isWhitespace(c);
2022-05-17 03:14:16 -04:00
if (c == ':' && !emoji)
2022-05-16 12:52:00 -04:00
{
2022-05-17 03:14:16 -04:00
// See note on #isMastodonAlnum.
2022-05-16 12:52:00 -04:00
if (b.length() > 0)
{
Tree<String> addee = new Tree<>();
addee.key = space ? "space" : "text";
addee.value = empty(b);
returnee.add(addee);
}
emoji = true;
b.append(c);
}
else if (c == ':' && emoji)
{
2022-05-17 03:14:16 -04:00
assert !space;
2022-05-16 12:52:00 -04:00
b.append(c);
Tree<String> addee = new Tree<>();
addee.key = "emoji";
addee.value = empty(b);
returnee.add(addee);
/*
* Technically, addee.value.length()
* could be zero, which probably means
* someone just put two colons in a row,
* maybe for Haskell source code. I'd
* be surprised if Mastodon didn't escape
* it. (If they did, the next step will
* handle them.) Anyways treating it as
* an empty emoji is the correct action.
*/
emoji = false;
2022-05-17 03:14:16 -04:00
calnum = false;
2022-05-16 12:52:00 -04:00
}
2022-05-17 03:14:16 -04:00
else if (cspace != space)
2022-05-16 12:52:00 -04:00
{
2022-05-17 03:14:16 -04:00
if (b.length() > 0)
{
Tree<String> addee = new Tree<>();
addee.key = space ? "space" : "text";
addee.value = empty(b);
returnee.add(addee);
}
2022-05-16 12:52:00 -04:00
b.append(c);
}
else
{
2022-05-16 12:52:00 -04:00
b.append(c);
}
2022-05-16 12:52:00 -04:00
/*
* We can specially handle special
* characters like \n, but I'll opt not to.
*/
2022-05-17 03:14:16 -04:00
alnum = calnum;
space = cspace;
}
if (b.length() > 0)
{
2022-05-16 12:52:00 -04:00
Tree<String> addee = new Tree<>();
addee.key = space ? "space" : "text";
addee.value = empty(b);
returnee.add(addee);
}
}
return returnee;
}
2022-05-16 12:52:00 -04:00
/**
 * Evaluates HTML escapes in place on every flat node: the node's own
 * value, and the key and value of each of its children (the
 * attributes of a tag). Null strings are left as they are by the
 * string overload.
 *
 * @param nodes the flat nodes to rewrite
 * @return the same tree instance that was passed in
 */
private static Tree<String>
evaluateHtmlEscapes(Tree<String> nodes)
{
    for (Tree<String> element: nodes)
    {
        element.value = evaluateHtmlEscapes(element.value);

        for (Tree<String> attribute: element)
        {
            attribute.key = evaluateHtmlEscapes(attribute.key);
            attribute.value = evaluateHtmlEscapes(attribute.value);
        }
    }
    return nodes;
}
private static Tree<String>
hierarchise(Tree<String> nodes)
{
Tree<String> root = new Tree<String>();
root.key = "tag";
root.add(new Tree<>("html", null));
root.add(new Tree<>("children", null));
2022-05-17 03:14:16 -04:00
Deque<Tree<String>> parents = new LinkedList<>();
parents.push(root);
for (Tree<String> node: nodes)
{
if (node.key.equals("tag"))
{
assert node.size() > 0;
String tagName = node.get(0).key;
assert node.get("children") == null;
node.add(new Tree<>("children", null));
boolean isClosing, selfClosing;
isClosing = tagName.startsWith("/");
selfClosing = node.get("/") != null;
selfClosing |= tagName.equals("br");
if (isClosing)
{
assert parents.size() > 1;
Tree<String> parent, grandparent;
parent = parents.pop();
grandparent = parents.peek();
String pTagName = parent.get(0).key;
assert tagName.equals("/" + pTagName);
grandparent.get("children").add(parent);
}
else if (selfClosing)
{
parents.peek().get("children").add(node);
}
else
{
parents.push(node);
}
}
else
{
parents.peek().get("children").add(node);
}
}
assert parents.size() == 1;
return parents.pop();
}
private static String
empty(StringBuilder b)
{
String s = b.toString();
b.delete(0, b.length());
return s;
}
2022-05-16 12:52:00 -04:00
private static boolean
2022-05-17 03:14:16 -04:00
isMastodonAlnum(char c)
2022-05-16 12:52:00 -04:00
{
2022-05-17 03:14:16 -04:00
return Character.isLetterOrDigit(c);
/*
* Not joking. Mastodon is using the POSIX :alnum: regex
* character class here (/app/lib/emoji_formatter.rb;
* ruby-doc§Regexp). It prevents emojis preceeded by
* Japanese like too, but not punctuation like tildes
* or full stops. This is server-enforced, the web client
* does string substitution and supports anything.
* (To see this, make a post with an emoji preceeded
* by text, then try again with the same emoji also
* present elsewhere in the post at a valid position.)
*/
}
private static String
evaluateHtmlEscapes(String string)
{
if (string == null) return string;
StringBuilder whole = new StringBuilder();
StringBuilder part = new StringBuilder();
boolean inEscape = false;
for (char c: string.toCharArray())
{
if (inEscape && c == ';')
{
part.append(c);
inEscape = false;
String v = empty(part);
if (v.equals("&lt;")) part.append('<');
if (v.equals("&gt;")) part.append('>');
if (v.equals("&amp;")) part.append('&');
if (v.equals("&quot;")) part.append('"');
if (v.equals("&apos;")) part.append('\'');
if (v.equals("&#39;")) part.append('\'');
}
else if (!inEscape && c == '&')
{
String v = empty(part);
if (!v.isEmpty()) whole.append(v);
part.append(c);
inEscape = true;
}
else
{
part.append(c);
}
}
String v = empty(part);
if (!v.isEmpty()) whole.append(v);
return whole.toString();
}
2022-05-17 03:14:16 -04:00
}