biskuteri-cafe-JKomasto2/RudimentaryHTMLParser.java
Snowyfox e6fea4c061 Fixed bug when redraft makes no changes
(Before this, JKomasto and sometimes the Mastodon web client would get '411 Record Not Found' when submitting the same text after deleting and redrafting. Presumably the Mastodon server caches both whether an idempotency key was fulfilled and which post it leads to, and for some reason it looks up the second and fails.)
2022-05-31 03:39:56 -04:00

250 lines
6.9 KiB
Java

import cafe.biskuteri.hinoki.Tree;
import java.util.List;
import java.util.ArrayList;
import java.io.StringReader;
import java.io.Reader;
import java.io.IOException;
/**
 * A small, forgiving parser for HTML fragments.
 *
 * The result is a flat sequence of nodes rather than a nested document
 * tree: element nesting is not reconstructed. Parsing is done in three
 * passes, see {@link #pass1}, {@link #pass2} and {@link #pass3}.
 */
class
RudimentaryHTMLParser {

    /**
     * Parses an HTML fragment into a flat list of nodes.
     *
     * The children added to the returned tree have one of these keys:
     * "text" (value is the decoded text), "emoji" (value is the
     * colon-delimited shortcode, colons included), or "tag" (value is
     * null; its own children hold the tag name and the attributes).
     *
     * @param html the markup to parse; must not be null
     * @return a tree whose children are the parsed nodes, in order
     */
    public static Tree<String>
    depthlessRead(String html)
    {
        try {
            return pass3(pass2(pass1(html)));
        }
        catch (IOException eIo) {
            /*
            * We use only StringReaders, which only throw an
            * IOException when they are read after being closed.
            * And we don't close them. Fail loudly instead of
            * returning null, even when assertions are disabled.
            */
            throw new AssertionError(
                "StringReader failed unexpectedly", eIo);
        }
    }

    //   -  -%-  -

    /**
     * First pass: splits the markup into "text" and "tag" nodes and
     * decodes character references.
     *
     * A tag node's value is the raw text between '<' and '>'. Within
     * a tag, decoded characters and literal backslashes are prefixed
     * with a backslash so that pass2 treats them as literal data.
     * Quote tracking is only active inside a tag, so quotation marks
     * in ordinary text do not hide the tags that follow them.
     */
    private static Tree<String>
    pass1(String html)
    throws IOException
    {
        Reader r = new StringReader(html);
        Tree<String> docu = new Tree<String>();
        StringBuilder text = new StringBuilder();
        StringBuilder htmlEscape = new StringBuilder();
        boolean inTag = false, quoted = false;

        int c;
        while ((c = r.read()) != -1)
        {
            if (htmlEscape.length() > 0)
            {
                if (c == ';')
                {
                    htmlEscape.append(';');
                    String raw = empty(htmlEscape);
                    String decoded = decodeEntity(raw);
                    if (decoded == null)
                    {
                        // Unknown reference: keep it verbatim
                        // rather than silently dropping it.
                        text.append(raw);
                    }
                    else
                    {
                        // Inside a tag, escape the decoded
                        // character so pass2 never mistakes it
                        // for a quote or a separator.
                        if (inTag) text.append('\\');
                        text.append(decoded);
                    }
                    continue;
                }
                if (isEntityChar(c))
                {
                    htmlEscape.append((char)c);
                    continue;
                }
                // The '&' was not the start of a reference after
                // all. Emit what was collected as plain text and
                // handle the current character normally below.
                text.append(empty(htmlEscape));
            }

            if (c == '&')
            {
                htmlEscape.append('&');
                continue;
            }
            if (inTag && c == '"')
            {
                text.append('"');
                quoted = !quoted;
                continue;
            }
            if (inTag && c == '\\')
            {
                // Double it, since pass2 uses the backslash as
                // its escape character.
                text.append("\\\\");
                continue;
            }
            if (!quoted)
            {
                if (c == '<')
                {
                    if (text.length() > 0)
                    {
                        docu.add(leaf("text", empty(text)));
                    }
                    inTag = true;
                    continue;
                }
                if (c == '>' && inTag)
                {
                    docu.add(leaf("tag", empty(text)));
                    inTag = false;
                    continue;
                }
            }
            text.append((char)c);
        }

        // A dangling, unterminated '&...' at the end of the input
        // is ordinary text.
        if (htmlEscape.length() > 0)
        {
            text.append(empty(htmlEscape));
        }
        if (text.length() > 0)
        {
            docu.add(leaf("text", empty(text)));
        }
        return docu;
    }

    /**
     * Second pass: breaks the raw content of every "tag" node into
     * child nodes and then clears the tag node's value.
     *
     * Each whitespace-separated field becomes one child. A field of
     * the form name=value yields key and value; a bare field (such
     * as the tag name) yields only a key. Double quotes group
     * characters, and a backslash makes the next character literal.
     */
    private static Tree<String>
    pass2(Tree<String> docu)
    throws IOException
    {
        for (Tree<String> node: docu.children)
        {
            if (!node.key.equals("tag")) continue;

            Reader r = new StringReader(node.value);
            Tree<String> part = new Tree<String>();
            boolean escaped = false, quoted = false;
            StringBuilder field = new StringBuilder();

            int c;
            while ((c = r.read()) != -1)
            {
                if (escaped) {
                    field.append((char)c);
                    escaped = false;
                    continue;
                }
                if (c == '\\') {
                    escaped = true;
                    continue;
                }
                if (c == '"') {
                    quoted = !quoted;
                    continue;
                }
                if (quoted) {
                    field.append((char)c);
                    continue;
                }
                if (c == '=' && part.key == null) {
                    // Only the first '=' separates key from value;
                    // later ones belong to the value.
                    part.key = empty(field);
                    continue;
                }
                if (Character.isWhitespace(c)) {
                    if (finishPart(part, field)) {
                        node.add(part);
                        part = new Tree<String>();
                    }
                    continue;
                }
                field.append((char)c);
            }
            if (finishPart(part, field)) {
                node.add(part);
            }
            node.value = null;
        }
        return docu;
    }

    /**
     * Third pass: copies the nodes into a new tree, splitting each
     * "text" node around emoji shortcodes.
     *
     * A whitespace-delimited segment that looks like :name: becomes
     * an "emoji" node; the text before and after it is kept in
     * separate "text" nodes. Non-text nodes are passed through.
     */
    private static Tree<String>
    pass3(Tree<String> docu)
    {
        Tree<String> returnee = new Tree<String>();
        for (Tree<String> node: docu)
        {
            if (!node.key.equals("text"))
            {
                returnee.add(node);
                continue;
            }

            StringBuilder value = new StringBuilder();
            for (String segment: whitespaceSplit(node.value))
            {
                if (!isEmojiShortcode(segment))
                {
                    value.append(segment);
                    continue;
                }
                // Flush the text gathered so far, but never emit
                // an empty text node.
                if (value.length() > 0)
                {
                    returnee.add(leaf("text", empty(value)));
                }
                returnee.add(leaf("emoji", segment));
            }
            if (value.length() > 0)
            {
                returnee.add(leaf("text", empty(value)));
            }
        }
        return returnee;
    }

    //   -  -%-  -

    /**
     * Returns the contents of the builder and clears it.
     */
    private static String
    empty(StringBuilder b)
    {
        String s = b.toString();
        b.delete(0, b.length());
        return s;
    }

    /**
     * Creates a node with the given key and value and no children.
     */
    private static Tree<String>
    leaf(String key, String value)
    {
        Tree<String> node = new Tree<String>();
        node.key = key;
        node.value = value;
        return node;
    }

    /**
     * Completes the attribute being assembled in pass2.
     *
     * If the key is already set, the pending field (possibly empty,
     * as in class="") becomes the value. Otherwise a non-empty field
     * becomes the key.
     *
     * @return true if the part now holds data and should be added
     */
    private static boolean
    finishPart(Tree<String> part, StringBuilder field)
    {
        if (part.key != null) {
            part.value = empty(field);
            return true;
        }
        if (field.length() > 0) {
            part.key = empty(field);
            return true;
        }
        return false;
    }

    /**
     * Tells whether the character may appear between the '&' and
     * the ';' of a character reference (ASCII letters, digits, '#').
     */
    private static boolean
    isEntityChar(int c)
    {
        return c == '#'
            || (c >= '0' && c <= '9')
            || (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z');
    }

    /**
     * Decodes one complete character reference, including the
     * leading '&' and trailing ';'.
     *
     * Supports a handful of named references and decimal or
     * hexadecimal numeric references.
     *
     * @return the decoded text, or null if the reference is not
     *         recognised or does not name a usable code point
     */
    private static String
    decodeEntity(String entity)
    {
        switch (entity) {
            case "&lt;": return "<";
            case "&gt;": return ">";
            case "&amp;": return "&";
            case "&quot;": return "\"";
            case "&apos;": return "'";
            case "&nbsp;": return "\u00A0";
            default: break;
        }
        if (!entity.startsWith("&#")) return null;

        String digits = entity.substring(2, entity.length() - 1);
        try {
            boolean hex =
                digits.startsWith("x") || digits.startsWith("X");
            int cp = hex
                ? Integer.parseInt(digits.substring(1), 16)
                : Integer.parseInt(digits, 10);
            boolean surrogate = cp >= 0xD800 && cp <= 0xDFFF;
            if (cp <= 0 || surrogate) return null;
            if (!Character.isValidCodePoint(cp)) return null;
            return new String(Character.toChars(cp));
        }
        catch (NumberFormatException eNf) {
            return null;
        }
    }

    /**
     * Tells whether a segment is a colon-delimited shortcode with at
     * least one character between the colons. This keeps a lone ":"
     * or "::" from being treated as an emoji.
     */
    private static boolean
    isEmojiShortcode(String segment)
    {
        return segment.length() >= 3
            && segment.startsWith(":")
            && segment.endsWith(":");
    }

    /**
     * Splits text into alternating runs of whitespace and
     * non-whitespace, in order. Concatenating the result yields the
     * original text. Empty segments are never returned.
     */
    private static List<String>
    whitespaceSplit(String text)
    {
        List<String> returnee = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        boolean isWhitespace = false;
        for (char c: text.toCharArray())
        {
            boolean ws = Character.isWhitespace(c);
            if (ws != isWhitespace) {
                if (segment.length() > 0) {
                    returnee.add(empty(segment));
                }
                isWhitespace = ws;
            }
            segment.append(c);
        }
        if (segment.length() > 0) {
            returnee.add(empty(segment));
        }
        return returnee;
    }

    //  ---%-@-%---

    /**
     * Entry point for manual experiments. The sample markup below is
     * declared but not currently parsed or printed.
     */
    public static void
    main(String... args)
    {
        final String EX2 =
            "<p>2000s energy<br /><a href=\"http://www.pspad.com/en/screenshot.htm\" rel=\"nofollow noopener noreferrer\" target=\"_blank\"><span class=\"invisible\">http://www.</span><span class=\"\">pspad.com/en/screenshot.htm</span><span class=\"invisible\"></span></a></p>";
    }

}