biskuteri-cafe-JKomasto2/BasicHTMLParser.java
Snowyfox e4f13ad8c8 Added attempt at using AttributedString.
Somewhat works now, but I think we'll abandon it..
2022-05-14 18:04:46 -04:00

328 lines
6.6 KiB
Java

import java.util.List;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedList;
import java.util.Locale;
import java.text.BreakIterator;
import cafe.biskuteri.hinoki.Tree;
/**
 * A small, lenient parser that turns an HTML fragment into a
 * {@code Tree<String>}.
 *
 * Layout of the produced tree, as built by the methods below:
 * <ul>
 * <li>A tag node has key {@code "tag"}. Its first child's key is the
 *     tag name, the following children are its attributes (key and
 *     optional value), and a child keyed {@code "children"} holds the
 *     nested nodes.</li>
 * <li>A text node has key {@code "text"} and the text in its value.</li>
 * <li>An emoji node has key {@code "emoji"} and the shortcode,
 *     including both colons, in its value.</li>
 * </ul>
 *
 * Malformed markup is tolerated rather than rejected: stray closing
 * tags are ignored and elements left open are closed at end of input.
 */
interface
BasicHTMLParser {

    /**
     * Parses an HTML fragment.
     *
     * @param html the markup to parse; must not be null.
     * @return a synthetic root tag node named {@code "html"} whose
     *         {@code "children"} child contains the parsed content.
     */
    public static Tree<String>
    parse(String html)
    {
        List<String> segments;
        segments = distinguishTagsFromPcdata(html);

        Tree<String> document;
        document = toNodes(segments);
        document = evaluateHtmlEscapes(document);
        document = distinguishEmojisFromText(document);
        document = hierarchise(document);
        return document;
    }

//   -  -%-  -

    /**
     * Splits markup into alternating tag segments (starting with
     * {@code '<'}) and character data segments. Empty segments are
     * never emitted.
     *
     * A {@code '>'} that does not close a tag is kept as ordinary
     * text instead of splitting the surrounding text.
     */
    private static List<String>
    distinguishTagsFromPcdata(String html)
    {
        List<String> returnee = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        boolean inTag = false;
        for (char c: html.toCharArray())
        {
            if (c == '<')
            {
                // Whatever was collected so far ends here, be it text
                // or a tag that was never terminated.
                if (segment.length() > 0) returnee.add(empty(segment));
                inTag = true;
                segment.append(c);
            }
            else if (c == '>' && inTag)
            {
                segment.append(c);
                returnee.add(empty(segment));
                inTag = false;
            }
            else
            {
                segment.append(c);
            }
        }
        if (segment.length() > 0) returnee.add(empty(segment));
        return returnee;
    }

    /**
     * Converts segments into a flat list of text and tag nodes.
     *
     * For a tag, every whitespace-separated token becomes a child:
     * the first is the tag name (with a leading {@code '/'} for
     * closing tags), the rest are attributes. Attribute values may be
     * unquoted, double-quoted or single-quoted. A {@code '/'} directly
     * before the closing {@code '>'} becomes its own {@code "/"} child
     * so that {@code <br/>} and {@code <br />} look the same.
     */
    private static Tree<String>
    toNodes(List<String> segments)
    {
        Tree<String> returnee = new Tree<String>();
        for (String segment: segments)
        {
            if (!segment.startsWith("<"))
            {
                returnee.add(newNode("text", segment));
                continue;
            }
            Tree<String> node = newNode("tag", null);

            // Skip the '<', and the '>' too unless the tag was cut
            // short by a following '<' or by the end of the input.
            int end = segment.length();
            if (segment.endsWith(">")) --end;

            String key = null;
            StringBuilder b = new StringBuilder();
            boolean inValue = false;  // have we passed this attribute's '='?
            boolean quoted = false;   // did the current value use quotes?
            char quote = 0;           // the open quote character, or 0
            for (int o = 1; o < end; ++o)
            {
                char c = segment.charAt(o);
                // A quoted value counts even when empty (alt="").
                boolean pending = b.length() > 0 || (inValue && quoted);

                if (quote != 0)
                {
                    if (c == quote) quote = 0;
                    else b.append(c);
                    continue;
                }
                // A single quote only opens a quoted value right after
                // the '='; anywhere else it is an ordinary character.
                boolean opensSingle =
                    c == '\'' && inValue && !quoted && b.length() == 0;
                if (c == '"' || opensSingle)
                {
                    quote = c;
                    quoted |= inValue;
                    continue;
                }
                if (c == '=' && !inValue)
                {
                    assert b.length() > 0;
                    key = empty(b);
                    inValue = true;
                    continue;
                }

                boolean isSpace = Character.isWhitespace(c);
                // An unquoted value keeps a trailing slash (href=x/).
                boolean isTrailingSlash =
                    c == '/' && o == end - 1
                    && b.length() > 0 && (!inValue || quoted);
                if (!isSpace && !isTrailingSlash)
                {
                    b.append(c);
                    continue;
                }

                if (pending) flushAttribute(node, key, b, inValue);
                // Reset, so that the next attribute cannot inherit
                // this one's key or value.
                key = null;
                inValue = false;
                quoted = false;
                if (isTrailingSlash) b.append(c);
            }
            if (b.length() > 0 || (inValue && quoted))
            {
                flushAttribute(node, key, b, inValue);
            }
            returnee.add(node);
        }
        return returnee;
    }

    /**
     * Decodes HTML escapes in the value of every node and in the key
     * and value of every child of every node. Operates in place and
     * returns the same tree.
     */
    private static Tree<String>
    evaluateHtmlEscapes(Tree<String> nodes)
    {
        for (Tree<String> node: nodes)
        {
            node.value = evaluateHtmlEscapes(node.value);
            for (Tree<String> attr: node)
            {
                attr.key = evaluateHtmlEscapes(attr.key);
                attr.value = evaluateHtmlEscapes(attr.value);
            }
        }
        return nodes;
    }

    /**
     * Splits text nodes so that every whitespace-delimited word of the
     * form {@code :name:} becomes its own emoji node. Other nodes are
     * passed through unchanged. Text nodes that would be empty are not
     * emitted.
     */
    private static Tree<String>
    distinguishEmojisFromText(Tree<String> nodes)
    {
        Tree<String> returnee = new Tree<String>();
        for (Tree<String> node: nodes)
        {
            if (!node.key.equals("text"))
            {
                returnee.add(node);
                continue;
            }

            StringBuilder b = new StringBuilder();
            for (String segment: distinguishWhitespaceFromText(node.value))
            {
                // Needs at least one character between the colons, a
                // lone ":" or "::" is not a shortcode.
                boolean isEmoji =
                    segment.length() > 2
                    && segment.startsWith(":")
                    && segment.endsWith(":");
                if (!isEmoji)
                {
                    b.append(segment);
                    continue;
                }
                if (b.length() > 0) returnee.add(newNode("text", empty(b)));
                returnee.add(newNode("emoji", segment));
            }
            if (b.length() > 0) returnee.add(newNode("text", empty(b)));
        }
        return returnee;
    }

    /**
     * Nests the flat node list according to opening and closing tags.
     *
     * Tags that carry a {@code "/"} child, and {@code br}, are treated
     * as self-closing. Tag nodes without a name are skipped. A closing
     * tag with no matching open element is ignored; one that matches
     * an outer element also closes the elements nested inside it.
     * Elements still open at the end of the input are closed there.
     *
     * @return the synthetic root node.
     */
    private static Tree<String>
    hierarchise(Tree<String> nodes)
    {
        Tree<String> root = new Tree<String>();
        root.key = "tag";
        root.add(new Tree<>("html", null));
        root.add(new Tree<>("children", null));

        // Stack of currently open elements, innermost on top and the
        // root always at the bottom.
        Deque<Tree<String>> parents = new LinkedList<>();
        parents.push(root);
        for (Tree<String> node: nodes)
        {
            if (!node.key.equals("tag"))
            {
                parents.peek().get("children").add(node);
                continue;
            }
            // "<>" yields a tag without a name, nothing to nest.
            if (node.size() == 0) continue;

            String tagName = node.get(0).key;
            if (tagName.startsWith("/"))
            {
                closeElement(parents, tagName.substring(1));
                continue;
            }

            assert node.get("children") == null;
            node.add(new Tree<>("children", null));

            boolean selfClosing = node.get("/") != null;
            selfClosing |= tagName.equalsIgnoreCase("br");
            if (selfClosing) parents.peek().get("children").add(node);
            else parents.push(node);
        }
        // Close whatever the markup left open, so that the root is
        // what gets returned and no content is lost.
        while (parents.size() > 1) closeTopmost(parents);
        return parents.pop();
    }

    /**
     * Closes the innermost open element named tagName along with
     * everything nested inside it. Does nothing if no open element
     * has that name. The root at the bottom of the stack is never
     * closed.
     */
    private static void
    closeElement(Deque<Tree<String>> parents, String tagName)
    {
        int openCount = parents.size() - 1;
        int seen = 0, toClose = 0;
        // Iteration runs from the top of the stack downwards.
        for (Tree<String> open: parents)
        {
            if (seen == openCount) break;
            ++seen;
            if (open.get(0).key.equalsIgnoreCase(tagName))
            {
                toClose = seen;
                break;
            }
        }
        for (int i = 0; i < toClose; ++i) closeTopmost(parents);
    }

    /**
     * Pops the innermost open element and files it under the children
     * of the element below it. The stack must hold at least two
     * elements.
     */
    private static void
    closeTopmost(Deque<Tree<String>> parents)
    {
        Tree<String> closed = parents.pop();
        parents.peek().get("children").add(closed);
    }

    /**
     * Creates a childless node with the given key and value.
     */
    private static Tree<String>
    newNode(String key, String value)
    {
        Tree<String> node = new Tree<String>();
        node.key = key;
        node.value = value;
        return node;
    }

    /**
     * Adds the attribute collected so far to node, emptying b. When
     * inValue is set, b holds the value belonging to key; otherwise b
     * holds the name of an attribute without a value.
     */
    private static void
    flushAttribute(
        Tree<String> node, String key,
        StringBuilder b, boolean inValue)
    {
        if (inValue) node.add(newNode(key, empty(b)));
        else node.add(newNode(empty(b), null));
    }

    /**
     * Returns the contents of b and clears it.
     */
    private static String
    empty(StringBuilder b)
    {
        String s = b.toString();
        b.setLength(0);
        return s;
    }

    /**
     * Splits text into alternating runs of whitespace and
     * non-whitespace characters, in order. Concatenating the result
     * gives back the text. Empty runs are never emitted.
     */
    private static List<String>
    distinguishWhitespaceFromText(String text)
    {
        List<String> returnee = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        boolean inWhitespace = false;
        for (char c: text.toCharArray())
        {
            boolean w = Character.isWhitespace(c);
            if (w != inWhitespace)
            {
                if (segment.length() > 0) returnee.add(empty(segment));
                inWhitespace = w;
            }
            segment.append(c);
        }
        if (segment.length() > 0) returnee.add(empty(segment));
        return returnee;
    }

    /**
     * Decodes the escapes {@code &lt; &gt; &amp; &quot; &apos;} and
     * numeric character references (decimal or hexadecimal, which
     * includes {@code &#39;}).
     *
     * Anything else is kept exactly as written, including unknown
     * named references and ampersands that do not start an escape.
     * Decoded text is never decoded a second time.
     *
     * @return the decoded string, or null if string is null.
     */
    private static String
    evaluateHtmlEscapes(String string)
    {
        if (string == null) return string;

        // Enough for every escape decoded here. The bound also keeps
        // the scan linear on text with many stray ampersands.
        final int maxEscapeLength = 12;

        StringBuilder whole = new StringBuilder();
        int o = 0;
        while (o < string.length())
        {
            char c = string.charAt(o);
            String decoded = null;
            int end = -1;
            if (c == '&')
            {
                int limit = Math.min(string.length(), o + maxEscapeLength);
                for (int p = o + 1; p < limit; ++p)
                {
                    char d = string.charAt(p);
                    // Another '&' means this one was a plain ampersand.
                    if (d == '&') break;
                    if (d == ';') { end = p; break; }
                }
                if (end >= 0)
                {
                    decoded = decodeEscape(string.substring(o, end + 1));
                }
            }

            if (decoded == null)
            {
                whole.append(c);
                ++o;
            }
            else
            {
                whole.append(decoded);
                o = end + 1;
            }
        }
        return whole.toString();
    }

    /**
     * Decodes a single escape, given with its leading {@code '&'} and
     * trailing {@code ';'}.
     *
     * @return the text the escape stands for, or null if the escape
     *         is not recognised.
     */
    private static String
    decodeEscape(String escape)
    {
        switch (escape)
        {
            case "&lt;": return "<";
            case "&gt;": return ">";
            case "&amp;": return "&";
            case "&quot;": return "\"";
            case "&apos;": return "'";
            default: break;
        }
        if (!escape.startsWith("&#")) return null;

        String digits = escape.substring(2, escape.length() - 1);
        int radix = 10;
        if (digits.startsWith("x") || digits.startsWith("X"))
        {
            radix = 16;
            digits = digits.substring(1);
        }
        if (digits.isEmpty()) return null;
        for (char d: digits.toCharArray())
        {
            // ASCII digits only; this also rules out a sign.
            if (d > 0x7F || Character.digit(d, radix) < 0) return null;
        }

        int codePoint;
        try
        {
            codePoint = Integer.parseInt(digits, radix);
        }
        catch (NumberFormatException eNf)
        {
            // Too large for an int, so not a code point either.
            return null;
        }
        boolean isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        boolean valid =
            codePoint > 0
            && Character.isValidCodePoint(codePoint)
            && !isSurrogate;
        if (!valid) return null;
        return new String(Character.toChars(codePoint));
    }

}