mirror of
https://gitlab.com/biskuteri-cafe/JKomasto2.git
synced 2025-01-08 22:34:45 +01:00
71b9c496c4
Implemented hierarchical HTML parser.
328 lines
6.6 KiB
Java
328 lines
6.6 KiB
Java
|
|
import java.util.List;
|
|
import java.util.ArrayList;
|
|
import java.util.Deque;
|
|
import java.util.LinkedList;
|
|
import java.util.Locale;
|
|
import java.text.BreakIterator;
|
|
import cafe.biskuteri.hinoki.Tree;
|
|
|
|
interface
|
|
BasicHTMLParser {
|
|
|
|
public static Tree<String>
|
|
parse(String html)
|
|
{
|
|
List<String> segments;
|
|
segments = distinguishTagsFromPcdata(html);
|
|
segments = evaluateHtmlEscapes(segments);
|
|
|
|
Tree<String> document;
|
|
document = toNodes(segments);
|
|
document = distinguishEmojisFromText(document);
|
|
document = hierarchise(document);
|
|
|
|
return document;
|
|
}
|
|
|
|
// - -%- -
|
|
|
|
private static List<String>
|
|
distinguishTagsFromPcdata(String html)
|
|
{
|
|
List<String> returnee = new ArrayList<>();
|
|
StringBuilder segment = new StringBuilder();
|
|
boolean inTag = false;
|
|
for (char c: html.toCharArray())
|
|
{
|
|
if (c == '<')
|
|
{
|
|
String addee = empty(segment);
|
|
if (!addee.isEmpty()) returnee.add(addee);
|
|
inTag = true;
|
|
segment.append(c);
|
|
}
|
|
else if (c == '>')
|
|
{
|
|
assert inTag;
|
|
assert segment.length() > 0;
|
|
segment.append(c);
|
|
returnee.add(empty(segment));
|
|
inTag = false;
|
|
}
|
|
else
|
|
{
|
|
segment.append(c);
|
|
}
|
|
}
|
|
String addee = empty(segment);
|
|
if (!addee.isEmpty()) returnee.add(addee);
|
|
|
|
return returnee;
|
|
}
|
|
|
|
private static List<String>
|
|
evaluateHtmlEscapes(List<String> strings)
|
|
{
|
|
List<String> returnee = new ArrayList<>();
|
|
|
|
for (String string: strings)
|
|
{
|
|
StringBuilder whole = new StringBuilder();
|
|
StringBuilder part = new StringBuilder();
|
|
boolean inEscape = false;
|
|
for (char c: string.toCharArray())
|
|
{
|
|
if (inEscape && c == ';')
|
|
{
|
|
part.append(c);
|
|
inEscape = false;
|
|
String v = empty(part);
|
|
if (v.equals("<")) part.append('<');
|
|
if (v.equals(">")) part.append('>');
|
|
if (v.equals("&")) part.append('&');
|
|
if (v.equals(""")) part.append('"');
|
|
if (v.equals("'")) part.append('\'');
|
|
if (v.equals("'")) part.append('\'');
|
|
}
|
|
else if (!inEscape && c == '&')
|
|
{
|
|
String v = empty(part);
|
|
if (!v.isEmpty()) whole.append(v);
|
|
part.append(c);
|
|
inEscape = true;
|
|
}
|
|
else
|
|
{
|
|
part.append(c);
|
|
}
|
|
}
|
|
String v = empty(part);
|
|
if (!v.isEmpty()) whole.append(v);
|
|
|
|
returnee.add(empty(whole));
|
|
}
|
|
|
|
return returnee;
|
|
}
|
|
|
|
private static Tree<String>
|
|
toNodes(List<String> segments)
|
|
{
|
|
Tree<String> returnee = new Tree<String>();
|
|
|
|
for (String segment: segments)
|
|
{
|
|
boolean isTag = segment.startsWith("<");
|
|
Tree<String> node = new Tree<String>();
|
|
|
|
if (!isTag)
|
|
{
|
|
node.key = "text";
|
|
node.value = segment;
|
|
returnee.add(node);
|
|
continue;
|
|
}
|
|
|
|
node.key = "tag";
|
|
|
|
String key = null, value = null;
|
|
StringBuilder b = new StringBuilder();
|
|
boolean inQuotes = false, inValue = false;
|
|
char[] chars = segment.toCharArray();
|
|
for (int o = 1; o < chars.length - 1; ++o)
|
|
{
|
|
char c = chars[o];
|
|
if (c == '"')
|
|
{
|
|
inQuotes = !inQuotes;
|
|
}
|
|
else if (inQuotes)
|
|
{
|
|
b.append(c);
|
|
}
|
|
else if (c == '=')
|
|
{
|
|
assert b.length() > 0;
|
|
key = empty(b);
|
|
inValue = true;
|
|
}
|
|
else if (Character.isWhitespace(c))
|
|
{
|
|
if (b.length() > 0)
|
|
{
|
|
if (inValue) value = empty(b);
|
|
else key = empty(b);
|
|
Tree<String> attr = new Tree<String>();
|
|
attr.key = key;
|
|
attr.value = value;
|
|
node.add(attr);
|
|
}
|
|
inValue = false;
|
|
}
|
|
else
|
|
{
|
|
b.append(c);
|
|
}
|
|
}
|
|
if (b.length() > 0)
|
|
{
|
|
if (inValue) value = empty(b);
|
|
else key = empty(b);
|
|
Tree<String> attr = new Tree<String>();
|
|
attr.key = key;
|
|
attr.value = value;
|
|
node.add(attr);
|
|
}
|
|
|
|
returnee.add(node);
|
|
}
|
|
|
|
return returnee;
|
|
}
|
|
|
|
private static Tree<String>
|
|
distinguishEmojisFromText(Tree<String> nodes)
|
|
{
|
|
Tree<String> returnee = new Tree<String>();
|
|
|
|
for (Tree<String> node: nodes)
|
|
{
|
|
if (!node.key.equals("text"))
|
|
{
|
|
returnee.add(node);
|
|
continue;
|
|
}
|
|
|
|
List<String> segments;
|
|
segments = distinguishWhitespaceFromText(node.value);
|
|
StringBuilder b = new StringBuilder();
|
|
for (String segment: segments)
|
|
{
|
|
boolean starts = segment.startsWith(":");
|
|
boolean ends = segment.endsWith(":");
|
|
if (starts && ends)
|
|
{
|
|
Tree<String> text = new Tree<String>();
|
|
text.key = "text";
|
|
text.value = empty(b);
|
|
returnee.add(text);
|
|
Tree<String> emoji = new Tree<String>();
|
|
emoji.key = "emoji";
|
|
emoji.value = segment;
|
|
returnee.add(emoji);
|
|
}
|
|
else
|
|
{
|
|
b.append(segment);
|
|
}
|
|
}
|
|
if (b.length() > 0)
|
|
{
|
|
Tree<String> text = new Tree<String>();
|
|
text.key = "text";
|
|
text.value = empty(b);
|
|
returnee.add(text);
|
|
}
|
|
}
|
|
|
|
return returnee;
|
|
}
|
|
|
|
private static Tree<String>
|
|
hierarchise(Tree<String> nodes)
|
|
{
|
|
Tree<String> root = new Tree<String>();
|
|
root.add(new Tree<>("attributes", null));
|
|
root.get(0).add(new Tree<>("html", null));
|
|
root.add(new Tree<>("children", null));
|
|
|
|
Deque<Tree<String>> parents = new LinkedList<>();
|
|
parents.push(root);
|
|
for (Tree<String> node: nodes)
|
|
{
|
|
if (node.key.equals("tag"))
|
|
{
|
|
assert node.size() > 0;
|
|
String tagName = node.get(0).key;
|
|
|
|
boolean isClosing, selfClosing;
|
|
isClosing = tagName.startsWith("/");
|
|
selfClosing = node.get("/") != null;
|
|
selfClosing |= tagName.equals("br");
|
|
if (isClosing)
|
|
{
|
|
assert parents.size() > 1;
|
|
|
|
Tree<String> parent, grandparent;
|
|
parent = parents.pop();
|
|
grandparent = parents.peek();
|
|
|
|
assert tagName.equals(
|
|
"/"
|
|
+ parent.get("attributes").get(0).key
|
|
);
|
|
|
|
grandparent.get("children").add(parent);
|
|
}
|
|
else if (selfClosing)
|
|
{
|
|
Tree<String> elem = new Tree<String>();
|
|
node.key = "attributes";
|
|
elem.add(node);
|
|
elem.add(new Tree<>("children", null));
|
|
|
|
parents.peek().get("children").add(elem);
|
|
}
|
|
else
|
|
{
|
|
Tree<String> elem = new Tree<String>();
|
|
node.key = "attributes";
|
|
elem.add(node);
|
|
elem.add(new Tree<>("children", null));
|
|
|
|
parents.push(elem);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
parents.peek().get("children").add(node);
|
|
}
|
|
}
|
|
|
|
assert parents.size() == 1;
|
|
return parents.pop();
|
|
}
|
|
|
|
private static String
|
|
empty(StringBuilder b)
|
|
{
|
|
String s = b.toString();
|
|
b.delete(0, b.length());
|
|
return s;
|
|
}
|
|
|
|
private static List<String>
|
|
distinguishWhitespaceFromText(String text)
|
|
{
|
|
List<String> returnee = new ArrayList<>();
|
|
|
|
StringBuilder segment = new StringBuilder();
|
|
boolean inWhitespace = false;
|
|
for (char c: text.toCharArray())
|
|
{
|
|
boolean w = Character.isWhitespace(c);
|
|
boolean change = w ^ inWhitespace;
|
|
if (change)
|
|
{
|
|
returnee.add(empty(segment));
|
|
inWhitespace = !inWhitespace;
|
|
}
|
|
segment.append(c);
|
|
}
|
|
returnee.add(empty(segment));
|
|
|
|
return returnee;
|
|
}
|
|
|
|
} |