/* copyright This file is part of JKomasto2. Written in 2022 by Usawashi This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . copyright */ import java.util.List; import java.util.ArrayList; import java.util.Deque; import java.util.LinkedList; import java.util.Locale; import java.text.BreakIterator; import cafe.biskuteri.hinoki.Tree; interface BasicHTMLParser { public static Tree parse(String html) { List segments; segments = distinguishTagsFromPcdata(html); Tree document; document = toNodes(segments); document = splitText(document); document = evaluateHtmlEscapes(document); document = hierarchise(document); return document; } // - -%- - private static List distinguishTagsFromPcdata(String html) { List returnee = new ArrayList<>(); StringBuilder segment = new StringBuilder(); boolean inTag = false; for (char c: html.toCharArray()) { if (c == '<') { String addee = empty(segment); if (!addee.isEmpty()) returnee.add(addee); inTag = true; segment.append(c); } else if (c == '>') { assert inTag; assert segment.length() > 0; segment.append(c); returnee.add(empty(segment)); inTag = false; } else { segment.append(c); } } String addee = empty(segment); if (!addee.isEmpty()) returnee.add(addee); return returnee; } private static Tree toNodes(List segments) { Tree returnee = new Tree(); for (String segment: segments) { boolean isTag = segment.startsWith("<"); Tree node = new Tree(); if (!isTag) { node.key = "text"; node.value = segment; returnee.add(node); continue; } node.key = "tag"; String key = null, value = null; StringBuilder b = new StringBuilder(); boolean inQuotes = false, inValue = false; char[] chars = segment.toCharArray(); for (int o = 1; o < chars.length - 1; ++o) { char c = chars[o]; if (c == '"') { inQuotes = !inQuotes; } else if (inQuotes) { b.append(c); } else if (c == '=') { assert b.length() > 0; key = empty(b); inValue = true; } else if (Character.isWhitespace(c)) { if (b.length() > 0) { if (inValue) value = empty(b); else key = empty(b); Tree attr = new Tree(); attr.key = key; attr.value = value; node.add(attr); } inValue = false; } else { b.append(c); } } if (b.length() > 0) { if (inValue) value = empty(b); else key = empty(b); Tree attr = new Tree(); attr.key = key; attr.value = value; node.add(attr); } returnee.add(node); } return returnee; } private static Tree splitText(Tree nodes) { Tree returnee = new Tree<>(); for (Tree node: nodes) { if (node.key.equals("tag")) { returnee.add(node); continue; } assert node.key.equals("text"); StringBuilder b = new StringBuilder(); boolean alnum = false, calnum; boolean space = false, cspace; boolean emoji = false; for (char c: node.value.toCharArray()) { calnum = isMastodonAlnum(c); cspace = Character.isWhitespace(c); if (c == ':' && !emoji) { // See note on #isMastodonAlnum. if (b.length() > 0) { Tree addee = new Tree<>(); addee.key = space ? "space" : "text"; addee.value = empty(b); returnee.add(addee); } emoji = true; b.append(c); } else if (c == ':' && emoji) { assert !space; b.append(c); Tree addee = new Tree<>(); addee.key = "emoji"; addee.value = empty(b); returnee.add(addee); /* * Technically, addee.value.length() * could be zero, which probably means * someone just put two colons in a row, * maybe for Haskell source code. I'd * be surprised if Mastodon didn't escape * it. (If they did, the next step will * handle them.) Anyways treating it as * an empty emoji is the correct action. */ emoji = false; calnum = false; } else if (cspace != space) { if (b.length() > 0) { Tree addee = new Tree<>(); addee.key = space ? "space" : "text"; addee.value = empty(b); returnee.add(addee); } b.append(c); } else { b.append(c); } /* * We can specially handle special * characters like \n, but I'll opt not to. */ alnum = calnum; space = cspace; } if (b.length() > 0) { Tree addee = new Tree<>(); addee.key = space ? "space" : "text"; addee.value = empty(b); returnee.add(addee); } } return returnee; } private static Tree evaluateHtmlEscapes(Tree nodes) { for (Tree node: nodes) { node.value = evaluateHtmlEscapes(node.value); for (Tree attr: node) { attr.key = evaluateHtmlEscapes(attr.key); attr.value = evaluateHtmlEscapes(attr.value); } } return nodes; } private static Tree hierarchise(Tree nodes) { Tree root = new Tree(); root.key = "tag"; root.add(new Tree<>("html", null)); root.add(new Tree<>("children", null)); Deque> parents = new LinkedList<>(); parents.push(root); for (Tree node: nodes) { if (node.key.equals("tag")) { assert node.size() > 0; String tagName = node.get(0).key; assert node.get("children") == null; node.add(new Tree<>("children", null)); boolean isClosing, selfClosing; isClosing = tagName.startsWith("/"); selfClosing = node.get("/") != null; selfClosing |= tagName.equals("br"); if (isClosing) { assert parents.size() > 1; Tree parent, grandparent; parent = parents.pop(); grandparent = parents.peek(); String pTagName = parent.get(0).key; assert tagName.equals("/" + pTagName); grandparent.get("children").add(parent); } else if (selfClosing) { parents.peek().get("children").add(node); } else { parents.push(node); } } else { parents.peek().get("children").add(node); } } assert parents.size() == 1; return parents.pop(); } private static String empty(StringBuilder b) { String s = b.toString(); b.delete(0, b.length()); return s; } private static boolean isMastodonAlnum(char c) { return Character.isLetterOrDigit(c); /* * Not joking. Mastodon is using the POSIX :alnum: regex * character class here (/app/lib/emoji_formatter.rb; * ruby-doc§Regexp). It prevents emojis preceeded by * Japanese like さ too, but not punctuation like tildes * or full stops. This is server-enforced, the web client * does string substitution and supports anything. * (To see this, make a post with an emoji preceeded * by text, then try again with the same emoji also * present elsewhere in the post at a valid position.) */ } private static String evaluateHtmlEscapes(String string) { if (string == null) return string; StringBuilder whole = new StringBuilder(); StringBuilder part = new StringBuilder(); boolean inEscape = false; for (char c: string.toCharArray()) { if (inEscape && c == ';') { part.append(c); inEscape = false; String v = empty(part); if (v.equals("<")) part.append('<'); if (v.equals(">")) part.append('>'); if (v.equals("&")) part.append('&'); if (v.equals(""")) part.append('"'); if (v.equals("'")) part.append('\''); if (v.equals("'")) part.append('\''); } else if (!inEscape && c == '&') { String v = empty(part); if (!v.isEmpty()) whole.append(v); part.append(c); inEscape = true; } else { part.append(c); } } String v = empty(part); if (!v.isEmpty()) whole.append(v); return whole.toString(); } }