/* copyright This file is part of JKomasto2. Written in 2022 by Usawashi This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . copyright */ import cafe.biskuteri.hinoki.Tree; import java.util.List; import java.util.ArrayList; import java.io.StringReader; import java.io.Reader; import java.io.IOException; class RudimentaryHTMLParser { public static Tree depthlessRead(String html) { try { return pass3(pass2(pass1(html))); } catch (IOException eIo) { assert false; /* * We use only StringReaders, which only throw an * IOException when they are read after being closed. * And we don't close them. */ return null; } } // - -%- - private static Tree pass1(String html) throws IOException { Reader r = new StringReader(html); Tree docu = new Tree(); StringBuilder text = new StringBuilder(); StringBuilder emoji = new StringBuilder(); StringBuilder htmlEscape = new StringBuilder(); boolean quoted = false, inEmoji = false; int c; while ((c = r.read()) != -1) { if (c == '&' || htmlEscape.length() > 0) { htmlEscape.append((char)c); if (c == ';') { String s = empty(htmlEscape); if (quoted) text.append('\\'); /* * If we're quoted (i.e. within unescaped * quotes), we're in a tag, add escaping * backslash for pass2 to work with. * Only necessary for the quotes, but, * might as well uniformly use. */ if (s.equals("<")) text.append('<'); if (s.equals(">")) text.append('>'); if (s.equals("&")) text.append('&'); if (s.equals(""")) text.append('"'); if (s.equals("'")) text.append('\''); if (s.equals("'")) text.append('\''); } continue; } if (c == '"') { text.append((char)c); quoted = !quoted; continue; } if (!quoted) { if (c == '<') { if (text.length() > 0) { Tree node = new Tree<>(); node.key = "text"; node.value = empty(text); docu.add(node); } continue; } if (c == '>') { Tree node = new Tree<>(); node.key = "tag"; node.value = empty(text); docu.add(node); continue; } } text.append((char)c); continue; } if (text.length() > 0) { Tree node = new Tree<>(); node.key = "text"; node.value = empty(text); docu.add(node); } return docu; } private static Tree pass2(Tree docu) throws IOException { for (Tree node: docu.children) { if (!node.key.equals("tag")) continue; Reader r = new StringReader(node.value); Tree part = new Tree(); boolean escaped = false, quoted = false; StringBuilder field = new StringBuilder(); int c; while ((c = r.read()) != -1) { if (escaped) { field.append((char)c); escaped = false; continue; } if (c == '\\') { escaped = true; continue; } if (c == '"') { quoted = !quoted; continue; } if (quoted) { field.append((char)c); continue; } if (c == '=') { part.key = empty(field); continue; } if (c == ' ') { if (field.length() > 0) { boolean v = part.key != null; if (v) part.value = empty(field); else part.key = empty(field); node.add(part); part = new Tree(); } continue; } field.append((char)c); } if (field.length() > 0) { boolean v = part.key != null; if (v) part.value = empty(field); else part.key = empty(field); node.add(part); } node.value = null; } return docu; } private static Tree pass3(Tree docu) { Tree returnee = new Tree(); for (Tree node: docu) { if (!node.key.equals("text")) { returnee.add(node); continue; } StringBuilder value = new StringBuilder(); for (String segment: whitespaceSplit(node.value)) { boolean st = segment.startsWith(":"); boolean ed = segment.endsWith(":"); if (st && ed) { Tree text = new Tree(); text.key = "text"; text.value = empty(value); returnee.add(text); Tree emoji = new Tree(); emoji.key = "emoji"; emoji.value = segment; returnee.add(emoji); } else { value.append(segment); } } if (value.length() > 0) { Tree text = new Tree(); text.key = "text"; text.value = empty(value); returnee.add(text); } } return returnee; } private static String empty(StringBuilder b) { String s = b.toString(); b.delete(0, b.length()); return s; } private static List whitespaceSplit(String text) { List returnee = new ArrayList<>(); StringBuilder segment = new StringBuilder(); boolean isWhitespace = false; for (char c: text.toCharArray()) { boolean diff = isWhitespace ^ Character.isWhitespace(c); if (diff) { returnee.add(empty(segment)); isWhitespace = !isWhitespace; } segment.append(c); } returnee.add(empty(segment)); return returnee; } // ---%-@-%--- public static void main(String... args) { final String EX2 = "

2000s energy
http://www.pspad.com/en/screenshot.htm

"; } }