biskuteri-cafe-JKomasto2/RudimentaryHTMLParser.java

269 lines
8.1 KiB
Java

/* copyright
This file is part of JKomasto2.
Written in 2022 by Usawashi <usawashi16@yahoo.co.jp>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
copyright */
import cafe.biskuteri.hinoki.Tree;
import java.util.List;
import java.util.ArrayList;
import java.io.StringReader;
import java.io.Reader;
import java.io.IOException;
/**
 * A small, forgiving HTML reader that produces a flat ("depthless")
 * list of nodes instead of a nested element tree.
 *
 * The result of {@link #depthlessRead(String)} is a tree whose
 * children appear in document order and carry one of three keys:
 * "text" (value is the decoded text), "emoji" (value is a
 * ":name:" segment found in text) and "tag" (value is null; the
 * children are the tag's parts, the first normally being the tag
 * name and the rest its attributes as key/value).
 */
class RudimentaryHTMLParser {

    /*
    * Upper bound on how many characters are buffered while waiting
    * for the ';' that ends a character reference. Past this, the
    * '&' is taken to be ordinary text.
    */
    private static final int MAX_ENTITY_LENGTH = 32;

    /**
     * Parses the given HTML into a flat sequence of text, emoji
     * and tag nodes.
     *
     * @param html the markup to read; must not be null
     * @return a tree whose children are the parsed nodes
     */
    public static Tree<String>
    depthlessRead(String html)
    {
        try {
            return pass3(pass2(pass1(html)));
        }
        catch (IOException eIo) {
            /*
            * We use only StringReaders, which only throw an
            * IOException when they are read after being closed.
            * And we don't close them. Should that ever change,
            * fail loudly (with or without -ea) rather than hand
            * the caller a null.
            */
            throw new AssertionError(eIo);
        }
    }

    // - -%- -

    /**
     * First pass: splits the markup into "text" and "tag" nodes and
     * decodes character references.
     *
     * The value of a "tag" node is the raw text between '<' and
     * '>'. Within a quoted attribute value, decoded characters and
     * literal backslashes are prefixed with a backslash so that
     * pass2 takes them literally.
     */
    private static Tree<String>
    pass1(String html)
    throws IOException
    {
        Reader r = new StringReader(html);
        Tree<String> docu = new Tree<String>();
        StringBuilder text = new StringBuilder();
        StringBuilder htmlEscape = new StringBuilder();
        boolean inTag = false, quoted = false;
        int c; while ((c = r.read()) != -1)
        {
            if (htmlEscape.length() > 0)
            {
                if (c == ';')
                {
                    htmlEscape.append(';');
                    String decoded = decodeEntity(empty(htmlEscape));
                    for (int i = 0; i < decoded.length(); ++i)
                    {
                        /*
                        * Only ever emit the escaping backslash
                        * together with a character, otherwise it
                        * would escape whatever happens to follow.
                        */
                        if (quoted) text.append('\\');
                        text.append(decoded.charAt(i));
                    }
                    continue;
                }
                boolean room = htmlEscape.length() < MAX_ENTITY_LENGTH;
                if (room && isEntityChar(c))
                {
                    htmlEscape.append((char)c);
                    continue;
                }
                /*
                * This was not a character reference after all
                * (e.g. "AT&T"). Keep what we buffered as plain
                * text, then handle the current character normally.
                */
                text.append(empty(htmlEscape));
            }
            if (c == '&')
            {
                htmlEscape.append('&');
                continue;
            }
            if (inTag && c == '"')
            {
                /*
                * Quotes only delimit attribute values. A quote in
                * body text is ordinary text and must not hide the
                * tags that follow it.
                */
                text.append('"');
                quoted = !quoted;
                continue;
            }
            if (quoted)
            {
                // pass2 treats a backslash as an escape, so double it.
                if (c == '\\') text.append('\\');
                text.append((char)c);
                continue;
            }
            if (c == '<')
            {
                if (text.length() > 0)
                    docu.add(leaf("text", empty(text)));
                inTag = true;
                continue;
            }
            if (c == '>' && inTag)
            {
                docu.add(leaf("tag", empty(text)));
                inTag = false;
                continue;
            }
            // Includes a stray '>' outside of any tag.
            text.append((char)c);
        }
        // An '&' still pending at the end of input is plain text.
        text.append(empty(htmlEscape));
        if (text.length() > 0)
            docu.add(leaf("text", empty(text)));
        return docu;
    }

    /**
     * Second pass: breaks the raw value of every "tag" node into
     * parts, adds them as children of the node and clears the
     * node's value.
     *
     * A part is either a bare word (key only) or key=value. An
     * attribute with an empty value, such as class="", is kept as
     * a part whose value is the empty string.
     */
    private static Tree<String>
    pass2(Tree<String> docu)
    throws IOException
    {
        for (Tree<String> node: docu.children)
        {
            if (!node.key.equals("tag")) continue;

            Reader r = new StringReader(node.value);
            Tree<String> part = new Tree<String>();
            boolean escaped = false, quoted = false;
            StringBuilder field = new StringBuilder();
            int c; while ((c = r.read()) != -1)
            {
                if (escaped) {
                    field.append((char)c);
                    escaped = false;
                    continue;
                }
                if (c == '\\') {
                    escaped = true;
                    continue;
                }
                if (c == '"') {
                    quoted = !quoted;
                    continue;
                }
                if (quoted) {
                    field.append((char)c);
                    continue;
                }
                if (c == '=' && part.key == null) {
                    /*
                    * Only the first '=' separates key from value,
                    * later ones belong to an unquoted value.
                    */
                    part.key = empty(field);
                    continue;
                }
                if (Character.isWhitespace(c)) {
                    if (finishPart(part, field)) {
                        node.add(part);
                        part = new Tree<String>();
                    }
                    continue;
                }
                field.append((char)c);
            }
            if (finishPart(part, field)) node.add(part);
            node.value = null;
        }
        return docu;
    }

    /**
     * Third pass: copies the nodes into a new tree, splitting every
     * "text" node around whitespace-delimited ":name:" segments,
     * which become "emoji" nodes. Other nodes are passed through.
     */
    private static Tree<String>
    pass3(Tree<String> docu)
    {
        Tree<String> returnee = new Tree<String>();
        for (Tree<String> node: docu)
        {
            if (!node.key.equals("text"))
            {
                returnee.add(node);
                continue;
            }

            StringBuilder value = new StringBuilder();
            for (String segment: whitespaceSplit(node.value))
            {
                if (!isShortcode(segment))
                {
                    value.append(segment);
                    continue;
                }
                // Flush the text gathered so far, if there is any.
                if (value.length() > 0)
                    returnee.add(leaf("text", empty(value)));
                returnee.add(leaf("emoji", segment));
            }
            if (value.length() > 0)
                returnee.add(leaf("text", empty(value)));
        }
        return returnee;
    }

    /**
     * Returns the contents of the builder and leaves it empty.
     */
    private static String
    empty(StringBuilder b)
    {
        String contents = b.toString();
        b.setLength(0);
        return contents;
    }

    /**
     * Splits text into alternating runs of non-whitespace and
     * whitespace, in order. Joining the returned segments yields
     * the original text; no empty segments are returned.
     */
    private static List<String>
    whitespaceSplit(String text)
    {
        List<String> returnee = new ArrayList<>();
        StringBuilder segment = new StringBuilder();
        boolean isWhitespace = false;
        for (char c: text.toCharArray())
        {
            boolean diff = isWhitespace ^ Character.isWhitespace(c);
            if (diff) {
                // Nothing is pending when the text opens with whitespace.
                if (segment.length() > 0)
                    returnee.add(empty(segment));
                isWhitespace = !isWhitespace;
            }
            segment.append(c);
        }
        if (segment.length() > 0)
            returnee.add(empty(segment));
        return returnee;
    }

    /**
     * Creates a childless node with the given key and value.
     */
    private static Tree<String>
    leaf(String key, String value)
    {
        Tree<String> node = new Tree<String>();
        node.key = key;
        node.value = value;
        return node;
    }

    /**
     * Completes the tag part being built from the pending field.
     * The field becomes the value if the part already has a key,
     * and the key otherwise.
     *
     * @return true if the part is complete and should be added to
     *         its tag, false if there was nothing to complete
     */
    private static boolean
    finishPart(Tree<String> part, StringBuilder field)
    {
        boolean named = part.key != null && !part.key.isEmpty();
        if (field.length() == 0 && !named) return false;
        /*
        * A named part with nothing pending is an attribute with an
        * empty value (class=""). It must be emitted here, or its
        * key would capture the next bare attribute as its value.
        */
        if (part.key != null) part.value = empty(field);
        else part.key = empty(field);
        return true;
    }

    /**
     * Whether the segment looks like ":name:", with at least one
     * character between the colons.
     */
    private static boolean
    isShortcode(String segment)
    {
        return segment.length() > 2
            && segment.startsWith(":")
            && segment.endsWith(":");
    }

    /**
     * Whether c may appear between the '&' and the ';' of a
     * character reference.
     */
    private static boolean
    isEntityChar(int c)
    {
        return c == '#'
            || (c >= '0' && c <= '9')
            || (c >= 'a' && c <= 'z')
            || (c >= 'A' && c <= 'Z');
    }

    /**
     * Decodes one complete character reference.
     *
     * @param entity the reference including its '&' and ';'
     * @return the text it stands for, or the reference itself,
     *         unchanged, if it is not recognised
     */
    private static String
    decodeEntity(String entity)
    {
        if (entity.equals("&lt;")) return "<";
        if (entity.equals("&gt;")) return ">";
        if (entity.equals("&amp;")) return "&";
        if (entity.equals("&quot;")) return "\"";
        if (entity.equals("&apos;")) return "'";
        if (entity.equals("&nbsp;")) return "\u00A0";

        if (entity.startsWith("&#") && entity.length() > 3)
        {
            // Numeric reference, decimal (&#39;) or hex (&#x27;).
            String digits = entity.substring(2, entity.length() - 1);
            boolean hex = digits.startsWith("x") || digits.startsWith("X");
            try {
                int codePoint = hex
                    ? Integer.parseInt(digits.substring(1), 16)
                    : Integer.parseInt(digits, 10);
                if (codePoint > 0 && Character.isValidCodePoint(codePoint))
                    return new String(Character.toChars(codePoint));
            }
            catch (NumberFormatException ignored) {
                // Not a number: fall through and keep it as written.
            }
        }
        return entity;
    }

    // ---%-@-%---

    /**
     * Development scaffold.
     * NOTE(review): EX2 is declared but nothing is done with it.
     */
    public static void
    main(String... args)
    {
        final String EX2 =
            "<p>2000s energy<br /><a href=\"http://www.pspad.com/en/screenshot.htm\" rel=\"nofollow noopener noreferrer\" target=\"_blank\"><span class=\"invisible\">http://www.</span><span class=\"\">pspad.com/en/screenshot.htm</span><span class=\"invisible\"></span></a></p>";
    }

}