2022-06-02 18:28:54 +02:00
|
|
|
/* copyright
|
|
|
|
|
|
|
|
This file is part of JKomasto2.
|
|
|
|
Written in 2022 by Usawashi <usawashi16@yahoo.co.jp>
|
|
|
|
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
|
|
it under the terms of the GNU General Public License as published by
|
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
|
|
(at your option) any later version.
|
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
GNU General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
copyright */
|
2022-04-12 08:37:39 +02:00
|
|
|
|
|
|
|
import cafe.biskuteri.hinoki.Tree;
|
2022-04-15 14:54:31 +02:00
|
|
|
import java.util.List;
|
2022-05-13 16:32:11 +02:00
|
|
|
import java.util.ArrayList;
|
2022-04-12 08:37:39 +02:00
|
|
|
import java.io.StringReader;
|
|
|
|
import java.io.Reader;
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
|
|
class
|
|
|
|
RudimentaryHTMLParser {
|
|
|
|
|
|
|
|
public static Tree<String>
|
|
|
|
depthlessRead(String html)
|
|
|
|
{
|
2022-06-02 20:41:59 +02:00
|
|
|
try {
|
|
|
|
return pass3(pass2(pass1(html)));
|
|
|
|
}
|
|
|
|
catch (IOException eIo) {
|
|
|
|
assert false;
|
|
|
|
/*
|
|
|
|
* We use only StringReaders, which only throw an
|
|
|
|
* IOException when they are read after being closed.
|
|
|
|
* And we don't close them.
|
|
|
|
*/
|
|
|
|
return null;
|
|
|
|
}
|
2022-04-12 08:37:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// - -%- -
|
|
|
|
|
|
|
|
private static Tree<String>
|
|
|
|
pass1(String html)
|
|
|
|
throws IOException
|
|
|
|
{
|
2022-06-02 20:41:59 +02:00
|
|
|
Reader r = new StringReader(html);
|
2022-04-12 08:37:39 +02:00
|
|
|
Tree<String> docu = new Tree<String>();
|
|
|
|
StringBuilder text = new StringBuilder();
|
2022-06-02 20:41:59 +02:00
|
|
|
StringBuilder emoji = new StringBuilder();
|
2022-04-12 08:37:39 +02:00
|
|
|
StringBuilder htmlEscape = new StringBuilder();
|
2022-04-15 14:54:31 +02:00
|
|
|
boolean quoted = false, inEmoji = false;
|
2022-04-12 08:37:39 +02:00
|
|
|
int c; while ((c = r.read()) != -1)
|
|
|
|
{
|
|
|
|
if (c == '&' || htmlEscape.length() > 0)
|
|
|
|
{
|
|
|
|
htmlEscape.append((char)c);
|
|
|
|
if (c == ';')
|
|
|
|
{
|
|
|
|
String s = empty(htmlEscape);
|
|
|
|
if (quoted) text.append('\\');
|
|
|
|
/*
|
|
|
|
* If we're quoted (i.e. within unescaped
|
|
|
|
* quotes), we're in a tag, add escaping
|
|
|
|
* backslash for pass2 to work with.
|
|
|
|
* Only necessary for the quotes, but,
|
|
|
|
* might as well uniformly use.
|
|
|
|
*/
|
|
|
|
if (s.equals("<")) text.append('<');
|
|
|
|
if (s.equals(">")) text.append('>');
|
|
|
|
if (s.equals("&")) text.append('&');
|
|
|
|
if (s.equals(""")) text.append('"');
|
|
|
|
if (s.equals("'")) text.append('\'');
|
2022-04-19 16:08:20 +02:00
|
|
|
if (s.equals("'")) text.append('\'');
|
2022-04-12 08:37:39 +02:00
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '"')
|
|
|
|
{
|
|
|
|
text.append((char)c);
|
|
|
|
quoted = !quoted;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!quoted)
|
|
|
|
{
|
|
|
|
if (c == '<')
|
|
|
|
{
|
|
|
|
if (text.length() > 0)
|
|
|
|
{
|
|
|
|
Tree<String> node = new Tree<>();
|
|
|
|
node.key = "text";
|
|
|
|
node.value = empty(text);
|
|
|
|
docu.add(node);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
Tree<String> node = new Tree<>();
|
|
|
|
node.key = "tag";
|
|
|
|
node.value = empty(text);
|
|
|
|
docu.add(node);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2022-06-02 20:41:59 +02:00
|
|
|
text.append((char)c);
|
2022-04-12 08:37:39 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (text.length() > 0)
|
|
|
|
{
|
|
|
|
Tree<String> node = new Tree<>();
|
|
|
|
node.key = "text";
|
|
|
|
node.value = empty(text);
|
|
|
|
docu.add(node);
|
|
|
|
}
|
|
|
|
return docu;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static Tree<String>
|
|
|
|
pass2(Tree<String> docu)
|
|
|
|
throws IOException
|
|
|
|
{
|
|
|
|
for (Tree<String> node: docu.children)
|
|
|
|
{
|
2022-04-15 14:54:31 +02:00
|
|
|
if (!node.key.equals("tag")) continue;
|
2022-04-12 08:37:39 +02:00
|
|
|
|
|
|
|
Reader r = new StringReader(node.value);
|
|
|
|
Tree<String> part = new Tree<String>();
|
|
|
|
boolean escaped = false, quoted = false;
|
|
|
|
StringBuilder field = new StringBuilder();
|
|
|
|
int c; while ((c = r.read()) != -1)
|
|
|
|
{
|
|
|
|
if (escaped) {
|
|
|
|
field.append((char)c);
|
|
|
|
escaped = false;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '\\') {
|
|
|
|
escaped = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '"') {
|
|
|
|
quoted = !quoted;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (quoted) {
|
|
|
|
field.append((char)c);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == '=') {
|
|
|
|
part.key = empty(field);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (c == ' ') {
|
|
|
|
if (field.length() > 0) {
|
|
|
|
boolean v = part.key != null;
|
|
|
|
if (v) part.value = empty(field);
|
|
|
|
else part.key = empty(field);
|
|
|
|
node.add(part);
|
|
|
|
part = new Tree<String>();
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
field.append((char)c);
|
|
|
|
}
|
|
|
|
if (field.length() > 0) {
|
|
|
|
boolean v = part.key != null;
|
|
|
|
if (v) part.value = empty(field);
|
|
|
|
else part.key = empty(field);
|
|
|
|
node.add(part);
|
|
|
|
}
|
|
|
|
node.value = null;
|
|
|
|
}
|
|
|
|
return docu;
|
|
|
|
}
|
|
|
|
|
2022-06-02 20:41:59 +02:00
|
|
|
private static Tree<String>
|
|
|
|
pass3(Tree<String> docu)
|
|
|
|
{
|
2022-05-13 16:32:11 +02:00
|
|
|
Tree<String> returnee = new Tree<String>();
|
|
|
|
|
|
|
|
for (Tree<String> node: docu)
|
|
|
|
{
|
|
|
|
if (!node.key.equals("text"))
|
|
|
|
{
|
|
|
|
returnee.add(node);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2022-06-02 20:41:59 +02:00
|
|
|
StringBuilder value = new StringBuilder();
|
|
|
|
for (String segment: whitespaceSplit(node.value))
|
|
|
|
{
|
|
|
|
boolean st = segment.startsWith(":");
|
2022-05-13 16:32:11 +02:00
|
|
|
boolean ed = segment.endsWith(":");
|
|
|
|
|
|
|
|
if (st && ed)
|
|
|
|
{
|
|
|
|
Tree<String> text = new Tree<String>();
|
2022-06-02 20:41:59 +02:00
|
|
|
text.key = "text";
|
|
|
|
text.value = empty(value);
|
|
|
|
returnee.add(text);
|
2022-05-13 16:32:11 +02:00
|
|
|
|
|
|
|
Tree<String> emoji = new Tree<String>();
|
|
|
|
emoji.key = "emoji";
|
2022-06-02 20:41:59 +02:00
|
|
|
emoji.value = segment;
|
2022-05-13 16:32:11 +02:00
|
|
|
returnee.add(emoji);
|
|
|
|
}
|
2022-06-02 20:41:59 +02:00
|
|
|
else
|
|
|
|
{
|
|
|
|
value.append(segment);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (value.length() > 0)
|
|
|
|
{
|
|
|
|
Tree<String> text = new Tree<String>();
|
|
|
|
text.key = "text";
|
|
|
|
text.value = empty(value);
|
|
|
|
returnee.add(text);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return returnee;
|
|
|
|
}
|
2022-04-15 14:54:31 +02:00
|
|
|
|
2022-04-12 08:37:39 +02:00
|
|
|
private static String
|
|
|
|
empty(StringBuilder b)
|
|
|
|
{
|
|
|
|
String s = b.toString();
|
|
|
|
b.delete(0, b.length());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2022-06-02 20:41:59 +02:00
|
|
|
private static List<String>
|
|
|
|
whitespaceSplit(String text)
|
|
|
|
{
|
|
|
|
List<String> returnee = new ArrayList<>();
|
|
|
|
StringBuilder segment = new StringBuilder();
|
|
|
|
boolean isWhitespace = false;
|
|
|
|
for (char c: text.toCharArray())
|
|
|
|
{
|
|
|
|
boolean diff = isWhitespace ^ Character.isWhitespace(c);
|
|
|
|
if (diff) {
|
|
|
|
returnee.add(empty(segment));
|
|
|
|
isWhitespace = !isWhitespace;
|
|
|
|
}
|
|
|
|
segment.append(c);
|
|
|
|
}
|
|
|
|
returnee.add(empty(segment));
|
2022-05-13 16:32:11 +02:00
|
|
|
|
2022-06-02 20:41:59 +02:00
|
|
|
return returnee;
|
|
|
|
}
|
2022-05-13 16:32:11 +02:00
|
|
|
|
2022-04-12 08:37:39 +02:00
|
|
|
// ---%-@-%---
|
|
|
|
|
|
|
|
public static void
|
|
|
|
main(String... args)
|
|
|
|
{
|
|
|
|
final String EX2 =
|
|
|
|
"<p>2000s energy<br /><a href=\"http://www.pspad.com/en/screenshot.htm\" rel=\"nofollow noopener noreferrer\" target=\"_blank\"><span class=\"invisible\">http://www.</span><span class=\"\">pspad.com/en/screenshot.htm</span><span class=\"invisible\"></span></a></p>";
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|