132 lines
4.0 KiB
Python
Raw Normal View History

2018-10-09 11:11:51 +10:00
#!/usr/bin/env python3
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from mastodon import Mastodon
from getpass import getpass
from os import path
from bs4 import BeautifulSoup
import shutil, os, sqlite3, signal, sys
# import re
# Instance the bot talks to and the OAuth scopes it asks for.
api_base_url = "https://botsin.space"
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]

# No client credentials on disk yet: register the application and have
# create_app write its result to clientcred.secret.
if not path.exists("clientcred.secret"):
    print("No clientcred.secret, registering application")
    Mastodon.create_app(
        "lynnesbian_mastodon_ebooks",
        scopes=scopes,
        website="https://github.com/Lynnesbian/mastodon-ebooks",
        api_base_url=api_base_url,
        to_file="clientcred.secret",
    )

# No user credentials on disk yet: print the authorisation URL, read the code
# the user types in and have log_in write its result to usercred.secret.
if not path.exists("usercred.secret"):
    print("No usercred.secret, registering application")
    client = Mastodon(
        api_base_url=api_base_url,
        client_id="clientcred.secret",
    )
    print("Visit this url:")
    print(client.auth_request_url(scopes=scopes))
    client.log_in(
        code=input("Secret: "),
        scopes=scopes,
        to_file="usercred.secret",
    )
def parse_toot(toot):
    """Flatten a status into plain text, or return None if it is skipped.

    A status is skipped when it has a non-empty ``spoiler_text``, when it is
    a reblog, or when its visibility is neither "public" nor "unlisted".
    Otherwise the HTML in ``toot.content`` is reduced to text: mentions are
    removed, ``<br>`` and ``<p>`` elements become line breaks, hashtags keep
    their text and every other link is replaced by its target URL.

    Args:
        toot: status object exposing the ``spoiler_text``, ``reblog``,
            ``visibility`` and ``content`` attributes.

    Returns:
        The whitespace-stripped lines of the status joined with NUL
        characters, or None when the status is skipped.
    """
    if toot.spoiler_text != "":
        return None
    if toot.reblog is not None:
        return None
    if toot.visibility not in ("public", "unlisted"):
        return None

    soup = BeautifulSoup(toot.content, "html.parser")

    # this is the code that removes all mentions
    # TODO: make it so that it removes the @ and instance but keeps the name
    for mention in soup.select("span.h-card"):
        mention.decompose()

    # make all linebreaks actual linebreaks
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # make each p element its own line because sometimes they decide not to be
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # keep hashtags in the toots
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # replace all remaining links by their URL (i like the bots posting links)
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without an href has no URL to substitute; keep its
            # visible text instead of failing with a KeyError.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    lines = [line.strip() for line in soup.get_text().strip().split("\n")]

    # TODO: return the text together with the mentions and links once
    # markovify has been patched to take that structure.
    return "\0".join(lines)
def get_toots(client, id, since_id):
    """Yield the parsed statuses of one account, one page at a time.

    Args:
        client: API client providing ``account_statuses(id, since_id=...)``
            and ``fetch_next(page)``.
        id: id of the account whose statuses are fetched (the name shadows
            the builtin but is kept so existing callers keep working).
        since_id: forwarded to ``account_statuses`` as ``since_id``.

    Yields:
        A dict with the keys "content" (the value returned by
        ``parse_toot``) and "id" (the status id) for every status that
        ``parse_toot`` does not reject.

    If fetching the next page raises ``TimeoutError``, the module-level
    ``db`` connection is committed and closed and the process exits with
    status 1.
    """
    pages_done = 0
    toots = client.account_statuses(id, since_id=since_id)

    # Stop on an empty page as well as on None.
    while toots:
        for toot in toots:
            text = parse_toot(toot)
            if text is not None:
                yield {
                    "content": text,
                    "id": toot.id,
                }

        try:
            toots = client.fetch_next(toots)
        except TimeoutError:
            print("Operation timed out, committing to database and exiting.")
            db.commit()
            db.close()
            sys.exit(1)

        # Progress indicator: print the page count every ten pages.
        pages_done += 1
        if pages_done % 10 == 0:
            print(pages_done)
# Client built from the two credential files.
client = Mastodon(client_id="clientcred.secret", access_token="usercred.secret", api_base_url=api_base_url)

# The accounts followed by the logged-in account are the ones to download.
# NOTE(review): account_following may return only one page of results -
# confirm whether further pages have to be fetched.
me = client.account_verify_credentials()
following = client.account_following(me.id)

# Open the toots.db database and make sure the `toots` table exists.
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute(
    "CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID"
)
db.commit()
def handleCtrlC(signal, frame):
    """Signal handler: commit pending database writes and exit with status 1.

    Both arguments are the ones passed to a signal handler; neither is used.
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    db.commit()
    # Same effect as sys.exit(1), which raises SystemExit.
    raise SystemExit(1)
# Commit what has been downloaded so far when the user presses Ctrl-C.
signal.signal(signal.SIGINT, handleCtrlC)

for f in following:
    # Resume after the newest status already stored for this account.
    # "=" instead of LIKE: userid is an exact key, not a text pattern.
    newest = c.execute(
        "SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1",
        (f.id,),
    ).fetchone()
    last_toot = newest[0] if newest is not None else 0

    print("Downloading toots for user @{}, starting from {}".format(f.username, last_toot))
    for t in get_toots(client, f.id, last_toot):
        c.execute(
            "REPLACE INTO toots (id, userid, content) VALUES (?, ?, ?)",
            (t["id"], f.id, t["content"]),
        )

    # Save this account's statuses before moving on to the next account.
    db.commit()

db.execute("VACUUM")  # compact db
db.commit()
db.close()