225 lines
6.3 KiB
Python
Raw Normal View History

2018-10-09 11:11:51 +10:00
#!/usr/bin/env python3
# toot downloader version two!!
2018-10-09 11:11:51 +10:00
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from mastodon import Mastodon
from os import path
from bs4 import BeautifulSoup
import os, sqlite3, signal, sys, json, re
import requests
2018-10-09 11:11:51 +10:00
# OAuth scopes requested both when registering the app and when logging in.
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses"]

# config.json must already exist; its 'site' entry is used as the API base URL
# below. 'client' and 'secret' are filled in on first run.
with open('config.json', 'r') as cfg_file:
    cfg = json.load(cfg_file)

# Migrate credentials from the old *.secret files into config.json.
if os.path.exists("clientcred.secret"):
    print("Upgrading to new storage method")
    # Read inside `with` so the handles are closed before the files are removed.
    with open("clientcred.secret") as cred_file:
        cc = cred_file.read().split("\n")
    cfg['client'] = {
        "id": cc[0],
        "secret": cc[1]
    }
    with open("usercred.secret") as cred_file:
        cfg['secret'] = cred_file.read().rstrip("\n")
    os.remove("clientcred.secret")
    os.remove("usercred.secret")

if "client" not in cfg:
    print("No client credentials, registering application")
    client_id, client_secret = Mastodon.create_app(
        "mstdn-ebooks",
        api_base_url=cfg['site'],
        scopes=scopes,
        website="https://github.com/Lynnesbian/mstdn-ebooks")
    cfg['client'] = {
        "id": client_id,
        "secret": client_secret
    }

if "secret" not in cfg:
    print("No user credentials, logging in")
    client = Mastodon(
        client_id=cfg['client']['id'],
        client_secret=cfg['client']['secret'],
        api_base_url=cfg['site'])

    print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
    cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)

# Write the (possibly updated) configuration back; `with` guarantees the
# file is flushed and closed before the script carries on.
with open("config.json", "w+") as cfg_file:
    json.dump(cfg, cfg_file)
2018-10-09 11:11:51 +10:00
def extract_toot(toot):
    """Convert a toot's HTML content into plain text.

    Mention and hashtag markup is stripped (their visible text is kept),
    <br> and <p> become newlines, and every remaining link is replaced by
    its href. A zero-width space is inserted after each "@" so the
    resulting text cannot mention anyone when it is posted.

    toot -- the HTML content of a status, as a string.
    Returns the plain-text string.
    """
    # Decode these two entities up front, before the HTML is parsed.
    toot = toot.replace("&apos;", "'")
    toot = toot.replace("&quot;", '"')
    soup = BeautifulSoup(toot, "html.parser")

    # strip the markup around mentions (the mention text itself is kept)
    # TODO: make it so that it removes the @ and instance but keeps the name
    for mention in soup.select("span.h-card"):
        # Not every h-card is guaranteed to wrap an <a><span>…</span></a>
        # pair, so only unwrap what is actually there.
        if mention.a is not None:
            mention.a.unwrap()
        if mention.span is not None:
            mention.span.unwrap()

    # replace <br> with linebreak
    for lb in soup.select("br"):
        lb.insert_after("\n")
        lb.decompose()

    # replace <p> with linebreak
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    # fix hashtags: keep the tag text instead of the link to the tag page
    for ht in soup.select("a.hashtag"):
        ht.unwrap()

    # fix links: replace every remaining anchor with its target URL
    for link in soup.select("a"):
        href = link.get("href")
        if href is None:
            # An anchor without href has no URL to show; keep its text.
            link.unwrap()
            continue
        link.insert_after(href)
        link.decompose()

    toot = soup.get_text()
    toot = toot.rstrip("\n")  # remove trailing newline
    # put a zws (U+200B) between @ and username to avoid mentioning
    toot = toot.replace("@", "@\u200B")
    return toot
2018-10-09 11:11:51 +10:00
# Authenticated API client built from the credentials stored in cfg.
client = Mastodon(
    client_id=cfg['client']['id'],
    client_secret=cfg['client']['secret'],
    access_token=cfg['secret'],
    api_base_url=cfg['site'])

me = client.account_verify_credentials()
# account_following() returns a single page of results; fetch_remaining()
# follows the pagination so every followed account is included.
following = client.fetch_remaining(client.account_following(me.id))

# Local store for downloaded toots, one row per toot keyed by its id.
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
db.commit()
def handleCtrlC(signal, frame):
    """SIGINT handler: commit whatever has been inserted so far, then exit.

    The two parameters are the ones the signal machinery passes to every
    handler; neither is used here.
    """
    print("\nPREMATURE EVACUATION - Saving chunks")
    db.commit()
    raise SystemExit(1)


# Route Ctrl-C through the handler so an interrupted run keeps its progress.
signal.signal(signal.SIGINT, handleCtrlC)
2018-10-27 18:28:20 +10:00
def get_toots_legacy(client, id):
    """Yield an account's toots using the client API's paginated statuses.

    client -- object providing account_statuses(id) and fetch_next(page).
    id     -- account id whose statuses are fetched.

    Toots with a content warning, boosts, and anything that is not public
    or unlisted are skipped. Every other toot is yielded as a dict with the
    keys "toot" (plain text from extract_toot), "id" and "uri".
    A "." is printed after every 20 pages as a progress indicator.
    """
    pages_fetched = 0
    toots = client.account_statuses(id)
    while toots is not None and len(toots) > 0:
        for toot in toots:
            if toot.spoiler_text != "":
                continue
            if toot.reblog is not None:
                continue
            if toot.visibility not in ["public", "unlisted"]:
                continue
            t = extract_toot(toot.content)
            if t is not None:
                yield {
                    "toot": t,
                    "id": toot.id,
                    "uri": toot.uri
                }
        # Move on to the next page; a None or empty page ends the loop.
        toots = client.fetch_next(toots)
        pages_fetched += 1
        if pages_fetched % 20 == 0:
            print('.', end='', flush=True)
2018-10-09 11:11:51 +10:00
# Harvest new toots from every followed account into the `toots` table.
for f in following:
    # Resume after the newest toot already stored for this account (0 = none).
    # Plain equality is used on purpose: LIKE is case-insensitive and treats
    # % and _ as wildcards, which is wrong for matching an account id.
    last_toot = c.execute(
        "SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1",
        (f.id,)).fetchone()
    last_toot = last_toot[0] if last_toot is not None else 0
    print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

    # find the user's activitypub outbox
    print("WebFingering...")
    # An acct of the form user@host names its instance; otherwise fall back
    # to the host part of the configured site URL.
    instance = re.search(r"^.*@(.+)", f.acct)
    if instance is None:
        instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
    else:
        instance = instance.group(1)

    if instance == "bofa.lol":
        print("rest in piece bofa, skipping")
        continue

    try:
        # host-meta advertises the lookup URL template for this instance.
        r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)
        uri = re.search(r'template="([^"]+)"', r.text).group(1)
        uri = uri.format(uri="{}@{}".format(f.username, instance))

        r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)
        j = r.json()
        if len(j['aliases']) == 1:  # TODO: this is a hack on top of a hack, fix it
            uri = j['aliases'][0]
        else:
            uri = j['aliases'][1]
        uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
        r = requests.get(uri, timeout=10)
        j = r.json()
    except Exception:
        print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
        sys.exit(1)

    # A 'first' key in the outbox response is taken to mean a pleroma
    # instance, which is harvested through the client API instead.
    pleroma = 'first' in j
    if pleroma:
        print("{} is a pleroma instance -- falling back to legacy toot collection method".format(instance))

    print("Downloading and parsing toots", end='', flush=True)
    try:
        if pleroma:
            for t in get_toots_legacy(client, f.id):
                try:
                    c.execute(
                        "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                        (t['id'], f.id, t['uri'], t['toot']))
                except Exception:
                    # Best effort: skip toots that don't go into the DB.
                    # Not a bare except, so the SystemExit raised by the
                    # Ctrl-C handler is never swallowed here.
                    pass
        else:
            while len(j['orderedItems']) > 0:
                for oi in j['orderedItems']:
                    if oi['type'] != "Create":
                        continue
                    # its a toot
                    content = oi['object']['content']
                    if oi['object'].get('summary') is not None:
                        # don't download CW'd toots
                        continue
                    toot = extract_toot(content)
                    try:
                        # The last path segment of the object URL is the id.
                        pid = re.search(r"[^\/]+$", oi['object']['id']).group(0)
                        c.execute(
                            "REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
                            (pid, f.id, oi['object']['id'], toot))
                    except Exception:
                        pass  # ignore any toots that don't go into the DB
                # Follow the 'prev' link to the next page; stop cleanly if
                # the page does not provide one instead of aborting the run.
                prev_page = j.get('prev')
                if prev_page is None:
                    break
                r = requests.get(prev_page, timeout=10)
                j = r.json()
                print('.', end='', flush=True)
        print(" Done!")
        db.commit()
    except Exception:
        print("Encountered an error! Saving toots to database and exiting.")
        db.commit()
        db.close()
        sys.exit(1)
2018-10-09 11:11:51 +10:00
# Final cleanup once every followed account has been processed.
# Commit first: VACUUM cannot run while a transaction is still open.
db.commit()
db.execute("VACUUM") # compact db
db.commit()
db.close()