AgathaSorceress-mstdn-ebooks/main.py

#!/usr/bin/env python3
# toot downloader version two!!
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from mastodon import Mastodon
from os import path
from bs4 import BeautifulSoup
import os, sqlite3, signal, sys, json, re
import requests

# OAuth scopes requested both when registering the application and when logging in.
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"]

# Load the persistent configuration; the file handle is closed as soon as parsing is done.
with open('config.json', 'r') as cfg_file:
	cfg = json.load(cfg_file)

# Migrate credentials from the legacy *.secret files into the config dictionary.
if os.path.exists("clientcred.secret"):
	print("Upgrading to new storage method")
	with open("clientcred.secret") as cred_file:
		cc = cred_file.read().split("\n")
	cfg['client'] = {
		"id": cc[0],
		"secret": cc[1]
	}
	with open("usercred.secret") as cred_file:
		cfg['secret'] = cred_file.read().rstrip("\n")
	# Both handles are closed before removal; an open handle blocks deletion on Windows.
	os.remove("clientcred.secret")
	os.remove("usercred.secret")

# Register the application with the instance when no client credentials are stored yet.
if "client" not in cfg:
	print("No client credentials, registering application")
	client_id, client_secret = Mastodon.create_app("mstdn-ebooks",
		api_base_url=cfg['site'],
		scopes=scopes,
		website="https://github.com/Lynnesbian/mstdn-ebooks")

	cfg['client'] = {
		"id": client_id,
		"secret": client_secret
	}

# Run the interactive authorisation flow when no user token is stored yet.
if "secret" not in cfg:
	print("No user credentials, logging in")
	client = Mastodon(client_id = cfg['client']['id'],
		client_secret = cfg['client']['secret'],
		api_base_url=cfg['site'])

	print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))
	cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)

# Write the (possibly updated) configuration back; the with-block guarantees
# the data is flushed and the handle closed before the script continues.
with open("config.json", "w+") as cfg_file:
	json.dump(cfg, cfg_file)

def extract_toot(toot):
	"""Convert the HTML content of a status into plain text.

	Mention and hashtag markup is stripped down to its text, every other link
	is replaced by its target URL, and <br>/<p> elements become newlines.

	Args:
		toot: HTML content of a status, as a string.

	Returns:
		The plain-text content with trailing newlines removed and a zero-width
		space inserted after every "@".
	"""
	toot = toot.replace("&apos;", "'")
	toot = toot.replace("&quot;", '"')
	soup = BeautifulSoup(toot, "html.parser")

	# strip the markup around mentions, keeping their text
	# TODO: make it so that it removes the @ and instance but keeps the name
	for mention in soup.select("span.h-card"):
		# either tag can be absent on unusual markup; skip it instead of
		# failing with AttributeError on None
		if mention.a is not None:
			mention.a.unwrap()
		if mention.span is not None:
			mention.span.unwrap()

	# replace <br> with linebreak
	for lb in soup.select("br"):
		lb.insert_after("\n")
		lb.decompose()

	# replace <p> with linebreak
	for p in soup.select("p"):
		p.insert_after("\n")
		p.unwrap()

	# fix hashtags
	for ht in soup.select("a.hashtag"):
		ht.unwrap()

	# fix links: show the target URL instead of the link text
	for link in soup.select("a"):
		href = link.get("href")
		if href is None:
			# an anchor without a target has no URL to show; keep its text
			link.unwrap()
			continue
		link.insert_after(href)
		link.decompose()

	toot = soup.get_text()
	toot = toot.rstrip("\n") #remove trailing newline
	# put a zws (U+200B) between @ and username to avoid mentioning; the
	# previous U+202B is RIGHT-TO-LEFT EMBEDDING, not a zero-width space
	toot = toot.replace("@", "@\u200B")
	return toot

# API client authenticated with the credentials stored in the configuration.
client = Mastodon(
	client_id=cfg['client']['id'],
	client_secret=cfg['client']['secret'],
	access_token=cfg['secret'],
	api_base_url=cfg['site'],
)

# The accounts followed by the logged-in account are the ones harvested later on.
# NOTE(review): only the result of a single account_following() call is used --
# confirm whether further pages need to be fetched for long follow lists.
me = client.account_verify_credentials()
following = client.account_following(me.id)

# Local SQLite database holding the harvested toots; the table is created on first run.
db = sqlite3.connect("toots.db")
db.text_factory = str
c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
db.commit()

def handleCtrlC(signal, frame):
	"""SIGINT handler: commit pending database changes, then exit with status 1.

	Args:
		signal: Signal number passed in by the interpreter (unused).
		frame: Stack frame that was interrupted (unused).

	Raises:
		SystemExit: always, with exit code 1.
	"""
	print("\nPREMATURE EVACUATION - Saving chunks")
	db.commit()
	raise SystemExit(1)

# Route Ctrl-C through the handler so harvested toots are committed before exiting.
signal.signal(signal.SIGINT, handleCtrlC)

def get_toots_legacy(client, id):
	"""Yield a user's harvestable toots through the client API, page by page.

	Toots with a content warning, boosts, and toots that are neither public
	nor unlisted are skipped. A progress dot is printed for every 20 toots
	that are yielded.

	Args:
		client: API client providing account_statuses() and fetch_next().
		id: ID of the account whose statuses are fetched.

	Yields:
		dict with the keys "toot" (plain-text content), "id" and "uri".
	"""
	i = 0
	toots = client.account_statuses(id)
	while toots:
		for toot in toots:
			if toot.spoiler_text != "": continue
			if toot.reblog is not None: continue
			if toot.visibility not in ["public", "unlisted"]: continue
			t = extract_toot(toot.content)
			if t is not None:
				yield {
					"toot": t,
					"id": toot.id,
					"uri": toot.uri
				}
			i += 1
			if i%20 == 0:
				print('.', end='', flush=True)
		# Advance exactly once per page, after the whole page has been consumed.
		# Fetching inside the per-toot loop skipped pages and never advanced at
		# all when every toot on a page was filtered out (endless loop).
		toots = client.fetch_next(toots)

# Harvest new toots for every followed account and store them in the database.
for f in following:
	# Resume after the newest toot already stored for this user (0 = nothing stored yet).
	# "=" instead of LIKE: LIKE is a case-insensitive pattern match, not an ID comparison.
	newest = c.execute("SELECT id FROM `toots` WHERE userid = ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone()
	last_toot = newest[0] if newest is not None else 0
	print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))

	#find the user's activitypub outbox
	print("WebFingering...")
	instance = re.search(r"^.*@(.+)", f.acct)
	if instance is None:
		# no "@host" part in the account name: the user lives on our own instance
		instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)
	else:
		instance = instance.group(1)

	if instance == "bofa.lol":
		print("rest in piece bofa, skipping")
		continue

	# print("{} is on {}".format(f.acct, instance))
	try:
		# host-meta yields the WebFinger URL template, WebFinger yields the
		# profile aliases, and the chosen alias is the base of the outbox URL
		r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)
		uri = re.search(r'template="([^"]+)"', r.text).group(1)
		uri = uri.format(uri = "{}@{}".format(f.username, instance))
		r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)
		j = r.json()
		if len(j['aliases']) == 1: #TODO: this is a hack on top of a hack, fix it
			uri = j['aliases'][0]
		else:
			uri = j['aliases'][1]
		uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)
		r = requests.get(uri, timeout=10)
		j = r.json()
	except Exception:
		print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
		sys.exit(1)

	# an outbox response carrying a 'first' key is treated as a pleroma instance
	pleroma = 'first' in j
	if pleroma:
		print("{} is a pleroma instance -- falling back to legacy toot collection method".format(instance))

	print("Downloading and parsing toots", end='', flush=True)
	try:
		if pleroma:
			for t in get_toots_legacy(client, f.id):
				try:
					c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
						(t['id'],
						f.id,
						t['uri'],
						t['toot']
						)
					)
				except Exception:
					# best effort: ignore any toots that don't go into the DB.
					# Not a bare except, so the SystemExit raised by the
					# Ctrl-C handler still terminates the script.
					pass

		else:
			while len(j['orderedItems']) > 0:
				for oi in j['orderedItems']:
					if oi['type'] != "Create":
						continue
					# its a toost baby
					content = oi['object']['content']
					if oi['object'].get('summary') is not None:
						#don't download CW'd toots
						continue
					toot = extract_toot(content)
					# print(toot)
					try:
						# the numeric post ID is the last path segment of the object URI
						pid = re.search(r"[^\/]+$", oi['object']['id']).group(0)
						c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",
							(pid,
							f.id,
							oi['object']['id'],
							toot
							)
						)
					except Exception:
						pass #ignore any toots that don't go into the DB
				# follow the 'prev' link to the next page; stop cleanly when there is none
				next_page = j.get('prev')
				if next_page is None:
					break
				r = requests.get(next_page, timeout=10)
				j = r.json()
				print('.', end='', flush=True)
		print(" Done!")
		db.commit()
	except Exception:
		# Only this user is skipped. SystemExit from the Ctrl-C handler is not
		# an Exception subclass, so an interrupt is no longer swallowed here.
		print("Encountered an error! Saving toots to database and continuing.")
		db.commit()
		# db.close()

print("Done!")

db.commit()
db.execute("VACUUM") #compact db
db.commit()
db.close()
initial commit 2018-10-09 03:11:51 +02:00			`#!/usr/bin/env python3`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# toot downloader version two!!`
initial commit 2018-10-09 03:11:51 +02:00			`# This Source Code Form is subject to the terms of the Mozilla Public`
			`# License, v. 2.0. If a copy of the MPL was not distributed with this`
			`# file, You can obtain one at http://mozilla.org/MPL/2.0/.`

			`from mastodon import Mastodon`
			`from os import path`
			`from bs4 import BeautifulSoup`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`import os, sqlite3, signal, sys, json, re`
			`import requests`
initial commit 2018-10-09 03:11:51 +02:00
request notification reading permission 2018-10-29 05:36:21 +01:00			`scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications"]`
added config.json to make changing api base url easier 2018-10-14 08:58:58 +02:00			`cfg = json.load(open('config.json', 'r'))`
initial commit 2018-10-09 03:11:51 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`if os.path.exists("clientcred.secret"):`
fix for pleroma 2018-10-27 10:28:20 +02:00			`print("Upgrading to new storage method")`
			`cc = open("clientcred.secret").read().split("\n")`
			`cfg['client'] = {`
			`"id": cc[0],`
			`"secret": cc[1]`
			`}`
			`cfg['secret'] = open("usercred.secret").read().rstrip("\n")`
			`os.remove("clientcred.secret")`
			`os.remove("usercred.secret")`

Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00
			`if "client" not in cfg:`
			`print("No client credentials, registering application")`
			`client_id, client_secret = Mastodon.create_app("mstdn-ebooks",`
			`api_base_url=cfg['site'],`
			`scopes=scopes,`
			`website="https://github.com/Lynnesbian/mstdn-ebooks")`

			`cfg['client'] = {`
			`"id": client_id,`
			`"secret": client_secret`
			`}`

			`if "secret" not in cfg:`
			`print("No user credentials, logging in")`
			`client = Mastodon(client_id = cfg['client']['id'],`
			`client_secret = cfg['client']['secret'],`
			`api_base_url=cfg['site'])`
initial commit 2018-10-09 03:11:51 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`print("Open this URL: {}".format(client.auth_request_url(scopes=scopes)))`
			`cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)`
initial commit 2018-10-09 03:11:51 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`json.dump(cfg, open("config.json", "w+"))`
initial commit 2018-10-09 03:11:51 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`def extract_toot(toot):`
			`toot = toot.replace("&apos;", "'")`
			`toot = toot.replace("&quot;", '"')`
			`soup = BeautifulSoup(toot, "html.parser")`
initial commit 2018-10-09 03:11:51 +02:00
			`# this is the code that removes all mentions`
			`# TODO: make it so that it removes the @ and instance but keeps the name`
			`for mention in soup.select("span.h-card"):`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`mention.a.unwrap()`
			`mention.span.unwrap()`
initial commit 2018-10-09 03:11:51 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# replace <br> with linebreak`
initial commit 2018-10-09 03:11:51 +02:00			`for lb in soup.select("br"):`
			`lb.insert_after("\n")`
			`lb.decompose()`

Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# replace <p> with linebreak`
initial commit 2018-10-09 03:11:51 +02:00			`for p in soup.select("p"):`
			`p.insert_after("\n")`
			`p.unwrap()`

Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# fix hashtags`
initial commit 2018-10-09 03:11:51 +02:00			`for ht in soup.select("a.hashtag"):`
			`ht.unwrap()`

Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# fix links`
initial commit 2018-10-09 03:11:51 +02:00			`for link in soup.select("a"):`
			`link.insert_after(link["href"])`
			`link.decompose()`

Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`toot = soup.get_text()`
			`toot = toot.rstrip("\n") #remove trailing newline`
			`toot = toot.replace("@", "@\u202B") #put a zws between @ and username to avoid mentioning`
			`return(toot)`
initial commit 2018-10-09 03:11:51 +02:00
			`client = Mastodon(`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`client_id=cfg['client']['id'],`
			`client_secret = cfg['client']['secret'],`
			`access_token=cfg['secret'],`
			`api_base_url=cfg['site'])`
initial commit 2018-10-09 03:11:51 +02:00
			`me = client.account_verify_credentials()`
			`following = client.account_following(me.id)`

			`db = sqlite3.connect("toots.db")`
			`db.text_factory=str`
			`c = db.cursor()`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			c.execute("CREATE TABLE IF NOT EXISTS `toots` (id INT NOT NULL UNIQUE PRIMARY KEY, userid INT NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL) WITHOUT ROWID")
initial commit 2018-10-09 03:11:51 +02:00			`db.commit()`

			`def handleCtrlC(signal, frame):`
			`print("\nPREMATURE EVACUATION - Saving chunks")`
			`db.commit()`
			`sys.exit(1)`

			`signal.signal(signal.SIGINT, handleCtrlC)`

fix for pleroma 2018-10-27 10:28:20 +02:00			`def get_toots_legacy(client, id):`
			`i = 0`
			`toots = client.account_statuses(id)`
			`while toots is not None and len(toots) > 0:`
			`for toot in toots:`
			`if toot.spoiler_text != "": continue`
			`if toot.reblog is not None: continue`
			`if toot.visibility not in ["public", "unlisted"]: continue`
			`t = extract_toot(toot.content)`
			`if t != None:`
			`yield {`
			`"toot": t,`
			`"id": toot.id,`
			`"uri": toot.uri`
			`}`
			`toots = client.fetch_next(toots)`
			`i += 1`
			`if i%20 == 0:`
			`print('.', end='', flush=True)`

initial commit 2018-10-09 03:11:51 +02:00			`for f in following:`
			last_toot = c.execute("SELECT id FROM `toots` WHERE userid LIKE ? ORDER BY id DESC LIMIT 1", (f.id,)).fetchone()
			`if last_toot != None:`
			`last_toot = last_toot[0]`
			`else:`
			`last_toot = 0`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`print("Harvesting toots for user @{}, starting from {}".format(f.acct, last_toot))`

			`#find the user's activitypub outbox`
fix for pleroma 2018-10-27 10:28:20 +02:00			`print("WebFingering...")`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`instance = re.search(r"^.*@(.+)", f.acct)`
			`if instance == None:`
			`instance = re.search(r"https?:\/\/(.*)", cfg['site']).group(1)`
			`else:`
			`instance = instance.group(1)`

fixed a huge bug 2018-10-25 16:33:57 +02:00			`if instance == "bofa.lol":`
			`print("rest in piece bofa, skipping")`
			`continue`
fix for pleroma 2018-10-27 10:28:20 +02:00
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`# print("{} is on {}".format(f.acct, instance))`
			`try:`
added a timeout 2018-10-27 14:07:38 +02:00			`r = requests.get("https://{}/.well-known/host-meta".format(instance), timeout=10)`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`uri = re.search(r'template="([^"]+)"', r.text).group(1)`
			`uri = uri.format(uri = "{}@{}".format(f.username, instance))`
added a timeout 2018-10-27 14:07:38 +02:00			`r = requests.get(uri, headers={"Accept": "application/json"}, timeout=10)`
fix for pleroma 2018-10-27 10:28:20 +02:00			`j = r.json()`
			`if len(j['aliases']) == 1: #TODO: this is a hack on top of a hack, fix it`
			`uri = j['aliases'][0]`
			`else:`
			`uri = j['aliases'][1]`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`uri = "{}/outbox?page=true&min_id={}".format(uri, last_toot)`
added a timeout 2018-10-27 14:07:38 +02:00			`r = requests.get(uri, timeout=10)`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`j = r.json()`
			`except Exception:`
			`print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")`
			`sys.exit(1)`
fix for pleroma 2018-10-27 10:28:20 +02:00
			`pleroma = False`
			`if 'first' in j:`
			`print("{} is a pleroma instance -- falling back to legacy toot collection method".format(instance))`
			`pleroma = True`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00
			`print("Downloading and parsing toots", end='', flush=True)`
			`current = None`
			`try:`
fix for pleroma 2018-10-27 10:28:20 +02:00			`if pleroma:`
			`for t in get_toots_legacy(client, f.id):`
			`try:`
			`c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",`
			`(t['id'],`
			`f.id,`
			`t['uri'],`
			`t['toot']`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`)`
fix for pleroma 2018-10-27 10:28:20 +02:00			`)`
			`except:`
			`pass`

			`else:`
			`while len(j['orderedItems']) > 0:`
			`for oi in j['orderedItems']:`
			`if (not pleroma and oi['type'] == "Create") or (pleroma and oi['to']['type'] == "Create"):`
			`# its a toost baby`
			`content = oi['object']['content']`
			`if oi['object']['summary'] != None:`
			`#don't download CW'd toots`
			`continue`
			`toot = extract_toot(content)`
			`# print(toot)`
			`try:`
			`pid = re.search(r"[^\/]+$", oi['object']['id']).group(0)`
			`c.execute("REPLACE INTO toots (id, userid, uri, content) VALUES (?, ?, ?, ?)",`
			`(pid,`
			`f.id,`
			`oi['object']['id'],`
			`toot`
			`)`
			`)`
			`pass`
			`except:`
			`pass #ignore any toots that don't go into the DB`
			`# sys.exit(0)`
added a timeout 2018-10-27 14:07:38 +02:00			`r = requests.get(j['prev'], timeout=10)`
fix for pleroma 2018-10-27 10:28:20 +02:00			`j = r.json()`
			`print('.', end='', flush=True)`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`print(" Done!")`
			`db.commit()`
			`except:`
only skip failing users, added windows compatibility 2018-11-01 06:27:03 +01:00			`print("Encountered an error! Saving toots to database and continuing.")`
Version 2.0, with vastly improved toot fetching capabilities! 2018-10-25 04:37:11 +02:00			`db.commit()`
only skip failing users, added windows compatibility 2018-11-01 06:27:03 +01:00			`# db.close()`

			`print("Done!")`
initial commit 2018-10-09 03:11:51 +02:00
			`db.commit()`
			`db.execute("VACUUM") #compact db`
			`db.commit()`
			`db.close()`