ioistired-pleroma-ebooks/fetch_posts.py

#!/usr/bin/env python3
# SPDX-License-Identifier: AGPL-3.0-only

import sys
import anyio
import aiohttp
import platform
import pendulum
import operator
import aiosqlite
import contextlib
from yarl import URL
from pleroma import Pleroma, HandleRateLimits
from bs4 import BeautifulSoup
from functools import partial
from typing import Iterable, NewType
from utils import shield, suppress, http_session_factory
from third_party.utils import extract_post_content

UTC = pendulum.timezone('UTC')
JSON_CONTENT_TYPE = 'application/json'
ACTIVITYPUB_CONTENT_TYPE = 'application/activity+json'

MIGRATION_VERSION = 1

class PostFetcher:
	def __init__(self, *, config):
		self.config = config
		self.erroneous_accounts = []

	async def __aenter__(self):
		stack = contextlib.AsyncExitStack()
		self._fedi = await stack.enter_async_context(
			Pleroma(api_base_url=self.config['site'], access_token=self.config['access_token']),
		)
		self._http = await stack.enter_async_context(
			http_session_factory(
				headers={'Accept': ', '.join([JSON_CONTENT_TYPE, ACTIVITYPUB_CONTENT_TYPE])},
				trust_env=True,
				raise_for_status=True,
			),
		)
		self._rl_handler = HandleRateLimits(self._http)
		self._db = await stack.enter_async_context(aiosqlite.connect(self.config['db_path']))
		await self._maybe_run_migrations()
		await self._db.commit()
		self._db.row_factory = aiosqlite.Row
		self._ctx_stack = stack
		return self

	async def _maybe_run_migrations(self):
		async with self._db.cursor() as cur, suppress(aiosqlite.OperationalError):
			if await (await cur.execute('SELECT migration_version FROM migrations')).fetchone(): return

		await self._run_migrations()

	async def _run_migrations(self):
		# TODO proper migrations, not just "has the schema ever been run" migrations
		async with await (anyio.Path(__file__).parent/'schema.sql').open() as f:
			schema = await f.read()

		async with self._db.cursor() as cur:
			await cur.executescript(schema)
			await cur.execute('INSERT INTO migrations (migration_version) VALUES (?)', (MIGRATION_VERSION,))

	async def __aexit__(self, *excinfo):
		return await self._ctx_stack.__aexit__(*excinfo)

	# username@instance
	AccountHandle = NewType('AccountHandle', str)

	async def fetch_all(self):
		"""fetch all following accounts, or an iterable of accounts if provided"""
		await self._fedi.verify_credentials()
		self._completed_accounts = {}
		async with anyio.create_task_group() as tg:
			for fqn in map(self.fqn, await self._fedi.following()):
				tg.start_soon(self._do_account, fqn)

	def fqn(self, acc: dict):
		try:
			return acc['fqn']
		except KeyError:
			fqn = acc['acct']
			if '@' in fqn: return fqn
			return fqn + '@' + URL(self.config['site']).host

	async def _do_account(self, acc: AccountHandle):
		async with anyio.create_task_group() as tg:
			self._completed_accounts[acc] = done_ev = anyio.Event()
			tx, rx = anyio.create_memory_object_stream()
			async with rx, tx:
				tg.start_soon(self._process_pages, rx, acc)
				tg.start_soon(self._fetch_account, tx, acc)
				await done_ev.wait()
			# processing is complete, so halt fetching.
			# processing may complete before fetching if we get caught up on new posts.
			tg.cancel_scope.cancel()

	async def _process_pages(self, stream, account):
		done_ev = self._completed_accounts[account]
		try:
			async for activity in stream:
				try:
					await self._insert_activity(activity)
				except aiosqlite.IntegrityError as exc:
					# LOL sqlite error handling is so bad
					if exc.args[0].startswith('UNIQUE constraint failed: '):
						# this means we've encountered an item we already have saved
						# TODO we need to ignore this if we don't actually have all the posts.
						# For example, if a prior fetch was interrupted, we'll have k pages of the most recent posts,
						# but no more. But since we still have the most recent page saved, it'll *look* like
						# we've saved everything, since we stop as soon as we encounter a post we already have.
						# To fix this we can check against totalItems in the user's outbox.
						break

					self.erroneous_accounts.append(account)
					raise
		finally:
			print('Saving posts from', account, 'to the DB')
			await self._db.commit()
			done_ev.set()

	async def _insert_activity(self, activity):
		if activity['type'] != 'Create':
			# this isn't a post but something else (like, boost, reaction, etc)
			return

		obj = activity['object']

		await self._db.execute(
			"""
			INSERT INTO posts (post_id, summary, content, published_at)
			VALUES (?, ?, ?, ?)
			""",
			(
				obj['id'],
				# Pleroma returns an empty string here for posts without a CW,
				# which is semantically incorrect IMO
				obj['summary'] or None,
				extract_post_content(obj['content']),
				pendulum.parse(obj['published']).astimezone(pendulum.timezone('UTC')).timestamp(),
			),
		)

	# TODO figure out why i put shield here lol
	@shield
	async def _fetch_account(self, tx, account: AccountHandle):
		done_ev = self._completed_accounts[account]

		try:
			outbox = await self.fetch_outbox(account)
		except Exception as exc:
			import traceback
			traceback.print_exception(type(exc), exc, exc.__traceback__)
			done_ev.set()
			self.erroneous_accounts.append(account)
			return

		print(f'Fetching posts for {account}...')

		next_page_url = outbox['first']
		while True:
			print(f'Fetching {next_page_url}... ')
			async with self._rl_handler.request('GET', next_page_url) as resp: page = await resp.json()

			for activity in page['orderedItems']:
				try:
					await tx.send(activity)
				except anyio.BrokenResourceError:
					# already closed means we're already done
					return

			# show progress
			#print('.', end='', flush=True)

			if not (next_page_url := page.get('next')):
				#done_ev.set()
				break

		done_ev.set()

	async def fetch_outbox(self, handle):
		"""finger handle, a fully-qualified ActivityPub actor name, returning their outbox URL"""
		# it's fucking incredible how overengineered ActivityPub is btw
		print('Fingering ', handle, '...', sep='')

		username, at, instance = handle.lstrip('@').partition('@')
		assert at == '@'

		# i was planning on doing /.well-known/host-meta to find the webfinger URL, but
		# 1) honk does not support host-meta
		# 2) WebFinger is always located at the same location anyway

		profile_url = await self._finger_actor(username, instance)

		try:
			async with self._http.get(profile_url) as resp: profile = await resp.json()
		except aiohttp.ContentTypeError:
			# we didn't get JSON, so just guess the outbox URL
			outbox_url = profile_url + '/outbox'
		else:
			outbox_url = profile['outbox']

		async with self._http.get(outbox_url) as resp: outbox = await resp.json()
		assert outbox['type'] == 'OrderedCollection'
		return outbox

	async def _finger_actor(self, username, instance):
		# despite HTTP being a direct violation of the WebFinger spec, assume e.g. Tor instances do not support
		# HTTPS-over-onion
		finger_url = f'http://{instance}/.well-known/webfinger?resource=acct:{username}@{instance}'
		async with self._http.get(finger_url) as resp: finger_result = await resp.json()
		return (profile_url := self._parse_webfinger_result(username, instance, finger_result))

	def _parse_webfinger_result(self, username, instance, finger_result):
		"""given webfinger data, return profile URL for handle"""
		def check_content_type(type, ct): return ct == type or ct.startswith(type+';')
		check_ap = partial(check_content_type, ACTIVITYPUB_CONTENT_TYPE)

		try:
			# note: the server might decide to return multiple links
			# so we need to decide how to prefer one.
			# i'd put "and yarl.URL(template).host == instance" here,
			# but some instances have no subdomain for the handle yet use a subdomain for the canonical URL.
			# Additionally, an instance could theoretically serve profile pages over I2P and the clearnet,
			# for example.
			return (profile_url := next(
				link['href']
				for link in finger_result['links']
				if link['rel'] == 'self' and check_ap(link['type'])
			))
		except StopIteration:
			# this should never happen either
			raise RuntimeError(f'fatal: while fingering {username}@{instance}, failed to find a profile URL')

async def amain():
	import json5 as json
	import third_party.utils as utils
	args = utils.arg_parser_factory(description='Fetch posts from all followed accounts').parse_args()
	config = utils.load_config(args.cfg)
	async with PostFetcher(config=config) as fetcher: await fetcher.fetch_all()
	if (accs := fetcher.erroneous_accounts):
		print(
			'Exiting unsuccessfully due to previous errors in these accounts:',
			', '.join(accs),
			file=sys.stderr,
		)
		sys.exit(1)

def main():
	try:
		anyio.run(amain)
	except KeyboardInterrupt:
		# suppress the lengthy traceback
		sys.exit(1)

if __name__ == '__main__':
	main()
initial commit 2018-10-09 03:11:51 +02:00			`#!/usr/bin/env python3`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`# SPDX-License-Identifier: AGPL-3.0-only`
initial commit 2018-10-09 03:11:51 +02:00
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`import sys`
			`import anyio`
			`import aiohttp`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`import platform`
			`import pendulum`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`import operator`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`import aiosqlite`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`import contextlib`
fix fetching from masto instances 2021-09-21 15:58:20 +02:00			`from yarl import URL`
use external pleroma.py 2023-01-11 07:23:42 +01:00			`from pleroma import Pleroma, HandleRateLimits`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`from bs4 import BeautifulSoup`
			`from functools import partial`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`from typing import Iterable, NewType`
use external pleroma.py 2023-01-11 07:23:42 +01:00			`from utils import shield, suppress, http_session_factory`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`from third_party.utils import extract_post_content`

			`UTC = pendulum.timezone('UTC')`
			`JSON_CONTENT_TYPE = 'application/json'`
			`ACTIVITYPUB_CONTENT_TYPE = 'application/activity+json'`

add basic migration support 2021-09-17 08:34:44 +02:00			`MIGRATION_VERSION = 1`

rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`class PostFetcher:`
			`def __init__(self, *, config):`
			`self.config = config`
a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`self.erroneous_accounts = []`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
			`async def __aenter__(self):`
			`stack = contextlib.AsyncExitStack()`
			`self._fedi = await stack.enter_async_context(`
			`Pleroma(api_base_url=self.config['site'], access_token=self.config['access_token']),`
			`)`
			`self._http = await stack.enter_async_context(`
use external pleroma.py 2023-01-11 07:23:42 +01:00			`http_session_factory(`
			`headers={'Accept': ', '.join([JSON_CONTENT_TYPE, ACTIVITYPUB_CONTENT_TYPE])},`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`trust_env=True,`
			`raise_for_status=True,`
			`),`
			`)`
handle rate limits 2021-09-17 08:35:54 +02:00			`self._rl_handler = HandleRateLimits(self._http)`
move db_path default to load_config 2021-07-26 08:47:43 +02:00			`self._db = await stack.enter_async_context(aiosqlite.connect(self.config['db_path']))`
add basic migration support 2021-09-17 08:34:44 +02:00			`await self._maybe_run_migrations()`
create table posts if it does not exist 2021-09-17 19:53:11 +02:00			`await self._db.commit()`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`self._db.row_factory = aiosqlite.Row`
			`self._ctx_stack = stack`
			`return self`

add basic migration support 2021-09-17 08:34:44 +02:00			`async def _maybe_run_migrations(self):`
			`async with self._db.cursor() as cur, suppress(aiosqlite.OperationalError):`
			`if await (await cur.execute('SELECT migration_version FROM migrations')).fetchone(): return`

			`await self._run_migrations()`

			`async def _run_migrations(self):`
			`# TODO proper migrations, not just "has the schema ever been run" migrations`
			`async with await (anyio.Path(__file__).parent/'schema.sql').open() as f:`
			`schema = await f.read()`

			`async with self._db.cursor() as cur:`
			`await cur.executescript(schema)`
			`await cur.execute('INSERT INTO migrations (migration_version) VALUES (?)', (MIGRATION_VERSION,))`

rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`async def __aexit__(self, *excinfo):`
			`return await self._ctx_stack.__aexit__(*excinfo)`

save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`# username@instance`
			`AccountHandle = NewType('AccountHandle', str)`

rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`async def fetch_all(self):`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`"""fetch all following accounts, or an iterable of accounts if provided"""`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`await self._fedi.verify_credentials()`
			`self._completed_accounts = {}`
			`async with anyio.create_task_group() as tg:`
fix fetching from masto instances 2021-09-21 15:58:20 +02:00			`for fqn in map(self.fqn, await self._fedi.following()):`
			`tg.start_soon(self._do_account, fqn)`

			`def fqn(self, acc: dict):`
			`try:`
			`return acc['fqn']`
			`except KeyError:`
			`fqn = acc['acct']`
			`if '@' in fqn: return fqn`
			`return fqn + '@' + URL(self.config['site']).host`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`async def _do_account(self, acc: AccountHandle):`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`async with anyio.create_task_group() as tg:`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`self._completed_accounts[acc] = done_ev = anyio.Event()`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`tx, rx = anyio.create_memory_object_stream()`
			`async with rx, tx:`
			`tg.start_soon(self._process_pages, rx, acc)`
			`tg.start_soon(self._fetch_account, tx, acc)`
			`await done_ev.wait()`
			`# processing is complete, so halt fetching.`
			`# processing may complete before fetching if we get caught up on new posts.`
			`tg.cancel_scope.cancel()`

			`async def _process_pages(self, stream, account):`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`done_ev = self._completed_accounts[account]`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`try:`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`async for activity in stream:`
			`try:`
			`await self._insert_activity(activity)`
			`except aiosqlite.IntegrityError as exc:`
			`# LOL sqlite error handling is so bad`
			`if exc.args[0].startswith('UNIQUE constraint failed: '):`
			`# this means we've encountered an item we already have saved`
add TODO comment 2021-09-17 08:37:39 +02:00			`# TODO we need to ignore this if we don't actually have all the posts.`
			`# For example, if a prior fetch was interrupted, we'll have k pages of the most recent posts,`
			`# but no more. But since we still have the most recent page saved, it'll look like`
			`# we've saved everything, since we stop as soon as we encounter a post we already have.`
			`# To fix this we can check against totalItems in the user's outbox.`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`break`

save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`self.erroneous_accounts.append(account)`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`raise`
			`finally:`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`print('Saving posts from', account, 'to the DB')`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`await self._db.commit()`
a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`done_ev.set()`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
			`async def _insert_activity(self, activity):`
			`if activity['type'] != 'Create':`
			`# this isn't a post but something else (like, boost, reaction, etc)`
			`return`

			`obj = activity['object']`

			`await self._db.execute(`
			`"""`
			`INSERT INTO posts (post_id, summary, content, published_at)`
			`VALUES (?, ?, ?, ?)`
			`""",`
			`(`
			`obj['id'],`
normalize Pleroman posts without cws (fix #3) 2021-09-27 13:13:03 +02:00			`# Pleroma returns an empty string here for posts without a CW,`
			`# which is semantically incorrect IMO`
			`obj['summary'] or None,`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`extract_post_content(obj['content']),`
			`pendulum.parse(obj['published']).astimezone(pendulum.timezone('UTC')).timestamp(),`
			`),`
			`)`

a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`# TODO figure out why i put shield here lol`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`@shield`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`async def _fetch_account(self, tx, account: AccountHandle):`
			`done_ev = self._completed_accounts[account]`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`try:`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`outbox = await self.fetch_outbox(account)`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`except Exception as exc:`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`import traceback`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`traceback.print_exception(type(exc), exc, exc.__traceback__)`
a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`done_ev.set()`
save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`self.erroneous_accounts.append(account)`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`return`

save some memory by using account handles instead of objects 2021-08-19 12:40:57 +02:00			`print(f'Fetching posts for {account}...')`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
			`next_page_url = outbox['first']`
			`while True:`
a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`print(f'Fetching {next_page_url}... ')`
handle rate limits 2021-09-17 08:35:54 +02:00			`async with self._rl_handler.request('GET', next_page_url) as resp: page = await resp.json()`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
			`for activity in page['orderedItems']:`
			`try:`
			`await tx.send(activity)`
			`except anyio.BrokenResourceError:`
			`# already closed means we're already done`
			`return`

			`# show progress`
			`#print('.', end='', flush=True)`

			`if not (next_page_url := page.get('next')):`
			`#done_ev.set()`
			`break`

			`done_ev.set()`

			`async def fetch_outbox(self, handle):`
			`"""finger handle, a fully-qualified ActivityPub actor name, returning their outbox URL"""`
			`# it's fucking incredible how overengineered ActivityPub is btw`
			`print('Fingering ', handle, '...', sep='')`

			`username, at, instance = handle.lstrip('@').partition('@')`
			`assert at == '@'`

			`# i was planning on doing /.well-known/host-meta to find the webfinger URL, but`
			`# 1) honk does not support host-meta`
			`# 2) WebFinger is always located at the same location anyway`

			`profile_url = await self._finger_actor(username, instance)`

			`try:`
			`async with self._http.get(profile_url) as resp: profile = await resp.json()`
			`except aiohttp.ContentTypeError:`
			`# we didn't get JSON, so just guess the outbox URL`
			`outbox_url = profile_url + '/outbox'`
			`else:`
			`outbox_url = profile['outbox']`

			`async with self._http.get(outbox_url) as resp: outbox = await resp.json()`
			`assert outbox['type'] == 'OrderedCollection'`
			`return outbox`

			`async def _finger_actor(self, username, instance):`
			`# despite HTTP being a direct violation of the WebFinger spec, assume e.g. Tor instances do not support`
			`# HTTPS-over-onion`
			`finger_url = f'http://{instance}/.well-known/webfinger?resource=acct:{username}@{instance}'`
			`async with self._http.get(finger_url) as resp: finger_result = await resp.json()`
			`return (profile_url := self._parse_webfinger_result(username, instance, finger_result))`

			`def _parse_webfinger_result(self, username, instance, finger_result):`
			`"""given webfinger data, return profile URL for handle"""`
			`def check_content_type(type, ct): return ct == type or ct.startswith(type+';')`
			`check_ap = partial(check_content_type, ACTIVITYPUB_CONTENT_TYPE)`

			`try:`
			`# note: the server might decide to return multiple links`
			`# so we need to decide how to prefer one.`
fetch_posts.py: remove unused import 2021-07-26 08:04:14 +02:00			`# i'd put "and yarl.URL(template).host == instance" here,`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`# but some instances have no subdomain for the handle yet use a subdomain for the canonical URL.`
			`# Additionally, an instance could theoretically serve profile pages over I2P and the clearnet,`
			`# for example.`
			`return (profile_url := next(`
			`link['href']`
			`for link in finger_result['links']`
			`if link['rel'] == 'self' and check_ap(link['type'])`
			`))`
			`except StopIteration:`
			`# this should never happen either`
			`raise RuntimeError(f'fatal: while fingering {username}@{instance}, failed to find a profile URL')`

			`async def amain():`
			`import json5 as json`
fetch_posts.py: use argparse 2021-07-26 08:04:32 +02:00			`import third_party.utils as utils`
			`args = utils.arg_parser_factory(description='Fetch posts from all followed accounts').parse_args()`
			`config = utils.load_config(args.cfg)`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`async with PostFetcher(config=config) as fetcher: await fetcher.fetch_all()`
a few changes - better error reporting - fix the program hanging at the end if any of the accounts failed to fetch - remove trailing semicolon from user agent string 2021-08-13 11:40:20 +02:00			`if (accs := fetcher.erroneous_accounts):`
			`print(`
			`'Exiting unsuccessfully due to previous errors in these accounts:',`
			`', '.join(accs),`
			`file=sys.stderr,`
			`)`
			`sys.exit(1)`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00
			`def main():`
fetch_posts: add KeyboardInterrupt handling 2021-07-26 08:17:01 +02:00			`try:`
			`anyio.run(amain)`
			`except KeyboardInterrupt:`
			`# suppress the lengthy traceback`
			`sys.exit(1)`
initial commit 2018-10-09 03:11:51 +02:00
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`if __name__ == '__main__':`
rewrite fetch_posts.py from scratch now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB 2021-07-26 06:59:04 +02:00			`main()`