rewrite fetch_posts.py from scratch
now it should be properly async by fetching posts in a separate task and sending them across a queue to a task that inserts to the DB
This commit is contained in:
parent
c22a493dff
commit
330fdc2809
1 changed files with 176 additions and 188 deletions
372
fetch_posts.py
372
fetch_posts.py
|
@ -1,223 +1,211 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: EUPL-1.2
|
# SPDX-License-Identifier: AGPL-3.0-only
|
||||||
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import json
|
|
||||||
import anyio
|
import anyio
|
||||||
import asqlite
|
|
||||||
import sqlite3
|
|
||||||
import asyncio
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import argparse
|
import platform
|
||||||
import functions
|
import pendulum
|
||||||
|
import aiosqlite
|
||||||
import contextlib
|
import contextlib
|
||||||
from http import HTTPStatus
|
from yarl import URL
|
||||||
from pleroma import Pleroma, http_session_factory
|
from utils import shield
|
||||||
|
from pleroma import Pleroma
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from functools import partial
|
||||||
|
from third_party.utils import extract_post_content
|
||||||
|
|
||||||
PATTERNS = {
|
USER_AGENT = (
|
||||||
"handle": re.compile(r'^.*@(.+)'),
|
'fedi-ebooks; '
|
||||||
"base_url": re.compile(r'https?:\/\/(.*)'),
|
f'{aiohttp.__version__}; '
|
||||||
"webfinger_template_url": re.compile(r'template="([^"]+)"'),
|
f'{platform.python_implementation()}/{platform.python_version()}; '
|
||||||
"post_id": re.compile(r'[^\/]+$'),
|
)
|
||||||
}
|
|
||||||
|
|
||||||
@contextlib.asynccontextmanager
|
UTC = pendulum.timezone('UTC')
|
||||||
async def get_db():
|
JSON_CONTENT_TYPE = 'application/json'
|
||||||
async with asqlite.connect('toots.db') as conn:
|
ACTIVITYPUB_CONTENT_TYPE = 'application/activity+json'
|
||||||
async with conn.cursor() as cur:
|
|
||||||
await cur.execute("""
|
class PostFetcher:
|
||||||
CREATE TABLE IF NOT EXISTS toots (
|
def __init__(self, *, config):
|
||||||
sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT,
|
self.config = config
|
||||||
id VARCHAR NOT NULL,
|
|
||||||
cw VARCHAR,
|
async def __aenter__(self):
|
||||||
userid VARCHAR NOT NULL,
|
stack = contextlib.AsyncExitStack()
|
||||||
uri VARCHAR NOT NULL,
|
self._fedi = await stack.enter_async_context(
|
||||||
content VARCHAR NOT NULL
|
Pleroma(api_base_url=self.config['site'], access_token=self.config['access_token']),
|
||||||
)
|
)
|
||||||
""")
|
self._http = await stack.enter_async_context(
|
||||||
await cur.execute("""
|
aiohttp.ClientSession(
|
||||||
CREATE TABLE IF NOT EXISTS cursors (
|
headers={
|
||||||
userid VARCHAR PRIMARY KEY,
|
'User-Agent': USER_AGENT,
|
||||||
next_page VARCHAR NOT NULL
|
'Accept': ', '.join([JSON_CONTENT_TYPE, ACTIVITYPUB_CONTENT_TYPE]),
|
||||||
|
},
|
||||||
|
trust_env=True,
|
||||||
|
raise_for_status=True,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
""")
|
self._db = await stack.enter_async_context(aiosqlite.connect(self.config.get('db_path', 'posts.db')))
|
||||||
await cur.execute("""
|
self._db.row_factory = aiosqlite.Row
|
||||||
CREATE TRIGGER IF NOT EXISTS dedup
|
self._ctx_stack = stack
|
||||||
AFTER INSERT ON toots
|
return self
|
||||||
FOR EACH ROW BEGIN
|
|
||||||
DELETE FROM toots
|
|
||||||
WHERE rowid NOT IN (
|
|
||||||
SELECT MIN(sortid)
|
|
||||||
FROM toots GROUP BY uri
|
|
||||||
);
|
|
||||||
END
|
|
||||||
""")
|
|
||||||
await conn.commit()
|
|
||||||
yield conn
|
|
||||||
|
|
||||||
async def main():
|
async def __aexit__(self, *excinfo):
|
||||||
args = functions.parse_args(description='Log in and download posts.')
|
return await self._ctx_stack.__aexit__(*excinfo)
|
||||||
cfg = functions.load_config(args.cfg)
|
|
||||||
|
|
||||||
async with (
|
|
||||||
Pleroma(api_base_url=cfg['site'], access_token=cfg['access_token']) as client,
|
|
||||||
get_db() as db, db.cursor() as cur,
|
|
||||||
http_session_factory() as http,
|
|
||||||
):
|
|
||||||
try:
|
|
||||||
following = await client.following()
|
|
||||||
except aiohttp.ClientResponseError as exc:
|
|
||||||
if exc.status == HTTPStatus.FORBIDDEN:
|
|
||||||
print(f'The provided access token in {args.cfg} is invalid.', file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
async def fetch_all(self):
|
||||||
|
await self._fedi.verify_credentials()
|
||||||
|
self._completed_accounts = {}
|
||||||
async with anyio.create_task_group() as tg:
|
async with anyio.create_task_group() as tg:
|
||||||
for acc in following:
|
for acc in await self._fedi.following():
|
||||||
tg.start_soon(fetch_posts, cfg, http, cur, acc)
|
tg.start_soon(self._do_account, acc)
|
||||||
|
|
||||||
print('Done!')
|
async def _do_account(self, acc):
|
||||||
|
|
||||||
await db.commit()
|
|
||||||
await db.execute('VACUUM') # compact db
|
|
||||||
await db.commit()
|
|
||||||
|
|
||||||
async def fetch_posts(cfg, http, cur, acc):
|
|
||||||
next_page = await (await cur.execute('SELECT next_page FROM cursors WHERE userid = ?', (acc['id'],))).fetchone()
|
|
||||||
direction = 'next'
|
|
||||||
if next_page is not None:
|
|
||||||
next_page ,= next_page
|
|
||||||
direction = 'prev'
|
|
||||||
print('Downloading posts for user @' + acc['acct'])
|
|
||||||
|
|
||||||
page = await fetch_first_page(cfg, http, acc, next_page)
|
|
||||||
|
|
||||||
if 'next' not in page and 'prev' not in page:
|
|
||||||
# there's only one page of results, don't bother doing anything special
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# this is for when we're all done. it points to the activities created *after* we started fetching.
|
|
||||||
next_page = page['prev']
|
|
||||||
|
|
||||||
print('Downloading and saving posts', end='', flush=True)
|
|
||||||
done = False
|
|
||||||
while not done and len(page['orderedItems']) > 0:
|
|
||||||
try:
|
|
||||||
async with anyio.create_task_group() as tg:
|
async with anyio.create_task_group() as tg:
|
||||||
for obj in page['orderedItems']:
|
self._completed_accounts[acc['fqn']] = done_ev = anyio.Event()
|
||||||
tg.start_soon(process_object, cur, acc, obj)
|
tx, rx = anyio.create_memory_object_stream()
|
||||||
except DoneWithAccount:
|
async with rx, tx:
|
||||||
done = True
|
tg.start_soon(self._process_pages, rx, acc)
|
||||||
continue
|
tg.start_soon(self._fetch_account, tx, acc)
|
||||||
except anyio.ExceptionGroup as eg:
|
await done_ev.wait()
|
||||||
for exc in eg.exceptions:
|
# processing is complete, so halt fetching.
|
||||||
if isinstance(exc, DoneWithAccount):
|
# processing may complete before fetching if we get caught up on new posts.
|
||||||
done = True
|
tg.cancel_scope.cancel()
|
||||||
continue
|
|
||||||
|
|
||||||
# get the next/previous page
|
async def _process_pages(self, stream, account):
|
||||||
|
done_ev = self._completed_accounts[account['fqn']]
|
||||||
try:
|
try:
|
||||||
async with http.get(page[direction], timeout=15) as resp:
|
async for activity in stream:
|
||||||
page = await resp.json()
|
try:
|
||||||
except asyncio.TimeoutError:
|
await self._insert_activity(activity)
|
||||||
print('HTTP timeout, site did not respond within 15 seconds', file=sys.stderr)
|
except aiosqlite.IntegrityError as exc:
|
||||||
except KeyError:
|
# LOL sqlite error handling is so bad
|
||||||
print("Couldn't get next page - we've probably got all the posts", file=sys.stderr)
|
if exc.args[0].startswith('UNIQUE constraint failed: '):
|
||||||
except KeyboardInterrupt:
|
# this means we've encountered an item we already have saved
|
||||||
done = True
|
done_ev.set()
|
||||||
break
|
|
||||||
except aiohttp.ClientResponseError as exc:
|
|
||||||
if exc.status == HTTPStatus.TOO_MANY_REQUESTS:
|
|
||||||
print("We're rate limited. Skipping to next account.")
|
|
||||||
done = True
|
|
||||||
break
|
break
|
||||||
|
|
||||||
raise
|
raise
|
||||||
except Exception:
|
finally:
|
||||||
import traceback
|
print('COMMIT')
|
||||||
print('An error occurred while trying to obtain more posts:', file=sys.stderr)
|
await self._db.commit()
|
||||||
traceback.print_exc()
|
|
||||||
|
|
||||||
print('.', end='', flush=True)
|
async def _insert_activity(self, activity):
|
||||||
else:
|
if activity['type'] != 'Create':
|
||||||
# the while loop ran without breaking
|
# this isn't a post but something else (like, boost, reaction, etc)
|
||||||
await cur.execute('REPLACE INTO cursors (userid, next_page) VALUES (?, ?)', (acc['id'], next_page))
|
|
||||||
await cur.connection.commit()
|
|
||||||
|
|
||||||
print(' Done!')
|
|
||||||
|
|
||||||
async def finger(cfg, http, acc):
|
|
||||||
instance = PATTERNS['handle'].search(acc['acct'])
|
|
||||||
if instance is None:
|
|
||||||
instance = PATTERNS['base_url'].search(cfg['site'])[1]
|
|
||||||
else:
|
|
||||||
instance = instance[1]
|
|
||||||
|
|
||||||
# 1. download host-meta to find webfinger URL
|
|
||||||
async with http.get('https://{}/.well-known/host-meta'.format(instance), timeout=10) as resp:
|
|
||||||
host_meta = await resp.text()
|
|
||||||
|
|
||||||
# 2. use webfinger to find user's info page
|
|
||||||
webfinger_url = PATTERNS['webfinger_template_url'].search(host_meta).group(1)
|
|
||||||
webfinger_url = webfinger_url.format(uri='{}@{}'.format(acc['username'], instance))
|
|
||||||
|
|
||||||
async with http.get(webfinger_url, headers={'Accept': 'application/json'}, timeout=10) as resp:
|
|
||||||
profile = await resp.json()
|
|
||||||
|
|
||||||
for link in profile['links']:
|
|
||||||
if link['rel'] == 'self':
|
|
||||||
# this is a link formatted like 'https://instan.ce/users/username', which is what we need
|
|
||||||
return link['href']
|
|
||||||
|
|
||||||
print("Couldn't find a valid ActivityPub outbox URL.", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
class DoneWithAccount(Exception): pass
|
|
||||||
|
|
||||||
async def process_object(cur, acc, obj):
|
|
||||||
if obj['type'] != 'Create':
|
|
||||||
# this isn't a toot/post/status/whatever, it's a boost or a follow or some other activitypub thing. ignore
|
|
||||||
return
|
return
|
||||||
|
|
||||||
# its a toost baby
|
obj = activity['object']
|
||||||
content = obj['object']['content']
|
|
||||||
toot = extract_toot(content)
|
content = extract_post_content(obj['content'])
|
||||||
|
await self._db.execute(
|
||||||
|
"""
|
||||||
|
INSERT INTO posts (post_id, summary, content, published_at)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
""",
|
||||||
|
(
|
||||||
|
obj['id'],
|
||||||
|
obj['summary'],
|
||||||
|
extract_post_content(obj['content']),
|
||||||
|
pendulum.parse(obj['published']).astimezone(pendulum.timezone('UTC')).timestamp(),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
@shield
|
||||||
|
async def _fetch_account(self, tx, account):
|
||||||
|
done_ev = self._completed_accounts[account['fqn']]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await cur.execute('SELECT COUNT(*) FROM toots WHERE uri = ?', (obj['object']['id'],))
|
outbox = await self.fetch_outbox(account['fqn'])
|
||||||
existing = await cur.fetchone()
|
except Exception as exc:
|
||||||
if existing is not None and existing[0]:
|
import traceback
|
||||||
# we've caught up to the notices we've already downloaded, so we can stop now
|
traceback.print_exception(type(exc), exc, exc.__traceback__)
|
||||||
# you might be wondering, 'lynne, what if the instance ratelimits you after 40 posts, and they've made 60 since main.py was last run? wouldn't the bot miss 20 posts and never be able to see them?' to which i reply, 'i know but i don't know how to fix it'
|
return
|
||||||
raise DoneWithAccount
|
|
||||||
await insert_toot(cur, acc, obj, toot)
|
|
||||||
except sqlite3.Error:
|
|
||||||
pass # ignore any toots that don't successfully go into the DB
|
|
||||||
|
|
||||||
async def fetch_first_page(cfg, http, acc, next_page):
|
print(f'Fetching posts for {account["acct"]}...')
|
||||||
# download a page of the outbox
|
|
||||||
if not next_page:
|
next_page_url = outbox['first']
|
||||||
print('Fingering UwU...')
|
while True:
|
||||||
# find the user's activitypub outbox
|
print(f'Fetching {next_page_url}... ', end='', flush=True)
|
||||||
outbox_url = await finger(cfg, http, acc) + '/outbox?page=true'
|
async with self._http.get(next_page_url) as resp: page = await resp.json()
|
||||||
|
print('done.')
|
||||||
|
|
||||||
|
for activity in page['orderedItems']:
|
||||||
|
try:
|
||||||
|
await tx.send(activity)
|
||||||
|
except anyio.BrokenResourceError:
|
||||||
|
# already closed means we're already done
|
||||||
|
return
|
||||||
|
|
||||||
|
# show progress
|
||||||
|
#print('.', end='', flush=True)
|
||||||
|
|
||||||
|
if not (next_page_url := page.get('next')):
|
||||||
|
#done_ev.set()
|
||||||
|
break
|
||||||
|
|
||||||
|
done_ev.set()
|
||||||
|
|
||||||
|
async def fetch_outbox(self, handle):
|
||||||
|
"""finger handle, a fully-qualified ActivityPub actor name, returning their outbox URL"""
|
||||||
|
# it's fucking incredible how overengineered ActivityPub is btw
|
||||||
|
print('Fingering ', handle, '...', sep='')
|
||||||
|
|
||||||
|
username, at, instance = handle.lstrip('@').partition('@')
|
||||||
|
assert at == '@'
|
||||||
|
|
||||||
|
# i was planning on doing /.well-known/host-meta to find the webfinger URL, but
|
||||||
|
# 1) honk does not support host-meta
|
||||||
|
# 2) WebFinger is always located at the same location anyway
|
||||||
|
|
||||||
|
profile_url = await self._finger_actor(username, instance)
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with self._http.get(profile_url) as resp: profile = await resp.json()
|
||||||
|
except aiohttp.ContentTypeError:
|
||||||
|
# we didn't get JSON, so just guess the outbox URL
|
||||||
|
outbox_url = profile_url + '/outbox'
|
||||||
else:
|
else:
|
||||||
outbox_url = next_page
|
outbox_url = profile['outbox']
|
||||||
|
|
||||||
async with http.get(outbox_url, timeout=15) as resp:
|
async with self._http.get(outbox_url) as resp: outbox = await resp.json()
|
||||||
return await resp.json()
|
assert outbox['type'] == 'OrderedCollection'
|
||||||
|
return outbox
|
||||||
|
|
||||||
def extract_toot(toot):
|
async def _finger_actor(self, username, instance):
|
||||||
toot = functions.extract_toot(toot)
|
# despite HTTP being a direct violation of the WebFinger spec, assume e.g. Tor instances do not support
|
||||||
toot = toot.replace('@', '@\u200B') # put a zws between @ and username to avoid mentioning
|
# HTTPS-over-onion
|
||||||
return(toot)
|
finger_url = f'http://{instance}/.well-known/webfinger?resource=acct:{username}@{instance}'
|
||||||
|
async with self._http.get(finger_url) as resp: finger_result = await resp.json()
|
||||||
|
return (profile_url := self._parse_webfinger_result(username, instance, finger_result))
|
||||||
|
|
||||||
async def insert_toot(cursor, acc, obj, content):
|
def _parse_webfinger_result(self, username, instance, finger_result):
|
||||||
post_id = PATTERNS['post_id'].search(obj['object']['id']).group(0)
|
"""given webfinger data, return profile URL for handle"""
|
||||||
await cursor.execute('REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)', (
|
def check_content_type(type, ct): return ct == type or ct.startswith(type+';')
|
||||||
post_id,
|
check_ap = partial(check_content_type, ACTIVITYPUB_CONTENT_TYPE)
|
||||||
obj['object']['summary'] or None,
|
|
||||||
acc['id'],
|
try:
|
||||||
obj['object']['id'],
|
# note: the server might decide to return multiple links
|
||||||
content,
|
# so we need to decide how to prefer one.
|
||||||
|
# i'd put "and URL(template).host == instance" here,
|
||||||
|
# but some instances have no subdomain for the handle yet use a subdomain for the canonical URL.
|
||||||
|
# Additionally, an instance could theoretically serve profile pages over I2P and the clearnet,
|
||||||
|
# for example.
|
||||||
|
return (profile_url := next(
|
||||||
|
link['href']
|
||||||
|
for link in finger_result['links']
|
||||||
|
if link['rel'] == 'self' and check_ap(link['type'])
|
||||||
))
|
))
|
||||||
|
except StopIteration:
|
||||||
|
# this should never happen either
|
||||||
|
raise RuntimeError(f'fatal: while fingering {username}@{instance}, failed to find a profile URL')
|
||||||
|
|
||||||
|
async def amain():
|
||||||
|
import json5 as json
|
||||||
|
with open('config.json' if len(sys.argv) < 2 else sys.argv[1]) as f: config = json.load(f)
|
||||||
|
async with PostFetcher(config=config) as fetcher: await fetcher.fetch_all()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
anyio.run(amain)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
anyio.run(main)
|
main()
|
||||||
|
|
Loading…
Reference in a new issue