diff --git a/.gitignore b/.gitignore index b8a7f3f..7b8cac2 100644 --- a/.gitignore +++ b/.gitignore @@ -19,8 +19,3 @@ __pycache__/ !*.defaults.json venv/ .venv/ - -bin -lib -lib64 -pyenv.cfg diff --git a/README.md b/README.md index 47d7bcf..fda1675 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,9 @@ # pleroma-ebooks -this is a very slightly modified version of which is modified from - -_autumn's changes_: removes any generated @'s in replies to prevent accidentally mentioning somebody else on the same instance - -_smitten's changes_: adjusts CW filtering to be word-based regexp instead of exact match - It's like [@AgathaSorceress's mstdn-ebooks] but it supports Pleroma better. [@AgathaSorceress's mstdn-ebooks]: https://github.com/AgathaSorceress/mstdn-ebooks -## How to Use -1. Create your bot account on the server. -2. Follow the user(s) you want to base the model on. -3. Get an access token for your bot. See [mastodon-bot](https://tinysubversions.com/notes/mastodon-bot/) for details. -4. Copy `config.defaults.json` to `config.json` and set as `access_token`. Make any other config tweaks you'd like. -5. Run `fetch_posts.py` to collect the posts from the followed user(s). -6. Run `gen.py` to generate the sentence and write it to the server. -7. (optional) set up cron or other periodic auto-run. run `reply.py` in the background. - ## Secure Fetch Secure fetch (aka authorised fetches, authenticated fetches, secure mode...) is *not* supported by pleroma-ebooks, and will fail to download any posts from users on instances with secure fetch enabled. For more information, see [this wiki page](https://github.com/Lynnesbian/mstdn-ebooks/wiki/Secure-fetch). @@ -43,11 +28,11 @@ Configuring pleroma-ebooks is accomplished by editing `config.json`. If you want | site | https://botsin.space | The instance your bot will log in to and post from. 
This must start with `https://` or `http://` (preferably the former) | | cw | null | The content warning (aka subject) pleroma-ebooks will apply to non-error posts. | | learn_from_cw | false | If true, pleroma-ebooks will learn from CW'd posts. | -| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CW words. matches case-insensitive and by word. +| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CWs. | mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). | | max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. | | strip_paired_punctuation | false | If true, pleroma-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. | -| limit_length | false | If true, the sentence word length will be random between `length_lower_limit` and `length_upper_limit` | +| limit_length | false | If true, the sentence length will be random between `length_lower_limit` and `length_upper_limit` | | length_lower_limit | 5 | The lower bound in the random number range above. Only matters if `limit_length` is true. | | length_upper_limit | 50 | The upper bound in the random number range above. Can be the same as `length_lower_limit` to disable randomness. Only matters if `limit_length` is true. | | overlap_ratio_enabled | false | If true, checks the output's similarity to the original posts. | @@ -63,4 +48,4 @@ Please don't feel obligated to donate at all. This is released under the AGPLv3 (only) license, and based on Lynnesbian's fork which is under the MPL 2.0 license. See LICENSE-AGPL.md and LICENSE-MPL for details. 
-**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. If you make changes to the code you need to link to your fork/repo instead. +**This means you must publish the source code of any ebooks bot you make with this.** A link back to this repository on your bot's profile page or profile metadata will suffice. If you make changes to the code you need to link to your fork/repo instead diff --git a/fetch_posts.py b/fetch_posts.py index d5eaa2f..611604d 100755 --- a/fetch_posts.py +++ b/fetch_posts.py @@ -10,13 +10,19 @@ import operator import aiosqlite import contextlib from yarl import URL -from pleroma import Pleroma, HandleRateLimits +from pleroma import Pleroma from bs4 import BeautifulSoup from functools import partial from typing import Iterable, NewType -from utils import shield, suppress, http_session_factory +from utils import shield, HandleRateLimits, suppress from third_party.utils import extract_post_content +USER_AGENT = ( + 'pleroma-ebooks; ' + f'{aiohttp.__version__}; ' + f'{platform.python_implementation()}/{platform.python_version()}' +) + UTC = pendulum.timezone('UTC') JSON_CONTENT_TYPE = 'application/json' ACTIVITYPUB_CONTENT_TYPE = 'application/activity+json' @@ -34,8 +40,11 @@ class PostFetcher: Pleroma(api_base_url=self.config['site'], access_token=self.config['access_token']), ) self._http = await stack.enter_async_context( - http_session_factory( - headers={'Accept': ', '.join([JSON_CONTENT_TYPE, ACTIVITYPUB_CONTENT_TYPE])}, + aiohttp.ClientSession( + headers={ + 'User-Agent': USER_AGENT, + 'Accept': ', '.join([JSON_CONTENT_TYPE, ACTIVITYPUB_CONTENT_TYPE]), + }, trust_env=True, raise_for_status=True, ), @@ -128,11 +137,7 @@ class PostFetcher: obj = activity['object'] - try: - obj['summary'] - except KeyError: - obj['summary'] = None - + content = extract_post_content(obj['content']) await self._db.execute( """ INSERT INTO 
posts (post_id, summary, content, published_at) @@ -140,9 +145,7 @@ class PostFetcher: """, ( obj['id'], - # Pleroma returns an empty string here for posts without a CW, - # which is semantically incorrect IMO - obj['summary'] or None, + obj['summary'], extract_post_content(obj['content']), pendulum.parse(obj['published']).astimezone(pendulum.timezone('UTC')).timestamp(), ), diff --git a/generators/markov.py b/generators/markov.py index a777f90..5bf7a0e 100644 --- a/generators/markov.py +++ b/generators/markov.py @@ -2,7 +2,6 @@ import sqlite3 import markovify -import regex def make_sentence(cfg): class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences @@ -11,21 +10,19 @@ def make_sentence(cfg): db = sqlite3.connect(cfg["db_path"]) db.text_factory = str - p = regex.compile(r"\L", words=cfg["ignored_cws"],flags=regex.IGNORECASE) - def cw_regexp(x): - return 1 if p.search(x) else 0 - db.create_function('cwregexp', 1, cw_regexp) c = db.cursor() if cfg['learn_from_cw']: + ignored_cws_query_params = "(" + ",".join("?" 
def http_session_factory(headers=None, **kwargs):
    """Create an ``aiohttp.ClientSession`` that identifies itself as pleroma-ebooks.

    Args:
        headers: optional mapping of extra default headers. Entries here
            override the generated ``User-Agent`` if they use the same key.
        **kwargs: forwarded unchanged to ``aiohttp.ClientSession``.

    Returns:
        A new, not yet entered, ``aiohttp.ClientSession``.
    """
    # Only major.minor.micro: joining the whole of sys.version_info would
    # append the release level and serial (e.g. "3.9.7.final.0").
    py_version = '.'.join(map(str, sys.version_info[:3]))
    # Every interpolated piece must be an f-string, otherwise the literal
    # text "{aiohttp.__version__}" is what gets sent to the server.
    user_agent = (
        'pleroma-ebooks (https://github.com/ioistired/pleroma-ebooks); '
        f'aiohttp/{aiohttp.__version__}; '
        f'python/{py_version}'
    )
    return aiohttp.ClientSession(
        headers={'User-Agent': user_agent, **(headers or {})},
        **kwargs,
    )


class BadRequest(Exception):
    """Raised by ``Pleroma.request`` when the server answers HTTP 400."""


class Pleroma:
    """Minimal asynchronous client for the Mastodon-compatible Pleroma API.

    Use it as an async context manager so that the underlying HTTP session
    is entered and closed properly.
    """

    def __init__(self, *, api_base_url, access_token):
        """Store the connection settings and build the HTTP session.

        Args:
            api_base_url: instance URL; trailing slashes are stripped.
            access_token: OAuth bearer token sent with every request.
        """
        self.api_base_url = api_base_url.rstrip('/')
        self.access_token = access_token
        self._session = http_session_factory({'Authorization': 'Bearer ' + access_token})
        # Lazily filled by _get_logged_in_id().
        self._logged_in_id = None

    async def __aenter__(self):
        self._session = await self._session.__aenter__()
        return self

    async def __aexit__(self, *excinfo):
        return await self._session.__aexit__(*excinfo)

    async def request(self, method, path, **kwargs):
        """Send one API request and return the decoded JSON body.

        Args:
            method: HTTP method name.
            path: path appended verbatim to ``api_base_url``.
            **kwargs: forwarded to ``aiohttp.ClientSession.request``.

        Raises:
            RuntimeError: if the configured instance is on the blocklist.
            BadRequest: if the server responds with HTTP 400; carries the
                ``error`` field of the response body.
        """
        # blocklist of some horrible instances
        if hashlib.sha256(
            yarl.URL(self.api_base_url).host.encode()
            + bytes.fromhex('d590e3c48d599db6776e89dfc8ebaf53c8cd84866a76305049d8d8c5d4126ce1')
        ).hexdigest() in {
            '56704d4d95b882e81c8e7765e9079be0afc4e353925ba9add8fd65976f52db83',
            '1932431fa41a0baaccce7815115b01e40e0237035bb155713712075b887f5a19',
            'a42191105a9f3514a1d5131969c07a95e06d0fdf0058f18e478823bf299881c9',
        }:
            raise RuntimeError('stop being a chud')

        async with self._session.request(method, self.api_base_url + path, **kwargs) as resp:
            if resp.status == HTTPStatus.BAD_REQUEST:
                raise BadRequest((await resp.json())['error'])
            # NOTE: other error statuses are deliberately not raised here;
            # their bodies are decoded and returned like any other response.
            return await resp.json()

    async def verify_credentials(self):
        """Return the account object of the logged-in user."""
        return await self.request('GET', '/api/v1/accounts/verify_credentials')

    me = verify_credentials

    async def _get_logged_in_id(self):
        """Return the logged-in account's id, fetching it once and caching it."""
        if self._logged_in_id is None:
            self._logged_in_id = (await self.me())['id']
        return self._logged_in_id

    async def following(self, account_id=None):
        """Return the accounts followed by ``account_id`` (default: ourselves)."""
        account_id = account_id or await self._get_logged_in_id()
        return await self.request('GET', f'/api/v1/accounts/{account_id}/following')

    @staticmethod
    def _unpack_id(obj):
        """Return ``obj['id']`` for dicts that have an id, else ``obj`` itself."""
        if isinstance(obj, dict) and 'id' in obj:
            return obj['id']
        return obj

    async def status_context(self, id):
        """Return the thread context of a status (given as id or status dict)."""
        id = self._unpack_id(id)
        return await self.request('GET', f'/api/v1/statuses/{id}/context')

    async def post(self, content, *, in_reply_to_id=None, cw=None, visibility=None):
        """Publish a status.

        Args:
            content: text of the status.
            in_reply_to_id: id, or status dict, of the status being replied to.
            cw: content warning, sent as ``spoiler_text`` when not ``None``.
            visibility: ``None`` or one of ``private``, ``public``,
                ``unlisted``, ``direct``.

        Raises:
            ValueError: if ``visibility`` is not one of the accepted values.
        """
        if visibility not in {None, 'private', 'public', 'unlisted', 'direct'}:
            raise ValueError('invalid visibility', visibility)

        data = dict(status=content)
        if in_reply_to_id := self._unpack_id(in_reply_to_id):
            data['in_reply_to_id'] = in_reply_to_id
        if visibility is not None:
            data['visibility'] = visibility
        if cw is not None:
            data['spoiler_text'] = cw

        return await self.request('POST', '/api/v1/statuses', data=data)

    async def reply(self, to_status, content, *, cw=None):
        """Reply to ``to_status``, mentioning its author and everyone it mentioned.

        Public statuses are answered as ``unlisted``; any other visibility is
        reused as is. When no ``cw`` is given and the original status has a
        non-empty one, it is reused with a ``re: `` prefix.
        """
        user_id = await self._get_logged_in_id()

        # Keyed by account id so nobody is mentioned twice; the author of the
        # status comes first and we never mention ourselves.
        mentioned_accounts = {}
        mentioned_accounts[to_status['account']['id']] = to_status['account']['acct']
        for account in to_status['mentions']:
            if account['id'] != user_id and account['id'] not in mentioned_accounts:
                mentioned_accounts[account['id']] = account['acct']

        content = ''.join('@' + x + ' ' for x in mentioned_accounts.values()) + content

        visibility = 'unlisted' if to_status['visibility'] == 'public' else to_status['visibility']
        if cw is None and to_status.get('spoiler_text'):
            cw = 're: ' + to_status['spoiler_text']

        return await self.post(content, in_reply_to_id=to_status['id'], cw=cw, visibility=visibility)

    async def favorite(self, id):
        id = self._unpack_id(id)
        return await self.request('POST', f'/api/v1/statuses/{id}/favourite')

    async def unfavorite(self, id):
        id = self._unpack_id(id)
        return await self.request('POST', f'/api/v1/statuses/{id}/unfavourite')

    async def react(self, id, reaction):
        id = self._unpack_id(id)
        return await self.request('PUT', f'/api/v1/pleroma/statuses/{id}/reactions/{reaction}')

    async def remove_reaction(self, id, reaction):
        id = self._unpack_id(id)
        return await self.request('DELETE', f'/api/v1/pleroma/statuses/{id}/reactions/{reaction}')

    async def pin(self, id):
        id = self._unpack_id(id)
        return await self.request('POST', f'/api/v1/statuses/{id}/pin')

    async def unpin(self, id):
        id = self._unpack_id(id)
        return await self.request('POST', f'/api/v1/statuses/{id}/unpin')

    async def stream(self, stream_name, *, target_event_type=None):
        """Yield events from the streaming API over a websocket.

        ``filters_changed`` events are yielded whole. For every other event
        the decoded ``payload`` is yielded, restricted to
        ``target_event_type`` when one is given.
        """
        async with self._session.ws_connect(
            self.api_base_url + f'/api/v1/streaming?stream={stream_name}&access_token={self.access_token}'
        ) as ws:
            async for msg in ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    event = msg.json()
                    # the only event type that doesn't define `payload` is `filters_changed`
                    if event['event'] == 'filters_changed':
                        yield event
                    elif target_event_type is None or event['event'] == target_event_type:
                        # don't ask me why the payload is also JSON encoded smh
                        yield json.loads(event['payload'])

    async def stream_notifications(self):
        """Yield every notification delivered on the ``user:notification`` stream."""
        async for notif in self.stream('user:notification', target_event_type='notification'):
            yield notif

    async def stream_mentions(self):
        """Yield only the notifications whose type is ``mention``."""
        async for notif in self.stream_notifications():
            if notif['type'] == 'mention':
                yield notif
+22,10 @@ class ReplyBot: async for notification in self.pleroma.stream_mentions(): await self.process_notification(notification) - async def process_notification(self, notification, retry_count=0): + async def process_notification(self, notification): acct = "@" + notification['account']['acct'] # get the account's @ post_id = notification['status']['id'] - - # catch HTTP 500 and backoff on requests - retry_count = retry_count + 1 - try: - context = await self.pleroma.status_context(post_id) - except pleroma.BadResponse as exc: - if retry_count < 3: - await anyio.sleep(2**retry_count) - await self.process_notification(notification, retry_count) - else: - # failed too many times in a row, logging - print(f"Received HTTP 500 {retry_count} times in a row, aborting reply attempt.") - return + context = await self.pleroma.status_context(post_id) # check if we've already been participating in this thread if self.check_thread_length(context): @@ -81,13 +69,12 @@ class ReplyBot: await self.pleroma.react(post_id, '✅') async def reply(self, notification): - toot = await utils.make_post(self.cfg) # generate a toot - toot = re.sub(r"@\S+\s", r"", toot) # remove any generated @'s + toot = utils.make_toot(self.cfg) # generate a toot await self.pleroma.reply(notification['status'], toot, cw=self.cfg['cw']) @staticmethod def extract_toot(toot): - text = utils.extract_post_content(toot) + text = utils.extract_toot(toot) text = re.sub(r"^@\S+\s", r"", text) # remove the initial mention text = text.lower() # treat text as lowercase for easier keyword matching (if this bot uses it) return text diff --git a/requirements/base.txt b/requirements/base.txt index fd9f720..290f67b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,4 @@ beautifulsoup4 ~= 4.9 -pleroma.py ~= 0.0.1 aiohttp ~= 3.0 json5 ~= 0.9.5 anyio ~= 3.0 diff --git a/utils.py b/utils.py index 0bdf8d2..8bb1daa 100644 --- a/utils.py +++ b/utils.py @@ -1,10 +1,9 @@ # SPDX-License-Identifier: AGPL-3.0-only 
async def sleep_until(dt):
    """Suspend the current task until the wall-clock instant ``dt``.

    Args:
        dt: a ``datetime``. A naive value is interpreted as UTC, because
            subtracting a naive from an aware datetime raises ``TypeError``.

    Returns immediately, without sleeping, when ``dt`` is not in the future.
    """
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    delay = (dt - datetime.now(timezone.utc)).total_seconds()
    # Never hand a negative duration to the sleep call.
    if delay > 0:
        await anyio.sleep(delay)


def _parse_rate_limit_reset(value):
    """Parse an ``X-RateLimit-Reset`` header value into an aware datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` because
    ``datetime.fromisoformat`` only accepts it from Python 3.11 on.
    A value without an offset is taken to be UTC.
    """
    value = value.strip()
    if value.endswith(('Z', 'z')):
        value = value[:-1] + '+00:00'
    reset_at = datetime.fromisoformat(value)
    if reset_at.tzinfo is None:
        reset_at = reset_at.replace(tzinfo=timezone.utc)
    return reset_at


class HandleRateLimits:
    """Wrap an HTTP client so that its requests wait out rate limits.

    ``http`` is any object whose ``request(*args, **kwargs)`` returns an
    async context manager yielding a response with a ``headers`` mapping.
    """

    def __init__(self, http):
        self.http = http

    def request(self, *args, **kwargs):
        """Return an async context manager performing the request with retries."""
        return _RateLimitContextManager(self.http, args, kwargs)


class _RateLimitContextManager(contextlib.AbstractAsyncContextManager):
    """Async context manager that re-issues a request once the limit resets."""

    def __init__(self, http, args, kwargs):
        self.http = http
        self.args = args
        self.kwargs = kwargs

    async def __aenter__(self):
        return await self._do_enter()

    async def _do_enter(self):
        """Issue the request, retrying while the rate-limit budget is exhausted.

        A response is returned as soon as ``X-RateLimit-Remaining`` is
        anything other than ``'0'`` or ``'1'``, or when the server gives no
        ``X-RateLimit-Reset`` time to wait for.
        """
        # A loop rather than recursion through __aenter__, so repeated
        # throttling does not grow the call stack.
        while True:
            self._request_cm = self.http.request(*self.args, **self.kwargs)
            resp = await self._request_cm.__aenter__()
            if resp.headers.get('X-RateLimit-Remaining') not in {'0', '1'}:
                return resp

            reset = resp.headers.get('X-RateLimit-Reset')
            if reset is None:
                # Nothing tells us how long to wait; hand back what we have.
                return resp

            # Release the throttled response before sleeping so it is not
            # held open, or leaked if the sleep is cancelled.
            await self._request_cm.__aexit__(None, None, None)
            await sleep_until(_parse_rate_limit_reset(reset))

    async def __aexit__(self, *excinfo):
        return await self._request_cm.__aexit__(*excinfo)