license compliance stuff and other changes

relicense to AGPLv3; comply with the MPL's file copyleft; some other changes in the direction of supporting GPT-2, which were hard to untangle from the rest of the changes (sorry)
2021-07-26 04:52:13 +00:00 · 2021-07-26 04:52:13 +00:00 · c22a493dff
commit c22a493dff
parent 944e2fc3a5
12 changed files with 859 additions and 436 deletions
--- a/third_party/utils.py
+++ b/third_party/utils.py
@ -0,0 +1,113 @@
+# SPDX-License-Identifier: EUPL-1.2
+
+import re
+import os
+import sys
+import html
+import enum
+import json
+import shutil
+import sqlite3
+import argparse
+import itertools
+import json5 as json
+import multiprocessing
+from random import randint
+from bs4 import BeautifulSoup
+
+# Available text generation backends. load_config() looks a mode up by member
+# name, so the names below are the accepted `generation_mode` config strings.
+class TextGenerationMode(enum.Enum):
+	markov = 1
+	gpt_2 = 2
+
+def arg_parser_factory(*, description):
+	"""Create an argument parser that understands the `-c`/`--cfg` option.
+
+	description: text passed to argparse as the program description.
+
+	Returns the parser (not the parsed arguments) so callers can add their
+	own options before parsing.
+	"""
+	parser = argparse.ArgumentParser(description=description)
+	parser.add_argument(
+		'-c', '--cfg', dest='cfg', default='config.json', nargs='?',
+		# with nargs='?', a bare `-c` produces `const`; without this it would be
+		# None and the later open() of the config path would fail
+		const='config.json',
+		help='Specify a custom location for the config file.'
+	)
+	return parser
+
+def parse_args(*, description):
+	"""Parse the process's command line with the parser from arg_parser_factory()."""
+	parser = arg_parser_factory(description=description)
+	namespace = parser.parse_args()
+	return namespace
+
+def load_config(cfg_path):
+	"""Load `config.defaults.json`, overlay the file at `cfg_path`, and validate.
+
+	Keys from the user's file replace same-named keys from the defaults (a
+	shallow dict update). On success `cfg['generation_mode']` is replaced by the
+	matching TextGenerationMode member and the dict is returned.
+
+	Any validation failure prints an explanation to stderr and exits with
+	status 1.
+	"""
+	# NOTE: `json` is rebound to json5 by this file's imports.
+	# JSON text is UTF-8; don't depend on the platform's locale encoding.
+	with open('config.defaults.json', encoding='utf-8') as f:
+		cfg = json.load(f)
+
+	with open(cfg_path, encoding='utf-8') as f:
+		cfg.update(json.load(f))
+
+	if not cfg['site'].startswith(('https://', 'http://')):
+		print("Site must begin with 'https://' or 'http://'. Value '{0}' is invalid - try 'https://{0}' instead.".format(cfg['site']), file=sys.stderr)
+		sys.exit(1)
+
+	if not cfg.get('access_token'):
+		print('No authentication info', file=sys.stderr)
+		print('Get a client id, client secret, and access token here: https://tools.splat.soy/pleroma-access-token/', file=sys.stderr)
+		print('Then put `access_token` in your config file.', file=sys.stderr)
+		sys.exit(1)
+
+	# report an unknown or missing mode the same way as the other config errors
+	# instead of letting a bare KeyError escape
+	mode_name = cfg.get('generation_mode')
+	mode = TextGenerationMode.__members__.get(mode_name) if isinstance(mode_name, str) else None
+	if mode is None:
+		print(
+			"Invalid generation_mode {!r}. Valid values are: {}.".format(
+				mode_name, ', '.join(TextGenerationMode.__members__)
+			),
+			file=sys.stderr,
+		)
+		sys.exit(1)
+	cfg['generation_mode'] = mode
+
+	return cfg
+
+def remove_mention(cfg, sentence):
+	"""Strip mentions from `sentence` according to cfg['mention_handling'].
+
+	0 removes every mention, 1 removes only a mention at the very start, and
+	any other value returns the sentence untouched. A mention is recognised by
+	an `@` immediately followed by a zero-width space (U+200B).
+	"""
+	handling = cfg['mention_handling']
+	if handling == 1:
+		# anchored: only a leading mention is dropped
+		pattern = r"^\S*@\u200B\S*\s?"
+	elif handling == 0:
+		# unanchored: every mention is dropped
+		pattern = r"\S*@\u200B\S*\s?"
+	else:
+		return sentence
+
+	return re.sub(pattern, "", sentence)
+
+def _send_result(f, pout, *args, **kwargs):
+	# Call f(*args, **kwargs) and send its return value through pout; if f raises
+	# ValueError, the exception's first argument is sent instead.
+	try:
+		result = f(*args, **kwargs)
+	except ValueError as exc:
+		# an argument-less ValueError has no message to forward; send None
+		# rather than dying with IndexError and sending nothing at all
+		result = exc.args[0] if exc.args else None
+	pout.send(result)
+
+def _wrap_pipe(f):
+	"""Adapt `f` so that its result is delivered over a pipe.
+
+	The returned callable takes the sending end of a pipe as its first
+	argument, followed by the arguments for `f`. It sends `f`'s return value,
+	or the first argument of a ValueError raised by `f`, and returns None.
+
+	A functools.partial over a module-level function is returned instead of a
+	nested closure: closures cannot be pickled, so they cannot be used as a
+	multiprocessing.Process target under the "spawn" start method.
+	"""
+	import functools
+	return functools.partial(_send_result, f)
+
+def make_toot(cfg, *, mode=TextGenerationMode.markov, timeout=5):
+	"""Generate a toot in a child process, giving up after `timeout` seconds.
+
+	cfg: configuration dict, passed through to the generator's make_sentence().
+	mode: TextGenerationMode member selecting which generator module to use.
+	timeout: seconds to wait for the child to produce something (default 5).
+
+	Returns whatever the child sent back, or a fixed failure message when
+	nothing (or None) was received in time.
+
+	Raises ValueError if `mode` is not a known TextGenerationMode member.
+	"""
+	# pick the generator before allocating the pipe so an invalid mode leaks nothing
+	if mode is TextGenerationMode.markov:
+		from generators.markov import make_sentence
+	elif mode is TextGenerationMode.gpt_2:
+		from generators.gpt_2 import make_sentence
+	else:
+		raise ValueError('Invalid text generation mode')
+
+	toot = None
+	pin, pout = multiprocessing.Pipe(False)
+	p = multiprocessing.Process(target=_wrap_pipe(make_sentence), args=[pout, cfg])
+	p.start()
+	# drop the parent's copy of the write end: if the child dies without sending
+	# anything, the read end then sees EOF instead of blocking forever
+	pout.close()
+
+	try:
+		# wait on the pipe, not on the process: join-then-recv hangs in recv() when
+		# the child exited without sending, and can stall when the child blocks in
+		# send() until the parent starts reading
+		if pin.poll(timeout):
+			toot = pin.recv()
+	except EOFError:
+		# the child exited without sending a result
+		toot = None
+	finally:
+		if toot is None and p.is_alive():
+			# still trying to make a toot after the timeout
+			p.terminate()
+		p.join()
+		pin.close()
+
+	if toot is None:
+		toot = 'Toot generation failed! Contact io@csdisaster.club for assistance.'
+	return toot
+
+def extract_post_content(text):
+	"""Convert a post's HTML into plain text.
+
+	<br> becomes a newline, hashtag and mention links are reduced to their
+	text, other links are replaced by their href, and each <p> is replaced by
+	its text surrounded by newlines. The result is stripped of leading and
+	trailing whitespace.
+	"""
+	soup = BeautifulSoup(text, "html.parser")
+	for el in soup.select('br'):  # replace <br> with linebreak
+		el.replace_with('\n')
+
+	for ht in soup.select("a.hashtag, a.mention"):  # convert hashtags and mentions from links to text
+		ht.unwrap()
+
+	for link in soup.select("a"):  # convert <a href='https://example.com'>example.com</a> to just https://example.com
+		# `'href' in link` would test the tag's *children*, not its attributes,
+		# so it was always False and links were never replaced; has_attr() checks
+		# the attribute itself
+		if link.has_attr('href'):
+			# apparently not all a tags have a href,
+			# which is understandable if you're doing normal web stuff, but on a social media platform??
+			link.replace_with(link["href"])
+
+	for el in soup.select('p'):
+		el.replace_with('\n' + el.get_text() + '\n')
+
+	return soup.get_text().strip()