pleroma-ebooks/functions.py

#!/usr/bin/env python3
# SPDX-License-Identifier: EUPL-1.2

import argparse
import html
import json
import multiprocessing
import os
import re
import shutil
import sqlite3
import sys
from random import randint

import markovify
import pytomlpp as toml
from bs4 import BeautifulSoup

def arg_parser_factory(*, description):
	"""Create an ``ArgumentParser`` that already knows the ``-c``/``--cfg`` option.

	:param description: text passed to ``ArgumentParser`` as its description.
	:return: the parser, so callers can add further arguments before parsing.
	"""
	# The config path lands in ``args.cfg``; it defaults to 'config.toml' when
	# the flag is absent (nargs='?' also allows the flag without a value).
	cfg_option = dict(
		dest='cfg',
		default='config.toml',
		nargs='?',
		help='Specify a custom location for the config file.',
	)
	new_parser = argparse.ArgumentParser(description=description)
	new_parser.add_argument('-c', '--cfg', **cfg_option)
	return new_parser

def parse_args(*, description):
	"""Parse the command line using the parser from ``arg_parser_factory``.

	:param description: description text forwarded to the parser factory.
	:return: the ``argparse.Namespace`` produced by ``parse_args()``.
	"""
	cli_parser = arg_parser_factory(description=description)
	return cli_parser.parse_args()

def load_config(cfg_path):
	"""Load the bot configuration: JSON defaults overlaid with the user's TOML file.

	The defaults are read from ``config.defaults.json`` (relative to the current
	working directory) and then updated, key by key at the top level, with the
	contents of ``cfg_path``.

	:param cfg_path: path of the TOML config file to load.
	:return: the merged configuration dict.

	Prints an explanation to stderr and exits with status 1 when ``site`` does
	not start with an ``http://`` or ``https://`` scheme, or when no
	``access_token`` key is present.
	"""
	# TOML doesn't support null here so we have to use JSON 😒
	with open('config.defaults.json') as f:
		cfg = json.load(f)

	with open(cfg_path) as f:
		cfg.update(toml.load(f))  # shallow merge: user values replace defaults

	# str.startswith accepts a tuple, so one call covers both schemes
	if not cfg['site'].startswith(('https://', 'http://')):
		print("Site must begin with 'https://' or 'http://'. Value '{0}' is invalid - try 'https://{0}' instead.".format(cfg['site']), file=sys.stderr)
		sys.exit(1)

	if 'access_token' not in cfg:
		print('No authentication info', file=sys.stderr)
		print('Get a client id, client secret, and access token here: https://tinysubversions.com/notes/mastodon-bot/', file=sys.stderr)
		print('Then put `access_token` in your config file.', file=sys.stderr)
		sys.exit(1)

	return cfg

def make_sentence(output, cfg):
	"""Generate one Markov-chain sentence and send it through ``output``.

	Used as the target of a worker process (see ``make_toot``), so the result
	is delivered with ``output.send`` instead of being returned.

	:param output: object with a ``send`` method (the writable end of a pipe).
	:param cfg: configuration mapping; reads ``learn_from_cw``, ``ignored_cws``,
		``overlap_ratio_enabled``, ``overlap_ratio``, ``limit_length``,
		``length_lower_limit``, ``length_upper_limit`` and ``mention_handling``.

	Sends the generated sentence, a "Database is empty!" notice when there is
	nothing to learn from, or ``None`` when no sentence could be generated.
	"""
	class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
		def test_sentence_input(self, sentence):
			return True  # all sentences are valid <3

	shutil.copyfile("toots.db", "toots-copy.db")  # create a copy of the database because reply.py will be using the main one
	try:
		db = sqlite3.connect("toots-copy.db")
		try:
			db.text_factory = str
			c = db.cursor()
			if cfg['learn_from_cw']:
				# one "?" placeholder per ignored CW so the values stay parameterized
				ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"
				toots = c.execute(f"SELECT content FROM `toots` WHERE cw IS NULL OR CW NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000", cfg["ignored_cws"]).fetchall()
			else:
				toots = c.execute("SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000").fetchall()
		finally:
			db.close()
	finally:
		# always remove the temporary copy, including on the empty-database
		# path and when the query fails
		os.remove("toots-copy.db")

	if not toots:
		output.send("Database is empty! Try running main.py.")
		return

	nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed
	model = nlt("\n".join(toot[0] for toot in toots))

	max_words = None
	if cfg['limit_length']:
		max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])
	max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

	sentence = None
	for _ in range(10):  # give up after 10 rounds so the bot cannot get stuck
		sentence = model.make_short_sentence(
			max_chars=500,
			tries=10000,
			max_overlap_ratio=max_overlap_ratio,
			max_words=max_words
		)
		if sentence is not None:
			break

	# optionally remove mentions; skipped when generation failed, because
	# re.sub() raises TypeError on None and the worker would die without sending
	if sentence is not None:
		if cfg['mention_handling'] == 1:
			# only strip a mention at the very start of the sentence
			sentence = re.sub(r"^\S*@\u200B\S*\s?", "", sentence)
		elif cfg['mention_handling'] == 0:
			# strip every mention
			sentence = re.sub(r"\S*@\u200B\S*\s?", "", sentence)

	output.send(sentence)

def make_toot(cfg):
	"""Generate a toot in a worker process, giving up after five seconds.

	:param cfg: configuration mapping, passed through to ``make_sentence``.
	:return: the text sent back by the worker, or a fixed failure message when
		the worker timed out, exited without sending anything, or sent ``None``.
	"""
	toot = None
	pin, pout = multiprocessing.Pipe(False)  # one-way pipe: pin receives, pout sends
	p = multiprocessing.Process(target=make_sentence, args=[pout, cfg])
	try:
		p.start()
		p.join(5)  # wait 5 seconds to get something
		if p.is_alive():  # if it's still trying to make a toot after 5 seconds
			p.terminate()
			p.join()
		elif pin.poll():
			# Only read when data is actually waiting: this process still holds
			# the write end, so recv() would block forever if the worker had
			# exited (e.g. crashed) without sending anything.
			toot = pin.recv()
	finally:
		pin.close()
		pout.close()

	if toot is None:
		toot = 'Toot generation failed! Contact io@csdisaster.club for assistance.'
	return toot


def extract_toot(toot):
	"""Convert a post's HTML content into plain text.

	``<br>`` and ``<p>`` boundaries become newlines, hashtag links are reduced
	to their text, other links are replaced by their ``href`` URL, and profile
	URLs are rewritten as ``@user@instance`` mentions.

	:param toot: HTML content of the post.
	:return: plain text with trailing newlines removed.
	"""
	# NOTE(review): unescaping before parsing means escaped markup such as
	# "&lt;b&gt;" is parsed as a real tag -- confirm this is intended.
	toot = html.unescape(toot)  # convert HTML escape codes to text
	soup = BeautifulSoup(toot, "html.parser")
	for lb in soup.select("br"):  # replace <br> with linebreak
		# renaming the tag adds nothing to get_text(); swap in a real newline
		lb.replace_with("\n")

	for p in soup.select("p"):  # ditto for <p>
		# keep the paragraph's contents and terminate it with a newline
		p.append("\n")
		p.unwrap()

	for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
		ht.unwrap()

	for link in soup.select("a"):  # convert <a href='https://example.com>example.com</a> to just https://example.com
		# `'href' in link` would search the tag's children, not its attributes
		if link.has_attr("href"):
			# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??
			link.replace_with(link["href"])

	text = soup.get_text()
	text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text)  # put mastodon-style mentions back in
	text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text)  # put pleroma-style mentions back in
	text = text.rstrip("\n")  # remove trailing newline(s)
	return text
initial commit 2018-10-09 03:11:51 +02:00			`#!/usr/bin/env python3`
relicense 2021-06-16 03:29:53 +02:00			`# SPDX-License-Identifier: EUPL-1.2`
initial commit 2018-10-09 03:11:51 +02:00
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`import re`
			`import os`
			`import html`
			`import json`
			`import shutil`
			`import sqlite3`
			`import argparse`
initial commit 2018-10-09 03:11:51 +02:00			`import markovify`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`import multiprocessing`
			`import pytomlpp as toml`
Expose overlap ratio and length limit to config 2021-06-04 23:14:56 +02:00			`from random import randint`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`from bs4 import BeautifulSoup`

			`def arg_parser_factory(*, description):`
			`parser = argparse.ArgumentParser(description=description)`
			`parser.add_argument(`
			`'-c', '--cfg', dest='cfg', default='config.toml', nargs='?',`
			`help='Specify a custom location for the config file.'`
			`)`
			`return parser`

			`def parse_args(*, description):`
			`return arg_parser_factory(description=description).parse_args()`
initial commit 2018-10-09 03:11:51 +02:00
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`def load_config(cfg_path):`
			`# TOML doesn't support null here so we have to use JSON 😒`
			`with open('config.defaults.json') as f:`
			`cfg = json.load(f)`

			`with open(cfg_path) as f:`
			`cfg.update(toml.load(f))`

			`if not cfg['site'].startswith('https://') and not cfg['site'].startswith('http://'):`
			`print("Site must begin with 'https://' or 'http://'. Value '{0}' is invalid - try 'https://{0}' instead.".format(cfg['site']), file=sys.stderr)`
			`sys.exit(1)`

			`if 'access_token' not in cfg:`
			`print('No authentication info', file=sys.stderr)`
			`print('Get a client id, client secret, and access token here: https://tinysubversions.com/notes/mastodon-bot/', file=sys.stderr)`
			print('Then put `access_token` in your config file.', file=sys.stderr)
			`sys.exit(1)`

			`return cfg`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00
allow users to specify custom config.json location 2019-08-07 05:46:57 +02:00			`def make_sentence(output, cfg):`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences`
initial commit 2018-10-09 03:11:51 +02:00			`def test_sentence_input(self, sentence):`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`return True # all sentences are valid <3`
initial commit 2018-10-09 03:11:51 +02:00
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`shutil.copyfile("toots.db", "toots-copy.db") # create a copy of the database because reply.py will be using the main one`
initial commit 2018-10-09 03:11:51 +02:00			`db = sqlite3.connect("toots-copy.db")`
add config option to filter for language 2019-02-25 19:30:40 +01:00			`db.text_factory = str`
initial commit 2018-10-09 03:11:51 +02:00			`c = db.cursor()`
added support for learning from CW'd posts 2019-02-25 02:17:06 +01:00			`if cfg['learn_from_cw']:`
add ability to ignore CWs 2021-06-11 23:29:51 +02:00			`ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"`
SQL NULL a fuck 2021-06-11 23:37:09 +02:00			toots = c.execute(f"SELECT content FROM `toots` WHERE cw IS NULL OR CW NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000", cfg["ignored_cws"]).fetchall()
add config option to filter for language 2019-02-25 19:30:40 +01:00			`else:`
add ability to ignore CWs 2021-06-11 23:29:51 +02:00			toots = c.execute("SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000").fetchall()
initial commit 2018-10-09 03:11:51 +02:00
minor code cleanup 2020-03-08 10:46:07 +01:00			`if len(toots) == 0:`
handle empty database 2019-07-10 13:25:07 +02:00			`output.send("Database is empty! Try running main.py.")`
			`return`
update extract code to match fedibooks 2020-05-27 14:31:16 +02:00
Expose overlap ratio and length limit to config 2021-06-04 23:14:56 +02:00			`nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed`

			`model = nlt(`
minor code cleanup 2020-03-08 10:46:07 +01:00			`"\n".join([toot[0] for toot in toots])`
			`)`
update extract code to match fedibooks 2020-05-27 14:31:16 +02:00
minor code cleanup 2020-03-08 10:46:07 +01:00			`db.close()`
			`os.remove("toots-copy.db")`
handle empty database 2019-07-10 13:25:07 +02:00
Expose overlap ratio and length limit to config 2021-06-04 23:14:56 +02:00			`if cfg['limit_length']:`
			`sentence_len = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])`

initial commit 2018-10-09 03:11:51 +02:00			`sentence = None`
fixed another possibility for the bots to get stuck 2018-10-29 02:23:01 +01:00			`tries = 0`
			`while sentence is None and tries < 10:`
Expose overlap ratio and length limit to config 2021-06-04 23:14:56 +02:00			`sentence = model.make_short_sentence(`
			`max_chars=500,`
			`tries=10000,`
			`max_overlap_ratio=cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7,`
			`max_words=sentence_len if cfg['limit_length'] else None`
			`)`
fixed another possibility for the bots to get stuck 2018-10-29 02:23:01 +01:00			`tries = tries + 1`
renamed create.py to functions.py 2019-01-11 13:47:42 +01:00
add an option to avoid creating fake mentions 2019-04-29 06:21:46 +02:00			`# optionally remove mentions`
			`if cfg['mention_handling'] == 1:`
fixed a very silly mistake 2019-04-29 06:38:44 +02:00			`sentence = re.sub(r"^\S@\u200B\S\s?", "", sentence)`
add an option to avoid creating fake mentions 2019-04-29 06:21:46 +02:00			`elif cfg['mention_handling'] == 0:`
fixed a very silly mistake 2019-04-29 06:38:44 +02:00			`sentence = re.sub(r"\S@\u200B\S\s?", "", sentence)`
renamed create.py to functions.py 2019-01-11 13:47:42 +01:00
initial commit 2018-10-09 03:11:51 +02:00			`output.send(sentence)`

allow users to specify custom config.json location 2019-08-07 05:46:57 +02:00			`def make_toot(cfg):`
initial commit 2018-10-09 03:11:51 +02:00			`toot = None`
code cleanup, fixes #23 2019-05-19 15:06:31 +02:00			`pin, pout = multiprocessing.Pipe(False)`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`p = multiprocessing.Process(target=make_sentence, args=[pout, cfg])`
code cleanup, fixes #23 2019-05-19 15:06:31 +02:00			`p.start()`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`p.join(5) # wait 5 seconds to get something`
			`if p.is_alive(): # if it's still trying to make a toot after 5 seconds`
code cleanup, fixes #23 2019-05-19 15:06:31 +02:00			`p.terminate()`
			`p.join()`
			`else:`
			`toot = pin.recv()`

Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`if toot is None:`
rewrite for anyio+aiohttp 2021-06-16 03:59:57 +02:00			`toot = 'Toot generation failed! Contact io@csdisaster.club for assistance.'`
removed unused media code 2019-07-02 12:43:34 +02:00			`return toot`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00			`def extract_toot(toot):`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`toot = html.unescape(toot) # convert HTML escape codes to text`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00			`soup = BeautifulSoup(toot, "html.parser")`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`for lb in soup.select("br"): # replace <br> with linebreak`
Make bs4 only replace the tag name instead of name and contents 2021-02-18 17:01:43 +01:00			`lb.name = "\n"`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`for p in soup.select("p"): # ditto for <p>`
Make bs4 only replace the tag name instead of name and contents 2021-02-18 17:01:43 +01:00			`p.name = "\n"`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`for ht in soup.select("a.hashtag"): # convert hashtags from links to text`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00			`ht.unwrap()`

Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`for link in soup.select("a"): # convert <a href='https://example.com>example.com</a> to just https://example.com`
update extract code to match fedibooks 2020-05-27 14:31:16 +02:00			`if 'href' in link:`
			`# apparently not all a tags have a href, which is understandable if you're doing normal web stuff, but on a social media platform??`
			`link.replace_with(link["href"])`
added extract_toot function to functions.py 2019-01-11 13:55:31 +01:00
fixed a silly mistake, fixed " and ' stuff 2019-01-11 13:56:35 +01:00			`text = soup.get_text()`
Clean up formatting and help linter calm down 2021-06-04 23:38:36 +02:00			`text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text) # put mastodon-style mentions back in`
			`text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text) # put pleroma-style mentions back in`
			`text = text.rstrip("\n") # remove trailing newline(s)`
add config option to filter for language 2019-02-25 19:30:40 +01:00			`return text`