pleroma-ebooks/generators/markov.py

# SPDX-License-Identifier: MPL-2.0

import sqlite3
import markovify
import regex

def make_sentence(cfg):
	"""Generate one post from a Markov model trained on stored posts.

	Up to 10000 randomly chosen rows are read from the ``posts`` table of the
	SQLite database at ``cfg["db_path"]``, a markovify model is built from
	their ``content``, and a single sentence of at most 500 characters is
	generated.

	Config keys read from ``cfg``:
		db_path: path of the SQLite database.
		ignored_cws: words matched (case-insensitively) against a post's
			``summary``; matching posts are excluded when ``learn_from_cw``
			is enabled.
		learn_from_cw: if true, posts with a non-NULL ``summary`` are also
			used, unless the summary matches ``ignored_cws``. If false, only
			posts whose ``summary`` is NULL are used.
		overlap_ratio_enabled / overlap_ratio: when enabled, the stock
			``markovify.NewlineText`` is used and ``overlap_ratio`` is passed
			as ``max_overlap_ratio``; otherwise 0.7 is passed and a subclass
			that accepts every input sentence is used.
		limit_length / length_lower_limit / length_upper_limit: when
			``limit_length`` is true, ``max_words`` is a random integer
			drawn from the inclusive range given by the two limits.

	Returns:
		The sentence returned by ``make_short_sentence``.

	Raises:
		ValueError: if the query returns no posts, or if 10 consecutive
			generation attempts all return ``None``.
	"""
	# Imported locally: the original code called randint() without ever
	# importing it, which raised NameError whenever limit_length was enabled.
	from random import randint

	class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
		def test_sentence_input(self, sentence):
			return True  # all sentences are valid <3

	ignored_cw_pattern = regex.compile(r"\L<words>", words=cfg["ignored_cws"], flags=regex.IGNORECASE)

	def cw_regexp(x):
		# SQL helper: 1 when the summary contains an ignored CW word, else 0.
		# The None guard keeps p.search() from raising TypeError should a
		# NULL summary ever be handed to the function.
		return 1 if x is not None and ignored_cw_pattern.search(x) else 0

	if cfg['learn_from_cw']:
		query = """
			SELECT content
			FROM posts
			WHERE
				summary IS NULL
				OR NOT CWREGEXP(summary)
			ORDER BY RANDOM() LIMIT 10000
			"""
	else:
		query = """
			SELECT content
			FROM posts
			WHERE summary IS NULL
			ORDER BY RANDOM()
			LIMIT 10000
			"""

	db = sqlite3.connect(cfg["db_path"])
	try:
		db.text_factory = str
		db.create_function('cwregexp', 1, cw_regexp)
		toots = db.cursor().execute(query).fetchall()
	finally:
		# Always release the connection, including when the query fails.
		db.close()

	if not toots:
		raise ValueError("Database is empty! Try running main.py.")

	nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed

	# TODO support replicating \n in output posts instead of squashing them together
	# Each post becomes exactly one line of the newline-delimited corpus.
	model = nlt("\n".join(toot[0].replace('\n', ' ') for toot in toots))

	# The word limit is drawn once and reused for every attempt below.
	max_words = None
	if cfg['limit_length']:
		max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])

	max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

	for _ in range(10):
		sentence = model.make_short_sentence(
			max_chars=500,
			tries=10000,
			max_overlap_ratio=max_overlap_ratio,
			max_words=max_words,
		)
		if sentence is not None:
			return sentence

	raise ValueError("Failed 10 times to produce a sentence!")
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`# SPDX-License-Identifier: MPL-2.0`

			`import sqlite3`
			`import markovify`
Support for CW partial regexp match by word. Update documentation with 'How to Use' 2023-07-20 06:10:54 +02:00			`import regex`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00
			`def make_sentence(cfg):`
			`class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences`
			`def test_sentence_input(self, sentence):`
			`return True # all sentences are valid <3`

move db_path default to load_config 2021-07-26 08:47:43 +02:00			`db = sqlite3.connect(cfg["db_path"])`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`db.text_factory = str`
Update docs 2023-07-20 06:24:18 +02:00			`p = regex.compile(r"\L<words>", words=cfg["ignored_cws"],flags=regex.IGNORECASE)`
Support for CW partial regexp match by word. Update documentation with 'How to Use' 2023-07-20 06:10:54 +02:00			`def cw_regexp(x):`
			`return 1 if p.search(x) else 0`
			`db.create_function('cwregexp', 1, cw_regexp)`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`c = db.cursor()`
			`if cfg['learn_from_cw']:`
markov.py: fix queries to reflect schema changes 2021-07-26 08:24:26 +02:00			`toots = c.execute(`
			`f"""`
			`SELECT content`
			`FROM posts`
			`WHERE`
			`summary IS NULL`
Support for CW partial regexp match by word. Update documentation with 'How to Use' 2023-07-20 06:10:54 +02:00			`OR NOT CWREGEXP(summary)`
markov.py: fix queries to reflect schema changes 2021-07-26 08:24:26 +02:00			`ORDER BY RANDOM() LIMIT 10000`
			`""",`
			`).fetchall()`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`else:`
markov.py: fix queries to reflect schema changes 2021-07-26 08:24:26 +02:00			`toots = c.execute(`
			`"""`
			`SELECT content`
			`FROM posts`
			`WHERE summary IS NULL`
			`ORDER BY RANDOM()`
			`LIMIT 10000`
			`""",`
			`).fetchall()`

			`if not toots:`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`raise ValueError("Database is empty! Try running main.py.")`

			`nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed`

add TODOs 2021-08-17 07:55:47 +02:00			`# TODO support replicating \n in output posts instead of squashing them together`
license compliance stuff and other changes relicense to AGPLv3 comply with the MPL's file copyleft some other changes in the direction of supporting GPT-2 which were hard to untangle from the rest of the changes (sorry) 2021-07-26 06:52:13 +02:00			`model = nlt("\n".join(toot[0].replace('\n', ' ') for toot in toots))`

			`db.close()`

			`if cfg['limit_length']:`
			`sentence_len = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])`

			`sentence = None`
			`tries = 0`
			`for tries in range(10):`
			`if (sentence := model.make_short_sentence(`
			`max_chars=500,`
			`tries=10000,`
			`max_overlap_ratio=cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7,`
			`max_words=sentence_len if cfg['limit_length'] else None`
			`)) is not None:`
			`break`
			`else:`
			`raise ValueError("Failed 10 times to produce a sentence!")`

			`return sentence`