# SPDX-License-Identifier: MPL-2.0

import sqlite3
from contextlib import closing
from random import randint

import markovify
import regex


def make_sentence(cfg):
    """Generate one Markov-chain sentence from posts stored in the database.

    Up to 10000 randomly ordered posts are read from the ``posts`` table of
    the SQLite database at ``cfg["db_path"]`` and fed, one post per line,
    into a markovify model.

    Args:
        cfg: Mapping with the configuration. Keys read here:
            ``db_path``, ``ignored_cws``, ``learn_from_cw``,
            ``overlap_ratio_enabled``, ``overlap_ratio`` (only when
            ``overlap_ratio_enabled`` is truthy), ``limit_length``, and
            ``length_lower_limit`` / ``length_upper_limit`` (only when
            ``limit_length`` is truthy).

    Returns:
        The generated sentence (at most 500 characters).

    Raises:
        ValueError: If the query returns no posts, or if no sentence could
            be produced after 10 attempts.
    """

    class nlt_fixed(markovify.NewlineText):
        # Modified version of NewlineText that never rejects sentences.
        def test_sentence_input(self, sentence):
            return True  # all sentences are valid <3

    # ``\L<words>`` is the regex module's "named list" syntax: it matches any
    # of the strings passed via the ``words`` keyword argument.
    ignored_cw_pattern = regex.compile(
        r"\L<words>", words=cfg["ignored_cws"], flags=regex.IGNORECASE
    )

    def cw_regexp(x):
        return 1 if ignored_cw_pattern.search(x) else 0

    if cfg['learn_from_cw']:
        # Also learn from posts with a content warning, unless the warning
        # matches one of the ignored words.
        query = """
            SELECT content
            FROM posts
            WHERE
                summary IS NULL
                OR NOT CWREGEXP(summary)
            ORDER BY RANDOM() LIMIT 10000
            """
    else:
        query = """
            SELECT content
            FROM posts
            WHERE summary IS NULL
            ORDER BY RANDOM() LIMIT 10000
            """

    # closing() guarantees the connection is released even when the query
    # fails; sqlite3's own context manager only handles transactions.
    with closing(sqlite3.connect(cfg["db_path"])) as db:
        db.text_factory = str
        db.create_function('cwregexp', 1, cw_regexp)
        toots = db.cursor().execute(query).fetchall()

    if not toots:
        raise ValueError("Database is empty! Try running main.py.")

    nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed

    # TODO support replicating \n in output posts instead of squashing them together
    model = nlt("\n".join(toot[0].replace('\n', ' ') for toot in toots))

    # Pick the word limit once so every attempt below uses the same target.
    max_words = None
    if cfg['limit_length']:
        max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])

    max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

    for _ in range(10):
        sentence = model.make_short_sentence(
            max_chars=500,
            tries=10000,
            max_overlap_ratio=max_overlap_ratio,
            max_words=max_words,
        )
        if sentence is not None:
            return sentence

    raise ValueError("Failed 10 times to produce a sentence!")