add ability to ignore CWs

This commit is contained in:
io 2021-06-11 21:29:51 +00:00
parent a904587b32
commit 71dbf59796
3 changed files with 9 additions and 41 deletions

View file

@ -6,6 +6,7 @@ This version makes quite a few changes from [the original](https://github.com/Je
- Non-Markov stuff - Non-Markov stuff
- Stores toots in a sqlite database rather than a text file - Stores toots in a sqlite database rather than a text file
- Doesn't unnecessarily redownload all toots every time - Doesn't unnecessarily redownload all toots every time
- Ability to ignore specific CWs
## FediBooks ## FediBooks
Before you use mstdn-ebooks to create your own ebooks bot, I recommend checking out [FediBooks](https://fedibooks.com). Compared to mstdn-ebooks, FediBooks offers a few advantages: Before you use mstdn-ebooks to create your own ebooks bot, I recommend checking out [FediBooks](https://fedibooks.com). Compared to mstdn-ebooks, FediBooks offers a few advantages:
@ -54,6 +55,7 @@ Configuring mstdn-ebooks is accomplished by editing `config.json`. If you want t
| cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. | | cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. |
| instance_blacklist | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. | | instance_blacklist | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. |
| learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. | | learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. |
| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CWs.
| mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). | | mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). |
| max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. | | max_thread_length | 15 | The maximum number of bot posts in a thread before it stops replying. A thread can be 10 or 10000 posts long, but the bot will stop after it has posted `max_thread_length` times. |
| strip_paired_punctuation | false | If true, mstdn-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. | | strip_paired_punctuation | false | If true, mstdn-ebooks will remove punctuation that commonly appears in pairs, like " and (). This avoids the issue of posts that open a bracket (or quote) without closing it. |

View file

@ -19,9 +19,10 @@ def make_sentence(output, cfg):
db.text_factory = str db.text_factory = str
c = db.cursor() c = db.cursor()
if cfg['learn_from_cw']: if cfg['learn_from_cw']:
toots = c.execute("SELECT content FROM `toots` ORDER BY RANDOM() LIMIT 10000").fetchall() ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"
toots = c.execute(f"SELECT content FROM `toots` WHERE cw NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000", cfg["ignored_cws"]).fetchall()
else: else:
toots = c.execute("SELECT content FROM `toots` WHERE cw = 0 ORDER BY RANDOM() LIMIT 10000").fetchall() toots = c.execute("SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000").fetchall()
if len(toots) == 0: if len(toots) == 0:
output.send("Database is empty! Try running main.py.") output.send("Database is empty! Try running main.py.")

43
main.py
View file

@ -31,7 +31,8 @@ cfg = {
"length_lower_limit": 5, "length_lower_limit": 5,
"length_upper_limit": 50, "length_upper_limit": 50,
"overlap_ratio_enabled": False, "overlap_ratio_enabled": False,
"overlap_ratio": 0.7 "overlap_ratio": 0.7,
"ignored_cws": [],
} }
try: try:
@ -94,46 +95,10 @@ following = client.account_following(me.id)
db = sqlite3.connect("toots.db") db = sqlite3.connect("toots.db")
db.text_factory = str db.text_factory = str
c = db.cursor() c = db.cursor()
c.execute("CREATE TABLE IF NOT EXISTS `toots` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw INT NOT NULL DEFAULT 0, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)") c.execute("CREATE TABLE IF NOT EXISTS `toots` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw VARCHAR, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
c.execute("CREATE TRIGGER IF NOT EXISTS `dedup` AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ") c.execute("CREATE TRIGGER IF NOT EXISTS `dedup` AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ")
db.commit() db.commit()
tableinfo = c.execute("PRAGMA table_info(`toots`)").fetchall()
found = False
columns = []
for entry in tableinfo:
if entry[1] == "sortid":
found = True
break
columns.append(entry[1])
if not found:
print("Migrating to new database format. Please wait...")
print("WARNING: If any of the accounts your bot is following are Pleroma users, please delete toots.db and run main.py again to create it anew.")
try:
c.execute("DROP TABLE `toots_temp`")
except:
pass
c.execute("CREATE TABLE `toots_temp` (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw INT NOT NULL DEFAULT 0, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
for f in following:
user_toots = c.execute("SELECT * FROM `toots` WHERE userid LIKE ? ORDER BY id", (f.id,)).fetchall()
if user_toots is None:
continue
if columns[-1] == "cw":
for toot in user_toots:
c.execute("INSERT INTO `toots_temp` (id, userid, uri, content, cw) VALUES (?, ?, ?, ?, ?)", toot)
else:
for toot in user_toots:
c.execute("INSERT INTO `toots_temp` (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", toot)
c.execute("DROP TABLE `toots`")
c.execute("ALTER TABLE `toots_temp` RENAME TO `toots`")
c.execute("CREATE TRIGGER IF NOT EXISTS `dedup` AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ")
db.commit()
def handleCtrlC(signal, frame): def handleCtrlC(signal, frame):
print("\nPREMATURE EVACUATION - Saving chunks") print("\nPREMATURE EVACUATION - Saving chunks")
@ -155,7 +120,7 @@ def insert_toot(oii, acc, post, cursor): # extracted to prevent duplication
pid = patterns["pid"].search(oii['object']['id']).group(0) pid = patterns["pid"].search(oii['object']['id']).group(0)
cursor.execute("REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", ( cursor.execute("REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", (
pid, pid,
1 if (oii['object']['summary'] is not None and oii['object']['summary'] != "") else 0, oii['object']['summary'] or None,
acc.id, acc.id,
oii['object']['id'], oii['object']['id'],
post post