2021-06-16 03:29:53 +02:00
# SPDX-License-Identifier: EUPL-1.2
2018-10-09 03:11:51 +02:00
2021-06-16 03:59:57 +02:00
import re
import os
import html
import json
import shutil
import sqlite3
import argparse
2018-10-09 03:11:51 +02:00
import markovify
2021-06-18 12:38:53 +02:00
import json5 as json
2021-06-16 03:59:57 +02:00
import multiprocessing
2021-06-04 23:14:56 +02:00
from random import randint
2021-06-16 03:59:57 +02:00
from bs4 import BeautifulSoup
def arg_parser_factory(*, description):
    """Build the command-line parser used to locate the config file.

    Args:
        description: Text passed to ``argparse.ArgumentParser`` as the
            program description (shown in ``--help``).

    Returns:
        An ``argparse.ArgumentParser`` with a single ``-c``/``--cfg`` option.
        The parsed value is stored as ``cfg`` and defaults to
        ``config.json``.
    """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c', '--cfg', dest='cfg', default='config.json', nargs='?',
        # nargs='?' allows the flag to be given without a value. Without
        # ``const`` that case would store None, which cannot be opened as a
        # path later; fall back to the default file name instead.
        const='config.json',
        help='Specify a custom location for the config file.',
    )
    return parser
def parse_args(*, description):
    """Create the standard argument parser and parse the command line.

    Args:
        description: Program description forwarded to
            ``arg_parser_factory``.

    Returns:
        The namespace produced by the parser's ``parse_args()`` call.
    """
    parser = arg_parser_factory(description=description)
    namespace = parser.parse_args()
    return namespace
2018-10-09 03:11:51 +02:00
2021-06-16 03:59:57 +02:00
def load_config(cfg_path):
    """Load the configuration, layering the user's file over the defaults.

    ``config.defaults.json`` is read from the current working directory and
    its top-level keys are then overridden by those found in ``cfg_path``.

    Args:
        cfg_path: Path of the user's config file.

    Returns:
        The merged configuration dict.

    Exits the process with status 1, after printing an explanation to
    stderr, when ``site`` does not start with ``https://`` or ``http://`` or
    when ``access_token`` is missing.
    """
    # ``sys`` is needed for the error paths below but is not among the
    # imports visible at the top of the file, so bring it in locally.
    import sys

    with open('config.defaults.json', encoding='utf-8') as f:
        cfg = json.load(f)

    with open(cfg_path, encoding='utf-8') as f:
        cfg.update(json.load(f))

    if not cfg['site'].startswith(('https://', 'http://')):
        print("Site must begin with 'https://' or 'http://'. Value '{0}' is invalid - try 'https://{0}' instead.".format(cfg['site']), file=sys.stderr)
        sys.exit(1)

    if 'access_token' not in cfg:
        print('No authentication info', file=sys.stderr)
        print('Get a client id, client secret, and access token here: https://tinysubversions.com/notes/mastodon-bot/', file=sys.stderr)
        print('Then put `access_token` in your config file.', file=sys.stderr)
        sys.exit(1)

    return cfg
2021-06-04 23:38:36 +02:00
2019-08-07 05:46:57 +02:00
def _load_training_toots(cfg):
    """Return up to 10000 random ``(content,)`` rows from a copy of toots.db."""
    # create a copy of the database because reply.py will be using the main one
    shutil.copyfile("toots.db", "toots-copy.db")
    try:
        db = sqlite3.connect("toots-copy.db")
        try:
            db.text_factory = str
            c = db.cursor()
            if cfg['learn_from_cw']:
                # One "?" placeholder per ignored content warning; the values
                # themselves are bound by sqlite3, never interpolated.
                ignored_cws_query_params = "(" + ", ".join("?" * len(cfg["ignored_cws"])) + ")"
                return c.execute(
                    f"SELECT content FROM `toots` WHERE cw IS NULL OR CW NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000",
                    cfg["ignored_cws"],
                ).fetchall()
            return c.execute(
                "SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000"
            ).fetchall()
        finally:
            db.close()
    finally:
        # Always drop the temporary copy, including on the empty-database and
        # error paths, so stale copies do not accumulate.
        os.remove("toots-copy.db")


def make_sentence(output, cfg):
    """Generate one Markov-chain sentence from stored toots and send it.

    Args:
        output: Object with a ``send`` method (e.g. the sending end of a
            pipe). It receives exactly one value: the generated sentence, an
            explanatory message when the database holds no usable toots, or
            ``None`` when no sentence could be generated.
        cfg: Configuration mapping. Keys read here: ``learn_from_cw``,
            ``ignored_cws``, ``overlap_ratio_enabled``, ``overlap_ratio``,
            ``limit_length``, ``length_lower_limit``, ``length_upper_limit``
            and ``mention_handling``.
    """
    toots = _load_training_toots(cfg)

    if not toots:
        output.send("Database is empty! Try running main.py.")
        return

    class nlt_fixed(markovify.NewlineText):  # modified version of NewlineText that never rejects sentences
        def test_sentence_input(self, sentence):
            return True  # all sentences are valid <3

    nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed
    model = nlt("\n".join(toot[0] for toot in toots))

    max_words = None
    if cfg['limit_length']:
        max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])
    max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

    sentence = None
    for _ in range(10):
        sentence = model.make_short_sentence(
            max_chars=500,
            tries=10000,
            max_overlap_ratio=max_overlap_ratio,
            max_words=max_words,
        )
        if sentence is not None:
            break

    # Generation can fail on every attempt; only post-process a real string
    # so that the failure is reported as None instead of raising in re.sub.
    if sentence is not None:
        # optionally remove mentions
        if cfg['mention_handling'] == 1:
            # strip only a mention at the very start of the sentence
            sentence = re.sub(r"^\S*@\u200B\S*\s?", "", sentence)
        elif cfg['mention_handling'] == 0:
            # strip every mention
            sentence = re.sub(r"\S*@\u200B\S*\s?", "", sentence)

    output.send(sentence)
2019-08-07 05:46:57 +02:00
def make_toot(cfg):
    """Generate a toot in a worker process, giving up after five seconds.

    ``make_sentence`` runs in a separate process and reports its result
    through a one-way pipe. If the worker is still running after five
    seconds it is terminated.

    Args:
        cfg: Configuration mapping, passed through to ``make_sentence``.

    Returns:
        The text sent by the worker, or a fixed failure message when the
        worker timed out, sent ``None``, or exited without sending anything.
    """
    toot = None
    # Pipe(False) is one-way: ``pin`` can only receive, ``pout`` can only send.
    pin, pout = multiprocessing.Pipe(False)
    p = multiprocessing.Process(target=make_sentence, args=[pout, cfg])
    p.start()
    try:
        p.join(5)  # wait 5 seconds to get something
        if p.is_alive():  # if it's still trying to make a toot after 5 seconds
            p.terminate()
            p.join()
        elif pin.poll():
            # Only read when data is actually waiting: if the worker died
            # without sending, a bare recv() would block forever because
            # this process still holds the sending end open.
            toot = pin.recv()
    finally:
        pin.close()
        pout.close()

    if toot is None:
        toot = 'Toot generation failed! Contact io@csdisaster.club for assistance.'
    return toot
2019-01-11 13:55:31 +01:00
2021-06-04 23:38:36 +02:00
2019-01-11 13:55:31 +01:00
def extract_toot(toot):
    """Convert a toot's HTML content into plain text.

    Line breaks and paragraphs become newlines, hashtag links are reduced to
    their text, other links are replaced by their ``href``, and profile URLs
    are rewritten back into ``@user@instance`` mentions.

    Args:
        toot: The toot content as an HTML string.

    Returns:
        The plain-text form of the toot with trailing newlines removed.
    """
    toot = html.unescape(toot)  # convert HTML escape codes to text
    soup = BeautifulSoup(toot, "html.parser")

    # replace <br> with linebreak. Renaming the tag would not change what
    # get_text() returns, so swap the element for a real newline string.
    for lb in soup.select("br"):
        lb.replace_with("\n")

    # ditto for <p>: keep the paragraph's contents and end it with a newline
    for p in soup.select("p"):
        p.insert_after("\n")
        p.unwrap()

    for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
        ht.unwrap()

    # convert <a href='https://example.com'>example.com</a> to just https://example.com
    for link in soup.select("a"):
        # apparently not all a tags have a href, which is understandable if
        # you're doing normal web stuff, but on a social media platform??
        # NOTE: ``'href' in link`` would test the tag's children, not its
        # attributes, so the attribute check has to be explicit.
        if link.has_attr('href'):
            link.replace_with(link["href"])

    text = soup.get_text()
    text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text)  # put mastodon-style mentions back in
    text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text)  # put pleroma-style mentions back in
    text = text.rstrip("\n")  # remove trailing newline(s)
    return text