From 861eb3d808add4f9343462ee9c2556f1790e7527 Mon Sep 17 00:00:00 2001 From: Logan Cusano Date: Wed, 22 Dec 2021 03:06:21 -0500 Subject: [PATCH 1/5] Updated to not save development files --- modules/WillieTimer/.gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/WillieTimer/.gitignore b/modules/WillieTimer/.gitignore index 80c7d3c..4a8f97d 100644 --- a/modules/WillieTimer/.gitignore +++ b/modules/WillieTimer/.gitignore @@ -1,2 +1,3 @@ lyrics.py -phraseGenerator.py \ No newline at end of file +phraseGenerator.py +/lyrics.txt From e3bcc124e1c05994367609d8f6fc5787fa691441 Mon Sep 17 00:00:00 2001 From: Logan Cusano Date: Wed, 22 Dec 2021 16:50:09 -0500 Subject: [PATCH 2/5] Update gitignore --- .gitignore | 3 --- modules/WillieTimer/.gitignore | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 0893afc..4471aed 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,4 @@ config.ini *.7z *.bat -*.hdf5 -*.json -/modules/*.txt /DSDPlus/ diff --git a/modules/WillieTimer/.gitignore b/modules/WillieTimer/.gitignore index 4a8f97d..700d7b9 100644 --- a/modules/WillieTimer/.gitignore +++ b/modules/WillieTimer/.gitignore @@ -1,3 +1,3 @@ -lyrics.py -phraseGenerator.py /lyrics.txt +*.hdf5 +*.json From 24cfac7c33983e9a4776882c457096dbfa66f699 Mon Sep 17 00:00:00 2001 From: Logan Cusano Date: Wed, 22 Dec 2021 16:50:35 -0500 Subject: [PATCH 3/5] Init ML scripts --- modules/WillieTimer/lyrics.py | 50 ++++++++++ modules/WillieTimer/phraseGenerator.py | 121 +++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 modules/WillieTimer/lyrics.py create mode 100644 modules/WillieTimer/phraseGenerator.py diff --git a/modules/WillieTimer/lyrics.py b/modules/WillieTimer/lyrics.py new file mode 100644 index 0000000..2f988eb --- /dev/null +++ b/modules/WillieTimer/lyrics.py @@ -0,0 +1,50 @@ +from lyricsgenius import Genius +import json +import re +import os + + +def get_songs(artists=["Notorious B.I.G", "outkast", "nwa"]): + GENIUS_TOKEN = "gMnJyj87FvjyP2W093rQ_mjo5ZwwLw1u2r0AmcVqYcJ8kkjjW6ZbObeGnS726SrH" + session = Genius(GENIUS_TOKEN, retries=2, timeout=20, sleep_time=0.3) + + lyrics = [] + + # get songs + for artist in artists: + songlist = session.search_artist(artist, max_songs=75, sort='title') + songlist.save_lyrics() + +def sanitize_lyrics(input): + notes_re = re.compile('((?:\[[0-9a-zA-Z :()&+-.]+\])(?: \+ \([a-zA-Z -.]+)?(?:\\n)?)') + footer_re = re.compile('((?:EmbedShare)[ ]*(?:URLCopyEmbedCopy))') + multiline_re = re.compile(('(\\n){3,}')) + sanitized_input = notes_re.sub('', input) + sanitized_input = footer_re.sub('', sanitized_input) + sanitized_input = multiline_re.sub('\n\n', sanitized_input) + return sanitized_input + +def get_lyrics_from_json(json_file): + artist_dict = json.load(json_file) + ready_lyrics = [] + print(artist_dict.keys()) + for song in artist_dict['songs']: + sanitized_lyrics = sanitize_lyrics(song['lyrics']) + print(sanitized_lyrics) + ready_lyrics.append(sanitized_lyrics) + return ready_lyrics + +def save_sanitized_lyrics(): + sanitized_lyrics_list = [] + for file in os.listdir("./"): + if file.endswith(".json"): + with open(file, 'r', encoding="utf-8") as read_file: + sanitized_lyrics_list.extend(get_lyrics_from_json(read_file)) + print(sanitized_lyrics_list) + with open('./lyrics.txt', 'w+', encoding="utf-8") as lyrics_file: + for lyrics in sanitized_lyrics_list: + print(lyrics) + lyrics_file.write(f"{lyrics}\n") + + +save_sanitized_lyrics() diff --git a/modules/WillieTimer/phraseGenerator.py b/modules/WillieTimer/phraseGenerator.py new file mode 100644 index 0000000..d21edf4 --- /dev/null +++ b/modules/WillieTimer/phraseGenerator.py @@ -0,0 +1,121 @@ +import numpy +import sys +from nltk.tokenize import RegexpTokenizer +from nltk.corpus import stopwords +from keras.models import Sequential, load_model +from keras.layers import Dense, Dropout, LSTM +from keras.utils import np_utils +from keras.callbacks import ModelCheckpoint + + +class PhraseGenerator(): + def __init__(self): + self.training_file = "./lyrics.txt" + self.file = open(self.training_file, 'r', encoding='utf-8') + self.model = Sequential() + + self.processed_inputs = self.tokenize_words(self.file) + self.chars = sorted(list(set(self.processed_inputs))) + + self.input_len = len(self.processed_inputs) + self.vocab_len = len(self.chars) + + self.seq_length = 100 + self.x_data = [] + self.y_data = [] + + def nums_to_chars(self): + return dict((i, c) for i, c in enumerate(self.chars)) + + def chars_to_nums(self): + return dict((c, i) for i, c in enumerate(self.chars)) + + def tokenize_words(self, input): + input = str(input).lower() + + tokenizer = RegexpTokenizer(r'\w+') + tokens = tokenizer.tokenize(input) + + filtered = filter(lambda token: token not in stopwords.words('english'), tokens) + + return " ".join(filtered) + + def train(self): + char_to_num = self.chars_to_nums() + + print("Total number of characters:", self.input_len) + print("Total vocab:", self.vocab_len) + + for i in range(0, self.input_len - self.seq_length, 1): + print(i) + # Define input and output sequences + # Input is the current character plus desired sequence length + in_seq = self.processed_inputs[i:i + self.seq_length] + + # Out sequence is the initial character plus total sequence length + out_seq = self.processed_inputs[i + self.seq_length] + + # We now convert list of characters to integers based on + # previously and add the values to our lists + self.x_data.append([char_to_num[char] for char in in_seq]) + self.y_data.append(char_to_num[out_seq]) + print(f"X-Data:\t{self.x_data}\nY-Data:\t{self.y_data}") + + n_patterns = len(self.x_data) + print("Total Patterns:", n_patterns) + + X = numpy.reshape(self.x_data, (n_patterns, self.seq_length, 1)) + X = X/float(self.vocab_len) + + y = np_utils.to_categorical(self.y_data) + + self.model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True)) + self.model.add(Dropout(0.2)) + self.model.add(LSTM(256, return_sequences=True)) + self.model.add(Dropout(0.2)) + self.model.add(LSTM(128)) + self.model.add(Dropout(0.2)) + self.model.add(Dense(y.shape[1], activation='softmax')) + + filepath = "model_weights_saved.hdf5" + self.model.load_weights(filepath) + self.model.compile(loss='categorical_crossentropy', optimizer='adam') + + checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') + desired_callbacks = [checkpoint] + + self.model.fit(X, y, epochs=500, batch_size=256, callbacks=desired_callbacks) + + self.model.load_weights(filepath) + self.model.compile(loss='categorical_crossentropy', optimizer='adam') + + def generate_text(self): + num_to_char = self.nums_to_chars() + start = numpy.random.randint(0, len(self.x_data) - 1) + pattern = self.x_data[start] + print(pattern) + print("Random Seed:") + print("\"", ''.join([num_to_char[value] for value in pattern]), "\"") + + output_string = "" + for i in range(500): + x = numpy.reshape(pattern, (1, len(pattern), 1)) + x = x / float(self.vocab_len) + prediction = self.model.predict(x, verbose=0) + index = numpy.argmax(prediction) + result = num_to_char[index] + + output_string += str(result) + + pattern.append(index) + pattern = pattern[1:len(pattern)] + print(output_string) + print(pattern) + +print('Starting') +bot = PhraseGenerator() +print('Training') +bot.train() +print("Generating Text") +bot.generate_text() + From 7ed60e9f705840197b9c899750d114543206ba3f Mon Sep 17 00:00:00 2001 From: Logan Cusano Date: Wed, 22 Dec 2021 17:50:16 -0500 Subject: [PATCH 4/5] Ignore Mac system files --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4471aed..89abbc6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ config.ini *.7z *.bat /DSDPlus/ +._.DS_Store From d41919e32b62503451dce018175f7077c8942a4b Mon Sep 17 00:00:00 2001 From: Logan Cusano Date: Mon, 27 Dec 2021 01:09:48 -0500 Subject: [PATCH 5/5] Created AI text generator function to fit into WilliBot --- modules/WillieTimer/phraseGenerator.py | 150 +++++++------------------ 1 file changed, 40 insertions(+), 110 deletions(-) diff --git a/modules/WillieTimer/phraseGenerator.py b/modules/WillieTimer/phraseGenerator.py index d21edf4..c183add 100644 --- a/modules/WillieTimer/phraseGenerator.py +++ b/modules/WillieTimer/phraseGenerator.py @@ -1,121 +1,51 @@ -import numpy -import sys -from nltk.tokenize import RegexpTokenizer -from nltk.corpus import stopwords -from keras.models import Sequential, load_model -from keras.layers import Dense, Dropout, LSTM -from keras.utils import np_utils -from keras.callbacks import ModelCheckpoint +import os +import argparse +from textgenrnn import textgenrnn -class PhraseGenerator(): - def __init__(self): - self.training_file = "./lyrics.txt" - self.file = open(self.training_file, 'r', encoding='utf-8') - self.model = Sequential() +class PhraseGenerator(textgenrnn): + def __init__(self, input_training_file_path='./lyrics.txt', input_epochs=1, input_temperature=.5, + input_model_file_path='./textgenrnn_weights.hdf5'): + # Set logging for Tensorflow + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - self.processed_inputs = self.tokenize_words(self.file) - self.chars = sorted(list(set(self.processed_inputs))) + # Init vars + self.training_file_path = input_training_file_path + self.model_file_path = input_model_file_path + self.epochs = input_epochs + self.temperature = input_temperature - self.input_len = len(self.processed_inputs) - self.vocab_len = len(self.chars) + # Init Textgenrnn + super().__init__(weights_path=self.model_file_path, allow_growth=True, name='WillieBotModel') - self.seq_length = 100 - self.x_data = [] - self.y_data = [] + def pg_train(self): + self.train_from_file(self.training_file_path, num_epochs=self.epochs, verbose=0, top_n=5, return_as_list=True) - def nums_to_chars(self): - return dict((i, c) for i, c in enumerate(self.chars)) + def pg_generate(self): + generated_text = self.generate(1, temperature=self.temperature, return_as_list=True) + print(generated_text[0]) - def chars_to_nums(self): - return dict((c, i) for i, c in enumerate(self.chars)) - def tokenize_words(self, input): - input = str(input).lower() +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Description of your program') + parser.add_argument('-t', '--train', action='store_true', help='Train the model', required=False) + parser.add_argument('-g', '--generate', action='store_true', help='Generate text', required=False) + parser.add_argument('-e', '--epochs', action='store', type=int, help='Set amount of epochs (defaults to 5)', + required=False) + parser.add_argument('-p', '--temp', action='store', type=int, + help='Set temperature for generation (defaults to .5)', required=False) + parser.add_argument('-f', '--training_file', action='store', type=str, + help='Set the training file (defaults to \'./lyrics.txt\')', required=False) + args = vars(parser.parse_args()) + print(args) + print('Starting') - tokenizer = RegexpTokenizer(r'\w+') - tokens = tokenizer.tokenize(input) + pg = PhraseGenerator(input_epochs=args['epochs'] if args['epochs'] else 1, + input_training_file_path=args['training_file'] if args['training_file'] else './lyrics.txt', + input_temperature=args['temp'] if args['temp'] else .5) - filtered = filter(lambda token: token not in stopwords.words('english'), tokens) - - return " ".join(filtered) - - def train(self): - char_to_num = self.chars_to_nums() - - print("Total number of characters:", self.input_len) - print("Total vocab:", self.vocab_len) - - for i in range(0, self.input_len - self.seq_length, 1): - print(i) - # Define input and output sequences - # Input is the current character plus desired sequence length - in_seq = self.processed_inputs[i:i + self.seq_length] - - # Out sequence is the initial character plus total sequence length - out_seq = self.processed_inputs[i + self.seq_length] - - # We now convert list of characters to integers based on - # previously and add the values to our lists - self.x_data.append([char_to_num[char] for char in in_seq]) - self.y_data.append(char_to_num[out_seq]) - print(f"X-Data:\t{self.x_data}\nY-Data:\t{self.y_data}") - - n_patterns = len(self.x_data) - print("Total Patterns:", n_patterns) - - X = numpy.reshape(self.x_data, (n_patterns, self.seq_length, 1)) - X = X/float(self.vocab_len) - - y = np_utils.to_categorical(self.y_data) - - self.model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True)) - self.model.add(Dropout(0.2)) - self.model.add(LSTM(256, return_sequences=True)) - self.model.add(Dropout(0.2)) - self.model.add(LSTM(128)) - self.model.add(Dropout(0.2)) - self.model.add(Dense(y.shape[1], activation='softmax')) - - filepath = "model_weights_saved.hdf5" - self.model.load_weights(filepath) - self.model.compile(loss='categorical_crossentropy', optimizer='adam') - - checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') - desired_callbacks = [checkpoint] - - self.model.fit(X, y, epochs=500, batch_size=256, callbacks=desired_callbacks) - - self.model.load_weights(filepath) - self.model.compile(loss='categorical_crossentropy', optimizer='adam') - - def generate_text(self): - num_to_char = self.nums_to_chars() - start = numpy.random.randint(0, len(self.x_data) - 1) - pattern = self.x_data[start] - print(pattern) - print("Random Seed:") - print("\"", ''.join([num_to_char[value] for value in pattern]), "\"") - - output_string = "" - for i in range(500): - x = numpy.reshape(pattern, (1, len(pattern), 1)) - x = x / float(self.vocab_len) - prediction = self.model.predict(x, verbose=0) - index = numpy.argmax(prediction) - result = num_to_char[index] - - output_string += str(result) - - pattern.append(index) - pattern = pattern[1:len(pattern)] - print(output_string) - print(pattern) - -print('Starting') -bot = PhraseGenerator() -print('Training') -bot.train() -print("Generating Text") -bot.generate_text() + if args['train']: + pg.pg_train() + if args['generate']: + pg.pg_generate()