import os
import argparse

from textgenrnn import textgenrnn


class PhraseGenerator(textgenrnn):
    """Thin wrapper around textgenrnn that trains on a lyrics file and
    generates short phrases for WillieBot.

    Replaces the previous hand-rolled Keras LSTM pipeline; weights are
    loaded from / saved to ``input_model_file_path``.
    """

    def __init__(self, input_training_file_path='./lyrics.txt', input_epochs=1,
                 input_temperature=0.5,
                 input_model_file_path='./textgenrnn_weights.hdf5'):
        """Configure paths/hyperparameters and initialize the base model.

        Args:
            input_training_file_path: plain-text training corpus, one entry
                per line (defaults to './lyrics.txt').
            input_epochs: number of training epochs for pg_train().
            input_temperature: sampling temperature for pg_generate().
            input_model_file_path: HDF5 weights file handed to textgenrnn.
        """
        # Quiet TensorFlow's C++ logging (level 2 filters INFO and WARNING).
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

        self.training_file_path = input_training_file_path
        self.model_file_path = input_model_file_path
        self.epochs = input_epochs
        self.temperature = input_temperature

        # allow_growth avoids TensorFlow grabbing all GPU memory up front.
        super().__init__(weights_path=self.model_file_path,
                         allow_growth=True,
                         name='WillieBotModel')

    def pg_train(self):
        """Train the underlying model on the configured text file."""
        # NOTE(review): top_n is normally a generation-time parameter;
        # confirm textgenrnn.train_from_file actually honors it here.
        # The previous return_as_list=True was dropped: it is a generation
        # kwarg and the training call's return value was discarded anyway.
        self.train_from_file(self.training_file_path,
                             num_epochs=self.epochs,
                             verbose=0,
                             top_n=5)

    def pg_generate(self):
        """Generate one phrase at self.temperature; print and return it."""
        generated_text = self.generate(1, temperature=self.temperature,
                                       return_as_list=True)
        print(generated_text[0])
        return generated_text[0]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Train or sample the WillieBot phrase generator')
    parser.add_argument('-t', '--train', action='store_true',
                        help='Train the model')
    parser.add_argument('-g', '--generate', action='store_true',
                        help='Generate text')
    # Defaults live on the parser (instead of falsy `x if x else default`
    # ternaries) so explicit falsy values like `--temp 0` are preserved.
    parser.add_argument('-e', '--epochs', type=int, default=1,
                        help='Set amount of epochs (defaults to 1)')
    # Temperature is fractional: type must be float, not int, or argparse
    # rejects values like 0.7 outright.
    parser.add_argument('-p', '--temp', type=float, default=0.5,
                        help='Set temperature for generation (defaults to 0.5)')
    parser.add_argument('-f', '--training_file', type=str,
                        default='./lyrics.txt',
                        help='Set the training file (defaults to \'./lyrics.txt\')')
    args = vars(parser.parse_args())
    print('Starting')

    pg = PhraseGenerator(input_epochs=args['epochs'],
                         input_training_file_path=args['training_file'],
                         input_temperature=args['temp'])

    if args['train']:
        pg.pg_train()
    if args['generate']:
        pg.pg_generate()