Created AI text generator class to fit into WilliBot

Logan Cusano
2021-12-27 01:09:48 -05:00
parent 7ed60e9f70
commit d41919e32b

@@ -1,121 +1,51 @@
-import numpy
-import sys
-from nltk.tokenize import RegexpTokenizer
-from nltk.corpus import stopwords
-from keras.models import Sequential, load_model
-from keras.layers import Dense, Dropout, LSTM
-from keras.utils import np_utils
-from keras.callbacks import ModelCheckpoint
+import os
+import argparse
+from textgenrnn import textgenrnn
-class PhraseGenerator():
-    def __init__(self):
-        self.training_file = "./lyrics.txt"
-        self.file = open(self.training_file, 'r', encoding='utf-8')
-        self.model = Sequential()
+class PhraseGenerator(textgenrnn):
+    def __init__(self, input_training_file_path='./lyrics.txt', input_epochs=1, input_temperature=.5,
+                 input_model_file_path='./textgenrnn_weights.hdf5'):
+        # Set logging for Tensorflow
+        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-        self.processed_inputs = self.tokenize_words(self.file)
-        self.chars = sorted(list(set(self.processed_inputs)))
+        # Init vars
+        self.training_file_path = input_training_file_path
+        self.model_file_path = input_model_file_path
+        self.epochs = input_epochs
+        self.temperature = input_temperature
-        self.input_len = len(self.processed_inputs)
-        self.vocab_len = len(self.chars)
+        # Init Textgenrnn
+        super().__init__(weights_path=self.model_file_path, allow_growth=True, name='WillieBotModel')
-        self.seq_length = 100
-        self.x_data = []
-        self.y_data = []
+    def pg_train(self):
+        self.train_from_file(self.training_file_path, num_epochs=self.epochs, verbose=0)
-    def nums_to_chars(self):
-        return dict((i, c) for i, c in enumerate(self.chars))
+    def pg_generate(self):
+        generated_text = self.generate(1, temperature=self.temperature, return_as_list=True)
+        print(generated_text[0])
-    def chars_to_nums(self):
-        return dict((c, i) for i, c in enumerate(self.chars))
-    def tokenize_words(self, input):
-        input = str(input).lower()
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Train the WilliBot phrase generator or generate text from it')
+    parser.add_argument('-t', '--train', action='store_true', help='Train the model', required=False)
+    parser.add_argument('-g', '--generate', action='store_true', help='Generate text', required=False)
+    parser.add_argument('-e', '--epochs', action='store', type=int, help='Set amount of epochs (defaults to 1)',
+                        required=False)
+    parser.add_argument('-p', '--temp', action='store', type=float,
+                        help='Set temperature for generation (defaults to .5)', required=False)
+    parser.add_argument('-f', '--training_file', action='store', type=str,
+                        help='Set the training file (defaults to \'./lyrics.txt\')', required=False)
+    args = vars(parser.parse_args())
+    print(args)
+    print('Starting')
-        tokenizer = RegexpTokenizer(r'\w+')
-        tokens = tokenizer.tokenize(input)
+    pg = PhraseGenerator(input_epochs=args['epochs'] if args['epochs'] else 1,
+                         input_training_file_path=args['training_file'] if args['training_file'] else './lyrics.txt',
+                         input_temperature=args['temp'] if args['temp'] else .5)
-        filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
-        return " ".join(filtered)
-    def train(self):
-        char_to_num = self.chars_to_nums()
-        print("Total number of characters:", self.input_len)
-        print("Total vocab:", self.vocab_len)
-        for i in range(0, self.input_len - self.seq_length, 1):
-            print(i)
-            # Define input and output sequences
-            # Input is the current character plus desired sequence length
-            in_seq = self.processed_inputs[i:i + self.seq_length]
-            # Out sequence is the initial character plus total sequence length
-            out_seq = self.processed_inputs[i + self.seq_length]
-            # We now convert the list of characters to integers using the map
-            # built previously and add the values to our lists
-            self.x_data.append([char_to_num[char] for char in in_seq])
-            self.y_data.append(char_to_num[out_seq])
-        print(f"X-Data:\t{self.x_data}\nY-Data:\t{self.y_data}")
-        n_patterns = len(self.x_data)
-        print("Total Patterns:", n_patterns)
-        X = numpy.reshape(self.x_data, (n_patterns, self.seq_length, 1))
-        X = X/float(self.vocab_len)
-        y = np_utils.to_categorical(self.y_data)
-        self.model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
-        self.model.add(Dropout(0.2))
-        self.model.add(LSTM(256, return_sequences=True))
-        self.model.add(Dropout(0.2))
-        self.model.add(LSTM(128))
-        self.model.add(Dropout(0.2))
-        self.model.add(Dense(y.shape[1], activation='softmax'))
-        filepath = "model_weights_saved.hdf5"
-        self.model.load_weights(filepath)
-        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
-        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
-        desired_callbacks = [checkpoint]
-        self.model.fit(X, y, epochs=500, batch_size=256, callbacks=desired_callbacks)
-        self.model.load_weights(filepath)
-        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
-    def generate_text(self):
-        num_to_char = self.nums_to_chars()
-        start = numpy.random.randint(0, len(self.x_data) - 1)
-        pattern = self.x_data[start]
-        print(pattern)
-        print("Random Seed:")
-        print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")
-        output_string = ""
-        for i in range(500):
-            x = numpy.reshape(pattern, (1, len(pattern), 1))
-            x = x / float(self.vocab_len)
-            prediction = self.model.predict(x, verbose=0)
-            index = numpy.argmax(prediction)
-            result = num_to_char[index]
-            output_string += str(result)
-            pattern.append(index)
-            pattern = pattern[1:len(pattern)]
-        print(output_string)
-        print(pattern)
-print('Starting')
-bot = PhraseGenerator()
-print('Training')
-bot.train()
-print("Generating Text")
-bot.generate_text()
+    if args['train']:
+        pg.pg_train()
+    if args['generate']:
+        pg.pg_generate()
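
Since the new class is meant to slot into WilliBot, here is a minimal sketch of how the bot side might drive it programmatically instead of through the CLI above. It assumes the trained weights already exist at the default ./textgenrnn_weights.hdf5 path (the constructor hands that file to textgenrnn as weights_path), and the module name phrase_generator is an illustrative assumption, not part of this commit:

# Hypothetical WilliBot-side usage; the module name phrase_generator is an
# assumption, and ./textgenrnn_weights.hdf5 must already exist because
# PhraseGenerator.__init__ passes it to textgenrnn as weights_path.
from phrase_generator import PhraseGenerator

pg = PhraseGenerator(input_training_file_path='./lyrics.txt',
                     input_epochs=1,
                     input_temperature=.5)

# Optionally fine-tune on the lyrics file first
pg.pg_train()

# pg_generate() only prints to stdout; a bot needs the string itself, so
# call generate() from the textgenrnn base class directly, as pg_generate()
# does internally
phrase = pg.generate(1, temperature=pg.temperature, return_as_list=True)[0]

The equivalent one-shot run from a shell would pass -t and -g together, with -e, -p, and -f overriding the defaults.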