From 24cfac7c33983e9a4776882c457096dbfa66f699 Mon Sep 17 00:00:00 2001
From: Logan Cusano
Date: Wed, 22 Dec 2021 16:50:35 -0500
Subject: [PATCH] Init ML scripts

---
 modules/WillieTimer/lyrics.py          |  51 ++++++++++
 modules/WillieTimer/phraseGenerator.py | 125 +++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 modules/WillieTimer/lyrics.py
 create mode 100644 modules/WillieTimer/phraseGenerator.py

diff --git a/modules/WillieTimer/lyrics.py b/modules/WillieTimer/lyrics.py
new file mode 100644
index 0000000..2f988eb
--- /dev/null
+++ b/modules/WillieTimer/lyrics.py
@@ -0,0 +1,51 @@
+from lyricsgenius import Genius
+import json
+import re
+import os
+
+
+def get_songs(artists=["Notorious B.I.G", "outkast", "nwa"]):
+    # Read the API token from the environment rather than hardcoding it
+    genius_token = os.environ["GENIUS_TOKEN"]
+    session = Genius(genius_token, retries=2, timeout=20, sleep_time=0.3)
+
+    # Fetch up to 75 songs per artist; save_lyrics() dumps each artist's
+    # songs to a JSON file in the working directory
+    for artist in artists:
+        songlist = session.search_artist(artist, max_songs=75, sort='title')
+        songlist.save_lyrics()
+
+
+def sanitize_lyrics(lyrics):
+    # Strip [Verse]/[Chorus]-style section tags, the Genius embed footer,
+    # and runs of three or more newlines
+    notes_re = re.compile(r'((?:\[[0-9a-zA-Z :()&+.-]+\])(?: \+ \([a-zA-Z .-]+)?(?:\n)?)')
+    footer_re = re.compile(r'((?:EmbedShare)[ ]*(?:URLCopyEmbedCopy))')
+    multiline_re = re.compile(r'(\n){3,}')
+    sanitized = notes_re.sub('', lyrics)
+    sanitized = footer_re.sub('', sanitized)
+    sanitized = multiline_re.sub('\n\n', sanitized)
+    return sanitized
+
+
+def get_lyrics_from_json(json_file):
+    artist_dict = json.load(json_file)
+    ready_lyrics = []
+    for song in artist_dict['songs']:
+        ready_lyrics.append(sanitize_lyrics(song['lyrics']))
+    return ready_lyrics
+
+
+def save_sanitized_lyrics():
+    sanitized_lyrics_list = []
+    for file in os.listdir("./"):
+        if file.endswith(".json"):
+            with open(file, 'r', encoding="utf-8") as read_file:
+                sanitized_lyrics_list.extend(get_lyrics_from_json(read_file))
+    with open('./lyrics.txt', 'w+', encoding="utf-8") as lyrics_file:
+        for lyrics in sanitized_lyrics_list:
+            lyrics_file.write(f"{lyrics}\n")
+
+
+if __name__ == "__main__":
+    save_sanitized_lyrics()
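
Not part of the diff: a minimal usage sketch for lyrics.py, assuming a
GENIUS_TOKEN environment variable is exported (the JSON filename below is
whatever lyricsgenius chooses, and the sample lyric string is made up):

    from lyrics import get_songs, sanitize_lyrics, save_sanitized_lyrics

    get_songs(["outkast"])      # writes Lyrics_<artist>.json into the cwd
    save_sanitized_lyrics()     # folds every *.json into ./lyrics.txt

    # sanitize_lyrics drops section tags and the embed footer, and
    # collapses blank-line runs:
    raw = "[Verse 1]\nIt was all a dream\n\n\n\nEmbedShare URLCopyEmbedCopy"
    print(sanitize_lyrics(raw))  # -> "It was all a dream\n\n"
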
diff --git a/modules/WillieTimer/phraseGenerator.py b/modules/WillieTimer/phraseGenerator.py
new file mode 100644
index 0000000..d21edf4
--- /dev/null
+++ b/modules/WillieTimer/phraseGenerator.py
@@ -0,0 +1,125 @@
+import os
+import numpy
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus import stopwords
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, LSTM
+from keras.utils import np_utils
+from keras.callbacks import ModelCheckpoint
+
+
+class PhraseGenerator():
+    def __init__(self):
+        self.training_file = "./lyrics.txt"
+        # Read the training corpus produced by lyrics.py
+        with open(self.training_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+        self.model = Sequential()
+
+        self.processed_inputs = self.tokenize_words(text)
+        self.chars = sorted(list(set(self.processed_inputs)))
+
+        self.input_len = len(self.processed_inputs)
+        self.vocab_len = len(self.chars)
+
+        self.seq_length = 100
+        self.x_data = []
+        self.y_data = []
+
+    def nums_to_chars(self):
+        return dict((i, c) for i, c in enumerate(self.chars))
+
+    def chars_to_nums(self):
+        return dict((c, i) for i, c in enumerate(self.chars))
+
+    def tokenize_words(self, text):
+        # Lowercase, keep word tokens only, and drop English stopwords
+        # (requires the nltk stopwords corpus: nltk.download('stopwords'))
+        text = text.lower()
+
+        tokenizer = RegexpTokenizer(r'\w+')
+        tokens = tokenizer.tokenize(text)
+
+        filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
+
+        return " ".join(filtered)
+
+    def train(self):
+        char_to_num = self.chars_to_nums()
+
+        print("Total number of characters:", self.input_len)
+        print("Total vocab:", self.vocab_len)
+
+        for i in range(0, self.input_len - self.seq_length, 1):
+            # Input is a window of seq_length characters
+            in_seq = self.processed_inputs[i:i + self.seq_length]
+
+            # Output is the single character that follows the window
+            out_seq = self.processed_inputs[i + self.seq_length]
+
+            # Convert the characters to integers using the mapping built
+            # above and add the values to the training lists
+            self.x_data.append([char_to_num[char] for char in in_seq])
+            self.y_data.append(char_to_num[out_seq])
+
+        n_patterns = len(self.x_data)
+        print("Total Patterns:", n_patterns)
+
+        X = numpy.reshape(self.x_data, (n_patterns, self.seq_length, 1))
+        X = X / float(self.vocab_len)
+
+        y = np_utils.to_categorical(self.y_data)
+
+        self.model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+        self.model.add(Dropout(0.2))
+        self.model.add(LSTM(256, return_sequences=True))
+        self.model.add(Dropout(0.2))
+        self.model.add(LSTM(128))
+        self.model.add(Dropout(0.2))
+        self.model.add(Dense(y.shape[1], activation='softmax'))
+
+        filepath = "model_weights_saved.hdf5"
+        if os.path.exists(filepath):
+            # Resume from an earlier checkpoint when one is available
+            self.model.load_weights(filepath)
+        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
+        desired_callbacks = [checkpoint]
+
+        self.model.fit(X, y, epochs=500, batch_size=256, callbacks=desired_callbacks)
+
+        # Reload the best weights saved during training
+        self.model.load_weights(filepath)
+        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+    def generate_text(self):
+        num_to_char = self.nums_to_chars()
+        start = numpy.random.randint(0, len(self.x_data) - 1)
+        pattern = list(self.x_data[start])  # copy so we don't mutate x_data
+        print("Random Seed:")
+        print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")
+
+        output_string = ""
+        for i in range(500):
+            x = numpy.reshape(pattern, (1, len(pattern), 1))
+            x = x / float(self.vocab_len)
+            prediction = self.model.predict(x, verbose=0)
+            index = numpy.argmax(prediction)
+            result = num_to_char[index]
+
+            output_string += str(result)
+
+            # Slide the window forward by one predicted character
+            pattern.append(index)
+            pattern = pattern[1:]
+        print(output_string)
+
+
+if __name__ == "__main__":
+    print('Starting')
+    bot = PhraseGenerator()
+    print('Training')
+    bot.train()
+    print("Generating Text")
+    bot.generate_text()
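
Also outside the patch: generate_text() always takes numpy.argmax, so long
samples tend to repeat themselves. A common alternative, not something this
patch implements, is to sample from the softmax output with a temperature:

    import numpy

    def sample_index(prediction, temperature=0.8):
        # Re-weight the model's softmax output and draw from it instead of
        # taking argmax; higher temperature -> more varied, noisier text
        probs = numpy.asarray(prediction, dtype="float64").flatten()
        probs = numpy.log(probs + 1e-8) / temperature
        probs = numpy.exp(probs)
        probs /= probs.sum()
        return int(numpy.random.choice(len(probs), p=probs))

    # Drop-in replacement for the argmax line in generate_text:
    #     index = sample_index(prediction)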