From 24cfac7c33983e9a4776882c457096dbfa66f699 Mon Sep 17 00:00:00 2001
From: Logan Cusano
Date: Wed, 22 Dec 2021 16:50:35 -0500
Subject: [PATCH] Init ML scripts

---
 modules/WillieTimer/lyrics.py          |  51 ++++++++++
 modules/WillieTimer/phraseGenerator.py | 125 +++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 modules/WillieTimer/lyrics.py
 create mode 100644 modules/WillieTimer/phraseGenerator.py

diff --git a/modules/WillieTimer/lyrics.py b/modules/WillieTimer/lyrics.py
new file mode 100644
index 0000000..2f988eb
--- /dev/null
+++ b/modules/WillieTimer/lyrics.py
@@ -0,0 +1,51 @@
+from lyricsgenius import Genius
+import json
+import re
+import os
+
+
+def get_songs(artists=["Notorious B.I.G", "outkast", "nwa"]):
+    # Read the API token from the environment rather than hardcoding it
+    genius_token = os.environ["GENIUS_TOKEN"]
+    session = Genius(genius_token, retries=2, timeout=20, sleep_time=0.3)
+
+    # Fetch up to 75 songs per artist; save_lyrics() dumps each artist's
+    # songs to a JSON file in the working directory
+    for artist in artists:
+        songlist = session.search_artist(artist, max_songs=75, sort='title')
+        songlist.save_lyrics()
+
+
+def sanitize_lyrics(lyrics):
+    # Strip [Verse]/[Chorus]-style section tags, the Genius embed footer,
+    # and runs of three or more newlines
+    notes_re = re.compile(r'((?:\[[0-9a-zA-Z :()&+.-]+\])(?: \+ \([a-zA-Z .-]+)?(?:\n)?)')
+    footer_re = re.compile(r'((?:EmbedShare)[ ]*(?:URLCopyEmbedCopy))')
+    multiline_re = re.compile(r'(\n){3,}')
+    sanitized = notes_re.sub('', lyrics)
+    sanitized = footer_re.sub('', sanitized)
+    sanitized = multiline_re.sub('\n\n', sanitized)
+    return sanitized
+
+
+def get_lyrics_from_json(json_file):
+    artist_dict = json.load(json_file)
+    ready_lyrics = []
+    for song in artist_dict['songs']:
+        ready_lyrics.append(sanitize_lyrics(song['lyrics']))
+    return ready_lyrics
+
+
+def save_sanitized_lyrics():
+    sanitized_lyrics_list = []
+    for file in os.listdir("./"):
+        if file.endswith(".json"):
+            with open(file, 'r', encoding="utf-8") as read_file:
+                sanitized_lyrics_list.extend(get_lyrics_from_json(read_file))
+    with open('./lyrics.txt', 'w+', encoding="utf-8") as lyrics_file:
+        for lyrics in sanitized_lyrics_list:
+            lyrics_file.write(f"{lyrics}\n")
+
+
+if __name__ == "__main__":
+    save_sanitized_lyrics()
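
Not part of the diff: a minimal usage sketch for lyrics.py, assuming a
GENIUS_TOKEN environment variable is exported (the JSON filename below is
whatever lyricsgenius chooses, and the sample lyric string is made up):

    from lyrics import get_songs, sanitize_lyrics, save_sanitized_lyrics

    get_songs(["outkast"])      # writes Lyrics_<artist>.json into the cwd
    save_sanitized_lyrics()     # folds every *.json into ./lyrics.txt

    # sanitize_lyrics drops section tags and the embed footer, and
    # collapses blank-line runs:
    raw = "[Verse 1]\nIt was all a dream\n\n\n\nEmbedShare URLCopyEmbedCopy"
    print(sanitize_lyrics(raw))  # -> "It was all a dream\n\n"
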
diff --git a/modules/WillieTimer/phraseGenerator.py b/modules/WillieTimer/phraseGenerator.py
new file mode 100644
index 0000000..d21edf4
--- /dev/null
+++ b/modules/WillieTimer/phraseGenerator.py
@@ -0,0 +1,125 @@
+import os
+import numpy
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus import stopwords
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, LSTM
+from keras.utils import np_utils
+from keras.callbacks import ModelCheckpoint
+
+
+class PhraseGenerator():
+    def __init__(self):
+        self.training_file = "./lyrics.txt"
+        # Read the training corpus produced by lyrics.py
+        with open(self.training_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+        self.model = Sequential()
+
+        self.processed_inputs = self.tokenize_words(text)
+        self.chars = sorted(list(set(self.processed_inputs)))
+
+        self.input_len = len(self.processed_inputs)
+        self.vocab_len = len(self.chars)
+
+        self.seq_length = 100
+        self.x_data = []
+        self.y_data = []
+
+    def nums_to_chars(self):
+        return dict((i, c) for i, c in enumerate(self.chars))
+
+    def chars_to_nums(self):
+        return dict((c, i) for i, c in enumerate(self.chars))
+
+    def tokenize_words(self, text):
+        # Lowercase, keep word tokens only, and drop English stopwords
+        # (requires the nltk stopwords corpus: nltk.download('stopwords'))
+        text = text.lower()
+
+        tokenizer = RegexpTokenizer(r'\w+')
+        tokens = tokenizer.tokenize(text)
+
+        filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
+
+        return " ".join(filtered)
+
+    def train(self):
+        char_to_num = self.chars_to_nums()
+
+        print("Total number of characters:", self.input_len)
+        print("Total vocab:", self.vocab_len)
+
+        for i in range(0, self.input_len - self.seq_length, 1):
+            # Input is a window of seq_length characters
+            in_seq = self.processed_inputs[i:i + self.seq_length]
+
+            # Output is the single character that follows the window
+            out_seq = self.processed_inputs[i + self.seq_length]
+
+            # Convert the characters to integers using the mapping built
+            # above and add the values to the training lists
+            self.x_data.append([char_to_num[char] for char in in_seq])
+            self.y_data.append(char_to_num[out_seq])
+
+        n_patterns = len(self.x_data)
+        print("Total Patterns:", n_patterns)
+
+        X = numpy.reshape(self.x_data, (n_patterns, self.seq_length, 1))
+        X = X / float(self.vocab_len)
+
+        y = np_utils.to_categorical(self.y_data)
+
+        self.model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
+        self.model.add(Dropout(0.2))
+        self.model.add(LSTM(256, return_sequences=True))
+        self.model.add(Dropout(0.2))
+        self.model.add(LSTM(128))
+        self.model.add(Dropout(0.2))
+        self.model.add(Dense(y.shape[1], activation='softmax'))
+
+        filepath = "model_weights_saved.hdf5"
+        if os.path.exists(filepath):
+            # Resume from an earlier checkpoint when one is available
+            self.model.load_weights(filepath)
+        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+        checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
+        desired_callbacks = [checkpoint]
+
+        self.model.fit(X, y, epochs=500, batch_size=256, callbacks=desired_callbacks)
+
+        # Reload the best weights saved during training
+        self.model.load_weights(filepath)
+        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
+
+    def generate_text(self):
+        num_to_char = self.nums_to_chars()
+        start = numpy.random.randint(0, len(self.x_data) - 1)
+        pattern = list(self.x_data[start])  # copy so we don't mutate x_data
+        print("Random Seed:")
+        print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")
+
+        output_string = ""
+        for i in range(500):
+            x = numpy.reshape(pattern, (1, len(pattern), 1))
+            x = x / float(self.vocab_len)
+            prediction = self.model.predict(x, verbose=0)
+            index = numpy.argmax(prediction)
+            result = num_to_char[index]
+
+            output_string += str(result)
+
+            # Slide the window forward by one predicted character
+            pattern.append(index)
+            pattern = pattern[1:]
+        print(output_string)
+
+
+if __name__ == "__main__":
+    print('Starting')
+    bot = PhraseGenerator()
+    print('Training')
+    bot.train()
+    print("Generating Text")
+    bot.generate_text()
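
Also outside the patch: generate_text() always takes numpy.argmax, so long
samples tend to repeat themselves. A common alternative, not something this
patch implements, is to sample from the softmax output with a temperature:

    import numpy

    def sample_index(prediction, temperature=0.8):
        # Re-weight the model's softmax output and draw from it instead of
        # taking argmax; higher temperature -> more varied, noisier text
        probs = numpy.asarray(prediction, dtype="float64").flatten()
        probs = numpy.log(probs + 1e-8) / temperature
        probs = numpy.exp(probs)
        probs /= probs.sum()
        return int(numpy.random.choice(len(probs), p=probs))

    # Drop-in replacement for the argmax line in generate_text:
    #     index = sample_index(prediction)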