3 Commits

Author SHA1 Message Date
Logan Cusano
77deb3ba2b Initial recording scraper 2023-06-17 17:33:24 -04:00
Logan Cusano
f4475dc9d7 #19
- Update the wrapper called when a feed encounters an error
    - Will now use a more robust backoff system
    - Waits in increments of 30 seconds
    - Keeps track of ignored attempts and 'count'

- Updated wrapper to remove source from backoff list
    - Now removes the object after the first attempt irrespective of deletion status
2023-06-16 23:26:38 -04:00
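In plain terms, the entry kept per failing source in the diff below has the shape { count, timestamp, ignoredAttempts }, and the wait window grows by 30 seconds for each counted failure. A minimal sketch of that bookkeeping; the helper name is illustrative and not part of the commit:

// Illustrative restatement of the backoff logic described above; shouldRetry
// does not exist in the repository.
const RETRY_INCREMENT_MS = 30 * 1000;

function shouldRetry(entry, now = Date.now()) {
    const waitWindow = entry.count * RETRY_INCREMENT_MS; // 30 s, 60 s, 90 s, ...
    if (now - entry.timestamp <= waitWindow) {
        entry.ignoredAttempts += 1; // error arrived inside the window; ignore it
        return false;
    }
    // Outside the window: count a real failure and restart the window.
    entry.count += 1;
    entry.timestamp = now;
    return true;
}
// Once entry.count reaches the failure limit, the real wrapper deletes the feed record.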
Logan Cusano
c4650a9e99 Make the bot option in the leave command required 2023-06-16 22:02:54 -04:00
4 changed files with 74 additions and 8 deletions

View File

@@ -40,7 +40,8 @@ module.exports = {
 .addStringOption(option =>
 option.setName("bot")
 .setDescription("The bot to disconnect from the server")
-.setAutocomplete(true)),
+.setAutocomplete(true)
+.setRequired(true)),
 example: "leave",
 isPrivileged: false,
 requiresTokens: false,
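With the option now required, Discord will not accept the command without a value, so the handler can read it unconditionally. A hedged sketch of that read; the execute handler is not part of this diff, so its shape here is an assumption:

// Sketch only: the leave command's handler is not shown in this diff.
async function execute(interaction) {
    // Passing `true` makes discord.js throw instead of returning null if the
    // option were ever missing, which setRequired(true) now rules out anyway.
    const bot = interaction.options.getString("bot", true);
    // ...look up `bot` and disconnect it from the guild...
}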

View File

@@ -32,17 +32,38 @@ var runningPostsToRemove = [{
 }]
 */
 var runningPostsToRemove = {};
-const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 3;
+const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 15;
 /**
  * Wrapper for feeds that cause errors. By default it will wait over a day for the source to come back online before deleting it.
  *
- * @param {*} sourceURL
+ * @param {string} sourceURL The URL of the feed source causing issues
  */
 exports.removeSource = function removeSource(sourceURL) {
     log.INFO("Removing source URL: ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) {runningPostsToRemove[sourceURL] = 1; return;}
+    // Check to see if this is the first time this source has been attempted
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) {
+        runningPostsToRemove[sourceURL] = { count: 1, timestamp: Date.now(), ignoredAttempts: 0 };
+        return;
+    }
+    const backoffDateTimeDifference = (Date.now() - new Date(runningPostsToRemove[sourceURL].timestamp));
+    const backoffWaitTime = (runningPostsToRemove[sourceURL].count * 30000);
+    log.DEBUG("Datetime", runningPostsToRemove[sourceURL], backoffDateTimeDifference, backoffWaitTime);
+    // Check to see if the last error occurred within the backoff period or if we should try again
+    if (backoffDateTimeDifference <= backoffWaitTime) {
+        runningPostsToRemove[sourceURL].ignoredAttempts += 1;
+        return;
+    }
-    if (runningPostsToRemove[sourceURL] < sourceFailureLimit) {runningPostsToRemove[sourceURL] += 1; return;}
+    // Increase the retry counter
+    if (runningPostsToRemove[sourceURL].count < sourceFailureLimit) {
+        runningPostsToRemove[sourceURL].count += 1;
+        runningPostsToRemove[sourceURL].timestamp = Date.now();
+        return;
+    }
     feedStorage.getRecordBy('link', sourceURL, (err, record) => {
         if (err) log.ERROR("Error getting record from feedStorage", err);
@@ -62,13 +83,14 @@ exports.removeSource = function removeSource(sourceURL) {
 /**
  * Unset a source URL from deletion if the source has not already been deleted
  * @param {*} sourceURL The source URL to be unset from deletion
  * @returns {*}
  */
 exports.unsetRemoveSource = function unsetRemoveSource(sourceURL) {
     log.INFO("Unsetting source URL from deletion (if not already deleted): ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) return;
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) return;
-    if (runningPostsToRemove[sourceURL] > sourceFailureLimit) return delete runningPostsToRemove[sourceURL];
+    delete runningPostsToRemove[sourceURL];
+    return
 }
 /**
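For context, a hedged sketch of how a feed poller might drive the two exports above; only removeSource and unsetRemoveSource come from this file, while the poller and the require path are illustrative assumptions:

// Illustrative caller: repeated fetch failures feed the backoff above,
// and a successful fetch clears any pending deletion for that source.
const feedErrors = require("./feedErrorWrapper"); // assumed path, not the real filename

async function pollFeed(sourceURL, fetchFeed) {
    try {
        await fetchFeed(sourceURL);
        feedErrors.unsetRemoveSource(sourceURL); // success: forget earlier failures
    } catch (err) {
        feedErrors.removeSource(sourceURL);      // failure: count it against the backoff
    }
}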

View File

@@ -0,0 +1,40 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class RecordingSpider(scrapy.Spider):
+    name = "recording-scraper"
+    start_urls = [
+        'https://radio.vpn.cusano.net/sdr/transmissions',
+    ]
+
+    def parse(self, response):
+        print("ASDASDD")
+        print(response)
+        for row in response.css("tr"):
+            if row.css('td.py-1'):
+                links = row.css('a')
+                rows = row.css('td.py-1')
+                print(row)
+                yield {
+                    'device': rows[0],
+                    'date': rows[1],
+                    'duration': rows[2],
+                    "frequency": rows[3],
+                    "link": links[0].attrib["href"],
+                }
+        next_page_url = response.css("a.page-link > a::attr(href)").extract_first()
+        if next_page_url is not None:
+            yield scrapy.Request(response.urljoin(next_page_url))
+
+
+process = CrawlerProcess(
+    settings={
+        "FEEDS": {
+            "items.json": {"format": "json"},
+        },
+    }
+)
+process.crawl(RecordingSpider)
+process.start()  # the script will block here until the crawling is finished

View File

@@ -0,0 +1,3 @@
+scrapy
+fake-useragent
+beautifulsoup4