3 Commits

Author SHA1 Message Date
Logan Cusano
77deb3ba2b Initial recording scraper 2023-06-17 17:33:24 -04:00
Logan Cusano
f4475dc9d7 #19
- Update the wrapper called when a feed encounters an error
    - Will now use a more robust backoff system
    - Waits in increments of 30 seconds
    - Keeps track of ignored attempts and 'count'

- Updated wrapper to remove source from backoff list
    - Now removes the object after the first attempt irrespective of deletion status
2023-06-16 23:26:38 -04:00
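In plain terms, the entry kept per failing source in the diff below has the shape { count, timestamp, ignoredAttempts }, and the wait window grows by 30 seconds for each counted failure. A minimal sketch of that bookkeeping; the helper name is illustrative and not part of the commit:

// Illustrative restatement of the backoff logic described above; shouldRetry
// does not exist in the repository.
const RETRY_INCREMENT_MS = 30 * 1000;

function shouldRetry(entry, now = Date.now()) {
    const waitWindow = entry.count * RETRY_INCREMENT_MS; // 30 s, 60 s, 90 s, ...
    if (now - entry.timestamp <= waitWindow) {
        entry.ignoredAttempts += 1; // error arrived inside the window; ignore it
        return false;
    }
    // Outside the window: count a real failure and restart the window.
    entry.count += 1;
    entry.timestamp = now;
    return true;
}
// Once entry.count reaches the failure limit, the real wrapper deletes the feed record.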
Logan Cusano
c4650a9e99 Make the bot option in the leave command required 2023-06-16 22:02:54 -04:00
4 changed files with 74 additions and 8 deletions

View File

@@ -40,7 +40,8 @@ module.exports = {
 .addStringOption(option =>
 option.setName("bot")
 .setDescription("The bot to disconnect from the server")
-.setAutocomplete(true)),
+.setAutocomplete(true)
+.setRequired(true)),
 example: "leave",
 isPrivileged: false,
 requiresTokens: false,
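With the option now required, Discord will not accept the command without a value, so the handler can read it unconditionally. A hedged sketch of that read; the execute handler is not part of this diff, so its shape here is an assumption:

// Sketch only: the leave command's handler is not shown in this diff.
async function execute(interaction) {
    // Passing `true` makes discord.js throw instead of returning null if the
    // option were ever missing, which setRequired(true) now rules out anyway.
    const bot = interaction.options.getString("bot", true);
    // ...look up `bot` and disconnect it from the guild...
}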

View File

@@ -32,17 +32,38 @@ var runningPostsToRemove = [{
 }]
 */
 var runningPostsToRemove = {};
-const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 3;
+const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 15;
 /**
  * Wrapper for feeds that cause errors. By default it will wait over a day for the source to come back online before deleting it.
  *
- * @param {*} sourceURL
+ * @param {string} sourceURL The URL of the feed source causing issues
  */
 exports.removeSource = function removeSource(sourceURL) {
     log.INFO("Removing source URL: ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) {runningPostsToRemove[sourceURL] = 1; return;}
+    // Check to see if this is the first time this source has been attempted
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) {
+        runningPostsToRemove[sourceURL] = { count: 1, timestamp: Date.now(), ignoredAttempts: 0 };
+        return;
+    }
+    const backoffDateTimeDifference = (Date.now() - new Date(runningPostsToRemove[sourceURL].timestamp));
+    const backoffWaitTime = (runningPostsToRemove[sourceURL].count * 30000);
+    log.DEBUG("Datetime", runningPostsToRemove[sourceURL], backoffDateTimeDifference, backoffWaitTime);
+    // Check to see if the last error occurred within the backoff period or if we should try again
+    if (backoffDateTimeDifference <= backoffWaitTime) {
+        runningPostsToRemove[sourceURL].ignoredAttempts += 1;
+        return;
+    }
-    if (runningPostsToRemove[sourceURL] < sourceFailureLimit) {runningPostsToRemove[sourceURL] += 1; return;}
+    // Increase the retry counter
+    if (runningPostsToRemove[sourceURL].count < sourceFailureLimit) {
+        runningPostsToRemove[sourceURL].count += 1;
+        runningPostsToRemove[sourceURL].timestamp = Date.now();
+        return;
+    }
     feedStorage.getRecordBy('link', sourceURL, (err, record) => {
         if (err) log.ERROR("Error getting record from feedStorage", err);
@@ -62,13 +83,14 @@ exports.removeSource = function removeSource(sourceURL) {
 /**
  * Unset a source URL from deletion if the source has not already been deleted
  * @param {*} sourceURL The source URL to be unset from deletion
  * @returns {*}
  */
 exports.unsetRemoveSource = function unsetRemoveSource(sourceURL) {
     log.INFO("Unsetting source URL from deletion (if not already deleted): ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) return;
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) return;
-    if (runningPostsToRemove[sourceURL] > sourceFailureLimit) return delete runningPostsToRemove[sourceURL];
+    delete runningPostsToRemove[sourceURL];
+    return
 }
 /**
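For context, a hedged sketch of how a feed poller might drive the two exports above; only removeSource and unsetRemoveSource come from this file, while the poller and the require path are illustrative assumptions:

// Illustrative caller: repeated fetch failures feed the backoff above,
// and a successful fetch clears any pending deletion for that source.
const feedErrors = require("./feedErrorWrapper"); // assumed path, not the real filename

async function pollFeed(sourceURL, fetchFeed) {
    try {
        await fetchFeed(sourceURL);
        feedErrors.unsetRemoveSource(sourceURL); // success: forget earlier failures
    } catch (err) {
        feedErrors.removeSource(sourceURL);      // failure: count it against the backoff
    }
}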

View File

@@ -0,0 +1,40 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class RecordingSpider(scrapy.Spider):
+    name = "recording-scraper"
+    start_urls = [
+        'https://radio.vpn.cusano.net/sdr/transmissions',
+    ]
+
+    def parse(self, response):
+        print("ASDASDD")
+        print(response)
+        for row in response.css("tr"):
+            if row.css('td.py-1'):
+                links = row.css('a')
+                rows = row.css('td.py-1')
+                print(row)
+                yield {
+                    'device': rows[0],
+                    'date': rows[1],
+                    'duration': rows[2],
+                    "frequency": rows[3],
+                    "link": links[0].attrib["href"],
+                }
+        next_page_url = response.css("a.page-link > a::attr(href)").extract_first()
+        if next_page_url is not None:
+            yield scrapy.Request(response.urljoin(next_page_url))
+
+
+process = CrawlerProcess(
+    settings={
+        "FEEDS": {
+            "items.json": {"format": "json"},
+        },
+    }
+)
+process.crawl(RecordingSpider)
+process.start()  # the script will block here until the crawling is finished

View File

@@ -0,0 +1,3 @@
+scrapy
+fake-useragent
+beautifulsoup4