diff --git a/Server/modules/radioRecordingScraper/recordings_spider.py b/Server/modules/radioRecordingScraper/recordings_spider.py
new file mode 100644
--- /dev/null
+++ b/Server/modules/radioRecordingScraper/recordings_spider.py
@@ -0,0 +1,45 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+
+class RecordingSpider(scrapy.Spider):
+    """Scrape the SDR transmissions table into dict items (one per recording)."""
+
+    name = "recording-scraper"
+    start_urls = [
+        'https://radio.vpn.cusano.net/sdr/transmissions',
+    ]
+
+    def parse(self, response):
+        """Yield one item per data row, then follow pagination if present."""
+        for row in response.css("tr"):
+            cells = row.css("td.py-1")
+            if len(cells) < 4:
+                continue  # header/spacer rows have no td.py-1 cells
+            # Extract text rather than yielding Selector objects, so the
+            # JSON feed contains the actual values instead of selector reprs.
+            yield {
+                'device': cells[0].css("::text").get(default="").strip(),
+                'date': cells[1].css("::text").get(default="").strip(),
+                'duration': cells[2].css("::text").get(default="").strip(),
+                "frequency": cells[3].css("::text").get(default="").strip(),
+                # .get() returns None instead of raising when a row has no link
+                "link": row.css("a::attr(href)").get(),
+            }
+
+        # NOTE(review): the original selector "a.page-link > a" looked for an
+        # <a> nested inside an <a> and could never match; a.page-link is
+        # itself the anchor in Bootstrap pagination — confirm against the page.
+        next_page_url = response.css("a.page-link::attr(href)").get()
+        if next_page_url is not None:
+            yield scrapy.Request(response.urljoin(next_page_url))
+
+
+if __name__ == "__main__":
+    # Guarded so importing this module does not start a crawl.
+    process = CrawlerProcess(
+        settings={
+            "FEEDS": {
+                "items.json": {"format": "json"},
+            },
+        }
+    )
+    process.crawl(RecordingSpider)
+    process.start()  # blocks here until the crawl is finished
diff --git a/Server/modules/radioRecordingScraper/requirements.txt b/Server/modules/radioRecordingScraper/requirements.txt
new file mode 100644
--- /dev/null
+++ b/Server/modules/radioRecordingScraper/requirements.txt
@@ -0,0 +1,3 @@
+scrapy
+fake-useragent
+beautifulsoup4