Initial recording scraper
This commit is contained in:
40
Server/modules/radioRecordingScraper/recordings_spider.py
Normal file
40
Server/modules/radioRecordingScraper/recordings_spider.py
Normal file
@@ -0,0 +1,40 @@
import scrapy
from scrapy.crawler import CrawlerProcess
class RecordingSpider(scrapy.Spider):
    """Scrape SDR radio transmission recordings.

    Crawls the transmissions listing at radio.vpn.cusano.net, yielding one
    item per data row (device, date, duration, frequency, download link)
    and following pagination links until no next page remains.
    """

    name = "recording-scraper"
    start_urls = [
        'https://radio.vpn.cusano.net/sdr/transmissions',
    ]

    def parse(self, response):
        """Parse one listing page.

        Yields a dict per recording row, then (if present) a Request for
        the next page.
        """
        for row in response.css("tr"):
            # Data rows are distinguished from header rows by 'td.py-1' cells.
            cells = row.css('td.py-1')
            links = row.css('a')
            # Skip header/malformed rows instead of raising IndexError.
            if len(cells) < 4 or not links:
                continue
            yield {
                # Extract the cell text: yielding raw Selector objects is
                # not JSON-serializable by the feed exporter.
                'device': cells[0].css('::text').get(default='').strip(),
                'date': cells[1].css('::text').get(default='').strip(),
                'duration': cells[2].css('::text').get(default='').strip(),
                "frequency": cells[3].css('::text').get(default='').strip(),
                "link": links[0].attrib.get("href"),
            }

        # Follow pagination. The previous selector 'a.page-link > a' looked
        # for an <a> nested inside an <a>, which never matches; the href
        # lives on the a.page-link element itself.
        next_page_url = response.css("a.page-link::attr(href)").get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))


if __name__ == "__main__":
    # Guarded so that importing this module (e.g. for the spider class)
    # does not launch a crawl as a side effect.
    process = CrawlerProcess(
        settings={
            # Export every scraped item to a JSON file in the working dir.
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )
    process.crawl(RecordingSpider)
    process.start()  # the script will block here until the crawling is finished
3
Server/modules/radioRecordingScraper/requirements.txt
Normal file
3
Server/modules/radioRecordingScraper/requirements.txt
Normal file
@@ -0,0 +1,3 @@
scrapy
fake-useragent
beautifulsoup4
Reference in New Issue
Block a user