Initial recording scraper

2023-06-17 17:33:24 -04:00
parent f4475dc9d7
commit 77deb3ba2b
2 changed files with 43 additions and 0 deletions
--- a/Server/modules/radioRecordingScraper/recordings_spider.py
+++ b/Server/modules/radioRecordingScraper/recordings_spider.py
@@ -0,0 +1,40 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+class RecordingSpider(scrapy.Spider):
+    name = "recording-scraper"
+    start_urls = [
+        'https://radio.vpn.cusano.net/sdr/transmissions',
+    ]
+
+    def parse(self, response):
+        print("ASDASDD")
+        print(response)
+        for row in response.css("tr"):            
+            if row.css('td.py-1'):
+                links = row.css('a')
+                rows = row.css('td.py-1')
+                print(row)
+                yield {
+                    'device': rows[0],
+                    'date': rows[1],
+                    'duration': rows[2],
+                    "frequency": rows[3],
+                    "link": links[0].attrib["href"],
+                }
+            
+        next_page_url = response.css("a.page-link > a::attr(href)").extract_first()
+        if next_page_url is not None:
+            yield scrapy.Request(response.urljoin(next_page_url))
+
+
+process = CrawlerProcess(
+    settings={
+        "FEEDS": {
+            "items.json": {"format": "json"},
+        },
+    }
+)
+
+process.crawl(RecordingSpider)
+process.start()  # the script will block here until the crawling is finished
--- a/Server/modules/radioRecordingScraper/requirements.txt
+++ b/Server/modules/radioRecordingScraper/requirements.txt
@@ -0,0 +1,3 @@
+scrapy
+fake-useragent
+beautifulsoup4