Initial recording scraper
This commit is contained in:
40
Server/modules/radioRecordingScraper/recordings_spider.py
Normal file
40
Server/modules/radioRecordingScraper/recordings_spider.py
Normal file
@@ -0,0 +1,40 @@
import scrapy
from scrapy.crawler import CrawlerProcess
class RecordingSpider(scrapy.Spider):
    """Scrape SDR radio transmission recordings.

    Crawls the transmissions listing at radio.vpn.cusano.net, yielding one
    item per data row (device, date, duration, frequency, download link)
    and following pagination links until no next page remains.
    """

    name = "recording-scraper"
    start_urls = [
        'https://radio.vpn.cusano.net/sdr/transmissions',
    ]

    def parse(self, response):
        """Parse one listing page.

        Yields a dict per recording row, then (if present) a Request for
        the next page.
        """
        for row in response.css("tr"):
            # Data rows are distinguished from header rows by 'td.py-1' cells.
            cells = row.css('td.py-1')
            links = row.css('a')
            # Skip header/malformed rows instead of raising IndexError.
            if len(cells) < 4 or not links:
                continue
            yield {
                # Extract the cell text: yielding raw Selector objects is
                # not JSON-serializable by the feed exporter.
                'device': cells[0].css('::text').get(default='').strip(),
                'date': cells[1].css('::text').get(default='').strip(),
                'duration': cells[2].css('::text').get(default='').strip(),
                "frequency": cells[3].css('::text').get(default='').strip(),
                "link": links[0].attrib.get("href"),
            }

        # Follow pagination. The previous selector 'a.page-link > a' looked
        # for an <a> nested inside an <a>, which never matches; the href
        # lives on the a.page-link element itself.
        next_page_url = response.css("a.page-link::attr(href)").get()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))


if __name__ == "__main__":
    # Guarded so that importing this module (e.g. for the spider class)
    # does not launch a crawl as a side effect.
    process = CrawlerProcess(
        settings={
            # Export every scraped item to a JSON file in the working dir.
            "FEEDS": {
                "items.json": {"format": "json"},
            },
        }
    )
    process.crawl(RecordingSpider)
    process.start()  # the script will block here until the crawling is finished
3
Server/modules/radioRecordingScraper/requirements.txt
Normal file
3
Server/modules/radioRecordingScraper/requirements.txt
Normal file
@@ -0,0 +1,3 @@
scrapy
fake-useragent
beautifulsoup4
Reference in New Issue
Block a user