Compare commits
3 Commits
feature/#1 ... feature/#1

| Author | SHA1 | Date |
|---|---|---|
| | 77deb3ba2b | |
| | f4475dc9d7 | |
| | c4650a9e99 | |
@@ -40,7 +40,8 @@ module.exports = {
         .addStringOption(option =>
             option.setName("bot")
                 .setDescription("The bot to disconnect from the server")
-                .setAutocomplete(true)),
+                .setAutocomplete(true)
+                .setRequired(true)),
     example: "leave",
     isPrivileged: false,
     requiresTokens: false,
@@ -32,17 +32,38 @@ var runningPostsToRemove = [{
 }]
 */
 var runningPostsToRemove = {};
-const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 3;
+const sourceFailureLimit = process.env.SOURCE_FAILURE_LIMIT ?? 15;
 
 /**
+ * Wrapper for feeds that cause errors. By default it will wait over a day for the source to come back online before deleting it.
  *
- * @param {*} sourceURL
+ * @param {string} sourceURL The URL of the feed source causing issues
  */
 exports.removeSource = function removeSource(sourceURL) {
     log.INFO("Removing source URL: ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) {runningPostsToRemove[sourceURL] = 1; return;}
+    // Check to see if this is the first time this source has been attempted
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) {
+        runningPostsToRemove[sourceURL] = { count: 1, timestamp: Date.now(), ignoredAttempts: 0 };
+        return;
+    }
 
-    if (runningPostsToRemove[sourceURL] < sourceFailureLimit) {runningPostsToRemove[sourceURL] += 1; return;}
+    const backoffDateTimeDifference = (Date.now() - new Date(runningPostsToRemove[sourceURL].timestamp));
+    const backoffWaitTime = (runningPostsToRemove[sourceURL].count * 30000);
+
+    log.DEBUG("Datetime", runningPostsToRemove[sourceURL], backoffDateTimeDifference, backoffWaitTime);
+
+    // Check to see if the last error occurred within the backoff period or if we should try again
+    if (backoffDateTimeDifference <= backoffWaitTime) {
+        runningPostsToRemove[sourceURL].ignoredAttempts += 1;
+        return;
+    }
+
+    // Increase the retry counter
+    if (runningPostsToRemove[sourceURL].count < sourceFailureLimit) {
+        runningPostsToRemove[sourceURL].count += 1;
+        runningPostsToRemove[sourceURL].timestamp = Date.now();
+        return;
+    }
 
     feedStorage.getRecordBy('link', sourceURL, (err, record) => {
         if (err) log.ERROR("Error getting record from feedStorage", err);
@@ -62,13 +83,14 @@ exports.removeSource = function removeSource(sourceURL) {
 /**
  * Unset a source URL from deletion if the source has not already been deleted
  * @param {*} sourceURL The source URL to be unset from deletion
- * @returns {*}
  */
 exports.unsetRemoveSource = function unsetRemoveSource(sourceURL) {
     log.INFO("Unsetting source URL from deletion (if not already deleted): ", sourceURL);
-    if (!sourceURL in runningPostsToRemove) return;
+    if (!Object.keys(runningPostsToRemove).includes(sourceURL)) return;
 
-    if (runningPostsToRemove[sourceURL] > sourceFailureLimit) return delete runningPostsToRemove[sourceURL];
+    delete runningPostsToRemove[sourceURL];
+
+    return
 }
 
 /**
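Taken together, the two hunks above replace a flat retry counter with a per-source record of `{ count, timestamp, ignoredAttempts }` and an escalating backoff. A minimal sketch (Python, not part of the commit) of the arithmetic, assuming the 30 000 ms step and the default SOURCE_FAILURE_LIMIT of 15 from the first hunk:

```python
# Sketch only: models the minimum enforced delay before removeSource
# finally deletes a failing source, under the 30s step and default limit
# of 15 taken from the hunk above (both are assumptions read off the diff).
BACKOFF_STEP_MS = 30_000
SOURCE_FAILURE_LIMIT = 15

# The count can only advance once `count * 30s` has elapsed since the last
# advance, so the minimum total wait is the sum of the backoff windows.
total_ms = sum(count * BACKOFF_STEP_MS for count in range(1, SOURCE_FAILURE_LIMIT + 1))
print(f"minimum enforced backoff: {total_ms / 60_000:.0f} minutes")  # -> 60 minutes
```

In practice the wall-clock time is much longer than that lower bound: the counter only moves when a poll actually fails outside the current window, and failures inside the window just bump `ignoredAttempts`, which is presumably how the doc comment's "over a day" figure comes about.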
@@ -1,189 +0,0 @@
-import re
-import json
-import pandas as pd
-import requests
-import os
-import random
-from fake_useragent import UserAgent
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse, unquote, parse_qs
-from time import sleep
-ua = UserAgent()
-
-
-#simply scrape
-def scrape(url,**kwargs):
-
-    session=requests.Session()
-    session.headers.update({
-        'User-Agent': ua.random,
-        "authority": "www.zillow.com",
-        "accept": "*/*",
-        "accept-language": "en-US,en;q=0.9",
-        "cache-control": "no-cache",
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '^\^"Windows^\^"',
-        "sec-fetch-dest": "empty",
-        "sec-fetch-mode": "cors",
-        "sec-fetch-site": "same-origin",
-    })
-
-    response=session.get(url,**kwargs)
-
-    return response
-
-# Return all sections with key and attributes
-def slurp(html, tag, attributes):
-    return BeautifulSoup(html, features="html.parser").findAll(tag, attributes)
-
-# Returns the first number group from a given string
-def return_numbers(string):
-    return int(re.findall(r'\d+', string)[0])
-
-
-class Listing:
-    def __init__(self, address, bedrooms, bathrooms, sqft, price, link):
-        self.address = address
-        self.bedrooms = bedrooms
-        self.bathrooms = bathrooms
-        self.sqft = sqft
-        self.price = price
-        self.link = link
-
-
-class ScrapeZillowListings:
-    def __init__(self, url):
-        self.parsed_original_url = self.init_check_url(urlparse(url))
-        self.html = scrape(url).text
-        self.listings = []
-
-    def init_check_url(self, parsed_url):
-        # Check to see if we are requesting listResults
-        print(parsed_url)
-        print(unquote(parsed_url.query))
-        print(parse_qs(parsed_url.query)['wants'])
-        for want in parse_qs(parsed_url.query)['wants']:
-            print(unquote(unquote(want)))
-
-        return parsed_url
-
-    def run(self):
-        self.listings.extend(self.scrape_listings(self.html))
-        pages = []
-        for page_nav in slurp(self.html, "nav", {"role":"navigation", "aria-label":"Pagination"}):
-            page_nav = f"<html><head><head/><body>{page_nav}<body/><html/>"
-            pages_list = slurp(page_nav, "li", {})
-            for page in pages_list:
-                if re.match("\d{1,2}", page.text) and not page.text == "1":
-                    parsed_url = self.setup_url(page.find('a').get('href'))
-                    sleep(random.randint(0,15))
-                    temp_html = scrape(parsed_url.geturl()).text
-                    self.listings.extend(self.scrape_listings(temp_html))
-
-        return self.listings
-
-    def print_listings(self):
-        index = 0
-        for temp_listing in self.listings:
-            print("--------")
-            print(f"Listing #{index}")
-            print(temp_listing.address)
-            print(temp_listing.price)
-            print(temp_listing.bedrooms)
-            print(temp_listing.bathrooms)
-            print(temp_listing.sqft)
-            print(temp_listing.link)
-            print("--------")
-            index += 1
-
-    def scrape_listings(self, html):
-        temp_listings = []
-        for listing in slurp(html, "article", {"data-test":"property-card"}):
-            listing = f"<html><head><head/><body>{listing}<body/><html/>"
-
-            uls = slurp(listing, "li", {})
-            beds = 0
-            baths = 0
-            sqft = 0
-            for ul in uls:
-                ul = ul.get_text()
-                if ("bds" in str(ul)):
-                    beds = return_numbers(ul)
-                if ("ba" in str(ul)):
-                    baths = return_numbers(ul)
-                if ("sqft" in str(ul)):
-                    sqft = return_numbers(ul)
-
-            temp_listings.append(Listing(
-                address=slurp(listing, "address", {"data-test":"property-card-addr"})[0].get_text(),
-                bedrooms=beds,
-                bathrooms=baths,
-                sqft=sqft,
-                price=slurp(listing, "span", {"data-test":"property-card-price"})[0].get_text(),
-                link=slurp(listing, "a", {"data-test":"property-card-link"})[0].get('href'),
-            ))
-
-        return temp_listings
-
-    def setup_url(self, url):
-        parsed_url = urlparse(url)
-        print(parsed_url)
-        if not parsed_url.netloc:
-            return urlparse(f"{self.parsed_original_url.scheme}://{self.parsed_original_url.netloc}{parsed_url.path}{self.parsed_original_url.query}{self.parsed_original_url.params}")
-
-#create dataframe
-def etl(response):
-
-    #regex to find the data
-
-    for listing in listings:
-        print("--------")
-        print(listing)
-        print("--------")
-
-    print("FORCE STOP")
-    exit()
-
-    #convert text to dict via json
-    dicts=[json.loads('{'+i+'}') for i in num]
-
-    #create dataframe
-    df=pd.DataFrame()
-    for ind,val in enumerate(text):
-        df[val]=dicts[ind].values()
-        df.index=dicts[ind].keys()
-
-    return df
-
-
-def main():
-    #scrapper = ScrapeZillowListings('https://www.zillow.com/westchester-county-ny/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Yorktown%20Heights%2C%20NY%22%2C%22mapBounds%22%3A%7B%22north%22%3A41.69948153143324%2C%22east%22%3A-72.68804025585938%2C%22south%22%3A40.83865274682678%2C%22west%22%3A-74.29479074414063%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A250000%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22days%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22sche%22%3A%7B%22value%22%3Afalse%7D%2C%22schm%22%3A%7B%22value%22%3Afalse%7D%2C%22schh%22%3A%7B%22value%22%3Afalse%7D%2C%22schp%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A3148%2C%22regionType%22%3A4%7D%2C%7B%22regionId%22%3A2694%2C%22regionType%22%3A4%7D%5D%2C%22pagination%22%3A%7B%7D%7D')
-    scrapper = ScrapeZillowListings("https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=^%^7B^%^22pagination^%^22^%^3A^%^7B^%^7D^%^2C^%^22usersSearchTerm^%^22^%^3A^%^22Yorktown^%^20Heights^%^2C^%^20NY^%^22^%^2C^%^22mapBounds^%^22^%^3A^%^7B^%^22north^%^22^%^3A42.99146217894271^%^2C^%^22east^%^22^%^3A-70.80209903627659^%^2C^%^22south^%^22^%^3A39.549453943310084^%^2C^%^22west^%^22^%^3A-77.00937442690159^%^7D^%^2C^%^22mapZoom^%^22^%^3A8^%^2C^%^22regionSelection^%^22^%^3A^%^5B^%^7B^%^22regionId^%^22^%^3A3148^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^2C^%^7B^%^22regionId^%^22^%^3A2694^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^5D^%^2C^%^22isMapVisible^%^22^%^3Atrue^%^2C^%^22filterState^%^22^%^3A^%^7B^%^22price^%^22^%^3A^%^7B^%^22max^%^22^%^3A250000^%^7D^%^2C^%^22isAllHomes^%^22^%^3A^%^7B^%^22value^%^22^%^3Atrue^%^7D^%^2C^%^22sortSelection^%^22^%^3A^%^7B^%^22value^%^22^%^3A^%^22days^%^22^%^7D^%^2C^%^22isLotLand^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isMiddleSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isHighSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22includeUnratedSchools^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isComingSoon^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPublicSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPrivateSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isElementarySchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isCharterSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^7D^%^2C^%^22isListVisible^%^22^%^3Atrue^%^7D&wants=^\{^%^22cat1^%^22:^\[^%^22mapResults^%^22^\]^\}&requestId=3")
-    listings = scrapper.run()
-    scrapper.print_listings()
-
-    #df=etl(response)
-
-    return
-
-
-if __name__ == "__main__":
-    main()
-
-
-#curl "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=^%^7B^%^22pagination^%^22^%^3A^%^7B^%^7D^%^2C^%^22usersSearchTerm^%^22^%^3A^%^22Yorktown^%^20Heights^%^2C^%^20NY^%^22^%^2C^%^22mapBounds^%^22^%^3A^%^7B^%^22north^%^22^%^3A42.99146217894271^%^2C^%^22east^%^22^%^3A-70.80209903627659^%^2C^%^22south^%^22^%^3A39.549453943310084^%^2C^%^22west^%^22^%^3A-77.00937442690159^%^7D^%^2C^%^22mapZoom^%^22^%^3A8^%^2C^%^22regionSelection^%^22^%^3A^%^5B^%^7B^%^22regionId^%^22^%^3A3148^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^2C^%^7B^%^22regionId^%^22^%^3A2694^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^5D^%^2C^%^22isMapVisible^%^22^%^3Atrue^%^2C^%^22filterState^%^22^%^3A^%^7B^%^22price^%^22^%^3A^%^7B^%^22max^%^22^%^3A250000^%^7D^%^2C^%^22isAllHomes^%^22^%^3A^%^7B^%^22value^%^22^%^3Atrue^%^7D^%^2C^%^22sortSelection^%^22^%^3A^%^7B^%^22value^%^22^%^3A^%^22days^%^22^%^7D^%^2C^%^22isLotLand^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isMiddleSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isHighSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22includeUnratedSchools^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isComingSoon^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPublicSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPrivateSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isElementarySchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isCharterSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^7D^%^2C^%^22isListVisible^%^22^%^3Atrue^%^7D&wants=^\{^%^22cat1^%^22:^\[^%^22mapResults^%^22^\]^\}&requestId=3",
-#	"authority: www.zillow.com",
-#	"accept: */*",
-#	"accept-language: en-US,en;q=0.9",
-#	"cache-control: no-cache",
-#	"cookie: JSESSIONID=97FD1EB701E102B7353E8EA4528843CE; zguid=24^|^%^24825bd6e9-4f90-46df-a475-4a9910b5847c; zgsession=1^|a6a5b7ca-c651-45a2-93c8-c5b66fea68d3; AWSALB=oQ3DGTMPgyQOTPA6zLmQ0liqJ1oax2QoQ5rUSCsORkWP52C7k6G8H1gZnlxOtgU/zzO503UHUnQ7tUeivhOnupv7aYI6+E5LxUZl4TeE0JyhvT3pZ6LYeC9iFbTw; AWSALBCORS=oQ3DGTMPgyQOTPA6zLmQ0liqJ1oax2QoQ5rUSCsORkWP52C7k6G8H1gZnlxOtgU/zzO503UHUnQ7tUeivhOnupv7aYI6+E5LxUZl4TeE0JyhvT3pZ6LYeC9iFbTw; search=6^|1689549806090^%^7Crect^%^3D42.92311815473404^%^252C-70.80209903627659^%^252C39.62142250427077^%^252C-77.00937442690159^%^26rid^%^3D2694^%^26disp^%^3Dmap^%^26mdm^%^3Dauto^%^26p^%^3D1^%^26sort^%^3Ddays^%^26z^%^3D1^%^26listPriceActive^%^3D1^%^26type^%^3Dhouse^%^252Ccondo^%^252Capartment_duplex^%^252Cmobile^%^252Ctownhouse^%^26lt^%^3Dfsba^%^252Cfsbo^%^252Cfore^%^252Cnew^%^252Cauction^%^26price^%^3D0-250000^%^26fs^%^3D1^%^26fr^%^3D0^%^26mmm^%^3D0^%^26rs^%^3D0^%^26ah^%^3D0^%^26singlestory^%^3D0^%^26housing-connector^%^3D0^%^26abo^%^3D0^%^26garage^%^3D0^%^26pool^%^3D0^%^26ac^%^3D0^%^26waterfront^%^3D0^%^26finished^%^3D0^%^26unfinished^%^3D0^%^26cityview^%^3D0^%^26mountainview^%^3D0^%^26parkview^%^3D0^%^26waterview^%^3D0^%^26hoadata^%^3D1^%^26zillow-owned^%^3D0^%^263dhome^%^3D0^%^26featuredMultiFamilyBuilding^%^3D0^%^26commuteMode^%^3Ddriving^%^26commuteTimeOfDay^%^3Dnow^%^09^%^092694^%^09^%^09^%^09^%^09^%^09^%^09",
-#	"pragma: no-cache",
-#	"sec-ch-ua: ^\^"Not.A/Brand^\^";v=^\^"8^\^", ^\^"Chromium^\^";v=^\^"114^\^", ^\^"Google Chrome^\^";v=^\^"114^\^"",
-#	"sec-ch-ua-mobile: ?0",
-#	"sec-ch-ua-platform: ^\^"Windows^\^"",
-#	"sec-fetch-dest: empty",
-#	"sec-fetch-mode: cors",
-#	"sec-fetch-site: same-origin",
-#	"user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
-#	"x-kl-ajax-request: Ajax_Request",
-#	--compressed
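Everything in the deleted scraper keys off Zillow's searchQueryState query parameter, a percent-encoded JSON blob (visible in both commented-out URLs above). A hypothetical stdlib-only helper, not part of the commit, that decodes it; this assumes a plain percent-encoded URL rather than the caret-escaped cmd.exe form the curl comment preserves:

```python
import json
from urllib.parse import urlparse, parse_qs

def decode_search_query_state(url):
    # Hypothetical helper: parse_qs percent-decodes values, so the
    # searchQueryState blob comes back as raw JSON text.
    params = parse_qs(urlparse(url).query)
    return json.loads(params["searchQueryState"][0])
```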
Server/modules/radioRecordingScraper/recordings_spider.py (new file)
@@ -0,0 +1,40 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+class RecordingSpider(scrapy.Spider):
+    name = "recording-scraper"
+    start_urls = [
+        'https://radio.vpn.cusano.net/sdr/transmissions',
+    ]
+
+    def parse(self, response):
+        print("ASDASDD")
+        print(response)
+        for row in response.css("tr"):
+            if row.css('td.py-1'):
+                links = row.css('a')
+                rows = row.css('td.py-1')
+                print(row)
+                yield {
+                    'device': rows[0],
+                    'date': rows[1],
+                    'duration': rows[2],
+                    "frequency": rows[3],
+                    "link": links[0].attrib["href"],
+                }
+
+        next_page_url = response.css("a.page-link > a::attr(href)").extract_first()
+        if next_page_url is not None:
+            yield scrapy.Request(response.urljoin(next_page_url))
+
+
+process = CrawlerProcess(
+    settings={
+        "FEEDS": {
+            "items.json": {"format": "json"},
+        },
+    }
+)
+
+process.crawl(RecordingSpider)
+process.start() # the script will block here until the crawling is finished
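One caveat on the new spider: rows[0] through rows[3] are parsel Selector objects rather than strings, so the items.json feed will likely not contain the cell text directly. A sketch (not part of the commit) of the same row loop yielding plain text, assuming each transmission row keeps the td.py-1 layout with four cells; the ::text pseudo-element and .get()/.getall() are standard Scrapy selector calls:

```python
# Sketch only: same loop as the committed parse(), but extracting text.
def parse(self, response):
    for row in response.css("tr"):
        cells = row.css("td.py-1 ::text").getall()
        if not cells:
            continue  # skip header/filler rows without transmission cells
        yield {
            "device": cells[0],
            "date": cells[1],
            "duration": cells[2],
            "frequency": cells[3],
            "link": row.css("a::attr(href)").get(),
        }
```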
@@ -1,4 +1,3 @@
-pandas
-requests
+scrapy
 fake-useragent
 beautifulsoup4