From 96b0bf6adbed12962b74d5f956381d25dfc63c93 Mon Sep 17 00:00:00 2001
From: Logan Cusano
Date: Fri, 16 Jun 2023 21:53:54 -0400
Subject: [PATCH] Initial push for #17

- Partial functionality
- No pagination
- Only 9 results
---
 Server/modules/mlsScraper/main.py          | 189 +++++++++++++++++++++
 Server/modules/mlsScraper/requirements.txt |   4 +
 2 files changed, 193 insertions(+)
 create mode 100644 Server/modules/mlsScraper/main.py
 create mode 100644 Server/modules/mlsScraper/requirements.txt

diff --git a/Server/modules/mlsScraper/main.py b/Server/modules/mlsScraper/main.py
new file mode 100644
index 0000000..abf5842
--- /dev/null
+++ b/Server/modules/mlsScraper/main.py
@@ -0,0 +1,189 @@
+import re
+import json
+import pandas as pd
+import requests
+import os
+import random
+from fake_useragent import UserAgent
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse, unquote, parse_qs
+from time import sleep
+
+ua = UserAgent()
+
+
+# Simple scrape: GET the URL through a session that mimics the captured browser request
+def scrape(url, **kwargs):
+    session = requests.Session()
+    session.headers.update({
+        'User-Agent': ua.random,
+        "authority": "www.zillow.com",
+        "accept": "*/*",
+        "accept-language": "en-US,en;q=0.9",
+        "cache-control": "no-cache",
+        "sec-ch-ua-mobile": "?0",
+        "sec-ch-ua-platform": '"Windows"',
+        "sec-fetch-dest": "empty",
+        "sec-fetch-mode": "cors",
+        "sec-fetch-site": "same-origin",
+    })
+
+    response = session.get(url, **kwargs)
+
+    return response
+
+
+# Return all tags of the given type that match the given attributes
+def slurp(html, tag, attributes):
+    return BeautifulSoup(html, features="html.parser").findAll(tag, attributes)
+
+
+# Return the first group of digits in the given string as an int
+# (note: "1,200" yields 1, since the comma ends the first digit group)
+def return_numbers(string):
+    return int(re.findall(r'\d+', string)[0])
+
+
+class Listing:
+    def __init__(self, address, bedrooms, bathrooms, sqft, price, link):
+        self.address = address
+        self.bedrooms = bedrooms
+        self.bathrooms = bathrooms
+        self.sqft = sqft
+        self.price = price
+        self.link = link
+
+
+class ScrapeZillowListings:
+    def __init__(self, url):
+        self.parsed_original_url = self.init_check_url(urlparse(url))
+        self.html = scrape(url).text
+        self.listings = []
+
+    def init_check_url(self, parsed_url):
+        # Debug: check whether we are requesting listResults via the "wants" param
+        # (assumes the URL carries a "wants" query parameter, as the JSON endpoint does)
+        print(parsed_url)
+        print(unquote(parsed_url.query))
+        print(parse_qs(parsed_url.query)['wants'])
+        for want in parse_qs(parsed_url.query)['wants']:
+            print(unquote(unquote(want)))
+
+        return parsed_url
+
+    def run(self):
+        self.listings.extend(self.scrape_listings(self.html))
+        for page_nav in slurp(self.html, "nav", {"role": "navigation", "aria-label": "Pagination"}):
+            page_nav = f"{page_nav}"  # re-parse this tag's HTML as its own soup
+            pages_list = slurp(page_nav, "li", {})
+            for page in pages_list:
+                if re.match(r"\d{1,2}", page.text) and not page.text == "1":
+                    parsed_url = self.setup_url(page.find('a').get('href'))
+                    sleep(random.randint(0, 15))  # random delay between page requests
+                    temp_html = scrape(parsed_url.geturl()).text
+                    self.listings.extend(self.scrape_listings(temp_html))
+
+        return self.listings
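+
+    # A minimal sketch of an alternative pagination path, assuming Zillow's
+    # searchQueryState accepts a {"pagination": {"currentPage": N}} object --
+    # an assumption inferred from the captured request, not verified here.
+    # run() above, which scrapes the nav element, is what this patch exercises.
+    def build_page_url(self, page_number):
+        from urllib.parse import urlencode  # local import: only this sketch needs it
+        query = parse_qs(self.parsed_original_url.query)
+        # parse_qs already percent-decodes values, so the state is plain JSON here
+        state = json.loads(query["searchQueryState"][0])
+        state["pagination"] = {"currentPage": page_number}  # hypothetical key
+        query["searchQueryState"] = [json.dumps(state)]
+        return (f"{self.parsed_original_url.scheme}://{self.parsed_original_url.netloc}"
+                f"{self.parsed_original_url.path}?{urlencode(query, doseq=True)}")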
+
+    def print_listings(self):
+        for index, temp_listing in enumerate(self.listings):
+            print("--------")
+            print(f"Listing #{index}")
+            print(temp_listing.address)
+            print(temp_listing.price)
+            print(temp_listing.bedrooms)
+            print(temp_listing.bathrooms)
+            print(temp_listing.sqft)
+            print(temp_listing.link)
+            print("--------")
+
+    def scrape_listings(self, html):
+        temp_listings = []
+        for listing in slurp(html, "article", {"data-test": "property-card"}):
+            listing = f"{listing}"  # re-parse this card's HTML as its own soup
+
+            uls = slurp(listing, "li", {})
+            beds = 0
+            baths = 0
+            sqft = 0
+            for ul in uls:
+                ul = ul.get_text()
+                if "bds" in ul:
+                    beds = return_numbers(ul)
+                if "ba" in ul:
+                    baths = return_numbers(ul)
+                if "sqft" in ul:
+                    sqft = return_numbers(ul)
+
+            temp_listings.append(Listing(
+                address=slurp(listing, "address", {"data-test": "property-card-addr"})[0].get_text(),
+                bedrooms=beds,
+                bathrooms=baths,
+                sqft=sqft,
+                price=slurp(listing, "span", {"data-test": "property-card-price"})[0].get_text(),
+                link=slurp(listing, "a", {"data-test": "property-card-link"})[0].get('href'),
+            ))
+
+        return temp_listings
+
+    def setup_url(self, url):
+        parsed_url = urlparse(url)
+        print(parsed_url)
+        if not parsed_url.netloc:
+            # Relative href: rebuild an absolute URL on the original scheme/host,
+            # keeping the original query string (note the "?" separator)
+            return urlparse(f"{self.parsed_original_url.scheme}://{self.parsed_original_url.netloc}{parsed_url.path}?{self.parsed_original_url.query}")
+        return parsed_url
+
+
+# Create a dataframe from the scraped listings (work in progress)
+def etl(listings):
+    for listing in listings:
+        print("--------")
+        print(listing)
+        print("--------")
+
+    print("FORCE STOP")  # WIP guard: bail out before the unfinished code below
+    exit()
+
+    # WIP: everything below is unreachable and still references names
+    # (`num`, `text`) that are not defined yet
+    # convert text to dict via json
+    dicts = [json.loads('{' + i + '}') for i in num]
+
+    # create dataframe
+    df = pd.DataFrame()
+    for ind, val in enumerate(text):
+        df[val] = dicts[ind].values()
+        df.index = dicts[ind].keys()
+
+    return df
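+
+
+# A minimal alternative sketch for etl(): build the dataframe straight from the
+# Listing objects that run() returns. The column names here are chosen for this
+# sketch; they are not an existing schema.
+def listings_to_dataframe(listings):
+    return pd.DataFrame([{
+        "address": l.address,
+        "price": l.price,
+        "bedrooms": l.bedrooms,
+        "bathrooms": l.bathrooms,
+        "sqft": l.sqft,
+        "link": l.link,
+    } for l in listings])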
+
+
+def main():
+    # Regular search-page (HTML) variant, kept for reference:
+    #scrapper = ScrapeZillowListings('https://www.zillow.com/westchester-county-ny/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Yorktown%20Heights%2C%20NY%22%2C%22mapBounds%22%3A%7B%22north%22%3A41.69948153143324%2C%22east%22%3A-72.68804025585938%2C%22south%22%3A40.83865274682678%2C%22west%22%3A-74.29479074414063%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A250000%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22sort%22%3A%7B%22value%22%3A%22days%22%7D%2C%22land%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22sche%22%3A%7B%22value%22%3Afalse%7D%2C%22schm%22%3A%7B%22value%22%3Afalse%7D%2C%22schh%22%3A%7B%22value%22%3Afalse%7D%2C%22schp%22%3A%7B%22value%22%3Afalse%7D%2C%22schr%22%3A%7B%22value%22%3Afalse%7D%2C%22schc%22%3A%7B%22value%22%3Afalse%7D%2C%22schu%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A3148%2C%22regionType%22%3A4%7D%2C%7B%22regionId%22%3A2694%2C%22regionType%22%3A4%7D%5D%2C%22pagination%22%3A%7B%7D%7D')
+    # JSON endpoint variant; the cmd.exe caret escapes from the copied curl
+    # command (see the reference capture at the bottom of this file) have been
+    # stripped so the query string percent-decodes correctly:
+    scrapper = ScrapeZillowListings("https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Yorktown%20Heights%2C%20NY%22%2C%22mapBounds%22%3A%7B%22north%22%3A42.99146217894271%2C%22east%22%3A-70.80209903627659%2C%22south%22%3A39.549453943310084%2C%22west%22%3A-77.00937442690159%7D%2C%22mapZoom%22%3A8%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A3148%2C%22regionType%22%3A4%7D%2C%7B%22regionId%22%3A2694%2C%22regionType%22%3A4%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A250000%7D%2C%22isAllHomes%22%3A%7B%22value%22%3Atrue%7D%2C%22sortSelection%22%3A%7B%22value%22%3A%22days%22%7D%2C%22isLotLand%22%3A%7B%22value%22%3Afalse%7D%2C%22isMiddleSchool%22%3A%7B%22value%22%3Afalse%7D%2C%22isHighSchool%22%3A%7B%22value%22%3Afalse%7D%2C%22includeUnratedSchools%22%3A%7B%22value%22%3Afalse%7D%2C%22isComingSoon%22%3A%7B%22value%22%3Afalse%7D%2C%22isPublicSchool%22%3A%7B%22value%22%3Afalse%7D%2C%22isPrivateSchool%22%3A%7B%22value%22%3Afalse%7D%2C%22isElementarySchool%22%3A%7B%22value%22%3Afalse%7D%2C%22isCharterSchool%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%7D&wants=%7B%22cat1%22:%5B%22mapResults%22%5D%7D&requestId=3")
+    listings = scrapper.run()
+    scrapper.print_listings()
+
+    #df = etl(listings)
+
+    return
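+
+
+# A minimal sketch, assuming GetSearchPageState.htm answers with JSON rather
+# than the HTML cards that scrape_listings() expects (which would explain the
+# truncated results). The "cat1" -> "searchResults" -> "mapResults" key path is
+# inferred from the wants= parameter of the captured request, not verified here.
+def fetch_map_results(url):
+    payload = scrape(url).json()
+    return payload.get("cat1", {}).get("searchResults", {}).get("mapResults", [])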
+
+
+if __name__ == "__main__":
+    main()
+
+
+# Reference: the original "copy as cURL (cmd)" capture this module is based on
+#curl "https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState=^%^7B^%^22pagination^%^22^%^3A^%^7B^%^7D^%^2C^%^22usersSearchTerm^%^22^%^3A^%^22Yorktown^%^20Heights^%^2C^%^20NY^%^22^%^2C^%^22mapBounds^%^22^%^3A^%^7B^%^22north^%^22^%^3A42.99146217894271^%^2C^%^22east^%^22^%^3A-70.80209903627659^%^2C^%^22south^%^22^%^3A39.549453943310084^%^2C^%^22west^%^22^%^3A-77.00937442690159^%^7D^%^2C^%^22mapZoom^%^22^%^3A8^%^2C^%^22regionSelection^%^22^%^3A^%^5B^%^7B^%^22regionId^%^22^%^3A3148^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^2C^%^7B^%^22regionId^%^22^%^3A2694^%^2C^%^22regionType^%^22^%^3A4^%^7D^%^5D^%^2C^%^22isMapVisible^%^22^%^3Atrue^%^2C^%^22filterState^%^22^%^3A^%^7B^%^22price^%^22^%^3A^%^7B^%^22max^%^22^%^3A250000^%^7D^%^2C^%^22isAllHomes^%^22^%^3A^%^7B^%^22value^%^22^%^3Atrue^%^7D^%^2C^%^22sortSelection^%^22^%^3A^%^7B^%^22value^%^22^%^3A^%^22days^%^22^%^7D^%^2C^%^22isLotLand^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isMiddleSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isHighSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22includeUnratedSchools^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isComingSoon^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPublicSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isPrivateSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isElementarySchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^2C^%^22isCharterSchool^%^22^%^3A^%^7B^%^22value^%^22^%^3Afalse^%^7D^%^7D^%^2C^%^22isListVisible^%^22^%^3Atrue^%^7D&wants=^\{^%^22cat1^%^22:^\[^%^22mapResults^%^22^\]^\}&requestId=3",
+# "authority: www.zillow.com",
+# "accept: */*",
+# "accept-language: en-US,en;q=0.9",
+# "cache-control: no-cache",
+# "cookie: JSESSIONID=97FD1EB701E102B7353E8EA4528843CE; zguid=24^|^%^24825bd6e9-4f90-46df-a475-4a9910b5847c; zgsession=1^|a6a5b7ca-c651-45a2-93c8-c5b66fea68d3; AWSALB=oQ3DGTMPgyQOTPA6zLmQ0liqJ1oax2QoQ5rUSCsORkWP52C7k6G8H1gZnlxOtgU/zzO503UHUnQ7tUeivhOnupv7aYI6+E5LxUZl4TeE0JyhvT3pZ6LYeC9iFbTw; AWSALBCORS=oQ3DGTMPgyQOTPA6zLmQ0liqJ1oax2QoQ5rUSCsORkWP52C7k6G8H1gZnlxOtgU/zzO503UHUnQ7tUeivhOnupv7aYI6+E5LxUZl4TeE0JyhvT3pZ6LYeC9iFbTw; search=6^|1689549806090^%^7Crect^%^3D42.92311815473404^%^252C-70.80209903627659^%^252C39.62142250427077^%^252C-77.00937442690159^%^26rid^%^3D2694^%^26disp^%^3Dmap^%^26mdm^%^3Dauto^%^26p^%^3D1^%^26sort^%^3Ddays^%^26z^%^3D1^%^26listPriceActive^%^3D1^%^26type^%^3Dhouse^%^252Ccondo^%^252Capartment_duplex^%^252Cmobile^%^252Ctownhouse^%^26lt^%^3Dfsba^%^252Cfsbo^%^252Cfore^%^252Cnew^%^252Cauction^%^26price^%^3D0-250000^%^26fs^%^3D1^%^26fr^%^3D0^%^26mmm^%^3D0^%^26rs^%^3D0^%^26ah^%^3D0^%^26singlestory^%^3D0^%^26housing-connector^%^3D0^%^26abo^%^3D0^%^26garage^%^3D0^%^26pool^%^3D0^%^26ac^%^3D0^%^26waterfront^%^3D0^%^26finished^%^3D0^%^26unfinished^%^3D0^%^26cityview^%^3D0^%^26mountainview^%^3D0^%^26parkview^%^3D0^%^26waterview^%^3D0^%^26hoadata^%^3D1^%^26zillow-owned^%^3D0^%^263dhome^%^3D0^%^26featuredMultiFamilyBuilding^%^3D0^%^26commuteMode^%^3Ddriving^%^26commuteTimeOfDay^%^3Dnow^%^09^%^092694^%^09^%^09^%^09^%^09^%^09^%^09",
+# "pragma: no-cache",
+# "sec-ch-ua: ^\^"Not.A/Brand^\^";v=^\^"8^\^", ^\^"Chromium^\^";v=^\^"114^\^", ^\^"Google Chrome^\^";v=^\^"114^\^"",
+# "sec-ch-ua-mobile: ?0",
+# "sec-ch-ua-platform: ^\^"Windows^\^"",
+# "sec-fetch-dest: empty",
+# "sec-fetch-mode: cors",
+# "sec-fetch-site: same-origin",
+# "user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
+# "x-kl-ajax-request: Ajax_Request",
+# --compressed
\ No newline at end of file
diff --git a/Server/modules/mlsScraper/requirements.txt b/Server/modules/mlsScraper/requirements.txt
new file mode 100644
index 0000000..cc70a65
--- /dev/null
+++ b/Server/modules/mlsScraper/requirements.txt
@@ -0,0 +1,4 @@
+pandas
+requests
+fake-useragent
+beautifulsoup4
\ No newline at end of file