#!/usr/bin/env python3
import json
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup

# Stolen from my machine, appears to work; sufficient and necessary to get
# around their firewall apparently?
UA = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0"
}

# Fetch the locations index page and pull out the per-location links.
response = requests.get("https://punchpizza.com/locations/", headers=UA)
soup = BeautifulSoup(response.text, "html.parser")
# Two rows with the same id :eyeroll:
location_links = soup.select("div.loctop > .wpb_wrapper > ul > li > a")

locations = []
for location_link in location_links:
    location_response = requests.get(
        urllib.parse.urljoin("https://punchpizza.com", location_link["href"]),
        headers=UA,
    )
    # Each location page embeds its coordinates in an inline script tag.
    latlon = re.search(
        r"var punchloc = \{lat: ([0-9.-]+), lng: ([0-9.-]+)\};",
        location_response.text,
    )
    if not latlon:
        raise Exception(f"No latlon found at {location_link['href']}")
    locations.append({
        "type": "Feature",
        "geometry": {
            "type": "Point",
            # yes, [lon, lat] since it's [x, y]
            "coordinates": [float(latlon[2]), float(latlon[1])],
        },
        # GeoJSON (RFC 7946) requires a "properties" member on every Feature.
        # TODO: addresses are kind of a mess
        "properties": {},
    })

geojson = {
    "type": "FeatureCollection",
    "features": locations,
}

print(len(locations), "locations found")
with open("data.geojson", "w") as f:
    json.dump(geojson, f)
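
# A short sanity check one could tack on (a sketch, assuming the scrape above
# succeeded and wrote data.geojson): reload the file and confirm each feature
# carries a plausible [lon, lat] pair in world-coordinate range.
with open("data.geojson") as f:
    fc = json.load(f)
assert fc["type"] == "FeatureCollection"
for feature in fc["features"]:
    lon, lat = feature["geometry"]["coordinates"]
    # Coordinates are [x, y] = [lon, lat], so lon in [-180, 180], lat in [-90, 90].
    assert -180 <= lon <= 180 and -90 <= lat <= 90, feature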