From b062accb68d5d0f3c3e8fc4b99a14662e677f1e7 Mon Sep 17 00:00:00 2001
From: Chandler Swift
Date: Thu, 29 Feb 2024 23:19:08 -0600
Subject: [PATCH] Fetch lat/lon for cities for eventual map

Download the Census state code list and, for the first 51 entries (the
50 states and DC), the TIGER/Line 2020 place shapefile. Each shapefile
is converted to GeoJSON with ogr2ogr, and the INTPTLAT/INTPTLON of every
place is stored keyed by (STATEFP, PLACEFP).

While building the per-state city lists, attach that location to each
city. Rows with no matching place print a warning and fall back to
[0, 0]. Progress messages are printed for every stage.
---
Review notes (not part of the commit message):
* The state-list and per-state archive downloads now call
  raise_for_status(), matching the existing population download.
* ogr2ogr is run with an argument list and check=True instead of an
  interpolated shell string, so unusual temp paths cannot break the
  command and a failed conversion stops the run at the failing step.
* The GeoJSON file is opened explicitly as UTF-8.
* The per-warning `import time` / sleep(0.1) is dropped; it did not
  affect the generated data.
* The post-image blob id on the `index` line predates these changes.
* This patch was rebuilt from a whitespace-collapsed copy. Indentation
  assumes four-space indents, and the INCORPORATED_PLACE -/+ pair is
  presumably a whitespace-only change whose exact bytes were lost.
  Verify both against the repository before applying.

 get_data.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/get_data.py b/get_data.py
index aa41f38..7df10f1 100644
--- a/get_data.py
+++ b/get_data.py
@@ -1,13 +1,49 @@
 import requests
 import csv
 import json
+import zipfile
+import tempfile
+import os
+import subprocess
+import io
 
 # https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2020-2022/SUB-EST2022.pdf
-INCORPORATED_PLACE = "162"
+INCORPORATED_PLACE = "162"
 
+# Get state FIPS/ANSI codes and other data
+# from https://www.census.gov/library/reference/code-lists/ansi.html#states
+print("Fetching states…", flush=True, end="")
+res = requests.get("https://www2.census.gov/geo/docs/reference/codes2020/national_state2020.txt")
+res.raise_for_status()
+states = list(csv.DictReader(res.text.split('\n'), delimiter='|'))
+# {'STATE': 'AL', 'STATEFP': '01', 'STATENS': '01779775', 'STATE_NAME': 'Alabama'}
+print("done")
+
+# Find geographic centers of cities
+place_locations = {}
+for state in states[:51]: # Just the 50 and DC, not Guam/American Samoa/PR/etc
+    with tempfile.TemporaryDirectory() as tmpdir:
+        print(f"Fetching data for {state['STATE_NAME']}…", flush=True, end="")
+        res = requests.get(f"https://www2.census.gov/geo/tiger/TIGER2020/PLACE/tl_2020_{state['STATEFP']}_place.zip")
+        res.raise_for_status()
+        print("processing…", end="", flush=True)
+        zipfile.ZipFile(io.BytesIO(res.content)).extractall(tmpdir)
+        shapefile_name = os.path.join(tmpdir, f"tl_2020_{state['STATEFP']}_place.shp")
+        geojson_file_name = os.path.join(tmpdir, "out.geojson")
+        subprocess.run(["ogr2ogr", "-f", "GeoJSON", geojson_file_name, shapefile_name], check=True)
+        with open(geojson_file_name, encoding="utf-8") as f:
+            data = json.load(f)
+        for feature in data['features']:
+            # {"type": "Feature", "properties": {"STATEFP": "01", "PLACEFP": "02260", "PLACENS": "02405163", "GEOID": "0102260", "NAME": "Ardmore", "NAMELSAD": "Ardmore town", "LSAD": "43", "CLASSFP": "C1", "PCICBSA": "N", "PCINECTA": "N", "MTFCC": "G4110", "FUNCSTAT": "A", "ALAND": 5289895, "AWATER": 21830, "INTPTLAT": "+34.9878376", "INTPTLON": "-086.8290225"}, "geometry": {"type": "Polygon", "coordinates": [[[-86.856689, 34.992046], [-86.855354, 34.992044], [-86.855101, 34.99204]
+            state_place = (feature['properties']['STATEFP'], feature['properties']['PLACEFP'])
+            lat_lon = (float(feature['properties']['INTPTLAT']), float(feature['properties']['INTPTLON']))
+            place_locations[state_place] = lat_lon
+        print("done")
+
+print("Fetching population data for all states…", flush=True, end="")
 res = requests.get("https://www2.census.gov/programs-surveys/popest/datasets/2020-2022/cities/totals/sub-est2022.csv")
 res.raise_for_status()
-
+print("processing…", flush=True, end="")
 cities_by_state = {}
 for line in csv.DictReader(res.content.decode('utf-8-sig').split('\n')):
     if line['SUMLEV'] != INCORPORATED_PLACE:
@@ -16,11 +52,34 @@ for line in csv.DictReader(res.content.decode('utf-8-sig').split('\n')):
     if not line['STNAME'] in cities_by_state:
         cities_by_state[line['STNAME']] = []
 
+    try:
+        loc = place_locations[(line['STATE'], line['PLACE'])]
+    except KeyError:
+        # TODO: why do these happen? Currently these:
+        # WARN: KeyError for ('17', '10373')
+        # WARN: KeyError for ('17', '31991')
+        # WARN: KeyError for ('27', '13708')
+        # WARN: KeyError for ('36', '75779')
+        # WARN: KeyError for ('40', '43725')
+        # WARN: KeyError for ('40', '49860')
+        # WARN: KeyError for ('48', '21031')
+        # WARN: KeyError for ('48', '23176')
+        # WARN: KeyError for ('48', '58502')
+        # WARN: KeyError for ('48', '73493')
+        # WARN: KeyError for ('55', '31525')
+        # WARN: KeyError for ('55', '82575')
+        # WARN: KeyError for ('55', '84275')
+        # Well, we'll just shove 'em on Null Island, I guess
+        loc = [0,0]
+        print("WARN: KeyError for", (line['STATE'], line['PLACE']))
     cities_by_state[line['STNAME']].append({
         "name": " ".join(line['NAME'].split(" ")[:-1]), # Remove "city" or "town" from the end
         "pop": int(line['POPESTIMATE2022']),
+        "location": loc,
     })
+print("done")
 
+print("Writing data to disk…", flush=True, end="")
 for state, cities in cities_by_state.items():
     cities.sort(key=lambda i: i["pop"], reverse=True)
 
@@ -29,11 +88,10 @@ for state, cities in cities_by_state.items():
 
 with open(f"data/states.json", 'w') as f:
     f.write(json.dumps(list(cities_by_state.keys())))
+print("done")
 
 # ----- MAP -----
-
-import subprocess
-
+print("Fetching state outlines…", flush=True, end="")
 CMD="""
 curl --silent --remote-name https://www2.census.gov/geo/tiger/GENZ2022/shp/cb_2022_us_state_20m.zip
 unzip -q -o cb_2022_us_state_20m.zip
@@ -43,3 +101,4 @@ rm cb_2022_us_state_20m.*
 """
 
 subprocess.run(CMD, shell=True)
+print("done")