2 changes: 1 addition & 1 deletion .config/mise.toml
@@ -1,5 +1,5 @@
[tools]
python = "3.13.3"
python = "3.14.1"
node = "latest"
lefthook = "latest"
yamllint = "latest"
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ __pycache__/
**/.xlsx#
*.parquet
**/.parquet
output/
30 changes: 0 additions & 30 deletions enrichers/__init__.py
@@ -5,21 +5,13 @@
"""

import copy
import requests
from schemas import enrich_resp_schema
import time
from utils import (
default_headers,
session,
)


class Enrichment(object):
_required_keys = [
"facility_name",
]
# in seconds
_wait_time: float = 1

def __init__(self, **kwargs):
self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
"""Child objects should implement this"""
return {}

def _req(self, url: str, **kwargs) -> requests.Response:
"""requests response wrapper to ensure we honor waits"""
headers = kwargs.get("headers", {})
# ensure we get all headers configured correctly
# but manually applied headers win the argument
for k, v in default_headers.items():
if k in headers.keys():
continue
headers[k] = v

response = session.get(
url,
allow_redirects=True,
timeout=kwargs.get("timeout", 10),
params=kwargs.get("params", {}),
stream=kwargs.get("stream", False),
headers=headers,
)
response.raise_for_status()
time.sleep(self._wait_time)
return response

def _minimal_clean_facility_name(self, name: str) -> str:
"""Minimal cleaning that preserves important context like 'County Jail'"""
cleaned = name
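The removed Enrichment._req wrapper is replaced at every call site below by a req_get helper imported from utils. Its implementation is not part of this diff, so the following is a minimal sketch only, assuming it keeps the behaviour the old method had: default headers that per-call headers can override, the shared session, raise_for_status, and a post-request sleep controlled by a wait_time keyword.

# Hypothetical sketch of utils.req_get (not shown in this diff); `session` and
# `default_headers` are assumed to already exist in utils, as the old _req used them.
import time

import requests

session = requests.Session()
default_headers = {"User-Agent": "facilities-scraper"}  # placeholder value


def req_get(url: str, wait_time: float = 1.0, **kwargs) -> requests.Response:
    """GET wrapper: merge default headers, raise on HTTP errors, then wait."""
    headers = kwargs.get("headers", {})
    # manually supplied headers win over the defaults
    for k, v in default_headers.items():
        headers.setdefault(k, v)
    response = session.get(
        url,
        allow_redirects=True,
        timeout=kwargs.get("timeout", 10),
        params=kwargs.get("params", {}),
        stream=kwargs.get("stream", False),
        headers=headers,
    )
    response.raise_for_status()
    time.sleep(wait_time)
    return response

This signature accepts every call pattern that appears in the diff (params=, timeout=, wait_time=).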
12 changes: 5 additions & 7 deletions enrichers/openstreetmap.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
"dedupe": 1,
},
"street_address": {
"q": f"{full_address}",
"q": full_address,
"format": "json",
"limit": 5,
"dedupe": 1,
},
"locality": {
"q": f"{locality}",
"q": locality,
"format": "json",
"limit": 5,
"dedupe": 1,
@@ -56,7 +56,7 @@
logger.debug("Searching OSM for %s", params["q"])
self.resp_info["search_query_steps"].append(params["q"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params, timeout=15)
response = req_get(search_url, params=params, timeout=15)
data.extend(response.json())
except Exception as e:
logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@
lon = first_result.get("lon", self.default_coords["longitude"])
osm_type = first_result.get("osm_type", "")
osm_id = first_result.get("osm_id", "")
self.resp_info["details"]["latitude"] = lat # type: ignore [index]
self.resp_info["details"]["longitude"] = lon # type: ignore [index]
self.resp_info["title"] = first_result.get("display_name", "")
self.resp_info["details"]["class"] = first_result.get("class", "") # type: ignore [index]
self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
if osm_type == "way":
self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
else:
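This hunk consolidates the per-key detail assignments into a single details dict and routes the request through req_get. As a standalone illustration, here is a minimal sketch of the lookup flow; the search_url itself sits outside the visible hunk, so the Nominatim endpoint and User-Agent below are assumptions.

# Sketch of the OSM lookup flow; the endpoint and User-Agent are assumed.
import requests

NOMINATIM_SEARCH = "https://nominatim.openstreetmap.org/search"  # assumed search_url


def osm_lookup(query: str) -> dict:
    """Return url/title/details for the first Nominatim hit, or {} when empty."""
    resp = requests.get(
        NOMINATIM_SEARCH,
        params={"q": query, "format": "json", "limit": 5, "dedupe": 1},
        headers={"User-Agent": "facilities-scraper"},  # Nominatim requires a UA
        timeout=15,
    )
    resp.raise_for_status()
    results = resp.json()
    if not results:
        return {}
    first = results[0]
    path = "way" if first.get("osm_type", "") == "way" else "node"
    return {
        "url": f"https://www.openstreetmap.org/{path}/{first.get('osm_id', '')}",
        "title": first.get("display_name", ""),
        "details": {
            "latitude": first.get("lat"),
            "longitude": first.get("lon"),
            "class": first.get("class", ""),
        },
    }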
50 changes: 27 additions & 23 deletions enrichers/wikidata.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class Wikidata(Enrichment):
@@ -11,29 +11,32 @@ def search(self) -> dict:
# Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
# Falls back to first result (usually truncated, eg. county)
search_name_fallback = self._clean_facility_name(facility_name)
self.resp_info["enrichment_type"] = "wikidata"
logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
search_url = "https://www.wikidata.org/w/api.php"
params = {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
"facility_name": {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
},
"fallback": {
"action": "wbsearchentities",
"search": search_name_fallback,
"language": "en",
"format": "json",
"limit": 3,
},
}
self.resp_info["enrichment_type"] = "wikidata"
data = {}
try:
response = self._req(search_url, params=params)
data = response.json()
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
if not data.get("search"):
params["search"] = search_name_fallback
self.resp_info["search_query_steps"].append(search_name_fallback) # type: ignore [attr-defined]
for search, params in params.items():
self.resp_info["search_query_steps"].append(params["search"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params)
response = req_get(search_url, params=params, wait_time=self._wait_time)
data = response.json()
break
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
@@ -45,10 +48,11 @@ def search(self) -> dict:
if any(term in description for term in match_terms):
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
return self.resp_info
# fallback to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
break
else:
# fall back to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
self.resp_info["title"] = first.get("label", "")
return self.resp_info
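The params dict now carries two query variants (the raw facility name and the cleaned fallback), tried in order until one request succeeds. As a self-contained sketch of that flow, with the match terms assumed since they are defined outside this hunk:

# Sketch of the two-step Wikidata lookup; MATCH_TERMS is an assumption, the
# wbsearchentities parameters mirror the ones in the hunk above.
import requests

WIKIDATA_API = "https://www.wikidata.org/w/api.php"
MATCH_TERMS = ("detention", "prison", "jail", "correctional")  # assumed


def wikidata_lookup(*queries: str) -> dict:
    for query in queries:
        params = {
            "action": "wbsearchentities",
            "search": query,
            "language": "en",
            "format": "json",
            "limit": 3,
        }
        results = requests.get(WIKIDATA_API, params=params, timeout=10).json().get("search", [])
        if not results:
            continue
        for result in results:
            description = result.get("description", "").lower()
            if any(term in description for term in MATCH_TERMS):
                return {"url": f"https://www.wikidata.org/wiki/{result['id']}", "title": result.get("label", "")}
        # fall back to the first result for this query
        first = results[0]
        return {"url": f"https://www.wikidata.org/wiki/{first['id']}", "title": first.get("label", "")}
    return {}

# usage: wikidata_lookup("Stewart Detention Center", "Stewart")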
10 changes: 5 additions & 5 deletions enrichers/wikipedia.py
@@ -1,6 +1,6 @@
from enrichers import Enrichment
from urllib.parse import quote
from utils import logger
from utils import logger, req_get


class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
initial_response = False
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
}

try:
response = self._req(self.api_search, params=params)
response = req_get(self.api_search, params=params, wait_time=self._wait_time)
data = response.json()
except Exception as e:
logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:

# Verify the page exists and isn't a redirect to something unrelated
try:
verify_response = self._req(final_url)
verify_response = req_get(final_url, wait_time=self._wait_time)
except Exception as e:
logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
self.resp_info["search_query_steps"].append(final_url) # type: ignore [attr-defined]
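When the initial direct lookup fails, the code retries with an article URL built by slugifying the facility name before percent-encoding it. A small sketch of that transformation, assuming static_search is the standard en.wikipedia.org article prefix (its value is outside this hunk):

# Sketch of the fallback URL construction; STATIC_SEARCH is an assumed value.
from urllib.parse import quote

STATIC_SEARCH = "https://en.wikipedia.org/wiki/"  # assumed static_search prefix


def wiki_fallback_url(facility_name: str) -> str:
    # spaces and pipes become underscores, the rest is percent-encoded
    return f"{STATIC_SEARCH}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"

# wiki_fallback_url("Stewart Detention Center") -> "https://en.wikipedia.org/wiki/Stewart_Detention_Center"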
4 changes: 1 addition & 3 deletions file_utils.py
@@ -18,9 +18,7 @@ def export_to_file(
if not facilities_data or not facilities_data.get("facilities", []):
logger.warning("No data to export!")
return ""
# make sure the folder we're dropping files into exists
os.makedirs(output_folder, exist_ok=True)
full_name = f"{output_folder}/{filename}.{file_type}"
full_name = f"{output_folder}{os.sep}{filename}.{file_type}"
if file_type in ["csv", "xlsx", "parquet"]:
writer = convert_to_dataframe(facilities_data["facilities"])
match file_type:
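The export path is now assembled with os.sep and the directory-creation step is dropped, which presumes output_folder (imported from utils elsewhere in this PR) already exists. An equivalent construction with os.path.join, as a sketch only:

# Sketch: equivalent path construction; assumes output_folder already exists.
import os


def export_path(output_folder: str, filename: str, file_type: str) -> str:
    return os.path.join(output_folder, f"{filename}.{file_type}")

# export_path("output", "facilities", "csv") -> "output/facilities.csv" on POSIX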
9 changes: 4 additions & 5 deletions ice_scrapers/agencies.py
@@ -11,19 +11,18 @@
import time
from utils import (
logger,
session,
output_folder,
req_get,
)
from .utils import download_file

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"


def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
"""Collect data on participating agencies"""
start_time = time.time()
resp = session.get(base_xlsx_url, timeout=120)
resp.raise_for_status()
resp = req_get(base_xlsx_url, timeout=120)
soup = BeautifulSoup(resp.content, "html.parser")
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
if not links:
@@ -45,7 +44,7 @@ def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dic
"""
# remove the date so we can easily overwrite the local (cached) file
filename = date_re.sub("", link.split("/")[-1])
path = f"{SCRIPT_DIR}{os.sep}{filename}"
path = f"{output_folder}{os.sep}{filename}"
if force_download or not os.path.exists(path):
logger.info("Downloading agency info sheet from %s", link)
download_file(link, path)
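The agency sheet now lands in output_folder and is fetched via req_get. The download_file helper comes from the package's .utils module and its body is not part of this diff; a hypothetical sketch of what such a helper might look like, streaming the xlsx to disk:

# Hypothetical download_file sketch -- the real helper is not shown here;
# the signature matches the call download_file(link, path) in the hunk above.
import requests


def download_file(url: str, path: str, timeout: int = 120) -> str:
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        with open(path, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=8192):
                fh.write(chunk)
    return path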
20 changes: 11 additions & 9 deletions ice_scrapers/facilities_scraper.py
@@ -7,7 +7,7 @@
from utils import (
default_timestamp,
logger,
session,
req_get,
timestamp_format,
)
from .utils import (
@@ -43,19 +43,23 @@ def scrape_facilities(facilities_data: dict) -> dict:
for facility in facilities:
facility = special_facilities(facility)
addr = facility["address"]
street, cleaned = repair_street(addr["street"], addr["locality"])
street, cleaned, other_st = repair_street(addr["street"], addr["locality"])
addr["other_streets"].extend(other_st)
if cleaned:
addr["street"] = street
facility["_repaired_record"] = True
zcode, cleaned = repair_zip(addr["postal_code"], addr["locality"])
zcode, cleaned, other_zip = repair_zip(addr["postal_code"], addr["locality"])
addr["other_postal_codes"].extend(other_zip)
if cleaned:
addr["postal_code"] = zcode
facility["_repaired_record"] = True
locality, cleaned = repair_locality(addr["locality"], addr["administrative_area"])
locality, cleaned, other_city = repair_locality(addr["locality"], addr["administrative_area"])
addr["other_localities"].extend(other_city)
if cleaned:
addr["locality"] = locality
facility["_repaired_record"] = True
name, cleaned = repair_name(facility["name"], addr["locality"])
name, cleaned, other_name = repair_name(facility["name"], addr["locality"])
facility["other_names"].extend(other_name)
if cleaned:
facility["name"] = name
facility["_repaired_record"] = True
@@ -95,8 +99,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
return datetime.datetime.strptime(default_timestamp, timestamp_format)
logger.debug(" Fetching: %s", url)
try:
response = session.get(url, timeout=30)
response.raise_for_status()
response = req_get(url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", url, e)
return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +121,7 @@ def _scrape_page(page_url: str) -> list:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []
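This hunk implies a new contract for the repair_* helpers: each now returns a three-tuple of (value, cleaned, alternates) so the alternates can be appended to the corresponding other_* list. The helpers themselves are not in this diff; a sketch of the implied shape, with an illustrative body:

# Sketch of the implied repair_* signature: (value, cleaned_flag, alternates).
# The zip-specific logic below is illustrative only; `locality` is accepted to
# match the call sites but unused in this sketch.
def repair_zip(postal_code: str, locality: str) -> tuple[str, bool, list[str]]:
    alternates: list[str] = []
    cleaned = False
    fixed = postal_code.strip()
    if "-" in fixed:  # e.g. "30301-1234" -> keep "30301", remember the original
        alternates.append(fixed)
        fixed = fixed.split("-", 1)[0]
        cleaned = True
    return fixed, cleaned, alternates

# zcode, cleaned, other_zip = repair_zip("30301-1234", "Atlanta")
# addr["other_postal_codes"].extend(other_zip)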
5 changes: 2 additions & 3 deletions ice_scrapers/field_offices.py
@@ -14,7 +14,7 @@
import time
from utils import (
logger,
session,
req_get,
)
from .utils import get_ice_scrape_pages

@@ -45,8 +45,7 @@ def _scrape_page(page_url: str) -> list[dict]:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []
29 changes: 28 additions & 1 deletion ice_scrapers/general.py
@@ -1,4 +1,5 @@
import copy
from thefuzz import fuzz # type: ignore [import-untyped]
from schemas import facilities_schema
from .agencies import scrape_agencies
from .custom_facilities import insert_additional_facilities
@@ -7,18 +8,44 @@
merge_field_offices,
scrape_field_offices,
)
from .inspections import find_inspections
from .spreadsheet_load import load_sheet
from .vera_data import collect_vera_facility_data
from utils import logger


def facilities_scrape_wrapper(
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
keep_sheet: bool = True,
force_download: bool = True,
skip_vera: bool = False,
inspection_text: bool = False,
) -> tuple[dict, dict]:
agencies = scrape_agencies(keep_sheet, force_download)
facilities_data = copy.deepcopy(facilities_schema)
facilities = load_sheet(keep_sheet, force_download)
facilities_data["facilities"] = copy.deepcopy(facilities)
facility_name_map = {v["name"].lower(): k for k, v in facilities_data["facilities"].items()}
inspections = find_inspections(keep_text=inspection_text)
facilities_data = scrape_facilities(facilities_data)

# actually attach inspections to facilities
for facility, inspect in inspections.items():
logger.debug(" Matching %s for inspection details...", facility)
# exact match (extremely unlikely)
if facility.lower() in facility_name_map:
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
inspect
)
continue
# logger.debug(" Checking fuzzy matches:")
for k, v in facility_name_map.items():
r = fuzz.partial_ratio(facility, k)
# logger.debug(" %s === %s, ratio: %s", facility, k, r)
if r > 80:
logger.debug(" Probably the right facility %s => %s, (ratio %s)", k, facility, r)
facilities_data["facilities"][facility_name_map[k]]["inspection"]["details"] = copy.deepcopy(inspect)
break

if not skip_vera:
facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download)
field_offices = scrape_field_offices()
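A small usage sketch of the fuzzy matching added above: thefuzz's partial_ratio returns an integer score from 0 to 100, and the loop treats anything above 80 as a match. Both strings are lowercased in this sketch for a stable comparison, since partial_ratio scores the strings exactly as given.

# Usage sketch of the fuzzy match threshold used in facilities_scrape_wrapper.
from thefuzz import fuzz  # type: ignore [import-untyped]

inspection_name = "Stewart Detention Center (GA)"   # illustrative names only
facility_name_map = {"stewart detention center": "facility-123"}

for known_name, facility_id in facility_name_map.items():
    score = fuzz.partial_ratio(inspection_name.lower(), known_name)
    if score > 80:
        print(f"matched {facility_id} via {known_name!r} (ratio {score})")
        break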