2 changes: 1 addition & 1 deletion .config/mise.toml
@@ -1,5 +1,5 @@
[tools]
python = "3.13.3"
python = "3.14.1"
node = "latest"
lefthook = "latest"
yamllint = "latest"
1 change: 1 addition & 0 deletions .gitignore
@@ -9,3 +9,4 @@ __pycache__/
**/.xlsx#
*.parquet
**/.parquet
output/
30 changes: 0 additions & 30 deletions enrichers/__init__.py
@@ -5,21 +5,13 @@
"""

import copy
import requests
from schemas import enrich_resp_schema
import time
from utils import (
default_headers,
session,
)


class Enrichment(object):
_required_keys = [
"facility_name",
]
# in seconds
_wait_time: float = 1

def __init__(self, **kwargs):
self.resp_info = copy.deepcopy(enrich_resp_schema)
@@ -32,28 +24,6 @@ def search(self) -> dict:
"""Child objects should implement this"""
return {}

def _req(self, url: str, **kwargs) -> requests.Response:
"""requests response wrapper to ensure we honor waits"""
headers = kwargs.get("headers", {})
# ensure we get all headers configured correctly
# but manually applied headers win the argument
for k, v in default_headers.items():
if k in headers.keys():
continue
headers[k] = v

response = session.get(
url,
allow_redirects=True,
timeout=kwargs.get("timeout", 10),
params=kwargs.get("params", {}),
stream=kwargs.get("stream", False),
headers=headers,
)
response.raise_for_status()
time.sleep(self._wait_time)
return response

def _minimal_clean_facility_name(self, name: str) -> str:
"""Minimal cleaning that preserves important context like 'County Jail'"""
cleaned = name
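The removed Enrichment._req wrapper is replaced at every call site below by a req_get helper imported from utils. Its implementation is not part of this diff, so the following is a minimal sketch only, assuming it keeps the behaviour the old method had: default headers that per-call headers can override, the shared session, raise_for_status, and a post-request sleep controlled by a wait_time keyword.

# Hypothetical sketch of utils.req_get (not shown in this diff); `session` and
# `default_headers` are assumed to already exist in utils, as the old _req used them.
import time

import requests

session = requests.Session()
default_headers = {"User-Agent": "facilities-scraper"}  # placeholder value


def req_get(url: str, wait_time: float = 1.0, **kwargs) -> requests.Response:
    """GET wrapper: merge default headers, raise on HTTP errors, then wait."""
    headers = kwargs.get("headers", {})
    # manually supplied headers win over the defaults
    for k, v in default_headers.items():
        headers.setdefault(k, v)
    response = session.get(
        url,
        allow_redirects=True,
        timeout=kwargs.get("timeout", 10),
        params=kwargs.get("params", {}),
        stream=kwargs.get("stream", False),
        headers=headers,
    )
    response.raise_for_status()
    time.sleep(wait_time)
    return response

This signature accepts every call pattern that appears in the diff (params=, timeout=, wait_time=).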
12 changes: 5 additions & 7 deletions enrichers/openstreetmap.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class OpenStreetMap(Enrichment):
@@ -40,13 +40,13 @@ def search(self) -> dict:
"dedupe": 1,
},
"street_address": {
"q": f"{full_address}",
"q": full_address,
"format": "json",
"limit": 5,
"dedupe": 1,
},
"locality": {
"q": f"{locality}",
"q": locality,
"format": "json",
"limit": 5,
"dedupe": 1,
@@ -56,7 +56,7 @@
logger.debug("Searching OSM for %s", params["q"])
self.resp_info["search_query_steps"].append(params["q"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params, timeout=15)
response = req_get(search_url, params=params, timeout=15)
data.extend(response.json())
except Exception as e:
logger.debug(" OSM search error for '%s': %s", facility_name, e)
@@ -73,10 +73,8 @@
lon = first_result.get("lon", self.default_coords["longitude"])
osm_type = first_result.get("osm_type", "")
osm_id = first_result.get("osm_id", "")
self.resp_info["details"]["latitude"] = lat # type: ignore [index]
self.resp_info["details"]["longitude"] = lon # type: ignore [index]
self.resp_info["title"] = first_result.get("display_name", "")
self.resp_info["details"]["class"] = first_result.get("class", "") # type: ignore [index]
self.resp_info["details"] = {"latitude": lat, "logitude": lon, "class": first_result.get("class", "")}
if osm_type == "way":
self.resp_info["url"] = f"https://www.openstreetmap.org/way/{osm_id}"
else:
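This hunk consolidates the per-key detail assignments into a single details dict and routes the request through req_get. As a standalone illustration, here is a minimal sketch of the lookup flow; the search_url itself sits outside the visible hunk, so the Nominatim endpoint and User-Agent below are assumptions.

# Sketch of the OSM lookup flow; the endpoint and User-Agent are assumed.
import requests

NOMINATIM_SEARCH = "https://nominatim.openstreetmap.org/search"  # assumed search_url


def osm_lookup(query: str) -> dict:
    """Return url/title/details for the first Nominatim hit, or {} when empty."""
    resp = requests.get(
        NOMINATIM_SEARCH,
        params={"q": query, "format": "json", "limit": 5, "dedupe": 1},
        headers={"User-Agent": "facilities-scraper"},  # Nominatim requires a UA
        timeout=15,
    )
    resp.raise_for_status()
    results = resp.json()
    if not results:
        return {}
    first = results[0]
    path = "way" if first.get("osm_type", "") == "way" else "node"
    return {
        "url": f"https://www.openstreetmap.org/{path}/{first.get('osm_id', '')}",
        "title": first.get("display_name", ""),
        "details": {
            "latitude": first.get("lat"),
            "longitude": first.get("lon"),
            "class": first.get("class", ""),
        },
    }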
50 changes: 27 additions & 23 deletions enrichers/wikidata.py
@@ -1,5 +1,5 @@
from enrichers import Enrichment
from utils import logger
from utils import logger, req_get


class Wikidata(Enrichment):
@@ -11,29 +11,32 @@ def search(self) -> dict:
# Fetches 3 results based on _clean_facility_name (not exact name). todo: needs adjustment.
# Falls back to first result (usually truncated, eg. county)
search_name_fallback = self._clean_facility_name(facility_name)
self.resp_info["enrichment_type"] = "wikidata"
logger.debug("Searching wikidata for %s and %s", facility_name, search_name_fallback)
search_url = "https://www.wikidata.org/w/api.php"
params = {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
"facility_name": {
"action": "wbsearchentities",
"search": facility_name,
"language": "en",
"format": "json",
"limit": 3,
},
"fallback": {
"action": "wbsearchentities",
"search": search_name_fallback,
"language": "en",
"format": "json",
"limit": 3,
},
}
self.resp_info["enrichment_type"] = "wikidata"
data = {}
try:
response = self._req(search_url, params=params)
data = response.json()
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
if not data.get("search"):
params["search"] = search_name_fallback
self.resp_info["search_query_steps"].append(search_name_fallback) # type: ignore [attr-defined]
for search, params in params.items():
self.resp_info["search_query_steps"].append(params["search"]) # type: ignore [attr-defined]
try:
response = self._req(search_url, params=params)
response = req_get(search_url, params=params, wait_time=self._wait_time)
data = response.json()
break
except Exception as e:
logger.debug(" Wikidata search error for '%s': %s", facility_name, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
@@ -45,10 +48,11 @@ def search(self) -> dict:
if any(term in description for term in match_terms):
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
return self.resp_info
# fallback to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{result['id']}"
self.resp_info["title"] = result.get("label", "")
break
else:
# fall back to first result
first = data["search"][0]
logger.debug(" Closer matching failed, falling back to first result %s", first)
self.resp_info["url"] = f"https://www.wikidata.org/wiki/{first['id']}"
self.resp_info["title"] = first.get("label", "")
return self.resp_info
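The params dict now carries two query variants (the raw facility name and the cleaned fallback), tried in order until one request succeeds. As a self-contained sketch of that flow, with the match terms assumed since they are defined outside this hunk:

# Sketch of the two-step Wikidata lookup; MATCH_TERMS is an assumption, the
# wbsearchentities parameters mirror the ones in the hunk above.
import requests

WIKIDATA_API = "https://www.wikidata.org/w/api.php"
MATCH_TERMS = ("detention", "prison", "jail", "correctional")  # assumed


def wikidata_lookup(*queries: str) -> dict:
    for query in queries:
        params = {
            "action": "wbsearchentities",
            "search": query,
            "language": "en",
            "format": "json",
            "limit": 3,
        }
        results = requests.get(WIKIDATA_API, params=params, timeout=10).json().get("search", [])
        if not results:
            continue
        for result in results:
            description = result.get("description", "").lower()
            if any(term in description for term in MATCH_TERMS):
                return {"url": f"https://www.wikidata.org/wiki/{result['id']}", "title": result.get("label", "")}
        # fall back to the first result for this query
        first = results[0]
        return {"url": f"https://www.wikidata.org/wiki/{first['id']}", "title": first.get("label", "")}
    return {}

# usage: wikidata_lookup("Stewart Detention Center", "Stewart")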
10 changes: 5 additions & 5 deletions enrichers/wikipedia.py
@@ -1,6 +1,6 @@
from enrichers import Enrichment
from urllib.parse import quote
from utils import logger
from utils import logger, req_get


class Wikipedia(Enrichment):
@@ -32,15 +32,15 @@ def search(self) -> dict:
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
initial_response = False
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
self.resp_info["search_query_steps"].append(f"(Failed -> {e})") # type: ignore [attr-defined]
wiki_url = f"{self.static_search}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"
self.resp_info["search_query_steps"].append(wiki_url) # type: ignore [attr-defined]
try:
response = self._req(wiki_url)
response = req_get(wiki_url, wait_time=self._wait_time)
initial_response = True
except Exception as e:
logger.debug(" Wikipedia search error for '%s': %s", wiki_url, e)
@@ -101,7 +101,7 @@ def search(self) -> dict:
}

try:
response = self._req(self.api_search, params=params)
response = req_get(self.api_search, params=params, wait_time=self._wait_time)
data = response.json()
except Exception as e:
logger.debug(" Wikipedia search for %s failed: %s", self.api_search, e)
@@ -161,7 +161,7 @@ def search(self) -> dict:

# Verify the page exists and isn't a redirect to something unrelated
try:
verify_response = self._req(final_url)
verify_response = req_get(final_url, wait_time=self._wait_time)
except Exception as e:
logger.debug(" Wikipedia query for %s failed: %s", final_url, e)
self.resp_info["search_query_steps"].append(final_url) # type: ignore [attr-defined]
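When the initial direct lookup fails, the code retries with an article URL built by slugifying the facility name before percent-encoding it. A small sketch of that transformation, assuming static_search is the standard en.wikipedia.org article prefix (its value is outside this hunk):

# Sketch of the fallback URL construction; STATIC_SEARCH is an assumed value.
from urllib.parse import quote

STATIC_SEARCH = "https://en.wikipedia.org/wiki/"  # assumed static_search prefix


def wiki_fallback_url(facility_name: str) -> str:
    # spaces and pipes become underscores, the rest is percent-encoded
    return f"{STATIC_SEARCH}{quote(facility_name.replace(' ', '_').replace('|', '_'))}"

# wiki_fallback_url("Stewart Detention Center") -> "https://en.wikipedia.org/wiki/Stewart_Detention_Center"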
4 changes: 1 addition & 3 deletions file_utils.py
@@ -18,9 +18,7 @@ def export_to_file(
if not facilities_data or not facilities_data.get("facilities", []):
logger.warning("No data to export!")
return ""
# make sure the folder we're dropping files into exists
os.makedirs(output_folder, exist_ok=True)
full_name = f"{output_folder}/{filename}.{file_type}"
full_name = f"{output_folder}{os.sep}{filename}.{file_type}"
if file_type in ["csv", "xlsx", "parquet"]:
writer = convert_to_dataframe(facilities_data["facilities"])
match file_type:
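The export path is now assembled with os.sep and the directory-creation step is dropped, which presumes output_folder (imported from utils elsewhere in this PR) already exists. An equivalent construction with os.path.join, as a sketch only:

# Sketch: equivalent path construction; assumes output_folder already exists.
import os


def export_path(output_folder: str, filename: str, file_type: str) -> str:
    return os.path.join(output_folder, f"{filename}.{file_type}")

# export_path("output", "facilities", "csv") -> "output/facilities.csv" on POSIX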
9 changes: 4 additions & 5 deletions ice_scrapers/agencies.py
@@ -11,19 +11,18 @@
import time
from utils import (
logger,
session,
output_folder,
req_get,
)
from .utils import download_file

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
base_xlsx_url = "https://www.ice.gov/identify-and-arrest/287g"


def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dict:
"""Collect data on participating agencies"""
start_time = time.time()
resp = session.get(base_xlsx_url, timeout=120)
resp.raise_for_status()
resp = req_get(base_xlsx_url, timeout=120)
soup = BeautifulSoup(resp.content, "html.parser")
links = [link["href"] for link in soup.findAll("a", href=re.compile("^https://www.ice.gov/doclib.*xlsx"))]
if not links:
@@ -45,7 +44,7 @@ def scrape_agencies(keep_sheet: bool = True, force_download: bool = True) -> dic
"""
# remove the date so we can easily overwrite the local (cached) file
filename = date_re.sub("", link.split("/")[-1])
path = f"{SCRIPT_DIR}{os.sep}{filename}"
path = f"{output_folder}{os.sep}{filename}"
if force_download or not os.path.exists(path):
logger.info("Downloading agency info sheet from %s", link)
download_file(link, path)
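The agency sheet now lands in output_folder and is fetched via req_get. The download_file helper comes from the package's .utils module and its body is not part of this diff; a hypothetical sketch of what such a helper might look like, streaming the xlsx to disk:

# Hypothetical download_file sketch -- the real helper is not shown here;
# the signature matches the call download_file(link, path) in the hunk above.
import requests


def download_file(url: str, path: str, timeout: int = 120) -> str:
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        with open(path, "wb") as fh:
            for chunk in resp.iter_content(chunk_size=8192):
                fh.write(chunk)
    return path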
20 changes: 11 additions & 9 deletions ice_scrapers/facilities_scraper.py
@@ -7,7 +7,7 @@
from utils import (
default_timestamp,
logger,
session,
req_get,
timestamp_format,
)
from .utils import (
@@ -43,19 +43,23 @@ def scrape_facilities(facilities_data: dict) -> dict:
for facility in facilities:
facility = special_facilities(facility)
addr = facility["address"]
street, cleaned = repair_street(addr["street"], addr["locality"])
street, cleaned, other_st = repair_street(addr["street"], addr["locality"])
addr["other_streets"].extend(other_st)
if cleaned:
addr["street"] = street
facility["_repaired_record"] = True
zcode, cleaned = repair_zip(addr["postal_code"], addr["locality"])
zcode, cleaned, other_zip = repair_zip(addr["postal_code"], addr["locality"])
addr["other_postal_codes"].extend(other_zip)
if cleaned:
addr["postal_code"] = zcode
facility["_repaired_record"] = True
locality, cleaned = repair_locality(addr["locality"], addr["administrative_area"])
locality, cleaned, other_city = repair_locality(addr["locality"], addr["administrative_area"])
addr["other_localities"].extend(other_city)
if cleaned:
addr["locality"] = locality
facility["_repaired_record"] = True
name, cleaned = repair_name(facility["name"], addr["locality"])
name, cleaned, other_name = repair_name(facility["name"], addr["locality"])
facility["other_names"].extend(other_name)
if cleaned:
facility["name"] = name
facility["_repaired_record"] = True
@@ -95,8 +99,7 @@ def _scrape_updated(url: str) -> datetime.datetime:
return datetime.datetime.strptime(default_timestamp, timestamp_format)
logger.debug(" Fetching: %s", url)
try:
response = session.get(url, timeout=30)
response.raise_for_status()
response = req_get(url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", url, e)
return datetime.datetime.strptime(default_timestamp, timestamp_format)
@@ -118,8 +121,7 @@ def _scrape_page(page_url: str) -> list:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30, wait_time=0.1)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []
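This hunk implies a new contract for the repair_* helpers: each now returns a three-tuple of (value, cleaned, alternates) so the alternates can be appended to the corresponding other_* list. The helpers themselves are not in this diff; a sketch of the implied shape, with an illustrative body:

# Sketch of the implied repair_* signature: (value, cleaned_flag, alternates).
# The zip-specific logic below is illustrative only; `locality` is accepted to
# match the call sites but unused in this sketch.
def repair_zip(postal_code: str, locality: str) -> tuple[str, bool, list[str]]:
    alternates: list[str] = []
    cleaned = False
    fixed = postal_code.strip()
    if "-" in fixed:  # e.g. "30301-1234" -> keep "30301", remember the original
        alternates.append(fixed)
        fixed = fixed.split("-", 1)[0]
        cleaned = True
    return fixed, cleaned, alternates

# zcode, cleaned, other_zip = repair_zip("30301-1234", "Atlanta")
# addr["other_postal_codes"].extend(other_zip)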
5 changes: 2 additions & 3 deletions ice_scrapers/field_offices.py
@@ -14,7 +14,7 @@
import time
from utils import (
logger,
session,
req_get,
)
from .utils import get_ice_scrape_pages

@@ -45,8 +45,7 @@ def _scrape_page(page_url: str) -> list[dict]:
"""Scrape a single page of facilities using BeautifulSoup"""
logger.debug(" Fetching: %s", page_url)
try:
response = session.get(page_url, timeout=30)
response.raise_for_status()
response = req_get(page_url, timeout=30)
except Exception as e:
logger.error(" Error parsing %s: %s", page_url, e)
return []
29 changes: 28 additions & 1 deletion ice_scrapers/general.py
@@ -1,4 +1,5 @@
import copy
from thefuzz import fuzz # type: ignore [import-untyped]
from schemas import facilities_schema
from .agencies import scrape_agencies
from .custom_facilities import insert_additional_facilities
@@ -7,18 +8,44 @@
merge_field_offices,
scrape_field_offices,
)
from .inspections import find_inspections
from .spreadsheet_load import load_sheet
from .vera_data import collect_vera_facility_data
from utils import logger


def facilities_scrape_wrapper(
keep_sheet: bool = True, force_download: bool = True, skip_vera: bool = False
keep_sheet: bool = True,
force_download: bool = True,
skip_vera: bool = False,
inspection_text: bool = False,
) -> tuple[dict, dict]:
agencies = scrape_agencies(keep_sheet, force_download)
facilities_data = copy.deepcopy(facilities_schema)
facilities = load_sheet(keep_sheet, force_download)
facilities_data["facilities"] = copy.deepcopy(facilities)
facility_name_map = {v["name"].lower(): k for k, v in facilities_data["facilities"].items()}
inspections = find_inspections(keep_text=inspection_text)
facilities_data = scrape_facilities(facilities_data)

# actually attach inspections to facilities
for facility, inspect in inspections.items():
logger.debug(" Matching %s for inspection details...", facility)
# exact match (extremely unlikely)
if facility.lower() in facility_name_map:
facilities_data["facilities"][facility_name_map[facility.lower()]]["inspection"]["details"] = copy.deepcopy(
inspect
)
continue
# logger.debug(" Checking fuzzy matches:")
for k, v in facility_name_map.items():
r = fuzz.partial_ratio(facility, k)
# logger.debug(" %s === %s, ratio: %s", facility, k, r)
if r > 80:
logger.debug(" Probably the right facility %s => %s, (ratio %s)", k, facility, r)
facilities_data["facilities"][facility_name_map[k]]["inspection"]["details"] = copy.deepcopy(inspect)
break

if not skip_vera:
facilities_data = collect_vera_facility_data(facilities_data, keep_sheet, force_download)
field_offices = scrape_field_offices()
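A small usage sketch of the fuzzy matching added above: thefuzz's partial_ratio returns an integer score from 0 to 100, and the loop treats anything above 80 as a match. Both strings are lowercased in this sketch for a stable comparison, since partial_ratio scores the strings exactly as given.

# Usage sketch of the fuzzy match threshold used in facilities_scrape_wrapper.
from thefuzz import fuzz  # type: ignore [import-untyped]

inspection_name = "Stewart Detention Center (GA)"   # illustrative names only
facility_name_map = {"stewart detention center": "facility-123"}

for known_name, facility_id in facility_name_map.items():
    score = fuzz.partial_ratio(inspection_name.lower(), known_name)
    if score > 80:
        print(f"matched {facility_id} via {known_name!r} (ratio {score})")
        break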