Add a second detection path alongside ASN lookup: a self-maintained
list of university domains (uni_domains.txt) loaded at startup.
- New /lookup params: email= (extracts domain from address), domain= unchanged
- Suffix matching: insti.uni-stuttgart.de matches list entry uni-stuttgart.de
  without false positives (evil-uni-stuttgart.de does not match)
- New response fields: asn_match, domain_match, matched_domain (omitempty)
- nren remains true if either asn_match OR domain_match is true (backwards compat)
- /healthz now returns JSON body: {"asn_count":N,"domain_count":N}
- asn-updater: new update_uni_domains() merges hs-kompass.de TSV + Hipo JSON
(configurable via UNI_DOMAIN_COUNTRIES / HS_KOMPASS_URL env vars)
- 7 new tests; all existing tests pass unchanged
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
209 lines
7.2 KiB
Python
209 lines
7.2 KiB
Python
import os, time, json, tarfile, tempfile, shutil
|
|
from urllib.parse import urlparse
|
|
import requests
|
|
|
|
# Directory where all generated artifacts (mmdb, ASN list, domain lists) land.
OUT_DIR = os.getenv("OUT_DIR", "/data")

# MaxMind license key; required by download_maxmind_mmdb().
LICENSE_KEY = os.getenv("MAXMIND_LICENSE_KEY", "").strip()

# Optional PeeringDB API key; anonymous access works but is rate-limited harder.
PDB_API_KEY = os.getenv("PDB_API_KEY", "").strip()

# Base URL of the PeeringDB instance to query.
PDB_BASE = os.getenv("PDB_BASE", "https://www.peeringdb.com")

# PeeringDB net "info_type" label used to select research/education networks.
INFO_TYPE = os.getenv("PDB_INFO_TYPE", "Educational/Research")

# HTTP timeout (seconds) applied to every outbound request.
TIMEOUT = int(os.getenv("HTTP_TIMEOUT", "30"))

# Page size for PeeringDB pagination.
LIMIT = int(os.getenv("PDB_LIMIT", "250"))

# TSV export of German higher-education institutions (hs-kompass); the
# homepage column is parsed into domains by update_uni_domains().
HS_KOMPASS_URL = os.getenv("HS_KOMPASS_URL", "https://hs-kompass.de/kompass/xml/download/hs_liste.txt")

# Comma-separated ISO alpha-2 country codes kept from the Hipo university list.
UNI_DOMAIN_COUNTRIES = os.getenv("UNI_DOMAIN_COUNTRIES", "DE,AT")
def atomic_replace(src_path: str, dst_path: str) -> None:
    """Install *src_path* at *dst_path* atomically (copy to tmp, then rename).

    Creates the destination directory if needed and makes the final file
    world-readable (0o644).
    """
    target_dir = os.path.dirname(dst_path)
    os.makedirs(target_dir, exist_ok=True)
    staging = f"{dst_path}.tmp"
    shutil.copyfile(src_path, staging)
    # os.replace is atomic on POSIX, so readers never observe a partial file.
    os.replace(staging, dst_path)
    os.chmod(dst_path, 0o644)
def download_maxmind_mmdb() -> None:
    """Download the GeoLite2-ASN database and install it into OUT_DIR.

    Uses the official GeoLite2 download mechanism (license_key + edition_id).
    On HTTP 429 an already-present mmdb is kept instead of failing the run.

    Raises:
        RuntimeError: if no license key is configured, if MaxMind rate-limits
            and no previous mmdb exists, or if the archive lacks the mmdb.
    """
    if not LICENSE_KEY:
        raise RuntimeError("MAXMIND_LICENSE_KEY missing")

    # Official GeoLite2 download mechanism via license_key + edition_id.
    url = (
        "https://download.maxmind.com/app/geoip_download"
        f"?edition_id=GeoLite2-ASN&license_key={LICENSE_KEY}&suffix=tar.gz"
    )

    with tempfile.TemporaryDirectory() as td:
        tgz = os.path.join(td, "GeoLite2-ASN.tar.gz")
        r = requests.get(url, timeout=TIMEOUT)
        if r.status_code == 429:
            existing = os.path.join(OUT_DIR, "GeoLite2-ASN.mmdb")
            if os.path.exists(existing):
                print("[warn] MaxMind rate limited (429); keeping existing mmdb")
                return
            raise RuntimeError("MaxMind rate limited (429) and no existing mmdb")
        r.raise_for_status()
        with open(tgz, "wb") as f:
            f.write(r.content)

        mmdb_found = None
        with tarfile.open(tgz, "r:gz") as tar:
            for member in tar.getmembers():
                if member.name.endswith("GeoLite2-ASN.mmdb") and member.isfile():
                    # Stream the member out via extractfile() instead of
                    # tar.extract(): a hostile member name (e.g. "../../x")
                    # can otherwise escape td (tarfile path traversal,
                    # CVE-2007-4559 class).
                    src = tar.extractfile(member)
                    if src is None:
                        continue
                    mmdb_found = os.path.join(td, "GeoLite2-ASN.mmdb")
                    with src, open(mmdb_found, "wb") as out:
                        shutil.copyfileobj(src, out)
                    break

        if not mmdb_found or not os.path.exists(mmdb_found):
            raise RuntimeError("GeoLite2-ASN.mmdb not found in archive")

        atomic_replace(mmdb_found, os.path.join(OUT_DIR, "GeoLite2-ASN.mmdb"))
def pdb_headers():
    """Build HTTP headers for PeeringDB requests.

    Adds an Authorization header when a PeeringDB API key (optional) is set.
    """
    headers = {"Accept": "application/json"}
    if PDB_API_KEY:
        headers["Authorization"] = f"Api-Key {PDB_API_KEY}"
    return headers
def fetch_pdb_page(skip: int, info_type: str):
    """Fetch one page of PeeringDB /api/net objects for *info_type*.

    Returns the raw "data" list from the JSON response (possibly empty).
    Raises for non-2xx HTTP statuses.
    """
    response = requests.get(
        f"{PDB_BASE}/api/net",
        params={
            "info_type": info_type,
            "limit": LIMIT,
            "skip": skip,
            "fields": "asn,status,info_type",
        },
        headers=pdb_headers(),
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    return response.json().get("data", [])
def update_nren_asns() -> str:
    """Collect NREN ASNs from PeeringDB and write them to nren_asns.txt.

    Tries the configured INFO_TYPE first, then alternate labels seen in
    different PeeringDB deployments, keeping the first label that yields
    any ASNs. If no label yields results, the previously written file is
    left untouched (mirrors update_uni_domains()) instead of being
    clobbered with an empty list.

    Returns:
        The info_type label that actually produced results (INFO_TYPE when
        nothing matched).
    """
    info_types = [INFO_TYPE]
    # Alternate labels seen in PeeringDB deployments.
    for alt in ("Research and Education", "Educational/Research"):
        if INFO_TYPE != alt:
            info_types.append(alt)

    asns = set()
    used_info_type = INFO_TYPE
    for info_type in info_types:
        asns.clear()
        skip = 0
        while True:
            data = fetch_pdb_page(skip, info_type)
            for obj in data:
                if obj.get("status") != "ok":
                    continue
                asn = obj.get("asn")
                if isinstance(asn, int) and asn > 0:
                    asns.add(asn)
            # A short page means we reached the end of the result set.
            if len(data) < LIMIT:
                break
            skip += LIMIT
            time.sleep(1.1)  # very conservative paging delay
        if asns:
            used_info_type = info_type
            break

    if not asns:
        # Skip the write so a transient PeeringDB problem cannot wipe out a
        # previously good nren_asns.txt (same policy as update_uni_domains).
        print(f"[warn] no ASNs found for info_type(s)={info_types}")
        return used_info_type

    out_txt = os.path.join(OUT_DIR, "nren_asns.txt")
    # Temp file in the same directory so os.replace stays atomic.
    with tempfile.NamedTemporaryFile("w", delete=False, dir=OUT_DIR) as f:
        for a in sorted(asns):
            f.write(f"{a}\n")
        tmp_path = f.name
    os.replace(tmp_path, out_txt)
    os.chmod(out_txt, 0o644)
    return used_info_type
def _extract_hostname(homepage: str) -> str:
    """Normalize a homepage URL/host to a bare lowercase hostname ('' if none)."""
    if not homepage.startswith(("http://", "https://")):
        homepage = "http://" + homepage
    hostname = (urlparse(homepage).hostname or "").lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    return hostname


def _fetch_hs_kompass_domains() -> set:
    """Fetch the hs-kompass TSV and return the set of homepage domains.

    Best-effort: fetch/parse failures are logged and yield an empty set.
    """
    domains: set = set()
    try:
        r = requests.get(HS_KOMPASS_URL, timeout=TIMEOUT)
        r.raise_for_status()
        for line in r.text.splitlines():
            parts = line.split("\t")
            # Column 20 holds the institution homepage; skip short rows.
            if len(parts) <= 20:
                continue
            homepage = parts[20].strip()
            if not homepage:
                continue
            try:
                hostname = _extract_hostname(homepage)
            except Exception:
                continue
            if hostname:
                domains.add(hostname)
    except Exception as err:
        print(f"[warn] hs-kompass fetch failed: {err}")
    return domains


def _fetch_hipo_domains(country_set: set) -> set:
    """Fetch the Hipo world-universities JSON; return domains for *country_set*.

    Best-effort: fetch/parse failures are logged and yield an empty set.
    """
    domains: set = set()
    try:
        r = requests.get(
            "https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json",
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        for entry in r.json():
            if entry.get("alpha_two_code") in country_set:
                for d in entry.get("domains", []):
                    domains.add(d.lower().strip())
    except Exception as err:
        print(f"[warn] hipo fetch failed: {err}")
    return domains


def update_uni_domains() -> int:
    """Rebuild uni_domains.txt from hs-kompass + Hipo sources.

    Merges both sources, writes the sorted union atomically, and records
    per-source counts in uni_domains_meta.json. If the merge is empty the
    existing file is preserved and nothing is written.

    Returns:
        Number of domains written (0 when the write was skipped).
    """
    country_set = {c.strip() for c in UNI_DOMAIN_COUNTRIES.split(",")}

    hs_domains = _fetch_hs_kompass_domains()
    hipo_domains = _fetch_hipo_domains(country_set)

    if len(hs_domains) == 0 or len(hipo_domains) == 0:
        print(f"[warn] uni_domains: hs_kompass={len(hs_domains)} hipo={len(hipo_domains)}")

    merged = hs_domains | hipo_domains
    if len(merged) == 0:
        print("[warn] uni_domains update produced 0 entries — skipping write to preserve existing file")
        return 0

    out_txt = os.path.join(OUT_DIR, "uni_domains.txt")
    # Temp file in the same directory so os.replace stays atomic.
    with tempfile.NamedTemporaryFile("w", delete=False, dir=OUT_DIR) as f:
        for d in sorted(merged):
            f.write(f"{d}\n")
        tmp_path = f.name
    os.replace(tmp_path, out_txt)
    os.chmod(out_txt, 0o644)

    meta = {
        "hs_kompass": len(hs_domains),
        "hipo": len(hipo_domains),
        "total": len(merged),
        "updated_at_unix": int(time.time()),
    }
    meta_path = os.path.join(OUT_DIR, "uni_domains_meta.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f)
    os.chmod(meta_path, 0o644)

    return len(merged)
def write_meta(info_type: str):
    """Persist run metadata (timestamp, info_type, PeeringDB base) as JSON."""
    meta_path = os.path.join(OUT_DIR, "metadata.json")
    payload = {
        "updated_at_unix": int(time.time()),
        "info_type": info_type,
        "pdb_base": PDB_BASE,
    }
    with open(meta_path, "w") as f:
        json.dump(payload, f, indent=2)
    os.chmod(meta_path, 0o644)
def main():
    """Refresh every artifact: GeoLite2 mmdb, NREN ASN list, uni domain list."""
    os.makedirs(OUT_DIR, exist_ok=True)
    download_maxmind_mmdb()
    info_type = update_nren_asns()
    domain_count = update_uni_domains()
    write_meta(info_type)
    print(f"[ok] updated mmdb + nren_asns + uni_domains (domain_count={domain_count})")


if __name__ == "__main__":
    main()