Add a second detection path alongside ASN lookup: a self-maintained
list of university domains (uni_domains.txt) loaded at startup.
- New /lookup params: email= (extracts domain from address), domain= unchanged
- Suffix matching: insti.uni-stuttgart.de matches list entry uni-stuttgart.de
  without false positives (evil-uni-stuttgart.de does not match)
- New response fields: asn_match, domain_match, matched_domain (omitempty)
- nren remains true if either asn_match OR domain_match is true (backwards compat)
- /healthz now returns JSON body: {"asn_count":N,"domain_count":N}
- asn-updater: new update_uni_domains() merges hs-kompass.de TSV + Hipo JSON
(configurable via UNI_DOMAIN_COUNTRIES / HS_KOMPASS_URL env vars)
- 7 new tests; all existing tests pass unchanged
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
209 lines
7.2 KiB
Python
209 lines
7.2 KiB
Python
import os, time, json, tarfile, tempfile, shutil
|
|
from urllib.parse import urlparse
|
|
import requests
|
|
|
|
# Directory where all generated artifacts (mmdb, ASN list, domain lists) land.
OUT_DIR = os.getenv("OUT_DIR", "/data")

# MaxMind license key; required by download_maxmind_mmdb().
LICENSE_KEY = os.getenv("MAXMIND_LICENSE_KEY", "").strip()

# Optional PeeringDB API key; anonymous access works but is rate-limited harder.
PDB_API_KEY = os.getenv("PDB_API_KEY", "").strip()

# Base URL of the PeeringDB instance to query.
PDB_BASE = os.getenv("PDB_BASE", "https://www.peeringdb.com")

# PeeringDB net "info_type" label used to select research/education networks.
INFO_TYPE = os.getenv("PDB_INFO_TYPE", "Educational/Research")

# HTTP timeout (seconds) applied to every outbound request.
TIMEOUT = int(os.getenv("HTTP_TIMEOUT", "30"))

# Page size for PeeringDB pagination.
LIMIT = int(os.getenv("PDB_LIMIT", "250"))

# TSV export of German higher-education institutions (hs-kompass); the
# homepage column is parsed into domains by update_uni_domains().
HS_KOMPASS_URL = os.getenv("HS_KOMPASS_URL", "https://hs-kompass.de/kompass/xml/download/hs_liste.txt")

# Comma-separated ISO alpha-2 country codes kept from the Hipo university list.
UNI_DOMAIN_COUNTRIES = os.getenv("UNI_DOMAIN_COUNTRIES", "DE,AT")
def atomic_replace(src_path: str, dst_path: str) -> None:
    """Install *src_path* at *dst_path* atomically (copy to tmp, then rename).

    Creates the destination directory if needed and makes the final file
    world-readable (0o644).
    """
    target_dir = os.path.dirname(dst_path)
    os.makedirs(target_dir, exist_ok=True)
    staging = f"{dst_path}.tmp"
    shutil.copyfile(src_path, staging)
    # os.replace is atomic on POSIX, so readers never observe a partial file.
    os.replace(staging, dst_path)
    os.chmod(dst_path, 0o644)
def download_maxmind_mmdb() -> None:
    """Download the GeoLite2-ASN database and install it into OUT_DIR.

    Uses the official GeoLite2 download mechanism (license_key + edition_id).
    On HTTP 429 an already-present mmdb is kept instead of failing the run.

    Raises:
        RuntimeError: if no license key is configured, if MaxMind rate-limits
            and no previous mmdb exists, or if the archive lacks the mmdb.
    """
    if not LICENSE_KEY:
        raise RuntimeError("MAXMIND_LICENSE_KEY missing")

    # Official GeoLite2 download mechanism via license_key + edition_id.
    url = (
        "https://download.maxmind.com/app/geoip_download"
        f"?edition_id=GeoLite2-ASN&license_key={LICENSE_KEY}&suffix=tar.gz"
    )

    with tempfile.TemporaryDirectory() as td:
        tgz = os.path.join(td, "GeoLite2-ASN.tar.gz")
        r = requests.get(url, timeout=TIMEOUT)
        if r.status_code == 429:
            existing = os.path.join(OUT_DIR, "GeoLite2-ASN.mmdb")
            if os.path.exists(existing):
                print("[warn] MaxMind rate limited (429); keeping existing mmdb")
                return
            raise RuntimeError("MaxMind rate limited (429) and no existing mmdb")
        r.raise_for_status()
        with open(tgz, "wb") as f:
            f.write(r.content)

        mmdb_found = None
        with tarfile.open(tgz, "r:gz") as tar:
            for member in tar.getmembers():
                if member.name.endswith("GeoLite2-ASN.mmdb") and member.isfile():
                    # Stream the member out via extractfile() instead of
                    # tar.extract(): a hostile member name (e.g. "../../x")
                    # can otherwise escape td (tarfile path traversal,
                    # CVE-2007-4559 class).
                    src = tar.extractfile(member)
                    if src is None:
                        continue
                    mmdb_found = os.path.join(td, "GeoLite2-ASN.mmdb")
                    with src, open(mmdb_found, "wb") as out:
                        shutil.copyfileobj(src, out)
                    break

        if not mmdb_found or not os.path.exists(mmdb_found):
            raise RuntimeError("GeoLite2-ASN.mmdb not found in archive")

        atomic_replace(mmdb_found, os.path.join(OUT_DIR, "GeoLite2-ASN.mmdb"))
def pdb_headers():
    """Build HTTP headers for PeeringDB requests.

    Adds an Authorization header when a PeeringDB API key (optional) is set.
    """
    headers = {"Accept": "application/json"}
    if PDB_API_KEY:
        headers["Authorization"] = f"Api-Key {PDB_API_KEY}"
    return headers
def fetch_pdb_page(skip: int, info_type: str):
    """Fetch one page of PeeringDB /api/net objects for *info_type*.

    Returns the raw "data" list from the JSON response (possibly empty).
    Raises for non-2xx HTTP statuses.
    """
    response = requests.get(
        f"{PDB_BASE}/api/net",
        params={
            "info_type": info_type,
            "limit": LIMIT,
            "skip": skip,
            "fields": "asn,status,info_type",
        },
        headers=pdb_headers(),
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    return response.json().get("data", [])
def update_nren_asns() -> str:
    """Collect NREN ASNs from PeeringDB and write them to nren_asns.txt.

    Tries the configured INFO_TYPE first, then alternate labels seen in
    different PeeringDB deployments, keeping the first label that yields
    any ASNs. If no label yields results, the previously written file is
    left untouched (mirrors update_uni_domains()) instead of being
    clobbered with an empty list.

    Returns:
        The info_type label that actually produced results (INFO_TYPE when
        nothing matched).
    """
    info_types = [INFO_TYPE]
    # Alternate labels seen in PeeringDB deployments.
    for alt in ("Research and Education", "Educational/Research"):
        if INFO_TYPE != alt:
            info_types.append(alt)

    asns = set()
    used_info_type = INFO_TYPE
    for info_type in info_types:
        asns.clear()
        skip = 0
        while True:
            data = fetch_pdb_page(skip, info_type)
            for obj in data:
                if obj.get("status") != "ok":
                    continue
                asn = obj.get("asn")
                if isinstance(asn, int) and asn > 0:
                    asns.add(asn)
            # A short page means we reached the end of the result set.
            if len(data) < LIMIT:
                break
            skip += LIMIT
            time.sleep(1.1)  # very conservative paging delay
        if asns:
            used_info_type = info_type
            break

    if not asns:
        # Skip the write so a transient PeeringDB problem cannot wipe out a
        # previously good nren_asns.txt (same policy as update_uni_domains).
        print(f"[warn] no ASNs found for info_type(s)={info_types}")
        return used_info_type

    out_txt = os.path.join(OUT_DIR, "nren_asns.txt")
    # Temp file in the same directory so os.replace stays atomic.
    with tempfile.NamedTemporaryFile("w", delete=False, dir=OUT_DIR) as f:
        for a in sorted(asns):
            f.write(f"{a}\n")
        tmp_path = f.name
    os.replace(tmp_path, out_txt)
    os.chmod(out_txt, 0o644)
    return used_info_type
def _extract_hostname(homepage: str) -> str:
    """Normalize a homepage URL/host to a bare lowercase hostname ('' if none)."""
    if not homepage.startswith(("http://", "https://")):
        homepage = "http://" + homepage
    hostname = (urlparse(homepage).hostname or "").lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    return hostname


def _fetch_hs_kompass_domains() -> set:
    """Fetch the hs-kompass TSV and return the set of homepage domains.

    Best-effort: fetch/parse failures are logged and yield an empty set.
    """
    domains: set = set()
    try:
        r = requests.get(HS_KOMPASS_URL, timeout=TIMEOUT)
        r.raise_for_status()
        for line in r.text.splitlines():
            parts = line.split("\t")
            # Column 20 holds the institution homepage; skip short rows.
            if len(parts) <= 20:
                continue
            homepage = parts[20].strip()
            if not homepage:
                continue
            try:
                hostname = _extract_hostname(homepage)
            except Exception:
                continue
            if hostname:
                domains.add(hostname)
    except Exception as err:
        print(f"[warn] hs-kompass fetch failed: {err}")
    return domains


def _fetch_hipo_domains(country_set: set) -> set:
    """Fetch the Hipo world-universities JSON; return domains for *country_set*.

    Best-effort: fetch/parse failures are logged and yield an empty set.
    """
    domains: set = set()
    try:
        r = requests.get(
            "https://raw.githubusercontent.com/Hipo/university-domains-list/master/world_universities_and_domains.json",
            timeout=TIMEOUT,
        )
        r.raise_for_status()
        for entry in r.json():
            if entry.get("alpha_two_code") in country_set:
                for d in entry.get("domains", []):
                    domains.add(d.lower().strip())
    except Exception as err:
        print(f"[warn] hipo fetch failed: {err}")
    return domains


def update_uni_domains() -> int:
    """Rebuild uni_domains.txt from hs-kompass + Hipo sources.

    Merges both sources, writes the sorted union atomically, and records
    per-source counts in uni_domains_meta.json. If the merge is empty the
    existing file is preserved and nothing is written.

    Returns:
        Number of domains written (0 when the write was skipped).
    """
    country_set = {c.strip() for c in UNI_DOMAIN_COUNTRIES.split(",")}

    hs_domains = _fetch_hs_kompass_domains()
    hipo_domains = _fetch_hipo_domains(country_set)

    if len(hs_domains) == 0 or len(hipo_domains) == 0:
        print(f"[warn] uni_domains: hs_kompass={len(hs_domains)} hipo={len(hipo_domains)}")

    merged = hs_domains | hipo_domains
    if len(merged) == 0:
        print("[warn] uni_domains update produced 0 entries — skipping write to preserve existing file")
        return 0

    out_txt = os.path.join(OUT_DIR, "uni_domains.txt")
    # Temp file in the same directory so os.replace stays atomic.
    with tempfile.NamedTemporaryFile("w", delete=False, dir=OUT_DIR) as f:
        for d in sorted(merged):
            f.write(f"{d}\n")
        tmp_path = f.name
    os.replace(tmp_path, out_txt)
    os.chmod(out_txt, 0o644)

    meta = {
        "hs_kompass": len(hs_domains),
        "hipo": len(hipo_domains),
        "total": len(merged),
        "updated_at_unix": int(time.time()),
    }
    meta_path = os.path.join(OUT_DIR, "uni_domains_meta.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f)
    os.chmod(meta_path, 0o644)

    return len(merged)
def write_meta(info_type: str):
    """Persist run metadata (timestamp, info_type, PeeringDB base) as JSON."""
    meta_path = os.path.join(OUT_DIR, "metadata.json")
    payload = {
        "updated_at_unix": int(time.time()),
        "info_type": info_type,
        "pdb_base": PDB_BASE,
    }
    with open(meta_path, "w") as f:
        json.dump(payload, f, indent=2)
    os.chmod(meta_path, 0o644)
def main():
    """Refresh every artifact: GeoLite2 mmdb, NREN ASN list, uni domain list."""
    os.makedirs(OUT_DIR, exist_ok=True)
    download_maxmind_mmdb()
    info_type = update_nren_asns()
    domain_count = update_uni_domains()
    write_meta(info_type)
    print(f"[ok] updated mmdb + nren_asns + uni_domains (domain_count={domain_count})")


if __name__ == "__main__":
    main()