diff --git a/README.md b/README.md index f9f5b41..6e64f0e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,11 @@ Client - Kategorie: `Research and Education` - monatliche Aktualisierung +- **Hochschul-Domainliste (`uni_domains.txt`)** + - zusammengeführt aus hs-kompass.de (DE) und Hipo university-domains-list (DE+AT, konfigurierbar) + - fängt Hochschulen ab, die externe Mail-Provider nutzen und daher kein eigenes NREN-AS haben + - monatliche Aktualisierung durch den Updater-Sidecar + --- ## Bereitgestellte Header @@ -62,10 +67,17 @@ Client ## Domain-Lookup (optional) -Für die Validierung von Institutions-Domains kann ein Lookup genutzt werden: +Für die Validierung von Institutions-Domains kann ein Lookup genutzt werden. Der Service prüft sowohl die ASN-Datenbank als auch eine gepflegte Liste von Hochschuldomains (`uni_domains.txt`). +**Abfrage per Domain:** ``` GET /lookup?domain=uni-stuttgart.de +GET /lookup?domain=insti.uni-stuttgart.de +``` + +**Abfrage per E-Mail-Adresse** (Domain wird automatisch extrahiert): +``` +GET /lookup?email=student@uni-stuttgart.de ``` Antwort (JSON): @@ -73,6 +85,8 @@ Antwort (JSON): { "domain": "uni-stuttgart.de", "nren": true, + "asn_match": true, + "domain_match": false, "asn": 12345, "asn_org": "Universitaet Stuttgart", "ips": ["129.69.1.1"], @@ -80,6 +94,23 @@ Antwort (JSON): } ``` +Antwort bei Treffer über die Domain-Liste (z. B. für Hochschulen mit externem Mail-Provider): +```json +{ + "domain": "hdm-stuttgart.de", + "nren": true, + "asn_match": false, + "domain_match": true, + "matched_domain": "hdm-stuttgart.de", + "ips": ["..."] +} +``` + +- `nren`: `true` wenn `asn_match` ODER `domain_match` zutrifft +- `asn_match`: ASN-Treffer in PeeringDB-Daten +- `domain_match`: Treffer in `uni_domains.txt` (inkl. 
Subdomain-Matching) +- `matched_domain`: der gematchte Eintrag in der Liste (nur bei `domain_match: true`) + --- ## Integration @@ -100,7 +131,8 @@ Der Service ist **nicht öffentlich exponiert** und kommuniziert ausschließlich ## Healthcheck -- `GET /healthz` liefert `200`, wenn mindestens `MIN_ASN_COUNT` ASNs geladen sind +- `GET /healthz` liefert `200` wenn mindestens `MIN_ASN_COUNT` ASNs geladen sind, sonst `503` +- Antwort ist immer JSON: `{"asn_count": N, "domain_count": N}` - Standard: `MIN_ASN_COUNT=10` (konfigurierbar via Env) --- diff --git a/asn-updater/README.md b/asn-updater/README.md index a38b00a..1fd8455 100644 --- a/asn-updater/README.md +++ b/asn-updater/README.md @@ -42,6 +42,11 @@ Client - Kategorie: `Research and Education` - monatliche Aktualisierung +- **Hochschul-Domainliste (`uni_domains.txt`)** + - zusammengeführt aus hs-kompass.de TSV und Hipo university-domains-list JSON + - Länderfilter konfigurierbar via `UNI_DOMAIN_COUNTRIES` (Standard: `DE,AT`) + - nach Update: `uni_domains_meta.json` mit Zählern je Quelle + ## Bereitgestellte Header | Header | Beschreibung | @@ -85,6 +90,8 @@ Bitte füge diese zu dem Service hinzu, bei welchem man die gewünschten Header - `PDB_BASE`, `PDB_INFO_TYPE`, `PDB_LIMIT`: PeeringDB Filter. - `HTTP_TIMEOUT`: Timeout pro HTTP-Request. - `INTERVAL_SECONDS`: Update-Intervall (Standard 30 Tage). +- `UNI_DOMAIN_COUNTRIES`: ISO-Ländercodes für Hipo-Filter (Standard: `DE,AT`). +- `HS_KOMPASS_URL`: URL der hs-kompass.de TSV-Datei (überschreibbar ohne Image-Rebuild). 
# Hipo university-domains-list: one JSON array covering all countries; the
# country filter is applied locally after download.
_HIPO_DOMAINS_URL = (
    "https://raw.githubusercontent.com/Hipo/university-domains-list/"
    "master/world_universities_and_domains.json"
)

# Zero-based index of the homepage column in the hs-kompass TSV export.
_HS_KOMPASS_HOMEPAGE_COLUMN = 20


def _parse_country_codes(raw: str) -> set:
    """Return the upper-cased, non-empty codes of a comma-separated string."""
    return {code.strip().upper() for code in raw.split(",") if code.strip()}


def _clean_domain(value) -> str:
    """Return *value* as a lower-cased domain, or "" if it is not usable.

    Rejects non-strings, names without a dot and names containing whitespace;
    a trailing root dot is removed.
    """
    if not isinstance(value, str):
        return ""
    domain = value.strip().lower().rstrip(".")
    if "." not in domain or any(ch.isspace() for ch in domain):
        return ""
    return domain


def _homepage_to_domain(homepage: str) -> str:
    """Return the host of a homepage URL without a leading ``www.``, or ""."""
    url = homepage.strip()
    if not url:
        return ""
    # Case-insensitive scheme check: prefixing "http://" to "HTTP://host"
    # would make urlparse report the hostname "http".
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    try:
        hostname = (urlparse(url).hostname or "").lower()
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced brackets).
        return ""
    if hostname.startswith("www."):
        hostname = hostname[4:]
    return _clean_domain(hostname)


def _domains_from_hs_kompass(tsv_text: str) -> set:
    """Collect homepage domains from the hs-kompass tab-separated text."""
    domains = set()
    for line in tsv_text.splitlines():
        columns = line.split("\t")
        if len(columns) <= _HS_KOMPASS_HOMEPAGE_COLUMN:
            continue
        domain = _homepage_to_domain(columns[_HS_KOMPASS_HOMEPAGE_COLUMN])
        if domain:
            domains.add(domain)
    return domains


def _domains_from_hipo(entries, countries: set) -> set:
    """Collect domains of Hipo entries whose ``alpha_two_code`` is in *countries*.

    Malformed entries (non-dict items, non-list ``domains``, non-string
    domains) are skipped individually instead of aborting the whole parse.
    """
    domains = set()
    if not isinstance(entries, list):
        return domains
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        code = entry.get("alpha_two_code")
        if not isinstance(code, str) or code.upper() not in countries:
            continue
        raw_domains = entry.get("domains")
        if not isinstance(raw_domains, list):
            continue
        for raw in raw_domains:
            domain = _clean_domain(raw)
            if domain:
                domains.add(domain)
    return domains


def _atomic_write_text(path: str, text: str, mode: int = 0o644) -> None:
    """Write *text* to *path* as UTF-8 via a temp file in the same directory.

    Permissions are set before the rename so a reader never sees the final
    name with the restrictive temp-file mode; the temp file is removed if
    anything fails.
    """
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(text)
        os.chmod(tmp_path, mode)
        os.replace(tmp_path, path)
    except BaseException:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise


def update_uni_domains() -> int:
    """Rebuild ``uni_domains.txt`` from hs-kompass and the Hipo domain list.

    Both sources are fetched best-effort: a failing source only logs a
    warning and contributes no domains. If the merged set is empty, nothing
    is written so an existing list is preserved. Otherwise the sorted list
    and ``uni_domains_meta.json`` (per-source counts, total, timestamp) are
    written atomically into ``OUT_DIR``.

    Returns:
        Number of domains written, or 0 when the write was skipped.
    """
    countries = _parse_country_codes(UNI_DOMAIN_COUNTRIES)

    hs_domains: set = set()
    try:
        resp = requests.get(HS_KOMPASS_URL, timeout=TIMEOUT)
        resp.raise_for_status()
        hs_domains = _domains_from_hs_kompass(resp.text)
    except requests.RequestException as err:
        print(f"[warn] hs-kompass fetch failed: {err}")

    hipo_domains: set = set()
    try:
        resp = requests.get(_HIPO_DOMAINS_URL, timeout=TIMEOUT)
        resp.raise_for_status()
        # resp.json() raises a ValueError subclass on an undecodable body.
        hipo_domains = _domains_from_hipo(resp.json(), countries)
    except (requests.RequestException, ValueError) as err:
        print(f"[warn] hipo fetch failed: {err}")

    if not hs_domains or not hipo_domains:
        print(f"[warn] uni_domains: hs_kompass={len(hs_domains)} hipo={len(hipo_domains)}")

    merged = hs_domains | hipo_domains
    if not merged:
        print("[warn] uni_domains update produced 0 entries — skipping write to preserve existing file")
        return 0

    _atomic_write_text(
        os.path.join(OUT_DIR, "uni_domains.txt"),
        "".join(f"{d}\n" for d in sorted(merged)),
    )

    meta = {
        "hs_kompass": len(hs_domains),
        "hipo": len(hipo_domains),
        "total": len(merged),
        "updated_at_unix": int(time.time()),
    }
    _atomic_write_text(
        os.path.join(OUT_DIR, "uni_domains_meta.json"),
        json.dumps(meta),
    )

    return len(merged)
866b001..93e4e5a 100644 --- a/example.env +++ b/example.env @@ -17,3 +17,6 @@ HTTP_TIMEOUT=30 # Update interval (seconds, default 30 days) INTERVAL_SECONDS=2592000 + +# University domain list path (populated by asn-updater) +UNI_DOMAINS_PATH=/data/uni_domains.txt diff --git a/main.go b/main.go index 6df6ef7..31cb46e 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "bufio" "encoding/json" + "fmt" "log" "net" "net/http" @@ -21,18 +22,22 @@ type asnRecord struct { } type lookupResponse struct { - Domain string `json:"domain"` - NREN bool `json:"nren"` - ASN *uint `json:"asn,omitempty"` - ASNOrg string `json:"asn_org,omitempty"` - IPs []string `json:"ips"` - MatchedIP string `json:"matched_ip,omitempty"` - Error string `json:"error,omitempty"` + Domain string `json:"domain"` + NREN bool `json:"nren"` + ASNMatch bool `json:"asn_match"` + DomainMatch bool `json:"domain_match"` + MatchedDomain string `json:"matched_domain,omitempty"` + ASN *uint `json:"asn,omitempty"` + ASNOrg string `json:"asn_org,omitempty"` + IPs []string `json:"ips"` + MatchedIP string `json:"matched_ip,omitempty"` + Error string `json:"error,omitempty"` } type server struct { db *maxminddb.Reader nrenASNs map[uint]struct{} + uniDomains map[string]struct{} ready atomic.Bool versionTag string minASN int @@ -62,6 +67,65 @@ func loadASNSet(path string) (map[uint]struct{}, error) { return set, sc.Err() } +func loadDomainSet(path string) (map[string]struct{}, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + set := make(map[string]struct{}, 2048) + sc := bufio.NewScanner(f) + for sc.Scan() { + line := strings.ToLower(strings.TrimSpace(sc.Text())) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + if !strings.Contains(line, ".") { + log.Printf("[warn] skipping invalid domain entry: %s", line) + continue + } + set[line] = struct{}{} + } + return set, sc.Err() +} + +// matchesUniDomain checks if domain or any parent domain 
// extractDomain normalises a /lookup input to a bare, lower-cased domain.
//
// s may be a plain domain or an e-mail address. For an address, the text
// after the LAST "@" is used: a quoted local part may itself contain "@"
// (for example `"a@b"@uni-stuttgart.de`), so splitting on the first "@"
// would return a string that is not a domain. Surrounding whitespace is
// trimmed. Input without "@" is returned with the same normalisation.
func extractDomain(s string) string {
	if at := strings.LastIndex(s, "@"); at >= 0 {
		s = s[at+1:]
	}
	return strings.ToLower(strings.TrimSpace(s))
}
resp.Error = "email param has no @ — interpreted as bare domain" + } + } else if domainParam != "" { + rawInput = strings.TrimSpace(domainParam) + } else { writeJSON(w, http.StatusBadRequest, lookupResponse{ NREN: false, Error: "missing domain", @@ -151,55 +237,53 @@ func (s *server) lookupHandler(w http.ResponseWriter, r *http.Request) { return } - ips, err := net.LookupIP(domain) - if err != nil || len(ips) == 0 { - writeJSON(w, http.StatusOK, lookupResponse{ - Domain: domain, - NREN: false, - Error: "domain lookup failed", - }) - return - } + domain := extractDomain(rawInput) + resp.Domain = domain - resp := lookupResponse{ - Domain: domain, - NREN: false, - IPs: make([]string, 0, len(ips)), - } + ips, dnsErr := net.LookupIP(domain) + if dnsErr == nil && len(ips) > 0 { + resp.IPs = make([]string, 0, len(ips)) + var firstASN *uint + var firstOrg string - var firstASN *uint - var firstOrg string + for _, ip := range ips { + ipStr := ip.String() + resp.IPs = append(resp.IPs, ipStr) - for _, ip := range ips { - ipStr := ip.String() - resp.IPs = append(resp.IPs, ipStr) + if s.db == nil { + continue + } + var rec asnRecord + if err := s.db.Lookup(ip, &rec); err != nil || rec.ASN == 0 { + continue + } - var rec asnRecord - if err := s.db.Lookup(ip, &rec); err != nil || rec.ASN == 0 { - continue + if firstASN == nil { + firstASN = new(uint) + *firstASN = rec.ASN + firstOrg = rec.Org + } + + if _, ok := s.nrenASNs[rec.ASN]; ok { + asn := rec.ASN + resp.ASNMatch = true + resp.ASN = &asn + resp.ASNOrg = rec.Org + resp.MatchedIP = ipStr + break + } } - if firstASN == nil { - firstASN = new(uint) - *firstASN = rec.ASN - firstOrg = rec.Org - } - - if _, ok := s.nrenASNs[rec.ASN]; ok { - asn := rec.ASN - resp.NREN = true - resp.ASN = &asn - resp.ASNOrg = rec.Org - resp.MatchedIP = ipStr - writeJSON(w, http.StatusOK, resp) - return + if !resp.ASNMatch && firstASN != nil { + resp.ASN = firstASN + resp.ASNOrg = firstOrg } + } else if resp.Error == "" { + resp.Error = "domain 
lookup failed" } - if firstASN != nil { - resp.ASN = firstASN - resp.ASNOrg = firstOrg - } + resp.DomainMatch, resp.MatchedDomain = matchesUniDomain(domain, s.uniDomains) + resp.NREN = resp.ASNMatch || resp.DomainMatch writeJSON(w, http.StatusOK, resp) } @@ -223,9 +307,21 @@ func main() { } asnCount := len(set) + uniDomainsPath := getenv("UNI_DOMAINS_PATH", "/data/uni_domains.txt") + uniDomains, err := loadDomainSet(uniDomainsPath) + if err != nil { + if os.IsNotExist(err) { + log.Printf("[warn] uni_domains.txt not found — domain_match will always be false") + } else { + log.Printf("[warn] failed to load uni_domains.txt: %v", err) + } + uniDomains = make(map[string]struct{}) + } + s := &server{ db: db, nrenASNs: set, + uniDomains: uniDomains, versionTag: version, minASN: minASN, asnCount: asnCount, @@ -235,13 +331,7 @@ func main() { mux := http.NewServeMux() mux.HandleFunc("/auth", s.authHandler) mux.HandleFunc("/lookup", s.lookupHandler) - mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { - if s.asnCount < s.minASN { - w.WriteHeader(http.StatusServiceUnavailable) - return - } - w.WriteHeader(http.StatusOK) - }) + mux.HandleFunc("/healthz", s.healthzHandler) srv := &http.Server{ Addr: addr, @@ -249,7 +339,7 @@ func main() { ReadHeaderTimeout: 2 * time.Second, } - log.Printf("listening on %s (asn_count=%d, min_asn=%d)", addr, asnCount, minASN) + log.Printf("listening on %s (asn_count=%d, min_asn=%d, domain_count=%d)", addr, asnCount, minASN, len(uniDomains)) log.Fatal(srv.ListenAndServe()) } diff --git a/main_test.go b/main_test.go index aec86c6..e6abd85 100644 --- a/main_test.go +++ b/main_test.go @@ -44,3 +44,165 @@ func TestLookupServiceNotReady(t *testing.T) { t.Fatalf("expected 503, got %d", rr.Code) } } + +func TestMatchesUniDomain(t *testing.T) { + set := map[string]struct{}{ + "uni-stuttgart.de": {}, + "hdm-stuttgart.de": {}, + } + tests := []struct { + domain string + wantMatch bool + wantEntry string + }{ + {"uni-stuttgart.de", 
true, "uni-stuttgart.de"}, // exact match + {"insti.uni-stuttgart.de", true, "uni-stuttgart.de"}, // single-level subdomain + {"a.b.uni-stuttgart.de", true, "uni-stuttgart.de"}, // multi-level subdomain + {"evil-uni-stuttgart.de", false, ""}, // lookalike non-match (different label) + {"example.com", false, ""}, // not in set + {"com", false, ""}, // single-label input + {"uni-stuttgart.de.", true, "uni-stuttgart.de"}, // trailing dot normalised + {"", false, ""}, // empty string + } + for _, tc := range tests { + got, entry := matchesUniDomain(tc.domain, set) + if got != tc.wantMatch || entry != tc.wantEntry { + t.Errorf("matchesUniDomain(%q): got (%v, %q), want (%v, %q)", + tc.domain, got, entry, tc.wantMatch, tc.wantEntry) + } + } +} + +func TestExtractDomain(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"uni-stuttgart.de", "uni-stuttgart.de"}, // plain domain passthrough + {"foo@uni-stuttgart.de", "uni-stuttgart.de"}, // email extraction + {"FOO@UNI-STUTTGART.DE", "uni-stuttgart.de"}, // uppercase normalisation + {" foo@uni-stuttgart.de ", "uni-stuttgart.de"}, // leading/trailing spaces + {"notanemail", "notanemail"}, // no-@ passthrough + } + for _, tc := range tests { + got := extractDomain(tc.input) + if got != tc.want { + t.Errorf("extractDomain(%q): got %q, want %q", tc.input, got, tc.want) + } + } +} + +func TestLookupDomainMatch(t *testing.T) { + s := &server{ + nrenASNs: map[uint]struct{}{}, + uniDomains: map[string]struct{}{"uni-stuttgart.de": {}}, + } + s.ready.Store(true) + + req := httptest.NewRequest(http.MethodGet, "/lookup?domain=insti.uni-stuttgart.de", nil) + rr := httptest.NewRecorder() + s.lookupHandler(rr, req) + + body := rr.Body.String() + if !strings.Contains(body, `"domain_match":true`) { + t.Errorf("expected domain_match:true in %s", body) + } + if !strings.Contains(body, `"matched_domain":"uni-stuttgart.de"`) { + t.Errorf("expected matched_domain in %s", body) + } + if !strings.Contains(body, `"nren":true`) 
{ + t.Errorf("expected nren:true in %s", body) + } + if !strings.Contains(body, `"asn_match":false`) { + t.Errorf("expected asn_match:false in %s", body) + } +} + +func TestLookupEmailParam(t *testing.T) { + s := &server{ + nrenASNs: map[uint]struct{}{}, + uniDomains: map[string]struct{}{"uni-stuttgart.de": {}}, + } + s.ready.Store(true) + + req := httptest.NewRequest(http.MethodGet, "/lookup?email=student%40insti.uni-stuttgart.de", nil) + rr := httptest.NewRecorder() + s.lookupHandler(rr, req) + + body := rr.Body.String() + if !strings.Contains(body, `"domain_match":true`) { + t.Errorf("expected domain_match:true in %s", body) + } + if !strings.Contains(body, `"matched_domain":"uni-stuttgart.de"`) { + t.Errorf("expected matched_domain in %s", body) + } + if !strings.Contains(body, `"nren":true`) { + t.Errorf("expected nren:true in %s", body) + } +} + +func TestLookupEmailPrecedence(t *testing.T) { + s := &server{ + nrenASNs: map[uint]struct{}{}, + uniDomains: map[string]struct{}{"uni-stuttgart.de": {}}, + } + s.ready.Store(true) + + // email= takes precedence over domain=; example.com is not in uniDomains + req := httptest.NewRequest(http.MethodGet, "/lookup?email=a%40uni-stuttgart.de&domain=example.com", nil) + rr := httptest.NewRecorder() + s.lookupHandler(rr, req) + + body := rr.Body.String() + if !strings.Contains(body, `"domain":"uni-stuttgart.de"`) { + t.Errorf("expected domain uni-stuttgart.de (from email param) in %s", body) + } + if !strings.Contains(body, `"nren":true`) { + t.Errorf("expected nren:true in %s", body) + } +} + +func TestHealthzJSON(t *testing.T) { + s := &server{ + asnCount: 42, + minASN: 10, + uniDomains: map[string]struct{}{"uni-stuttgart.de": {}, "hdm-stuttgart.de": {}}, + } + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + rr := httptest.NewRecorder() + s.healthzHandler(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", rr.Code) + } + body := rr.Body.String() + if !strings.Contains(body, 
`"asn_count":42`) { + t.Errorf("expected asn_count:42 in %s", body) + } + if !strings.Contains(body, `"domain_count":2`) { + t.Errorf("expected domain_count:2 in %s", body) + } + if ct := rr.Header().Get("Content-Type"); !strings.Contains(ct, "application/json") { + t.Errorf("expected Content-Type application/json, got %s", ct) + } +} + +func TestLookupEmailNoAt(t *testing.T) { + s := &server{ + nrenASNs: map[uint]struct{}{}, + uniDomains: map[string]struct{}{}, + } + s.ready.Store(true) + + req := httptest.NewRequest(http.MethodGet, "/lookup?email=notanemail", nil) + rr := httptest.NewRecorder() + s.lookupHandler(rr, req) + + if rr.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", rr.Code) + } + if !strings.Contains(rr.Body.String(), "email param has no @") { + t.Fatalf("expected error message in response, got: %s", rr.Body.String()) + } +}