Initial import: web4beginners editor and deployment setup

2026-03-06 13:49:43 +01:00
commit fd9ea482bf
73 changed files with 4043 additions and 0 deletions
--- a/scripts/extract_dom_content.php
+++ b/scripts/extract_dom_content.php
@@ -0,0 +1,318 @@
+#!/usr/bin/env php
+<?php
+declare(strict_types=1);
+
+// CLI entry point: reads the HTML file named by the first argument, extracts
+// its meta data, shared page areas and main sections, and writes the result as
+// pretty-printed JSON to the path named by the second argument. Every failure
+// is reported on STDERR and ends the script with exit status 1.
+if ($argc < 3) {
+    fwrite(STDERR, "Usage: php scripts/extract_dom_content.php <input_html> <output_json>\n");
+    exit(1);
+}
+
+$inputHtml = $argv[1];
+$outputJson = $argv[2];
+
+if (!is_file($inputHtml)) {
+    fwrite(STDERR, "Input file not found: {$inputHtml}\n");
+    exit(1);
+}
+
+// Buffer libxml diagnostics internally instead of emitting PHP warnings.
+libxml_use_internal_errors(true);
+$dom = new DOMDocument();
+$html = file_get_contents($inputHtml);
+if ($html === false) {
+    fwrite(STDERR, "Failed to read input file: {$inputHtml}\n");
+    exit(1);
+}
+
+// DOMDocument::loadHTML() throws a ValueError for an empty source string, so
+// an empty file is reported like every other input problem instead.
+if ($html === '') {
+    fwrite(STDERR, "Input file is empty: {$inputHtml}\n");
+    exit(1);
+}
+
+// loadHTML() does not assume UTF-8 when the markup carries no (or a late)
+// charset declaration, which would turn multi-byte characters into mojibake
+// in the JSON. Rewriting non-ASCII characters as numeric character references
+// makes the parse result independent of the encoding libxml guesses.
+$loaded = $dom->loadHTML(
+    encodeNonAsciiAsEntities($html),
+    LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET
+);
+if (!$loaded) {
+    fwrite(STDERR, "Failed to parse HTML: {$inputHtml}\n");
+    exit(1);
+}
+
+$xpath = new DOMXPath($dom);
+
+$data = [
+    'meta' => extractMeta($xpath, $dom),
+    'shared' => extractShared($xpath),
+    'sections' => extractMainSections($xpath),
+];
+
+$json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
+if ($json === false) {
+    fwrite(STDERR, "Failed to encode JSON output.\n");
+    exit(1);
+}
+
+if (file_put_contents($outputJson, $json . "\n") === false) {
+    fwrite(STDERR, "Failed to write output file: {$outputJson}\n");
+    exit(1);
+}
+
+fwrite(STDOUT, "Wrote extracted content to {$outputJson}\n");
+
+/**
+ * Rewrites every non-ASCII character of a valid UTF-8 string as a decimal
+ * numeric character reference (for example "ä" becomes "&#228;").
+ *
+ * A leading UTF-8 byte order mark is removed first so it cannot end up as
+ * text in front of the doctype. Input that is not valid UTF-8 is returned
+ * unchanged, leaving any charset declared in the markup in charge.
+ *
+ * @param string $html Raw markup as read from disk.
+ *
+ * @return string Markup containing only ASCII bytes, or the untouched input
+ *                when it is not valid UTF-8.
+ */
+function encodeNonAsciiAsEntities(string $html): string
+{
+    $withoutBom = str_starts_with($html, "\xEF\xBB\xBF") ? substr($html, 3) : $html;
+
+    // With the "u" modifier the pattern matches whole code points above
+    // U+007F; preg_replace_callback() yields null for malformed UTF-8.
+    $encoded = preg_replace_callback(
+        '/[^\x00-\x7F]/u',
+        static function (array $match): string {
+            $bytes = $match[0];
+            $length = strlen($bytes);
+
+            // The lead byte keeps 5, 4 or 3 payload bits for sequences of
+            // 2, 3 or 4 bytes; every continuation byte contributes 6 bits.
+            $codePoint = ord($bytes[0]) & (0xFF >> ($length + 1));
+            for ($i = 1; $i < $length; $i++) {
+                $codePoint = ($codePoint << 6) | (ord($bytes[$i]) & 0x3F);
+            }
+
+            return '&#' . $codePoint . ';';
+        },
+        $withoutBom
+    );
+
+    return $encoded ?? $html;
+}
+
+/**
+ * Gathers document-level meta data: the <title> text, the meta description,
+ * Open Graph tags, Twitter tags and a fixed list of other named meta tags.
+ *
+ * Entries whose name or normalized content is empty are left out, and each
+ * of the three tag groups is sorted by key.
+ *
+ * @return array{title: string, description: string, open_graph: array<string, string>, twitter: array<string, string>, other: array<string, string>}
+ */
+function extractMeta(DOMXPath $xpath, DOMDocument $dom): array
+{
+    $titleElement = $dom->getElementsByTagName('title')->item(0);
+    $title = normalizeText($titleElement?->textContent ?? '');
+    $description = firstAttrValue($xpath, "//meta[@name='description']", 'content');
+
+    $openGraph = [];
+    $ogResult = $xpath->query("//meta[starts-with(@property, 'og:') or starts-with(@name, 'og:')]");
+    foreach (($ogResult instanceof DOMNodeList ? $ogResult : []) as $element) {
+        if ($element instanceof DOMElement) {
+            // Prefer the "property" attribute; fall back to "name" when it is falsy.
+            $label = trim($element->getAttribute('property') ?: $element->getAttribute('name'));
+            $content = normalizeText($element->getAttribute('content'));
+            if ($label !== '' && $content !== '') {
+                $openGraph[$label] = $content;
+            }
+        }
+    }
+
+    $twitter = [];
+    $twitterResult = $xpath->query("//meta[starts-with(@name, 'twitter:')]");
+    foreach (($twitterResult instanceof DOMNodeList ? $twitterResult : []) as $element) {
+        if ($element instanceof DOMElement) {
+            $label = trim($element->getAttribute('name'));
+            $content = normalizeText($element->getAttribute('content'));
+            if ($label !== '' && $content !== '') {
+                $twitter[$label] = $content;
+            }
+        }
+    }
+
+    $other = [];
+    foreach (['title'] as $metaName) {
+        $content = firstAttrValue($xpath, "//meta[@name='{$metaName}']", 'content');
+        if ($content !== '') {
+            $other[$metaName] = $content;
+        }
+    }
+
+    ksort($openGraph);
+    ksort($twitter);
+    ksort($other);
+
+    return [
+        'title' => $title,
+        'description' => $description,
+        'open_graph' => $openGraph,
+        'twitter' => $twitter,
+        'other' => $other,
+    ];
+}
+
+/**
+ * Collects texts and images of the page areas shared across the site: the
+ * navigation header, the cookie layer and the footer.
+ *
+ * Each distinct text is stored once under a generated "common_NNN" key in
+ * "common_texts"; every area then lists the keys of its texts in document
+ * order. Areas that are not found keep their empty defaults.
+ *
+ * @return array<string, array> "common_texts" plus one entry per shared area
+ *                              holding "text_keys" and "images".
+ */
+function extractShared(DOMXPath $xpath): array
+{
+    $areaQueries = [
+        'navigation' => "//header[contains(@class, 'page-head')]",
+        'cookie_layer' => "//*[@id='cookie-layer']",
+        'footer' => "//footer[contains(@class, 'site-footer')]",
+    ];
+
+    $result = ['common_texts' => []];
+    foreach (array_keys($areaQueries) as $area) {
+        $result[$area] = ['text_keys' => [], 'images' => []];
+    }
+
+    $textsByKey = [];
+    $keysByText = [];
+
+    foreach ($areaQueries as $area => $query) {
+        $container = firstNode($xpath, $query);
+        if ($container === null) {
+            continue;
+        }
+
+        foreach (extractTexts($xpath, $container) as $phrase) {
+            if (!isset($keysByText[$phrase])) {
+                // Number keys by the order in which distinct texts first appear.
+                $newKey = sprintf('common_%03d', count($textsByKey) + 1);
+                $keysByText[$phrase] = $newKey;
+                $textsByKey[$newKey] = $phrase;
+            }
+            $result[$area]['text_keys'][] = $keysByText[$phrase];
+        }
+
+        $result[$area]['images'] = extractImages($xpath, $container);
+    }
+
+    $result['common_texts'] = $textsByKey;
+
+    return $result;
+}
+
+/**
+ * Extracts texts and images for each known section inside <main>.
+ *
+ * Every section name maps to an XPath query; only the first match of a query
+ * is used, and sections without a match are omitted from the result.
+ *
+ * @return array<string, array{texts: array<string, string>, images: array<string, array{src: string, alt: string}>}>
+ */
+function extractMainSections(DOMXPath $xpath): array
+{
+    $queriesByName = [
+        'hero' => "//main//section[contains(@class, 'module-hero-teaser')]",
+        'page_header' => "//main//section[contains(@class, 'page-header')]",
+        'projects' => "//main//section[contains(@class, 'module-projects-teaser')]",
+        'services' => "//main//section[.//*[contains(@class, 'services-teaser__content')]]",
+        'team' => "(//main//section[contains(@class, 'text-image')])[1]",
+        'awards' => "//main//section[.//*[contains(@class, 'awards-teaser__content')]]",
+        'contact' => "//main//section[contains(@class, 'contact-teaser')]",
+        'clients' => "//main//section[.//*[contains(@class, 'clients-teaser__content')]]",
+        'partners' => "(//main//section[contains(@class, 'text-image')])[2]",
+    ];
+
+    $found = [];
+    foreach ($queriesByName as $sectionName => $sectionQuery) {
+        $sectionNode = firstNode($xpath, $sectionQuery);
+        if ($sectionNode !== null) {
+            $found[$sectionName] = [
+                'texts' => extractKeyValueTexts($xpath, $sectionNode),
+                'images' => extractImages($xpath, $sectionNode),
+            ];
+        }
+    }
+
+    return $found;
+}
+
+/**
+ * Returns the visible, non-empty texts below $root keyed by the tag of their
+ * parent element plus a per-tag running number (e.g. "h2_001", "p_003").
+ *
+ * Texts are normalized with normalizeText(); texts rejected by
+ * isVisibleTextNode() or without an element parent are skipped and do not
+ * advance the numbering.
+ *
+ * @return array<string, string>
+ */
+function extractKeyValueTexts(DOMXPath $xpath, DOMNode $root): array
+{
+    $found = $xpath->query('.//text()', $root);
+    if (!($found instanceof DOMNodeList)) {
+        return [];
+    }
+
+    $seenPerTag = [];
+    $keyed = [];
+
+    foreach ($found as $candidate) {
+        if (!($candidate instanceof DOMText)) {
+            continue;
+        }
+
+        $content = normalizeText($candidate->wholeText);
+        if ($content === '' || !isVisibleTextNode($candidate)) {
+            continue;
+        }
+
+        $owner = $candidate->parentNode;
+        if ($owner instanceof DOMElement) {
+            $tagName = strtolower($owner->tagName);
+            $ordinal = ($seenPerTag[$tagName] ?? 0) + 1;
+            $seenPerTag[$tagName] = $ordinal;
+            $keyed[sprintf('%s_%03d', $tagName, $ordinal)] = $content;
+        }
+    }
+
+    return $keyed;
+}
+
+/**
+ * Returns the visible, non-empty texts below $root as a plain list in
+ * document order.
+ *
+ * Each text is normalized with normalizeText(); texts rejected by
+ * isVisibleTextNode() are dropped. Duplicates are kept.
+ *
+ * @return list<string>
+ */
+function extractTexts(DOMXPath $xpath, DOMNode $root): array
+{
+    $found = $xpath->query('.//text()', $root);
+    if (!($found instanceof DOMNodeList)) {
+        return [];
+    }
+
+    $collected = [];
+    foreach ($found as $candidate) {
+        if ($candidate instanceof DOMText) {
+            $content = normalizeText($candidate->wholeText);
+            if ($content !== '' && isVisibleTextNode($candidate)) {
+                $collected[] = $content;
+            }
+        }
+    }
+
+    return $collected;
+}
+
+/**
+ * Lists the <img> elements below $root that carry a non-blank "src".
+ *
+ * Images are keyed "img_001", "img_002", ... in document order; each entry
+ * holds the trimmed "src" and the normalized "alt" text.
+ *
+ * @return array<string, array{src: string, alt: string}>
+ */
+function extractImages(DOMXPath $xpath, DOMNode $root): array
+{
+    $found = $xpath->query('.//img[@src]', $root);
+    if (!($found instanceof DOMNodeList)) {
+        return [];
+    }
+
+    $gallery = [];
+    foreach ($found as $candidate) {
+        if (!($candidate instanceof DOMElement)) {
+            continue;
+        }
+
+        $source = trim($candidate->getAttribute('src'));
+        if ($source !== '') {
+            // Numbering follows the images actually kept, not the nodes seen.
+            $position = count($gallery) + 1;
+            $gallery[sprintf('img_%03d', $position)] = [
+                'src' => $source,
+                'alt' => normalizeText($candidate->getAttribute('alt')),
+            ];
+        }
+    }
+
+    return $gallery;
+}
+
+/**
+ * Tells whether a text node lies outside every element on the skip list
+ * (script, style, noscript, template, svg, path, defs).
+ *
+ * All ancestors are inspected; tag names are compared case-insensitively.
+ *
+ * @return bool False as soon as one ancestor element is on the skip list.
+ */
+function isVisibleTextNode(DOMText $textNode): bool
+{
+    $hiddenContainers = ['script', 'style', 'noscript', 'template', 'svg', 'path', 'defs'];
+
+    for ($ancestor = $textNode->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
+        if (
+            $ancestor instanceof DOMElement
+            && in_array(strtolower($ancestor->tagName), $hiddenContainers, true)
+        ) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/**
+ * Collapses whitespace in a string: CR, LF and TAB become spaces, every run
+ * of whitespace is reduced to a single space, and the result is trimmed.
+ *
+ * If the Unicode-aware collapse fails (preg_replace() returns null), the
+ * string with only CR/LF/TAB replaced is trimmed and returned instead.
+ */
+function normalizeText(string $value): string
+{
+    $flattened = strtr($value, "\r\n\t", '   ');
+    $collapsed = preg_replace('/\s+/u', ' ', $flattened);
+
+    return trim($collapsed ?? $flattened);
+}
+
+/**
+ * Runs an XPath query and returns its first result.
+ *
+ * @return DOMNode|null Null when the query does not yield a node list, the
+ *                      list is empty, or the first item is not a DOMNode.
+ */
+function firstNode(DOMXPath $xpath, string $query): ?DOMNode
+{
+    $result = $xpath->query($query);
+    $candidate = $result instanceof DOMNodeList ? $result->item(0) : null;
+
+    return $candidate instanceof DOMNode ? $candidate : null;
+}
+
+/**
+ * Returns the normalized value of attribute $attr on the first node matched
+ * by $query.
+ *
+ * @return string Empty string when nothing matches or the match is not an
+ *                element.
+ */
+function firstAttrValue(DOMXPath $xpath, string $query, string $attr): string
+{
+    $element = firstNode($xpath, $query);
+
+    return $element instanceof DOMElement
+        ? normalizeText($element->getAttribute($attr))
+        : '';
+}