#!/usr/bin/env php
<?php

declare(strict_types=1);

/**
 * Command-line extractor: reads an HTML file, collects its meta tags, the texts
 * and images of the shared page chrome (navigation, cookie layer, footer) and of
 * the known main-content sections, and writes the result to a JSON file.
 *
 * Arguments: <input.html> <output.json>
 *
 * Every failure is reported on STDERR and ends the script with exit status 1.
 */

// NOTE(review): the opening tag and the head of this guard were missing from the
// reviewed copy (only the trailing `\n"); exit(1); }` survived); the condition and
// the usage wording are reconstructed from how $argv is used below — confirm.
if ($argc < 3) {
    $scriptName = basename(__FILE__);
    fwrite(STDERR, "Usage: php {$scriptName} <input.html> <output.json>\n");
    exit(1);
}

$inputHtml = $argv[1];
$outputJson = $argv[2];

if (!is_file($inputHtml)) {
    fwrite(STDERR, "Input file not found: {$inputHtml}\n");
    exit(1);
}

// Collect libxml diagnostics internally instead of emitting PHP warnings for
// every quirk of real-world markup.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
$html = file_get_contents($inputHtml);
if ($html === false) {
    fwrite(STDERR, "Failed to read input file: {$inputHtml}\n");
    exit(1);
}

// DOMDocument::loadHTML() throws a ValueError for an empty source on PHP 8, so an
// empty file is short-circuited into the regular "failed to parse" path.
$loaded = $html !== ''
    && $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET);
if (!$loaded) {
    fwrite(STDERR, "Failed to parse HTML: {$inputHtml}\n");
    exit(1);
}

$xpath = new DOMXPath($dom);

$data = [
    'meta' => extractMeta($xpath, $dom),
    'shared' => extractShared($xpath),
    'sections' => extractMainSections($xpath),
];

$json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    fwrite(STDERR, "Failed to encode JSON output.\n");
    exit(1);
}

if (file_put_contents($outputJson, $json . "\n") === false) {
    fwrite(STDERR, "Failed to write output file: {$outputJson}\n");
    exit(1);
}

fwrite(STDOUT, "Wrote extracted content to {$outputJson}\n");

/**
 * Collects the document title, the meta description, Open Graph tags, Twitter
 * tags and a fixed list of other named meta tags.
 *
 * Tags whose name or normalized content is empty are left out. The three tag
 * maps are sorted by key.
 *
 * @return array{
 *     title: string,
 *     description: string,
 *     open_graph: array<string, string>,
 *     twitter: array<string, string>,
 *     other: array<string, string>
 * }
 */
function extractMeta(DOMXPath $xpath, DOMDocument $dom): array
{
    $meta = [
        // First <title> element of the document; empty string when there is none.
        'title' => normalizeText($dom->getElementsByTagName('title')->item(0)?->textContent ?? ''),
        'description' => firstAttrValue($xpath, "//meta[@name='description']", 'content'),
        'open_graph' => [],
        'twitter' => [],
        'other' => [],
    ];

    // Open Graph tags are matched on either the `property` or the `name` attribute.
    $ogNodes = $xpath->query("//meta[starts-with(@property, 'og:') or starts-with(@name, 'og:')]");
    if ($ogNodes instanceof DOMNodeList) {
        foreach ($ogNodes as $node) {
            if (!$node instanceof DOMElement) {
                continue;
            }

            // Prefer `property`; fall back to `name` when it is missing or empty.
            $name = trim((string) ($node->getAttribute('property') ?: $node->getAttribute('name')));
            $value = normalizeText((string) $node->getAttribute('content'));
            if ($name !== '' && $value !== '') {
                $meta['open_graph'][$name] = $value;
            }
        }
    }

    $twitterNodes = $xpath->query("//meta[starts-with(@name, 'twitter:')]");
    if ($twitterNodes instanceof DOMNodeList) {
        foreach ($twitterNodes as $node) {
            if (!$node instanceof DOMElement) {
                continue;
            }

            $name = trim((string) $node->getAttribute('name'));
            $value = normalizeText((string) $node->getAttribute('content'));
            if ($name !== '' && $value !== '') {
                $meta['twitter'][$name] = $value;
            }
        }
    }

    // Additional <meta name="..."> tags copied verbatim into `other`.
    $otherMetaNames = ['title'];
    foreach ($otherMetaNames as $metaName) {
        $value = firstAttrValue($xpath, "//meta[@name='{$metaName}']", 'content');
        if ($value !== '') {
            $meta['other'][$metaName] = $value;
        }
    }

    ksort($meta['open_graph']);
    ksort($meta['twitter']);
    ksort($meta['other']);

    return $meta;
}

/**
 * Extracts texts and images of the shared page areas: navigation
 * (`header.page-head`), cookie layer (`#cookie-layer`) and footer
 * (`footer.site-footer`).
 *
 * Each distinct text is stored once in `common_texts` under a key of the form
 * `common_001`, `common_002`, ...; every area lists the keys of its texts in
 * `text_keys` (a key is repeated when the same text occurs more than once).
 * Areas that are not found keep their empty defaults.
 *
 * @return array<string, mixed>
 */
function extractShared(DOMXPath $xpath): array
{
    $commonTexts = [];
    // Reverse lookup (text => key) so a repeated text reuses its existing key.
    $textToKey = [];

    $shared = [
        'common_texts' => [],
        'navigation' => ['text_keys' => [], 'images' => []],
        'cookie_layer' => ['text_keys' => [], 'images' => []],
        'footer' => ['text_keys' => [], 'images' => []],
    ];

    $sharedRoots = [
        'navigation' => firstNode($xpath, "//header[contains(@class, 'page-head')]"),
        'cookie_layer' => firstNode($xpath, "//*[@id='cookie-layer']"),
        'footer' => firstNode($xpath, "//footer[contains(@class, 'site-footer')]"),
    ];

    foreach ($sharedRoots as $name => $root) {
        if (!$root instanceof DOMNode) {
            continue;
        }

        foreach (extractTexts($xpath, $root) as $text) {
            if (!isset($textToKey[$text])) {
                $key = 'common_' . str_pad((string) (count($commonTexts) + 1), 3, '0', STR_PAD_LEFT);
                $textToKey[$text] = $key;
                $commonTexts[$key] = $text;
            }

            $shared[$name]['text_keys'][] = $textToKey[$text];
        }

        $shared[$name]['images'] = extractImages($xpath, $root);
    }

    $shared['common_texts'] = $commonTexts;

    return $shared;
}

/**
 * Extracts texts and images for each known section inside <main>.
 *
 * A section is located by its XPath query; only the first match is used, and
 * sections without a match are omitted from the result.
 *
 * @return array<string, array{texts: array<string, string>, images: array<string, array{src: string, alt: string}>}>
 */
function extractMainSections(DOMXPath $xpath): array
{
    $sections = [];

    $sectionQueries = [
        'hero' => "//main//section[contains(@class, 'module-hero-teaser')]",
        'page_header' => "//main//section[contains(@class, 'page-header')]",
        'projects' => "//main//section[contains(@class, 'module-projects-teaser')]",
        'services' => "//main//section[.//*[contains(@class, 'services-teaser__content')]]",
        // `team` and `partners` are the first and second `text-image` sections.
        'team' => "(//main//section[contains(@class, 'text-image')])[1]",
        'awards' => "//main//section[.//*[contains(@class, 'awards-teaser__content')]]",
        'contact' => "//main//section[contains(@class, 'contact-teaser')]",
        'clients' => "//main//section[.//*[contains(@class, 'clients-teaser__content')]]",
        'partners' => "(//main//section[contains(@class, 'text-image')])[2]",
    ];

    foreach ($sectionQueries as $name => $query) {
        $root = firstNode($xpath, $query);
        if (!$root instanceof DOMNode) {
            continue;
        }

        $sections[$name] = [
            'texts' => extractKeyValueTexts($xpath, $root),
            'images' => extractImages($xpath, $root),
        ];
    }

    return $sections;
}

/**
 * Returns the visible, non-empty texts below $root keyed by the tag of their
 * parent element plus a per-tag running number, e.g. `h2_001`, `p_001`, `p_002`.
 *
 * @return array<string, string>
 */
function extractKeyValueTexts(DOMXPath $xpath, DOMNode $root): array
{
    $textNodes = $xpath->query('.//text()', $root);
    if (!$textNodes instanceof DOMNodeList) {
        return [];
    }

    // Number of texts seen so far per parent tag name.
    $counters = [];
    $texts = [];

    foreach ($textNodes as $textNode) {
        if (!$textNode instanceof DOMText) {
            continue;
        }

        $value = normalizeText($textNode->wholeText);
        if ($value === '') {
            continue;
        }

        if (!isVisibleTextNode($textNode)) {
            continue;
        }

        $parent = $textNode->parentNode;
        if (!$parent instanceof DOMElement) {
            continue;
        }

        $tag = strtolower($parent->tagName);
        $counters[$tag] = ($counters[$tag] ?? 0) + 1;

        $key = $tag . '_' . str_pad((string) $counters[$tag], 3, '0', STR_PAD_LEFT);
        $texts[$key] = $value;
    }

    return $texts;
}

/**
 * Returns the visible, non-empty, normalized texts below $root as a plain list,
 * in the order the XPath query yields them. Duplicates are kept.
 *
 * @return list<string>
 */
function extractTexts(DOMXPath $xpath, DOMNode $root): array
{
    $textNodes = $xpath->query('.//text()', $root);
    if (!$textNodes instanceof DOMNodeList) {
        return [];
    }

    $texts = [];

    foreach ($textNodes as $textNode) {
        if (!$textNode instanceof DOMText) {
            continue;
        }

        $value = normalizeText($textNode->wholeText);
        if ($value === '') {
            continue;
        }

        if (!isVisibleTextNode($textNode)) {
            continue;
        }

        $texts[] = $value;
    }

    return $texts;
}

/**
 * Returns all <img> elements below $root that carry a non-empty `src`, keyed
 * `img_001`, `img_002`, ... in query order.
 *
 * @return array<string, array{src: string, alt: string}>
 */
function extractImages(DOMXPath $xpath, DOMNode $root): array
{
    $imageNodes = $xpath->query('.//img[@src]', $root);
    if (!$imageNodes instanceof DOMNodeList) {
        return [];
    }

    $images = [];
    // Only advanced for images that are actually emitted, so keys stay gap-free.
    $index = 1;

    foreach ($imageNodes as $imageNode) {
        if (!$imageNode instanceof DOMElement) {
            continue;
        }

        $src = trim((string) $imageNode->getAttribute('src'));
        if ($src === '') {
            continue;
        }

        $key = 'img_' . str_pad((string) $index, 3, '0', STR_PAD_LEFT);
        $images[$key] = [
            'src' => $src,
            'alt' => normalizeText((string) $imageNode->getAttribute('alt')),
        ];
        $index++;
    }

    return $images;
}

/**
 * Tells whether a text node counts as page content: returns false as soon as any
 * ancestor element is one of the skipped tags (scripts, styles, templates, SVG
 * internals), true otherwise.
 */
function isVisibleTextNode(DOMText $textNode): bool
{
    $skipTags = [
        'script',
        'style',
        'noscript',
        'template',
        'svg',
        'path',
        'defs',
    ];

    // Walk up the ancestor chain until the top of the tree is reached.
    $node = $textNode->parentNode;
    while ($node instanceof DOMNode) {
        if ($node instanceof DOMElement) {
            $tag = strtolower($node->tagName);
            if (in_array($tag, $skipTags, true)) {
                return false;
            }
        }

        $node = $node->parentNode;
    }

    return true;
}

/**
 * Collapses every run of whitespace into a single space and trims the result.
 */
function normalizeText(string $value): string
{
    // Done before the regex so that line breaks and tabs are still replaced when
    // preg_replace() fails (returns null) and the fallback below is used.
    $value = str_replace(["\r", "\n", "\t"], ' ', $value);
    $value = preg_replace('/\s+/u', ' ', $value) ?? $value;

    return trim($value);
}

/**
 * Returns the first node matched by $query, or null when the query fails or
 * matches nothing.
 */
function firstNode(DOMXPath $xpath, string $query): ?DOMNode
{
    $nodes = $xpath->query($query);
    if (!$nodes instanceof DOMNodeList || $nodes->length === 0) {
        return null;
    }

    $node = $nodes->item(0);

    return $node instanceof DOMNode ? $node : null;
}

/**
 * Returns the normalized value of attribute $attr on the first element matched
 * by $query, or an empty string when no element matches.
 */
function firstAttrValue(DOMXPath $xpath, string $query, string $attr): string
{
    $node = firstNode($xpath, $query);
    if (!$node instanceof DOMElement) {
        return '';
    }

    return normalizeText((string) $node->getAttribute($attr));
}