Files
interkollektives-micro-website/scripts/extract_dom_content.php

319 lines
8.8 KiB
PHP
Executable File

#!/usr/bin/env php
<?php
declare(strict_types=1);
// ---------------------------------------------------------------------------
// CLI entry point.
//
// Reads the HTML file given as the first argument, extracts metadata, shared
// chrome (navigation / cookie layer / footer) and the main page sections, and
// writes the result as pretty-printed JSON to the path given as the second
// argument. Every failure is reported on STDERR and ends with exit status 1.
// ---------------------------------------------------------------------------
if ($argc < 3) {
    fwrite(STDERR, "Usage: php scripts/extract_dom_content.php <input_html> <output_json>\n");
    exit(1);
}

$inputHtml = $argv[1];
$outputJson = $argv[2];

if (!is_file($inputHtml)) {
    fwrite(STDERR, "Input file not found: {$inputHtml}\n");
    exit(1);
}

$html = file_get_contents($inputHtml);
if ($html === false) {
    fwrite(STDERR, "Failed to read input file: {$inputHtml}\n");
    exit(1);
}

// DOMDocument::loadHTML() throws a ValueError for an empty source string on
// PHP 8, which would surface as an uncaught exception instead of a clean
// diagnostic. Reject empty input explicitly.
if ($html === '') {
    fwrite(STDERR, "Input file is empty: {$inputHtml}\n");
    exit(1);
}

// Keep libxml diagnostics internal; real-world markup routinely triggers
// recoverable parser complaints that should not be printed as PHP warnings.
libxml_use_internal_errors(true);

$dom = new DOMDocument();
// NOTE(review): when the markup carries no charset declaration libxml may not
// decode the input as UTF-8 — confirm the input pages declare their encoding.
$loaded = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET);
// The collected parser messages are never inspected, so release them.
libxml_clear_errors();

if (!$loaded) {
    fwrite(STDERR, "Failed to parse HTML: {$inputHtml}\n");
    exit(1);
}

$xpath = new DOMXPath($dom);

$data = [
    'meta' => extractMeta($xpath, $dom),
    'shared' => extractShared($xpath),
    'sections' => extractMainSections($xpath),
];

$json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    // Include the encoder's reason (e.g. malformed UTF-8) so the failure is actionable.
    fwrite(STDERR, 'Failed to encode JSON output: ' . json_last_error_msg() . "\n");
    exit(1);
}

if (file_put_contents($outputJson, $json . "\n") === false) {
    fwrite(STDERR, "Failed to write output file: {$outputJson}\n");
    exit(1);
}

fwrite(STDOUT, "Wrote extracted content to {$outputJson}\n");

/**
 * Collects document-level metadata.
 *
 * Gathers the <title> text, the meta description, every Open Graph tag
 * (prefix "og:" on either the `property` or the `name` attribute), every
 * Twitter tag (prefix "twitter:" on `name`) and a fixed allow-list of other
 * named <meta> tags. Tags with an empty name or empty content are skipped.
 * The three tag maps are sorted by key.
 *
 * @return array{
 *     title: string,
 *     description: string,
 *     open_graph: array<string, string>,
 *     twitter: array<string, string>,
 *     other: array<string, string>
 * }
 */
function extractMeta(DOMXPath $xpath, DOMDocument $dom): array
{
    $meta = [
        'title' => normalizeText($dom->getElementsByTagName('title')->item(0)?->textContent ?? ''),
        'description' => firstAttrValue($xpath, "//meta[@name='description']", 'content'),
        'open_graph' => [],
        'twitter' => [],
        'other' => [],
    ];

    $ogNodes = $xpath->query("//meta[starts-with(@property, 'og:') or starts-with(@name, 'og:')]");
    if ($ogNodes instanceof DOMNodeList) {
        foreach ($ogNodes as $node) {
            if (!$node instanceof DOMElement) {
                continue;
            }

            // The query matches on either attribute, so use the one that
            // actually carries the "og:" prefix. Blindly preferring a
            // non-empty `property` would file a tag under an unrelated key
            // when only its `name` is an Open Graph name.
            $property = $node->getAttribute('property');
            $name = trim(str_starts_with($property, 'og:') ? $property : $node->getAttribute('name'));
            $value = normalizeText($node->getAttribute('content'));

            if ($name !== '' && $value !== '') {
                $meta['open_graph'][$name] = $value;
            }
        }
    }

    $twitterNodes = $xpath->query("//meta[starts-with(@name, 'twitter:')]");
    if ($twitterNodes instanceof DOMNodeList) {
        foreach ($twitterNodes as $node) {
            if (!$node instanceof DOMElement) {
                continue;
            }

            $name = trim($node->getAttribute('name'));
            $value = normalizeText($node->getAttribute('content'));

            if ($name !== '' && $value !== '') {
                $meta['twitter'][$name] = $value;
            }
        }
    }

    // Additional <meta name="..."> tags worth keeping; extend this list as needed.
    $otherMetaNames = ['title'];
    foreach ($otherMetaNames as $metaName) {
        $value = firstAttrValue($xpath, "//meta[@name='{$metaName}']", 'content');
        if ($value !== '') {
            $meta['other'][$metaName] = $value;
        }
    }

    // Sort by key so the output does not depend on tag order in the markup.
    ksort($meta['open_graph']);
    ksort($meta['twitter']);
    ksort($meta['other']);

    return $meta;
}

/**
 * Extracts the page chrome shared across pages: navigation, cookie layer and footer.
 *
 * Every distinct text found in these areas is stored once in `common_texts`
 * under a generated key ("common_001", "common_002", ...); each area then
 * lists the keys of its texts in document order, alongside its images.
 * Areas whose root element is not present keep empty lists.
 *
 * @return array<string, mixed> `common_texts` plus one entry per shared area
 */
function extractShared(DOMXPath $xpath): array
{
    $areaQueries = [
        'navigation' => "//header[contains(@class, 'page-head')]",
        'cookie_layer' => "//*[@id='cookie-layer']",
        'footer' => "//footer[contains(@class, 'site-footer')]",
    ];

    $result = [
        'common_texts' => [],
        'navigation' => ['text_keys' => [], 'images' => []],
        'cookie_layer' => ['text_keys' => [], 'images' => []],
        'footer' => ['text_keys' => [], 'images' => []],
    ];

    $pool = [];      // generated key => text
    $keyByText = []; // text => generated key, used for de-duplication

    foreach ($areaQueries as $area => $query) {
        $areaRoot = firstNode($xpath, $query);
        if ($areaRoot === null) {
            continue;
        }

        foreach (extractTexts($xpath, $areaRoot) as $text) {
            $existingKey = $keyByText[$text] ?? null;
            if ($existingKey === null) {
                // First occurrence: allocate the next sequential key.
                $existingKey = 'common_' . sprintf('%03d', count($pool) + 1);
                $keyByText[$text] = $existingKey;
                $pool[$existingKey] = $text;
            }
            $result[$area]['text_keys'][] = $existingKey;
        }

        $result[$area]['images'] = extractImages($xpath, $areaRoot);
    }

    $result['common_texts'] = $pool;

    return $result;
}

/**
 * Extracts texts and images for each known section inside <main>.
 *
 * Each section is located by its own XPath query and only the first match is
 * used. Sections whose query matches nothing are left out of the result.
 *
 * @return array<string, array{texts: array<string, string>, images: array<string, array{src: string, alt: string}>}>
 */
function extractMainSections(DOMXPath $xpath): array
{
    // Section name => XPath locating its root element.
    $locators = [
        'hero' => "//main//section[contains(@class, 'module-hero-teaser')]",
        'page_header' => "//main//section[contains(@class, 'page-header')]",
        'projects' => "//main//section[contains(@class, 'module-projects-teaser')]",
        'services' => "//main//section[.//*[contains(@class, 'services-teaser__content')]]",
        'team' => "(//main//section[contains(@class, 'text-image')])[1]",
        'awards' => "//main//section[.//*[contains(@class, 'awards-teaser__content')]]",
        'contact' => "//main//section[contains(@class, 'contact-teaser')]",
        'clients' => "//main//section[.//*[contains(@class, 'clients-teaser__content')]]",
        'partners' => "(//main//section[contains(@class, 'text-image')])[2]",
    ];

    $extracted = [];

    foreach ($locators as $sectionName => $locator) {
        $sectionRoot = firstNode($xpath, $locator);

        if ($sectionRoot !== null) {
            $extracted[$sectionName] = [
                'texts' => extractKeyValueTexts($xpath, $sectionRoot),
                'images' => extractImages($xpath, $sectionRoot),
            ];
        }
    }

    return $extracted;
}

/**
 * Returns the visible, non-empty texts below $root keyed by their parent tag.
 *
 * Keys have the form "<tag>_<NNN>", where <tag> is the lower-cased tag name
 * of the text node's parent element and <NNN> is a zero-padded counter kept
 * separately per tag (e.g. "h2_001", "p_001", "p_002").
 *
 * @return array<string, string>
 */
function extractKeyValueTexts(DOMXPath $xpath, DOMNode $root): array
{
    $candidates = $xpath->query('.//text()', $root);
    if (!$candidates instanceof DOMNodeList) {
        return [];
    }

    $seenPerTag = [];
    $result = [];

    foreach ($candidates as $candidate) {
        if (!$candidate instanceof DOMText) {
            continue;
        }

        $normalized = normalizeText($candidate->wholeText);
        if ($normalized === '' || !isVisibleTextNode($candidate)) {
            continue;
        }

        $owner = $candidate->parentNode;
        if (!$owner instanceof DOMElement) {
            continue;
        }

        $tagName = strtolower($owner->tagName);
        $ordinal = ($seenPerTag[$tagName] ?? 0) + 1;
        $seenPerTag[$tagName] = $ordinal;

        $result[$tagName . '_' . sprintf('%03d', $ordinal)] = $normalized;
    }

    return $result;
}

/**
 * Returns the visible, non-empty texts below $root as a plain list in document order.
 *
 * Each text is whitespace-normalised; text inside non-rendered containers
 * (as decided by isVisibleTextNode()) is dropped.
 *
 * @return list<string>
 */
function extractTexts(DOMXPath $xpath, DOMNode $root): array
{
    $candidates = $xpath->query('.//text()', $root);
    if (!$candidates instanceof DOMNodeList) {
        return [];
    }

    $collected = [];

    foreach ($candidates as $candidate) {
        if ($candidate instanceof DOMText) {
            $normalized = normalizeText($candidate->wholeText);

            if ($normalized !== '' && isVisibleTextNode($candidate)) {
                $collected[] = $normalized;
            }
        }
    }

    return $collected;
}

/**
 * Returns every <img> below $root that has a non-blank `src`.
 *
 * Images are keyed "img_001", "img_002", ... in document order; each entry
 * holds the trimmed `src` and the whitespace-normalised `alt` text.
 *
 * @return array<string, array{src: string, alt: string}>
 */
function extractImages(DOMXPath $xpath, DOMNode $root): array
{
    $found = $xpath->query('.//img[@src]', $root);
    if (!$found instanceof DOMNodeList) {
        return [];
    }

    $result = [];

    foreach ($found as $img) {
        if (!$img instanceof DOMElement) {
            continue;
        }

        $source = trim($img->getAttribute('src'));
        if ($source === '') {
            continue;
        }

        // Only accepted images are counted, so the next number is the current size + 1.
        $slot = 'img_' . sprintf('%03d', count($result) + 1);
        $result[$slot] = [
            'src' => $source,
            'alt' => normalizeText($img->getAttribute('alt')),
        ];
    }

    return $result;
}

/**
 * Tells whether a text node counts as visible page content.
 *
 * A text node is treated as hidden when any of its ancestor elements is one
 * of: script, style, noscript, template, svg, path, defs (tag names are
 * compared case-insensitively). A node without such an ancestor — including
 * a detached node — is reported as visible.
 */
function isVisibleTextNode(DOMText $textNode): bool
{
    // Lookup table of container tags whose text must be ignored.
    $hiddenContainers = [
        'script' => true,
        'style' => true,
        'noscript' => true,
        'template' => true,
        'svg' => true,
        'path' => true,
        'defs' => true,
    ];

    // Walk up the ancestor chain until the top of the tree is reached.
    for ($ancestor = $textNode->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
        if ($ancestor instanceof DOMElement && isset($hiddenContainers[strtolower($ancestor->tagName)])) {
            return false;
        }
    }

    return true;
}

/**
 * Collapses all whitespace in $value to single spaces and trims the ends.
 *
 * Carriage returns, newlines and tabs are turned into spaces first; if the
 * Unicode-aware collapse fails (preg_replace() returns null), that
 * intermediate string is trimmed and returned as-is.
 */
function normalizeText(string $value): string
{
    $flattened = strtr($value, ["\r" => ' ', "\n" => ' ', "\t" => ' ']);
    $collapsed = preg_replace('/\s+/u', ' ', $flattened);

    return trim($collapsed ?? $flattened);
}

/**
 * Runs an XPath query and returns its first matching node.
 *
 * Returns null when the query fails, matches nothing, or when the first
 * result is not a DOMNode.
 */
function firstNode(DOMXPath $xpath, string $query): ?DOMNode
{
    $matches = $xpath->query($query);

    if ($matches instanceof DOMNodeList && $matches->length > 0) {
        $first = $matches->item(0);
        if ($first instanceof DOMNode) {
            return $first;
        }
    }

    return null;
}

/**
 * Returns the whitespace-normalised value of $attr on the first node matching $query.
 *
 * Yields an empty string when nothing matches or the first match is not an element.
 */
function firstAttrValue(DOMXPath $xpath, string $query, string $attr): string
{
    $match = firstNode($xpath, $query);

    return $match instanceof DOMElement
        ? normalizeText($match->getAttribute($attr))
        : '';
}