319 lines
8.8 KiB
PHP
Executable File
319 lines
8.8 KiB
PHP
Executable File
#!/usr/bin/env php
|
|
<?php
|
|
declare(strict_types=1);
|
|
|
|
// ---------------------------------------------------------------------------
// CLI entry point.
//
// Reads the HTML file given as the first argument, extracts metadata, shared
// page regions and main sections from it, and writes the result as
// pretty-printed JSON to the path given as the second argument.
// Every failure is reported on STDERR and ends the process with exit code 1.
// ---------------------------------------------------------------------------

// Both positional arguments (input HTML, output JSON) are mandatory.
if ($argc < 3) {
    fwrite(STDERR, "Usage: php administration/scripts/extract_dom_content.php <input_html> <output_json>\n");
    exit(1);
}

$inputHtml = $argv[1];
$outputJson = $argv[2];

if (!is_file($inputHtml)) {
    fwrite(STDERR, "Input file not found: {$inputHtml}\n");
    exit(1);
}

$html = file_get_contents($inputHtml);
if ($html === false) {
    fwrite(STDERR, "Failed to read input file: {$inputHtml}\n");
    exit(1);
}

// DOMDocument::loadHTML() throws a ValueError when handed an empty string,
// which would surface as an uncaught exception instead of a clean CLI error.
if ($html === '') {
    fwrite(STDERR, "Input file is empty: {$inputHtml}\n");
    exit(1);
}

// Collect libxml diagnostics internally rather than emitting PHP warnings.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$loaded = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET);

// The buffered diagnostics are never inspected, so release them right away.
libxml_clear_errors();

if (!$loaded) {
    fwrite(STDERR, "Failed to parse HTML: {$inputHtml}\n");
    exit(1);
}

$xpath = new DOMXPath($dom);

$data = [
    'meta' => extractMeta($xpath, $dom),
    'shared' => extractShared($xpath),
    'sections' => extractMainSections($xpath),
];

$json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    fwrite(STDERR, "Failed to encode JSON output.\n");
    exit(1);
}

// The output file always ends with a trailing newline.
if (file_put_contents($outputJson, $json . "\n") === false) {
    fwrite(STDERR, "Failed to write output file: {$outputJson}\n");
    exit(1);
}

fwrite(STDOUT, "Wrote extracted content to {$outputJson}\n");
|
|
|
|
/**
 * Gathers document-level metadata from the parsed page.
 *
 * The result contains the normalized text of the first <title> element, the
 * content of the first <meta name="description">, every Open Graph and
 * Twitter meta tag that has both a non-empty name and a non-empty content,
 * and a small set of additional named meta tags (currently only "title").
 * The three tag maps are sorted by key.
 *
 * @param DOMXPath    $xpath XPath evaluator bound to the document.
 * @param DOMDocument $dom   The parsed document (used for the <title> lookup).
 *
 * @return array Map with the keys title, description, open_graph, twitter, other.
 */
function extractMeta(DOMXPath $xpath, DOMDocument $dom): array
{
    $titleElement = $dom->getElementsByTagName('title')->item(0);

    // Open Graph tags may carry their identifier in either "property" or "name".
    $openGraph = [];
    $ogMatches = $xpath->query("//meta[starts-with(@property, 'og:') or starts-with(@name, 'og:')]");
    if ($ogMatches instanceof DOMNodeList) {
        foreach ($ogMatches as $tag) {
            if ($tag instanceof DOMElement) {
                $label = trim((string) ($tag->getAttribute('property') ?: $tag->getAttribute('name')));
                $content = normalizeText((string) $tag->getAttribute('content'));
                if ($label === '' || $content === '') {
                    continue;
                }
                $openGraph[$label] = $content;
            }
        }
    }

    // Twitter card tags are identified by their "name" attribute only.
    $twitter = [];
    $twitterMatches = $xpath->query("//meta[starts-with(@name, 'twitter:')]");
    if ($twitterMatches instanceof DOMNodeList) {
        foreach ($twitterMatches as $tag) {
            if ($tag instanceof DOMElement) {
                $label = trim((string) $tag->getAttribute('name'));
                $content = normalizeText((string) $tag->getAttribute('content'));
                if ($label === '' || $content === '') {
                    continue;
                }
                $twitter[$label] = $content;
            }
        }
    }

    // Additional meta tags looked up by exact name; empty contents are omitted.
    $other = [];
    foreach (['title'] as $wantedName) {
        $content = firstAttrValue($xpath, "//meta[@name='{$wantedName}']", 'content');
        if ($content !== '') {
            $other[$wantedName] = $content;
        }
    }

    ksort($openGraph);
    ksort($twitter);
    ksort($other);

    return [
        'title' => normalizeText($titleElement?->textContent ?? ''),
        'description' => firstAttrValue($xpath, "//meta[@name='description']", 'content'),
        'open_graph' => $openGraph,
        'twitter' => $twitter,
        'other' => $other,
    ];
}
|
|
|
|
/**
 * Extracts the texts and images of the page regions shared across the site.
 *
 * Three regions are looked up: "navigation" (first <header> whose class
 * contains "page-head"), "cookie_layer" (element with id "cookie-layer") and
 * "footer" (first <footer> whose class contains "site-footer"). Each distinct
 * text found in any region is stored once under a generated "common_NNN" key
 * in "common_texts"; the regions then reference those keys in the order the
 * texts were encountered. Regions that are not present keep empty lists.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 *
 * @return array Map with the keys common_texts, navigation, cookie_layer, footer.
 */
function extractShared(DOMXPath $xpath): array
{
    $result = [
        'common_texts' => [],
        'navigation' => ['text_keys' => [], 'images' => []],
        'cookie_layer' => ['text_keys' => [], 'images' => []],
        'footer' => ['text_keys' => [], 'images' => []],
    ];

    $regionQueries = [
        'navigation' => "//header[contains(@class, 'page-head')]",
        'cookie_layer' => "//*[@id='cookie-layer']",
        'footer' => "//footer[contains(@class, 'site-footer')]",
    ];

    $keyByText = [];  // text => generated key, used to de-duplicate
    $textByKey = [];  // generated key => text, becomes "common_texts"

    foreach ($regionQueries as $region => $regionQuery) {
        $regionRoot = firstNode($xpath, $regionQuery);
        if ($regionRoot instanceof DOMNode) {
            foreach (extractTexts($xpath, $regionRoot) as $snippet) {
                $assigned = $keyByText[$snippet] ?? null;
                if ($assigned === null) {
                    // Keys are numbered in order of first appearance, zero-padded to three digits.
                    $assigned = sprintf('common_%03d', count($textByKey) + 1);
                    $keyByText[$snippet] = $assigned;
                    $textByKey[$assigned] = $snippet;
                }
                $result[$region]['text_keys'][] = $assigned;
            }

            $result[$region]['images'] = extractImages($xpath, $regionRoot);
        }
    }

    $result['common_texts'] = $textByKey;

    return $result;
}
|
|
|
|
/**
 * Extracts texts and images for each known section inside <main>.
 *
 * Every section name is paired with an XPath expression; the first node
 * matching the expression is used as the section root. Sections without a
 * matching node are left out of the result.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 *
 * @return array Map of section name to ['texts' => ..., 'images' => ...].
 */
function extractMainSections(DOMXPath $xpath): array
{
    $locators = [
        'hero' => "//main//section[contains(@class, 'module-hero-teaser')]",
        'page_header' => "//main//section[contains(@class, 'page-header')]",
        'projects' => "//main//section[contains(@class, 'module-projects-teaser')]",
        'services' => "//main//section[.//*[contains(@class, 'services-teaser__content')]]",
        'team' => "(//main//section[contains(@class, 'text-image')])[1]",
        'awards' => "//main//section[.//*[contains(@class, 'awards-teaser__content')]]",
        'contact' => "//main//section[contains(@class, 'contact-teaser')]",
        'clients' => "//main//section[.//*[contains(@class, 'clients-teaser__content')]]",
        'partners' => "(//main//section[contains(@class, 'text-image')])[2]",
    ];

    $collected = [];

    foreach ($locators as $sectionName => $expression) {
        $sectionRoot = firstNode($xpath, $expression);
        if ($sectionRoot instanceof DOMNode) {
            $collected[$sectionName] = [
                'texts' => extractKeyValueTexts($xpath, $sectionRoot),
                'images' => extractImages($xpath, $sectionRoot),
            ];
        }
    }

    return $collected;
}
|
|
|
|
/**
 * Collects the visible, non-empty texts below a node, keyed by parent tag.
 *
 * Each kept text is stored under "<tag>_NNN", where <tag> is the lower-cased
 * tag name of the text node's parent element and NNN is a zero-padded,
 * per-tag running number starting at 001.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 * @param DOMNode  $root  Node whose descendant text nodes are inspected.
 *
 * @return array Map of generated key to normalized text, in document order.
 */
function extractKeyValueTexts(DOMXPath $xpath, DOMNode $root): array
{
    $candidates = $xpath->query('.//text()', $root);
    if (!$candidates instanceof DOMNodeList) {
        return [];
    }

    $seenPerTag = [];
    $result = [];

    foreach ($candidates as $candidate) {
        if (!$candidate instanceof DOMText) {
            continue;
        }

        $content = normalizeText($candidate->wholeText);

        // Skip whitespace-only nodes and anything nested in a non-rendered element.
        if ($content === '' || !isVisibleTextNode($candidate)) {
            continue;
        }

        $owner = $candidate->parentNode;
        if (!$owner instanceof DOMElement) {
            continue;
        }

        $tagName = strtolower($owner->tagName);
        $ordinal = ($seenPerTag[$tagName] ?? 0) + 1;
        $seenPerTag[$tagName] = $ordinal;

        $result[sprintf('%s_%03d', $tagName, $ordinal)] = $content;
    }

    return $result;
}
|
|
|
|
/**
 * Returns the visible, non-empty texts found below a node as a flat list.
 *
 * Texts are whitespace-normalized and returned in the order the XPath query
 * yields their text nodes; duplicates are kept.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 * @param DOMNode  $root  Node whose descendant text nodes are inspected.
 *
 * @return array List of normalized text strings.
 */
function extractTexts(DOMXPath $xpath, DOMNode $root): array
{
    $candidates = $xpath->query('.//text()', $root);
    if (!$candidates instanceof DOMNodeList) {
        return [];
    }

    $collected = [];

    foreach ($candidates as $candidate) {
        if ($candidate instanceof DOMText) {
            $content = normalizeText($candidate->wholeText);

            // Keep only texts that have content and are not inside a non-rendered element.
            if ($content !== '' && isVisibleTextNode($candidate)) {
                $collected[] = $content;
            }
        }
    }

    return $collected;
}
|
|
|
|
/**
 * Collects the images below a node that carry a non-empty "src" attribute.
 *
 * Each image is stored under "img_NNN" (zero-padded, starting at 001, counting
 * only the images that were kept) together with its trimmed source and its
 * whitespace-normalized alt text.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 * @param DOMNode  $root  Node whose descendant <img> elements are inspected.
 *
 * @return array Map of generated key to ['src' => string, 'alt' => string].
 */
function extractImages(DOMXPath $xpath, DOMNode $root): array
{
    $matches = $xpath->query('.//img[@src]', $root);
    if (!$matches instanceof DOMNodeList) {
        return [];
    }

    $found = [];

    foreach ($matches as $element) {
        if (!$element instanceof DOMElement) {
            continue;
        }

        $source = trim((string) $element->getAttribute('src'));
        if ($source !== '') {
            // The next number is derived from how many images were already accepted.
            $found[sprintf('img_%03d', count($found) + 1)] = [
                'src' => $source,
                'alt' => normalizeText((string) $element->getAttribute('alt')),
            ];
        }
    }

    return $found;
}
|
|
|
|
/**
 * Tells whether a text node lies outside every non-rendered container.
 *
 * Walks up the ancestor chain and reports false as soon as an ancestor
 * element's lower-cased tag name is one of script, style, noscript, template,
 * svg, path or defs. A node without such an ancestor counts as visible.
 *
 * @param DOMText $textNode The text node to classify.
 *
 * @return bool True when no ancestor element is in the skip list.
 */
function isVisibleTextNode(DOMText $textNode): bool
{
    // Keyed by tag name so membership is a plain isset() lookup.
    $hiddenContainers = [
        'script' => true,
        'style' => true,
        'noscript' => true,
        'template' => true,
        'svg' => true,
        'path' => true,
        'defs' => true,
    ];

    for ($ancestor = $textNode->parentNode; $ancestor !== null; $ancestor = $ancestor->parentNode) {
        if ($ancestor instanceof DOMElement && isset($hiddenContainers[strtolower($ancestor->tagName)])) {
            return false;
        }
    }

    return true;
}
|
|
|
|
/**
 * Collapses every run of whitespace into a single space and trims the ends.
 *
 * The Unicode-aware pattern is tried first. preg_replace() returns null when
 * that pattern cannot be applied (for example when the input is not valid
 * UTF-8); in that case a byte-wise pattern limited to ASCII whitespace is used
 * so the text is still collapsed. Should that fail as well, carriage returns,
 * line feeds and tabs are at least replaced by spaces.
 *
 * @param string $value Raw text.
 *
 * @return string The normalized text; may be empty.
 */
function normalizeText(string $value): string
{
    $collapsed = preg_replace('/\s+/u', ' ', $value)
        // Explicit ASCII class: safe on arbitrary bytes, independent of locale tables.
        ?? preg_replace('/[ \t\n\r\f\x0B]+/', ' ', $value)
        ?? str_replace(["\r", "\n", "\t"], ' ', $value);

    return trim($collapsed);
}
|
|
|
|
/**
 * Evaluates an XPath expression and returns its first matching node.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 * @param string   $query XPath expression to evaluate.
 *
 * @return DOMNode|null The first match, or null when the query yields no
 *                      node list, an empty list, or a non-DOMNode item.
 */
function firstNode(DOMXPath $xpath, string $query): ?DOMNode
{
    $matches = $xpath->query($query);

    // item(0) yields null for an empty list, so no separate length check is needed.
    $first = $matches instanceof DOMNodeList ? $matches->item(0) : null;

    return $first instanceof DOMNode ? $first : null;
}
|
|
|
|
/**
 * Returns a normalized attribute value of the first node matching a query.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the document.
 * @param string   $query XPath expression selecting the element.
 * @param string   $attr  Name of the attribute to read.
 *
 * @return string The whitespace-normalized attribute value, or an empty string
 *                when the first match is missing or is not an element.
 */
function firstAttrValue(DOMXPath $xpath, string $query, string $attr): string
{
    $match = firstNode($xpath, $query);

    return $match instanceof DOMElement
        ? normalizeText((string) $match->getAttribute($attr))
        : '';
}
|