Initial import: web4beginners editor and deployment setup
This commit is contained in:
318
scripts/extract_dom_content.php
Executable file
318
scripts/extract_dom_content.php
Executable file
@@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env php
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
// Entry point: validate the CLI arguments, parse the input HTML into a DOM,
// run the extractors, and write the result as pretty-printed JSON.
// Every failure path reports on STDERR and exits with status 1.
if ($argc < 3) {
    fwrite(STDERR, "Usage: php scripts/extract_dom_content.php <input_html> <output_json>\n");
    exit(1);
}

$inputHtml = $argv[1];
$outputJson = $argv[2];

if (!is_file($inputHtml)) {
    fwrite(STDERR, "Input file not found: {$inputHtml}\n");
    exit(1);
}

$html = file_get_contents($inputHtml);
if ($html === false) {
    fwrite(STDERR, "Failed to read input file: {$inputHtml}\n");
    exit(1);
}

// DOMDocument::loadHTML() throws a ValueError for an empty source string on
// PHP 8, which would surface as an uncaught exception instead of a clean
// error message. Reject empty input explicitly.
if ($html === '') {
    fwrite(STDERR, "Input file is empty: {$inputHtml}\n");
    exit(1);
}

// Collect libxml parse diagnostics internally rather than emitting PHP warnings.
libxml_use_internal_errors(true);
$dom = new DOMDocument();

// NOTE(review): no encoding hint is passed to loadHTML(); confirm that input
// documents declare their charset, otherwise non-ASCII text may be misdecoded.
$loaded = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET);

// The buffered diagnostics are never inspected, so release them.
libxml_clear_errors();

if (!$loaded) {
    fwrite(STDERR, "Failed to parse HTML: {$inputHtml}\n");
    exit(1);
}

$xpath = new DOMXPath($dom);

$data = [
    'meta' => extractMeta($xpath, $dom),
    'shared' => extractShared($xpath),
    'sections' => extractMainSections($xpath),
];

$json = json_encode($data, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
if ($json === false) {
    fwrite(STDERR, "Failed to encode JSON output.\n");
    exit(1);
}

if (file_put_contents($outputJson, $json . "\n") === false) {
    fwrite(STDERR, "Failed to write output file: {$outputJson}\n");
    exit(1);
}

fwrite(STDOUT, "Wrote extracted content to {$outputJson}\n");
|
||||
|
||||
/**
 * Gather document-level metadata: the <title>, the description meta tag,
 * Open Graph and Twitter meta tags, and a fixed list of other named tags.
 *
 * Tags whose name or normalized content is empty are skipped. The three
 * tag maps are sorted by key before being returned.
 *
 * @return array{
 *     title: string,
 *     description: string,
 *     open_graph: array<string, string>,
 *     twitter: array<string, string>,
 *     other: array<string, string>
 * }
 */
function extractMeta(DOMXPath $xpath, DOMDocument $dom): array
{
    $titleElement = $dom->getElementsByTagName('title')->item(0);
    $title = normalizeText($titleElement?->textContent ?? '');
    $description = firstAttrValue($xpath, "//meta[@name='description']", 'content');

    // Open Graph tags may carry their identifier in either `property` or `name`.
    $openGraph = [];
    $ogResult = $xpath->query("//meta[starts-with(@property, 'og:') or starts-with(@name, 'og:')]");
    if ($ogResult instanceof DOMNodeList) {
        foreach ($ogResult as $tag) {
            if ($tag instanceof DOMElement) {
                $label = trim((string) ($tag->getAttribute('property') ?: $tag->getAttribute('name')));
                $content = normalizeText((string) $tag->getAttribute('content'));
                if ($label !== '' && $content !== '') {
                    $openGraph[$label] = $content;
                }
            }
        }
    }

    $twitter = [];
    $twitterResult = $xpath->query("//meta[starts-with(@name, 'twitter:')]");
    if ($twitterResult instanceof DOMNodeList) {
        foreach ($twitterResult as $tag) {
            if ($tag instanceof DOMElement) {
                $label = trim((string) $tag->getAttribute('name'));
                $content = normalizeText((string) $tag->getAttribute('content'));
                if ($label !== '' && $content !== '') {
                    $twitter[$label] = $content;
                }
            }
        }
    }

    // Additional <meta name="..."> tags that are copied through verbatim.
    $other = [];
    foreach (['title'] as $metaName) {
        $content = firstAttrValue($xpath, "//meta[@name='{$metaName}']", 'content');
        if ($content !== '') {
            $other[$metaName] = $content;
        }
    }

    ksort($openGraph);
    ksort($twitter);
    ksort($other);

    return [
        'title' => $title,
        'description' => $description,
        'open_graph' => $openGraph,
        'twitter' => $twitter,
        'other' => $other,
    ];
}
|
||||
|
||||
/**
 * Extract the page chrome shared across pages: navigation header, cookie
 * layer and footer.
 *
 * Each distinct text found in those areas is stored once in `common_texts`
 * under a sequential `common_NNN` key. Every area then lists the keys of
 * its texts in document order (repeats included), plus its images.
 * Areas that are not present in the document keep empty lists.
 *
 * @return array<string, mixed>
 */
function extractShared(DOMXPath $xpath): array
{
    $areaQueries = [
        'navigation' => "//header[contains(@class, 'page-head')]",
        'cookie_layer' => "//*[@id='cookie-layer']",
        'footer' => "//footer[contains(@class, 'site-footer')]",
    ];

    $shared = [
        'common_texts' => [],
        'navigation' => ['text_keys' => [], 'images' => []],
        'cookie_layer' => ['text_keys' => [], 'images' => []],
        'footer' => ['text_keys' => [], 'images' => []],
    ];

    $keyByText = [];
    $textByKey = [];

    foreach ($areaQueries as $area => $query) {
        $areaRoot = firstNode($xpath, $query);
        if ($areaRoot === null) {
            continue;
        }

        $areaKeys = [];
        foreach (extractTexts($xpath, $areaRoot) as $text) {
            $key = $keyByText[$text] ?? null;
            if ($key === null) {
                // First occurrence of this text: allocate the next key.
                $key = 'common_' . sprintf('%03d', count($textByKey) + 1);
                $keyByText[$text] = $key;
                $textByKey[$key] = $text;
            }
            $areaKeys[] = $key;
        }

        $shared[$area] = [
            'text_keys' => $areaKeys,
            'images' => extractImages($xpath, $areaRoot),
        ];
    }

    $shared['common_texts'] = $textByKey;

    return $shared;
}
|
||||
|
||||
/**
 * Extract texts and images for each known section inside <main>.
 *
 * Each section name maps to an XPath expression; only the first node
 * matched by an expression is used, and sections with no match are
 * omitted from the result.
 *
 * @return array<string, array{texts: array<string, string>, images: array<string, array{src: string, alt: string}>}>
 */
function extractMainSections(DOMXPath $xpath): array
{
    $queriesBySection = [
        'hero' => "//main//section[contains(@class, 'module-hero-teaser')]",
        'page_header' => "//main//section[contains(@class, 'page-header')]",
        'projects' => "//main//section[contains(@class, 'module-projects-teaser')]",
        'services' => "//main//section[.//*[contains(@class, 'services-teaser__content')]]",
        'team' => "(//main//section[contains(@class, 'text-image')])[1]",
        'awards' => "//main//section[.//*[contains(@class, 'awards-teaser__content')]]",
        'contact' => "//main//section[contains(@class, 'contact-teaser')]",
        'clients' => "//main//section[.//*[contains(@class, 'clients-teaser__content')]]",
        'partners' => "(//main//section[contains(@class, 'text-image')])[2]",
    ];

    $result = [];
    foreach ($queriesBySection as $sectionName => $expression) {
        $sectionRoot = firstNode($xpath, $expression);
        if ($sectionRoot instanceof DOMNode) {
            $result[$sectionName] = [
                'texts' => extractKeyValueTexts($xpath, $sectionRoot),
                'images' => extractImages($xpath, $sectionRoot),
            ];
        }
    }

    return $result;
}
|
||||
|
||||
/**
 * Collect the visible, non-empty texts below $root as a keyed map.
 *
 * Keys have the form `<tag>_NNN`, where <tag> is the lower-cased tag name
 * of the text node's parent element and NNN is a per-tag running counter
 * starting at 001, in document order.
 *
 * @return array<string, string>
 */
function extractKeyValueTexts(DOMXPath $xpath, DOMNode $root): array
{
    $found = $xpath->query('.//text()', $root);
    if (!$found instanceof DOMNodeList) {
        return [];
    }

    $seenPerTag = [];
    $collected = [];

    foreach ($found as $candidate) {
        if (!$candidate instanceof DOMText) {
            continue;
        }

        $content = normalizeText($candidate->wholeText);
        if ($content === '' || !isVisibleTextNode($candidate)) {
            continue;
        }

        $owner = $candidate->parentNode;
        if (!$owner instanceof DOMElement) {
            continue;
        }

        // The counter only advances for texts that are actually emitted.
        $tagName = strtolower($owner->tagName);
        $ordinal = ($seenPerTag[$tagName] ?? 0) + 1;
        $seenPerTag[$tagName] = $ordinal;

        $collected[$tagName . '_' . sprintf('%03d', $ordinal)] = $content;
    }

    return $collected;
}
|
||||
|
||||
/**
 * Collect the visible, non-empty texts below $root as a plain list in
 * document order. Values are whitespace-normalized; duplicates are kept.
 *
 * @return list<string>
 */
function extractTexts(DOMXPath $xpath, DOMNode $root): array
{
    $found = $xpath->query('.//text()', $root);
    if (!$found instanceof DOMNodeList) {
        return [];
    }

    $collected = [];
    foreach ($found as $candidate) {
        if ($candidate instanceof DOMText) {
            $content = normalizeText($candidate->wholeText);
            if ($content !== '' && isVisibleTextNode($candidate)) {
                $collected[] = $content;
            }
        }
    }

    return $collected;
}
|
||||
|
||||
/**
 * Collect every <img> below $root that has a non-blank `src` attribute.
 *
 * Images are keyed `img_NNN` in document order, starting at 001. The `src`
 * is trimmed and the `alt` text is whitespace-normalized (empty when the
 * attribute is absent).
 *
 * @return array<string, array{src: string, alt: string}>
 */
function extractImages(DOMXPath $xpath, DOMNode $root): array
{
    $found = $xpath->query('.//img[@src]', $root);
    if (!$found instanceof DOMNodeList) {
        return [];
    }

    $collected = [];
    foreach ($found as $candidate) {
        if (!$candidate instanceof DOMElement) {
            continue;
        }

        $source = trim((string) $candidate->getAttribute('src'));
        if ($source === '') {
            continue;
        }

        // Numbering follows the count of images accepted so far.
        $label = 'img_' . sprintf('%03d', count($collected) + 1);
        $collected[$label] = [
            'src' => $source,
            'alt' => normalizeText((string) $candidate->getAttribute('alt')),
        ];
    }

    return $collected;
}
|
||||
|
||||
/**
 * Decide whether a text node counts as visible page content.
 *
 * A node is treated as hidden when any of its ancestor elements is one of
 * the non-content tags listed below (compared case-insensitively). No CSS
 * or attribute-based visibility is considered.
 */
function isVisibleTextNode(DOMText $textNode): bool
{
    $hiddenTags = [
        'script' => true,
        'style' => true,
        'noscript' => true,
        'template' => true,
        'svg' => true,
        'path' => true,
        'defs' => true,
    ];

    // Walk up the ancestor chain until the top of the tree is reached.
    for ($ancestor = $textNode->parentNode; $ancestor instanceof DOMNode; $ancestor = $ancestor->parentNode) {
        if ($ancestor instanceof DOMElement && isset($hiddenTags[strtolower($ancestor->tagName)])) {
            return false;
        }
    }

    return true;
}
|
||||
|
||||
/**
 * Collapse every run of whitespace in $value to a single space and trim
 * the result.
 *
 * The UTF-8 aware pattern is tried first. preg_replace() returns null when
 * the subject is not valid UTF-8, in which case a byte-mode pattern is used
 * so whitespace is still collapsed instead of being left untouched.
 *
 * @param string $value Raw text, e.g. a text node or attribute value.
 * @return string The normalized text; empty when $value is only whitespace.
 */
function normalizeText(string $value): string
{
    $value = str_replace(["\r", "\n", "\t"], ' ', $value);

    $collapsed = preg_replace('/\s+/u', ' ', $value)
        ?? preg_replace('/\s+/', ' ', $value)
        ?? $value;

    return trim($collapsed);
}
|
||||
|
||||
/**
 * Run an XPath query and return its first matching node.
 *
 * @return DOMNode|null Null when the query fails, matches nothing, or its
 *                      first result is not a DOMNode.
 */
function firstNode(DOMXPath $xpath, string $query): ?DOMNode
{
    $matches = $xpath->query($query);

    // item(0) yields null for an empty list, so no separate length check is needed.
    $first = $matches instanceof DOMNodeList ? $matches->item(0) : null;

    return $first instanceof DOMNode ? $first : null;
}
|
||||
|
||||
/**
 * Return the whitespace-normalized value of attribute $attr on the first
 * node matched by $query.
 *
 * @return string Empty when nothing matches or the first match is not an element.
 */
function firstAttrValue(DOMXPath $xpath, string $query, string $attr): string
{
    $match = firstNode($xpath, $query);

    return $match instanceof DOMElement
        ? normalizeText((string) $match->getAttribute($attr))
        : '';
}
|
||||
Reference in New Issue
Block a user