924 lines
32 KiB
PHP
Executable File
924 lines
32 KiB
PHP
Executable File
<?php
|
|
|
|
namespace Kirby\Toolkit;
|
|
|
|
use Closure;
|
|
use DOMAttr;
|
|
use DOMDocument;
|
|
use DOMDocumentType;
|
|
use DOMElement;
|
|
use DOMNode;
|
|
use DOMProcessingInstruction;
|
|
use DOMXPath;
|
|
use Kirby\Cms\App;
|
|
use Kirby\Exception\Exception;
|
|
use Kirby\Exception\InvalidArgumentException;
|
|
|
|
/**
|
|
* Helper class for DOM handling using the DOMDocument class
|
|
* @since 3.5.8
|
|
*
|
|
* @package Kirby Toolkit
|
|
* @author Bastian Allgeier <bastian@getkirby.com>,
|
|
* Lukas Bestle <lukas@getkirby.com>
|
|
* @link https://getkirby.com
|
|
* @copyright Bastian Allgeier GmbH
|
|
* @license https://opensource.org/licenses/MIT
|
|
*/
|
|
class Dom
|
|
{
|
|
/**
|
|
* Cache for the HTML body
|
|
*
|
|
* @var \DOMElement|null
|
|
*/
|
|
protected $body;
|
|
|
|
/**
|
|
* The original input code as
|
|
* passed to the constructor
|
|
*
|
|
* @var string
|
|
*/
|
|
protected $code;
|
|
|
|
/**
|
|
* Document object
|
|
*
|
|
* @var \DOMDocument
|
|
*/
|
|
protected $doc;
|
|
|
|
/**
|
|
* Document type (`'HTML'` or `'XML'`)
|
|
*
|
|
* @var string
|
|
*/
|
|
protected $type;
|
|
|
|
/**
|
|
* Class constructor
|
|
*
|
|
* @param string $code XML or HTML code
|
|
* @param string $type Document type (`'HTML'` or `'XML'`)
|
|
*/
|
|
public function __construct(string $code, string $type = 'HTML')
|
|
{
|
|
$this->code = $code;
|
|
$this->doc = new DOMDocument();
|
|
|
|
$loaderSetting = null;
|
|
if (\PHP_VERSION_ID < 80000) {
|
|
// prevent loading external entities to protect against XXE attacks;
|
|
// only needed for PHP versions before 8.0 (the function was deprecated
|
|
// as the disabled state is the new default in PHP 8.0+)
|
|
$loaderSetting = libxml_disable_entity_loader(true);
|
|
}
|
|
|
|
// switch to "user error handling"
|
|
$intErrorsSetting = libxml_use_internal_errors(true);
|
|
|
|
$this->type = strtoupper($type);
|
|
if ($this->type === 'HTML') {
|
|
// ensure proper parsing for HTML snippets
|
|
if (preg_match('/<(html|body)[> ]/i', $code) !== 1) {
|
|
$code = '<body>' . $code . '</body>';
|
|
}
|
|
|
|
// the loadHTML() method expects ISO-8859-1 by default;
|
|
// force parsing as UTF-8 by injecting an XML declaration
|
|
$xmlDeclaration = 'encoding="UTF-8" id="' . Str::random(10) . '"';
|
|
$load = $this->doc->loadHTML('<?xml ' . $xmlDeclaration . '>' . $code);
|
|
|
|
// remove the injected XML declaration again
|
|
$pis = $this->query('//processing-instruction()');
|
|
foreach (iterator_to_array($pis, false) as $pi) {
|
|
if ($pi->data === $xmlDeclaration) {
|
|
static::remove($pi);
|
|
}
|
|
}
|
|
|
|
// remove the default doctype
|
|
if (Str::contains($code, '<!DOCTYPE ', true) === false) {
|
|
static::remove($this->doc->doctype);
|
|
}
|
|
} else {
|
|
$load = $this->doc->loadXML($code);
|
|
}
|
|
|
|
if (\PHP_VERSION_ID < 80000) {
|
|
// ensure that we don't alter global state by
|
|
// resetting the original value
|
|
libxml_disable_entity_loader($loaderSetting);
|
|
}
|
|
|
|
// get one error for use below and reset the global state
|
|
$error = libxml_get_last_error();
|
|
libxml_clear_errors();
|
|
libxml_use_internal_errors($intErrorsSetting);
|
|
|
|
if ($load !== true) {
|
|
$message = 'The markup could not be parsed';
|
|
|
|
if ($error !== false) {
|
|
$message .= ': ' . $error->message;
|
|
}
|
|
|
|
throw new InvalidArgumentException([
|
|
'fallback' => $message,
|
|
'details' => compact('error')
|
|
]);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns the HTML body if one exists
|
|
*
|
|
* @return \DOMElement|null
|
|
*/
|
|
public function body()
|
|
{
|
|
return $this->body ??= $this->query('/html/body')[0] ?? null;
|
|
}
|
|
|
|
/**
|
|
* Returns the document object
|
|
*
|
|
* @return \DOMDocument
|
|
*/
|
|
public function document()
|
|
{
|
|
return $this->doc;
|
|
}
|
|
|
|
/**
|
|
* Extracts all URLs wrapped in a url() wrapper. E.g. for style attributes.
|
|
* @internal
|
|
*
|
|
* @param string $value
|
|
* @return array
|
|
*/
|
|
public static function extractUrls(string $value): array
|
|
{
|
|
// remove invisible ASCII characters from the value
|
|
$value = trim(preg_replace('/[^ -~]/u', '', $value));
|
|
|
|
$count = preg_match_all(
|
|
'!url\(\s*[\'"]?(.*?)[\'"]?\s*\)!i',
|
|
$value,
|
|
$matches,
|
|
PREG_PATTERN_ORDER
|
|
);
|
|
|
|
if (is_int($count) === true && $count > 0) {
|
|
return $matches[1];
|
|
}
|
|
|
|
return [];
|
|
}
|
|
|
|
/**
|
|
* Checks for allowed attributes according to the allowlist
|
|
* @internal
|
|
*
|
|
* @param \DOMAttr $attr
|
|
* @param array $options
|
|
* @return true|string If not allowed, an error message is returned
|
|
*/
|
|
public static function isAllowedAttr(DOMAttr $attr, array $options)
|
|
{
|
|
$allowedTags = $options['allowedTags'];
|
|
|
|
// check if the attribute is in the list of global allowed attributes
|
|
$isAllowedGlobalAttr = static::isAllowedGlobalAttr($attr, $options);
|
|
|
|
// no specific tag attribute list
|
|
if (is_array($allowedTags) === false) {
|
|
return $isAllowedGlobalAttr;
|
|
}
|
|
|
|
// configuration per tag name
|
|
$tagName = $attr->ownerElement->nodeName;
|
|
$listedTagName = static::listContainsName(array_keys($options['allowedTags']), $attr->ownerElement, $options);
|
|
$allowedAttrsForTag = $listedTagName ? ($allowedTags[$listedTagName] ?? true) : true;
|
|
|
|
// the element allows all global attributes
|
|
if ($allowedAttrsForTag === true) {
|
|
return $isAllowedGlobalAttr;
|
|
}
|
|
|
|
// specific attributes are allowed in addition to the global ones
|
|
if (is_array($allowedAttrsForTag) === true) {
|
|
// if allowed globally, we don't need further checks
|
|
if ($isAllowedGlobalAttr === true) {
|
|
return true;
|
|
}
|
|
|
|
// otherwise the tag configuration decides
|
|
if (static::listContainsName($allowedAttrsForTag, $attr, $options) !== false) {
|
|
return true;
|
|
}
|
|
|
|
return 'Not allowed by the "' . $tagName . '" element';
|
|
}
|
|
|
|
return 'The "' . $tagName . '" element does not allow attributes';
|
|
}
|
|
|
|
/**
|
|
* Checks for allowed attributes according to the global allowlist
|
|
* @internal
|
|
*
|
|
* @param \DOMAttr $attr
|
|
* @param array $options
|
|
* @return true|string If not allowed, an error message is returned
|
|
*/
|
|
public static function isAllowedGlobalAttr(DOMAttr $attr, array $options)
|
|
{
|
|
$allowedAttrs = $options['allowedAttrs'];
|
|
|
|
if ($allowedAttrs === true) {
|
|
// all attributes are allowed
|
|
return true;
|
|
}
|
|
|
|
if (
|
|
static::listContainsName(
|
|
$options['allowedAttrPrefixes'],
|
|
$attr,
|
|
$options,
|
|
fn ($expected, $real): bool => Str::startsWith($real, $expected)
|
|
) !== false
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
if (
|
|
is_array($allowedAttrs) === true &&
|
|
static::listContainsName($allowedAttrs, $attr, $options) !== false
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
return 'Not included in the global allowlist';
|
|
}
|
|
|
|
/**
|
|
* Checks if the URL is acceptable for URL attributes
|
|
* @internal
|
|
*
|
|
* @param string $url
|
|
* @param array $options
|
|
* @return true|string If not allowed, an error message is returned
|
|
*/
|
|
public static function isAllowedUrl(string $url, array $options)
|
|
{
|
|
$url = Str::lower($url);
|
|
|
|
// allow empty URL values
|
|
if (empty($url) === true) {
|
|
return true;
|
|
}
|
|
|
|
// allow URLs that point to fragments inside the file
|
|
if (mb_substr($url, 0, 1) === '#') {
|
|
return true;
|
|
}
|
|
|
|
// disallow protocol-relative URLs
|
|
if (mb_substr($url, 0, 2) === '//') {
|
|
return 'Protocol-relative URLs are not allowed';
|
|
}
|
|
|
|
// allow site-internal URLs that didn't match the
|
|
// protocol-relative check above
|
|
if (mb_substr($url, 0, 1) === '/') {
|
|
// if a CMS instance is active, only allow the URL
|
|
// if it doesn't point outside of the index URL
|
|
if ($kirby = App::instance(null, true)) {
|
|
$indexUrl = $kirby->url('index', true)->path()->toString(true);
|
|
|
|
if (Str::startsWith($url, $indexUrl) !== true) {
|
|
return 'The URL points outside of the site index URL';
|
|
}
|
|
|
|
// disallow directory traversal outside of the index URL
|
|
// TODO: the ../ sequences could be cleaned from the URL
|
|
// before the check by normalizing the URL; then the
|
|
// check above can also validate URLs with ../ sequences
|
|
if (
|
|
Str::contains($url, '../') !== false ||
|
|
Str::contains($url, '..\\') !== false
|
|
) {
|
|
return 'The ../ sequence is not allowed in relative URLs';
|
|
}
|
|
}
|
|
|
|
// no active CMS instance, always allow site-internal URLs
|
|
return true;
|
|
}
|
|
|
|
// allow relative URLs (= URLs without a scheme);
|
|
// this is either a URL without colon or one where the
|
|
// part before the colon is definitely no valid scheme;
|
|
// see https://url.spec.whatwg.org/#url-writing
|
|
if (
|
|
Str::contains($url, ':') === false ||
|
|
Str::contains(Str::before($url, ':'), '/') === true
|
|
) {
|
|
// disallow directory traversal as we cannot know
|
|
// in which URL context the URL will be printed
|
|
if (
|
|
Str::contains($url, '../') !== false ||
|
|
Str::contains($url, '..\\') !== false
|
|
) {
|
|
return 'The ../ sequence is not allowed in relative URLs';
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// allow specific HTTP(S) URLs
|
|
if (
|
|
Str::startsWith($url, 'http://') === true ||
|
|
Str::startsWith($url, 'https://') === true
|
|
) {
|
|
if ($options['allowedDomains'] === true) {
|
|
return true;
|
|
}
|
|
|
|
$hostname = parse_url($url, PHP_URL_HOST);
|
|
|
|
if (in_array($hostname, $options['allowedDomains']) === true) {
|
|
return true;
|
|
}
|
|
|
|
return 'The hostname "' . $hostname . '" is not allowed';
|
|
}
|
|
|
|
// allow listed data URIs
|
|
if (Str::startsWith($url, 'data:') === true) {
|
|
if ($options['allowedDataUris'] === true) {
|
|
return true;
|
|
}
|
|
|
|
foreach ($options['allowedDataUris'] as $dataAttr) {
|
|
if (Str::startsWith($url, $dataAttr) === true) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return 'Invalid data URI';
|
|
}
|
|
|
|
// allow valid email addresses
|
|
if (Str::startsWith($url, 'mailto:') === true) {
|
|
$address = Str::after($url, 'mailto:');
|
|
|
|
if (empty($address) === true || V::email($address) === true) {
|
|
return true;
|
|
}
|
|
|
|
return 'Invalid email address';
|
|
}
|
|
|
|
// allow valid telephone numbers
|
|
if (Str::startsWith($url, 'tel:') === true) {
|
|
$address = Str::after($url, 'tel:');
|
|
|
|
if (
|
|
empty($address) === true ||
|
|
preg_match('!^[+]?[0-9]+$!', $address) === 1
|
|
) {
|
|
return true;
|
|
}
|
|
|
|
return 'Invalid telephone number';
|
|
}
|
|
|
|
return 'Unknown URL type';
|
|
}
|
|
|
|
/**
|
|
* Check if the XML extension is installed on the server.
|
|
* Otherwise DOMDocument won't be available and the Dom cannot
|
|
* work at all.
|
|
*
|
|
* @return bool
|
|
*
|
|
* @codeCoverageIgnore
|
|
*/
|
|
public static function isSupported(): bool
|
|
{
|
|
return class_exists('DOMDocument') === true;
|
|
}
|
|
|
|
/**
|
|
* Returns the XML or HTML markup contained in the node
|
|
*
|
|
* @param \DOMNode $node
|
|
* @return string
|
|
*/
|
|
public function innerMarkup(DOMNode $node): string
|
|
{
|
|
$markup = '';
|
|
$method = 'save' . $this->type;
|
|
|
|
foreach ($node->childNodes as $child) {
|
|
$markup .= $node->ownerDocument->$method($child);
|
|
}
|
|
|
|
return $markup;
|
|
}
|
|
|
|
/**
|
|
* Checks if a list contains the name of a node considering
|
|
* the allowed namespaces
|
|
* @internal
|
|
*
|
|
* @param array $list
|
|
* @param \DOMNode $node
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @param \Closure|null Comparison callback that returns whether the expected and real name match
|
|
* @return string|false Matched name in the list or `false`
|
|
*/
|
|
public static function listContainsName(array $list, DOMNode $node, array $options, ?Closure $compare = null)
|
|
{
|
|
$allowedNamespaces = $options['allowedNamespaces'];
|
|
$localName = $node->localName;
|
|
|
|
if ($compare === null) {
|
|
$compare = fn ($expected, $real): bool => $expected === $real;
|
|
}
|
|
|
|
// if the configuration does not define namespace URIs or if the
|
|
// currently checked node is from the special `xml:` namespace
|
|
// that has a fixed namespace according to the XML spec...
|
|
if ($allowedNamespaces === true || $node->namespaceURI === 'http://www.w3.org/XML/1998/namespace') {
|
|
// ...take the list as it is and only consider
|
|
// exact matches of the local name (which will
|
|
// contain a namespace if that namespace name
|
|
// is not defined in the document)
|
|
|
|
// the list contains the `xml:` prefix, so add it to the name as well
|
|
if ($node->namespaceURI === 'http://www.w3.org/XML/1998/namespace') {
|
|
$localName = 'xml:' . $localName;
|
|
}
|
|
|
|
foreach ($list as $item) {
|
|
if ($compare($item, $localName) === true) {
|
|
return $item;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// we need to consider the namespaces
|
|
foreach ($list as $item) {
|
|
// try to find the expected origin namespace URI
|
|
$namespaceUri = null;
|
|
$itemLocal = $item;
|
|
if (Str::contains($item, ':') === true) {
|
|
list($namespaceName, $itemLocal) = explode(':', $item);
|
|
$namespaceUri = $allowedNamespaces[$namespaceName] ?? null;
|
|
} else {
|
|
// list items without namespace are from the default namespace
|
|
$namespaceUri = $allowedNamespaces[''] ?? null;
|
|
}
|
|
|
|
// try if we can find an exact namespaced match
|
|
if ($namespaceUri === $node->namespaceURI && $compare($itemLocal, $localName) === true) {
|
|
return $item;
|
|
}
|
|
|
|
// also try to match the fully-qualified name
|
|
// if the document doesn't define the namespace
|
|
if ($node->namespaceURI === null && $compare($item, $node->nodeName) === true) {
|
|
return $item;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Removes a node from the document
|
|
*
|
|
* @param \DOMNode $node
|
|
* @return void
|
|
*/
|
|
public static function remove(DOMNode $node): void
|
|
{
|
|
$node->parentNode->removeChild($node);
|
|
}
|
|
|
|
/**
|
|
* Executes an XPath query in the document
|
|
*
|
|
* @param string $query
|
|
* @param \DOMNode|null $node Optional context node for relative queries
|
|
* @return \DOMNodeList|false
|
|
*/
|
|
public function query(string $query, ?DOMNode $node = null)
|
|
{
|
|
return (new DOMXPath($this->doc))->query($query, $node);
|
|
}
|
|
|
|
/**
|
|
* Sanitizes the DOM according to the provided configuration
|
|
*
|
|
* @param array $options Array with the following options:
|
|
* - `allowedAttrPrefixes`: Global list of allowed attribute prefixes
|
|
* like `data-` and `aria-`
|
|
* - `allowedAttrs`: Global list of allowed attrs or `true` to allow
|
|
* any attribute
|
|
* - `allowedDataUris`: List of all MIME types that may be used in
|
|
* data URIs (only checked in `urlAttrs` and inside `url()` wrappers)
|
|
* or `true` for any
|
|
* - `allowedDomains`: Allowed hostnames for HTTP(S) URLs in `urlAttrs`
|
|
* and inside `url()` wrappers or `true` for any
|
|
* - `allowedNamespaces`: Associative array of all allowed namespace URIs;
|
|
* the array keys are reference names that can be referred to from the
|
|
* `allowedAttrPrefixes`, `allowedAttrs`, `allowedTags`, `disallowedTags`
|
|
* and `urlAttrs` lists; the namespace names as used in the document are *not*
|
|
* validated; setting the whole option to `true` will allow any namespace
|
|
* - `allowedPIs`: Names of allowed XML processing instructions or
|
|
* `true` for any
|
|
* - `allowedTags`: Associative array of all allowed tag names with the
|
|
* value of either an array with the list of all allowed attributes for
|
|
* this tag, `true` to allow any attribute from the `allowedAttrs` list
|
|
* or `false` to allow the tag without any attributes;
|
|
* not listed tags will be unwrapped (removed, but children are kept);
|
|
* setting the whole option to `true` will allow any tag
|
|
* - `attrCallback`: Closure that will receive each `DOMAttr` and may
|
|
* modify it; the callback must return an array with exception
|
|
* objects for each modification
|
|
* - `disallowedTags`: Array of explicitly disallowed tags, which will
|
|
* be removed completely including their children (matched case-insensitively)
|
|
* - `doctypeCallback`: Closure that will receive the `DOMDocumentType`
|
|
* and may throw exceptions on validation errors
|
|
* - `elementCallback`: Closure that will receive each `DOMElement` and
|
|
* may modify it; the callback must return an array with exception
|
|
* objects for each modification
|
|
* - `urlAttrs`: List of attributes that may contain URLs
|
|
* @return array List of validation errors during sanitization
|
|
*
|
|
* @throws \Kirby\Exception\InvalidArgumentException If the doctype is not valid
|
|
*/
|
|
public function sanitize(array $options): array
|
|
{
|
|
$options = array_merge([
|
|
'allowedAttrPrefixes' => [],
|
|
'allowedAttrs' => true,
|
|
'allowedDataUris' => true,
|
|
'allowedDomains' => true,
|
|
'allowedNamespaces' => true,
|
|
'allowedPIs' => true,
|
|
'allowedTags' => true,
|
|
'attrCallback' => null,
|
|
'disallowedTags' => [],
|
|
'doctypeCallback' => null,
|
|
'elementCallback' => null,
|
|
'urlAttrs' => ['href', 'src', 'xlink:href'],
|
|
], $options);
|
|
|
|
$errors = [];
|
|
|
|
// validate the doctype;
|
|
// convert the `DOMNodeList` to an array first, otherwise removing
|
|
// nodes would shift the list and make subsequent operations fail
|
|
foreach (iterator_to_array($this->doc->childNodes, false) as $child) {
|
|
if (is_a($child, 'DOMDocumentType') === true) {
|
|
$this->sanitizeDoctype($child, $options, $errors);
|
|
}
|
|
}
|
|
|
|
// validate all processing instructions like <?xml-stylesheet
|
|
$pis = $this->query('//processing-instruction()');
|
|
foreach (iterator_to_array($pis, false) as $pi) {
|
|
$this->sanitizePI($pi, $options, $errors);
|
|
}
|
|
|
|
// validate all elements in the document tree
|
|
$elements = $this->doc->getElementsByTagName('*');
|
|
foreach (iterator_to_array($elements, false) as $element) {
|
|
$this->sanitizeElement($element, $options, $errors);
|
|
}
|
|
|
|
return $errors;
|
|
}
|
|
|
|
/**
|
|
* Returns the document markup as string
|
|
*
|
|
* @param bool $normalize If set to `true`, the document
|
|
* is exported with an XML declaration/
|
|
* full HTML markup even if the input
|
|
* didn't have them
|
|
* @return string
|
|
*/
|
|
public function toString(bool $normalize = false): string
|
|
{
|
|
if ($this->type === 'HTML') {
|
|
$string = $this->exportHtml($normalize);
|
|
} else {
|
|
$string = $this->exportXml($normalize);
|
|
}
|
|
|
|
// add trailing newline if the input contained one
|
|
if (rtrim($this->code, "\r\n") !== $this->code) {
|
|
$string .= "\n";
|
|
}
|
|
|
|
return $string;
|
|
}
|
|
|
|
/**
|
|
* Removes a node from the document but keeps its children
|
|
* by moving them one level up
|
|
*
|
|
* @param \DOMNode $node
|
|
* @return void
|
|
*/
|
|
public static function unwrap(DOMNode $node): void
|
|
{
|
|
foreach ($node->childNodes as $childNode) {
|
|
// discard text nodes as they can be unexpected
|
|
// directly in the parent element
|
|
if (is_a($childNode, 'DOMText') === true) {
|
|
continue;
|
|
}
|
|
|
|
$node->parentNode->insertBefore(clone $childNode, $node);
|
|
}
|
|
|
|
static::remove($node);
|
|
}
|
|
|
|
/**
|
|
* Returns the document markup as HTML string
|
|
*
|
|
* @param bool $normalize If set to `true`, the document
|
|
* is exported with full HTML markup
|
|
* even if the input didn't have it
|
|
* @return string
|
|
*/
|
|
protected function exportHtml(bool $normalize = false): string
|
|
{
|
|
// enforce export as UTF-8 by injecting a <meta> tag
|
|
// at the beginning of the document
|
|
$metaTag = $this->doc->createElement('meta');
|
|
$metaTag->setAttribute('http-equiv', 'Content-Type');
|
|
$metaTag->setAttribute('content', 'text/html; charset=utf-8');
|
|
$metaTag->setAttribute('id', $metaId = Str::random(10));
|
|
$this->doc->insertBefore($metaTag, $this->doc->documentElement);
|
|
|
|
if (
|
|
preg_match('/<html[> ]/i', $this->code) === 1 ||
|
|
$this->doc->doctype !== null ||
|
|
$normalize === true
|
|
) {
|
|
// full document
|
|
$html = $this->doc->saveHTML();
|
|
} elseif (preg_match('/<body[> ]/i', $this->code) === 1) {
|
|
// there was a <body>, but no <html>; export just the <body>
|
|
$html = $this->doc->saveHTML($this->body());
|
|
} else {
|
|
// just an HTML snippet
|
|
$html = $this->innerMarkup($this->body());
|
|
}
|
|
|
|
// remove the <meta> tag from the document and from the output
|
|
static::remove($metaTag);
|
|
$html = str_replace($this->doc->saveHTML($metaTag), '', $html);
|
|
|
|
return trim($html);
|
|
}
|
|
|
|
/**
|
|
* Returns the document markup as XML string
|
|
*
|
|
* @param bool $normalize If set to `true`, the document
|
|
* is exported with an XML declaration
|
|
* even if the input didn't have it
|
|
* @return string
|
|
*/
|
|
protected function exportXml(bool $normalize = false): string
|
|
{
|
|
if (Str::contains($this->code, '<?xml ', true) === false && $normalize === false) {
|
|
// the input didn't contain an XML declaration;
|
|
// only return child nodes, which omits it
|
|
$result = [];
|
|
foreach ($this->doc->childNodes as $node) {
|
|
$result[] = $this->doc->saveXML($node);
|
|
}
|
|
|
|
return implode("\n", $result);
|
|
}
|
|
|
|
// ensure that the document is encoded as UTF-8
|
|
// unless a different encoding was specified in
|
|
// the input or before exporting
|
|
if ($this->doc->encoding === null) {
|
|
$this->doc->encoding = 'UTF-8';
|
|
}
|
|
|
|
return trim($this->doc->saveXML());
|
|
}
|
|
|
|
/**
|
|
* Sanitizes an attribute
|
|
*
|
|
* @param \DOMAttr $attr
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @param array $errors Array to store additional errors in by reference
|
|
* @return void
|
|
*/
|
|
protected function sanitizeAttr(DOMAttr $attr, array $options, array &$errors): void
|
|
{
|
|
$element = $attr->ownerElement;
|
|
$name = $attr->nodeName;
|
|
$value = $attr->value;
|
|
|
|
$allowed = static::isAllowedAttr($attr, $options);
|
|
if ($allowed !== true) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The "' . $name . '" attribute (line ' .
|
|
$attr->getLineNo() . ') is not allowed: ' .
|
|
$allowed
|
|
);
|
|
$element->removeAttributeNode($attr);
|
|
} elseif (static::listContainsName($options['urlAttrs'], $attr, $options) !== false) {
|
|
$allowed = static::isAllowedUrl($value, $options);
|
|
if ($allowed !== true) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The URL is not allowed in attribute "' .
|
|
$name . '" (line ' . $attr->getLineNo() . '): ' .
|
|
$allowed
|
|
);
|
|
$element->removeAttributeNode($attr);
|
|
}
|
|
} else {
|
|
// check for unwanted URLs in other attributes
|
|
foreach (static::extractUrls($value) as $url) {
|
|
$allowed = static::isAllowedUrl($url, $options);
|
|
if ($allowed !== true) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The URL is not allowed in attribute "' .
|
|
$name . '" (line ' . $attr->getLineNo() . '): ' .
|
|
$allowed
|
|
);
|
|
$element->removeAttributeNode($attr);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Sanitizes the doctype
|
|
*
|
|
* @param \DOMDocumentType $doctype
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @param array $errors Array to store additional errors in by reference
|
|
* @return void
|
|
*/
|
|
protected function sanitizeDoctype(DOMDocumentType $doctype, array $options, array &$errors): void
|
|
{
|
|
try {
|
|
$this->validateDoctype($doctype, $options);
|
|
} catch (InvalidArgumentException $e) {
|
|
$errors[] = $e;
|
|
static::remove($doctype);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Sanitizes a single DOM element and its attribute
|
|
*
|
|
* @param \DOMElement $element
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @param array $errors Array to store additional errors in by reference
|
|
* @return void
|
|
*/
|
|
protected function sanitizeElement(DOMElement $element, array $options, array &$errors): void
|
|
{
|
|
$name = $element->nodeName;
|
|
|
|
// check defined namespaces (`xmlns` attributes);
|
|
// we need to check this first as the namespace can affect
|
|
// whether the tag name is valid according to the configuration
|
|
if (is_array($options['allowedNamespaces']) === true) {
|
|
$simpleXmlElement = simplexml_import_dom($element);
|
|
foreach ($simpleXmlElement->getDocNamespaces(false, false) as $namespace => $value) {
|
|
if (array_search($value, $options['allowedNamespaces']) === false) {
|
|
$element->removeAttributeNS($value, $namespace);
|
|
$errors[] = new InvalidArgumentException(
|
|
'The namespace "' . $value . '" is not allowed' .
|
|
' (around line ' . $element->getLineNo() . ')'
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
// check if the tag is blocklisted; remove the element completely
|
|
if (
|
|
static::listContainsName(
|
|
$options['disallowedTags'],
|
|
$element,
|
|
$options,
|
|
fn ($expected, $real): bool => Str::lower($expected) === Str::lower($real)
|
|
) !== false
|
|
) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The "' . $name . '" element (line ' .
|
|
$element->getLineNo() . ') is not allowed'
|
|
);
|
|
static::remove($element);
|
|
|
|
return;
|
|
}
|
|
|
|
// check if the tag is not allowlisted; keep children
|
|
if ($options['allowedTags'] !== true) {
|
|
$listedName = static::listContainsName(array_keys($options['allowedTags']), $element, $options);
|
|
|
|
if ($listedName === false) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The "' . $name . '" element (line ' .
|
|
$element->getLineNo() . ') is not allowed, ' .
|
|
'but its children can be kept'
|
|
);
|
|
static::unwrap($element);
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
// check attributes
|
|
if ($element->hasAttributes()) {
|
|
// convert the `DOMNodeList` to an array first, otherwise removing
|
|
// attributes would shift the list and make subsequent operations fail
|
|
foreach (iterator_to_array($element->attributes, false) as $attr) {
|
|
$this->sanitizeAttr($attr, $options, $errors);
|
|
|
|
// custom check (if the attribute is still in the document)
|
|
if ($attr->ownerElement !== null && $options['attrCallback']) {
|
|
$errors = array_merge($errors, $options['attrCallback']($attr) ?? []);
|
|
}
|
|
}
|
|
}
|
|
|
|
// custom check
|
|
if ($options['elementCallback']) {
|
|
$errors = array_merge($errors, $options['elementCallback']($element) ?? []);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Sanitizes a single XML processing instruction
|
|
*
|
|
* @param \DOMProcessingInstruction $pi
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @param array $errors Array to store additional errors in by reference
|
|
* @return void
|
|
*/
|
|
protected function sanitizePI(DOMProcessingInstruction $pi, array $options, array &$errors): void
|
|
{
|
|
$name = $pi->nodeName;
|
|
|
|
// check for allow-listed processing instructions
|
|
if (is_array($options['allowedPIs']) === true && in_array($name, $options['allowedPIs']) === false) {
|
|
$errors[] = new InvalidArgumentException(
|
|
'The "' . $name . '" processing instruction (line ' .
|
|
$pi->getLineNo() . ') is not allowed'
|
|
);
|
|
static::remove($pi);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Validates the document type
|
|
*
|
|
* @param \DOMDocumentType $doctype
|
|
* @param array $options See `Dom::sanitize()`
|
|
* @return void
|
|
*
|
|
* @throws \Kirby\Exception\InvalidArgumentException If the doctype is not valid
|
|
*/
|
|
protected function validateDoctype(DOMDocumentType $doctype, array $options): void
|
|
{
|
|
if (empty($doctype->publicId) === false || empty($doctype->systemId) === false) {
|
|
throw new InvalidArgumentException('The doctype must not reference external files');
|
|
}
|
|
|
|
if (empty($doctype->internalSubset) === false) {
|
|
throw new InvalidArgumentException('The doctype must not define a subset');
|
|
}
|
|
|
|
if ($options['doctypeCallback']) {
|
|
$options['doctypeCallback']($doctype);
|
|
}
|
|
}
|
|
}
|