Upgrade to 3.6.0

This commit is contained in:
Bastian Allgeier
2021-11-16 14:53:37 +01:00
parent 7388fa4d24
commit 92b7a330fa
318 changed files with 20017 additions and 6878 deletions

View File

@@ -3,21 +3,53 @@
namespace Kirby\Parsley;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use Kirby\Toolkit\Str;
/**
* Represents a block level element
* in an HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Element
{
/**
* @var array
*/
protected $marks;
/**
* @var \DOMElement
*/
protected $node;
/**
 * Wraps a DOM element together with the list of marks
 * that is used as default when its HTML gets rendered
 *
 * @param \DOMElement $node The wrapped DOM element
 * @param array $marks Default marks for `innerHtml()`
 */
public function __construct(DOMElement $node, array $marks = [])
{
    [$this->node, $this->marks] = [$node, $marks];
}
public function attr(string $attr, $fallback = null)
/**
* Returns the attribute value or
* the given fallback if the attribute does not exist
*
* @param string $attr
* @param string|null $fallback
* @return string|null
*/
public function attr(string $attr, string $fallback = null): ?string
{
if ($this->node->hasAttribute($attr)) {
return $this->node->getAttribute($attr) ?? $fallback;
@@ -26,27 +58,54 @@ class Element
return $fallback;
}
public function children()
/**
* Returns a list of all child elements
*
* @return \DOMNodeList
*/
public function children(): DOMNodeList
{
return $this->node->childNodes;
}
/**
 * Splits the value of the class attribute at
 * spaces and returns the resulting class names
 *
 * @return array
 */
public function classList(): array
{
    $className = $this->className();

    return Str::split($className, ' ');
}
public function className()
/**
* Returns the value of the class attribute
*
* @return string|null
*/
public function className(): ?string
{
return $this->node->getAttribute('class');
return $this->attr('class');
}
/**
* Returns the original DOM element that
* was passed to the constructor
*
* @return \DOMElement
*/
public function element()
{
return $this->node;
}
public function filter(string $query)
/**
* Returns an array with all nested elements
* that could be found for the given query
*
* @param string $query
* @return array
*/
public function filter(string $query): array
{
$result = [];
@@ -59,40 +118,80 @@ class Element
return $result;
}
/**
* Tries to find a single nested element by
* query and otherwise returns null
*
* @param string $query
* @return \Kirby\Parsley\Element|null
*/
public function find(string $query)
{
if ($result = $this->query($query)[0]) {
return new static($result);
}
return false;
return null;
}
/**
 * Returns the content of the element as clean
 * HTML, created by the Inline parser
 *
 * @param array|null $marks List of allowed marks; the marks of
 *                          the element are used when omitted
 * @return string
 */
public function innerHtml(array $marks = null): string
{
    if ($marks === null) {
        $marks = $this->marks;
    }

    $inline = new Inline($this->node, $marks);

    return $inline->innerHtml();
}
public function innerText()
/**
* Returns the contents as plain text
*
* @return string
*/
public function innerText(): string
{
return trim($this->node->textContent);
}
/**
 * Serializes the element, including its own
 * tag, through its owner document
 *
 * @param array|null $marks Not used by the current implementation
 * @return string
 */
public function outerHtml(array $marks = null): string
{
    $document = $this->node->ownerDocument;

    return $document->saveHtml($this->node);
}
public function query($query)
/**
* Searches nested elements
*
* @param string $query
* @return DOMNodeList|null
*/
public function query(string $query)
{
return (new DOMXPath($this->node->ownerDocument))->query($query, $this->node);
}
/**
 * Removes the element from the DOM
 *
 * Does nothing if the element has no parent node,
 * e.g. because it has already been removed before.
 *
 * @return void
 */
public function remove()
{
    $parent = $this->node->parentNode;

    // a detached node has no parent; calling
    // removeChild() on null would be a fatal error
    if ($parent === null) {
        return;
    }

    $parent->removeChild($this->node);
}
/**
* Returns the name of the element
*
* @return string
*/
public function tagName(): string
{
return $this->node->tagName;

View File

@@ -2,70 +2,173 @@
namespace Kirby\Parsley;
use DOMNode;
use DOMNodeList;
use Kirby\Toolkit\Html;
/**
* Represents an inline element
* in an HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Inline
{
/**
* @var string
*/
protected $html = '';
/**
* @var array
*/
protected $marks = [];
public function __construct($node, array $marks = [])
/**
* @param \DOMNode $node
* @param array $marks
*/
public function __construct(DOMNode $node, array $marks = [])
{
$this->createMarkRules($marks);
$this->html = trim($this->parseNode($node));
$this->html = trim(static::parseNode($node, $this->marks));
}
public function createMarkRules($marks)
/**
 * Indexes the given mark rules by their tag
 * name and stores them on the instance
 *
 * @param array $marks List of mark rules; each rule needs a `tag` key
 * @return array All registered rules, keyed by tag name
 */
protected function createMarkRules(array $marks)
{
    foreach ($marks as $rule) {
        $tag = $rule['tag'];
        $this->marks[$tag] = $rule;
    }

    return $this->marks;
}
public function parseChildren($children): string
/**
* Get all allowed attributes for a DOMNode
* as clean array
*
* @param DOMNode $node
* @param array $marks
* @return array
*/
public static function parseAttrs(DOMNode $node, array $marks = []): array
{
if (!$children) {
return '';
$attrs = [];
$mark = $marks[$node->tagName];
$defaults = $mark['defaults'] ?? [];
foreach ($mark['attrs'] ?? [] as $attr) {
if ($node->hasAttribute($attr)) {
$attrs[$attr] = $node->getAttribute($attr);
} else {
$attrs[$attr] = $defaults[$attr] ?? null;
}
}
return $attrs;
}
/**
* Parses all children and creates clean HTML
* for each of them.
*
* @param \DOMNodeList $children
* @param array $marks
* @return string
*/
public static function parseChildren(DOMNodeList $children, array $marks): string
{
$html = '';
foreach ($children as $child) {
$html .= $this->parseNode($child);
$html .= static::parseNode($child, $marks);
}
return $html;
}
public function parseNode($node)
/**
 * Creates clean inner HTML from the
 * child nodes of the given node
 *
 * @param \DOMNode $node
 * @param array $marks Mark rules, keyed by tag name
 * @return string|null Null if the resulting HTML is empty
 */
public static function parseInnerHtml(DOMNode $node, array $marks = []): ?string
{
    $innerHtml = static::parseChildren($node->childNodes, $marks);

    // surrounding whitespace is only removed for paragraphs
    if ($node->tagName === 'p') {
        $innerHtml = trim($innerHtml);
    }

    // empty inner HTML is reported as null
    return $innerHtml === '' ? null : $innerHtml;
}
/**
* Converts the given node to clean HTML
*
* @param \DOMNode $node
* @param array $marks
* @return string|null
*/
public static function parseNode(DOMNode $node, array $marks = []): ?string
{
if (is_a($node, 'DOMText') === true) {
return $node->textContent;
return Html::encode($node->textContent);
}
// ignore comments
if (is_a($node, 'DOMComment') === true) {
return '';
}
// known marks
if (array_key_exists($node->tagName, $this->marks) === true) {
$mark = $this->marks[$node->tagName];
$attrs = [];
$defaults = $mark['defaults'] ?? [];
foreach ($mark['attrs'] ?? [] as $attr) {
if ($node->hasAttribute($attr)) {
$attrs[$attr] = $node->getAttribute($attr);
} else {
$attrs[$attr] = $defaults[$attr] ?? null;
}
}
return '<' . $node->tagName . attr($attrs, ' ') . '>' . $this->parseChildren($node->childNodes) . '</' . $node->tagName . '>';
return null;
}
// unknown marks
return $this->parseChildren($node->childNodes);
if (array_key_exists($node->tagName, $marks) === false) {
return static::parseChildren($node->childNodes, $marks);
}
// collect all allowed attributes
$attrs = static::parseAttrs($node, $marks);
// close self-closing elements
if (Html::isVoid($node->tagName) === true) {
return '<' . $node->tagName . attr($attrs, ' ') . ' />';
}
$innerHtml = static::parseInnerHtml($node, $marks);
// skip empty paragraphs
if ($innerHtml === null && $node->tagName === 'p') {
return null;
}
// create the outer html for the element
return '<' . $node->tagName . attr($attrs, ' ') . '>' . $innerHtml . '</' . $node->tagName . '>';
}
public function innerHtml()
/**
* Returns the HTML contents of the element,
* i.e. the trimmed HTML string that was created
* for the node in the constructor
*
* @return string
*/
public function innerHtml(): string
{
return $this->html;
}

View File

@@ -2,23 +2,73 @@
namespace Kirby\Parsley;
use DOMDocument;
use DOMXPath;
use DOMNode;
use Kirby\Parsley\Schema\Plain;
use Kirby\Toolkit\Dom;
/**
* HTML parser to extract the best possible blocks
* from any kind of HTML document
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Parsley
{
/**
* @var array
*/
protected $blocks = [];
protected $body;
/**
* @var \DOMDocument
*/
protected $doc;
protected $inline;
/**
* @var \Kirby\Toolkit\Dom
*/
protected $dom;
/**
* @var array
*/
protected $inline = [];
/**
* @var array
*/
protected $marks = [];
/**
* @var array
*/
protected $nodes = [];
/**
* @var \Kirby\Parsley\Schema
*/
protected $schema;
/**
* @var array
*/
protected $skip = [];
/**
* @var bool
*/
public static $useXmlExtension = true;
/**
* @param string $html
* @param \Kirby\Parsley\Schema|null $schema
*/
public function __construct(string $html, Schema $schema = null)
{
// fail gracefully if the XML extension is not installed
@@ -33,45 +83,66 @@ class Parsley
return;
}
libxml_use_internal_errors(true);
$this->doc = new DOMDocument();
$this->doc->preserveWhiteSpace = false;
$this->doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
libxml_clear_errors();
if (!preg_match('/<body|head*.?>/', $html)) {
$html = '<div>' . $html . '</div>';
}
$this->dom = new Dom($html);
$this->doc = $this->dom->document();
$this->schema = $schema ?? new Plain();
$this->skip = $this->schema->skip();
$this->marks = $this->schema->marks();
$this->inline = [];
// load all allowed nodes from the schema
$this->createNodeRules($this->schema->nodes());
$this->parseNode($this->body());
// start parsing at the top level and go through
// all children of the document
foreach ($this->doc->childNodes as $childNode) {
$this->parseNode($childNode);
}
// needs to be called at last to fetch remaining
// inline elements after parsing has ended
$this->endInlineBlock();
}
/**
* Returns all detected blocks
*
* @return array The content of the `$blocks` property
*/
public function blocks(): array
{
return $this->blocks;
}
public function body()
{
return $this->body = $this->body ?? $this->query($this->doc, '/html/body')[0];
}
public function createNodeRules($nodes)
/**
 * Indexes the given node rules by their tag
 * name and stores them on the instance
 *
 * @param array $nodes List of node rules; each rule needs a `tag` key
 * @return array All registered rules, keyed by tag name
 */
public function createNodeRules(array $nodes): array
{
    foreach ($nodes as $rule) {
        $tag = $rule['tag'];
        $this->nodes[$tag] = $rule;
    }

    return $this->nodes;
}
public function containsBlock($element): bool
/**
* Checks if the given element contains
* any other block level elements
*
* @param \DOMNode $element
* @return bool
*/
public function containsBlock(DOMNode $element): bool
{
if (!$element->childNodes) {
if ($element->hasChildNodes() === false) {
return false;
}
@@ -84,8 +155,22 @@ class Parsley
return false;
}
/**
* Takes all inline elements in the inline cache
* and combines them in a final block. The block
* will either be merged with the previous block
* if the type matches, or will be appended.
*
* The inline cache will be reset afterwards
*
* @return void
*/
public function endInlineBlock()
{
if (empty($this->inline) === true) {
return;
}
$html = [];
foreach ($this->inline as $inline) {
@@ -102,26 +187,30 @@ class Parsley
$this->inline = [];
}
public function fallback($node)
/**
* Creates a fallback block type for the given
* element. The element can either be an element object
* or a simple HTML/plain text string
*
* @param \Kirby\Parsley\Element|string $element
* @return array|null
*/
public function fallback($element): ?array
{
if (is_a($node, 'DOMText') === true) {
$html = $node->textContent;
} elseif (is_a($node, Element::class) === true) {
$html = $node->innerHtml();
} elseif (is_string($node) === true) {
$html = $node;
} else {
$html = '';
}
if ($fallback = $this->schema->fallback($html)) {
if ($fallback = $this->schema->fallback($element)) {
return $fallback;
}
return false;
return null;
}
public function isBlock($element): bool
/**
* Checks if the given DOMNode is a block element
*
* @param DOMNode $element
* @return bool
*/
public function isBlock(DOMNode $element): bool
{
if (is_a($element, 'DOMElement') === false) {
return false;
@@ -130,13 +219,24 @@ class Parsley
return array_key_exists($element->tagName, $this->nodes) === true;
}
public function isInline($element)
/**
* Checks if the given DOMNode is an inline element
*
* @param \DOMNode $element
* @return bool
*/
public function isInline(DOMNode $element): bool
{
if (is_a($element, 'DOMText') === true) {
return true;
}
if (is_a($element, 'DOMElement') === true) {
// all spans will be treated as inline elements
if ($element->tagName === 'span') {
return true;
}
if ($this->containsBlock($element) === true) {
return false;
}
@@ -152,14 +252,18 @@ class Parsley
return false;
}
public function mergeOrAppend($block)
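/**
* Merges the given block with the previous block if
* both are text blocks and appends it otherwise
*
* @param array $block
* @return void
*/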
public function mergeOrAppend(array $block)
{
$lastIndex = count($this->blocks) - 1;
$lastItem = $this->blocks[$lastIndex] ?? null;
// merge with previous block
if ($block['type'] === 'text' && $lastItem && $lastItem['type'] === 'text') {
$this->blocks[$lastIndex]['content']['text'] .= "\n\n" . $block['content']['text'];
$this->blocks[$lastIndex]['content']['text'] .= ' ' . $block['content']['text'];
// append
} else {
@@ -167,13 +271,21 @@ class Parsley
}
}
public function parseNode($element)
/**
* Parses the given DOM node and tries to
* convert it to a block or a list of blocks
*
* @param \DOMNode $element
* @return bool
*/
public function parseNode(DOMNode $element): bool
{
// comments
if (is_a($element, 'DOMComment') === true) {
return true;
}
$skip = ['DOMComment', 'DOMDocumentType'];
// unwanted element types
if (in_array(get_class($element), $skip) === true) {
return false;
}
// inline context
if ($this->isInline($element)) {
@@ -193,13 +305,22 @@ class Parsley
return true;
}
// has only unkown children (div, etc.)
// has only unknown children (div, etc.)
if ($this->containsBlock($element) === false) {
if (in_array($element->tagName, $this->skip) === true) {
return true;
return false;
}
if ($element->tagName !== 'body') {
$wrappers = [
'body',
'head',
'html',
];
// wrapper elements should never be converted
// to a simple fallback block. Their children
// have to be parsed individually.
if (in_array($element->tagName, $wrappers) === false) {
$node = new Element($element, $this->marks);
if ($block = $this->fallback($node)) {
@@ -214,19 +335,19 @@ class Parsley
foreach ($element->childNodes as $childNode) {
$this->parseNode($childNode);
}
return true;
}
public function query($element, $query)
{
return (new DOMXPath($element))->query($query);
}
/**
* Checks if parsing with the XML extension
* is enabled and supported
*
* @return bool
*/
public function useXmlExtension(): bool
{
if (static::$useXmlExtension !== true) {
return false;
}
return class_exists('DOMDocument') === true;
return Dom::isSupported();
}
}

View File

@@ -2,10 +2,61 @@
namespace Kirby\Parsley;
abstract class Schema
/**
* Block schema definition
*
* This base class only provides empty defaults:
* no fallback block, no marks, no nodes and nothing
* to skip. Concrete schemas extend it and override
* the methods they need.
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Schema
{
abstract public function fallback(string $html);
abstract public function marks(): array;
abstract public function nodes(): array;
abstract public function skip(): array;
/**
* Returns the fallback block when no
* other block type can be detected
*
* The base implementation never creates
* a fallback block and always returns null.
*
* @param \Kirby\Parsley\Element|string $element
* @return array|null
*/
public function fallback($element): ?array
{
return null;
}
/**
* Returns a list of allowed inline marks
* and their parsing rules
*
* The base implementation allows no marks.
*
* @return array
*/
public function marks(): array
{
return [];
}
/**
* Returns a list of allowed nodes and
* their parsing rules
*
* The base implementation defines no nodes.
*
* @return array
*/
public function nodes(): array
{
return [];
}
/**
* Returns a list of all elements that should be
* skipped and not be parsed at all
*
* The base implementation skips nothing.
*
* @return array
*/
public function skip(): array
{
return [];
}
}

View File

@@ -5,28 +5,103 @@ namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Element;
use Kirby\Toolkit\Str;
/**
* The blocks schema definition converts the
* document into typed blocks (e.g. text, quote,
* image, video, code or markdown blocks)
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Blocks extends Plain
{
public function fallback(string $html)
/**
* @param \Kirby\Parsley\Element $node
* @return array
*/
public function blockquote(Element $node): array
{
$html = trim($html);
$citation = null;
$text = [];
if (Str::length($html) === 0) {
return false;
// get all the text for the quote
foreach ($node->children() as $child) {
if (is_a($child, 'DOMText') === true) {
$text[] = trim($child->textContent);
}
if (is_a($child, 'DOMElement') === true && $child->tagName !== 'footer') {
$text[] = (new Element($child))->innerHTML($this->marks());
}
}
// filter empty blocks and separate text blocks with breaks
$text = implode('', array_filter($text));
// get the citation from the footer
if ($footer = $node->find('footer')) {
$citation = $footer->innerHTML($this->marks());
}
return [
'content' => [
'text' => '<p>' . $html . '</p>',
'citation' => $citation,
'text' => $text
],
'type' => 'quote',
];
}
/**
 * Builds a text block as fallback when no
 * other block type can be found
 *
 * Element objects are rendered via their inner HTML, strings
 * get trimmed. The HTML is wrapped in a paragraph unless the
 * HTML of an element already contains a `<p>` tag. Empty strings
 * and any other input type result in null.
 *
 * @param \Kirby\Parsley\Element|string $element
 * @return array|null
 */
public function fallback($element): ?array
{
    $isElement = is_a($element, Element::class) === true;

    // neither an element object nor a string
    if ($isElement === false && is_string($element) === false) {
        return null;
    }

    if ($isElement === true) {
        $text = $element->innerHtml();

        // only add the paragraph wrapper if there is none yet
        if (Str::contains($text, '<p>') === false) {
            $text = '<p>' . $text . '</p>';
        }
    } else {
        $text = trim($element);

        if (Str::length($text) === 0) {
            return null;
        }

        $text = '<p>' . $text . '</p>';
    }

    return [
        'content' => [
            'text' => $text,
        ],
        'type' => 'text',
    ];
}
public function heading($node, $level)
/**
* Converts a heading element to a heading block
*
* @param \Kirby\Parsley\Element $node
* @return array
*/
public function heading(Element $node): array
{
$content = [
'level' => $level,
'level' => strtolower($node->tagName()),
'text' => $node->innerHTML()
];
@@ -42,7 +117,91 @@ class Blocks extends Plain
];
}
public function list($node)
/**
 * Converts an iframe element to a video block if its source
 * is a known Vimeo or YouTube embed URL. Any other iframe is
 * kept as raw HTML in a markdown block.
 *
 * @param \Kirby\Parsley\Element $node
 * @return array
 */
public function iframe(Element $node): array
{
    $caption = null;
    $url     = null;

    // a missing src attribute must not reach preg_match() as null
    $src = $node->attr('src') ?? '';

    if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
        $caption = $figcaption->innerHTML($this->marks());

        // avoid parsing the caption twice
        $figcaption->remove();
    }

    // reverse engineer video URLs; the dots of the host
    // names are escaped so that they only match literally
    if (preg_match('!player\.vimeo\.com/video/([0-9]+)!i', $src, $matches) === 1) {
        $url = 'https://vimeo.com/' . $matches[1];
    } elseif (preg_match('!youtube(?:-nocookie)?\.com/embed/([a-zA-Z0-9_-]+)!', $src, $matches) === 1) {
        $url = 'https://youtube.com/watch?v=' . $matches[1];
    }

    // correct video URL
    if ($url !== null) {
        return [
            'content' => [
                'caption' => $caption,
                'url'     => $url
            ],
            'type' => 'video',
        ];
    }

    // unknown sources keep their original HTML
    return [
        'content' => [
            'text' => $node->outerHTML()
        ],
        'type' => 'markdown',
    ];
}
/**
 * Converts an image element to an image block
 *
 * The caption is read from the figcaption of the nearest
 * figure ancestor and the link from an anchor ancestor.
 *
 * @param \Kirby\Parsley\Element $node
 * @return array
 */
public function img(Element $node): array
{
    $caption    = null;
    $figcaption = $node->find('ancestor::figure[1]//figcaption');

    if ($figcaption) {
        $caption = $figcaption->innerHTML($this->marks());

        // the caption must not be parsed a second time
        $figcaption->remove();
    }

    $anchor = $node->find('ancestor::a');
    $link   = $anchor ? $anchor->attr('href') : null;

    return [
        'content' => [
            'alt'      => $node->attr('alt'),
            'caption'  => $caption,
            'link'     => $link,
            'location' => 'web',
            'src'      => $node->attr('src'),
        ],
        'type' => 'image',
    ];
}
/**
* Converts a list element to HTML
*
* @param \Kirby\Parsley\Element $node
* @return string
*/
public function list(Element $node): string
{
$html = [];
@@ -69,12 +228,21 @@ class Blocks extends Plain
return '<' . $node->tagName() . '>' . implode($html) . '</' . $node->tagName() . '>';
}
/**
* Returns a list of allowed inline marks
* and their parsing rules
*
* @return array
*/
public function marks(): array
{
return [
[
'tag' => 'a',
'attrs' => ['href', 'target', 'title'],
'attrs' => ['href', 'rel', 'target', 'title'],
'defaults' => [
'rel' => 'noopener noreferrer'
]
],
[
'tag' => 'abbr',
@@ -82,6 +250,9 @@ class Blocks extends Plain
[
'tag' => 'b'
],
[
'tag' => 'br',
],
[
'tag' => 'code'
],
@@ -94,6 +265,9 @@ class Blocks extends Plain
[
'tag' => 'i',
],
[
'tag' => 'p',
],
[
'tag' => 'strike',
],
@@ -112,153 +286,81 @@ class Blocks extends Plain
];
}
/**
* Returns a list of allowed nodes and
* their parsing rules
*
* @codeCoverageIgnore
* @return array
*/
public function nodes(): array
{
return [
[
'tag' => 'blockquote',
'parse' => function ($node) {
$citation = null;
$text = [];
// get all the text for the quote
foreach ($node->element()->childNodes as $child) {
if (is_a($child, 'DOMText') === true) {
$text[] = trim($child->textContent);
}
if (is_a($child, 'DOMElement') === true && $child->tagName !== 'footer') {
$text[] = (new Element($child))->innerHTML($this->marks());
}
}
// filter empty blocks and separate text blocks with breaks
$text = implode('<br></br>', array_filter($text));
// get the citation from the footer
if ($footer = $node->find('footer')) {
$citation = $footer->innerHTML($this->marks());
}
return [
'content' => [
'citation' => $citation,
'text' => $text
],
'type' => 'quote',
];
'parse' => function (Element $node) {
return $this->blockquote($node);
}
],
[
'tag' => 'h1',
'parse' => function ($node) {
return $this->heading($node, 'h1');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'h2',
'parse' => function ($node) {
return $this->heading($node, 'h2');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'h3',
'parse' => function ($node) {
return $this->heading($node, 'h3');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'h4',
'parse' => function ($node) {
return $this->heading($node, 'h4');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'h5',
'parse' => function ($node) {
return $this->heading($node, 'h5');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'h6',
'parse' => function ($node) {
return $this->heading($node, 'h6');
'parse' => function (Element $node) {
return $this->heading($node);
}
],
[
'tag' => 'hr',
'parse' => function (Element $node) {
return [
'type' => 'line'
];
}
],
[
'tag' => 'iframe',
'parse' => function ($node) {
$caption = null;
$src = $node->attr('src');
if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
$caption = $figcaption->innerHTML($this->marks());
// avoid parsing the caption twice
$figcaption->remove();
}
// reverse engineer video URLs
if (preg_match('!player.vimeo.com\/video\/([0-9]+)!i', $src, $array) === 1) {
$src = 'https://vimeo.com/' . $array[1];
} elseif (preg_match('!youtube.com\/embed\/([a-zA-Z0-9_-]+)!', $src, $array) === 1) {
$src = 'https://youtube.com/watch?v=' . $array[1];
} elseif (preg_match('!youtube-nocookie.com\/embed\/([a-zA-Z0-9_-]+)!', $src, $array) === 1) {
$src = 'https://youtube.com/watch?v=' . $array[1];
} else {
$src = false;
}
// correct video URL
if ($src) {
return [
'content' => [
'caption' => $caption,
'url' => $src
],
'type' => 'video',
];
}
return [
'content' => [
'text' => $node->outerHTML()
],
'type' => 'markdown',
];
'parse' => function (Element $node) {
return $this->iframe($node);
}
],
[
'tag' => 'img',
'parse' => function ($node) {
$caption = null;
$link = null;
if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
$caption = $figcaption->innerHTML($this->marks());
// avoid parsing the caption twice
$figcaption->remove();
}
if ($a = $node->find('ancestor::a')) {
$link = $a->attr('href');
}
return [
'content' => [
'alt' => $node->attr('alt'),
'caption' => $caption,
'link' => $link,
'location' => 'web',
'src' => $node->attr('src'),
],
'type' => 'image',
];
'parse' => function (Element $node) {
return $this->img($node);
}
],
[
'tag' => 'ol',
'parse' => function ($node) {
'parse' => function (Element $node) {
return [
'content' => [
'text' => $this->list($node)
@@ -269,41 +371,19 @@ class Blocks extends Plain
],
[
'tag' => 'pre',
'parse' => function ($node) {
$language = 'text';
if ($code = $node->find('//code')) {
foreach ($code->classList() as $className) {
if (preg_match('!language-(.*?)!', $className)) {
$language = str_replace('language-', '', $className);
break;
}
}
}
return [
'content' => [
'code' => $node->innerText(),
'language' => $language
],
'type' => 'code',
];
'parse' => function (Element $node) {
return $this->pre($node);
}
],
[
'tag' => 'table',
'parse' => function ($node) {
return [
'content' => [
'text' => $node->outerHTML(),
],
'type' => 'markdown',
];
'parse' => function (Element $node) {
return $this->table($node);
}
],
[
'tag' => 'ul',
'parse' => function ($node) {
'parse' => function (Element $node) {
return [
'content' => [
'text' => $this->list($node)
@@ -314,4 +394,44 @@ class Blocks extends Plain
],
];
}
/**
 * Converts a pre element to a code block
 *
 * The language is taken from a `language-*` class name of
 * the nested code element and defaults to `text`.
 *
 * @param \Kirby\Parsley\Element $node
 * @return array
 */
public function pre(Element $node): array
{
    $language = 'text';

    // the leading dot keeps the XPath query relative to this element;
    // `//code` is an absolute path and would always match the first
    // code element of the entire document
    if ($code = $node->find('.//code')) {
        foreach ($code->classList() as $className) {
            if (preg_match('!language-(.*?)!', $className)) {
                $language = str_replace('language-', '', $className);
                break;
            }
        }
    }

    return [
        'content' => [
            'code'     => $node->innerText(),
            'language' => $language
        ],
        'type' => 'code',
    ];
}
/**
 * Keeps a table as raw HTML inside a markdown block
 *
 * @param \Kirby\Parsley\Element $node
 * @return array
 */
public function table(Element $node): array
{
    $html = $node->outerHTML();

    return [
        'content' => [
            'text' => $html,
        ],
        'type' => 'markdown',
    ];
}
}

View File

@@ -2,39 +2,68 @@
namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Element;
use Kirby\Parsley\Schema;
use Kirby\Toolkit\Str;
/**
* The plain schema definition converts
* the entire document into simple text blocks
*
* @since 3.5.0
*
* @package Kirby Parsley
* @author Bastian Allgeier <bastian@getkirby.com>,
* @link https://getkirby.com
* @copyright Bastian Allgeier GmbH
* @license https://getkirby.com/license
*/
class Plain extends Schema
{
public function fallback(string $html)
/**
* Creates the fallback block type
* if no other block can be found
*
* @param \Kirby\Parsley\Element|string $element
* @return array|null
*/
public function fallback($element): ?array
{
$text = trim($html);
if (is_a($element, Element::class) === true) {
$text = $element->innerText();
} elseif (is_string($element) === true) {
$text = trim($element);
if (Str::length($text) === 0) {
return false;
if (Str::length($text) === 0) {
return null;
}
} else {
return null;
}
return [
'type' => 'text',
'content' => [
'text' => $text
]
],
'type' => 'text',
];
}
public function marks(): array
{
return [];
}
public function nodes(): array
{
return [];
}
/**
* Returns a list of all elements that
* should be skipped during parsing
*
* @return array
*/
public function skip(): array
{
return ['meta', 'script', 'style'];
return [
'base',
'link',
'meta',
'script',
'style',
'title'
];
}
}