Upgrade to rc5

This commit is contained in:
Bastian Allgeier
2020-12-10 11:24:42 +01:00
parent 3fec0d7c93
commit c378376bc9
257 changed files with 13009 additions and 1846 deletions

100
kirby/src/Parsley/Element.php Executable file
View File

@@ -0,0 +1,100 @@
<?php
namespace Kirby\Parsley;
use DOMElement;
use DOMXpath;
use Kirby\Toolkit\Str;
/**
 * Thin wrapper around a `DOMElement` with convenience accessors
 * (attributes, class names, XPath lookups, inner/outer HTML).
 */
class Element
{
    /**
     * Mark rules passed on to `Inline` when rendering the inner HTML
     *
     * @var array
     */
    protected $marks;

    /**
     * The wrapped DOM node
     *
     * @var \DOMElement
     */
    protected $node;

    /**
     * @param \DOMElement $node Node to wrap
     * @param array $marks Default mark rules for `innerHtml()`
     */
    public function __construct(DOMElement $node, array $marks = [])
    {
        $this->marks = $marks;
        $this->node  = $node;
    }

    /**
     * Returns the value of an attribute or the fallback
     * when the attribute is not set on the node
     *
     * @param string $attr
     * @param mixed $fallback
     * @return mixed
     */
    public function attr(string $attr, $fallback = null)
    {
        // getAttribute() always returns a string for an existing attribute
        if ($this->node->hasAttribute($attr) === true) {
            return $this->node->getAttribute($attr);
        }

        return $fallback;
    }

    /**
     * Returns the raw child node list of the wrapped node
     *
     * @return \DOMNodeList
     */
    public function children()
    {
        return $this->node->childNodes;
    }

    /**
     * Returns the class attribute split on spaces
     * (splitting is delegated to `Str::split`)
     *
     * @return array
     */
    public function classList(): array
    {
        return Str::split($this->className(), ' ');
    }

    /**
     * Returns the raw value of the class attribute
     *
     * @return string
     */
    public function className()
    {
        return $this->node->getAttribute('class');
    }

    /**
     * Returns the wrapped DOM node
     *
     * @return \DOMElement
     */
    public function element()
    {
        return $this->node;
    }

    /**
     * Runs an XPath query relative to this node and wraps
     * every matching element in a new instance
     *
     * @param string $query XPath expression
     * @return array List of Element objects (empty when nothing matches)
     */
    public function filter(string $query)
    {
        $result = [];
        $nodes  = $this->query($query);

        // an invalid expression makes DOMXPath::query() return false
        if ($nodes === false) {
            return $result;
        }

        foreach ($nodes as $node) {
            // text nodes, attributes etc. cannot be wrapped
            if ($node instanceof DOMElement) {
                $result[] = new static($node);
            }
        }

        return $result;
    }

    /**
     * Runs an XPath query relative to this node and returns
     * the first matching element
     *
     * @param string $query XPath expression
     * @return static|false `false` when no element matches
     */
    public function find(string $query)
    {
        $nodes = $this->query($query);

        // an invalid expression makes DOMXPath::query() return false
        if ($nodes === false) {
            return false;
        }

        foreach ($nodes as $node) {
            // skip matches that are not elements (text nodes, attributes)
            if ($node instanceof DOMElement) {
                return new static($node);
            }
        }

        return false;
    }

    /**
     * Renders the children of the node through `Inline`
     *
     * @param array|null $marks Mark rules; the rules passed to the
     *                          constructor are used when omitted
     * @return string
     */
    public function innerHtml(?array $marks = null): string
    {
        return (new Inline($this->node, $marks ?? $this->marks))->innerHtml();
    }

    /**
     * Returns the trimmed text content of the node
     *
     * @return string
     */
    public function innerText()
    {
        return trim($this->node->textContent);
    }

    /**
     * Returns the serialized HTML of the node including its own tag
     *
     * @param array|null $marks Not used by this method
     * @return string
     */
    public function outerHtml(?array $marks = null): string
    {
        return $this->node->ownerDocument->saveHTML($this->node);
    }

    /**
     * Runs an XPath query with this node as context node
     *
     * @param string $query XPath expression
     * @return \DOMNodeList|false
     */
    public function query($query)
    {
        return (new DOMXPath($this->node->ownerDocument))->query($query, $this->node);
    }

    /**
     * Detaches the node from its parent.
     * Does nothing when the node has no parent (anymore).
     *
     * @return void
     */
    public function remove()
    {
        $parent = $this->node->parentNode;

        if ($parent !== null) {
            $parent->removeChild($this->node);
        }
    }

    /**
     * Returns the tag name of the node
     *
     * @return string
     */
    public function tagName(): string
    {
        return $this->node->tagName;
    }
}

74
kirby/src/Parsley/Inline.php Executable file
View File

@@ -0,0 +1,74 @@
<?php
namespace Kirby\Parsley;
/**
 * Renders a DOM node as inline HTML: text is kept, tags registered
 * as marks are re-created, every other tag is reduced to its children.
 */
class Inline
{
    /**
     * The rendered HTML
     *
     * @var string
     */
    protected $html = '';

    /**
     * Mark rules indexed by tag name
     *
     * @var array
     */
    protected $marks = [];

    /**
     * @param \DOMNode $node Node to render
     * @param array $marks List of mark rules (each with a `tag` key)
     */
    public function __construct($node, array $marks = [])
    {
        $this->createMarkRules($marks);
        $this->html = trim($this->parseNode($node));
    }

    /**
     * Indexes the given mark rules by their `tag`
     *
     * @param array $marks
     * @return void
     */
    public function createMarkRules($marks)
    {
        foreach ($marks as $mark) {
            $this->marks[$mark['tag']] = $mark;
        }
    }

    /**
     * Renders all given child nodes and concatenates the results
     *
     * @param iterable|null $children
     * @return string
     */
    public function parseChildren($children): string
    {
        if (!$children) {
            return '';
        }

        $html = '';

        foreach ($children as $child) {
            $html .= $this->parseNode($child);
        }

        return $html;
    }

    /**
     * Renders a single node
     *
     * @param \DOMNode $node
     * @return string
     */
    public function parseNode($node)
    {
        // anything that is not a DOM node has nothing to render
        if (is_a($node, 'DOMNode') === false) {
            return '';
        }

        // `textContent` is already entity-decoded; it has to be escaped
        // again, otherwise `&lt;b&gt;` from the input would end up
        // as real markup in the output
        if (is_a($node, 'DOMText') === true) {
            return htmlspecialchars($node->textContent, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
        }

        // ignore comments
        if (is_a($node, 'DOMComment') === true) {
            return '';
        }

        // known marks (only elements have a tag name)
        if (
            is_a($node, 'DOMElement') === true &&
            array_key_exists($node->tagName, $this->marks) === true
        ) {
            $mark     = $this->marks[$node->tagName];
            $attrs    = [];
            $defaults = $mark['defaults'] ?? [];

            // only attributes allowed by the rule are carried over;
            // missing ones fall back to the rule's default (or null)
            foreach ($mark['attrs'] ?? [] as $attr) {
                if ($node->hasAttribute($attr)) {
                    $attrs[$attr] = $node->getAttribute($attr);
                } else {
                    $attrs[$attr] = $defaults[$attr] ?? null;
                }
            }

            // `attr()` is a helper defined outside of this file
            return '<' . $node->tagName . attr($attrs, ' ') . '>' . $this->parseChildren($node->childNodes) . '</' . $node->tagName . '>';
        }

        // unknown marks: drop the tag, keep the children
        return $this->parseChildren($node->childNodes);
    }

    /**
     * Returns the rendered, trimmed HTML
     *
     * @return string
     */
    public function innerHtml()
    {
        return $this->html;
    }
}

231
kirby/src/Parsley/Parsley.php Executable file
View File

@@ -0,0 +1,231 @@
<?php
namespace Kirby\Parsley;
use DOMDocument;
use DOMXPath;
use Kirby\Parsley\Schema\Plain;
/**
 * Converts an HTML string into a list of block arrays,
 * driven by the rules of a `Schema`.
 */
class Parsley
{
    /**
     * The resulting blocks
     *
     * @var array
     */
    protected $blocks = [];

    /**
     * Cached body element of the parsed document
     *
     * @var \DOMElement|null
     */
    protected $body;

    /**
     * The parsed document
     *
     * @var \DOMDocument|null
     */
    protected $doc;

    /**
     * Inline nodes collected since the last block
     *
     * @var array
     */
    protected $inline = [];

    /**
     * Mark rules of the schema
     *
     * @var array
     */
    protected $marks = [];

    /**
     * Node rules of the schema indexed by tag name
     *
     * @var array
     */
    protected $nodes = [];

    /**
     * @var \Kirby\Parsley\Schema|null
     */
    protected $schema;

    /**
     * Tag names that produce no fallback block
     *
     * @var array
     */
    protected $skip = [];

    /**
     * Set to something other than `true` to skip DOM parsing
     *
     * @var bool
     */
    public static $useXmlExtension = true;

    /**
     * @param string $html HTML to parse
     * @param \Kirby\Parsley\Schema|null $schema Parsing rules;
     *                                           `Plain` is used when omitted
     */
    public function __construct(string $html, ?Schema $schema = null)
    {
        // fail gracefully if the XML extension is not installed
        // or should be skipped
        if ($this->useXmlExtension() === false) {
            $this->blocks[] = [
                'type' => 'markdown',
                'content' => [
                    'text' => $html,
                ]
            ];
            return;
        }

        libxml_use_internal_errors(true);

        $this->doc = new DOMDocument();
        $this->doc->preserveWhiteSpace = false;

        // loadHTML() rejects empty input, so it is only called
        // when there is something to parse
        if (trim($html) !== '') {
            // non-ASCII characters are passed as numeric entities so that
            // the HTML parser does not misinterpret the UTF-8 bytes
            // (replacement for the deprecated `HTML-ENTITIES` encoding)
            $this->doc->loadHTML(mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8'));
        }

        libxml_clear_errors();

        $this->schema = $schema ?? new Plain();
        $this->skip   = $this->schema->skip();
        $this->marks  = $this->schema->marks();
        $this->inline = [];

        $this->createNodeRules($this->schema->nodes());

        // documents without a body have nothing to parse
        if ($body = $this->body()) {
            $this->parseNode($body);
        }

        $this->endInlineBlock();
    }

    /**
     * Returns all blocks created by the parser
     *
     * @return array
     */
    public function blocks(): array
    {
        return $this->blocks;
    }

    /**
     * Returns the (cached) body element of the document
     *
     * @return \DOMElement|null `null` when there is no document or no body
     */
    public function body()
    {
        if ($this->body === null && $this->doc !== null) {
            $result = $this->query($this->doc, '/html/body');

            if ($result !== false) {
                $this->body = $result->item(0);
            }
        }

        return $this->body;
    }

    /**
     * Indexes the given node rules by their `tag`
     *
     * @param array $nodes
     * @return void
     */
    public function createNodeRules($nodes)
    {
        foreach ($nodes as $node) {
            $this->nodes[$node['tag']] = $node;
        }
    }

    /**
     * Checks recursively whether any descendant of the
     * element is a known block node
     *
     * @param \DOMNode $element
     * @return bool
     */
    public function containsBlock($element): bool
    {
        if (!$element->childNodes) {
            return false;
        }

        foreach ($element->childNodes as $childNode) {
            if ($this->isBlock($childNode) === true || $this->containsBlock($childNode)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Renders the collected inline nodes, turns the result into a
     * fallback block (if the schema returns one) and resets the collection
     *
     * @return void
     */
    public function endInlineBlock()
    {
        $html = [];

        foreach ($this->inline as $inline) {
            $node   = new Inline($inline, $this->marks);
            $html[] = $node->innerHtml();
        }

        $innerHTML = implode(' ', $html);

        if ($fallback = $this->fallback($innerHTML)) {
            $this->mergeOrAppend($fallback);
        }

        $this->inline = [];
    }

    /**
     * Creates a fallback block via the schema
     *
     * @param \DOMText|\Kirby\Parsley\Element|string|mixed $node
     *        Any other value is treated as empty HTML
     * @return array|false
     */
    public function fallback($node)
    {
        if (is_a($node, 'DOMText') === true) {
            $html = $node->textContent;
        } elseif (is_a($node, Element::class) === true) {
            $html = $node->innerHtml();
        } elseif (is_string($node) === true) {
            $html = $node;
        } else {
            $html = '';
        }

        if ($fallback = $this->schema->fallback($html)) {
            return $fallback;
        }

        return false;
    }

    /**
     * Checks whether the node is an element with a registered node rule
     *
     * @param mixed $element
     * @return bool
     */
    public function isBlock($element): bool
    {
        if (is_a($element, 'DOMElement') === false) {
            return false;
        }

        return array_key_exists($element->tagName, $this->nodes) === true;
    }

    /**
     * Checks whether the node belongs to inline content: text nodes
     * and mark elements that are no paragraph and contain no block
     *
     * @param mixed $element
     * @return bool
     */
    public function isInline($element)
    {
        if (is_a($element, 'DOMText') === true) {
            return true;
        }

        if (is_a($element, 'DOMElement') === true) {
            if ($this->containsBlock($element) === true) {
                return false;
            }

            if ($element->tagName === 'p') {
                return false;
            }

            $marks = array_column($this->marks, 'tag');
            return in_array($element->tagName, $marks, true);
        }

        return false;
    }

    /**
     * Appends the block, or merges its text into the previous
     * block when both are of type `text`
     *
     * @param array $block
     * @return void
     */
    public function mergeOrAppend($block)
    {
        $lastIndex = count($this->blocks) - 1;
        $lastItem  = $this->blocks[$lastIndex] ?? null;

        // merge with previous block
        if ($block['type'] === 'text' && $lastItem && $lastItem['type'] === 'text') {
            $this->blocks[$lastIndex]['content']['text'] .= "\n\n" . $block['content']['text'];

        // append
        } else {
            $this->blocks[] = $block;
        }
    }

    /**
     * Walks the node and creates blocks for it and its descendants
     *
     * @param \DOMNode $element
     * @return bool|null `true` when the node has been handled completely,
     *                   nothing after descending into the children
     */
    public function parseNode($element)
    {
        // comments and every other node that is neither text nor
        // element (e.g. processing instructions) carry no content
        if (is_a($element, 'DOMText') === false && is_a($element, 'DOMElement') === false) {
            return true;
        }

        // inline context
        if ($this->isInline($element)) {
            $this->inline[] = $element;
            return true;
        }

        // anything else closes the current run of inline nodes;
        // from here on the node is guaranteed to be an element
        $this->endInlineBlock();

        // known block nodes
        if ($this->isBlock($element) === true) {
            if ($parser = ($this->nodes[$element->tagName]['parse'] ?? null)) {
                if ($result = $parser(new Element($element, $this->marks))) {
                    $this->blocks[] = $result;
                }
            }
            return true;
        }

        // has only unknown children (div, etc.)
        if ($this->containsBlock($element) === false) {
            if (in_array($element->tagName, $this->skip, true) === true) {
                return true;
            }

            if ($element->tagName !== 'body') {
                $node = new Element($element, $this->marks);

                if ($block = $this->fallback($node)) {
                    $this->mergeOrAppend($block);
                }

                return true;
            }
        }

        // parse all children
        foreach ($element->childNodes as $childNode) {
            $this->parseNode($childNode);
        }
    }

    /**
     * Runs an XPath query on a document
     *
     * @param \DOMDocument $element
     * @param string $query XPath expression
     * @return \DOMNodeList|false
     */
    public function query($element, $query)
    {
        return (new DOMXPath($element))->query($query);
    }

    /**
     * Checks whether the DOM classes are available and allowed
     *
     * @return bool
     */
    public function useXmlExtension(): bool
    {
        if (static::$useXmlExtension !== true) {
            return false;
        }

        return class_exists('DOMDocument') === true;
    }
}

11
kirby/src/Parsley/Schema.php Executable file
View File

@@ -0,0 +1,11 @@
<?php
namespace Kirby\Parsley;
/**
 * Contract for the rule sets used by `Parsley`.
 * A schema tells the parser which tags are inline marks, which tags
 * are block nodes, which tags are skipped and how leftover HTML
 * is turned into a block.
 */
abstract class Schema
{
/**
 * Converts leftover HTML into a block.
 * `Parsley::fallback()` passes the HTML as string and
 * treats a falsy return value as "no block".
 *
 * @param string $html
 * @return array|false
 */
abstract public function fallback(string $html);
/**
 * Returns the list of inline mark rules.
 * `Inline` indexes the rules by their `tag` key and reads
 * the optional `attrs` and `defaults` keys.
 *
 * @return array
 */
abstract public function marks(): array;
/**
 * Returns the list of block node rules.
 * `Parsley` indexes the rules by their `tag` key and calls the
 * optional `parse` callable with an `Element`.
 *
 * @return array
 */
abstract public function nodes(): array;
/**
 * Returns the tag names for which `Parsley::parseNode()`
 * creates no fallback block.
 *
 * @return array
 */
abstract public function skip(): array;
}

View File

@@ -0,0 +1,317 @@
<?php
namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Element;
use Kirby\Toolkit\Str;
/**
 * Schema that converts HTML into content blocks
 * (text, heading, list, quote, video, image, code, markdown).
 */
class Blocks extends Plain
{
    /**
     * Wraps leftover inline HTML in a paragraph
     * and returns it as `text` block
     *
     * @param string $html
     * @return array|false `false` when the trimmed HTML is empty
     */
    public function fallback(string $html)
    {
        $html = trim($html);

        if (Str::length($html) === 0) {
            return false;
        }

        return [
            'content' => [
                'text' => '<p>' . $html . '</p>',
            ],
            'type' => 'text',
        ];
    }

    /**
     * Creates a `heading` block
     *
     * @param \Kirby\Parsley\Element $node
     * @param string $level Heading level (`h1` to `h6`)
     * @return array
     */
    public function heading($node, $level)
    {
        $content = [
            'level' => $level,
            'text'  => $node->innerHtml()
        ];

        // the id is only part of the content when the element has one
        if ($id = $node->attr('id')) {
            $content['id'] = $id;
        }

        ksort($content);

        return [
            'content' => $content,
            'type'    => 'heading',
        ];
    }

    /**
     * Renders a list element (including nested lists) as HTML string
     *
     * @param \Kirby\Parsley\Element $node The `ul` or `ol` element
     * @return string
     */
    public function list($node)
    {
        $html = [];

        foreach ($node->filter('li') as $li) {
            $innerHtml = '';

            foreach ($li->children() as $child) {
                if (is_a($child, 'DOMText') === true) {
                    // `textContent` is entity-decoded and must be
                    // escaped again before it is used as HTML
                    $innerHtml .= htmlspecialchars($child->textContent, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
                } elseif (is_a($child, 'DOMElement') === true) {
                    $child = new Element($child);

                    if (in_array($child->tagName(), ['ul', 'ol'], true) === true) {
                        $innerHtml .= $this->list($child);
                    } else {
                        $innerHtml .= $child->innerHtml($this->marks());
                    }
                }
            }

            $html[] = '<li>' . trim($innerHtml) . '</li>';
        }

        return '<' . $node->tagName() . '>' . implode('', $html) . '</' . $node->tagName() . '>';
    }

    /**
     * Returns the inline tags that are kept in block content.
     * Only links keep attributes.
     *
     * @return array
     */
    public function marks(): array
    {
        return [
            [
                'tag'   => 'a',
                'attrs' => ['href', 'target', 'title'],
            ],
            ['tag' => 'abbr'],
            ['tag' => 'b'],
            ['tag' => 'code'],
            ['tag' => 'del'],
            ['tag' => 'em'],
            ['tag' => 'i'],
            ['tag' => 'strike'],
            ['tag' => 'sub'],
            ['tag' => 'sup'],
            ['tag' => 'strong'],
            ['tag' => 'u'],
        ];
    }

    /**
     * Returns the block node rules. Each rule has a `tag` and a
     * `parse` callable that receives the matching `Element`
     * and returns the block array.
     *
     * @return array
     */
    public function nodes(): array
    {
        // h1-h6 share the same parser and only differ in the level
        $headings = [];

        foreach (['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] as $level) {
            $headings[] = [
                'tag'   => $level,
                'parse' => function ($node) use ($level) {
                    return $this->heading($node, $level);
                }
            ];
        }

        // ol and ul share the same parser
        $list = function ($node) {
            return [
                'content' => [
                    'text' => $this->list($node)
                ],
                'type' => 'list',
            ];
        };

        return array_merge(
            [
                [
                    'tag'   => 'blockquote',
                    'parse' => function ($node) {
                        $citation = null;
                        $text     = [];

                        // get all the text for the quote
                        foreach ($node->element()->childNodes as $child) {
                            if (is_a($child, 'DOMText') === true) {
                                // escape the entity-decoded text again
                                $text[] = htmlspecialchars(trim($child->textContent), ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
                            }
                            if (is_a($child, 'DOMElement') === true && $child->tagName !== 'footer') {
                                $text[] = (new Element($child))->innerHtml($this->marks());
                            }
                        }

                        // filter empty blocks (but keep a literal "0")
                        // and separate text blocks with breaks
                        $text = array_filter($text, function ($part) {
                            return $part !== '';
                        });
                        $text = implode('<br></br>', $text);

                        // get the citation from the footer
                        if ($footer = $node->find('footer')) {
                            $citation = $footer->innerHtml($this->marks());
                        }

                        return [
                            'content' => [
                                'citation' => $citation,
                                'text'     => $text
                            ],
                            'type' => 'quote',
                        ];
                    }
                ],
            ],
            $headings,
            [
                [
                    'tag'   => 'iframe',
                    'parse' => function ($node) {
                        $caption = null;
                        // an iframe without src is matched as empty string
                        $src = $node->attr('src', '');

                        if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
                            $caption = $figcaption->innerHtml($this->marks());

                            // avoid parsing the caption twice
                            $figcaption->remove();
                        }

                        // reverse engineer video URLs
                        if (preg_match('!player.vimeo.com\/video\/([0-9]+)!i', $src, $array) === 1) {
                            $src = 'https://vimeo.com/' . $array[1];
                        } elseif (preg_match('!youtube.com\/embed\/([a-zA-Z0-9_-]+)!', $src, $array) === 1) {
                            $src = 'https://youtube.com/watch?v=' . $array[1];
                        } elseif (preg_match('!youtube-nocookie.com\/embed\/([a-zA-Z0-9_-]+)!', $src, $array) === 1) {
                            $src = 'https://youtube.com/watch?v=' . $array[1];
                        } else {
                            $src = false;
                        }

                        // correct video URL
                        if ($src) {
                            return [
                                'content' => [
                                    'caption' => $caption,
                                    'url'     => $src
                                ],
                                'type' => 'video',
                            ];
                        }

                        // unknown embeds are kept as raw HTML
                        return [
                            'content' => [
                                'text' => $node->outerHtml()
                            ],
                            'type' => 'markdown',
                        ];
                    }
                ],
                [
                    'tag'   => 'img',
                    'parse' => function ($node) {
                        $caption = null;
                        $link    = null;

                        if ($figcaption = $node->find('ancestor::figure[1]//figcaption')) {
                            $caption = $figcaption->innerHtml($this->marks());

                            // avoid parsing the caption twice
                            $figcaption->remove();
                        }

                        // images wrapped in a link keep the link target
                        if ($a = $node->find('ancestor::a')) {
                            $link = $a->attr('href');
                        }

                        return [
                            'content' => [
                                'alt'      => $node->attr('alt'),
                                'caption'  => $caption,
                                'link'     => $link,
                                'location' => 'web',
                                'src'      => $node->attr('src'),
                            ],
                            'type' => 'image',
                        ];
                    }
                ],
                [
                    'tag'   => 'ol',
                    'parse' => $list
                ],
                [
                    'tag'   => 'pre',
                    'parse' => function ($node) {
                        $language = 'text';

                        // `.//code` stays inside this <pre>; a leading `//`
                        // would search the whole document instead
                        if ($code = $node->find('.//code')) {
                            foreach ($code->classList() as $className) {
                                if (preg_match('!language-(.*?)!', $className)) {
                                    $language = str_replace('language-', '', $className);
                                    break;
                                }
                            }
                        }

                        return [
                            'content' => [
                                'code'     => $node->innerText(),
                                'language' => $language
                            ],
                            'type' => 'code',
                        ];
                    }
                ],
                [
                    'tag'   => 'table',
                    'parse' => function ($node) {
                        // tables are kept as raw HTML
                        return [
                            'content' => [
                                'text' => $node->outerHtml(),
                            ],
                            'type' => 'markdown',
                        ];
                    }
                ],
                [
                    'tag'   => 'ul',
                    'parse' => $list
                ],
            ]
        );
    }
}

View File

@@ -0,0 +1,40 @@
<?php
namespace Kirby\Parsley\Schema;
use Kirby\Parsley\Schema;
use Kirby\Toolkit\Str;
/**
 * Minimal schema: no marks, no block nodes.
 * All content ends up in plain `text` blocks.
 */
class Plain extends Schema
{
    /**
     * Turns leftover HTML into a `text` block
     *
     * @param string $html
     * @return array|false `false` when nothing but whitespace is left
     */
    public function fallback(string $html)
    {
        $trimmed = trim($html);
        $isEmpty = Str::length($trimmed) === 0;

        return $isEmpty ? false : [
            'type' => 'text',
            'content' => [
                'text' => $trimmed
            ]
        ];
    }

    /**
     * This schema keeps no inline marks
     *
     * @return array
     */
    public function marks(): array
    {
        $noMarks = [];

        return $noMarks;
    }

    /**
     * This schema knows no block nodes
     *
     * @return array
     */
    public function nodes(): array
    {
        $noNodes = [];

        return $noNodes;
    }

    /**
     * Tags that never produce a block
     *
     * @return array
     */
    public function skip(): array
    {
        $ignoredTags = ['meta', 'script', 'style'];

        return $ignoredTags;
    }
}