<?php
namespace DevOwl\SearchEnginePostType\splitter;

use DOMDocument;

/**
 * This class splits content by heading.
 */
class HtmlSplitter extends AbstractSplitter {
    const LEVEL_2 = 'h2';
    const LEVEL_3 = 'h3';
    const CONTENT_LIMIT = 1000;

    /**
     * Split HTML content into chunks along its top-level headings.
     *
     * Only the direct children of the parsed `<body>` are inspected:
     *
     * - an `<h2>` closes the chunk collected so far and opens a new one titled with the heading,
     * - an `<h3>` sets the `subtitle_h3` of the chunk currently being collected,
     * - every other node (text nodes are treated like `<p>`) contributes to the chunk content.
     *
     * In addition, a chunk is closed as soon as the inherited `isContentLargeEnough()` check
     * accepts its collected content for `CONTENT_LIMIT`; the chunk opened afterwards starts
     * without heading context. Chunks whose content is empty after trimming are dropped.
     *
     * The general contract of this method is documented in AbstractSplitter.
     *
     * @param string $content HTML fragment to split
     * @param string $key Array key under which the text of each chunk is stored
     * @return array[] Sequential list of chunks, each shaped as
     *                 `['subtitle_h2' => string, 'subtitle_h3' => string, $key => string]`
     */
    public function split($content, $key = 'post_content') {
        // Collect libxml errors internally instead of raising warnings about unknown tags
        // (https://stackoverflow.com/a/41845049/5506547)
        libxml_clear_errors();
        $previous = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();

            // Load content as UTF-8 content (see https://stackoverflow.com/a/8218649/5506547)
            $dom->loadHTML('<?xml encoding="utf-8" ?>' . $content);

            // libxml creates no <body> for empty / unparsable input, so there is nothing to split
            $body = $dom->getElementsByTagName('body')->item(0);
            if ($body === null) {
                return [];
            }

            $chunks = [];

            // Start with a fully shaped chunk so content preceding the first heading (or a
            // heading as the very first node) never yields a chunk without the `$key` entry.
            $current = [
                'subtitle_h2' => '',
                'subtitle_h3' => '',
                $key => [],
            ];

            foreach ($body->childNodes as $node) {
                // Text and comment nodes expose no tag name; handle them like paragraphs
                $tag = $node->tagName ?? 'p';
                $value = $this->get_node_content($node);

                if ($tag === self::LEVEL_2) {
                    // A new section begins: close the previous chunk (empty ones are dropped below)
                    $chunks[] = $current;
                    $current = [
                        'subtitle_h2' => $value,
                        'subtitle_h3' => '',
                        $key => [],
                    ];
                } elseif ($tag === self::LEVEL_3) {
                    $current['subtitle_h3'] = $value;
                } else {
                    $current[$key][] = $value;
                }

                // Close the chunk early once enough content has been collected
                if (!empty($current[$key]) && $this->isContentLargeEnough($current[$key], self::CONTENT_LIMIT)) {
                    $chunks[] = $current;
                    $current = [
                        'subtitle_h2' => '',
                        'subtitle_h3' => '',
                        $key => [],
                    ];
                }
            }

            // Add the latest, not yet added chunk
            $chunks[] = $current;

            // Join the collected pieces and keep only chunks with actual content; appending to a
            // fresh array keeps the result a gap-free, zero-based list.
            $split = [];

            foreach ($chunks as $chunk) {
                $chunk[$key] = implode("\n\n", $chunk[$key]);

                if (strlen(trim($chunk[$key])) > 0) {
                    $split[] = $chunk;
                }
            }

            return $split;
        } finally {
            // Always restore the global libxml error handling, even on early return or exception
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }
}
