<?php
namespace DevOwl\SearchEnginePostType\splitter;

use DevOwl\SearchEnginePostType\UtilsProvider;
use DOMElement;

/**
 * Algolia has a limit of 10 KB per record for performance and relevance reasons.
 * Instead of indexing large pieces of text into a single record, you can split them into
 * multiple records and use the distinct feature to deduplicate results at query time.
 *
 * @see https://www.algolia.com/doc/integration/wordpress/advanced/splitting-large-records/?client=php
 */
abstract class AbstractSplitter {
    use UtilsProvider;

    /**
     * Splits the given post content into multiple sections.
     *
     * Concrete splitters decide how the content is cut and what shape the
     * resulting sections have.
     *
     * @param string $content
     */
    abstract public function split($content);

    /**
     * Get the text content of a given DOM element.
     *
     * Lists (`<ul>` / `<ol>`) are rendered as one ` - ` prefixed line per child,
     * every other node is returned as its plain `textContent`.
     *
     * @param DOMElement $node
     * @return string Text of the node; an empty string for a list without any non-blank child
     */
    public function get_node_content($node) {
        // Nodes without a tag name (e.g. text nodes) are treated like a paragraph
        $tagName = isset($node->tagName) ? $node->tagName : 'p';

        if (!in_array($tagName, ['ul', 'ol'], true)) {
            return $node->textContent;
        }

        $items = [];
        foreach ($node->childNodes as $child) {
            $value = (string) $child->nodeValue;

            // Formatted HTML puts whitespace-only text nodes between the `<li>` elements;
            // they would otherwise show up as empty bullet points.
            if (trim($value) === '') {
                continue;
            }

            $items[] = $value;
        }

        if (count($items) === 0) {
            return '';
        }

        return ' - ' . implode("\n - ", $items);
    }

    /**
     * Check if the given content is longer than our content limit.
     *
     * An array of strings is joined with single spaces before it is measured.
     * The length is counted in UTF-8 characters, not bytes.
     *
     * @param string|string[] $content
     * @param int $contentLimit Length of string
     * @return boolean `true` when the content has more characters than `$contentLimit`
     */
    public function isContentLargeEnough($content, $contentLimit) {
        if (is_array($content)) {
            $content = implode(' ', $content);
        }

        return mb_strlen($content, 'UTF-8') > $contentLimit;
    }

    /**
     * Get the sanitized content.
     *
     * Runs the content through the `the_content` filter and afterwards strips
     * every HTML tag which is not part of the allow-list below.
     *
     * @param string $content
     * @return string
     */
    public static function get_sanitized_content($content) {
        // Compatibility with WP Bakery shortcodes.
        // `WPBMap` lives in the global namespace, so it must be referenced fully qualified;
        // an unqualified name would be resolved relative to this file's namespace.
        if (class_exists('WPBMap') && method_exists('WPBMap', 'addAllMappedShortcodes')) {
            \WPBMap::addAllMappedShortcodes();
        }

        $the_content = apply_filters('the_content', $content);

        // Remove all tags except "content" tags. Tags which are commented out are stripped on purpose.
        // See https://www.w3schools.com/tags/default.asp and execute `JSON.stringify(Array.from(temp1.querySelectorAll("tr > td:first-child")).map((e) => e.innerText))`
        $the_content = strip_tags(
            $the_content,
            implode('', [
                // '<!--...-->',
                // '<!DOCTYPE>',
                '<a>',
                '<abbr>',
                '<acronym>',
                '<address>',
                '<applet>',
                '<area>',
                // '<article>',
                // '<aside>',
                '<audio>',
                '<b>',
                // '<base>',
                // '<basefont>',
                '<bdi>',
                '<bdo>',
                '<big>',
                '<blockquote>',
                // '<body>',
                '<br>',
                '<button>',
                // '<canvas>',
                '<caption>',
                '<center>',
                '<cite>',
                '<code>',
                // '<col>',
                // '<colgroup>',
                '<data>',
                '<datalist>',
                '<dd>',
                '<del>',
                '<details>',
                '<dfn>',
                '<dialog>',
                '<dir>',
                // '<div>',
                '<dl>',
                '<dt>',
                '<em>',
                // '<embed>',
                '<fieldset>',
                '<figcaption>',
                '<figure>',
                '<font>',
                // '<footer>',
                '<form>',
                '<frame>',
                '<frameset>',
                '<h1>',
                '<h2>',
                '<h3>',
                '<h4>',
                '<h5>',
                '<h6>',
                // '<head>',
                // '<header>',
                '<hr>',
                // '<html>',
                '<i>',
                // '<iframe>',
                '<img>',
                '<input>',
                '<ins>',
                '<kbd>',
                '<label>',
                '<legend>',
                '<li>',
                '<link>',
                // '<main>',
                // '<map>',
                '<mark>',
                // '<meta>',
                '<meter>',
                '<nav>',
                '<noframes>',
                // '<noscript>',
                '<object>',
                '<ol>',
                '<optgroup>',
                '<option>',
                '<output>',
                '<p>',
                '<param>',
                '<picture>',
                '<pre>',
                '<progress>',
                '<q>',
                '<rp>',
                '<rt>',
                '<ruby>',
                '<s>',
                '<samp>',
                // '<script>',
                // '<section>',
                '<select>',
                '<small>',
                '<source>',
                '<span>',
                '<strike>',
                '<strong>',
                '<style>',
                '<sub>',
                '<summary>',
                '<sup>',
                '<svg>',
                // '<table>',
                // '<tbody>',
                // '<td>',
                // '<template>',
                '<textarea>',
                // '<tfoot>',
                // '<th>',
                '<thead>',
                '<time>',
                '<title>',
                '<tr>',
                '<track>',
                '<tt>',
                '<u>',
                '<ul>',
                '<var>',
                '<video>',
                '<wbr>',
            ])
        );

        return $the_content;
    }
}
