import { parse } from "./parse.js";
import { mapUrlToRootUrl } from "./utils/mapUrlToRootUrl.js";

/**
 * Placeholder base URL handed to `new URL(value, base)` so that relative references can be parsed
 * instead of throwing, e.g. a sitemap that is configured as `/sitemap.xml` in `robots.txt`.
 * Absolute URLs ignore the base, so it only takes effect for relative values.
 */
const PSEUDO_URL_BASE = "https://base";

/**
 * Recursively collect the page URLs referenced by a sitemap.
 *
 * A `<sitemapindex>` document is followed into each of its child sitemaps (one after another), a
 * `<urlset>` document contributes the text of its `<loc>` entries. Errors while loading or reading a
 * sitemap are logged to the console and are not thrown to the caller.
 *
 * @param rootUrl Passed to `mapUrlToRootUrl` together with each child sitemap URL found in an index.
 * @param sitemapUrl URL of the sitemap to load; may be relative (e.g. `/sitemap.xml`).
 * @param sites Accumulator for found URLs. Omit it for the initial call; when passed, found URLs are
 *        appended to this array.
 * @param visited Sitemap URLs already crawled during this run. Used by the recursion so that a sitemap
 *        index which (directly or indirectly) references itself does not recurse forever.
 * @returns The found URLs without duplicates. For the initial call (no `sites`) they are additionally
 *          sorted by string length, shortest first.
 */
async function crawl(
    rootUrl: string,
    sitemapUrl: string,
    sites?: string[],
    visited: Set<string> = new Set<string>(),
): Promise<string[]> {
    if (!sites) {
        // Initial crawl: start with a fresh accumulator and sort the final result
        try {
            const found = await crawl(rootUrl, sitemapUrl, [], visited);
            return found.sort((a, b) => a.length - b.length);
        } catch (e) {
            // Atm we are ignoring errors
            console.error(e);
            return [];
        }
    }

    // A sitemap index may list itself or one of its ancestors; without this guard the recursion
    // (and the network requests it triggers) would never terminate.
    if (visited.has(sitemapUrl)) {
        return [...new Set(sites)];
    }
    visited.add(sitemapUrl);

    // Type guard dropping `null`, `undefined` and empty strings so the remaining values are real strings
    const hasText = (value: string | null | undefined): value is string => Boolean(value);

    try {
        const document = await parse(sitemapUrl);
        const { protocol: sitemapProtocol } = new URL(sitemapUrl, PSEUDO_URL_BASE);

        // Check if sitemap is an index of multiple sitemaps
        const sitemapIndex = document.querySelector("sitemapindex");
        if (sitemapIndex) {
            const indexUrls = Array.from(sitemapIndex.children)
                // Pretty-printed sitemaps often wrap the URL in whitespace / line breaks
                .map((c) => c.querySelector("loc")?.textContent?.trim())
                // In some cases, the `DOMParser` can return `parsererror` children, we need to skip
                // them as the other `sitemap` children are still correctly parsed
                .filter(hasText);

            // Iterate sequentially instead of concurrent as we are in a browser and we should avoid parallel requests here (CloudFlare Bots etc.)
            for (const indexUrl of indexUrls) {
                const useUrl = mapUrlToRootUrl(rootUrl, indexUrl) || indexUrl;
                await crawl(rootUrl, useUrl, sites, visited);
            }
        }

        // Check for URL set (we finally found an URL!)
        const urlSet = document.querySelector("urlset");
        if (urlSet) {
            const urls = Array.from(urlSet.children)
                .map((c) => c.querySelector("loc")?.textContent?.trim())
                .filter(hasText)
                .map((url) => {
                    try {
                        const urlInstance = new URL(url, PSEUDO_URL_BASE);
                        // Upgrade plain-http entries to the protocol the sitemap itself was requested with
                        if (urlInstance.protocol === "http:") {
                            urlInstance.protocol = sitemapProtocol;
                        }
                        return urlInstance.toString();
                    } catch (e) {
                        // Not parseable as URL: keep the raw value instead of dropping the entry
                        return url;
                    }
                });

            // Not needed atm cause WordPress and plugins force the protocol correctly here
            //.map((url) => mapUrlToRootUrl(rootUrl, url) || url);

            // Push one by one: spreading a very large array into `push(...)` can exceed the engine's
            // argument limit.
            for (const url of urls) {
                sites.push(url);
            }
        }
    } catch (e) {
        console.error(`Error occurred during "crawl('${sitemapUrl}')":\n\r Error: ${e}`);
    }

    return [...new Set(sites)];
}

/**
 * Crawl several sitemaps and merge everything they yield into one duplicate-free list.
 *
 * One `crawl` call is started per entry of `sitemapUrls` and all of them are awaited together via
 * `Promise.all`; the individual results are concatenated in the order of `sitemapUrls` before
 * duplicates are removed.
 *
 * @param rootUrl Forwarded unchanged to every `crawl` call.
 * @param sitemapUrls The sitemaps to crawl.
 * @returns All found URLs, each contained once.
 */
async function crawlMultiple(rootUrl: string, sitemapUrls: string[]) {
    const pendingCrawls = sitemapUrls.map((sitemapUrl) => crawl(rootUrl, sitemapUrl));
    const resultsPerSitemap = await Promise.all(pendingCrawls);
    const uniqueUrls = new Set(resultsPerSitemap.flat());
    return Array.from(uniqueUrls);
}

export { crawl, crawlMultiple };
