import { mapUrlToRootUrl } from "./utils/mapUrlToRootUrl.js";

/**
 * File name that is appended to the given base URL to request the `robots.txt` file.
 */
const ROBOTS_TXT = "robots.txt";

/**
 * Name of the query argument used to pass the sitemap filter along with each sitemap request.
 * The server is expected to answer with a boolean flag (see `X-Sitemap-Crawler-Filter` response
 * header) telling whether the requested sitemap matches the current blog ID.
 *
 * This is currently only enabled for logged-in users and useful in a multisite scenario
 * with path-based subsites. Example `robots.txt`:
 *
 * ```
 * User-agent: *
 * Allow: /
 * Sitemap: https://example.com/de/wp-sitemap.xml
 * Sitemap: https://example.com/en/wp-sitemap.xml
 * ```
 *
 * When we start the scan process on the `/de` subsite, we are not allowed to access the
 * `https://example.com/en/wp-sitemap.xml` URL. The response header helps us to identify the
 * sitemaps which belong to the correct blog ID in this case.
 */
const QUERY_ARG_SITEMAP_FILTER = "sitemap-crawler-filter";

/**
 * Name of the response header which is read from each sitemap response when a filter is
 * active. A sitemap answering with the header value `false` is skipped.
 */
const HEADER_SITEMAP_FILTER = "X-Sitemap-Crawler-Filter";

/**
 * Look for a valid `robots.txt` and parse its `Sitemap` URLs. Multiple sitemaps are supported.
 *
 * Every sitemap URL found is requested and only kept when the response is successful, the body
 * looks like a sitemap (contains `<sitemap` or `<urlset`) and - if a filter is given - the server
 * did not answer with the response header `X-Sitemap-Crawler-Filter: false`.
 *
 * @see https://regex101.com/r/Trn97e/1
 * @see https://stackoverflow.com/a/3184966/5506547
 * @param url Base URL of the site; `robots.txt` is appended as-is, so it is expected to end with `/`
 * @param forceSitemapArgument Append this query argument to the `robots.txt` in case you can server-side activate the sitemap
 * @param filterSitemap Filter the sitemap. It passes the filter as `sitemap-crawler-filter` query argument and expects a `true` or `false` response header `X-Sitemap-Crawler-Filter`.
 * @returns The unique, verified sitemap URLs in order of appearance, or `false` when `robots.txt`
 * could not be fetched, no sitemap passed the checks or any request threw
 */
async function findByRobots(
    url: string,
    forceSitemapArgument?: string,
    filterSitemap?: string,
): Promise<string[] | false> {
    try {
        const query: string[] = [];
        if (forceSitemapArgument) {
            query.push(`${forceSitemapArgument}=1`);
        }

        // An array is always truthy, so check its length; otherwise a dangling `?` is always sent
        const robotsQuery = query.length ? `?${query.join("&")}` : "";
        const response = await fetch(`${url}${ROBOTS_TXT}${robotsQuery}`);

        if (!response.ok) {
            return false;
        }

        // The filter argument is added after the `robots.txt` request as it is only sent with the sitemap requests
        if (filterSitemap) {
            query.push(`${QUERY_ARG_SITEMAP_FILTER}=${filterSitemap}`);
        }
        const sitemapQuery = query.join("&");

        const content = await response.text();

        const sitemapUrls = [...content.matchAll(/^sitemap:(.*)$/gim)].map(([, value]) => value.trim());

        // No `g` flag here: a global regex is stateful (`lastIndex`) when used with `test()`
        const sitemapMarkup = /<(?:sitemap|urlset)/;

        // A `Set` keeps the insertion order and avoids requesting an already accepted sitemap twice
        const result = new Set<string>();
        for (const sitemapUrl of sitemapUrls) {
            const correctedSitemapUrl = mapUrlToRootUrl(url, sitemapUrl, sitemapQuery);
            if (!correctedSitemapUrl || result.has(correctedSitemapUrl)) {
                continue;
            }

            // Check if sitemap really exists and isn't stale (e.g. deactivate RankMath SEO WP plugin does not recreate robots.txt)
            const responseCheckSitemap = await fetch(correctedSitemapUrl);
            if (!responseCheckSitemap.ok) {
                continue;
            }

            // Scenario: Sitemap does not exist but a plugin redirects directly to the homepage => no error code
            const body = await responseCheckSitemap.text();
            if (!sitemapMarkup.test(body)) {
                continue;
            }

            // Only an explicit `false` rejects the sitemap; a missing header keeps it
            if (filterSitemap && responseCheckSitemap.headers.get(HEADER_SITEMAP_FILTER) === "false") {
                continue;
            }

            result.add(correctedSitemapUrl);
        }

        return result.size ? [...result] : false;
    } catch {
        // Best effort: network or parsing failures are reported as "no sitemap found"
        return false;
    }
}

export { findByRobots };
