#
# LinkCheck - Scriptol Library
# https://www.scriptol.com/
# Licence: LGPL
# Check an HTML page for broken links
#
# (c) 2008-2025 by Denis Sureau

include "path.sol"
include "utils.js"

// Verbosity switch declared in another file; enables the progress messages below.
extern bool VERBOSE
bool DEBUG = false     // even more verbose

// Location settings. linkchecker() replaces `root` by `website` in a page's
// directory to turn a local path into the URL of that page.
text website = ""   // the website base URL (protocol, domain, tld)
text domain = ""    // website without protocol
text source = ""	// local directory at start
// Unix-style form of `source`; assigned by differedCheck() before the scan starts.
text root

array broken = ["Links report:"]   // list of pages and broken links
// Not used in the visible part of this file - presumably filled by the caller.
array pagesToCheck = []
// Counters; not updated in the visible part of this file
// (presumably maintained by checkLink() and the report code - TODO confirm).
int BROCOUNT = 0
int CHECKS = 0
int CHECKOK = 0
int CHECKBAD = 0

// File extensions (with the leading dot) that isHTML() accepts as web pages.
array extensions = [".html", ".php", ".htm", ".php5", ".asp",
    ".shtml", ".dhtml", ".jsp", ".xhtml", ".stm"]
    
// Maps a file name to a list of text lines; printed by dispLinksChecks().
dict PAGES = {}    

// Tell whether a URL starts with one of the protocols handled by the checker.
// Leading whitespace is dropped and the comparison ignores case.
// Returns true for http://, https:// and ftp://, false for anything else.
boolean hasProtocol(text theurl)
    array prefixes = ["http://", "https://", "ftp://"]
    text candidate = theurl.ltrim().lower()
    for text prefix in prefixes
        // compare the first characters of the URL with the prefix
        int size = prefix.length()
        if candidate[0 -- size] = prefix return true
    /for
return false

// Tell whether a file name carries one of the web page extensions
// listed in `extensions`.
// The extension is lower-cased before the lookup, so that names such as
// "INDEX.HTML" or "page.Htm" are recognized too (the list is all lower case).
boolean isHTML(text name)
    text ext = Path.getExtension(name).lower()
    if ext in extensions return true
return false


// Print the content of PAGES: each key, underlined with a row of dashes
// of the same length, followed by every line stored for that key.
void dispLinksChecks()
    for text page, array entries in PAGES
        int width = page.length()
        print page
        print "-".dup(width)
        for text entry in entries
            print entry
        /for
    /for
return


// List of the URLs to check, filled by linkchecker().
// Each entry is a two-item array: [absolute URL, name of the file holding the link].

array links = []

// Scan one local page and queue its outgoing links in `links`.
//
// filename: path of the local file; ignored unless isHTML() accepts it.
//
// Each href returned by retrieveLinks() is converted to an absolute URL:
//  - "#..." anchors and "//..." links are skipped,
//  - "/..." links are merged with `website`,
//  - the fragment part ("#...") is removed,
//  - javascript: and mailto: links are skipped (any letter case),
//  - other links without protocol are merged with the URL of the page directory.
// Only URLs of 8 characters or more that start with "http" are queued,
// as [url, filename] pairs.
void linkchecker(text filename)

    text hostroot
    text content

    array dnl
    text link

    if not isHTML(filename) return
    filename = convertUnix(filename)
    if VERBOSE print "Scanning $filename"

    // Directory of the page, with the local root replaced by the site URL
    hostroot, link = Path.splitFile(filename)
    hostroot = hostroot.replace(root, website)

    content= fileToText(filename, "utf-8")
    if content = nil
        if VERBOSE print "$filename empty."
        return
    /if

    dnl = retrieveLinks(content)
    if VERBOSE print dnl.size(), " links in ", filename
    if dnl.size() = 0 return

    for text href in dnl
        if href[0] = "#"  continue                // anchor inside the same page
        if href[0] = "/"
            if href[1] = "/" continue             // ad
            href = Path.merge(website, href)      // link relative to the site root
        /if

        // Remove the fragment. The position must be strictly positive: a
        // "not found" result must never be used as the end of the range.
        int p = href.find("#")
        if p > 0
            href = href[0 -- p]
        /if

        if not hasProtocol(href)
            // Pseudo-protocols are not checkable. The test ignores case
            // ("JavaScript:") and also covers a link made of the scheme alone.
            if href.length() >= 11
                if href[ .. 10].lower() = "javascript:"
                    if DEBUG print "Skipped javascript."
                    continue
                /if
            /if
            if href.length() >= 7
                if href[ .. 6].lower() = "mailto:"
                    if DEBUG print "Skipped mailto."
                    continue
                /if
            /if
            href = Path.merge(hostroot, href)   // build the URL
        /if

        // Keep only http/https URLs long enough to hold a host name
        if href.length() < 8 continue
        if href[ .. 3].lower() != "http" continue
        links.push([href, filename])
    /for

return

// Collect the links of a set of pages, then check them one after the other.
//
// FilesArray: names of the local files to scan with linkchecker().
//
// The `links` list is reset, `root` is set from `source`, every file is
// scanned, then checkLink() is awaited for each [url, filename] entry.
// A link whose check throws is reported and the loop goes on.
// beforeExit() is called once all links are processed.
// NOTE(review): beforeExit() is not called when no link was found - confirm
// this is intended.
async void differedCheck(array FilesArray)
    links=[]

    int n = FilesArray.size()
    text s = plural(n)
    print "Found $n page$s..."
    root = convertUnix(source)
    for text t in FilesArray
        linkchecker(t)
    /for

    int size = links.size()
    s = plural(size)
    // Interpolated like the message above, so the number and the word
    // are always separated by a space.
    print "$size link$s to check."
    if size = 0 return
    for array entry in links
        try
        @await checkLink(entry[0], entry[1])
        catch(var e)
            print `${entry[0]}, failed to access`
        /try
    /for
    beforeExit()
return    
