##############################################################################
##############################################################################
#                               HtmlParser.tcl
##############################################################################
##############################################################################
# In this file are implemented the procedures used to parse Html file links.
##############################################################################
##############################################################################
# Copyright 2000 Andrs Garca Garca  -- andresgarci@retemail.es
# Distributed under the terms of the GPL license
##############################################################################
##############################################################################
namespace eval HtmlParser {

###############################################################################
# SetEnttities
#    Initializes the arrays with the translation for Html entities, something
#    like 'entity(lt)==>'
###############################################################################
proc SetEntities {} {
    variable entities

        set entities(quot)      \"
        set entities(amp)       \\&
        set entities(lt)        <
        set entities(gt)        >
        set entities(nbsp)      {}
        set entities(iexcl)     
        set entities(cent)      
        set entities(pound)     
        set entities(curren)    
        set entities(yen)       
        set entities(brvbar)    \|
        set entities(sect)      
        set entities(uml)       
        set entities(copy)      
        set entities(ordf)      
        set entities(laquo)     
        set entities(not)       
        set entities(shy)       
        set entities(reg)       
        set entities(macr)      
        set entities(deg)       
        set entities(plusmn)    
        set entities(sup2)      
        set entities(sup3)      
        set entities(acute)     
        set entities(micro)     
        set entities(para)      
        set entities(middot)    
        set entities(cedil)     
        set entities(sup1)      
        set entities(ordm)      
        set entities(raquo)     
        set entities(frac14)    
        set entities(frac12)    

        set entities(frac34)    
        set entities(iquest)    
        set entities(ntilde)    
        set entities(Agrave)    
        set entities(Aacute)    
        set entities(Acirc)     
        set entities(Atilde)    
        set entities(Auml)      
        set entities(Aring)     
        set entities(AElig)     
        set entities(Ccedil)    
        set entities(Egrave)    
        set entities(Eacute)    
        set entities(Ecirc)     
        set entities(Euml)      
        set entities(Igrave)    
        set entities(Iacute)    
        set entities(Icirc)     
        set entities(Iuml)      
        set entities(ETH)       
        set entities(Ntilde)    
        set entities(Ograve)    
        set entities(Oacute)    
        set entities(Ocirc)     
        set entities(Otilde)    
        set entities(Ouml)      
        set entities(times)     
        set entities(Oslash)    
        set entities(Ugrave)    
        set entities(Uacute)    
        set entities(Ucirc)     
        set entities(Uuml)      
        set entities(Yacute)    
        set entities(THORN)     
        set entities(szlig)     
        set entities(agrave)    
        set entities(aacute)    
        set entities(acirc)     
        set entities(atilde)    
        set entities(auml)      
        set entities(aring)     
        set entities(aelig)     
        set entities(ccedil)    
        set entities(egrave)    
        set entities(eacute)    
        set entities(ecirc)     
        set entities(euml)      
        set entities(igrave)    
        set entities(iacute)    
        set entities(icirc)     
        set entities(iuml)      
        set entities(eth)       
        set entities(ntilde)    
        set entities(ograve)    
        set entities(oacute)    
        set entities(ocirc)     
        set entities(otilde)    
        set entities(ouml)      
        set entities(divide)    
        set entities(oslash)    
        set entities(ugrave)    
        set entities(uacute)    
        set entities(ucirc)     
        set entities(uuml)      
        set entities(yacute)    
        set entities(thorn)     
        set entities(yuml)      

    return
}

###############################################################################
# ShowLinks
#    Show the links found in the last preprocessed page, it's only good for
#    debugging.
###############################################################################
proc ShowLinks {} {
    variable nLinks
    variable linkFile
    variable linkDescription
    variable linkType

    TidyDescription
    for {set i 1} {$i<$nLinks} {incr i} {
        if [info exists linkType($i)] {
            puts "$i: $linkFile($i) -$linkDescription($i)- \
                    -$linkType($i)"
        } else {
            puts "$i: $linkFile($i) -$linkDescription($i)-"
        }
    }
    return
}

###############################################################################
# ParseUrl
#    Given Url 'ParseUrl' will split it in its part: protocol, domain,
#    directory and filename
#
# Parameter
#    The url to be parsed,
#
# Returns
#    A list que the url split as mentioned above or '1' if the url couldn't be
#    parsed.
###############################################################################
proc ParseUrl {url} {

    if {[regexp -nocase \
            {(([^:]*)(?:://))?([^/]+)(((?:~[^/]*)?(?:[^\?]*))(?:/)(.*))?} \
            $url nada nada protocol domain nada dir fileName]} {
        return [list $protocol $domain $dir $fileName]
    }

    return 1
}


###############################################################################
# TidyDir
#    Takes things like ".." and "." from the absolute path.
#
# Parameter:
#    File path.
#
# Returns:
#    The tidied file path.
###############################################################################
proc TidyDir {path} {
    if {[regexp {\.$} $path]} {
        append path /
    }
    for {set a 1 ; set b 1} {($a>0)||($b>0)} {} {
        set a [regsub -all {/\./} $path {/} path]
        set b [regsub -all {([^./]+/\.\./)} $path {} path]
    }
    for {} {[regsub {^/\.\.} $path {} path]} {} {}

    return $path
}

###############################################################################
# TidyDescription
#    Translates it for human eyes the description of the links.
#
# Side effects:
#    The descriptions get translated.
###############################################################################
proc TidyDescription {} {
    global labelDialogs
    variable entities
    variable nLinks
    variable linkFile
    variable linkDescription

    if {![info exists entities]} {
        SetEntities
    }

    for {set i 1} {$i<$nLinks} {incr i} {
        if {[regexp {^[\ \t]*$} $linkDescription($i)]} {
            set linkDescription($i) $linkFile($i)
            continue
        }
        set linkDescription($i) [string trimleft $linkDescription($i)]
        if [regexp -nocase {^(<img)} $linkDescription($i)] {
            if {![regexp {(?:>)(.+)} $linkDescription($i) nada descriptionText]} {
                if {![regexp -nocase {(?:ALT *= *\")([^\"]+)}  \
                        $linkDescription($i) nada descrip]} {
                    regexp -nocase {(?:src *= *\"?)([^\"\ ]+)} \
                        $linkDescription($i) nada descrip
                }
                set linkDescription($i) "$labelDialogs(linkImage): $descrip"
            } else {
                set linkDescription($i) $descriptionText
            }
        }
        regsub -all {<.*?>} $linkDescription($i) {} linkDescription($i)

        while {[regexp {(?:&)([^ ;]+)(;)?} $linkDescription($i) old entity]} {
            regsub {#} $entity {} entity

# Eventually this should be replaced wirh "string is number"
            if {[regexp {[0-9]+} $entity]} {
                set new [format %c $entity]
                regsub -all {([\\])} $new {\\\1} new
            } else {
                if {[catch {set entities($entity)} new]} {
                    set new ""
                }
            }
            regsub -all $old $linkDescription($i) $new linkDescription($i)
        }
    }
    return
}

###############################################################################
# ChangeEncoding
#    Changes the encoding in which the description of the links are written
#
# Parameters:
#    newEncoding: The encoding to use.
###############################################################################
proc ChangeEncoding {newEncoding} {
    variable nLinks
    variable linkDescription

    for {set i 1} {$i<$nLinks} {incr i} {
        set linkDescription($i) \
                [encoding convertfrom $newEncoding $linkDescription($i)]
    }

    return
}

###############################################################################
# Parsing
#    Reads the Web page passed as a parameter and proccess it to extract
#    all links.
#
# Parameters:
#    file: file which contains the page to process.
#
# Returns:
#    - '0': No errors.
#    - '1': Couldn't open file.
#
# Side efects:
#    'nLinks': number of links plus one.
#    'linkFile' keeps the links.
#    'linkDescription' the description of the link.
###############################################################################
proc Parsing {file} {
    variable nLinks
    variable linkFile
    variable linkDescription
    variable linkType
    variable pageEncoding
    global labelDialogs labelTitles

    if [string match $file ""] return

#puts "Fichero a preprocesar: $file"

    set nLinks  1

    if [catch {open $file r} leer] {
        return 1
    }
    catch {unset linkType}
    set pageEncoding ""
    for {set cosaOld "aaaa" ; set rep 0; set cosa [gets $leer]; set thumbnailNext 0} \
            {(![eof $leer]) || ([string compare $cosa ""])} \
            {if {![eof $leer]} {append cosa " " [gets $leer] " "} } {
#puts "\nCosa: $cosa"
       if {$cosaOld==$cosa} {
            incr rep
            if {$rep==10} {
                set cosa ""
                set rep 0
            }
        } else {
            set rep 0
            set cosaOld $cosa
        }
        regexp -nocase {(?:<meta[^>]*)(?:charset=)(?:\"|')?([^\"' ]*)} $cosa \
                nada pageEncoding

        if {[regexp {^\ ?<!} $cosa]} {
            while {![regexp {(?:>)(.*)} $cosa nada cosa]} {
                append cosa " " [gets $leer]
            }
            continue
        }
        if {[regexp -nocase {^\ ?<script} $cosa]} {
            while {![regexp -nocase {(?:/script>)(.*)} $cosa nada cosa]} {
                append cosa " " [gets $leer]
            }
            continue
        }
        if {![regexp -nocase {((<)(a\ $|a[^>]+h|link|img|frame|area|!|script|form)(.*))} $cosa cosa]} {
            set cosa ""
            continue
        }
        if [regexp -nocase  \
            {((?:<a[^>]+href *)|(?:<img[^>]* src *)|(?:<link[^>]+href *)|(?:<frame[^>]+src *)|(?:<form method=\"link\".*action))((?:= *)(?:\"|\')?([^#\"\'\ >]+))(?:.*?>)(.*)} \
			    $cosa nada diff alt fileName cosa] {
#puts "\nNombre fichero: $fileName\n"
            if [info exists repetidos($fileName)] {
                continue
            } else {
                set repetidos($fileName) 1
            }
            if {[regexp {<!} $diff]} continue
		    if {[regexp -nocase {(javascript)(.*)(\()} $fileName]} continue
            regexp -nocase {(?:ps?:)(/[^/].*)} $fileName nada fileName
            if [regexp -nocase {href} $diff] {
                if [regexp -nocase {^<area} $diff] {
                    set linkDescription($nLinks) "$labelDialogs(map)"
                } elseif {[regexp -nocase {^<link} $diff]} {
                    set linkDescription($nLinks) "$labelDialogs(css)"
                } else {
                    while {![regexp -nocase {(.*?)(</a *>)} $cosa nada \
                            descripcion]} {
                        if [eof $leer] break
                        append cosa " " [gets $leer]
                    }
                    if {![catch {set descripcion}]} {
                        set linkDescription($nLinks) $descripcion
                        if [regexp -nocase {<img[^>]* src} $descripcion] {
                            set thumbnailNext 1
                        }
                    } else {
                        set linkDescription($nLinks) "$labelTitles(preError)"
                    }
                }
            } elseif [regexp -nocase {frame} $diff] {
                set linkDescription($nLinks) "$labelDialogs(frame): $fileName"
            } else {
                if {![regexp -nocase {(?:ALT *= *\")(.*?)(\")} $alt \
                        nada descrip]} {
                    if {![regexp -nocase {(?:src *={^[\ \t]*$} *\")(.*?)(\")} \
                            $alt nada descrip]} {
                        set descrip $fileName
                    }
	            }
                set linkDescription($nLinks) "$labelDialogs(image): $descrip"
                if {$thumbnailNext==0} {
                    set linkType($nLinks) image
                } else {
                    set linkType($nLinks) thumb
                    set thumbnailNext 0
                }
            }
            set linkFile($nLinks) $fileName
            incr nLinks
        } else {
            regexp -nocase {(?:[^<]*)(<.*)} $cosa nada cosa
        }
    }
    close $leer

    return
}

###############################################################################
# RemoveLinks
#    Removes a link from linkFile and linkDescription as ordered by procedure
#    'FilterLinks'
#
# Parameter
#    index: index of the entry to be removed from the arrays.
#
# Side effect:
#    nFilteredLinks gets decremented to keep count of the number of links
#    after the filters are applied.
###############################################################################
proc RemoveLinks {index} {
    variable nLinks
    variable nFilteredLinks
    variable linkFile
    variable linkDescription
    variable linkType

    for {set i $index;set j [expr $index+1]} {$j<$nFilteredLinks} \
            {incr i;incr j} {
        set linkFile($i)        $linkFile($j)
        set linkDescription($i) $linkDescription($j)

        if [info exists linkType($j)] {
            set linkType($i) $linkType($j)
            catch {unset linkType($j)}
        } else {
            catch {unset linkType($i)}
        }
    }
    unset linkFile($i)
    unset linkDescription($i)
    catch {unset linkType($i)}
    incr nFilteredLinks -1

    return
}

###############################################################################
# FilterLinks
#    Filters the links extracted from a page according to the rules given.
#
# FIX ME: right now the rules are fixed to those Getleft needs, but a way to
# indicate them at call time should be added soon.
#
# Parameters
#   referer: The url of the page we got the links from.
#
# Side effects:
#    nLinks, linkFile and linkDescription are upated to the new, filtered links
###############################################################################
proc FilterLinks {referer} {
    variable nLinks
    variable nFilteredLinks
    variable linkFile
    variable linkDescription
    variable linkType
    global options filesDone siteUrl directories

    for {set i 1;set nFilteredLinks $nLinks} {$i<$nFilteredLinks} {} {
        set link $linkFile($i)
        if {[regexp -nocase {^mailto:|^news:} $link]} {
            RemoveLinks $i
            continue
        }
        if {([regexp {\?} $link])&&($options(cgi)==0)} {
            RemoveLinks $i
            continue
        }
        if {[regexp -nocase {(javascript)(.*)(\()} $link]} {
            RemoveLinks $i
            continue
        }
        if [info exists filesDone([TidyNames \
                [TidyDir [file join $directories(base)$siteUrl(dir) $link]]])] {
            RemoveLinks $i
            continue
        }
        if {[regexp {^ftp:.*/$} $link]} {
            RemoveLinks $i
            continue
        }
        if {([string first :// $link]!=-1 ) } {
            set parsedUrl [ParseUrl $link]
            set protocol      [lindex $parsedUrl 0]
            set direccion_www [lindex $parsedUrl 1]
            set fileName      [lindex $parsedUrl 2]
            if {[regexp -nocase {https} $protocol]} {
                RemoveLinks $i
                continue
            }
            if ![info exists direccion_www] {
                RemoveLinks $i ;# Maybe something should be put in the error \
                                  log about this.
                continue
            }
            if {[string compare $siteUrl(www) $direccion_www]} {
                if {$options(external)==0} {
                    RemoveLinks $i
                    continue
                }
            }
        }
        if {($options(dir)==0)&&($siteUrl(base)!="")} {
            set link [CompleteUrl $link $referer]
            regexp {(?:://)(?:[^/]+)(.*)} $link nada link
            if {![regexp -nocase "^$siteUrl(base)" $link]} {
                RemoveLinks $i
                continue
            }
        }
        set remove 0
        catch {
            if {($linkType([expr $i+1])=="thumb")&&($options(onlyThumb)==1)} {
                if {[regexp -nocase {(gif$)|(jpg$)|(bmp$)|(xbm$)|(tiff$)|(png$)} $linkFile($i)]} {
                    set remove 1
                }
            }

        }
        catch {
            if {($linkType($i)=="thumb")&&($options(noThumb)==1)} {
                set remove 1
            }
        }
        if {$remove==1} {
            RemoveLinks $i
            continue
        }
        incr i
    }
    set nLinks $nFilteredLinks

    return
}

###############################################################################
# CompleteUrl
#    Given a link, this procedure returns the full Url of that link, for
#    example, a link from a page may be '../index.html', this procedure
#    will return something link 'http://www.algo.es/cosas/index.html'
#
# Parameter
#    link: I'll let you guess
#    referer: url of the referer page for the link
#
# Returns
#    The url
###############################################################################
proc CompleteUrl {link referer} {
    global siteUrl

    if [regexp {://} $link] {
        return $link
    }

    # Workaround for Webmasters that don't kopw the directory separator is
    # a single /
    regsub -all {//} $link {/} link

    set parsedUrl [ParseUrl $referer]
    set prot   [lindex $parsedUrl 0]
    set domain [lindex $parsedUrl 1]
    set dir    [lindex $parsedUrl 2]

    if [regexp {(?::/)([^/].*)} $link nada fileName] {
        set url $prot://$domain/$fileName
        return $url
    }
    if [regexp {^/} $link] {
        set url $prot://$domain$link
        return $url
    }
    set fileName [TidyDir $dir/$link]
    set url "$prot://$domain$fileName"

    return $url
}

###############################################################################
# AbsoluteLinks
#    Makes sure that all links in linkFile are complete with protocol and
#    site.
#
# Parameter:
#    referer: url of the referer page for the link.
###############################################################################
proc AbsoluteLinks {referer} {
    global siteUrl
    variable nLinks
    variable linkFile

    for {set i 1} {$i<$nLinks} {incr i} {
        set linkFile($i) [CompleteUrl $linkFile($i) $referer]
    }

    if {$::DEBUG==1} {
#        puts "Despues de juntarlos"
#        ShowLinks
    }

    return
}

}
