#! /usr/common/bin/gawk -f
#
# usage: testlink.awk EIS.file > TEST.html
#    or: testlink.awk EIS.file > /dev/null
#
# - this is a beta test version (0.4)
# - all links in %H lines are extracted (duplicates are only counted);
#   redirect the output to a temporary TEST.html for link checking.
# - different URL fragments (#) are handled as different links
# - relative URLs are intentionally not handled (not used in %H lines)
# - https:// and gopher:// are not yet handled (and reported as errors)
# - all errors, changes, and unsolved problems are reported in LOGFILE,
#   see BEGIN below (use e.g. file ./report_link or /dev/tty etc.)
#
# NOTE(review): the HTML markup emitted by the END block was missing from
# the copy of this script that was reviewed (the string literals were empty
# or unterminated).  The tags below are a minimal reconstruction that makes
# the output usable for link checking -- confirm against the original.

#### report error #################################################

# Count one unrecognized line and append it to LOGFILE.
#   BAD - the offending input line, written to the log verbatim
function ERROR( BAD )
{
    ++MM
    print BAD > LOGFILE                 # unrecognized href
}

#### LOGFILE ######################################################

BEGIN {
    LOGFILE = "report_link"             # keep error messages
    SCHEMES = "^(ht|f)tp://"            # match http:// and ftp:// only
    STOP    = "(a|A)>"                  # tail of the closing </a> or </A>
    NN      = 0                         # number of different links
    MM      = 0                         # number of unrecognized lines
}

#### handle link ##################################################

# Extract every href="..." anchor of a "%H A..." line.  Any anchor that
# cannot be parsed gets the whole line logged via ERROR and abandons the
# rest of that line.
/^%H A/ {
    LINE = $0

    while ( match( LINE, /(href|HREF)="/ ))
    {
        # drop everything up to and including href="
        LINE = substr( LINE, RSTART + RLENGTH )

        # THIS = URL, closing quote, tag end and link text, up to the
        # "a>" of the closing tag; LINE = remainder still to be scanned
        if ( ! match( LINE, STOP )) { ERROR( $0 ) ; next }
        THIS = substr( LINE, 1, RSTART - 1 )
        LINE = substr( LINE, RSTART + RLENGTH )

        # split THIS at the closing quote ... '>' (longest match)
        if ( ! match( THIS, /".*>/ )) { ERROR( $0 ) ; next }
        HREF = substr( THIS, 1, RSTART - 1 )
        THIS = substr( THIS, RSTART + RLENGTH )

        # STOP only consumed "a>", so the link text still ends in "</";
        # remove it so that the blank-text check below is meaningful and
        # the stored text contains no half tag
        sub( /<\/$/, "", THIS )

        if ( ! match( HREF, SCHEMES )) { ERROR( $0 ) ; next }
        if ( ! match( THIS, /[^ ]/ ))   { ERROR( $0 ) ; next }

        if ( HREF in LINK )
            LINK[ HREF ]++              # duplicate: only counted
        else
        {
            LINK[ HREF ] = 1
            TEXT[ HREF ] = THIS         # link text of first occurrence
            ANUM[ HREF ] = $2           # second field of the %H line
                                        # (presumably the A-number)
            NN++                        # count different link
        }
    }

    # LINE is untouched only if no href=" was found at all
    if ( LINE == $0 ) ERROR( $0 )
}

{
    next                                # print nothing else
}

#### print HTML ###################################################

# Emit one anchor per different link (in unspecified array order) with
# its occurrence count, then write the totals to LOGFILE and, unless
# LOGFILE already is the terminal, repeat them on /dev/tty.
END {
    X = NN " EIS links"
    print "<HTML><HEAD><TITLE>" X "</TITLE></HEAD>"
    print "<BODY><H1>" X "</H1>"

    for ( X in LINK )
    {
        LINE = ANUM[ X ] " <A HREF=\"" X "\">"
        LINE = LINE TEXT[ X ] "</A> ("
        print LINE LINK[ X ] ")<BR>"
    }
    print "</BODY></HTML>"

    print NN " href= links found" > LOGFILE
    print MM " unrecognized href" > LOGFILE
    close( LOGFILE )                    # flush the report before exit

    if ( LOGFILE == "/dev/tty" ) exit   # summary already on terminal

    print "see report in " LOGFILE ":" > "/dev/tty"
    print NN " href= links found"      > "/dev/tty"
    print MM " unrecognized href"      > "/dev/tty"
}