#! /usr/common/bin/gawk -f
#
# usage: testlink.awk EIS.file > TEST.html
#   or : testlink.awk EIS.file > /dev/null
#
# - this is a beta test version (0.4)
# - all links in %H lines are extracted (duplicates only counted),
#   redirect the output to a temporary TEST.html for link checking.
# - different URL fragments (#) handled as different links
# - relative URLs intentionally not handled (not used in %H lines),
# - https:// and gopher:// not yet handled (and reported as error),
# - all errors, changes, and unsolved problems reported in LOGFILE,
#   see below BEGIN (use e.g. file ./report_link or /dev/tty etc.)
#
# NOTE(review): the HTML tags inside the string literals of this script
# had been lost (STOP was "", the page header/footer strings were empty,
# and the list items carried no anchor).  They are reconstructed below
# from the surrounding logic; confirm the exact markup against an
# original copy of the script if one is available.

#### report error #################################################

# Count one unrecognized line and append it to LOGFILE.
#   BAD : the complete input line that could not be processed.
# Side effects: increments the global counter MM.
function ERROR( BAD )
{
    ++MM
    print BAD > LOGFILE             # unrecognized href
}

#### LOGFILE ######################################################

BEGIN {
    LOGFILE = "report_link"         # keep error messages
    SCHEMES = "^(ht|f)tp://"        # match http:// etc.
}

#### handle link ##################################################

# For every "%H A..." line, walk through all href="..." attributes.
# Each anchor is split into its URL (HREF) and its link text (THIS).
# Globals filled here, all indexed by URL:
#   LINK[] : number of occurrences
#   TEXT[] : link text of the first occurrence
#   ANUM[] : second field ($2) of the line of the first occurrence
#   NN     : number of different URLs
/^%H A/ {
    LINE = $0
    # End of an anchor.  Must not be empty: match() with an empty
    # pattern succeeds with RLENGTH 0 at position 1, which leaves THIS
    # empty and rejects every link.  Presumably the closing anchor tag
    # in either case, like href/HREF above -- TODO confirm.
    STOP = "</[Aa]>"

    while ( match( LINE, /(href|HREF)="/ ))
    {
        # drop everything up to and including href="
        LINE = substr( LINE, RSTART + RLENGTH )

        # the first error on a line abandons the rest of that line
        if ( ! match( LINE, STOP )) { ERROR( $0 ) ; next }
        THIS = substr( LINE, 1, RSTART - 1 )        # URL">text
        LINE = substr( LINE, RSTART + RLENGTH )     # rest of the line

        # closing quote of the URL up to the last ">" before the text
        # (".*" is greedy, so leading markup in the text is dropped)
        if ( ! match( THIS, /".*>/ )) { ERROR( $0 ) ; next }
        HREF = substr( THIS, 1, RSTART - 1 )
        THIS = substr( THIS, RSTART + RLENGTH )

        if ( ! match( HREF, SCHEMES )) { ERROR( $0 ) ; next }   # not http/ftp
        if ( ! match( THIS, /[^ ]/ ))  { ERROR( $0 ) ; next }   # blank text

        if ( ! ( HREF in LINK ))
        {
            LINK[ HREF ] = 1
            TEXT[ HREF ] = THIS
            ANUM[ HREF ] = $2
            NN++                    # count different link
        }
        else
            LINK[ HREF ] = LINK[ HREF ] + 1
    }
    if ( LINE == $0 ) ERROR( $0 )   # "%H A" line without any href="
}

{
    next                            # print nothing else
}

#### print HTML ###################################################

# Write one list item per different URL to stdout, then the summary to
# LOGFILE and, unless LOGFILE already is the terminal, to /dev/tty.
END {
    FOUND = NN + 0                  # force "0" if no link was found
    WRONG = MM + 0                  # force "0" if no error occurred
    TITLE = FOUND " EIS links"

    print "<HTML><HEAD><TITLE>" TITLE "</TITLE></HEAD><BODY><UL>"
    for ( X in LINK )
    {
        # the URL itself must appear as an anchor, otherwise the
        # generated page gives a link checker nothing to follow
        LINE = "<LI>" ANUM[ X ] " <A HREF=\"" X "\">"
        LINE = LINE TEXT[ X ] "</A> ("
        print LINE LINK[ X ] ")"
    }
    print "</UL></BODY></HTML>"

    print FOUND " href= links found" > LOGFILE
    print WRONG " unrecognized href" > LOGFILE

    if ( LOGFILE == "/dev/tty" ) exit       # already shown on terminal

    print "see report in " LOGFILE ":" > "/dev/tty"
    print FOUND " href= links found"   > "/dev/tty"
    print WRONG " unrecognized href"   > "/dev/tty"
}