I've done two versions of this script:
# visits-IIS6.awk
#
# Generates two new files given an IIS 6.0-format server log of Web hits.
# 1) (sent to stdout for you to pipe wherever you like)
#    Assigns each visit a unique long integer as an additional, first field in the new log file.
#    The integer is formed by concatenating the POSIX timecode of the visit's first hit
#    with the IP stripped of periods. This should work at least until 2038-01-19 03:14:07 UTC.
# 2) firsthits.txt (you can change this name in the BEGIN section)
#    Separate text file containing one line for the initial hit of each unique visit. The format
#    of each line is identical to the format of the main log file.
#
# The first output file preserves all the data from the original IIS server log (excluding log remarks).
# The record order in that main output file is: all hits from one visit are adjacent in
# chronological order. The visits are grouped only roughly in chronological order. The file
# firsthits.txt allows you to link the two tables (if you import this data into a database)
# to do sorting and grouping of visits.
#
# Example DOS command: c:\util\awk\gawk -f h:\data\visits-IIS6.awk yourHits.log > yourVisits.txt
#
# Requires gawk from gnu.org
# Copyright (C) October 2002 Alan Ng nospamalan@alan-ng.net. Remove "nospam" to contact me.
# Revised January 2005 for IIS 6 format.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# You can view a copy of the GNU General Public License at:
# http://www.gnu.org/copyleft/gpl.html
#
# Settings (each may also be overridden on the command line with -v name=value):
#   timer     - seconds without a hit from an IP after which its visit expires
#   visitfile - path of the file that receives the first hit of every visit
#   ipfield   - 1-based column of the visitor IP ("c-ip"); when not given with -v
#               it defaults to 11 and is re-read from each "#Fields:" remark
#
# Records that cannot be processed (fewer than ipfield columns, or a date/time
# that gawk's mktime() rejects) are reported on stderr and left out of the output.

BEGIN {
    ### definition of "visit" is that it expires upon no hit from that IP for timer seconds
    ### you may change this to any positive integer value
    if (timer == "")
        timer = 600

    ### you may change this to any valid path/filename
    if (visitfile == "")
        visitfile = "firsthits.txt"

    ### column of the visitor IP ("c-ip" in IIS log); 11 matches the layout this
    ### script was written for. Auto-detection from "#Fields:" is enabled only
    ### when the column was not forced with -v ipfield=N.
    if (ipfield == "") {
        ipfield = 11
        autofield = 1
    }
}

### reject all IIS log remark lines, which start with '#' and may occur anywhere in a log file
$1 ~ /^#/ {
    ### "#Fields: date time ... c-ip ..." names the columns of the records that follow
    if (autofield && $1 == "#Fields:") {
        for (col = 2; col <= NF; col++) {
            if ($col == "c-ip") {
                ipfield = col - 1    # the "#Fields:" token itself occupies $1
                break
            }
        }
    }
    next
}

# MAIN
{
    ### a record too short to hold the IP column cannot be attributed to any visit
    if (NF < ipfield) {
        print FILENAME ":" FNR ": record skipped, fewer than " ipfield " fields" > "/dev/stderr"
        next
    }

    ### parse timestamp
    ### assumes that first two fields of each log line are date (YYYY-MM-DD) and time (HH:MM:SS)
    ### NOTE(review): mktime() interprets the stamp as local time while IIS W3C logs are
    ### normally written in UTC -- confirm this does not matter for your time zone.
    time = mktime(gensub(/-/, " ", "g", $1) " " gensub(/:/, " ", "g", $2))
    if (time == -1) {
        ### mktime() returns -1 when it cannot interpret its argument
        print FILENAME ":" FNR ": record skipped, unparsable timestamp \"" $1 " " $2 "\"" > "/dev/stderr"
        next
    }

    ### output and then delete visits that are now expired
    for (ip in ids) {
        if ((time - last[ip]) > timer) {
            print transcript[ip]
            print firsthit[ip] > visitfile
            delete transcript[ip]
            delete firsthit[ip]
            delete ids[ip]
            delete last[ip]    # was "delete end[ip]": "end" is never filled, so "last" leaked
        }
    }

    ### store the current hit
    visitor = $ipfield
    if (!(visitor in ids)) {
        ### new visit: ID = POSIX time of the first hit followed by the IP without periods
        ### NOTE(review): distinct IPs can strip to the same digits (1.11.1.1 / 11.1.1.1),
        ### so two visits starting in the same second may share an ID.
        ids[visitor] = time gensub(/\./, "", "g", visitor)
        last[visitor] = time
        firsthit[visitor] = ids[visitor] " " $0
        transcript[visitor] = ids[visitor] " " $0
    } else {
        ### build visit transcript
        last[visitor] = time
        transcript[visitor] = transcript[visitor] "\n" ids[visitor] " " $0
    }
}

END {
    ### output all leftover (= unexpired) visits
    for (ip in ids) {
        print transcript[ip]
        print firsthit[ip] > visitfile
    }
    ### flush the first-hit file explicitly (a no-op if it was never opened)
    close(visitfile)
}
# visits.awk
#
# Generates two new files given an IIS 5.0-format server log of Web hits.
# 1) (sent to stdout for you to pipe wherever you like)
#    Assigns each visit a unique long integer as an additional, first field in the new log file.
#    The integer is formed by concatenating the POSIX timecode of the visit's first hit
#    with the IP stripped of periods. This should work at least until 2038-01-19 03:14:07 UTC.
# 2) firsthits.txt (you can change this name in the BEGIN section)
#    Separate text file containing one line for the initial hit of each unique visit. The format
#    of each line is identical to the format of the main log file.
#
# The first output file preserves all the data from the original IIS server log (excluding log remarks).
# The record order in that main output file is: all hits from one visit are adjacent in
# chronological order. The visits are grouped only roughly in chronological order. The file
# firsthits.txt allows you to link the two tables (if you import this data into a database)
# to do sorting and grouping of visits.
#
# Example DOS command: c:\util\awk\gawk -f h:\data\visits.awk yourHits.log > yourVisits.txt
#
# Requires gawk from gnu.org
# Copyright (C) October 2002 Alan Ng nospamalan@alan-ng.net. Remove "nospam" to contact me.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# You can view a copy of the GNU General Public License at:
# http://www.gnu.org/copyleft/gpl.html
#
# Settings (each may also be overridden on the command line with -v name=value):
#   timer     - seconds without a hit from an IP after which its visit expires
#   visitfile - path of the file that receives the first hit of every visit
#   ipfield   - 1-based column of the visitor IP ("c-ip"); when not given with -v
#               it defaults to 3 and is re-read from each "#Fields:" remark
#
# Records that cannot be processed (fewer than ipfield columns, or a date/time
# that gawk's mktime() rejects) are reported on stderr and left out of the output.

BEGIN {
    ### definition of "visit" is that it expires upon no hit from that IP for timer seconds
    ### you may change this to any positive integer value
    if (timer == "")
        timer = 600

    ### you may change this to any valid path/filename
    if (visitfile == "")
        visitfile = "firsthits.txt"

    ### column of the visitor IP ("c-ip" in IIS log); 3 matches the layout this
    ### script was written for. Auto-detection from "#Fields:" is enabled only
    ### when the column was not forced with -v ipfield=N.
    if (ipfield == "") {
        ipfield = 3
        autofield = 1
    }
}

### reject all IIS log remark lines, which start with '#' and may occur anywhere in a log file
$1 ~ /^#/ {
    ### "#Fields: date time c-ip ..." names the columns of the records that follow
    if (autofield && $1 == "#Fields:") {
        for (col = 2; col <= NF; col++) {
            if ($col == "c-ip") {
                ipfield = col - 1    # the "#Fields:" token itself occupies $1
                break
            }
        }
    }
    next
}

# MAIN
{
    ### a record too short to hold the IP column cannot be attributed to any visit
    if (NF < ipfield) {
        print FILENAME ":" FNR ": record skipped, fewer than " ipfield " fields" > "/dev/stderr"
        next
    }

    ### parse timestamp
    ### assumes that first two fields of each log line are date (YYYY-MM-DD) and time (HH:MM:SS)
    ### NOTE(review): mktime() interprets the stamp as local time while IIS W3C logs are
    ### normally written in UTC -- confirm this does not matter for your time zone.
    time = mktime(gensub(/-/, " ", "g", $1) " " gensub(/:/, " ", "g", $2))
    if (time == -1) {
        ### mktime() returns -1 when it cannot interpret its argument
        print FILENAME ":" FNR ": record skipped, unparsable timestamp \"" $1 " " $2 "\"" > "/dev/stderr"
        next
    }

    ### output and then delete visits that are now expired
    for (ip in ids) {
        if ((time - last[ip]) > timer) {
            print transcript[ip]
            print firsthit[ip] > visitfile
            delete transcript[ip]
            delete firsthit[ip]
            delete ids[ip]
            delete last[ip]    # was "delete end[ip]": "end" is never filled, so "last" leaked
        }
    }

    ### store the current hit
    visitor = $ipfield
    if (!(visitor in ids)) {
        ### new visit: ID = POSIX time of the first hit followed by the IP without periods
        ### NOTE(review): distinct IPs can strip to the same digits (1.11.1.1 / 11.1.1.1),
        ### so two visits starting in the same second may share an ID.
        ids[visitor] = time gensub(/\./, "", "g", visitor)
        last[visitor] = time
        firsthit[visitor] = ids[visitor] " " $0
        transcript[visitor] = ids[visitor] " " $0
    } else {
        ### build visit transcript
        last[visitor] = time
        transcript[visitor] = transcript[visitor] "\n" ids[visitor] " " $0
    }
}

END {
    ### output all leftover (= unexpired) visits
    for (ip in ids) {
        print transcript[ip]
        print firsthit[ip] > visitfile
    }
    ### flush the first-hit file explicitly (a no-op if it was never opened)
    close(visitfile)
}
If you appreciate this page, please consider pitching in a small donation to the costs of running this Web server. Thank you!