Disk Usage Summary per User and Time, take 2
I wanted to compare awk (gawk) with Python, so I coded the same thing in gawk. Here is the code:
#! /bin/bash
#
# count user file size by block
#
# Validate the command line: exactly one argument is required and it must
# name an existing directory.  Diagnostics go to stderr, and every failure
# path ends the script with a non-zero status.
if [[ $# -ne 1 ]]; then
  printf 'Usage: %s <directory>\n' "$0" >&2
  exit 1
fi
if [[ ! -d "$1" ]]; then
  printf 'Error. %s does not exist\n' "$1" >&2
  # Stop here: without this exit the listing below would still run against
  # the bad path and print an empty, misleading report.
  exit 1
fi
# Take the current time once so that every entry is aged against the same
# instant, then pipe a recursive long listing (block count in the first
# column, timestamp printed as epoch seconds) into awk.  The directory is
# quoted and preceded by -- so that names containing whitespace or starting
# with a dash reach ls as a single operand.
now=$(date +%s)
ls -lRs --time-style=+%s -- "$1" | awk -v now="$now" '
# Print the table heading: a left-justified user column, one right-justified
# column for each of the seven age buckets, and a final total column.
function print_header() {
    printf "%-15s %8s %8s %8s %8s %8s %8s %8s %8s\n",
        "User",
        "0m-1m", "1m-3m", "3m-6m", "6m-1y", "1y-2y", "2y-3y", "3y- ",
        "Total"
}
# Print a horizontal rule made of dashes, using the same column layout as
# the heading and data rows.
# Note: d15 and d8 are not declared as locals, so they are awk globals.
function print_line() {
    d15 = "---------------"
    d8 = "--------"
    printf "%-15s %8s %8s %8s %8s %8s %8s %8s %8s\n",
        d15,
        d8, d8, d8, d8, d8, d8, d8,
        d8
}
# Print the closing footnote, preceded by a blank line, naming the unit of
# the figures in the table.
# NOTE(review): the GB label assumes ls -s reports 1 KiB blocks - confirm
# for the ls in use.
function print_footer() {
    printf "\nNote: Size in GB\n"
}
# Startup: define the constants shared by the per-entry rule and the END
# report, then print the table heading and its separator.
BEGIN {
    # Divisor applied to the summed block counts when they are printed.
    # NOTE(review): yields GB only if ls -s counts 1 KiB blocks - confirm.
    factor = 1024.0 * 1024.0

    # Age-bucket boundaries in seconds.  A month is taken as 30 days and a
    # year as 365 days.
    day = 86400
    month = 30 * day
    year = 365 * day

    y0m0 = 0
    y0m1 = month
    y0m3 = 3 * month
    y0m6 = 6 * month
    y1m0 = year
    y2m0 = 2 * year
    y3m0 = 3 * year
    yxm0 = 100 * year

    print_header()
    print_line()
}
# Per-entry rule: account the blocks of one listed directory, symlink or
# regular file to its owner and to an age bucket.
#
# Expected fields of an "ls -ls --time-style=+%s" entry line:
#   $1 blocks  $2 mode  $3 links  $4 owner  $5 group  $6 size  $7 epoch  $8.. name
#
# The pattern requires at least 8 fields and a numeric epoch in addition to
# the mode prefix, so that "total N" lines and "path:" directory headers
# (which may contain words starting with d, l or -) are never counted.
NF >= 8 && $2 ~ /^[dl-]/ && $7 ~ /^-?[0-9]+$/ {
    block = $1
    user = $4
    epoch = $7
    users[user] = 1

    # Age of the entry in seconds relative to the start of the run.
    dt = now - epoch

    # Pick exactly one bucket for every entry.  The chain always assigns
    # cnt, so a value left over from the previous record can never be
    # reused: timestamps in the future (negative age) land in the newest
    # bucket and anything at or beyond three years lands in the open-ended
    # last bucket.
    if (dt < y0m1) {
        cnt = 1
    } else if (dt < y0m3) {
        cnt = 2
    } else if (dt < y0m6) {
        cnt = 3
    } else if (dt < y1m0) {
        cnt = 4
    } else if (dt < y2m0) {
        cnt = 5
    } else if (dt < y3m0) {
        cnt = 6
    } else {
        cnt = 7
    }

    summary[user, cnt] += block
    total_time[cnt] += block
    total_user[user] += block
}
# Final report: one row per user in sorted order, a separator, one totals
# row across all users, and the unit footnote.  All figures are the summed
# block counts divided by factor.
END {
    # asorti is a gawk extension: it stores the indices of users, sorted,
    # in users_sorted[1..n] and returns n.
    n = asorti(users, users_sorted)
    for (i = 1; i <= n; ++i) {
        user = users_sorted[i]
        printf("%-15s", user)
        # A bucket with no entries reads as an empty value, which awk
        # treats as 0 in arithmetic, so no explicit zero-fill is needed.
        for (cnt = 1; cnt <= 7; ++cnt) {
            printf(" %8.2f", summary[user, cnt] / factor)
        }
        # Per-user total across all buckets.
        printf(" %8.2f\n", total_user[user] / factor)
    }
    print_line()

    # Totals per age bucket, plus the grand total in the last column.
    total = 0.0
    printf("%15s", "Total:")
    for (cnt = 1; cnt <= 7; ++cnt) {
        total += total_time[cnt]
        # Use the same " %8.2f" cell format as the user rows; a space flag
        # here would push the row one column right for 8-character values.
        printf(" %8.2f", total_time[cnt] / factor)
    }
    printf(" %8.2f\n", total / factor)
    print_footer()
}
'
With 814MB and 10,208 files in /var, the Python solution took 1.17s and the gawk version took 0.95s. I have yet to find out how the two compare for millions of files.


2 Comments:
Hi,
may I suggest having the article titles (instead of the month names) on the right "archive" panel section?
I think this will make it easier to find what the audience may be interested in.
thanks for sharing such valuable info.
Jorge
Thanks for the advice. I will switch to some new layouts once I have more free time.
Post a Comment
<< Home