Sunday, September 30, 2012

Disk Usage Summary per User and Time, take 2

I would very much like to compare awk (gawk) with python and therefore I coded the same thing in gawk. Here is the code:
#! /bin/bash
#
# count user file size by block
#


# Require exactly one argument: the directory to summarize.
if [[ $# -ne 1 ]]; then
    echo "Usage: $0 <directory>" >&2
    exit 1
fi


# Stop when the argument is not an existing directory; without the exit the
# script would carry on and run ls on a path that cannot be listed.
if [[ ! -d "$1" ]]; then
    echo "Error. $1 does not exist" >&2
    exit 1
fi


#######################################
# Render the per-user, per-age disk usage table from a recursive ls listing.
# Arguments: $1 - reference time ("now") in seconds since the epoch
# Input:     stdin - output of `ls -lRs --time-style=+%s`, sizes in 1K blocks
#            (fields: blocks perms links user group size epoch name...)
# Outputs:   the formatted report on stdout, sizes converted to GB
#######################################
summarize_usage() {
    awk -v now="$1" '

# Column titles; widths match the data rows printed in END.
function print_header() {
    printf("%-15s %8s %8s %8s %8s %8s %8s %8s %8s\n",
        "User", "0m-1m", "1m-3m", "3m-6m", "6m-1y",
        "1y-2y", "2y-3y", "3y-  ", "Total")
}

# Separator row of dashes; d8 and d15 are locals, not parameters.
function print_line(    d8, d15) {
    d8 = "--------"
    d15 = "---------------"
    printf("%-15s %8s %8s %8s %8s %8s %8s %8s %8s\n", d15, d8, d8, d8, d8, d8, d8, d8, d8)
}

function print_footer() {
    printf("\nNote: Size in GB\n")
}

# Map a file age in seconds to a report column (1..7).
# Ages below zero (timestamps in the future, e.g. clock skew) fall into the
# newest column and anything past three years into the oldest, so every
# entry lands in exactly one column instead of reusing a stale value.
function age_bucket(dt) {
    if (dt < y0m1) return 1
    if (dt < y0m3) return 2
    if (dt < y0m6) return 3
    if (dt < y1m0) return 4
    if (dt < y2m0) return 5
    if (dt < y3m0) return 6
    return 7
}

BEGIN {
    print_header()
    print_line()

    # 1K blocks per GB
    factor = 1024.0 * 1024.0

    # Age boundaries in seconds; a month is taken as 30 days, a year as 365.
    y0m1 = 1 * 30 * 86400
    y0m3 = 3 * 30 * 86400
    y0m6 = 6 * 30 * 86400
    y1m0 = 1 * 365 * 86400
    y2m0 = 2 * 365 * 86400
    y3m0 = 3 * 365 * 86400
}

# Match directory, link and regular file entries.
# The numeric checks on the block count and the timestamp, plus the field
# count, keep "total N" lines and directory headers such as "/srv/my dir:"
# from being mistaken for entries.
$1 ~ /^[0-9]+$/ && $2 ~ /^[dl-]/ && $7 ~ /^-?[0-9]+$/ && NF >= 8 {
    block = $1
    user = $4
    cnt = age_bucket(now - $7)

    users[user] = 1
    summary[user, cnt] += block
    total_time[cnt] += block
    total_user[user] += block
}

END {
    # Sort user names with a plain insertion sort on strings: asorti() only
    # exists in gawk, while this script runs whatever "awk" resolves to.
    n = 0
    for (user in users) {
        for (j = n; j >= 1 && (names[j] "") > (user ""); --j)
            names[j + 1] = names[j]
        names[j + 1] = user
        ++n
    }

    # One row per user; buckets never seen evaluate to 0.
    for (i = 1; i <= n; ++i) {
        user = names[i]
        printf("%-15s", user)
        for (cnt = 1; cnt <= 7; ++cnt)
            printf(" %8.2f", summary[user, cnt] / factor)
        printf(" %8.2f\n", total_user[user] / factor)
    }

    print_line()

    # Totals per age column plus the grand total.
    total = 0.0
    printf("%15s", "Total:")
    for (cnt = 1; cnt <= 7; ++cnt) {
        total += total_time[cnt]
        printf(" %8.2f", total_time[cnt] / factor)
    }
    printf(" %8.2f\n", total / factor)

    print_footer()
}
'
}

now=$(date +%s)
# --block-size=1K pins the unit of the -s column so the GB conversion holds
# regardless of POSIXLY_CORRECT or BLOCK_SIZE in the environment.
ls -lRs --block-size=1K --time-style=+%s -- "$1" | summarize_usage "$now"


With 814MB and 10,208 files in /var, the Python solution took 1.17s and the gawk one took 0.95s. I have yet to find out how the two compare for millions of files.

2 Comments:

Blogger Jorge said...

Hi,
may I suggest having the article titles (instead of the month names) on the right "archive" panel section?
I think this will make it easier to find what the audience may be interested in.

thanks for sharing such valuable info.

Jorge

11:04 PM  
Blogger chihungchan said...

Thanks for the advice. I will switch to some new layouts once I have more free time.

10:38 AM  

Post a Comment

<< Home