Tuesday, January 08, 2008

Calculate Min/Max/Sum/Avg/RMS/Std in a Script

I have been working on the Netflix predictions lately, and every now and then I have to find the min, max, average, root mean square and standard deviation of some data. I have been typing a lot of one-liners to work out these values, and it is about time to put them all in one script.

Some of the sample one-liners to work out the sum and average:

awk '{s+=$1} END {print s}' data
awk '{s+=$1} END {print s/NR}' data

Below is the script, generalised to handle most situations: it can apply a statistical method to a specific column and can handle various types of field separator.

$ cat ~/bin/calc.sh
#! /bin/sh


#
# usage - print the command synopsis and the defaults on standard output.
#
# printf is used instead of echo: whether echo expands a backslash
# escape such as \t depends on the shell (some print the two characters
# literally), whereas printf always expands escapes in its format string.
#
usage()
{
        printf 'Usage: %s [-h] [-c column] [-m sum|avg|max|min|rms|std] [-f field_sep] [file ...]\n' "$0"
        printf '\tDefault: column 1, sum, white space, standard input\n'
}



#
# get arguments and flags
#
# parse_options - set column, method and fs from the option words it is
# given (starting from the defaults: column 1, sum, white space) and
# store in optcount the number of leading words that were options, so
# that the caller can shift them away and be left with the file names.
#
# The getopts builtin is used rather than re-splitting the output of the
# external getopt command. Option values are therefore taken verbatim:
# a field separator containing blanks or glob characters (-f ' ',
# -f '*') is preserved, and a value that happens to look like an option
# is no longer mistaken for one.
#
# -h, an unknown option or a missing option value prints the usage text
# and exits with status 1.
#
parse_options()
{
        column=1
        method=sum
        fs='[ \t]+'
        OPTIND=1
        # The leading ':' selects silent mode: getopts reports an unknown
        # option as '?' and a missing value as ':' without writing its
        # own diagnostic, so only the usage text is shown.
        while getopts :hc:m:f: opt; do
                case $opt in
                        c)
                                column=$OPTARG
                                ;;
                        f)
                                fs=$OPTARG
                                ;;
                        m)
                                method=$OPTARG
                                ;;
                        *)
                                usage
                                exit 1
                                ;;
                esac
        done
        # expr instead of $(( )) keeps this usable by a pre-POSIX /bin/sh.
        optcount=`expr $OPTIND - 1`
}

parse_options "$@"
shift $optcount



#
# Input operands: the remaining command-line words joined into one
# string, or "-" when that string is empty.
#
args=$*
if [ -z "$args" ]; then
        args=-
fi



#
# check arguments
#
# Both checks use case pattern matching, which needs no external command
# and looks at the value as a whole (no word splitting, no glob
# expansion, no line-by-line matching). Diagnostics go to standard
# error so that a caller capturing the result never receives an error
# message in its place. Either failure exits with status 2.
#
case $method in
        sum|max|min|avg|rms|std)
                ;;
        *)
                echo "Error. Method $method unsupported" >&2
                exit 2
                ;;
esac
# A column must be a positive decimal integer without a leading zero:
# reject the empty string, anything starting with 0 and anything that
# contains a non-digit.
case $column in
        ''|0*|*[!0-9]*)
                echo "Error. Column number has to be integer" >&2
                exit 2
                ;;
esac



#
# calc_stats - compute one statistic over one column of the input.
#
#   $1      column number (1-based)
#   $2      method: sum, avg, max, min, rms or std
#   $3      field separator, assigned to FS inside awk
#   $4 ...  input files; awk reads standard input when there are none
#           or when the name is "-"
#
# Lines with fewer than $1 fields are skipped. The result is printed on
# standard output. max and min print the winning field exactly as it
# appears in the input; std is the population standard deviation.
# sum over no data prints 0; every other method over no data writes a
# message to standard error and returns status 1.
#
calc_stats()
{
        cs_col=$1
        cs_met=$2
        cs_fs=$3
        shift 3
        # Prefer nawk as before, but fall back to awk on systems that do
        # not ship a command called nawk.
        cs_awk=nawk
        type nawk > /dev/null 2>&1 || cs_awk=awk
        # ${1+"$@"} expands to nothing at all when no file was given;
        # a bare "$@" becomes one empty word in some old shells.
        "$cs_awk" -v col="$cs_col" -v met="$cs_met" -v fs="$cs_fs" '
BEGIN {
        FS=fs
}
col<=NF{
        val=$col
        ++count
        if ( met == "sum" || met == "avg" ) {
                sum+=val
        } else if ( met == "std" ) {
                # keep every term: the deviations need the final mean
                sum+=val
                term[count]=val
        } else if ( met == "rms" ) {
                sum+=val*val
        } else if ( met == "max" ) {
                # seed from the first value instead of a fixed sentinel;
                # +0 forces a numeric comparison
                if ( count == 1 || (val+0) > (max+0) ) {
                        max=val
                }
        } else if ( met == "min" ) {
                if ( count == 1 || (val+0) < (min+0) ) {
                        min=val
                }
        }
}
END {
        if ( met == "sum" ) {
                # +0 so that empty input prints 0 rather than an empty line
                print sum+0
        } else if ( count == 0 ) {
                # nothing to average or compare: report it instead of
                # dividing by zero or printing a made-up extreme
                print "Error. No data found in column " col | "cat 1>&2"
                exit 1
        } else if ( met == "avg" ) {
                print sum/count
        } else if ( met == "std" ) {
                avg=sum/count
                for(i in term) {
                        diff=term[i]-avg
                        std+=diff*diff
                }
                print sqrt(std/count)
        } else if ( met == "rms" ) {
                print sqrt(sum/count)
        } else if ( met == "max" ) {
                print max
        } else if ( met == "min" ) {
                print min
        }
}' ${1+"$@"}
}

# $args is left unquoted on purpose: it holds the file names joined by
# blanks and has to be split back into separate words here.
calc_stats "$column" "$method" "$fs" $args

See my script in action

$ ~/bin/calc.sh -h
Usage: /export/home/chihung/bin/calc.sh [-h] [-c column] [-m sum|avg|max|min|rms|std] [-f field_sep] [file ...]
        Default: column 1, sum, white space, standard input

$ ~/bin/calc.sh -c 1 -m avg /tmp/x
3.51955

$ ~/bin/calc.sh -c 2 -m min /tmp/x
1.0

$ ~/bin/calc.sh -c 2 -m max /tmp/x
4.4

$ awk 'BEGIN{OFS=":"}{print $1,$2}' /tmp/x | ~/bin/calc.sh -c 1 -m avg -f :
3.51955

Labels: ,

0 Comments:

Post a Comment

<< Home