Friday, November 30, 2012

Learn AWK by Examples

Here are some AWK (or rather, gawk) tricks that I normally use to tackle data-crunching problems. You may also want to find out how shell variables can be passed to awk.

Print all users in /etc/passwd whose names start with a, c, or d

$ awk -F: '
# With ":" as the separator, field 1 of each passwd entry is the login name.
$1 ~ /^[acd]/ { print $1 }
' /etc/passwd

daemon
colord
avahi-autoipd
avahi
chihung

Process stdin using range patterns (a start pattern and an end pattern)

$ printf '%s\n' BEGIN abc def END junk junk START pqrst uvwxyz STOP | awk '
# Two independent range patterns: each one fires from the line matching
# its first regex through the line matching its second regex, inclusive.
/START/,/STOP/ { printf "-->%s\n", $0 }
/BEGIN/,/END/  { printf "==>%s\n", $0 }
'

==>BEGIN
==>abc
==>def
==>END
-->START
-->pqrst
-->uvwxyz
-->STOP

Print 10 random integers N where 10 <= N < 20, using /dev/null as a dummy input file

$ awk -v n=10 -v start=10 -v end=20 '
# Print n pseudo-random integers from the half-open range [start, end).
BEGIN {
    srand()                 # no argument: seed from the time of day
    span = end - start
    k = 1
    while (k <= n) {
        # rand() is in [0, 1), and %d drops the fractional part.
        printf "%d\n", start + rand() * span
        k++
    }
}' /dev/null

17
13
15
19
10
11
19
13
11
19

Count by file types in current directory

$ ls -l |
awk '
# Tally directory entries by the type character that starts each "ls -l" line.
BEGIN {
    # Start every counter at 0 so that a type with no entries prints "0"
    # instead of an empty string.
    dirs=0
    files=0
    socks=0
    links=0
}
# Skip the "total N" summary line. The pattern is anchored to the start of
# the line so that entries whose names merely contain "total"
# (e.g. totals.csv) are still counted.
/^total/ { next }
/^d/ { ++dirs }
/^-/ { ++files }
/^s/ { ++socks }
/^l/ { ++links }
END { print "dirs=" dirs, "files=" files, "socks=" socks, "links=" links }
'

dirs=3 files=10 socks=0 links=0

Count by file types in current directory (using array)

$ ls -l |
awk '
# Tally entries in an array keyed by the first character of column 1.
{
    if ($1 == "total") next          # summary line, not an entry
    ++tally[substr($1, 1, 1)]
}
END {
    # %d prints 0 for a type that never occurred.
    printf "dirs=%d files=%d socks=%d links=%d\n",
        tally["d"], tally["-"], tally["s"], tally["l"]
}
'

dirs=3 files=10 socks=0 links=0

Calculate total size of all .gz files in /usr/share directory

$ find /usr/share -type f -name "*.gz" -ls |
awk '
# Sum field 7 of the "find -ls" listing, which holds the file size in bytes.
{
    s+=$7
}
END {
    # awk printf has no "l" length modifier (gawk merely tolerates it and
    # rejects it under --posix), so the portable conversion is plain %f.
    printf("%.2f MB\n", s/(1024*1024))
}
'

63.08 MB

Count files by users in /home directory

$ find /home -mount -type f -ls |
awk '
# Per owner (field 5 of the listing): count the files and add up field 7.
{
    owner = $5
    nfiles[owner]++
    bytes[owner] += $7
}
END {
    for (owner in nfiles)
        printf "User=%s Count=%d Size=%s\n", owner, nfiles[owner], bytes[owner]
}
'

User=chihung Count=16301 Size=6346658343
User=root Count=10 Size=80109

Print all users whose names start with e, f, or g, together with their corresponding group names. The group-id-to-name mapping is stored in the gid2name array while processing the first file, /etc/group. (Note: I present two similar ways to do the same task.)

$ awk -F: '
# NR equals FNR only while the first file (/etc/group) is being read:
# remember gid -> group name, then move straight on to the next record.
FNR == NR {
    gid2name[$3] = $1
    next
}
# Every later record comes from /etc/passwd: keep logins starting with e-g.
/^[e-g]/ {
    print $1, gid2name[$4]
}' /etc/group /etc/passwd

games games
gnats gnats
gdm gdm
games games
gnats gnats
gdm gdm

$ awk -F: '
# Decide what to do with each record from the name of the file it came from.
{
    if (FILENAME == "/etc/group") {
        gid2name[$3] = $1                 # gid -> group name
    } else if (FILENAME == "/etc/passwd" && $0 ~ /^[e-g]/) {
        print $1, gid2name[$4]            # login name, primary group name
    }
}' /etc/group /etc/passwd

games games
gnats gnats
gdm gdm
games games
gnats gnats
gdm gdm


Count all file extensions in /usr/include directory

$ find /usr/include -mount -type f |
awk -F/ '
# With "/" as the field separator, the last field is the file name.
{
    basename=$NF
    # Split the name on dots; the last piece is the extension.
    n=split(basename, arr, ".")
    if ( n>1 ) {    # a name without a dot has no extension to count
        ext=arr[n]
        ++summary[ext]
    }
}
END {
    for ( i in summary ) {
        print i, summary[i]
    }
}
'

h 4335
def 1
x 17
hpp 245
c 6
tcc 37

Multi-line record with blank line(s) as separator

$ cat ~/.mozilla/firefox/profiles.ini 
[General]
StartWithLastProfile=1

[Profile0]
Name=default
IsRelative=1
Path=5d0x3te1.default

$ awk '
# An empty RS makes runs of blank lines separate the records, and
# FS="\n" turns every line of a record into one field.
BEGIN { RS = ""; FS = "\n" }
{
    # The "NF=" label carries the field index i, not the NF variable.
    i = 0
    while (++i <= NF)
        print "NR=" NR, "NF=" i, "Data=" $i
}' ~/.mozilla/firefox/profiles.ini

NR=1 NF=1 Data=[General]
NR=1 NF=2 Data=StartWithLastProfile=1
NR=2 NF=1 Data=[Profile0]
NR=2 NF=2 Data=Name=default
NR=2 NF=3 Data=IsRelative=1
NR=2 NF=4 Data=Path=5d0x3te1.default

Print all the section headers in a .ini file, using a user-defined function to remove the square brackets

$ cat ~/.mozilla/firefox/profiles.ini 
[General]
StartWithLastProfile=1

[Profile0]
Name=default
IsRelative=1
Path=5d0x3te1.default

$ awk '
# Return the argument with every square bracket removed.
function rmsq(n) {
    gsub(/[][]/, "", n)     # one bracket expression matching "]" or "["
    return n
}
# Blank-line separated records, one line per field: field 1 of each
# record is its first line, the section header.
BEGIN { RS = ""; FS = "\n" }
{
    print rmsq($1)
}' ~/.mozilla/firefox/profiles.ini

General
Profile0

Labels: