Saturday, May 31, 2008

Swapping Row to Column, Code Refactoring

While I was mopping the floor this morning (I am not kidding), I realised that I could improve on my previous blog post, Swapping Row to Column, by defining a user function in AWK. Previously I tried to use a shell function to generalise the special separator code instead of repeating it for both the input and output separators. However, that was not as easy as I thought, because these separators upset the shell syntax.

In the function, I also include a few extra separators: COLON, COMMA, DOUBLEQUOTE and SINGLEQUOTE. The single quote (') is not as straightforward as the other separators because the shell will interpret it as the end of the awk program. In order for AWK to work with a single quote, I need AWK to sprintf the single quote's ASCII code (which is 39) into a local variable (sq).

Here is the revised code:

#! /bin/sh


usage()
{
 echo "Usage: $0 [-h] [-i sep] [-o sep] [input-file]"
 echo "       -h : to print this help message"
 echo "       -i : input field separator  [default: whitespace]"
 echo "       -o : output field separator [default: space]"
 echo "Note: special field separator"
 echo "      NULL, SPACE, PIPE, COLON, COMMA, SINGLEQUOTE, DOUBLEQUOTE"
}


set -- `getopt i:o:h $* 2>/dev/null`
if [ $? -ne 0 ]; then
 usage
 exit 1
fi


isep="[ \t]+"
osep=" "
for i in $*; do
 case $i in 
  -i)
   isep=$2
   shift 2
   ;;
  -o)
   osep=$2
   shift 2
   ;;
  -h)
   usage
   exit 0
   ;;
  --)
   shift
   ;;
 esac
done



gawk -v isep="$isep" -v osep="$osep" '
function separator(sep, sq)
{
 if ( sep == "NULL" ) {
  return ""
 }
 if ( sep == "SPACE" ) {
  return " "
 }
 if ( sep == "PIPE" ) {
  return "|"
 }
 if ( sep == "COMMA" ) {
  return ","
 }
 if ( sep == "COLON" ) {
  return ":"
 }
 if ( sep == "DOUBLEQUOTE" ) {
  return "\""
 }
 if ( sep == "SINGLEQUOTE" ) {
  # you cannot return a single quote because the shell will
  # think that you are trying to close the awk command
  sq=sprintf("%c",39)
  return sq
 }
 return sep
}
BEGIN {
 FS=separator(isep)
 max=0
}
{
 for ( i=1 ; i<=NF ; ++i ) {
  a[i,NR]=$i
 }
 if ( NF > max ) { max=NF }
}
END {
 for ( i=1 ; i<=max ; ++i ) {
  for ( j=1 ; j<NR ; ++j ) {
   printf("%s%s", a[i,j], separator(osep))
  }
  print a[i,j]

 }
}' $1

Labels: , ,

0 Comments:

Post a Comment

<< Home