/usr/bin/gbdummyfy is in gbutils 5.7.0-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/bin/sh
# gbdummyfy ver. 5.6 Copyright (C) 2009-2015 Giulio Bottazzi
#default settings
pos=1
del="no"
#the three flags below are compared against "yes" later on; they are
#initialised explicitly so that variables with the same name inherited
#from the environment cannot switch them on
help="no"
version="no"
verbose="no"
#read command line options; the position of the last option is saved
#in OPTIND
#the "-:" entry of the option string makes getopts report "--word" as
#option "-" with OPTARG set to "word": this is how --help and --version
#are recognised
while getopts "c:d:hv-:" opt; do
  case "$opt" in
    -)
      #long options; any other word is silently ignored
      case "$OPTARG" in
        help) help="yes" ;;
        version) version="yes" ;;
      esac
      ;;
    c) pos=$OPTARG ;;     #column holding the labels
    d) del=$OPTARG ;;     #dummy column to remove
    v) verbose="yes" ;;
    h) help="yes" ;;
    \?) help="yes" ;;     #unknown option: fall back to the help text
  esac
done
#--version: print the version banner and the license notice, then stop
case "$version" in
  yes)
    #the delimiter is quoted: the banner is emitted verbatim
    cat <<'EOF'
gbdummyfy ver. 5.6
Copyright (C) 2009-2015 Giulio Bottazzi
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
(version 2) as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
Written by Giulio Bottazzi
Report bugs to <gbutils@googlegroups.com>
Package home page <http://cafim.sssup.it/~giulio/software/gbutils/index.html>
EOF
    exit
    ;;
esac
#help requested (-h, --help, or an unknown option): print the manual
#text and stop
#the here-document delimiter is unquoted, so the backticks of the two
#sample command substitutions are escaped to be printed literally
if [ "$help" = "yes" ]; then
  cat <<EOF
This command reads from standard input a text file with space
separated columns. The entries in one column (the first by default) are
considered labels and expanded into a matrix of dummies, i.e. of 0
and 1 values. The number of columns of the matrix is equal to the
number of different labels. Each row contains '1' in the place of the
associated labels in the sorted list of labels, and '0' everywhere
else. Since in general one less dummy variable is required than the
number of labels, you can remove one column of dummies using the
option '-d'.
Usage: gbdummyfy [options]
Options:
-h print this help
-c set the column of labels (default 1)
-d which column to remove, counting from 1 (default none)
-v print the labels and associated positions to standard error
--help same as -h
--version print version and license information
Examples:
printf 'a 1\nb 2\n' | gbdummyfy create a 2x3 matrix with dummy values
relative to labels 'a' and 'b'
This program requires gawk. Notice that it simply expands the
data adding new columns. When using the resulting matrix
in other utilities, the user should specify explicitly which dummy
variables to use and how.
A simple linear dependency can be automatically generated for 'gblreg'
by inserting the following expression in the functional specification
\`seq 3 12 | sed 's/\(.*\)/\+d\1\*x\1/' | tr -d '\n'\`
and
\`seq 3 12 | sed 's/\(.*\)/,\1=0/' | tr -d '\n'\`
among the initial conditions. In this case there are 10 different values
for the dummy. They occupy column positions from 3 to 12 and their initial
value is zero.
EOF
  exit
fi
#the label column must be a positive integer: reject anything else here,
#before it reaches the arithmetic expansions and the cut calls below
case "$pos" in
  '' | *[!0-9]* | 0)
    echo "gbdummyfy: option -c requires a positive column number" >&2
    exit 1
    ;;
esac
#create temporary files; mktemp is used in place of the deprecated,
#Debian-specific 'tempfile' utility
dataorig=$(mktemp) || exit 1
initial=$(mktemp) || exit 1
final=$(mktemp) || exit 1
dummies=$(mktemp) || exit 1
newdummies=$(mktemp) || exit 1
#remove the temporary files on every exit path; the signal traps turn
#an interruption into a regular exit so that the EXIT trap also runs
trap 'rm -f -- "$dataorig" "$initial" "$final" "$dummies" "$newdummies"' EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM
#store the original data removing possible initial empty spaces and
#shrinking any number of spaces and tab in one single space
tr '\t' ' ' | tr -s ' ' | sed 's/^ *//' > "$dataorig"
#count the number of columns, looking at the first line only; with
#empty input nothing is printed, which is treated as zero columns
colnum=$(head -n 1 "$dataorig" | gawk '{print NF}')
colnum=${colnum:-0}
#separate the input file in three parts: before the column of dummies
if [ "$pos" -gt 1 ]; then
  cut -d ' ' -f "1-$((pos - 1))" < "$dataorig" > "$initial"
fi
#the column of dummies itself
cut -d ' ' -f "$pos" < "$dataorig" > "$dummies"
#after the column of dummies
if [ "$pos" -lt "$colnum" ]; then
  cut -d ' ' -f "$((pos + 1))-" < "$dataorig" > "$final"
fi
#######################################
# Turn a column of labels into a matrix of 0/1 dummies.
# gawk is called explicitly because asorti() is a gawk extension.
# Globals:   verbose (read) - when it matches "yes" the sorted labels
#            and their positions are printed to standard error
# Arguments: files holding one label per line (standard input if none)
# Outputs:   one row per input line on standard output, with one
#            column per distinct label; every entry, the last one
#            included, is followed by a single space
#######################################
expand_labels() {
  gawk -v verbose="$verbose" '
    {
      #remember the label found on each line, in input order
      line[NR] = $1
      #collect the set of distinct labels
      label[$1] = 1
    }
    END {
      #sort the distinct labels; labsort[k] is the k-th label
      labnum = asorti(label, labsort)
      #print labels and positions if verbose
      if (verbose ~ "yes")
        for (i = 1; i <= labnum; i++)
          print labsort[i], i | "cat 1>&2"
      #build, for the i-th label, a row with 1 in position i and 0
      #everywhere else, and store it in place of the marker value
      for (i = 1; i <= labnum; i++) {
        string = ""
        for (j = 1; j <= labnum; j++)
          string = string (j == i ? "1 " : "0 ")
        label[labsort[i]] = string
      }
      #print the row associated with the label of each input line
      for (i = 1; i <= NR; i++)
        print label[line[i]]
    }
  ' "$@"
}
expand_labels "$dummies" > "$newdummies"
#######################################
# Remove one column from a space separated matrix, in place.
# The body runs in a subshell, so no variable leaks to the caller.
# Arguments: $1 - file holding the matrix
#            $2 - column to remove, counting from 1
# Outputs:   a warning on standard error when $2 is not the number of
#            an existing column; the file is then left untouched
# Returns:   non-zero only if the scratch file cannot be created
#######################################
drop_dummy_column() (
  matrix=$1
  column=$2
  #number of columns, read from the first row
  width=$(head -n 1 "$matrix" | awk '{print NF}')
  #translate the request into a field list for cut; the list stays
  #empty when the request is not valid
  fields=""
  case "$column" in
    '' | *[!0-9]*) ;;               #not a plain number
    1) fields="2-" ;;
    *)
      if [ "$column" -gt 1 ]; then
        if [ "$column" = "$width" ]; then
          fields="1-$((width - 1))"
        elif [ "$column" -lt "${width:-0}" ]; then
          fields="1-$((column - 1)),$((column + 1))-"
        fi
      fi
      ;;
  esac
  if [ -n "$fields" ]; then
    #cut cannot write onto its own input: go through a scratch file
    storage=$(mktemp) || exit 1
    cut -d ' ' -f "$fields" < "$matrix" > "$storage"
    cat "$storage" > "$matrix"
    rm -f -- "$storage"
  else
    echo "gbdummyfy: wrong column spec in option -d; request ignored" >&2
  fi
)
#delete a column
if [ "$del" != "no" ]; then
  drop_dummy_column "$newdummies" "$del"
fi
#rebuild the file: the matrix of dummies takes the place of the column
#of labels, between the columns that came before it and those that came
#after it; all paths are quoted so that a temporary directory whose name
#contains spaces or glob characters does not break the commands
if [ "$colnum" = 1 ]; then
  #the labels were the only column
  cat "$newdummies"
elif [ "$pos" = 1 ]; then
  #nothing before the labels
  paste -d ' ' "$newdummies" "$final"
elif [ "$pos" = "$colnum" ]; then
  #nothing after the labels
  paste -d ' ' "$initial" "$newdummies"
else
  paste -d ' ' "$initial" "$newdummies" "$final"
fi
#remove temporary files
rm -f -- "$dataorig" "$initial" "$final" "$dummies" "$newdummies"
|