This file is indexed.

/usr/lib/irstlm/bin/mdtsel.sh is in irstlm 5.80.03-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#! /bin/bash 

#/******************************************************************************
#IrstLM: IRST Language Model Toolkit
#Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
#
#This library is free software; you can redistribute it and/or
#modify it under the terms of the GNU Lesser General Public
#License as published by the Free Software Foundation; either
#version 2.1 of the License, or (at your option) any later version.
#
#This library is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#Lesser General Public License for more details.
#
#You should have received a copy of the GNU Lesser General Public
#License along with this library; if not, write to the Free Software
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA
#
#******************************************************************************/

# mdtsel.sh
# by M. Federico
# Copyright Marcello Federico, Fondazione Bruno Kessler, 2012


# Turn on bash monitor mode (job control) for this script; the per-chunk
# scoring jobs further down are launched in the background.
set -m #enable job control

# usage - print the command-line help text to standard output.
# Arguments: none. The program name shown is the basename of $0.
# Returns:   the exit status of cat (0 on success).
usage()
{
    # Keep the name local so that calling usage does not create a global.
    local cmnd
    cmnd=$(basename -- "$0")
    cat << EOF

$cmnd - performs data selection assuming an indomain corpus and
        a very large out of domain corpus.

USAGE:
       $cmnd [options]

DESCRIPTION.
       This command performs data selection assuming an indomain
       corpus and a very large out of domain corpus.
       Both corpora must contain one sentence in each line delimited
       with <s> and </s>. The process produces a file of scores.


OPTIONS:
       -h        Show this message
       -v        Verbose
       -i        In-domain corpus 
       -o        Out-domain corpus
       -s        Scores output file 
       -x        Whether out-domain lines are indexed (value passed to dtsel, default 0)
       -l        Log file (default /dev/null)
       -w        Temporary work directory (default /tmp)
       -j        Number of jobs (default 6)
       -m        Data selection model (1 or 2, default 2)
       -f        Word frequency threshold (default 2)
       -n        Ngram order to use (n>=1 default 3)
       -d        Vocabulary size upper bound (default 10000000)   
       -c        Cross-validation parameter (cv>=1, default 1)

EOF
}


# The IRSTLM environment variable must be set and non-empty; everything
# below is resolved relative to it. Quoting keeps the test well-formed even
# when the path contains whitespace.
if [[ -z "${IRSTLM:-}" ]]; then
   echo "Set IRSTLM environment variable with path to irstlm" >&2
   exit 2
fi

#paths to scripts and commands in irstlm
scr=$IRSTLM/bin
bin=$IRSTLM/bin

#check irstlm installation: the dtsel binary must be present
if [[ ! -e "$bin/dtsel" ]]; then
   echo "$IRSTLM does not contain a proper installation of IRSTLM" >&2
   exit 3
fi

#default parameters
# The three file variables carry the names that the option parser and the
# rest of the script actually read, so that a value inherited from the
# caller's environment can never be mistaken for a command-line argument.
indfile=""            # in-domain corpus (-i), mandatory
outdfile=""           # out-domain corpus (-o), mandatory
scorefile=""          # scores output file (-s), mandatory
workdir=/tmp          # temporary work directory (-w)
logfile="/dev/null"   # log file for the background jobs (-l)
jobs=6                # number of parallel jobs (-j)
model=2               # data selection model (-m)
minfreq=2             # word frequency threshold (-f)
ngramorder=3          # n-gram order (-n)
cv=1                  # cross-validation parameter (-c)
dub=10000000          # vocabulary size upper bound (-d)

verbose=""            # set to "--verbose" by -v
useindex=0            # value forwarded to dtsel -x (-x)

# Parse the command-line options. A letter followed by ':' in the option
# string takes an argument, which getopts delivers in OPTARG. The option
# string is delimited by plain ASCII double quotes so that only the listed
# letters are accepted as options.
while getopts "hvi:o:s:l:w:j:m:f:n:c:d:x:" OPTION
do
     case $OPTION in
         h)
             usage
             exit 1
             ;;
         v)
             verbose="--verbose"
             ;;
         i)
             indfile=$OPTARG
             ;;
         o)
             outdfile=$OPTARG
             ;;
         s)
             scorefile=$OPTARG
             ;;
         l)
             logfile=$OPTARG
             ;;
         w)
             workdir=$OPTARG
             ;;
         j)
             jobs=$OPTARG
             ;;
         m)
             model=$OPTARG
             ;;
         n)
             ngramorder=$OPTARG
             ;;
         f)
             minfreq=$OPTARG
             ;;
         c)
             # NOTE(review): cv is stored here but is not forwarded to dtsel
             # by this script - confirm how it is meant to be used.
             cv=$OPTARG
             ;;
         d)
             dub=$OPTARG
             ;;
         x)
             useindex=$OPTARG
             ;;
         *)
             # getopts reports an unknown option or a missing argument as '?'.
             usage
             exit 1
             ;;
     esac
done


# In verbose mode (-v) print the effective settings before doing any work.
# The values are quoted so that they are shown verbatim, without word
# splitting or pathname expansion.
if [[ -n "$verbose" ]]; then
  echo "indfile= $indfile outdfile= $outdfile scorefile= $scorefile useindex= $useindex"
  echo "logfile= $logfile workdir= $workdir"
  echo "jobs= $jobs model= $model ngramorder= $ngramorder minfreq= $minfreq dub=$dub"
fi

# The in-domain corpus, the out-domain corpus and the score file are all
# mandatory; show the help text and stop if any of them is missing.
if [[ -z "$indfile" || -z "$outdfile" || -z "$scorefile" ]]; then
    usage
    exit 5
fi

# Never overwrite an existing score file.
if [[ -e "$scorefile" ]]; then
   echo "Output score file $scorefile already exists! either remove or rename it." >&2
   exit 6
fi

# Never append to an existing log file; /dev/null and /dev/stdout always
# exist and are explicitly allowed.
if [[ "$logfile" != "/dev/null" && "$logfile" != "/dev/stdout" && -e "$logfile" ]]; then
   echo "Logfile $logfile already exists! either remove or rename it." >&2
   exit 7
fi

# Create the temporary work directory when it does not exist yet, and
# remember that this script created it (workdir_created=1) so that it can be
# removed again at the end of the run.
workdir_created=0

if [[ ! -d "$workdir" ]]; then
   echo "Temporary work directory $workdir does not exist"
   echo "creating $workdir"
   if ! mkdir -p -- "$workdir"; then
      echo "Cannot create temporary work directory $workdir" >&2
      exit 8
   fi
   workdir_created=1
fi


#get process id to name process specific temporary files
pid=$$

# The number of jobs is used as a divisor below, so it must be a positive
# integer.
if ! [[ "$jobs" =~ ^[1-9][0-9]*$ ]]; then
   echo "Number of jobs must be a positive integer (got '$jobs')" >&2
   exit 9
fi

#compute size of out domain corpus and block size of split
lines=$(wc -l < "$outdfile")
size=$(( (lines + 1000) / jobs )) #to avoid any small block
if (( size < 1 )); then
   size=1   # split does not accept a line count of zero
fi

#perform split
split -l "$size" "$outdfile" "$workdir/dtsel${pid}-files-"

# Score every chunk in its own background subshell. All output of a job,
# including errors, is appended to the log file.
for file in "$workdir"/dtsel${pid}-files-*
do
  echo "$file"
  (
    "$bin/dtsel" -x="$useindex" -i="$indfile" -o="$file" -s="${file}.scores" -n="$ngramorder" -dub="$dub" -f="$minfreq" -m="$model"
    # Replace a leading "nan" score by the number 1000 before sorting
    # (presumably so that undefined scores rank last - TODO confirm), then
    # sort the chunk numerically so that the final step can merge.
    perl -pe 's/^nan /1000 /' "${file}.scores" | sort -g > "${file}.scores.tmp"
    mv "${file}.scores.tmp" "${file}.scores"
  ) >> "$logfile" 2>&1 &
done

# Wait for all parallel jobs to finish
wait

# Merge the already-sorted per-chunk score files into the final output, then
# remove the temporary files (and the work directory if it was created by
# this script).
sort -g -m "$workdir"/dtsel${pid}-files-*.scores > "$scorefile"
rm "$workdir"/dtsel${pid}-files-*
if [[ "$workdir_created" == 1 ]]; then
   rmdir "$workdir"
fi