/usr/lib/irstlm/bin/split-ngt.sh is in irstlm 5.80.03-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#! /bin/bash
# usage - print the command-line help text for this script on stdout.
#
# Takes no arguments. The script name shown in the text is the basename of
# $0. Callers decide where the text goes (e.g. `usage >&2`).
function usage()
{
  # Keep the name local so calling usage does not leak a global variable, and
  # quote $0 so a script path containing whitespace is not word-split.
  local cmnd
  cmnd=$(basename -- "$0")
  cat<<EOF
$cmnd - creates partition files with ngram statistics in Google format
USAGE:
$cmnd [options] <input> <output> <order> <parts>
DESCRIPTION:
<input> Input file name
<output> Partition files name prefix
<order> Order of the ngrams
<parts> Number of partitions
OPTIONS:
-h Show this message
EOF
}
# Parse options.
#   -h  print the help text (on stderr, as before) and exit with status 0.
# Any option getopts does not recognise prints the help text and exits with
# status 1 instead of falling through to the main body.
while getopts h OPT; do
  case "$OPT" in
    h)
      usage >&2
      exit 0
      ;;
    *)
      # getopts has already written its own diagnostic for the bad option.
      usage >&2
      exit 1
      ;;
  esac
done
# Drop everything getopts consumed (including a "--" terminator) so that only
# the positional arguments <input> <output> <order> <parts> remain in "$@".
shift $((OPTIND - 1))
# usage:
#   split-ngt.sh <input> <output> <order> <parts>
# It creates <parts> files (named <output.000>, ... <output.999>)
# containing ngram statistics (of <order> length) in Google format.
# These files are a partition of the whole set of ngrams.

# The helper programs are located relative to $IRSTLM. Without it the paths
# below would collapse to /bin/... and /scripts/..., so refuse to continue.
if [[ -z "${IRSTLM:-}" ]]; then
  echo "$0: the IRSTLM environment variable is not set" >&2
  exit 2
fi
basedir=$IRSTLM
bindir=$basedir/bin
scriptdir=$basedir/scripts

# Collect the command-line arguments and echo each one (diagnostic output).
par=("$@")
for arg in "${par[@]}"; do
  echo "$0: arg $arg"
done

# All four positional arguments are needed below; extra ones are ignored.
if (( ${#par[@]} < 4 )); then
  echo "$0: expected <input> <output> <order> <parts>, got ${#par[@]} argument(s)" >&2
  exit 1
fi

inputfile=${par[0]}
outputfile=${par[1]}
order=${par[2]}
parts=${par[3]}

# Intermediate files live in the current directory and are named after this
# shell's PID. The trap removes any leftovers on every exit path.
dictfile=dict$$
trap 'rm -f -- "$dictfile" "$dictfile".*' EXIT

# Run dict on the input, writing its result to $dictfile.
# NOTE(review): the exit statuses of dict, split-dict.pl and ngt are not
# checked because their conventions are not visible here; the files they are
# expected to produce are checked instead.
"$bindir/dict" -i="$inputfile" -o="$dictfile" -f=y -sort=n
if [[ ! -e "$dictfile" ]]; then
  echo "$0: $bindir/dict did not produce $dictfile" >&2
  exit 1
fi

# Split $dictfile into $parts pieces; they are expected to appear as
# ${dictfile}.<suffix>, which is what the loop below looks for.
"$scriptdir/split-dict.pl" --input "$dictfile" --output "${dictfile}." --parts "$parts"
rm -- "$dictfile"

# One ngt run per piece. The trailing ".<digits>" of the piece name is
# appended to <output> to form the name of the partition file.
suffix_re='(\.[0-9]+)$'
processed=0
for d in "$dictfile".*; do
  # An unmatched glob stays literal; skip it instead of parsing ls output.
  [[ -e "$d" ]] || continue
  if [[ $d =~ $suffix_re ]]; then
    w=$outputfile${BASH_REMATCH[1]}
  else
    # No numeric suffix: fall back to appending the whole piece name, which
    # is what the previous perl substitution left behind in that case.
    w=$outputfile$d
  fi
  printf '%s\n' "$bindir/ngt -i=$inputfile -n=$order -gooout=y -o=$w -fd=$d > /dev/null"
  "$bindir/ngt" -n="$order" -gooout=y -o="$w" -fd="$d" -i="$inputfile" > /dev/null
  rm -- "$d"
  processed=$((processed + 1))
done

if (( processed == 0 )); then
  echo "$0: no ${dictfile}.* pieces were produced by $scriptdir/split-dict.pl" >&2
  exit 1
fi
exit
|