This file is indexed.

/usr/lib/irstlm/bin/goograms2ngrams.pl is in irstlm 6.00.05-2.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /usr/bin/perl

#*****************************************************************************
# IrstLM: IRST Language Model Toolkit
# Copyright (C) 2007 Marcello Federico, ITC-irst Trento, Italy

# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA

#******************************************************************************



#transforms google n-grams into real n-grams so that  counts are
#consistent with respect to lower order n-grams

use strict;
use Getopt::Long "GetOptions";

my $gzip=`which gzip 2> /dev/null`;
my $gunzip=`which gunzip 2> /dev/null`;
chomp($gzip);
chomp($gunzip);

my $cutoffword="<CUTOFF>"; #special word for Google 1T-ngram cut-offs
my $blocksize=10000000; #this is the blocksize of produced n-grams
my $from=2;             #starting n-gram level

my($help,$verbose,$maxsize,$googledir,$ngramdir)=();

$help=1 unless
&GetOptions('maxsize=i' => \$maxsize,
			'startfrom=i' => \$from,
			'googledir=s' => \$googledir,
			'ngramdir=s' => \$ngramdir,
			'h|help' => \$help,
			'verbose' => \$verbose);


if ($help || !$maxsize || !$googledir || !$ngramdir ) {
	my $cmnd = "goograms2ngrams.pl";
  print "\n$cmnd - transforms google n-grams into real n-grams so that\n",
	"       counts are consistent with respect to lower order n-grams\n",
	"\nUSAGE:\n",
	"       $cmnd [options]\n",
	"\nOPTIONS:\n",
    "       --maxsize <int>       maximum n-gram level of conversion\n",
    "       --startfrom <int>     skip initial levels if already available (default 2)\n",
    "       --googledir <string>  directory containing the google-grams dirs (1gms,2gms,...)\n",
    "       --ngramdir <string>   directory where to write the n-grams \n",
    "       --verbose             (optional) very talktive output\n",
    "       -h, --help            (optional) print these instructions\n",
    "\n";

  exit(1);
}

warn "goograms2ngrams: maxsize $maxsize from $from googledir $googledir ngramdir $ngramdir \n"
if $verbose;

die "goograms2ngrams: value of --maxsize must be between 2 and 5\n" if $maxsize<2 || $maxsize>5;
die "goograms2ngrams: cannot find --googledir $googledir \n" if ! -d $googledir;
die "goograms2ngrams: cannot find --ngramdir  $ngramdir \n" if ! -d $ngramdir;


my ($n,$hgrams,$ggrams,$ngrams)=();
my ($ggr,$hgr,$hgrcnt,$ggrcnt,$totggrcnt)=();
my (@ggr,@hgr)=();

foreach ($n=$from;$n<=$maxsize;$n++){
  
  my $counter=0;
  	
  warn "Converting google-$n-grams into $n-gram\n"; 

  $hgrams=($n==2?"${googledir}/1gms/vocab.gz":"${ngramdir}/".($n-1)."grams-*.gz");
  open(HGR,"$gunzip -c $hgrams |") || die "cannot open $hgrams\n";
  
  $ggrams="${googledir}/".($n)."gms/".($n)."gm-*";
  open(GGR,"$gunzip -c $ggrams |") || die "cannot open $ggrams\n";
   
  my $id = sprintf("%04d", 0);
  $ngrams="${ngramdir}/".($n)."grams-${id}.gz";

  next if -e $ngrams; #go to next step if file exists already;
  open(NGR,"|$gzip -c > $ngrams ")  || die "cannot open $ngrams\n";

  chop($ggr=<GGR>); @ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
  #warn "ggr: ",$ggrcnt," ",join(" ",@ggr[0..$n-1]),"\n";

  while ($hgr=<HGR>){	

	$counter++;
	printf(STDERR ".") if ($counter % 1000000)==0;
	  
	chop($hgr); @hgr=split(/[ \t]/,$hgr); $hgrcnt=(pop @hgr);
    #warn "hgr: ",$hgrcnt," ",join(" ",@hgr[0..$n-2]),"\n";
  
	if (join(" ",@hgr[0..$n-2]) eq join(" ",@ggr[0..$n-2])){ 

		$totggrcnt=0;
		do{
			$totggrcnt+=$ggrcnt;
			print NGR join(" ",@ggr[0..$n-1])," ",$ggrcnt,"\n";
			chop($ggr=<GGR>);@ggr=split(/[ \t]/,$ggr);$ggrcnt=(pop @ggr);
		}until (join(" ",@hgr[0..$n-2]) ne join(" ",@ggr[0..$n-2]));

		if ($hgrcnt > $totggrcnt){
			#warn "difference: $hgrcnt $totggrcnt =",$hgrcnt-$totggrcnt,"\n";
			print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt-$totggrcnt,"\n";
		}
	}
	else{ 
		#warn "fully pruned context: $hgr\n";
		print NGR join(" ",@hgr[0..$n-1])," ",$cutoffword," ",$hgrcnt,"\n";
	}	
	
	if (($counter % $blocksize)==0){ 
		close(NGR);
		my $id = sprintf("%04d", int($counter / $blocksize));
		$ngrams="${ngramdir}/".($n)."grams-${id}.gz";
		open(NGR,"|$gzip -c > $ngrams ")  || die "cannot open $ngrams\n";	
	}
	
  }

  close(HGR);close(NGR);close(GGR);
  
}