/usr/share/doc/spambayes/utilities/split.py is in spambayes 1.1a6-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#! /usr/bin/env python
"""Split an mbox into two files based on a given percentage.
This script will troll through a Unix mbox file randomly assigning each
message to one of two bins. The split is based on a given float percentage.
E.g.
% split.py sourcembox 20 mbox20 mbox80
yields two mbox files, where mbox20 contains approximately 20% of the messages
and mbox80 contains 80% of the messages. Messages are assigned to each bin
randomly.
Usage: %(programs)s [options] sourcembox percent file1 file2
Options:
-h / --help
Print this help message and exit
file1 and file2 are where the output goes. Approximately percent % of
messages will go to file1 and (100 - percent) % of messages will go to file2.
percent is a floating point number greater than 0 and less than 100.
sourcembox is a Unix mailbox file. All arguments except -h/--help are required.
"""
import sys
import random
import mailbox
import getopt
from spambayes import mboxutils
# Path this script was invoked with (argv[0]).
# NOTE(review): the usage line in the module docstring carries a
# "%(programs)s" placeholder, presumably meant to be filled from this value.
program = sys.argv[0]
def usage(code, msg=''):
    """Write the module help text, and an optional message, to stderr; exit.

    Args:
        code: Exit status passed to sys.exit().
        msg: Optional extra text written after the help.  Any object that
            can be %s-formatted works, including an exception instance.

    Raises:
        SystemExit: always, carrying ``code``.
    """
    # The help text holds a "%(programs)s" placeholder but also several
    # literal '%' characters ("% split.py ...", "percent % of"), so it cannot
    # safely be %-formatted as a whole; substitute the placeholder explicitly.
    # __doc__ is None when docstrings are stripped (python -OO).
    text = (__doc__ or '').replace('%(programs)s', program)
    # sys.stderr.write is used instead of "print >>" so the function behaves
    # the same on Python 2 and Python 3.
    sys.stderr.write(text + '\n')
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(code)
def main():
    """Parse the command line and randomly split an mbox into two files.

    Expects ``sourcembox percent file1 file2`` in sys.argv[1:].  Each message
    read from sourcembox is written to file1 when a fresh random.random()
    draw is below percent/100, and to file2 otherwise.  Missing or invalid
    arguments are reported through usage(), which exits the process.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h', ['help'])
    except getopt.error:
        # Fetch the exception through sys.exc_info() so this clause parses on
        # every Python version (neither "except E, msg" nor "except E as msg").
        usage(1, sys.exc_info()[1])

    for opt, _arg in opts:
        if opt in ('-h', '--help'):
            usage(0)

    try:
        mboxfile = args[0]
        percent = float(args[1])
        if not (0 < percent < 100):
            raise ValueError
        # Convert the percentage into a probability for random.random().
        percent /= 100.0
        bin1 = args[2]
        bin2 = args[3]
    except IndexError:
        usage(1, 'Not enough arguments')
    except ValueError:
        # The message states the range the check above actually enforces.
        usage(1, 'Percent argument must be a float greater than 0 '
                 'and less than 100')

    # Open the source first: if it is missing or unreadable we fail before
    # the output files are created/truncated.
    infp = open(mboxfile, 'rb')
    try:
        bin1out = open(bin1, 'wb')
        try:
            bin2out = open(bin2, 'wb')
            try:
                mbox = mailbox.PortableUnixMailbox(infp,
                                                   mboxutils.get_message)
                for msg in mbox:
                    if random.random() < percent:
                        outfp = bin1out
                    else:
                        outfp = bin2out
                    astext = str(msg)
                    # Each message must end with a newline, otherwise the
                    # next one would be glued onto it in the output file.
                    assert astext.endswith('\n')
                    outfp.write(astext)
            finally:
                # Close each file exactly once, even on error.  There is no
                # separate outfp.close(): outfp is only an alias for one of
                # the two bins and is unbound when the mailbox is empty.
                bin2out.close()
        finally:
            bin1out.close()
    finally:
        infp.close()
# Run the splitter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|