This file is indexed.

/usr/lib/python2.7/dist-packages/chemfp/encodings.py is in python-chemfp 1.1p1-2.1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
"""chemfp.decoders - decode different fingerprint representations into chemfp form

The chemfp fingerprints are stored as byte strings, with the bytes in
least-significant bit order (bit #0 is stored in the first/left-most
byte) and with the bits in most-significant bit order (bit #0 is
stored in the first/right-most bit of the first byte).

Other systems use different encodings. These include:
  - the '0' and '1' characters, as in '00111101'
  - hex encoding, like '3d'
  - base64 encoding, like 'SGVsbG8h'
  - CACTVS's variation of base64 encoding

plus variations of different LSB and MSB orders.

This module decodes most of the fingerprint encodings I have come
across. The fingerprint decoders return a 2-ple of the bit length and
the chemfp fingerprint. The bit length is None unless the bit length
is known exactly, which currently is only the case for the binary and
CACTVS fingerprints. (The hex and other decoders cannot know the exact
bit length, only that it is a multiple of 8 bits.)

"""
import string
import binascii

# Lookup tables; they start out empty and are filled in by _init().
_lsb_bit_table = {} # "10000000" -> 1
_msb_bit_table = {} # "00000001" -> 1

# Translation table which reverses the bits of every byte; set by _init().
_reverse_bits_in_a_byte_transtable = None

# The sixteen 4-bit patterns for the values 0..15, written in lsb order
# (the left-most character is bit 0):
#   "0000", "1000", "0100", "1100", ..., "0111", "1111"
_lsb_4bit_patterns = tuple(format(nibble, "04b")[::-1] for nibble in range(16))

def _lsb_8bit_patterns():
    """Yield the 256 8-character bit patterns, in lsb order, for 0, 1, ..., 255

    The sequence is '00000000', '10000000', '01000000', ... ,
    '01111111', '11111111', so the n-th pattern yielded describes the
    byte value n.
    """
    # The low nibble is written first (left) and must change fastest,
    # so it is the inner loop.
    for high_nibble in _lsb_4bit_patterns:
        for low_nibble in _lsb_4bit_patterns:
            yield low_nibble + high_nibble

def _init():
    """Fill in the module-level bit pattern tables and the bit-reversal table

    For each byte value this registers, in _lsb_bit_table and
    _msb_bit_table, the full 8-character pattern plus every shorter
    form made by dropping high-order "0"s (trailing "0"s for lsb
    order, leading "0"s for msb order). The all-zero pattern shrinks
    all the way down to the empty string, so "" is also a key (mapped
    to chr(0)) in both tables.

    It also sets _reverse_bits_in_a_byte_transtable to a translation
    table mapping each byte to the byte with its 8 bits reversed.
    """
    global _reverse_bits_in_a_byte_transtable

    reversed_bytes = []
    for value, lsb_bits in enumerate(_lsb_8bit_patterns()):
        byte = chr(value)
        # Reading the lsb-ordered pattern as an ordinary binary number
        # gives the byte value with its bits in the opposite order.
        reversed_bytes.append(chr(int(lsb_bits, 2)))

        # Include the forms with trailing 0s removed:
        # 10000000, 1000000, 100000, 10000, 1000, 100, 10 and 1 are all 0x01
        # (RDKit fingerprint lengths don't need to be a multiple of 8)
        shortened = lsb_bits
        _lsb_bit_table[shortened] = byte
        while shortened.endswith("0"):
            shortened = shortened[:-1]
            _lsb_bit_table[shortened] = byte

        # The same for msb order, where the removable 0s are on the left
        shortened = lsb_bits[::-1]
        _msb_bit_table[shortened] = byte
        while shortened.startswith("0"):
            shortened = shortened[1:]
            _msb_bit_table[shortened] = byte

    every_byte = "".join(map(chr, range(256)))
    _reverse_bits_in_a_byte_transtable = string.maketrans(
        every_byte, "".join(reversed_bytes))
    

# Build the lookup tables once, at import time, then spot-check a few
# entries, including the shortened forms of the patterns.
_init()
assert _lsb_bit_table["10000000"] == "\x01", _lsb_bit_table["10000000"]
assert _lsb_bit_table["1000000"] == "\x01", _lsb_bit_table["1000000"]
assert _lsb_bit_table["100000"] == "\x01"
assert _lsb_bit_table["10000"] == "\x01"
assert _lsb_bit_table["1"] == "\x01"
assert _lsb_bit_table["1111111"] == "\x7f"

assert _msb_bit_table["00000001"] == "\x01"
assert _msb_bit_table["0000001"] == "\x01"
assert _msb_bit_table["000001"] == "\x01"
assert _msb_bit_table["00001"] == "\x01"
assert _msb_bit_table["1"] == "\x01"
assert _msb_bit_table["00000011"] == "\x03"
# (this used to be a second copy of the check above; test the
# shortened form of the same pattern instead)
assert _msb_bit_table["11"] == "\x03"
assert _msb_bit_table["10000000"] == "\x80"
assert _msb_bit_table["1000000"] == "\x40"


def from_binary_lsb(text):
    """Decode a string of '0' and '1' characters written in LSB order

    Bit 0 is the left-most character, so '00010101' (where bit 0 is
    off) decodes to '\\xa8'. The text is processed 8 characters at a
    time; a shorter final group is treated as if it were padded on the
    right with '0's.

    Returns a 2-ple of the number of bits (the length of the text) and
    the decoded chemfp fingerprint. Raises a ValueError if the text
    contains anything other than '0' and '1'.

    >>> from_binary_lsb('00010101')
    (8, '\\xa8')
    >>> from_binary_lsb('11101')
    (5, '\\x17')
    >>> from_binary_lsb('00000000000000010000000000000')
    (29, '\\x00\\x80\\x00\\x00')
    >>>
    """
    num_bits = len(text)
    lookup = _lsb_bit_table
    decoded = []
    try:
        for offset in xrange(0, num_bits, 8):
            decoded.append(lookup[text[offset:offset+8]])
    except KeyError:
        raise ValueError("Not a binary string")
    return (num_bits, "".join(decoded))
def from_binary_msb(text):
    """Convert a string like '10101000' (bit 0 here is off) into '\\xa8'

    The encoding characters '0' and '1' are in MSB order, so bit 0 is the right-most field.
    The result is a 2-ple of the fingerprint length and the decoded chemfp fingerprint.
    An empty string decodes to (0, ''), the same as from_binary_lsb.

    Raises a ValueError if the text contains anything other than '0' and '1'.

    >>> from_binary_msb('10101000')
    (8, '\\xa8')
    >>> from_binary_msb('00010101')
    (8, '\\x15')
    >>> from_binary_msb('00111')
    (5, '\\x07')
    >>> from_binary_msb('00000000000001000000000000000')
    (29, '\\x00\\x80\\x00\\x00')
    >>> from_binary_msb('')
    (0, '')
    >>>
    """
    # It feels like there should be a faster, more elegant way to do this.
    # While close,
    #   hex(int('00010101', 2))[2:].decode("hex")
    # does not keep the initial 0s
    N = len(text)
    if N == 0:
        # The lookup table maps "" to a zero byte (it is the fully
        # shortened form of the all-zero pattern), which would produce
        # one byte of output for zero bits of input.
        return (0, "")

    table = _msb_bit_table
    chunks = []
    end = N
    try:
        # Walk backwards from the right-hand side, 8 characters at a
        # time, because bit 0 is the right-most character and chemfp
        # stores the byte containing bit 0 first.
        while end > 8:
            chunks.append(table[text[end-8:end]])
            end -= 8
        # What remains is the left-most group of 1 to 8 characters
        chunks.append(table[text[:end]])
    except KeyError:
        raise ValueError("Not a binary string")
    return (N, "".join(chunks))


def from_base64(text):
    """Decode a base64 encoded fingerprint string

    The encoded fingerprint must be in chemfp form, with the bytes in
    LSB order and the bits in MSB order.

    Returns a 2-ple of None (the exact bit length is not known) and
    the decoded fingerprint. Raises a ValueError if the text is not
    valid base64.

    >>> from_base64("SGk=")
    (None, 'Hi')
    >>> from_base64("SGk=")[1].encode("hex")
    '4869'
    >>> 
    """
    try:
        # This is the same as doing text.decode("base64") but since I
        # need to catch the exception, I might as well work with the
        # underlying implementation code.
        return (None, binascii.a2b_base64(text))
    except binascii.Error as err:
        # Report every decoding failure as a ValueError, like the
        # other decoders in this module.
        raise ValueError(str(err))

#def from_base64_msb(text):
#    return (None, text.decode("base64")[::-1], None)

#def from_base64_lsb(text):
#    return (None, text.decode("base64").translate(_reverse_bits_in_a_byte_transtable), None)
    
def from_hex(text):
    """Decode a hex encoded fingerprint string

    The encoded fingerprint must be in chemfp form, with the bytes in
    LSB order and the bits in MSB order.

    Returns a 2-ple of None (the exact bit length is not known) and
    the decoded fingerprint.

    >>> from_hex('10f2')
    (None, '\\x10\\xf2')
    >>>

    Raises a ValueError if the hex string is not a multiple of 2 bytes long
    or if it contains a non-hex character.
    """
    try:
        # Same decoder as text.decode("hex"), called directly so the
        # failure can be converted to the documented exception type.
        return (None, binascii.unhexlify(text))
    except (TypeError, binascii.Error) as err:
        # Python 2 reports bad hex input with a TypeError; newer
        # versions use binascii.Error. Callers are promised ValueError.
        raise ValueError(str(err))

def from_hex_msb(text):
    """Decode a hex encoded fingerprint string where the bits and bytes are in MSB order

    The hex string is decoded and the order of the bytes is reversed.
    Returns a 2-ple of None (the exact bit length is not known) and
    the decoded fingerprint.

    >>> from_hex_msb('10f2')
    (None, '\\xf2\\x10')
    >>>

    Raises a ValueError if the hex string is not a multiple of 2 bytes long
    or if it contains a non-hex character.
    """
    try:
        decoded = binascii.unhexlify(text)
    except (TypeError, binascii.Error) as err:
        # Python 2 reports bad hex input with a TypeError; newer
        # versions use binascii.Error. Callers are promised ValueError.
        raise ValueError(str(err))
    # chemfp stores the byte containing bit 0 first; MSB order has it last
    return (None, decoded[::-1])

def from_hex_lsb(text):
    """Decode a hex encoded fingerprint string where the bits and bytes are in LSB order

    The hex string is decoded, the byte order is kept, and the bits
    within each byte are reversed. Returns a 2-ple of None (the exact
    bit length is not known) and the decoded fingerprint.

    >>> from_hex_lsb('102f')
    (None, '\\x08\\xf4')
    >>> 

    Raises a ValueError if the hex string is not a multiple of 2 bytes long
    or if it contains a non-hex character.
    """
    try:
        decoded = binascii.unhexlify(text)
    except (TypeError, binascii.Error) as err:
        # Python 2 reports bad hex input with a TypeError; newer
        # versions use binascii.Error. Callers are promised ValueError.
        raise ValueError(str(err))
    return (None, decoded.translate(_reverse_bits_in_a_byte_transtable))


# ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt

#   This comes from cid:11 which is 1,2-dichloroethane
# AAADcYBAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGAIAAAAAAAOAAEAAAAA
# AAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==
# That's simple enough to check the bit ordering by eye. Here's the decoded start
#  80-40-00-00-06-00-00 ... 
# We know it has to match the bits (starting with bit 0)
#  1000 0000 0100 0000 0000 0000 0000 0000 0000 0110
# and it does, perfectly. That means CACTVS is pure little endian.
# chem-fp has little-endian byte order but big endian bit order.


# 0111 1000 0100 0000 0000 0101 0000 0000 0000 0000 0000 0000

def from_cactvs(text):
    """Decode a 881-bit CACTVS-encoded fingerprint used by PubChem

    The text is base64 encoded. The first 4 decoded bytes hold the bit
    length, which must be 881. The remaining bytes hold the
    fingerprint, with the bits of each byte in the opposite order to
    the one chemfp uses.

    Returns the 2-ple (881, fingerprint). Raises a ValueError if the
    text is not valid base64 or if the length field is not 881.

    >>> from_cactvs("AAADceB7sQAEAAAAAAAAAAAAAAAAAWAAAAAwAAAAAAAAAAABwAAAHwIYAAAADA" +
    ...             "rBniwygJJqAACqAyVyVACSBAAhhwIa+CC4ZtgIYCLB0/CUpAhgmADIyYcAgAAO" +
    ...             "AAAAAAABAAAAAAAAAAIAAAAAAAAAAA==")
    (881, '\\x07\\xde\\x8d\\x00 \\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x80\\x06\\x00\\x00\\x00\\x0c\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x80\\x03\\x00\\x00\\xf8@\\x18\\x00\\x00\\x000P\\x83y4L\\x01IV\\x00\\x00U\\xc0\\xa4N*\\x00I \\x00\\x84\\xe1@X\\x1f\\x04\\x1df\\x1b\\x10\\x06D\\x83\\xcb\\x0f)%\\x10\\x06\\x19\\x00\\x13\\x93\\xe1\\x00\\x01\\x00p\\x00\\x00\\x00\\x00\\x00\\x80\\x00\\x00\\x00\\x00\\x00\\x00\\x00@\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00')
    >>>

    For format details, see
      ftp://ftp.ncbi.nlm.nih.gov/pubchem/specifications/pubchem_fingerprints.txt
    """
    try:
        # The same decoder that text.decode("base64") ends up using,
        # called directly so a failure can be reported as a ValueError
        # like the other decoders in this module.
        fp = binascii.a2b_base64(text)
    except binascii.Error as err:
        raise ValueError(str(err))
    # first 4 bytes are the length (struct.unpack(">I"))
    if fp[:4] != b'\x00\x00\x03q':
        raise ValueError("This implementation is hard-coded for 881 bit CACTVS fingerprints")
    return 881, fp[4:].translate(_reverse_bits_in_a_byte_transtable)


########### Convert from the Daylight encoding created by dt_binary2ascii

# Copied from PyDaylight daylight/dayencodings.py
"""
This code is based on the description of the encoding given in the
contrib program '$DY_ROOT/contrib/src/c/fingerprint/ascii2bits.c'

Here is the description from that file.

*****************************************************************************

 ASCII:    |=======+=======+=======+=======| etc.
                                           ^
   becomes...                      3  <->  4
                                   v
 BINARY:   |=====+=====+=====+=====| etc.

 Daylight uses the following method for translating binary data into
 printable ascii and vice versa.  Each 6 bits of binary (range 0-63) is
 converted to one of 64 characters in the set [.,0-9A-Za-z]; each 3-byte
 triplet thus converts to a 4-byte ASCII string.

 Every binary array is padded to a multiple of 3 bytes for the
 conversion; once the conversion is done you can't tell whether the last
 two bytes are pad bytes or real bytes containing zero.  To remedy this,
 an extra character is tacked on the ASCII representation; it will
 always be one of the characters '3', '2', or '1', indicating how many
 of the bytes in the last triplet are genuine.  That is, an
 ASCII-to-binary conversion will always produce an array whose length is
 a 3n bytes, but the last one or two bytes might just be pad bytes;
 the last ascii character indicates this.

 Thus, ascii strings are always of length (4n + 1) bytes.

 Thus, an ascii string can only describe bitmaps with bitcounts
 that are a multiple of 8.  If other sizes are desired, a specific
 bitcount must be remembered.
**************************************************************************
4.61 Change: ',' is replaced by '+'.
**************************************************************************
 Author: Jeremy Yang
 Rev:   27 Jan 1999
*************************************************************************
"""

# Map from 6 bit value to character encoding (used in binary2ascii)
# The 64 symbols are '.', '+', '0'-'9', 'A'-'Z' and 'a'-'z', in that order.
_daylight_table = (".+" + string.digits +
                   string.ascii_uppercase + string.ascii_lowercase)

# Map from character encoding to 6 bits (used in ascii2binary)
_daylight_reverse_table = dict(
    (symbol, value) for (value, symbol) in enumerate(_daylight_table))

# The '+' used to be represented as ',' in pre-4.61 code
_daylight_reverse_table[","] = _daylight_reverse_table["+"]

def from_daylight(text):
    """Decode a Daylight ASCII fingerprint
  
    >>> from_daylight("I5Z2MLZgOKRcR...1")
    (None, 'PyDaylight')
  
    The text is made of 4-character groups, each encoding 3 bytes,
    followed by a final '1', '2' or '3' giving the number of bytes of
    the last group which are real data. The text "3" by itself is the
    encoding of the empty fingerprint.

    Returns a 2-ple of None (the exact bit length is not known) and
    the decoded fingerprint.

    Raises a ValueError if the text has the wrong length, if the final
    character is not '1', '2', or '3', if the byte count is not
    preceded by any data, or if the text contains a character which is
    not part of the encoding.

    See the implementation for format details.
    """
    if len(text) % 4 != 1:
        raise ValueError("Daylight binary encoding is of the wrong length")
  
    if text == "3":
        # This is the encoding of an empty string (perverse, I know)
        return None, ""
  
    count = text[-1]
    if count not in ("1", "2", "3"):
        raise ValueError("Last character of encoding must be 1, 2, or 3, not %r" %
                         (count,))

    if len(text) == 1:
        # "1" or "2" with nothing in front: the count refers to a
        # final group which does not exist.
        raise ValueError("Daylight encoding %r has a byte count but no data" %
                         (text,))
  
    count = int(count)
    fields = []
    reverse_table = _daylight_reverse_table
    try:
        # Take four digits at a time
        for i in range(0, len(text)-1, 4):
            t = text[i:i+4]
            d = (reverse_table[t[0]] * 262144 +  # (2**6) ** 3
                 reverse_table[t[1]] * 4096 +    # (2**6) ** 2
                 reverse_table[t[2]] * 64 +      # (2**6) ** 1
                 reverse_table[t[3]])            # (2**6) ** 0

            # This is a 24 bit field
            # Convert back into 8 bits at a time
            c1 = d >> 16
            c2 = (d >> 8) & 0xFF
            c3 = d & 0xFF

            fields.append( chr(c1) + chr(c2) + chr(c3) )
    except KeyError:
        raise ValueError("Unknown encoding symbol")

    # Only 'count' of the last field is legal
    # The checks above guarantee that there is at least one field
    fields[-1] = fields[-1][:count]
    s = "".join(fields)
    return (None, s)


# Import-time sanity check of the decoder and its tables
assert from_daylight("I5Z2MLZgOKRcR...1") == (None, "PyDaylight")

def from_on_bit_positions(text, num_bits=1024, separator=" "):
    """Build a dense fingerprint from text listing the positions of the on bits

    >>> from_on_bit_positions("1 4 9 63", num_bits=32)
    (32, '\\x12\\x02\\x00\\x80')
    >>> from_on_bit_positions("1,4,9,63", num_bits=64, separator=",")
    (64, '\\x12\\x02\\x00\\x00\\x00\\x00\\x00\\x80')

    `text` is split on `separator` and every field must be a
    non-negative integer. A position which is `num_bits` or larger is
    folded into range by taking it modulo `num_bits`.

    Returns a 2-ple of `num_bits` and the fingerprint, which is
    `num_bits` rounded up to a whole number of bytes long. Raises a
    ValueError if `num_bits` is not positive or if a field is not a
    non-negative integer.

    This is often used to convert sparse fingerprints into a dense
    fingerprint.
    """
    if num_bits <= 0:
        raise ValueError("num_bits must be positive")

    byte_values = [0] * ((num_bits+7)//8)
    for field in text.split(separator):
        try:
            position = int(field)
        except ValueError:
            raise ValueError("Bit positions must be an integer, not %r" % (field,))
        if position < 0:
            raise ValueError("Bit positions must be non-negative, not %r" % (position,))
        # Fold into range, then find the byte and the bit within that byte
        byte_index, bit_index = divmod(position % num_bits, 8)
        byte_values[byte_index] |= 1 << bit_index

    return num_bits, "".join(chr(value) for value in byte_values)


##############

def import_decoder(path):
    """Find a decoder function given its full name, as in 'chemfp.decoders.from_cactvs'

    This function imports any intermediate modules, which may be a security concern.

    Returns the object found by following the dotted name. Raises a
    ValueError if the path is empty, if it contains an empty term (as
    in 'a..b'), or if one of the terms after the first cannot be found.
    An ImportError from importing the first term is not caught.
    """
    # str.split() never returns an empty list, so an empty path has to
    # be detected before splitting.
    if not path:
        raise ValueError("missing import name")
    terms = path.split(".")
    if "" in terms:
        raise ValueError("Empty module name in %r" % (path,))

    # It's impossible to tell if the dotted terms corresponds to
    # module or class/instance attribute lookups, so I don't know
    # which fields are imports and which fields are getattrs. To get
    # around that, I'll import everything, and if that fails I'll
    # remove the deepest term and try again.
    for depth in range(len(terms), 0, -1):
        try:
            __import__(".".join(terms[:depth]), level=0)
        except ImportError:
            continue
        break

    # I've imported as deep as possible.
    # Now start from the top and work down with getattr calls
    obj = __import__(terms[0], level=0)
    for i, subattr in enumerate(terms[1:]):
        obj = getattr(obj, subattr, None)
        if obj is None:
            # terms[0] is the module, so the i-th attribute is term i+1
            failure_path = ".".join(terms[:i+2])
            raise ValueError(("Unable to import a decoder: "
                              "Could not find %(attr)r from %(path)r") %
                              dict(attr=failure_path, path=path))

    return obj



##### Helper code for dealing with common command-line parameters

# (args, kwargs) pairs for add_argument(), in registration order
_decoding_args = []
# Maps the option name without its leading dashes, and with "-"
# replaced by "_", to the decoder function
_decoder_table = {}
def _A(arg, action, decoder, help):
    """Register one command-line decoder option

    `arg` is the option string (like "--hex-lsb"), `action` and `help`
    are stored as the keyword arguments for add_argument(), and
    `decoder` is the function stored in the decoder table for the option.
    """
    option_kwargs = {"action": action, "help": help}
    _decoding_args.append(((arg,), option_kwargs))
    table_key = arg.lstrip("-").replace("-", "_")
    _decoder_table[table_key] = decoder

# Register the built-in decoders. Each call adds one command-line
# option and the decoder function to use when that option is given.
_A("--binary", "store_true", from_binary_lsb,
   "Encoded with the characters '0' and '1'. Bit #0 comes first. Example: 00100000 encodes the value 4")
_A("--binary-msb", "store_true", from_binary_msb,
   "Encoded with the characters '0' and '1'. Bit #0 comes last. Example: 00000100 encodes the value 4")
_A("--hex", "store_true", from_hex,
   "Hex encoded. Bit #0 is the first bit (1<<0) of the first byte. Example: 01f2 encodes the value \\x01\\xf2 = 498")
_A("--hex-lsb", "store_true", from_hex_lsb,
   "Hex encoded. Bit #0 is the eighth bit (1<<7) of the first byte. Example: 804f encodes the value \\x01\\xf2 = 498")
_A("--hex-msb", "store_true", from_hex_msb,
   "Hex encoded. Bit #0 is the first bit (1<<0) of the last byte. Example: f201 encodes the value \\x01\\xf2 = 498")
_A("--base64", "store_true", from_base64,
   "Base-64 encoded. Bit #0 is first bit (1<<0) of first byte. Example: AfI= encodes value \\x01\\xf2 = 498")
_A("--cactvs", "store_true", from_cactvs,
   "CACTVS encoding, based on base64 and includes a version and bit length")
_A("--daylight", "store_true", from_daylight,
   "Daylight encoding, which is a base64 variant")
# --decoder takes the dotted name of a function to import, so there is
# no decoder function to register for it.
_A("--decoder", "store", None,
   "import and use the DECODER function to decode the fingerprint")

def _add_decoding_group(parser):
    """Add every registered decoder option to `parser`, in one argument group"""
    group = parser.add_argument_group("Fingerprint decoding options")
    for option_strings, option_kwargs in _decoding_args:
        group.add_argument(*option_strings, **option_kwargs)

def _extract_decoder(parser, namespace):
    """An internal helper function for the command-line programs

    Looks at the parsed command-line `namespace` to see which decoder
    option was given and returns the 2-ple (name, decoder function).
    With no decoder option the result is the hex decoder. For
    --decoder the name is the dotted path given on the command-line
    and the function is found with import_decoder().

    Calls parser.error() if more than one decoder option was given.
    """
    # Were any command-line decoder arguments specified?
    # Make sure that multiple decoders were not specified
    decoder_name = None
    for arg in _decoder_table:
        if not getattr(namespace, arg):
            continue
        if decoder_name is not None:
            # The table keys use "_" where the command-line options
            # use "-"; report the options the way the user typed them.
            parser.error("Cannot decode with both --%(old_arg)s and --%(arg)s" %
                         dict(old_arg=decoder_name.replace("_", "-"),
                              arg=arg.replace("_", "-")))
        decoder_name = arg

    # When in doubt, assume a hex decoder
    if decoder_name is None:
        decoder_name = "hex"

    # If --decoder was specified, do the import and return (name, decoder)
    if decoder_name == "decoder":
        function_name = namespace.decoder
        fp_decoder = import_decoder(function_name)
        return function_name, fp_decoder

    # Otherwise it's in the decoder table
    fp_decoder = _decoder_table[decoder_name]
    return decoder_name, fp_decoder