This file is indexed.

/usr/lib/python2.7/dist-packages/chemfp/commandline/simsearch.py is in python-chemfp 1.1p1-2.1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
from __future__ import with_statement
import math
import sys
import itertools
import time

import chemfp
from chemfp import argparse, io, SOFTWARE, bitops
from chemfp import search

# Suppose you have a 4K fingerprint.
#   1/4096 = 0.000244140625.
#   2/4096 = 0.00048828125
# You only need to show "0.0002" and "0.0005" to
# disambiguate the scores. I don't like seeing only
# the minimum resolution, so I also show at least
# the next bit.
#   For 4096 the float_formatter is %.5f and the
# above values are 0.00024 and 0.00049.
# This also prevents the results from being shown
# in scientific notation.
def get_float_formatter(num_bytes):
    """Return a '%.Nf' format string with enough digits for the scores.

    N is picked so that the minimum score difference for a fingerprint
    of num_bytes bytes (num_bytes*8 bits) is still visible, plus one
    extra digit, and scores never render in scientific notation.
    """
    num_bits = num_bytes * 8
    precision = int(math.log10(num_bits)) + 2
    return "%%.%df" % precision

def write_simsearch_magic(outfile):
    """Write the magic first line identifying simsearch output, version 1."""
    outfile.write("#Simsearch/1\n")

def write_count_magic(outfile):
    """Write the magic first line identifying count output, version 1."""
    outfile.write("#Count/1\n")

def write_simsearch_header(outfile, d):
    """Write '#name=value' header lines for the entries present in *d*.

    The scalar fields (num_bits, type, software, queries, targets) are
    written once each when their value is not None; the list-valued
    source fields produce one line per source, in order.
    """
    header_lines = []
    for key in ("num_bits", "type", "software", "queries", "targets"):
        val = d.get(key)
        if val is not None:
            header_lines.append("#%s=%s\n" % (key, val))
    for key in ("query_sources", "target_sources"):
        header_lines.extend("#%s=%s\n" % (key, src)
                            for src in d.get(key, []))
    outfile.writelines(header_lines)

#### The NxM cases

def report_threshold(outfile, query_arenas, targets, threshold):
    """Report every target hit at or above *threshold* for each query arena."""
    formatter = get_float_formatter(targets.metadata.num_bytes)
    def do_search(arena):
        return targets.threshold_tanimoto_search_arena(arena, threshold=threshold)
    _report_search(outfile, formatter, query_arenas, do_search)

def report_knearest(outfile, query_arenas, targets, k, threshold):
    """Report the k nearest targets at or above *threshold* for each query arena."""
    formatter = get_float_formatter(targets.metadata.num_bytes)
    def do_search(arena):
        return targets.knearest_tanimoto_search_arena(arena, k=k, threshold=threshold)
    _report_search(outfile, formatter, query_arenas, do_search)

def _report_search(outfile, float_formatter, query_arenas, search_function):
    """Write one line per query: hit count, query id, then id/score pairs.

    *search_function* maps a query arena to its per-query result rows;
    each row supports len() and get_ids_and_scores().
    """
    hit_formatter = "\t%s\t" + float_formatter
    for query_arena in query_arenas:
        result_rows = search_function(query_arena)
        for query_id, row in zip(query_arena.ids, result_rows):
            fields = ["%d\t%s" % (len(row), query_id)]
            fields.extend(hit_formatter % hit
                          for hit in row.get_ids_and_scores())
            outfile.write("".join(fields) + "\n") # XXX flush?


def report_counts(outfile, query_arenas, targets, threshold):
    """Write one 'count<TAB>query_id' line per query fingerprint."""
    for arena in query_arenas:
        hit_counts = targets.count_tanimoto_hits_arena(arena, threshold)
        outfile.writelines("%d\t%s\n" % (n, query_id)
                           for query_id, n in zip(arena.ids, hit_counts))

#### The NxN cases


def do_NxN_searches(args, k, threshold, target_filename):
    """Search the target fingerprints against themselves (NxN).

    Writes either per-fingerprint hit counts (--count) or full hit
    lists, in the same order as the input file, with each fingerprint's
    self-similarity term excluded by the symmetric search routines.
    *k* is an int or the string "all"; *threshold* is the minimum score.
    """
    t1 = time.time()

    # load_fingerprints sorts the fingerprints based on popcount, but
    # the output must be in the same order as the input, so some
    # reordering bookkeeping is needed.  Consider:
    #   0003  ID_A
    #   010a  ID_B
    #   1000  ID_C
    # This produces:
    #   original_ids = ["ID_A", "ID_B", "ID_C"]
    #   targets.ids = [2, 0, 1]
    #   original_index_to_current_index = {2:0, 0:1, 1:2}

    original_ids = []
    fps = chemfp.open(target_filename)
    def get_index_to_id(fps):
        # Substitute each fingerprint's input position for its id,
        # recording the real ids so they can be restored on output.
        for i, (id, fp) in enumerate(fps):
            original_ids.append(id)
            yield i, fp

    targets = chemfp.load_fingerprints(get_index_to_id(fps), fps.metadata)
    # After loading, targets.ids holds original input positions in
    # popcount order.  (The reverse current->original map in the
    # previous revision was never used and has been removed.)
    original_index_to_current_index = dict(zip(targets.ids, xrange(len(targets))))

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        # 'report_type' rather than 'type' to avoid shadowing the builtin.
        report_type = "Tanimoto k=%(k)s threshold=%(threshold)s NxN=full" % dict(
            k=k, threshold=threshold)

        if args.count:
            report_type = "Count threshold=%(threshold)s NxN=full" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": report_type,
            "targets": target_filename,
            "target_sources": targets.metadata.sources})

        if args.count:
            counts = search.count_tanimoto_hits_symmetric(targets, threshold,
                                                          batch_size=args.batch_size)
            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                outfile.write("%d\t%s\n" % (counts[current_index], original_id))
        else:
            hit_formatter = "\t%s\t" + get_float_formatter(targets.metadata.num_bytes)
            if k == "all":
                results = search.threshold_tanimoto_search_symmetric(targets, threshold,
                                                                     batch_size=args.batch_size)
            else:
                results = search.knearest_tanimoto_search_symmetric(targets, k, threshold,
                                                                    batch_size=args.batch_size)

            for original_index, original_id in enumerate(original_ids):
                current_index = original_index_to_current_index[original_index]
                hits = results[current_index].get_ids_and_scores()
                outfile.write("%d\t%s" % (len(hits), original_id))
                # The hit "ids" are original input positions; map them
                # back to the real identifiers.  (A distinct loop
                # variable is used so 'original_id' is not clobbered.)
                for (hit_index, score) in hits:
                    outfile.write(hit_formatter % (original_ids[hit_index], score))
                outfile.write("\n") # XXX flush?

    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))

####

def int_or_all(s):
    """argparse 'type' callable: pass 'all' through unchanged, else parse as int."""
    return s if s == "all" else int(s)

# the "2fps" options need a way to say "get the options from --reference"
# ob2fps --reference targets.fps | simsearch -k  5 --threshold 0.5 targets.fps


# Command-line parser, built at import time so main() can reuse it and
# so parser.error() is available for validation messages.
parser = argparse.ArgumentParser(
    description="Search an FPS file for similar fingerprints")
# k may be a non-negative int or the literal string "all".
parser.add_argument("-k" ,"--k-nearest", help="select the k nearest neighbors (use 'all' for all neighbors)",
                    default=None, type=int_or_all)
parser.add_argument("-t" ,"--threshold", help="minimum similarity score threshold",
                    default=None, type=float)
parser.add_argument("-q", "--queries", help="filename containing the query fingerprints")
parser.add_argument("--NxN", action="store_true",
                    help="use the targets as the queries, and exclude the self-similarity term")
# A single query may also be given directly on the command line as hex.
parser.add_argument("--hex-query", help="query in hex")
parser.add_argument("--query-id", default="Query1",
                    help="id for the hex query")
# "in" is a Python keyword, so the value is stored as "query_format".
parser.add_argument("--in", metavar="FORMAT", dest="query_format",
                    help="input query format (default uses the file extension, else 'fps')")
parser.add_argument("-o", "--output", metavar="FILENAME",
                    help="output filename (default is stdout)")

parser.add_argument("-c", "--count", help="report counts", action="store_true")

parser.add_argument("-b", "--batch-size", help="batch size",
                    default=100, type=int)

# --scan and --memory select the target-search strategy; they are
# mutually exclusive (validated in main()).
parser.add_argument("--scan", help="scan the file to find matches (low memory overhead)",
                    action="store_true")
parser.add_argument("--memory", help="build and search an in-memory data structure (faster for multiple queries)",
                    action="store_true")

parser.add_argument("--times", help="report load and execution times to stderr",
                    action="store_true")

parser.add_argument("target_filename", nargs=1, help="target filename", default=None)


def main(args=None):
    """Command-line entry point.

    Parses *args* (defaults to sys.argv[1:]), validates the search
    parameters, then dispatches to an NxN self-search or a
    queries-vs-targets search, writing simsearch/count format output.
    """
    args = parser.parse_args(args)
    target_filename = args.target_filename[0]

    threshold = args.threshold
    k = args.k_nearest

    # A count search only reports hit counts, so a k-nearest limit makes
    # no sense (k="all" is tolerated as a no-op).
    if args.count and k is not None and k != "all":
        parser.error("--count search does not support --k-nearest")

    # People should not use this without setting parameters.  On the
    # other hand, I don't want an error message if there are no
    # parameters. This solution seems to make sense.

    if threshold is None:
        if k is None:
            # If nothing is set, use defaults of --threshold 0.7 -k 3
            threshold = 0.7
            k = 3
        else:
            # only k is set; search over all possible matches
            threshold = 0.0
    else:
        if k is None:
            # only threshold is set; search for all hits above that threshold
            k = "all"

    if k == "all":
        pass
    elif k < 0:
        parser.error("--k-nearest must be non-negative or 'all'")

    if not (0.0 <= threshold <= 1.0):
        parser.error("--threshold must be between 0.0 and 1.0, inclusive")

    if args.batch_size < 1:
        parser.error("--batch-size must be positive")

    # Let environment variables select the popcount implementation etc.
    bitops.use_environment_variables()

    if args.NxN:
        # NxN mode ignores the query-related options; reject them loudly.
        if args.scan:
            parser.error("Cannot specify --scan with an --NxN search")
        if args.hex_query:
            parser.error("Cannot specify --hex-query with an --NxN search")
        if args.queries:
            parser.error("Cannot specify --queries with an --NxN search")
        do_NxN_searches(args, k, threshold, target_filename)
        return
            
    if args.scan and args.memory:
        parser.error("Cannot specify both --scan and --memory")

    if args.hex_query and args.queries:
        parser.error("Cannot specify both --hex-query and --queries")
    # NOTE(review): this branch uses truthiness while the branch below
    # uses "is not None"; an explicit empty string --hex-query "" would
    # skip this validation but still take the hex branch below, where
    # query_id would then be unbound -- confirm whether that is intended.
    if args.hex_query:
        query_id = args.query_id
        # The FPS format is tab/newline delimited, so these characters
        # cannot appear in an identifier.
        for c, name in ( ("\t", "tab"),
                         ("\n", "newline"),
                         ("\r", "control-return"),
                         ("\0", "NUL")):
            if c in query_id:
                parser.error("--query-id must not contain the %s character" %
                             (name,))
    

    # Open the target file. This reads just enough to get the header.

    try:
        targets = chemfp.open(target_filename)
    except (IOError, ValueError), err:
        sys.stderr.write("Cannot open targets file: %s" % err)
        raise SystemExit(1)

    if args.hex_query is not None:
        # Build a one-fingerprint in-memory query collection from the
        # hex string on the command line.
        try:
            query_fp = args.hex_query.decode("hex")
        except TypeError, err:
            parser.error("--hex-query is not a hex string: %s" % (err,))

        for (severity, error, msg_template) in chemfp.check_fp_problems(query_fp, targets.metadata):
            if severity == "error":
                parser.error(msg_template % dict(fp="query", metadata=repr(target_filename)))
            
        num_bits = targets.metadata.num_bits
        if num_bits is None:
            num_bits = len(query_fp) * 8
        query_metadata = chemfp.Metadata(num_bits=num_bits, num_bytes=len(query_fp))
        queries = chemfp.Fingerprints(query_metadata,
                                      [(query_id, query_fp)])
        query_filename = None
    else:
        query_filename = args.queries
        try:
            queries = chemfp.open(query_filename, format=args.query_format)
        except (ValueError, IOError), err:
            sys.stderr.write("Cannot open queries file: %s\n" % (err,))
            raise SystemExit(1)

    batch_size = args.batch_size
    query_arena_iter = queries.iter_arenas(batch_size)
    
    t1 = time.time()

    # Pull the first arena so its size can guide the scan-vs-memory
    # heuristic below; it is chained back in before searching.
    first_query_arena = None
    for first_query_arena in query_arena_iter:
        break

    if args.scan:
        # Leave the targets as-is
        pass
    elif args.memory:
        targets = chemfp.load_fingerprints(targets)
    if not first_query_arena:
        # No input. Leave as-is
        pass
    elif len(first_query_arena) < min(10, batch_size):
        # Figure out the optimal search. If there is a
        # small number of inputs (< ~10) then a scan
        # of the FPS file is faster than an arena search.
        pass
    else:
        targets = chemfp.load_fingerprints(targets)

    # Warn (or abort) when query and target fingerprint types disagree.
    problems = chemfp.check_metadata_problems(queries.metadata, targets.metadata)
    for (severity, error, msg_template) in problems:
        msg = msg_template % dict(metadata1="queries", metadata2="targets")
        if severity == "error":
            parser.error(msg)
        elif severity == "warning":
            sys.stderr.write("WARNING: " + msg + "\n")

    t2 = time.time()
    outfile = io.open_output(args.output)
    with io.ignore_pipe_errors:
        type = "Tanimoto k=%(k)s threshold=%(threshold)s" % dict(
            k=k, threshold=threshold, max_score=1.0)

        if args.count:
            type = "Count threshold=%(threshold)s" % dict(
                threshold=threshold)
            write_count_magic(outfile)
        else:
            write_simsearch_magic(outfile)

        write_simsearch_header(outfile, {
            "num_bits": targets.metadata.num_bits,
            "software": SOFTWARE,
            "type": type,
            "queries": query_filename,
            "targets": target_filename,
            "query_sources": queries.metadata.sources,
            "target_sources": targets.metadata.sources})

        if first_query_arena:
            # Re-attach the already-consumed first arena.
            query_arenas = itertools.chain([first_query_arena],
                                           query_arena_iter)

            if args.count:
                report_counts(outfile, query_arenas, targets,
                              threshold = threshold)
            elif k == "all":
                report_threshold(outfile, query_arenas, targets,
                                 threshold = threshold)
            else:
                report_knearest(outfile, query_arenas, targets,
                                k = k, threshold = threshold)
                                
                
                    
    t3 = time.time()
    if args.times:
        sys.stderr.write("open %.2f search %.2f total %.2f\n" % (t2-t1, t3-t2, t3-t1))

# Allow use both as an importable module and as a script.
if __name__ == "__main__":
    main()