This file is indexed.

/usr/share/RDKit/Contrib/mmpa/rfrag.py is in rdkit-data 201603.5-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
#       copyright notice, this list of conditions and the following
#       disclaimer in the documentation and/or other materials provided
#       with the distribution.
#     * Neither the name of GlaxoSmithKline Research & Development Ltd.
#       nor the names of its contributors may be used to endorse or promote
#       products derived from this software without specific prior written
#       permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Created by Jameed Hussain, July 2013
#
# Modifications and optimizations by Greg Landrum, July 2015
# 
from __future__ import print_function
import sys
import re
from rdkit import Chem
from rdkit.Chem import rdMMPA

def find_correct(f_array):

    core = ""
    side_chains = ""

    for f in f_array:
        attachments = f.count("*")
        if (attachments == 1):
            side_chains = "%s.%s" % (side_chains,f)
        else:
            core = f

    side_chains = side_chains.lstrip('.')

    #cansmi the side chains
    temp = Chem.MolFromSmiles(side_chains)
    side_chains = Chem.MolToSmiles( temp, isomericSmiles=True )

    #and cansmi the core
    temp = Chem.MolFromSmiles(core)
    core = Chem.MolToSmiles( temp, isomericSmiles=True )

    return core,side_chains

def delete_bonds(smi,id,mol,bonds,out):

    #use the same parent mol object and create editable mol
    em = Chem.EditableMol(mol)

    #loop through the bonds to delete
    isotope = 0
    isotope_track = {};
    for i in bonds:
        isotope += 1
        #remove the bond
        em.RemoveBond(i[0],i[1])

        #now add attachement points
        newAtomA = em.AddAtom(Chem.Atom(0))
        em.AddBond(i[0],newAtomA,Chem.BondType.SINGLE)

        newAtomB = em.AddAtom(Chem.Atom(0))
        em.AddBond(i[1],newAtomB,Chem.BondType.SINGLE)

        #keep track of where to put isotopes
        isotope_track[newAtomA] = isotope
        isotope_track[newAtomB] = isotope


    #should be able to get away without sanitising mol
    #as the existing valencies/atoms not changed
    modifiedMol = em.GetMol()

    #canonical smiles can be different with and without the isotopes
    #hence to keep track of duplicates use fragmented_smi_noIsotopes
    fragmented_smi_noIsotopes = Chem.MolToSmiles(modifiedMol,isomericSmiles=True)

    valid = True
    fragments = fragmented_smi_noIsotopes.split(".")

    #check if its a valid triple cut
    if(isotope == 3):
        valid = False
        for f in fragments:
            matchObj = re.search( '\*.*\*.*\*', f)
            if matchObj:
                valid = True
                break

    if valid:
        if(isotope == 1):
            fragmented_smi_noIsotopes = re.sub('\[\*\]', '[*:1]', fragmented_smi_noIsotopes)

            fragments = fragmented_smi_noIsotopes.split(".")

            #print fragmented_smi_noIsotopes
            s1 = Chem.MolFromSmiles(fragments[0])
            s2 = Chem.MolFromSmiles(fragments[1])

            #need to cansmi again as smiles can be different
            output = '%s,%s,,%s.%s' % (smi,id,Chem.MolToSmiles(s1,isomericSmiles=True),Chem.MolToSmiles(s2,isomericSmiles=True) )
            if( (output in out) == False):
                out.add(output)

        elif (isotope >= 2):
            #add the isotope labels
            for key in isotope_track:
                #to add isotope lables
                modifiedMol.GetAtomWithIdx(key).SetIsotope(isotope_track[key])
            fragmented_smi = Chem.MolToSmiles(modifiedMol,isomericSmiles=True)

            #change the isotopes into labels - currently can't add SMARTS or labels to mol
            fragmented_smi = re.sub('\[1\*\]', '[*:1]', fragmented_smi)
            fragmented_smi = re.sub('\[2\*\]', '[*:2]', fragmented_smi)
            fragmented_smi = re.sub('\[3\*\]', '[*:3]', fragmented_smi)

            fragments = fragmented_smi.split(".")

            #identify core/side chains and cansmi them
            core,side_chains = find_correct(fragments)

            #now change the labels on sidechains and core
            #to get the new labels, cansmi the dot-disconnected side chains
            #the first fragment in the side chains has attachment label 1, 2nd: 2, 3rd: 3
            #then change the labels accordingly in the core

            #this is required by the indexing script, as the side-chains are "keys" in the index
            #this ensures the side-chains always have the same numbering

            isotope_track = {}
            side_chain_fragments = side_chains.split(".")

            for s in range( len(side_chain_fragments) ):
                matchObj = re.search( '\[\*\:([123])\]', side_chain_fragments[s] )
                if matchObj:
                    #add to isotope_track with key: old_isotope, value:
                    isotope_track[matchObj.group(1)] = str(s+1)

            #change the labels if required
            if(isotope_track['1'] != '1'):
                core = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]' , core)
                side_chains = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]' , side_chains)
            if(isotope_track['2'] != '2'):
                core = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]' , core)
                side_chains = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]' , side_chains)

            if(isotope == 3):
                if(isotope_track['3'] != '3'):
                    core = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]' , core)
                    side_chains = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]' , side_chains)

            #now remove the XX
            core = re.sub('XX', '' , core)
            side_chains = re.sub('XX', '' , side_chains)

            output = '%s,%s,%s,%s' % (smi,id,core,side_chains)
            if( (output in out) == False):
                out.add(output)

def fragment_mol(smi,id):

    mol = Chem.MolFromSmiles(smi)

    #different cuts can give the same fragments
    #to use outlines to remove them
    outlines = set()

    if(mol == None):
        sys.stderr.write("Can't generate mol for: %s\n" % (smi) )
    else:
        frags = rdMMPA.FragmentMol(mol,pattern="[#6+0;!$(*=,#[!#6])]!@!=!#[*]",resultsAsMols=False)
        for core,chains in frags:
            output = '%s,%s,%s,%s' % (smi,id,core,chains)
            if( not (output in outlines) ):
                outlines.add(output)
    return outlines

if __name__=='__main__':

    if (len(sys.argv) >= 2):
        print("Program that fragments a user input set of smiles.")
        print("The program enumerates every single,double and triple acyclic single bond cuts in a molecule.\n")
        print("USAGE: ./rfrag.py <file_of_smiles")
        print("Format of smiles file: SMILES ID (space separated)")
        print("Output: whole mol smiles,ID,core,context\n")
        sys.exit(1)

    #read the STDIN
    for line in sys.stdin:

        line = line.rstrip()

        line_fields = re.split('\s|,',line)
        smiles = line_fields[0]
        cmpd_id = line_fields[1]

        #returns a set containing the output
        o = fragment_mol(smiles,cmpd_id)

        for l in o:
            print(l)





"""
optimization work.

Original:
~/RDKit_git/Contrib/mmpa > time head -100 ../../Data/Zinc/zim.smi | python rfrag.py > zim.frags.o

real	0m9.752s
user	0m9.704s
sys	0m0.043s


"""