This file is indexed.

/usr/lib/python2.7/dist-packages/smalr/CCS_aligner.py is in smalr 1.0.1-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import sys,os
import shutil
import numpy as np
import logging
import subprocess
import sam_parser

class mols_aligner:
	"""
	This class will coordinate the execution of the CCS read alignments.
	"""
	def __init__( self, mols, orig_fastq, ref, nat_movie_name_ID_map, align, chunk_id ):
		self.orig_fastq = orig_fastq
		self.ref        = ref
		self.chunk_id   = chunk_id

		sam_md, tmp_dir = self.align_CCS_mols_to_ref( mols.values(), nat_movie_name_ID_map, align )
		self.parse_alignments( sam_md, mols, align )
		# self.native_stats = self.get_native_read_stats( mols )
		shutil.rmtree(tmp_dir)

	def align_CCS_mols_to_ref( self, mols, nat_movie_name_ID_map, align ):
		"""Pull out the CCS reads for every molecule of interest and generate on fastq file.
		Then align this file to the reference using bwa bwasw. Finally, transform the output
		so that it returns a SAM file with the necessary MD field (for mismatch calling)."""
		self.readname_molID_map = {}

		def get_CCS_reads_for_aligning( mols, nat_movie_name_ID_map, align ):
			"""Go through the CCS fastq and index where each molecule/ZMW resides. Should speed
			up access when pulling out the reads."""
			tmp_dir = "tmp_align_%s" % self.chunk_id
			if os.path.exists(tmp_dir): 
				shutil.rmtree(tmp_dir)
			os.mkdir(tmp_dir)

			chunk_fastq = "chunk%s_mols_CCS.fastq" % self.chunk_id
			new_fastq   = os.path.join(tmp_dir, chunk_fastq)
			new_fastq_f = open(new_fastq,"w")

			id_fn     = os.path.join(tmp_dir, "all_ccs.fastq.index")
			CMD       = "grep -n ^@m %s > %s" % (self.orig_fastq, id_fn)
			p         = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			stdOutErr = p.communicate()
			sts       = p.returncode
			if os.path.exists(id_fn) and os.path.getsize(id_fn) > 0:
				zmw_line_map     = {}
				zmw_readname_map = {}
				for line in open(id_fn).xreadlines():
					line       = line.strip()
					line_n     = int(line.split(":")[0])
					readname   =     line.split(":")[1].strip("@")
					movie_name = readname.split("/")[0]

					try:
						movie_id = nat_movie_name_ID_map[movie_name]
					except KeyError:
						# The queried read is not in this chunk of molecules
						continue
					if readname.split("/")[-1] == "ccs":
						zmw_id                   = readname.split("/")[-2] + "_" + str(movie_id)
					else:
						zmw_id                   = readname.split("/")[-1] + "_" + str(movie_id)
					
					zmw_line_map[zmw_id]     = line_n
					zmw_readname_map[zmw_id] = readname
				
				n_mols       = len(mols)
				not_in_fastq = 0
				for n,mol in enumerate(mols):
					try:
						target    = zmw_line_map[mol.zmw_id]
						i = 0
						with open(self.orig_fastq) as f:
							
							while i < target-1:
								f.next()
								i += 1
							j = 0
							while j < 4:
								new_fastq_f.write(f.next())
								j += 1
							mol.in_fastq = True
							readname     = zmw_readname_map[mol.zmw_id]
							self.readname_molID_map[readname] = mol.mol_id
					except KeyError:
						not_in_fastq += 1
						progress      = float(n)/n_mols * 100
						logging.debug("Chunk %s: ZMW %s (mol %s) not in fastq file. Progress -- %.1f%%" % (self.chunk_id, mol.zmw_id, mol.mol_id, progress))
						pass
				pct_not_found = float(not_in_fastq) / len(mols) * 100
				logging.debug("Chunk %s: %s reads not found (%.1f%%) in original CCS fastq file." % (self.chunk_id, not_in_fastq, pct_not_found))
			else:
				logging.error("CCS fastq indexing failed: %s" % CMD)
			new_fastq_f.close()
			return new_fastq, tmp_dir

		def call_bwasw_samtools( fq ):
			
			def run_command( CMD ):
				p         = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
				stdOutErr = p.communicate()
				sts       = p.returncode
				if sts != 0:
					for entry in stdOutErr:
						print entry
					raise Exception("Failed alignment command: %s" % CMD)


			sam        = fq+".aligned.sam"
			bam        = fq+".aligned.bam"
			sam_md     = fq+".aligned.md.sam"

			bwa_CMD     = "bwa bwasw %s %s > %s" % (self.ref, fq, sam)
			run_command( bwa_CMD )

			# Check if there are any alignments in the file
			if os.path.getsize(sam) == 0:
				logging.error("No alignments for %s!" % fq)
				os.remove(sam)
			else:
				samtools_view_CMD  = "samtools view -b -S %s > %s" % (sam, bam)
				run_command(samtools_view_CMD)

				# We are going to use samtools calmd to calculate the MD field, which along
				# with the CIGAR string in the sam file will let us identify SNPs/Indels.
				samtools_calmd_CMD = "samtools calmd %s %s > %s" % (bam, self.ref, sam_md)
				run_command(samtools_calmd_CMD)
			return sam_md

		new_fastq, tmp_dir = get_CCS_reads_for_aligning( mols, nat_movie_name_ID_map, align )
		sam_md = None
		if os.path.getsize(new_fastq) > 0:
			sam_md    		   = call_bwasw_samtools( new_fastq )
		return sam_md, tmp_dir

	def parse_alignments( self, sam_md, mols, align ):
		"""Parse each line in the SAM file and add certain values from the alignment as
		attributes of the molecule object."""
		i = 0
		j = 0
		to_del = []
		if sam_md != None:
			for line in open(sam_md).xreadlines():
				if line[0]=="@": # header
					continue
				readname    = line.split("\t")[0]
				mol_id      = self.readname_molID_map[readname]
				if line.split("\t")[2] == "*": # No alignment found
					to_del.append(mol_id)
					continue

				# Do a quick check to make sure were are looking that the alignment
				# that corresponds to the ref positions listed in the IPD csv file.
				if readname.find("/") < 0:
					# This is a weird alignment where the readname is corrupt.
					j += 1
					to_del.append(mol_id)
					continue
				align_start = int(line.split("\t")[3])
				one_pos     = mols[mol_id].entries.values()[0].pos
				if np.abs(one_pos - align_start) > 2000:
					# We probably aren't looking at the proper alignment
					# Even if we are, there's a ton of soft-clipping at the
					# beginning of this read -- suspicious.
					logging.debug("Chunk %s: suspicious positions! mol:%s align_start:%s sampled_pos:%s" % (self.chunk_id, mol_id, align_start, one_pos))
					i += 1
					to_del.append(mol_id)
					continue

				readname, var_pos, softclip, CIGAR, MD, align_start, align_end = sam_parser.read_sam_line(line)
				if not align:
					var_pos = []
				mol_id          = self.readname_molID_map[readname]
				mol             = mols[mol_id]
				mol.mapped      = True
				mol.var_pos     = var_pos
				mol.softclip    = softclip
				mol.var_no_sc   = list(set(mol.var_pos) - set(mol.softclip))
				mol.CIGAR       = CIGAR
				mol.MD          = MD
				mol.align_start = align_start
				mol.align_end   = align_end
				mol.first_pos   = align_start
				mol.last_pos    = align_end
		logging.debug("Chunk %s: skipped %s alignments -- start pos far away from sampled position." % (self.chunk_id, i))
		logging.debug("Chunk %s: skipped %s alignments -- readname was just the movie name (?)." % (self.chunk_id, j))

		to_del = to_del + [mol.mol_id for mol in mols.values() if (not mol.mapped or not mol.in_fastq)]
		for mol_id in to_del:
			try:
				del mols[mol_id]
			except KeyError:
				pass

	def get_native_read_stats( self, mols ):
		"""
		"""
		all_mols           = []
		in_fastq_no_mapped = []
		no_mapped          = []
		no_fastq           = []
		to_analyze         = []
		for mol in mols.values():
			all_mols.append(mol.mol_id)
			if not mol.mapped:
				no_mapped.append(mol.mol_id)
			if not mol.in_fastq:
				no_fastq.append(mol.mol_id)
			if mol.mapped and mol.in_fastq:
				to_analyze.append(mol.mol_id)
		native_stats = {"all_mols"       : all_mols,      \
						"no_mapped"          : no_mapped, \
						"no_fastq"           : no_fastq,  \
						"to_analyze"         : to_analyze}
		return native_stats