/usr/lib/python2.7/dist-packages/smalr/CCS_aligner.py is in smalr 1.0.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | import sys,os
import shutil
import numpy as np
import logging
import subprocess
import sam_parser
class mols_aligner:
"""
This class will coordinate the execution of the CCS read alignments.
"""
def __init__( self, mols, orig_fastq, ref, nat_movie_name_ID_map, align, chunk_id ):
self.orig_fastq = orig_fastq
self.ref = ref
self.chunk_id = chunk_id
sam_md, tmp_dir = self.align_CCS_mols_to_ref( mols.values(), nat_movie_name_ID_map, align )
self.parse_alignments( sam_md, mols, align )
# self.native_stats = self.get_native_read_stats( mols )
shutil.rmtree(tmp_dir)
def align_CCS_mols_to_ref( self, mols, nat_movie_name_ID_map, align ):
"""Pull out the CCS reads for every molecule of interest and generate on fastq file.
Then align this file to the reference using bwa bwasw. Finally, transform the output
so that it returns a SAM file with the necessary MD field (for mismatch calling)."""
self.readname_molID_map = {}
def get_CCS_reads_for_aligning( mols, nat_movie_name_ID_map, align ):
"""Go through the CCS fastq and index where each molecule/ZMW resides. Should speed
up access when pulling out the reads."""
tmp_dir = "tmp_align_%s" % self.chunk_id
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir)
os.mkdir(tmp_dir)
chunk_fastq = "chunk%s_mols_CCS.fastq" % self.chunk_id
new_fastq = os.path.join(tmp_dir, chunk_fastq)
new_fastq_f = open(new_fastq,"w")
id_fn = os.path.join(tmp_dir, "all_ccs.fastq.index")
CMD = "grep -n ^@m %s > %s" % (self.orig_fastq, id_fn)
p = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdOutErr = p.communicate()
sts = p.returncode
if os.path.exists(id_fn) and os.path.getsize(id_fn) > 0:
zmw_line_map = {}
zmw_readname_map = {}
for line in open(id_fn).xreadlines():
line = line.strip()
line_n = int(line.split(":")[0])
readname = line.split(":")[1].strip("@")
movie_name = readname.split("/")[0]
try:
movie_id = nat_movie_name_ID_map[movie_name]
except KeyError:
# The queried read is not in this chunk of molecules
continue
if readname.split("/")[-1] == "ccs":
zmw_id = readname.split("/")[-2] + "_" + str(movie_id)
else:
zmw_id = readname.split("/")[-1] + "_" + str(movie_id)
zmw_line_map[zmw_id] = line_n
zmw_readname_map[zmw_id] = readname
n_mols = len(mols)
not_in_fastq = 0
for n,mol in enumerate(mols):
try:
target = zmw_line_map[mol.zmw_id]
i = 0
with open(self.orig_fastq) as f:
while i < target-1:
f.next()
i += 1
j = 0
while j < 4:
new_fastq_f.write(f.next())
j += 1
mol.in_fastq = True
readname = zmw_readname_map[mol.zmw_id]
self.readname_molID_map[readname] = mol.mol_id
except KeyError:
not_in_fastq += 1
progress = float(n)/n_mols * 100
logging.debug("Chunk %s: ZMW %s (mol %s) not in fastq file. Progress -- %.1f%%" % (self.chunk_id, mol.zmw_id, mol.mol_id, progress))
pass
pct_not_found = float(not_in_fastq) / len(mols) * 100
logging.debug("Chunk %s: %s reads not found (%.1f%%) in original CCS fastq file." % (self.chunk_id, not_in_fastq, pct_not_found))
else:
logging.error("CCS fastq indexing failed: %s" % CMD)
new_fastq_f.close()
return new_fastq, tmp_dir
def call_bwasw_samtools( fq ):
def run_command( CMD ):
p = subprocess.Popen(CMD, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdOutErr = p.communicate()
sts = p.returncode
if sts != 0:
for entry in stdOutErr:
print entry
raise Exception("Failed alignment command: %s" % CMD)
sam = fq+".aligned.sam"
bam = fq+".aligned.bam"
sam_md = fq+".aligned.md.sam"
bwa_CMD = "bwa bwasw %s %s > %s" % (self.ref, fq, sam)
run_command( bwa_CMD )
# Check if there are any alignments in the file
if os.path.getsize(sam) == 0:
logging.error("No alignments for %s!" % fq)
os.remove(sam)
else:
samtools_view_CMD = "samtools view -b -S %s > %s" % (sam, bam)
run_command(samtools_view_CMD)
# We are going to use samtools calmd to calculate the MD field, which along
# with the CIGAR string in the sam file will let us identify SNPs/Indels.
samtools_calmd_CMD = "samtools calmd %s %s > %s" % (bam, self.ref, sam_md)
run_command(samtools_calmd_CMD)
return sam_md
new_fastq, tmp_dir = get_CCS_reads_for_aligning( mols, nat_movie_name_ID_map, align )
sam_md = None
if os.path.getsize(new_fastq) > 0:
sam_md = call_bwasw_samtools( new_fastq )
return sam_md, tmp_dir
def parse_alignments( self, sam_md, mols, align ):
"""Parse each line in the SAM file and add certain values from the alignment as
attributes of the molecule object."""
i = 0
j = 0
to_del = []
if sam_md != None:
for line in open(sam_md).xreadlines():
if line[0]=="@": # header
continue
readname = line.split("\t")[0]
mol_id = self.readname_molID_map[readname]
if line.split("\t")[2] == "*": # No alignment found
to_del.append(mol_id)
continue
# Do a quick check to make sure were are looking that the alignment
# that corresponds to the ref positions listed in the IPD csv file.
if readname.find("/") < 0:
# This is a weird alignment where the readname is corrupt.
j += 1
to_del.append(mol_id)
continue
align_start = int(line.split("\t")[3])
one_pos = mols[mol_id].entries.values()[0].pos
if np.abs(one_pos - align_start) > 2000:
# We probably aren't looking at the proper alignment
# Even if we are, there's a ton of soft-clipping at the
# beginning of this read -- suspicious.
logging.debug("Chunk %s: suspicious positions! mol:%s align_start:%s sampled_pos:%s" % (self.chunk_id, mol_id, align_start, one_pos))
i += 1
to_del.append(mol_id)
continue
readname, var_pos, softclip, CIGAR, MD, align_start, align_end = sam_parser.read_sam_line(line)
if not align:
var_pos = []
mol_id = self.readname_molID_map[readname]
mol = mols[mol_id]
mol.mapped = True
mol.var_pos = var_pos
mol.softclip = softclip
mol.var_no_sc = list(set(mol.var_pos) - set(mol.softclip))
mol.CIGAR = CIGAR
mol.MD = MD
mol.align_start = align_start
mol.align_end = align_end
mol.first_pos = align_start
mol.last_pos = align_end
logging.debug("Chunk %s: skipped %s alignments -- start pos far away from sampled position." % (self.chunk_id, i))
logging.debug("Chunk %s: skipped %s alignments -- readname was just the movie name (?)." % (self.chunk_id, j))
to_del = to_del + [mol.mol_id for mol in mols.values() if (not mol.mapped or not mol.in_fastq)]
for mol_id in to_del:
try:
del mols[mol_id]
except KeyError:
pass
def get_native_read_stats( self, mols ):
"""
"""
all_mols = []
in_fastq_no_mapped = []
no_mapped = []
no_fastq = []
to_analyze = []
for mol in mols.values():
all_mols.append(mol.mol_id)
if not mol.mapped:
no_mapped.append(mol.mol_id)
if not mol.in_fastq:
no_fastq.append(mol.mol_id)
if mol.mapped and mol.in_fastq:
to_analyze.append(mol.mol_id)
native_stats = {"all_mols" : all_mols, \
"no_mapped" : no_mapped, \
"no_fastq" : no_fastq, \
"to_analyze" : to_analyze}
return native_stats
|