#!/usr/bin/env python
# -*- coding: utf-8 -*-
from iss import util
from iss import modeller
from scipy import stats
from random import random
import sys
import pysam
import logging
import numpy as np
def read_bam(bam_file, n_reads=1000000):
    """Bam file reader. Yield random mapped reads from a bam file.

    Mapped reads are subsampled uniformly at random so that roughly
    ``n_reads`` reads are yielded in total.

    Args:
        bam_file (string): path to an indexed bam file
        n_reads (int): target number of reads to yield (default 1000000)

    Yields:
        read: a pysam aligned segment (mapped reads only)
    """
    logger = logging.getLogger(__name__)
    try:
        # idxstats reports per-contig stats; column 3 is the mapped
        # read count, which we sum to get the sampling denominator
        lines = pysam.idxstats(bam_file).splitlines()
        total_records = sum(int(l.split("\t")[2])
                            for l in lines if not l.startswith("#"))
        # ZeroDivisionError here (empty bam) is caught below
        random_fraction = n_reads / total_records
        bam = pysam.AlignmentFile(bam_file, 'rb')  # reopen the file
    except (IOError, ValueError,
            ZeroDivisionError, pysam.utils.SamtoolsError) as e:
        logger.error('Failed to read bam file: %s' % e)
        sys.exit(1)
    else:
        logger.info('Reading bam file: %s' % bam_file)
        c = 0
        with bam:
            for read in bam.fetch():
                # check the cap on every iteration, not only when a read
                # fails the sampling test -- otherwise we overshoot n_reads
                if c >= n_reads:
                    break
                if not read.is_unmapped and random() < random_fraction:
                    c += 1
                    if logger.getEffectiveLevel() == logging.DEBUG:
                        # \r keeps the progress counter on a single line
                        print(
                            'DEBUG:iss.bam:Subsampling %s / %s reads' % (
                                c, n_reads),
                            end='\r')
                    yield read
def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r,
                  sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output):
    """Write model variables to a compressed .npz file.

    Args:
        model (string): the type of error model
        read_length (int): read length of the dataset
        mean_f (list): list of mean bin sizes
        mean_r (list): list of mean bin sizes
        hist_f (list): list of cumulative distribution functions for the
            forward read quality
        hist_r (list): list of cumulative distribution functions for the
            reverse read quality
        sub_f (list): list of dictionaries representing the substitution
            probabilities for the forward reads
        sub_r (list): list of dictionaries representing the substitution
            probabilities for the reverse reads
        ins_f (list): list of dictionaries representing the insertion
            probabilities for the forward reads
        ins_r (list): list of dictionaries representing the insertion
            probabilities for the reverse reads
        del_f (list): list of dictionaries representing the deletion
            probabilities for the forward reads
        del_r (list): list of dictionaries representing the deletion
            probabilities for the reverse reads
        i_size (int): distribution of insert size for the aligned reads
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)
    try:
        logger.info('Writing model to file: %s' % output)
        np.savez_compressed(
            output,
            model=model,
            read_length=read_length,
            insert_size=i_size,
            mean_count_forward=mean_f,
            mean_count_reverse=mean_r,
            quality_hist_forward=hist_f,
            quality_hist_reverse=hist_r,
            subst_choices_forward=sub_f,
            subst_choices_reverse=sub_r,
            ins_forward=ins_f,
            ins_reverse=ins_r,
            del_forward=del_f,
            del_reverse=del_r
        )
    except PermissionError as e:
        logger.error('Failed to open output file: %s' % e)
        sys.exit(1)
def _quality_plus_mean(read):
    """Return an array of (base quality, read mean quality) pairs for a read.

    Reverse-mapped reads have their quality string flipped back to
    sequencing orientation before pairing.
    """
    read_quality = read.query_qualities
    mean_quality = np.mean(read_quality)
    if read.is_reverse:
        read_quality = read_quality[::-1]  # restore sequencing orientation
    return np.asarray(
        [(quality, mean_quality) for quality in read_quality])


def to_model(bam_path, output):
    """From a bam file, write all variables needed for modelling reads in
    a .npz model file.

    For a brief description of the variables that will be written to the
    output file, see the bam.write_to_file function.

    Args:
        bam_path (string): path to a bam file
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)
    insert_size_dist = []
    qualities_forward = []
    qualities_reverse = []
    # we don't know the read length yet; 301 is an upper bound and the
    # matrices are resized down once the real read length is known
    subst_matrix_f = np.zeros([301, 16])
    subst_matrix_r = np.zeros([301, 16])
    indel_matrix_f = np.zeros([301, 9])
    indel_matrix_r = np.zeros([301, 9])

    # read the bam file and extract info needed for modelling
    for read in read_bam(bam_path):
        # get insert size distribution
        if read.is_proper_pair:
            template_length = abs(read.template_length)
            i_size = template_length - (2 * len(read.seq))
            insert_size_dist.append(i_size)

        # get base qualities, paired with the read's mean quality
        if read.is_read1:
            qualities_forward.append(_quality_plus_mean(read))
        elif read.is_read2:
            qualities_reverse.append(_quality_plus_mean(read))

        # get mismatches
        alignment = read.get_aligned_pairs(
            matches_only=True,
            with_seq=True
        )
        read_has_indels = False
        for base in alignment:  # dispatch mismatches in matrix
            pos, subst, read_has_indels = modeller.dispatch_subst(
                base, read, read_has_indels)
            if read.is_read1 and subst is not None:
                subst_matrix_f[pos, subst] += 1
            elif read.is_read2 and subst is not None:
                subst_matrix_r[pos, subst] += 1
        if read_has_indels:  # dispatch indels in matrix
            for pos, indel in modeller.dispatch_indels(read):
                if read.is_read1:
                    indel_matrix_f[pos, indel] += 1
                elif read.is_read2:
                    indel_matrix_r[pos, indel] += 1

    logger.info('Calculating insert size distribution')
    hist_insert_size = modeller.insert_size(insert_size_dist)

    logger.info('Calculating mean and base quality distribution')
    quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
    quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)

    # getting distribution of mean sequence quality
    mean_f = [len(quality_bin) for quality_bin in quality_bins_f]
    mean_r = [len(quality_bin) for quality_bin in quality_bins_r]

    hists_f = modeller.quality_bins_to_histogram(quality_bins_f)
    hists_r = modeller.quality_bins_to_histogram(quality_bins_r)

    # modern illumina instruments return reads of the same length.
    # in case our bam file contains aligned reads of different length,
    # we coerce the model's read length to the smallest read of the bam file
    length_forward = min(len(x) for x in hists_f if len(x) > 1)
    length_reverse = min(len(x) for x in hists_r if len(x) > 1)
    read_length = min(length_forward, length_reverse)

    # now we can resize the substitution and indel matrices before
    # doing operations on them
    subst_matrix_f.resize([read_length, 16], refcheck=False)
    subst_matrix_r.resize([read_length, 16], refcheck=False)
    indel_matrix_f.resize([read_length, 9], refcheck=False)
    indel_matrix_r.resize([read_length, 9], refcheck=False)

    logger.info('Calculating substitution rate')
    subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length)
    subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length)

    logger.info('Calculating indel rate')
    # update the base count in indel matrices: column 0 holds the total
    # match count, taken from every 4th substitution-matrix column
    for position in range(read_length):
        indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4])
        indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4])

    ins_f, del_f = modeller.indel_matrix_to_choices(
        indel_matrix_f, read_length)
    ins_r, del_r = modeller.indel_matrix_to_choices(
        indel_matrix_r, read_length)

    write_to_file(
        'kde',
        read_length,
        mean_f,
        mean_r,
        hists_f,
        hists_r,
        subst_f,
        subst_r,
        ins_f,
        ins_r,
        del_f,
        del_r,
        hist_insert_size,
        output + '.npz')