#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import sys
from random import random
import numpy as np
import pysam
from iss import modeller
[docs]
def read_bam(bam_file, n_reads=1000000):
"""Bam file reader. Select random mapped reads from a bam file
Args:
bam_file (string): path to a bam file
Yields:
read: a pysam read object
"""
logger = logging.getLogger(__name__)
try:
lines = pysam.idxstats(bam_file).splitlines()
total_records = sum([int(line.split("\t")[2]) for line in lines if not line.startswith("#")])
# total_records = sum(1 for _ in bam.fetch() if not _.is_unmapped)
random_fraction = n_reads / total_records
bam = pysam.AlignmentFile(bam_file, "rb") # reopen the file
except (IOError, ValueError, ZeroDivisionError, pysam.utils.SamtoolsError) as e:
logger.error("Failed to read bam file: %s" % e)
sys.exit(1)
else:
logger.info("Reading bam file: %s" % bam_file)
c = 0
with bam:
for read in bam.fetch():
if not read.is_unmapped and random() < random_fraction:
c += 1
if logger.getEffectiveLevel() == 10:
print("DEBUG:iss.bam:Subsampling %s / %s reads" % (c, n_reads), end="\r")
yield read
elif c >= n_reads:
break
[docs]
def write_to_file(
model, read_length, mean_f, mean_r, hist_f, hist_r, sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output
):
"""Write variables to a .npz file
Args:
model (string): the type of error model
read_length (int): read length of the dataset
mean_f (list): list of mean bin sizes
mean_r (list): list of mean bin sizes
hist_f (list): list of cumulative distribution functions for the
forward read quality
hist_r (list): list of cumulative distribution functions for the
reverse read quality
sub_f (list): list of dictionaries representing the substitution
probabilities for the forward reads
sub_r (list): list of dictionaries representing the substitution
probabilities for the reverse reads
ins_f (list): list of dictionaries representing the insertion
probabilities for the forward reads
ins_r (list): list of dictionaries representing the insertion
probabilities for the reverse reads
del_f (list): list of dictionaries representing the deletion
probabilities for the forward reads
del_r (list): list of dictionaries representing the deletion
probabilities for the reverse reads
i_size (int): distribution of insert size for the aligned reads
output (string): prefix of the output file
"""
logger = logging.getLogger(__name__)
try:
logger.info("Writing model to file: %s" % output)
np.savez_compressed(
output,
model=model,
read_length=read_length,
insert_size=i_size,
mean_count_forward=mean_f,
mean_count_reverse=mean_r,
quality_hist_forward=np.array(hist_f, dtype=object),
quality_hist_reverse=np.array(hist_r, dtype=object),
subst_choices_forward=sub_f,
subst_choices_reverse=sub_r,
ins_forward=ins_f,
ins_reverse=ins_r,
del_forward=del_f,
del_reverse=del_r,
)
except PermissionError as e:
logger.error("Failed to open output file: %s" % e)
sys.exit(1)
[docs]
def to_model(bam_path, output):
"""from a bam file, write all variables needed for modelling reads in
a .npz model file
For a brief description of the variables that will be written to the
output file, see the bam.write_to_file function
Args:
bam_path (string): path to a bam file
output (string): prefix of the output file
"""
logger = logging.getLogger(__name__)
template_length_dist = []
qualities_forward = []
qualities_reverse = []
subst_matrix_f = np.zeros([301, 16]) # we dont know the len of the reads
subst_matrix_r = np.zeros([301, 16]) # yet. we will find out from the
indel_matrix_f = np.zeros([301, 9]) # len of the quality lists
indel_matrix_r = np.zeros([301, 9])
# read the bam file and extract info needed for modelling
for read in read_bam(bam_path):
# get insert size distribution
if read.is_paired:
template_length = abs(read.template_length)
# i_size = template_length - (2 * len(read.seq))
template_length_dist.append(template_length)
# get qualities
if read.is_read1:
# get mean quality too
read_quality = read.query_qualities
mean_quality = np.mean(read_quality)
if read.is_reverse:
read_quality = read_quality[::-1] # reverse the list
quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
qualities_forward.append(np.asarray(quality_plus_mean))
# qualities_forward.append(read.query_qualities)
elif read.is_read2:
# get mean quality too
read_quality = read.query_qualities
mean_quality = np.mean(read_quality)
if read.is_reverse:
read_quality = read_quality[::-1] # reverse the list
quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
qualities_reverse.append(np.asarray(quality_plus_mean))
# qualities_reverse.append(read.query_qualities)
# get mismatches
alignment = read.get_aligned_pairs(matches_only=True, with_seq=True)
read_has_indels = False
for base in alignment: # dispatch mismatches in matrix
pos, subst, read_has_indels = modeller.dispatch_subst(base, read, read_has_indels)
if read.is_read1 and subst is not None:
subst_matrix_f[pos, subst] += 1
elif read.is_read2 and subst is not None:
subst_matrix_r[pos, subst] += 1
if read_has_indels: # dispatch indels in matrix
for pos, indel in modeller.dispatch_indels(read):
if read.is_read1:
indel_matrix_f[pos, indel] += 1
elif read.is_read2:
indel_matrix_r[pos, indel] += 1
logger.info("Calculating mean and base quality distribution")
quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)
# getting distribution of mean sequence quality
mean_f = [len(quality_bin) for quality_bin in quality_bins_f]
mean_r = [len(quality_bin) for quality_bin in quality_bins_r]
hists_f = modeller.quality_bins_to_histogram(quality_bins_f)
hists_r = modeller.quality_bins_to_histogram(quality_bins_r)
# modern illumina instruments return reads of the same length
# in case our bam file contains aligned reads of different length,
# we coerce the model's read length to the smallest read of the bam file
length_forward = min((len(x) for x in hists_f if len(x) > 1))
length_reverse = min((len(x) for x in hists_r if len(x) > 1))
read_length = min(length_forward, length_reverse)
# now we can resize the substitution and indel matrices before
# doing operations on them
subst_matrix_f.resize([read_length, 16], refcheck=False)
subst_matrix_r.resize([read_length, 16], refcheck=False)
indel_matrix_f.resize([read_length, 9], refcheck=False)
indel_matrix_r.resize([read_length, 9], refcheck=False)
logger.info("Calculating substitution rate")
subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length)
subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length)
logger.info("Calculating indel rate")
# update the base count in indel matrices
for position in range(read_length):
indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4])
indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4])
ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length)
ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length)
logger.info("Calculating insert size distribution")
# insert_size = int(np.mean(insert_size_dist))
hist_insert_size = modeller.insert_size(template_length_dist, read_length)
write_to_file(
"kde",
read_length,
mean_f,
mean_r,
hists_f,
hists_r,
subst_f,
subst_r,
ins_f,
ins_r,
del_f,
del_r,
hist_insert_size,
output + ".npz",
)