Source code for iss.bam

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function
from builtins import int, dict, range

from iss import util
from iss import modeller

from scipy import stats
from random import random

import sys
import pysam
import logging
import numpy as np


[docs]def read_bam(bam_file, n_reads=1000000): """Bam file reader. Select random mapped reads from a bam file Args: bam_file (string): path to a bam file Yields: read: a pysam read object """ logger = logging.getLogger(__name__) try: lines = pysam.idxstats(bam_file).splitlines() total_records = sum([int(l.split("\t")[2]) for l in lines if not l.startswith("#")]) # total_records = sum(1 for _ in bam.fetch() if not _.is_unmapped) random_fraction = n_reads / total_records bam = pysam.AlignmentFile(bam_file, 'rb') # reopen the file except (IOError, ValueError, ZeroDivisionError, pysam.utils.SamtoolsError) as e: logger.error('Failed to read bam file: %s' % e) sys.exit(1) else: logger.info('Reading bam file: %s' % bam_file) c = 0 with bam: for read in bam.fetch(): if not read.is_unmapped and random() < random_fraction: c += 1 if logger.getEffectiveLevel() == 10: print( 'DEBUG:iss.bam:Subsampling %s / %s reads' % ( c, n_reads), end='\r') yield read elif c >= n_reads: break
[docs]def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r, sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output): """Write variables to a .npz file Args: model (string): the type of error model read_length (int): read length of the dataset mean_f (list): list of mean bin sizes mean_r (list): list of mean bin sizes hist_f (list): list of cumulative distribution functions for the forward read quality hist_r (list): list of cumulative distribution functions for the reverse read quality sub_f (list): list of dictionaries representing the substitution probabilities for the forward reads sub_r (list): list of dictionaries representing the substitution probabilities for the reverse reads ins_f (list): list of dictionaries representing the insertion probabilities for the forward reads ins_r (list): list of dictionaries representing the insertion probabilities for the reverse reads del_f (list): list of dictionaries representing the deletion probabilities for the forward reads del_r (list): list of dictionaries representing the deletion probabilities for the reverse reads i_size (int): distribution of insert size for the aligned reads output (string): prefix of the output file """ logger = logging.getLogger(__name__) try: logger.info('Writing model to file: %s' % output) np.savez_compressed( output, model=model, read_length=read_length, insert_size=i_size, mean_count_forward=mean_f, mean_count_reverse=mean_r, quality_hist_forward=hist_f, quality_hist_reverse=hist_r, subst_choices_forward=sub_f, subst_choices_reverse=sub_r, ins_forward=ins_f, ins_reverse=ins_r, del_forward=del_f, del_reverse=del_r ) except PermissionError as e: logger.error('Failed to open output file: %s' % e) sys.exit(1)
[docs]def to_model(bam_path, output): """from a bam file, write all variables needed for modelling reads in a .npz model file For a brief description of the variables that will be written to the output file, see the bam.write_to_file function Args: bam_path (string): path to a bam file output (string): prefix of the output file """ logger = logging.getLogger(__name__) insert_size_dist = [] qualities_forward = [] qualities_reverse = [] subst_matrix_f = np.zeros([301, 16]) # we dont know the len of the reads subst_matrix_r = np.zeros([301, 16]) # yet. we will find out from the indel_matrix_f = np.zeros([301, 9]) # len of the quality lists indel_matrix_r = np.zeros([301, 9]) # read the bam file and extract info needed for modelling for read in read_bam(bam_path): # get insert size distribution if read.is_proper_pair: template_length = abs(read.template_length) i_size = template_length - (2 * len(read.seq)) insert_size_dist.append(i_size) # get qualities if read.is_read1: # get mean quality too quality_means = [] read_quality = read.query_qualities mean_quality = np.mean(read_quality) if read.is_reverse: read_quality = read_quality[::-1] # reverse the list quality_plus_mean = [ (quality, mean_quality) for quality in read_quality] qualities_forward.append(np.asarray(quality_plus_mean)) # qualities_forward.append(read.query_qualities) elif read.is_read2: # get mean quality too quality_means = [] read_quality = read.query_qualities mean_quality = np.mean(read_quality) if read.is_reverse: read_quality = read_quality[::-1] # reverse the list quality_plus_mean = [ (quality, mean_quality) for quality in read_quality] qualities_reverse.append(np.asarray(quality_plus_mean)) # qualities_reverse.append(read.query_qualities) # get mismatches alignment = read.get_aligned_pairs( matches_only=True, with_seq=True ) read_has_indels = False for base in alignment: # dispatch mismatches in matrix pos, subst, read_has_indels = modeller.dispatch_subst( base, read, read_has_indels) if read.is_read1 and subst is not None: subst_matrix_f[pos, subst] += 1 elif read.is_read2 and subst is not None: subst_matrix_r[pos, subst] += 1 if read_has_indels: # dispatch indels in matrix for pos, indel in modeller.dispatch_indels(read): if read.is_read1: indel_matrix_f[pos, indel] += 1 elif read.is_read2: indel_matrix_r[pos, indel] += 1 logger.info('Calculating insert size distribution') # insert_size = int(np.mean(insert_size_dist)) hist_insert_size = modeller.insert_size(insert_size_dist) logger.info('Calculating mean and base quality distribution') quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward) quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse) # getting distribution of mean sequence quality mean_f = [len(quality_bin) for quality_bin in quality_bins_f] mean_r = [len(quality_bin) for quality_bin in quality_bins_r] hists_f = modeller.quality_bins_to_histogram(quality_bins_f) hists_r = modeller.quality_bins_to_histogram(quality_bins_r) # modern illumina instruments return reads of the same length # in case our bam file contains aligned reads of different length, # we coerce the model's read length to the smallest read of the bam file length_forward = min((len(x) for x in hists_f if len(x) > 1)) length_reverse = min((len(x) for x in hists_r if len(x) > 1)) read_length = min(length_forward, length_reverse) # now we can resize the substitution and indel matrices before # doing operations on them subst_matrix_f.resize([read_length, 16], refcheck=False) subst_matrix_r.resize([read_length, 16], refcheck=False) indel_matrix_f.resize([read_length, 9], refcheck=False) indel_matrix_r.resize([read_length, 9], refcheck=False) logger.info('Calculating substitution rate') subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length) subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length) logger.info('Calculating indel rate') # update the base count in indel matrices for position in range(read_length): indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4]) indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4]) ins_f, del_f = modeller.indel_matrix_to_choices( indel_matrix_f, read_length) ins_r, del_r = modeller.indel_matrix_to_choices( indel_matrix_r, read_length) write_to_file( 'kde', read_length, mean_f, mean_r, hists_f, hists_r, subst_f, subst_r, ins_f, ins_r, del_f, del_r, hist_insert_size, output + '.npz')