Source code for iss.bam

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import sys
from random import random

import numpy as np
import pysam

from iss import modeller


def read_bam(bam_file, n_reads=1000000):
    """Bam file reader. Select random mapped reads from a bam file

    Args:
        bam_file (string): path to a bam file
        n_reads (int): target number of mapped reads to subsample
            (default: 1000000)

    Yields:
        read: a pysam read object
    """
    logger = logging.getLogger(__name__)
    try:
        # idxstats gives one line per reference; column 3 is the number of
        # mapped reads for that reference
        lines = pysam.idxstats(bam_file).splitlines()
        total_records = sum(int(line.split("\t")[2]) for line in lines if not line.startswith("#"))
        # total_records = sum(1 for _ in bam.fetch() if not _.is_unmapped)
        random_fraction = n_reads / total_records
        bam = pysam.AlignmentFile(bam_file, "rb")  # reopen the file
    except (IOError, ValueError, ZeroDivisionError, pysam.utils.SamtoolsError) as e:
        logger.error("Failed to read bam file: %s" % e)
        sys.exit(1)
    else:
        logger.info("Reading bam file: %s" % bam_file)
        c = 0
        with bam:
            for read in bam.fetch():
                # Stop as soon as the requested number of reads has been
                # yielded. Fix: previously the break lived in an `elif` branch
                # of the random draw, so it was only reachable when the draw
                # failed and the generator could yield more than n_reads.
                if c >= n_reads:
                    break
                if not read.is_unmapped and random() < random_fraction:
                    c += 1
                    if logger.getEffectiveLevel() == 10:  # 10 == logging.DEBUG
                        # lightweight same-line progress indicator in debug mode
                        print("DEBUG:iss.bam:Subsampling %s / %s reads" % (c, n_reads), end="\r")
                    yield read
def write_to_file(
    model, read_length, mean_f, mean_r, hist_f, hist_r, sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output
):
    """Serialize an error model to a compressed .npz file

    Args:
        model (string): the type of error model
        read_length (int): read length of the dataset
        mean_f (list): list of mean bin sizes (forward reads)
        mean_r (list): list of mean bin sizes (reverse reads)
        hist_f (list): cumulative distribution functions for the
            forward read quality
        hist_r (list): cumulative distribution functions for the
            reverse read quality
        sub_f (list): dictionaries representing the substitution
            probabilities for the forward reads
        sub_r (list): dictionaries representing the substitution
            probabilities for the reverse reads
        ins_f (list): dictionaries representing the insertion
            probabilities for the forward reads
        ins_r (list): dictionaries representing the insertion
            probabilities for the reverse reads
        del_f (list): dictionaries representing the deletion
            probabilities for the forward reads
        del_r (list): dictionaries representing the deletion
            probabilities for the reverse reads
        i_size (int): distribution of insert size for the aligned reads
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)
    # assemble everything savez_compressed needs up front; the quality
    # histograms are ragged, hence the explicit object dtype
    payload = {
        "model": model,
        "read_length": read_length,
        "insert_size": i_size,
        "mean_count_forward": mean_f,
        "mean_count_reverse": mean_r,
        "quality_hist_forward": np.array(hist_f, dtype=object),
        "quality_hist_reverse": np.array(hist_r, dtype=object),
        "subst_choices_forward": sub_f,
        "subst_choices_reverse": sub_r,
        "ins_forward": ins_f,
        "ins_reverse": ins_r,
        "del_forward": del_f,
        "del_reverse": del_r,
    }
    try:
        logger.info("Writing model to file: %s" % output)
        np.savez_compressed(output, **payload)
    except PermissionError as e:
        logger.error("Failed to open output file: %s" % e)
        sys.exit(1)
def to_model(bam_path, output):
    """from a bam file, write all variables needed for modelling reads in
    a .npz model file

    For a brief description of the variables that will be written to the
    output file, see the bam.write_to_file function

    Args:
        bam_path (string): path to a bam file
        output (string): prefix of the output file (".npz" is appended)
    """
    logger = logging.getLogger(__name__)
    template_length_dist = []
    qualities_forward = []
    qualities_reverse = []
    # Matrices are over-allocated at 301 rows because the read length is
    # unknown until the qualities have been read; they are resized to the
    # actual read length further down.
    subst_matrix_f = np.zeros([301, 16])  # we dont know the len of the reads
    subst_matrix_r = np.zeros([301, 16])  # yet. we will find out from the
    indel_matrix_f = np.zeros([301, 9])  # len of the quality lists
    indel_matrix_r = np.zeros([301, 9])

    # read the bam file and extract info needed for modelling
    for read in read_bam(bam_path):
        # get insert size distribution
        if read.is_paired:
            template_length = abs(read.template_length)
            # i_size = template_length - (2 * len(read.seq))
            template_length_dist.append(template_length)

        # get qualities
        if read.is_read1:
            # get mean quality too
            read_quality = read.query_qualities
            mean_quality = np.mean(read_quality)
            if read.is_reverse:
                read_quality = read_quality[::-1]  # reverse the list
            # pair every per-base quality with the read's mean quality so the
            # modeller can bin reads by mean quality later
            quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
            qualities_forward.append(np.asarray(quality_plus_mean))
            # qualities_forward.append(read.query_qualities)
        elif read.is_read2:
            # get mean quality too
            read_quality = read.query_qualities
            mean_quality = np.mean(read_quality)
            if read.is_reverse:
                read_quality = read_quality[::-1]  # reverse the list
            quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
            qualities_reverse.append(np.asarray(quality_plus_mean))
            # qualities_reverse.append(read.query_qualities)

        # get mismatches
        alignment = read.get_aligned_pairs(matches_only=True, with_seq=True)
        read_has_indels = False
        for base in alignment:  # dispatch mismatches in matrix
            pos, subst, read_has_indels = modeller.dispatch_subst(base, read, read_has_indels)
            if read.is_read1 and subst is not None:
                subst_matrix_f[pos, subst] += 1
            elif read.is_read2 and subst is not None:
                subst_matrix_r[pos, subst] += 1
        if read_has_indels:  # dispatch indels in matrix
            for pos, indel in modeller.dispatch_indels(read):
                if read.is_read1:
                    indel_matrix_f[pos, indel] += 1
                elif read.is_read2:
                    indel_matrix_r[pos, indel] += 1

    logger.info("Calculating mean and base quality distribution")
    quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
    quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)

    # getting distribution of mean sequence quality
    mean_f = [len(quality_bin) for quality_bin in quality_bins_f]
    mean_r = [len(quality_bin) for quality_bin in quality_bins_r]

    hists_f = modeller.quality_bins_to_histogram(quality_bins_f)
    hists_r = modeller.quality_bins_to_histogram(quality_bins_r)

    # modern illumina instruments return reads of the same length
    # in case our bam file contains aligned reads of different length,
    # we coerce the model's read length to the smallest read of the bam file
    # (bins of length <= 1 are skipped — presumably empty/placeholder bins;
    #  NOTE(review): raises ValueError if no bin qualifies — confirm intended)
    length_forward = min((len(x) for x in hists_f if len(x) > 1))
    length_reverse = min((len(x) for x in hists_r if len(x) > 1))
    read_length = min(length_forward, length_reverse)

    # now we can resize the substitution and indel matrices before
    # doing operations on them (refcheck=False: shrink in place)
    subst_matrix_f.resize([read_length, 16], refcheck=False)
    subst_matrix_r.resize([read_length, 16], refcheck=False)
    indel_matrix_f.resize([read_length, 9], refcheck=False)
    indel_matrix_r.resize([read_length, 9], refcheck=False)

    logger.info("Calculating substitution rate")
    subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length)
    subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length)

    logger.info("Calculating indel rate")
    # update the base count in indel matrices: column 0 becomes the total
    # match count at that position (every 4th substitution-matrix column)
    for position in range(read_length):
        indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4])
        indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4])

    ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length)
    ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length)

    logger.info("Calculating insert size distribution")
    # insert_size = int(np.mean(insert_size_dist))
    hist_insert_size = modeller.insert_size(template_length_dist, read_length)

    write_to_file(
        "kde",
        read_length,
        mean_f,
        mean_r,
        hists_f,
        hists_r,
        subst_f,
        subst_r,
        ins_f,
        ins_r,
        del_f,
        del_r,
        hist_insert_size,
        output + ".npz",
    )