Source code for iss.bam

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import sys
from random import random

import numpy as np
import pysam

from iss import modeller



[docs]
def read_bam(bam_file, n_reads=1000000):
    """Bam file reader. Select random mapped reads from a bam file

    Args:
        bam_file (string): path to a bam file

    Yields:
        read: a pysam read object
    """
    logger = logging.getLogger(__name__)

    try:
        lines = pysam.idxstats(bam_file).splitlines()
        total_records = sum([int(line.split("\t")[2]) for line in lines if not line.startswith("#")])
        # total_records = sum(1 for _ in bam.fetch() if not _.is_unmapped)
        random_fraction = n_reads / total_records
        bam = pysam.AlignmentFile(bam_file, "rb")  # reopen the file

    except (IOError, ValueError, ZeroDivisionError, pysam.utils.SamtoolsError) as e:
        logger.error("Failed to read bam file: %s" % e)
        sys.exit(1)
    else:
        logger.info("Reading bam file: %s" % bam_file)
        c = 0
        with bam:
            for read in bam.fetch():
                if not read.is_unmapped and random() < random_fraction:
                    c += 1
                    if logger.getEffectiveLevel() == 10:
                        print("DEBUG:iss.bam:Subsampling %s / %s reads" % (c, n_reads), end="\r")
                    yield read
                elif c >= n_reads:
                    break




[docs]
def write_to_file(
    model, read_length, mean_f, mean_r, hist_f, hist_r, sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output
):
    """Write variables to a .npz file

    Args:
        model (string): the type of error model
        read_length (int): read length of the dataset
        mean_f (list): list of mean bin sizes
        mean_r (list): list of mean bin sizes
        hist_f (list): list of cumulative distribution functions for the
            forward read quality
        hist_r (list): list of cumulative distribution functions for the
            reverse read quality
        sub_f (list): list of dictionaries representing the substitution
            probabilities for the forward reads
        sub_r (list): list of dictionaries representing the substitution
            probabilities for the reverse reads
        ins_f (list): list of dictionaries representing the insertion
            probabilities for the forward reads
        ins_r (list): list of dictionaries representing the insertion
            probabilities for the reverse reads
        del_f (list): list of dictionaries representing the deletion
            probabilities for the forward reads
        del_r (list): list of dictionaries representing the deletion
            probabilities for the reverse reads
        i_size (int): distribution of insert size for the aligned reads
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)

    try:
        logger.info("Writing model to file: %s" % output)
        np.savez_compressed(
            output,
            model=model,
            read_length=read_length,
            insert_size=i_size,
            mean_count_forward=mean_f,
            mean_count_reverse=mean_r,
            quality_hist_forward=np.array(hist_f, dtype=object),
            quality_hist_reverse=np.array(hist_r, dtype=object),
            subst_choices_forward=sub_f,
            subst_choices_reverse=sub_r,
            ins_forward=ins_f,
            ins_reverse=ins_r,
            del_forward=del_f,
            del_reverse=del_r,
        )
    except PermissionError as e:
        logger.error("Failed to open output file: %s" % e)
        sys.exit(1)




[docs]
def to_model(bam_path, output):
    """from a bam file, write all variables needed for modelling reads in
    a .npz model file

    For a brief description of the variables that will be written to the
        output file, see the bam.write_to_file function

    Args:
        bam_path (string): path to a bam file
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)

    template_length_dist = []
    qualities_forward = []
    qualities_reverse = []
    subst_matrix_f = np.zeros([301, 16])  # we dont know the len of the reads
    subst_matrix_r = np.zeros([301, 16])  # yet. we will find out from the
    indel_matrix_f = np.zeros([301, 9])  # len of the quality lists
    indel_matrix_r = np.zeros([301, 9])

    # read the bam file and extract info needed for modelling
    for read in read_bam(bam_path):
        # get insert size distribution
        if read.is_paired:
            template_length = abs(read.template_length)
            # i_size = template_length - (2 * len(read.seq))
            template_length_dist.append(template_length)

        # get qualities
        if read.is_read1:
            # get mean quality too
            read_quality = read.query_qualities
            mean_quality = np.mean(read_quality)
            if read.is_reverse:
                read_quality = read_quality[::-1]  # reverse the list

            quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
            qualities_forward.append(np.asarray(quality_plus_mean))
            # qualities_forward.append(read.query_qualities)
        elif read.is_read2:
            # get mean quality too
            read_quality = read.query_qualities
            mean_quality = np.mean(read_quality)
            if read.is_reverse:
                read_quality = read_quality[::-1]  # reverse the list

            quality_plus_mean = [(quality, mean_quality) for quality in read_quality]
            qualities_reverse.append(np.asarray(quality_plus_mean))
            # qualities_reverse.append(read.query_qualities)

        # get mismatches
        alignment = read.get_aligned_pairs(matches_only=True, with_seq=True)
        read_has_indels = False
        for base in alignment:  # dispatch mismatches in matrix
            pos, subst, read_has_indels = modeller.dispatch_subst(base, read, read_has_indels)
            if read.is_read1 and subst is not None:
                subst_matrix_f[pos, subst] += 1
            elif read.is_read2 and subst is not None:
                subst_matrix_r[pos, subst] += 1
        if read_has_indels:  # dispatch indels in matrix
            for pos, indel in modeller.dispatch_indels(read):
                if read.is_read1:
                    indel_matrix_f[pos, indel] += 1
                elif read.is_read2:
                    indel_matrix_r[pos, indel] += 1

    logger.info("Calculating mean and base quality distribution")
    quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
    quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)

    # getting distribution of mean sequence quality
    mean_f = [len(quality_bin) for quality_bin in quality_bins_f]
    mean_r = [len(quality_bin) for quality_bin in quality_bins_r]

    hists_f = modeller.quality_bins_to_histogram(quality_bins_f)
    hists_r = modeller.quality_bins_to_histogram(quality_bins_r)

    # modern illumina instruments return reads of the same length
    # in case our bam file contains aligned reads of different length,
    # we coerce the model's read length to the smallest read of the bam file
    length_forward = min((len(x) for x in hists_f if len(x) > 1))
    length_reverse = min((len(x) for x in hists_r if len(x) > 1))
    read_length = min(length_forward, length_reverse)

    # now we can resize the substitution and indel matrices before
    # doing operations on them
    subst_matrix_f.resize([read_length, 16], refcheck=False)
    subst_matrix_r.resize([read_length, 16], refcheck=False)
    indel_matrix_f.resize([read_length, 9], refcheck=False)
    indel_matrix_r.resize([read_length, 9], refcheck=False)

    logger.info("Calculating substitution rate")
    subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length)
    subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length)

    logger.info("Calculating indel rate")
    # update the base count in indel matrices
    for position in range(read_length):
        indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4])
        indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4])

    ins_f, del_f = modeller.indel_matrix_to_choices(indel_matrix_f, read_length)
    ins_r, del_r = modeller.indel_matrix_to_choices(indel_matrix_r, read_length)

    logger.info("Calculating insert size distribution")
    # insert_size = int(np.mean(insert_size_dist))
    hist_insert_size = modeller.insert_size(template_length_dist, read_length)

    write_to_file(
        "kde",
        read_length,
        mean_f,
        mean_r,
        hists_f,
        hists_r,
        subst_f,
        subst_r,
        ins_f,
        ins_r,
        del_f,
        del_r,
        hist_insert_size,
        output + ".npz",
    )
Source code for iss.bam

InSilicoSeq

Navigation

Related Topics