#!/usr/bin/env python
# -*- coding: utf-8 -*-
from iss import util
from iss import modeller
from scipy import stats
from random import random
import sys
import pysam
import logging
import numpy as np
def read_bam(bam_file, n_reads=1000000):
    """Bam file reader. Yield random mapped reads from a bam file.

    Mapped reads are subsampled uniformly at random so that roughly
    ``n_reads`` reads are yielded in total.

    Args:
        bam_file (string): path to an indexed bam file
        n_reads (int): target number of reads to yield (default 1000000)

    Yields:
        read: a pysam aligned segment (mapped reads only)
    """
    logger = logging.getLogger(__name__)
    try:
        # idxstats reports per-contig stats; column 3 is the mapped
        # read count, which we sum to get the sampling denominator
        lines = pysam.idxstats(bam_file).splitlines()
        total_records = sum(int(l.split("\t")[2])
                            for l in lines if not l.startswith("#"))
        # ZeroDivisionError here (empty bam) is caught below
        random_fraction = n_reads / total_records
        bam = pysam.AlignmentFile(bam_file, 'rb')  # reopen the file
    except (IOError, ValueError,
            ZeroDivisionError, pysam.utils.SamtoolsError) as e:
        logger.error('Failed to read bam file: %s' % e)
        sys.exit(1)
    else:
        logger.info('Reading bam file: %s' % bam_file)
        c = 0
        with bam:
            for read in bam.fetch():
                # check the cap on every iteration, not only when a read
                # fails the sampling test -- otherwise we overshoot n_reads
                if c >= n_reads:
                    break
                if not read.is_unmapped and random() < random_fraction:
                    c += 1
                    if logger.getEffectiveLevel() == logging.DEBUG:
                        # \r keeps the progress counter on a single line
                        print(
                            'DEBUG:iss.bam:Subsampling %s / %s reads' % (
                                c, n_reads),
                            end='\r')
                    yield read
def write_to_file(model, read_length, mean_f, mean_r, hist_f, hist_r,
                  sub_f, sub_r, ins_f, ins_r, del_f, del_r, i_size, output):
    """Write model variables to a compressed .npz file.

    Args:
        model (string): the type of error model
        read_length (int): read length of the dataset
        mean_f (list): list of mean bin sizes
        mean_r (list): list of mean bin sizes
        hist_f (list): list of cumulative distribution functions for the
            forward read quality
        hist_r (list): list of cumulative distribution functions for the
            reverse read quality
        sub_f (list): list of dictionaries representing the substitution
            probabilities for the forward reads
        sub_r (list): list of dictionaries representing the substitution
            probabilities for the reverse reads
        ins_f (list): list of dictionaries representing the insertion
            probabilities for the forward reads
        ins_r (list): list of dictionaries representing the insertion
            probabilities for the reverse reads
        del_f (list): list of dictionaries representing the deletion
            probabilities for the forward reads
        del_r (list): list of dictionaries representing the deletion
            probabilities for the reverse reads
        i_size (int): distribution of insert size for the aligned reads
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)
    try:
        logger.info('Writing model to file: %s' % output)
        np.savez_compressed(
            output,
            model=model,
            read_length=read_length,
            insert_size=i_size,
            mean_count_forward=mean_f,
            mean_count_reverse=mean_r,
            quality_hist_forward=hist_f,
            quality_hist_reverse=hist_r,
            subst_choices_forward=sub_f,
            subst_choices_reverse=sub_r,
            ins_forward=ins_f,
            ins_reverse=ins_r,
            del_forward=del_f,
            del_reverse=del_r
        )
    except PermissionError as e:
        logger.error('Failed to open output file: %s' % e)
        sys.exit(1)
def _quality_plus_mean(read):
    """Return an array of (base quality, read mean quality) pairs for a read.

    Reverse-mapped reads have their quality string flipped back to
    sequencing orientation before pairing.
    """
    read_quality = read.query_qualities
    mean_quality = np.mean(read_quality)
    if read.is_reverse:
        read_quality = read_quality[::-1]  # restore sequencing orientation
    return np.asarray(
        [(quality, mean_quality) for quality in read_quality])


def to_model(bam_path, output):
    """From a bam file, write all variables needed for modelling reads in
    a .npz model file.

    For a brief description of the variables that will be written to the
    output file, see the bam.write_to_file function.

    Args:
        bam_path (string): path to a bam file
        output (string): prefix of the output file
    """
    logger = logging.getLogger(__name__)
    insert_size_dist = []
    qualities_forward = []
    qualities_reverse = []
    # we don't know the read length yet; 301 is an upper bound and the
    # matrices are resized down once the real read length is known
    subst_matrix_f = np.zeros([301, 16])
    subst_matrix_r = np.zeros([301, 16])
    indel_matrix_f = np.zeros([301, 9])
    indel_matrix_r = np.zeros([301, 9])

    # read the bam file and extract info needed for modelling
    for read in read_bam(bam_path):
        # get insert size distribution
        if read.is_proper_pair:
            template_length = abs(read.template_length)
            i_size = template_length - (2 * len(read.seq))
            insert_size_dist.append(i_size)

        # get base qualities, paired with the read's mean quality
        if read.is_read1:
            qualities_forward.append(_quality_plus_mean(read))
        elif read.is_read2:
            qualities_reverse.append(_quality_plus_mean(read))

        # get mismatches
        alignment = read.get_aligned_pairs(
            matches_only=True,
            with_seq=True
        )
        read_has_indels = False
        for base in alignment:  # dispatch mismatches in matrix
            pos, subst, read_has_indels = modeller.dispatch_subst(
                base, read, read_has_indels)
            if read.is_read1 and subst is not None:
                subst_matrix_f[pos, subst] += 1
            elif read.is_read2 and subst is not None:
                subst_matrix_r[pos, subst] += 1
        if read_has_indels:  # dispatch indels in matrix
            for pos, indel in modeller.dispatch_indels(read):
                if read.is_read1:
                    indel_matrix_f[pos, indel] += 1
                elif read.is_read2:
                    indel_matrix_r[pos, indel] += 1

    logger.info('Calculating insert size distribution')
    hist_insert_size = modeller.insert_size(insert_size_dist)

    logger.info('Calculating mean and base quality distribution')
    quality_bins_f = modeller.divide_qualities_into_bins(qualities_forward)
    quality_bins_r = modeller.divide_qualities_into_bins(qualities_reverse)

    # getting distribution of mean sequence quality
    mean_f = [len(quality_bin) for quality_bin in quality_bins_f]
    mean_r = [len(quality_bin) for quality_bin in quality_bins_r]

    hists_f = modeller.quality_bins_to_histogram(quality_bins_f)
    hists_r = modeller.quality_bins_to_histogram(quality_bins_r)

    # modern illumina instruments return reads of the same length.
    # in case our bam file contains aligned reads of different length,
    # we coerce the model's read length to the smallest read of the bam file
    length_forward = min(len(x) for x in hists_f if len(x) > 1)
    length_reverse = min(len(x) for x in hists_r if len(x) > 1)
    read_length = min(length_forward, length_reverse)

    # now we can resize the substitution and indel matrices before
    # doing operations on them
    subst_matrix_f.resize([read_length, 16], refcheck=False)
    subst_matrix_r.resize([read_length, 16], refcheck=False)
    indel_matrix_f.resize([read_length, 9], refcheck=False)
    indel_matrix_r.resize([read_length, 9], refcheck=False)

    logger.info('Calculating substitution rate')
    subst_f = modeller.subst_matrix_to_choices(subst_matrix_f, read_length)
    subst_r = modeller.subst_matrix_to_choices(subst_matrix_r, read_length)

    logger.info('Calculating indel rate')
    # update the base count in indel matrices: column 0 holds the total
    # match count, taken from every 4th substitution-matrix column
    for position in range(read_length):
        indel_matrix_f[position][0] = sum(subst_matrix_f[position][::4])
        indel_matrix_r[position][0] = sum(subst_matrix_r[position][::4])

    ins_f, del_f = modeller.indel_matrix_to_choices(
        indel_matrix_f, read_length)
    ins_r, del_r = modeller.indel_matrix_to_choices(
        indel_matrix_r, read_length)

    write_to_file(
        'kde',
        read_length,
        mean_f,
        mean_r,
        hists_f,
        hists_r,
        subst_f,
        subst_r,
        ins_f,
        ins_r,
        del_f,
        del_r,
        hist_insert_size,
        output + '.npz')