Source code for iss.error_models.kde
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from builtins import super
from iss import util
from iss.error_models import ErrorModel
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from scipy import stats
import sys
import random
import numpy as np
[docs]class KDErrorModel(ErrorModel):
"""KDErrorModel class.
Error model based on an .npz files derived from read alignments.
the npz file must contain:
- the length of the reads
- the mean insert size
- the size of mean sequence quality bins (for R1 and R2)
- a cumulative distribution function of quality scores for each position
(for R1 and R2)
- the substitution for each nucleotide at each position (for R1 and R2)
- the insertion and deletion rates for each position (for R1 and R2)
"""
def __init__(self, npz_path):
super().__init__()
self.npz_path = npz_path
self.error_profile = self.load_npz(npz_path, 'kde')
self.read_length = self.error_profile['read_length']
self.i_size_cdf = self.error_profile['insert_size']
self.mean_forward = self.error_profile['mean_count_forward']
self.mean_reverse = self.error_profile['mean_count_reverse']
self.quality_forward = self.error_profile['quality_hist_forward']
self.quality_reverse = self.error_profile['quality_hist_reverse']
self.subst_choices_for = self.error_profile['subst_choices_forward']
self.subst_choices_rev = self.error_profile['subst_choices_reverse']
self.ins_for = self.error_profile['ins_forward']
self.ins_rev = self.error_profile['ins_reverse']
self.del_for = self.error_profile['del_forward']
self.del_rev = self.error_profile['del_reverse']
self.error_profile = '' # discard the error profile after reading
[docs] def gen_phred_scores(self, cdfs, orientation):
"""Generate a list of phred scores based on cdfs and mean bins
For each position, draw a phred score from the cdf and append to the
phred score list
Args:
cdfs (ndarray): array containing the cdfs
orientation (string): orientation of the read. Can be 'forward' or
'reverse'
Returns:
list: a list of phred scores
"""
# choose which mean bin to use
if orientation == 'forward':
mean = self.mean_forward
elif orientation == 'reverse':
mean = self.mean_reverse
norm_mean = mean / sum(mean)
quality_bin = np.searchsorted(norm_mean, np.random.rand())
# downgrades index out of bound (ex rand is 1, last bin in searchsorted
# is 0.95) to best quality bin
if quality_bin == 4:
quality_bin = 3
cdfs_bin = cdfs[quality_bin]
phred_list = []
for cdf in cdfs_bin:
random_quality = np.searchsorted(cdf, np.random.rand())
phred_list.append(random_quality)
return phred_list
[docs] def random_insert_size(self):
"""Draw a random insert size from the insert size cdf
Args:
i_size_cdf: cumulative distribution function of the insert size
Returns:
int: an insert size
"""
insert_size = np.searchsorted(self.i_size_cdf, np.random.rand())
return insert_size