Source code for iss.error_models
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from builtins import dict, range, zip
from iss import util
import sys
import random
import logging
import numpy as np
[docs]class ErrorModel(object):
"""Main ErrorModel Class
This class is used to create inheriting classes and contains all
the functions that are shared by all ErrorModel classes
"""
@property
def logger(self):
component = "{}.{}".format(type(self).__module__, type(self).__name__)
return logging.getLogger(component)
[docs] def load_npz(self, npz_path, model):
"""load the error profile .npz file
Args:
npz_path (string): path to the npz file
model (string): type of model. Could be 'cdf' or 'kde'. 'cdf' has
been deprecated and is no longer available
Returns:
ndarray: numpy object containg variables necessary
for error model construction
"""
try:
error_profile = np.load(npz_path)
assert error_profile['model'] == model
except (OSError, IOError) as e:
self.logger.error('Failed to read ErrorModel file: %s' % e)
sys.exit(1)
except AssertionError as e:
self.logger.error(
'Trying to load a %s ErrorModel in %s mode' % (
error_profile['model'], model))
sys.exit(1)
else:
self.logger.debug('Loaded ErrorProfile: %s' % npz_path)
return error_profile
[docs] def introduce_error_scores(self, record, orientation):
"""Add phred scores to a SeqRecord according to the error_model
Args:
record (SeqRecord): a read record
orientation (string): orientation of the read. Can be 'forward' or
'reverse'
Returns:
SeqRecord: a read record with error scores
"""
if orientation == 'forward':
record.letter_annotations["phred_quality"] = self.gen_phred_scores(
self.quality_forward, 'forward')
elif orientation == 'reverse':
record.letter_annotations["phred_quality"] = self.gen_phred_scores(
self.quality_reverse, 'reverse')
return record
[docs] def mut_sequence(self, record, orientation):
"""Introduce substitution errors to a sequence
If a random probability is higher than the probability of the basecall
being correct, introduce a substitution error
Args:
record (SeqRecord): a read record with error scores
orientation (string): orientation of the read. Can be 'forward' or
'reverse'
Returns:
Seq: a sequence
"""
# get the right subst_matrix
if orientation == 'forward':
nucl_choices = self.subst_choices_for
elif orientation == 'reverse':
nucl_choices = self.subst_choices_rev
mutable_seq = record.seq.tomutable()
quality_list = record.letter_annotations["phred_quality"]
position = 0
for nucl, qual in zip(mutable_seq, quality_list):
if random.random() > util.phred_to_prob(qual) \
and nucl.upper() not in 'RYWSMKHBVDN':
mutable_seq[position] = str(np.random.choice(
nucl_choices[position][nucl.upper()][0],
p=nucl_choices[position][nucl.upper()][1]))
position += 1
return mutable_seq.toseq()
[docs] def adjust_seq_length(self, mut_seq, orientation, full_sequence, bounds):
"""Truncate or Extend reads to make them fit the read length
When insertions or deletions are introduced to the reads, their length
will change. This function takes a (mutable) read and a reference
sequence, and extend or truncate the read if it has had an insertion
or a deletion
Args:
mut_seq (MutableSeq): a mutable sequence
orientation (string): orientation of the read. Can be 'forward' or
'reverse'
full_sequence (Seq): the reference sequence from which mut_seq
comes from
bounds (tuple): the position of the read in the full_sequence
Returns:
Seq: a sequence fitting the ErrorModel
"""
read_start, read_end = bounds
if len(mut_seq) == self.read_length:
return mut_seq.toseq()
elif len(mut_seq) > self.read_length:
while len(mut_seq) > self.read_length:
mut_seq.pop()
return mut_seq.toseq()
else: # len smaller
to_add = self.read_length - len(mut_seq)
if orientation == 'forward':
for i in range(to_add):
if read_end + i > len(full_sequence):
nucl_to_add = 'A'
else:
nucl_to_add = str(full_sequence[read_end + i])
mut_seq.append(nucl_to_add)
elif orientation == 'reverse':
for i in range(to_add):
nucl_to_add = util.rev_comp(full_sequence[read_end + i])
mut_seq.append(nucl_to_add)
return mut_seq.toseq()
[docs] def introduce_indels(self, record, orientation, full_seq, bounds):
"""Introduce insertions or deletions in a sequence
Introduce insertion and deletion errors according to the probabilities
present in the indel choices list
Args:
record (SeqRecord): a sequence record
orientation (string): orientation of the read. Can be 'forward' or
'reverse'
full_seq (Seq): the reference sequence from which mut_seq
comes from
bounds (tuple): the position of the read in the full_sequence
Returns:
Seq: a sequence with (eventually) indels
"""
# get the right indel arrays
if orientation == 'forward':
insertions = self.ins_for
deletions = self.del_for
elif orientation == 'reverse':
insertions = self.ins_rev
deletions = self.del_rev
mutable_seq = record.seq.tomutable()
position = 0
for nucl in range(self.read_length - 1):
try:
# skip ambiguous nucleotides
if mutable_seq[nucl].upper() in 'RYWSMKHBVDN':
position += 1
continue
for nucl_to_insert, prob in insertions[position].items():
if random.random() < prob:
# we want to insert after the base read
mutable_seq.insert(position + 1, str(nucl_to_insert))
if random.random() < deletions[position][mutable_seq[nucl].upper()]:
mutable_seq.pop(position)
position += 1
except IndexError as e:
continue
seq = self.adjust_seq_length(
mutable_seq, orientation, full_seq, bounds)
return seq