#!/usr/bin/env python
# -*- coding: utf-8 -*-
from iss import bam
from iss import util
from iss import download
from iss import abundance
from iss import generator
from iss.version import __version__
from Bio import SeqIO
from joblib import Parallel, delayed
import os
import sys
import random
import logging
import argparse
import numpy as np
[docs]def generate_reads(args):
"""Main function for the `iss generate` submodule
This submodule generates reads from an ErrorModel and write them to
args.output + _R(1|2).fastq
Args:
args (object): the command-line arguments from argparse
"""
logger = logging.getLogger(__name__)
logger.debug('iss version %s' % __version__)
logger.debug('Using verbose logger')
try: # try to import and load the correct error model
logger.info('Starting iss generate')
logger.info('Using %s ErrorModel' % args.mode)
if args.seed:
logger.info('Setting random seed to %i' % args.seed)
random.seed(args.seed)
np.random.seed(args.seed)
if args.mode == 'kde':
from iss.error_models import kde
if args.model is None:
logger.error('--model is required in --mode kde')
sys.exit(1)
elif args.model.lower() == 'hiseq':
npz = os.path.join(
os.path.dirname(__file__),
'profiles/HiSeq')
elif args.model.lower() == 'novaseq':
npz = os.path.join(
os.path.dirname(__file__),
'profiles/NovaSeq')
elif args.model.lower() == 'miseq':
npz = os.path.join(
os.path.dirname(__file__),
'profiles/MiSeq')
else:
npz = args.model
err_mod = kde.KDErrorModel(npz)
elif args.mode == 'basic':
if args.model is not None:
logger.warning('--model %s will be ignored in --mode %s' % (
args.model, args.mode))
from iss.error_models import basic
err_mod = basic.BasicErrorModel()
except ImportError as e:
logger.error('Failed to import ErrorModel module: %s' % e)
sys.exit(1)
try: # try to read genomes and concatenate --genomes and --ncbi genomes
if args.genomes or args.draft or args.ncbi:
genome_files = []
if args.genomes:
genome_files.extend(args.genomes)
if args.draft:
logger.warning('--draft is in early experimental stage.')
logger.warning(
'--draft disables --abundance_file and --coverage')
logger.warning('Defaulting to --abundance.')
genome_files.extend(args.draft)
if args.ncbi and args.n_genomes_ncbi:
util.genome_file_exists(args.output + '_ncbi_genomes.fasta')
total_genomes_ncbi = []
try:
assert len(*args.ncbi) == len(*args.n_genomes_ncbi)
except AssertionError as e:
logger.error(
'--ncbi and --n_genomes_ncbi of unequal lengths. \
Aborting')
sys.exit(1)
# this is py3 only
# for g, n in zip(*args.ncbi, *args.n_genomes):
# py2 compatibilty workaround
# TODO switch to the more elegant solution when we drop python2
args.ncbi = [x for y in args.ncbi
for x in y]
args.n_genomes_ncbi = [x for y in args.n_genomes_ncbi
for x in y]
for g, n in zip(args.ncbi, args.n_genomes_ncbi):
genomes_ncbi = download.ncbi(
g, n, args.output + '_ncbi_genomes.fasta')
genome_files.append(genomes_ncbi)
else:
logger.error("One of --genomes/-g, --draft, --ncbi/-k is required")
sys.exit(1)
genome_file = args.output + '.iss.tmp.genomes.fasta'
util.concatenate(
genome_files,
output=genome_file)
assert os.stat(genome_file).st_size != 0
f = open(genome_file, 'r')
with f: # count the number of records
genome_list = util.count_records(f)
except IOError as e:
logger.error('Failed to open genome(s) file:%s' % e)
raise
sys.exit(1)
except AssertionError as e:
logger.error('Genome(s) file seems empty: %s' % genome_file)
sys.exit(1)
except KeyboardInterrupt as e:
logger.error('iss generate interrupted: %s' % e)
sys.exit(1)
else:
abundance_dispatch = {
'uniform': abundance.uniform,
'halfnormal': abundance.halfnormal,
'exponential': abundance.exponential,
'lognormal': abundance.lognormal,
'zero_inflated_lognormal': abundance.zero_inflated_lognormal
}
# read the abundance file
if args.abundance_file and not args.draft:
logger.info('Using abundance file:%s' % args.abundance_file)
abundance_dic = abundance.parse_abundance_file(args.abundance_file)
elif args.coverage and not args.draft:
logger.warning('--coverage is an experimental feature')
logger.info('Using coverage file:%s' % args.coverage)
abundance_dic = abundance.parse_abundance_file(args.coverage)
elif args.abundance in abundance_dispatch:
logger.info('Using %s abundance distribution' % args.abundance)
if args.draft:
abundance_dic = abundance.draft(
genome_list,
args.draft,
abundance_dispatch[args.abundance],
args.output)
else:
abundance_dic = abundance_dispatch[
args.abundance](genome_list)
abundance.to_file(abundance_dic, args.output)
else:
logger.error('Could not get abundance')
sys.exit(1)
cpus = args.cpus
logger.info('Using %s cpus for read generation' % cpus)
if not args.coverage:
n_reads = util.convert_n_reads(args.n_reads)
logger.info('Generating %s reads' % n_reads)
try:
temp_file_list = [] # list holding the prefix of all temp files
f = open(genome_file, 'r') # re-opens the file
with f:
fasta_file = SeqIO.parse(f, 'fasta')
if args.n_genomes and not args.ncbi:
n = args.n_genomes[0][0]
else:
n = None
for record in util.reservoir(fasta_file, genome_list, n):
# generate reads for records
try:
species_abundance = abundance_dic[record.id]
except KeyError as e:
logger.error(
'Fasta record not found in abundance file: %s' % e)
sys.exit(1)
else:
logger.info('Generating reads for record: %s'
% record.id)
genome_size = len(record.seq)
if args.coverage:
coverage = species_abundance
else:
coverage = abundance.to_coverage(
n_reads,
species_abundance,
err_mod.read_length,
genome_size
)
n_pairs = int(round(
(coverage *
len(record.seq)) / err_mod.read_length) / 2)
# exact n_reads for each cpus
if n_pairs % cpus == 0:
n_pairs_per_cpu = [(n_pairs // cpus)
for _ in range(cpus)]
else:
n_pairs_per_cpu = [(n_pairs // cpus)
for _ in range(cpus)]
n_pairs_per_cpu[-1] += n_pairs % cpus
record_file_name_list = Parallel(n_jobs=cpus)(
delayed(generator.reads)(
record, err_mod,
n_pairs_per_cpu[i], i, args.output, args.seed,
args.gc_bias) for i in range(cpus))
temp_file_list.extend(record_file_name_list)
except KeyboardInterrupt as e:
logger.error('iss generate interrupted: %s' % e)
temp_file_unique = list(set(temp_file_list))
temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
full_tmp_list = temp_R1 + temp_R2
full_tmp_list.append(genome_file)
util.cleanup(full_tmp_list)
sys.exit(1)
else:
# remove the duplicates in file list and cleanup
# we remove the duplicates in case two records had the same header
# and reads were appended to the same temp file.
temp_file_unique = list(set(temp_file_list))
temp_R1 = [temp_file + '_R1.fastq' for temp_file in temp_file_list]
temp_R2 = [temp_file + '_R2.fastq' for temp_file in temp_file_list]
util.concatenate(temp_R1, args.output + '_R1.fastq')
util.concatenate(temp_R2, args.output + '_R2.fastq')
full_tmp_list = temp_R1 + temp_R2
full_tmp_list.append(genome_file)
util.cleanup(full_tmp_list)
logger.info('Read generation complete')
[docs]def model_from_bam(args):
"""Main function for the `iss model` submodule
This submodule write all variables necessary for building an ErrorModel
to args.output + .npz
Args:
args (object): the command-line arguments from argparse
"""
logger = logging.getLogger(__name__)
logger.debug('iss version %s' % __version__)
logger.debug('Using verbose logger')
try: # try to import bam module and write model data to file
logger.info('Starting iss model')
from iss import bam
except ImportError as e:
logger.error('Failed to import bam module: %s' % e)
sys.exit(1)
else:
logger.info('Using KDE ErrorModel')
bam.to_model(args.bam, args.output)
logger.info('Model generation complete')
[docs]def main():
parser = argparse.ArgumentParser(
prog='InSilicoSeq',
usage='iss [subcommand] [options]',
description='InSilicoSeq: A sequencing simulator'
)
parser.add_argument(
'-v',
'--version',
action='store_true',
default=False,
help='print software version and exit'
)
subparsers = parser.add_subparsers(
title='available subcommands',
metavar=''
)
parser_mod = subparsers.add_parser(
'model',
prog='iss model',
description='generate an error model from a bam file',
help='generate an error model from a bam file'
)
parser_gen = subparsers.add_parser(
'generate',
prog='iss generate',
description='simulate reads from an error model',
help='simulate reads from an error model'
)
# arguments form the read generator module
param_logging = parser_gen.add_mutually_exclusive_group()
# --genomes and --ncbi should not be exclusive anymore
# input_genomes = parser_gen.add_mutually_exclusive_group()
input_abundance = parser_gen.add_mutually_exclusive_group()
param_logging.add_argument(
'--quiet',
'-q',
action='store_true',
default=False,
help='Disable info logging. (default: %(default)s).'
)
param_logging.add_argument(
'--debug',
'-d',
action='store_true',
default=False,
help='Enable debug logging. (default: %(default)s).'
)
parser_gen.add_argument(
'--seed',
type=int,
metavar='<int>',
help='Seed all the random number generators',
default=None
)
parser_gen.add_argument(
'--cpus',
'-p',
default=2,
type=int,
metavar='<int>',
help='number of cpus to use. (default: %(default)s).'
)
parser_gen.add_argument(
'--genomes',
'-g',
metavar='<genomes.fasta>',
nargs="+",
help='Input genome(s) from where the reads will originate'
)
parser_gen.add_argument(
'--draft',
metavar='<draft.fasta>',
nargs="+",
help='Input draft genome(s) from where the reads will originate'
)
parser_gen.add_argument(
'--n_genomes',
'-u',
type=int,
action='append',
metavar='<int>',
help='How many genomes will be used for the simulation. is set with \
--genomes/-g or/and --draft to take random genomes from the \
input multifasta'
)
parser_gen.add_argument(
'--ncbi',
'-k',
choices=['bacteria', 'viruses', 'archaea'],
action='append',
nargs='*',
metavar='<str>',
help='Download input genomes from NCBI. Requires --n_genomes/-u\
option. Can be bacteria, viruses, archaea or a combination of the\
three (space-separated)'
)
parser_gen.add_argument(
'--n_genomes_ncbi',
'-U',
type=int,
action='append',
metavar='<int>',
nargs='*',
help='How many genomes will be downloaded from NCBI. Required if\
--ncbi/-k is set. If more than one kingdom is set with --ncbi,\
multiple values are necessary (space-separated).'
)
input_abundance.add_argument(
'--abundance',
'-a',
choices=['uniform', 'halfnormal',
'exponential', 'lognormal', 'zero_inflated_lognormal'],
metavar='<str>',
default='lognormal',
help='abundance distribution (default: %(default)s). Can be uniform,\
halfnormal, exponential, lognormal or zero-inflated-lognormal.'
)
input_abundance.add_argument(
'--abundance_file',
'-b',
metavar='<abundance.txt>',
help='abundance file for coverage calculations (default: %(default)s).'
)
input_abundance.add_argument(
'--coverage',
'-C',
metavar='<coverage.txt>',
help='file containing coverage information (default: %(default)s).'
)
parser_gen.add_argument(
'--n_reads',
'-n',
metavar='<int>',
default='1000000',
help='Number of reads to generate (default: %(default)s). Allows \
suffixes k, K, m, M, g and G (ex 0.5M for 500000).'
)
parser_gen.add_argument(
'--mode',
'-e',
metavar='<str>',
choices=['kde', 'basic'],
default='kde',
help='Error model. If not specified, using kernel density estimation \
(default: %(default)s). Can be kde or basic.'
)
parser_gen.add_argument(
'--model',
'-m',
metavar='<npz>',
default=None,
help='Error model file. (default: %(default)s). Use HiSeq, NovaSeq or \
MiSeq for a pre-computed error model provided with the software, or a \
file generated with iss model. If you do not wish to use a model, use \
--mode basic. The name of the built-in models is case insensitive.'
)
parser_gen.add_argument(
'--gc_bias',
'-c',
action='store_true',
default=False,
help='If set, may fail to sequence reads with abnormal GC content. \
Does not guarantee --n_reads (default: %(default)s)'
)
parser_gen.add_argument(
'--output',
'-o',
metavar='<fastq>',
help='Output file prefix (Required)',
required=True
)
parser_gen._optionals.title = 'arguments'
parser_gen.set_defaults(func=generate_reads)
# arguments for the error model module
parser_mod.add_argument(
'--quiet',
'-q',
action='store_true',
default=False,
help='Disable info logging. (default: %(default)s).'
)
parser_mod.add_argument(
'--debug',
'-d',
action='store_true',
default=False,
help='Enable debug logging. (default: %(default)s).'
)
parser_mod.add_argument(
'--bam',
'-b',
metavar='<bam>',
help='aligned reads from which the model will be inferred (Required)',
required=True
)
parser_mod.add_argument(
'--output',
'-o',
metavar='<npz>',
help='Output file prefix (Required)',
required=True
)
parser_mod._optionals.title = 'arguments'
parser_mod.set_defaults(func=model_from_bam)
args = parser.parse_args()
# set logger and display version if args.version
try:
if args.version:
print('iss version %s' % __version__)
sys.exit(0)
elif args.quiet:
logging.basicConfig(level=logging.ERROR)
elif args.debug:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
args.func(args)
logging.shutdown()
except AttributeError as e:
logger = logging.getLogger(__name__)
logger.debug(e)
parser.print_help()
# raise # extra traceback to uncomment if all hell breaks lose