Source code for iss.app

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import logging
import multiprocessing as mp
import os
import sys

from Bio import SeqIO

from iss import util
from iss.generator import (
    generate_work_divider,
    load_error_model,
    load_genomes,
    load_readcount_or_abundance,
    worker_iterator,
)
from iss.version import __version__


[docs] def generate_reads(args): """Main function for the `iss generate` submodule This submodule generates reads from an ErrorModel and write them to args.output + _R(1|2).fastq Args: args (object): the command-line arguments from argparse """ logger = logging.getLogger(__name__) logger.debug("iss version %s" % __version__) logger.debug("Using verbose logger") logger.info("Starting iss generate") error_model = load_error_model( args.mode, args.seed, args.model, args.fragment_length, args.fragment_length_sd, args.store_mutations ) genome_list, genome_file = load_genomes( args.genomes, args.draft, args.ncbi, args.n_genomes_ncbi, args.output, args.n_genomes ) readcount_dic, abundance_dic = load_readcount_or_abundance( args.readcount_file, args.abundance_file, args.coverage_file, args.coverage, args.abundance, args.draft, genome_list, genome_file, args.n_reads, args.output, error_model, ) if args.store_mutations: logger.info(f"Storing inserted sequence errors in {args.output}.vcf") logger.info("Using %s cpus for read generation" % args.cpus) if readcount_dic is not None: n_reads = sum(readcount_dic.values()) logger.info("Generating %s reads" % n_reads) else: n_reads = util.convert_n_reads(args.n_reads) logger.info("Generating %s reads" % n_reads) try: # list holding the prefix for each cpu's temp file temp_file_list = [f"{args.output}.iss.tmp.{i}" for i in range(args.cpus)] f = open(genome_file, "r") # re-opens the file with f: fasta_file = SeqIO.parse(f, "fasta") # TODO check if a SeqIO.index (db) leads to better memory usage and is not slower # fasta_dict = SeqIO.index(f, 'fasta') # Calculate how many reads we want each cpu to generate n_read_pairs = n_reads // 2 chunk_size = -((n_read_pairs) // -args.cpus) # this is ceildiv, see https://stackoverflow.com/a/17511341 logger.debug("Chunk size: %s" % chunk_size) # Divide the work of generating n_reads for each record into chunks work_chunks = generate_work_divider( fasta_file, readcount_dic, abundance_dic, n_reads, args.coverage, args.coverage_file, error_model, args.output, chunk_size, ) # Generate reads for each chunk in parallel with mp.Pool(args.cpus) as pool: pool.starmap( worker_iterator, [ (work, error_model, cpu_number, worker_prefix, args.seed, args.sequence_type, args.gc_bias) for cpu_number, (work, worker_prefix) in enumerate(zip(work_chunks, temp_file_list)) ], ) except KeyboardInterrupt as e: logger.error("iss generate interrupted: %s" % e) temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list] temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list] temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list] full_tmp_list = temp_R1 + temp_R2 + temp_mut full_tmp_list.append(genome_file) if os.path.exists("%s.memmap" % args.output): full_tmp_list.append("%s.memmap" % args.output) util.cleanup(full_tmp_list) sys.exit(1) else: # remove the duplicates in file list and cleanup # we remove the duplicates in case two records had the same header # and reads were appended to the same temp file. temp_R1 = [temp_file + "_R1.fastq" for temp_file in temp_file_list] temp_R2 = [temp_file + "_R2.fastq" for temp_file in temp_file_list] temp_mut = [temp_file + ".vcf" for temp_file in temp_file_list] if args.store_mutations else [] util.concatenate(temp_R1, args.output + "_R1.fastq") util.concatenate(temp_R2, args.output + "_R2.fastq") if args.store_mutations: util.concatenate( temp_mut, args.output + ".vcf", "##fileformat=VCFv4.1\n" + "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]), ) full_tmp_list = temp_R1 + temp_R2 + temp_mut full_tmp_list.append(genome_file) if os.path.exists("%s.memmap" % args.output): full_tmp_list.append("%s.memmap" % args.output) util.cleanup(full_tmp_list) if args.compress: util.compress(args.output + "_R1.fastq") util.compress(args.output + "_R2.fastq") if args.store_mutations: util.compress(args.output + ".vcf") logger.info("Read generation complete")
[docs] def model_from_bam(args): """Main function for the `iss model` submodule This submodule write all variables necessary for building an ErrorModel to args.output + .npz Args: args (object): the command-line arguments from argparse """ logger = logging.getLogger(__name__) logger.debug("iss version %s" % __version__) logger.debug("Using verbose logger") try: # try to import bam module and write model data to file logger.info("Starting iss model") from iss import bam except ImportError as e: logger.error("Failed to import bam module: %s" % e) sys.exit(1) else: logger.info("Using KDE ErrorModel") bam.to_model(args.bam, args.output) logger.info("Model generation complete")
[docs] def main(): parser = argparse.ArgumentParser( prog="InSilicoSeq", usage="iss [subcommand] [options]", description="InSilicoSeq: A sequencing simulator", ) parser.add_argument( "-v", "--version", action="store_true", default=False, help="print software version and exit", ) subparsers = parser.add_subparsers(title="available subcommands", metavar="") parser_mod = subparsers.add_parser( "model", prog="iss model", description="generate an error model from a bam file", help="generate an error model from a bam file", ) parser_gen = subparsers.add_parser( "generate", prog="iss generate", description="simulate reads from an error model", help="simulate reads from an error model", ) # arguments form the read generator module param_logging = parser_gen.add_mutually_exclusive_group() # --genomes and --ncbi should not be exclusive anymore # input_genomes = parser_gen.add_mutually_exclusive_group() input_abundance = parser_gen.add_mutually_exclusive_group() param_logging.add_argument( "--quiet", "-q", action="store_true", default=False, help="Disable info logging. (default: %(default)s).", ) param_logging.add_argument( "--debug", "-d", action="store_true", default=False, help="Enable debug logging. (default: %(default)s).", ) parser_gen.add_argument( "--seed", type=int, metavar="<int>", help="Seed all the random number generators", default=None, ) parser_gen.add_argument( "--cpus", "-p", default=2, type=int, metavar="<int>", help="number of cpus to use. (default: %(default)s).", ) parser_gen.add_argument( "--genomes", "-g", metavar="<genomes.fasta>", nargs="+", help="Input genome(s) from where the reads will originate", ) parser_gen.add_argument( "--draft", metavar="<draft.fasta>", nargs="+", help="Input draft genome(s) from where the reads will originate", ) parser_gen.add_argument( "--n_genomes", "-u", type=int, metavar="<int>", help="How many genomes will be used for the simulation. is set with \ --genomes/-g or/and --draft to take random genomes from the \ input multifasta", ) parser_gen.add_argument( "--ncbi", "-k", choices=["bacteria", "viruses", "archaea"], action="append", nargs="*", metavar="<str>", help="Download input genomes from NCBI. Requires --n_genomes/-u\ option. Can be bacteria, viruses, archaea or a combination of the\ three (space-separated)", ) parser_gen.add_argument( "--n_genomes_ncbi", "-U", type=int, action="append", metavar="<int>", nargs="*", help="How many genomes will be downloaded from NCBI. Required if\ --ncbi/-k is set. If more than one kingdom is set with --ncbi,\ multiple values are necessary (space-separated).", ) input_abundance.add_argument( "--abundance", "-a", choices=[ "uniform", "halfnormal", "exponential", "lognormal", "zero_inflated_lognormal", ], metavar="<str>", default="lognormal", help="abundance distribution (default: %(default)s). Can be uniform,\ halfnormal, exponential, lognormal or zero-inflated-lognormal.", ) input_abundance.add_argument( "--abundance_file", "-b", metavar="<abundance.txt>", help="abundance file for coverage calculations (default: %(default)s).", ) input_abundance.add_argument( "--coverage", "-C", choices=[ "uniform", "halfnormal", "exponential", "lognormal", "zero_inflated_lognormal", ], metavar="<str>", help="coverage distribution. Can be uniform,\ halfnormal, exponential, lognormal or zero-inflated-lognormal.", ) input_abundance.add_argument( "--coverage_file", "-D", metavar="<coverage.txt>", help="file containing coverage information (default: %(default)s).", ) input_abundance.add_argument( "--readcount_file", "-R", metavar="<readcount.txt>", help="file containing read_count information (default: %(default)s).", ) parser_gen.add_argument( "--n_reads", "-n", metavar="<int>", default="1000000", help="Number of reads to generate (default: %(default)s). Allows \ suffixes k, K, m, M, g and G (ex 0.5M for 500000).", ) parser_gen.add_argument( "--mode", "-e", metavar="<str>", choices=["kde", "basic", "perfect"], default="kde", help="Error model. If not specified, using kernel density estimation \ (default: %(default)s). Can be kde, basic or perfect", ) parser_gen.add_argument( "--model", "-m", metavar="<npz>", default=None, help="Error model file. (default: %(default)s). Use HiSeq, NextSeq, NovaSeq, \ MiSeq or Miseq-[20,24,28,32] for a pre-computed error model provided with the \ software, or a file generated with iss model. If you do not wish to use a model, use \ --mode basic or --mode perfect. The name of the built-in models are \ case insensitive.", ) parser_gen.add_argument( "--gc_bias", "-c", action="store_true", default=False, help="If set, may fail to sequence reads with abnormal GC content. \ (default: %(default)s)", ) parser_gen.add_argument( "--compress", "-z", action="store_true", default=False, help="Compress the output in gzip format (default: %(default)s).", ) parser_gen.add_argument( "--output", "-o", metavar="<fastq>", help="Output file path and prefix (Required)", required=True, ) parser_gen.add_argument( "--sequence_type", "-t", choices=["metagenomics", "amplicon"], default="metagenomics", required=False, help="Type of sequencing. Can be metagenomics or amplicon (default: %(default)s).", ) parser_gen.add_argument( "--fragment-length", "-l", metavar="<int>", required=False, type=int, help="Fragment length for metagenomics sequencing (default: %(default)s).", ) parser_gen.add_argument( "--fragment-length-sd", "-s", metavar="<int>", required=False, type=int, help="Fragment length standard deviation for metagenomics sequencing (default: %(default)s).", ) parser_gen.add_argument( "--store_mutations", "-M", action="store_true", default=False, help="Generates an additional VCF file with the mutations introduced in the reads", ) parser_gen._optionals.title = "arguments" parser_gen.set_defaults(func=generate_reads) # arguments for the error model module parser_mod.add_argument( "--quiet", "-q", action="store_true", default=False, help="Disable info logging. (default: %(default)s).", ) parser_mod.add_argument( "--debug", "-d", action="store_true", default=False, help="Enable debug logging. (default: %(default)s).", ) parser_mod.add_argument( "--bam", "-b", metavar="<bam>", help="aligned reads from which the model will be inferred (Required)", required=True, ) parser_mod.add_argument( "--output", "-o", metavar="<npz>", help="Output file path and prefix (Required)", required=True, ) parser_mod._optionals.title = "arguments" parser_mod.set_defaults(func=model_from_bam) args = parser.parse_args() # set logger and display version if args.version try: if args.version: print("iss version %s" % __version__) sys.exit(0) elif args.quiet: logging.basicConfig(level=logging.ERROR) elif args.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) args.func(args) logging.shutdown() except AttributeError as e: logger = logging.getLogger(__name__) logger.debug(e) parser.print_help()
# raise # extra traceback to uncomment if all hell breaks lose