#!/opt/mambaforge/envs/bioconda/conda-bld/concoct_1758281759540/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placeh/bin/python
"""
A script to generate the input coverage table for CONCOCT using a BEDFile.

Output is written to stdout. The BEDFile defines the regions used as 
subcontigs for concoct. This makes it possible to get the coverage for 
subcontigs without specifically mapping reads against the subcontigs.

@author: inodb, alneberg
"""
from __future__ import print_function
import sys
import os
import argparse
import subprocess
import glob
from signal import signal, SIGPIPE, SIG_DFL
import pandas as pd

def check_bed_file_for_errors(bedfile):
    """Validate that a BED file follows the layout expected for CONCOCT.

    Every line must consist of exactly four tab-separated fields: the
    original contig id, two position fields, and the cut-up (subcontig) id.
    The original id must not contain 'concoct_part_', while the cut-up id
    must contain it.

    On the first offending line an error message is written to stderr and
    the process is terminated through ``sys.exit(-1)``.

    bedfile -- path to the BED file to check.
    """
    with open(bedfile) as ifh:
        for line in ifh:
            line = line.strip()
            fields = line.split('\t')

            # A wrong column count used to surface as an unhandled ValueError
            # from tuple unpacking; report it like any other malformed line.
            if len(fields) != 4:
                sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n"
                "Expected 4 tab-separated columns but found {}. Exiting!\n").format(line, len(fields)))
                sys.exit(-1)

            original_id, cutup_id = fields[0], fields[3]
            # Explicit checks instead of ``assert`` so that the validation
            # is still performed when Python runs with -O.
            if 'concoct_part_' in original_id or 'concoct_part_' not in cutup_id:
                sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n"
                "Perhaps 'concoct_part_' is misplaced or missing? Exiting!\n").format(line))
                sys.exit(-1)

def generate_input_table(bedfile, bamfiles, samplenames=None):
    """Run ``samtools bedcov`` and print the resulting coverage table.

    The samtools output is parsed as a headerless tab-separated table.
    Every column from the fifth onward (one per BAM file) is divided by the
    region length (third column minus second column), and the result is
    written to stdout as a tab-separated table indexed by the fourth column
    (labelled 'contig'), with values formatted to three decimals.

    bedfile     -- path to the BED file passed to ``samtools bedcov``.
    bamfiles    -- sequence of BAM file paths passed to ``samtools bedcov``.
    samplenames -- optional list of names used in the column headers
                   ('cov_mean_sample_<name>'); when None, the BAM file
                   names without directory and extension are used.

    Raises Exception if samtools exits with a non-zero return code.
    """
    # Local import, mirroring the original; io.StringIO accepts the decoded
    # (unicode) text on both Python 2 and Python 3.
    from io import StringIO

    p = subprocess.Popen(["samtools", "bedcov", bedfile] + list(bamfiles), stdout=subprocess.PIPE)

    # Only stdout is piped: samtools' own stderr goes straight to this
    # process' stderr, so the second element of communicate() is always None.
    out, _ = p.communicate()
    if p.returncode != 0:
        # ``out`` is bytes; decode it before forwarding, otherwise the write
        # fails on Python 3 and masks the real error below.
        if out:
            sys.stderr.write(out.decode('utf-8', 'replace'))
        raise Exception('Error with running samtools bedcov')

    # Header
    if samplenames is None:
        # Use the BAM file names (no directory, no extension) as sample names
        col_names = [os.path.splitext(os.path.basename(bamfile))[0] for bamfile in bamfiles]
    else:
        # Use given sample names in header
        col_names = samplenames
    header = ["cov_mean_sample_{}".format(n) for n in col_names]

    # Content
    fh = StringIO(out.decode('utf-8'))
    df = pd.read_table(fh, header=None)
    # Columns 0-3 echo the BED fields; the remaining ones are per-BAM values.
    avg_coverage_depth = df[df.columns[4:]].divide((df[2] - df[1]), axis=0)
    avg_coverage_depth.index = df[3]
    avg_coverage_depth.columns = header
    avg_coverage_depth.to_csv(sys.stdout, index_label='contig', sep='\t', float_format='%.3f')

if __name__ == "__main__":
    # Command-line entry point: parse arguments, load optional sample names,
    # validate the BED file and print the coverage table to stdout.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bedfile", help="Contigs BEDFile with four columns representing: 'Contig ID, Start Position, "
    "End Position and SubContig ID' respectively. The Subcontig ID must contain the pattern "
    "'concoct_part_[0-9]*' while the contigs which are not cutup cannot contain this pattern. "
    "This file can be generated by the cut_up_fasta.py script.")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to the original contigs.")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr "
    "of bamfiles. Default sample names used are the file names of the bamfiles, excluding the file extension.")
    args = parser.parse_args()

    # Get sample names
    if args.samplenames is not None:
        # Strip only the line terminator: slicing off the last character
        # would truncate the final name when the file lacks a trailing
        # newline, and would leave a '\r' behind on CRLF files.
        with open(args.samplenames) as names_fh:
            samplenames = [name_line.rstrip('\r\n') for name_line in names_fh]
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names ({0}) in samplenames should be equal to nr of given "
                            "bamfiles ({1})".format(len(samplenames), len(args.bamfiles)))
    else:
        samplenames = None

    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)
    check_bed_file_for_errors(args.bedfile)
    generate_input_table(args.bedfile, args.bamfiles, samplenames=samplenames)
