#!/usr/bin/perl

use strict;
use Getopt::Long;
use Data::Dumper;

Getopt::Long::Configure(qw(passthrough require_order no_ignore_case));

# TODO: put these  in a config file, it can still be perl...for now it's simple enough to edit here

my %nologic 	= map {($_, 1)} qw(if [ calc echo . alc for export);

# always send to grid engine, no matter how big
my %wrap 		= map {($_, 1)} qw(bowtie-build tophat juncsdb bwa cufflinks countannot calc-numaligned fastx_quality_stats);

# never send to grid engine, no matter how big
my %nowrap 	= map {($_, 1)} qw(condor_run run-analysis mv grun qpipe ea);

# always run these, regardless of dependencies - still send to grid if needed
my %nodep 		= map {($_, 1)} qw();

# this can be condor_run, qsub... whatever you want
my $grid_engine	= 'grun';

# alter default 'big' size and parameter parsing
my %info = (
		cp => {big=>'20g'},
		# input doesn't need to exist, output is calculated from the input
		gunzip => {big=>'10g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0]=~m/(.*)\.gz$/; return $1}}},
		gzip => {big=>'20g', in=>{_noex=>1}, out=>{_eval=>sub{$_[0].".gz"}}},

		'auto-clip.sh' => {out=>{_eval=>sub{$_[0]=~ s/\.(fastq|fq)//; return ($_[0].".qcStats", $_[0].".adapters")}}},
		bowtie => {big=>'20m'},
		bwa => {big=>'20m'},
		tophat => {big=>'10m'},
		cufflinks => {big=>'10m', in=>{b=>1, r=>1, G=>1}},
		'cufflinks0.9.3' => {big=>'10m', in=>{b=>1, r=>1, G=>1}},
		fastx_quality_stats => {in=>{i=>1}, out=>{o=>1}},
		samtools => { match => [
				{ pat=>qr/sort\s*(?:-n)?\s+(\S+bam)\s*(\S+)/, post=>'$out[0] .= ".bam"' } ,
				{ pat=>qr/pileup.*\s(\S+[bs]am)/ } ,
		] },
                bwasam =>   { match => [
                                { pat=>qr/(?:-o \S+\s+)?(?:\S+)\s+(\S+)/, post=>'$out[1] = "$out[0].sam"; $out[0] = "$out[0].bam"'} ,
                ] },
		# change to asterisk for dependency globbing
		'fastq-multx' => {out=>{o=>sub {s/%/\*/}}, in=>{l=>1, g=>1, B=>1}},

		'fastq-mcf' => {out=>{o=>1}},
);

our $VERSION = '0.9.46';

my ($debug, $check, $inline, $verbose, $help, $rules_file, $list, $noexec, $passopts, $waitout, $force);

$| = 1;

$waitout=1;	# by default we wait for grid jobs output

GetOptions("DEBUG"=>\$debug, "K"=>\$check, "force"=>\$force, "grid"=>\$grid_engine, "waitout"=>\$waitout, "cmd=s"=>\$inline, "noexec"=>\$noexec, "l"=>\$list, "verbose|v"=>\$verbose, "help"=>\$help, "rules=s"=>\$rules_file) || die usage();

$verbose = 1 if $debug;
$noexec = 1 if $debug;

$passopts = '-v' if $verbose;
$passopts .= ' -l' if $list;
$passopts .= ' -D' if $debug;
$passopts .= ' -n' if $noexec;
$passopts .= ' -f' if $force;

if ($help || $ARGV[0] =~ /^-\?/) {
	print usage();
	exit 0
}

sub reptail {
	my ($f, $r) = @_;
	$f =~ s/\Q$r\E$//;
	return $f;
}

sub repwith {
        my ($f, $r, $t) = @_;
        $f =~ s/\Q$r\E/$t/;
        return $f;
}

if ($check) {
	# base executable name
	my $orig_cmd = shift @ARGV;
	my $cmd = $orig_cmd;
	$cmd =~ s/.*\///;

	my $cmd_nv = $cmd;
	$cmd_nv =~ s/\-[0-9.-]+//;
	$cmd_nv = $cmd if $cmd_nv !~ /^[\w-]+$/;


	my @argv = @ARGV;
	print STDERR "QSH preex '$orig_cmd @argv'\n" if $debug;
	for (@argv) {
		# bash vars come in unexpanded... expand them
		s/\$(\w+)/$ENV{$1}/g;
		s/\${(\w+)}/$ENV{$1}/g;
		s/\${(\w+)\%([^{}]+)}/reptail($ENV{$1},$2)/ge;
		s/\${(\w+)\/([^\/]+)\/([^{}]+)}/repwith($ENV{$1},$2,$3)/ge;
	}	
	print STDERR "QSH top '$cmd @argv'\n" if $debug;
	# don't try to alter non executable lines
	exit 0 if $cmd !~ /^[\w.-]+$/;

	# don't ever check dependencies, or wrap these commands
	if ($nologic{$cmd}) {
		print STDERR "QSH ignore command '$cmd'\n" if $verbose;
		if ($debug) {
			print STDERR "QSH debug: '$cmd @ARGV'";
		}
		exit 0;
	}

	print STDERR "QSH check command $cmd\n" if $verbose;

	# walk through arguments, determining inputs, outputs and total sizes 	
	my ($opt, %opt, $first, $firstnox, $last, @in, @out, $sz);
	my $info = $info{$cmd} ? $info{$cmd} : $info{$cmd_nv};
	for (@argv) {
		if (s/^--?//) {
			$opt = $_;
			next;
		}
		if ($opt) {
			if ($opt eq '<' || $info->{in}->{$opt}) {
				push @in, $_;
			}
			if ($opt eq '>>' || $opt eq '>' || $info->{out}->{$opt}) {
				push @out, $_;
			}
			$opt = '';
			next;
		}
		$opt = '>' if ($_ eq '>');
		$opt = '<' if ($_ eq '<');
		$_ =~ s/^~/$ENV{HOME}/;
		if (!$firstnox) {
				$firstnox = $_ 
		}
		if (!$first) {
			if ( $info->{in}->{_noex} || -e $_ ) {
				$first = $_ 
			}
		} else {
			$last = $_;
		}
		$sz += -s $_;
	}

	push @in, $first if $first && (!@in);		# no input ? assume first file that exists
	push @in, $firstnox if $firstnox && (!@in);	# no input ? assume first file

	if ( $info->{match} ) {
		my $args = "@ARGV";	
		for my $m (@{$info->{match}}) {
			my @m = $args =~ /$m->{pat}/;
			if (@m) {
				@in = shift @m;
				if (@m) {
					@out = @m;
					if ($m->{post}) {
						eval($m->{post});
					}
				}
			}
		}
	}
	if ($info->{post}) {
		eval($info->{post});
	}

	if ( $info->{in}->{_noin} ) {
		@in = ();
		$last = $first;
	}

	if ($info->{out}->{_eval}) {
		my ($i, $o) = ($in[0], $out[0]);
		@out = &{$info->{out}->{_eval}}($i, $o);
		print STDERR "QSH eval $cmd: @out\n" if $verbose;
		if ($@) {
			print STDERR "QSH error: $@\n";
		}
	}

	push @out, $last if $last && (!@out);		# no output ? assume last argument
	pop @out if $out[0] eq $in[0];			# no in's = outs

	print STDERR "QSH check: $cmd : sz: $sz, in: @in, out: @out\n" if $verbose;

	my $mx;
	for (@in) {
		print STDERR "$_\n" if $list;
		my $m = fmodtime($_);
		$mx = $m if ($m > $mx);
	}
	my ($nox, $need);
	for (@out) {
		if ($_ =~ /\*/) {
			# TODO: fileglob all outputs, pick oldest, use that.
		}
		if ($mx < fmodtime($_)) {
			$nox=1;
		} else {
			$need=1;
		}
	}

	if (!$need && $nox && !$force) {
		print STDERR "QSH suppress: @out, already done\n"  if $verbose || $noexec || $debug;
		print "true";
	} else {
		my $wrap = $wrap{$cmd} || $wrap{$cmd_nv};

		my $big = $info->{big};
		if (!$big) {
			$big = '50m';
		}

		if (!$wrap) {
			my $t = $big;
			$t *= 1000000 if ($t =~ /m$/i);
			$t *= 1000000000 if ($t =~ /g$/i);
			$wrap = $sz >= $t;
		}

		# never submit a job to the condor, if we're already in condor
		$wrap = 0 if $ENV{"_CONDOR_SLOT"};
		$wrap = 0 if $nowrap{$cmd};
		$wrap = 0 if $cmd eq $grid_engine;
		# || $ENV{"_GRUN"};		# grun's more friendly about sub-submits... not sure whether to allow?

		if ($list) {
			print STDERR `which $orig_cmd 2>/dev/null`;
		}
		my $suff =  "#>\"" . join('" "',@out) . "\"";

		$suff .=  "#wait" if !$debug && ($wrap || ($cmd eq $grid_engine));

		print STDERR "QSH debug: wait '$out[0]' " . ($wrap || ($cmd eq $grid_engine)) . "\n" if $debug;
		if ($wrap) {
			print STDERR "QSH wrap $orig_cmd, input size:$sz big:$big\n" if $verbose;
			quoteargv();
			if ($list) {
				print "false";
			} else {
				print "$grid_engine \"$orig_cmd @ARGV\" $suff";
			}
		} else {
			print STDERR "QSH no wrap $orig_cmd @ARGV, input size $sz big: $big\n" if $verbose;
                        if ($list) {
                                print "false";
                        } elsif ($suff) {
				print "$orig_cmd @ARGV $suff";
			}
		}
	}

	exit 0;
}

sub bash_prefix;

#TODO ... would be nice if this could be interactive... 
# but there's a whole TTY attach/reattach thing I can't figure out

open STDERR, "|sort|uniq\n" if $list;

my $in;

if (!$inline) {
	$in = shift @ARGV;
	if (!$in) {
		die "qsh: missing input file operand\n"
	}
	open(IN, $in) || die "qsh: can't open $in: $!\n";
}

no strict 'refs';
if (pipe_to_fork('FOO')) {
	print STDERR "QSH start\n" if $verbose;
	use strict;
	# parent
	print FOO bash_prefix();
	if ($inline) {
		print FOO "\n$inline\n";
	} else {
		while ( <IN> ) {
			print FOO $_
		}
	}
	close FOO;
	while (wait > 1) {};
} else {
	quoteargv();
	exec("bash -s @ARGV");
	while (wait > 1) {};
}

sub quoteargv {
	for (@ARGV) {
		s/(['"()])/\\$1/g;
	}
}

sub fmodtime {
	return (stat($_[0]))[9];
}


sub bash_prefix {
	my $r;
	my $wx;
	my ($won, $woff);

	$wx = '[ -n "$wait" ] && qsh_waitforoutput $out' if ($waitout);

	$won = 'wait=1' if $waitout;
	$woff = 'wait=' if $waitout;

	my $dx;
	$dx = 'echo QSH debug: Cmd=$cmd Out=$out Wait=$wait 1>&2' if $debug;
	$dx = 'echo "$cmd"' if $noexec && !$debug;

	$r = "POPTS=\"$passopts\"\n" if $passopts;
	$r .= <<EOF;

function qsh_on_debug {
	if [ -n "\$BASH_COMMAND" ]; then
		cmd=`$0 -K \$POPTS \$BASH_COMMAND`

		if [[ \$cmd =~ "(.*)#>(.*)" ]]; then
			cmd=\${BASH_REMATCH[1]}
			out=\${BASH_REMATCH[2]}
		fi

		$woff
                if [[ \$out =~ "(.*)#wait" ]]; then
			out=\${BASH_REMATCH[1]}
			$won
                fi
		$dx

		if [ -n "\$cmd" ]; then
			if [ -n "\$out" ]; then
				if [ -z "$noexec" ]; then
					echo "+ \$cmd" 1>&2
					bash -c "\$cmd" || qsh_rmdie \$out
					$wx
				fi
			else
				if [ -z "$noexec" ]; then
					echo "+ \$cmd" 1>&2
					\$cmd
				fi
			fi
			false
		fi
	fi
}

function qsh_rmdie {
	if [ ! "" == "\$*" ]; then
        for xout in \$*; do
                if [ -n \$xout ]; then
                        echo QSH removing \$xout 1>&2
                fi
                rm -rf \$xout
        done
	fi
	exit 1
}

function qsh_waitforoutput {
        i=0
        while [ \$i -lt 80 ]
        do
                (( i++ ))
                if [ -z "\$1" ]; then
			break;
		fi
                if [ -e "\$1" ]; then
                        break;
                fi
                sleep .25
        done
}

shopt -s extdebug
set -o functrace
set -a

trap qsh_on_debug DEBUG
EOF
return $r;
die $r;
}

sub pipe_to_fork ($) {
	my $parent = shift;
	pipe my $child, $parent or die;
	my $pid = fork();
	die "fork() failed: $!" unless defined $pid;
	if ($pid) {
	    close $child;
	}
	else {
	    close $parent;
	    open(STDIN, "<&=" . fileno($child)) or die;
	}
	$pid;
}

sub usage {
<<EOF
qsh version $VERSION

Usage: qsh [options] <bash-script>
   or: qsh [options] -c "commands to be run"

Options:
   -d|ebug	    debug
   -l|ist	    guess list of dependencies
   -v|erbose	    verbose output to STDERR
   -h|elp 	    show this help
   -r|ules FILE	    use FILE instead of /etc/qsh.conf

Description:

This is a wrapper around 'bash' that passes each command executed 
to an evaluator that determines whether the command should run 
based on it's parsed inputs and outputs.

In addition, the wrapper determines if the command should be
sent to a "grid engine" based on the size of the inputs and
the type of command.

The purpose is to enable "fluid" development of simple pipeline
scripts without having to recode them into Makefiles or other
even more cumbersome DAG scripts, etc.

Problems:
	- It's inevitable that qsh might get some things wrong, 
	  and prevent maximum efficiency.
	- It's often better to carefully plan workflows
EOF
}

