#!/usr/bin/perl

use strict;

use IO::File;
use Data::Dumper;
use Getopt::Long qw(:config no_ignorecase);

my $min_seen = 0;
my $distance = 0;
my $strip = -1;
my $usepil;
my $pilsub;
my $faidx;

GetOptions("minvar=i"=>\$min_seen, "distance"=>\$distance, "strip!"=>\$strip, "pileup"=>\$usepil, "PSUB=s"=>\$pilsub, "faidx=s"=>\$faidx) || die;

die "use -p for default varcall.txt/pileup.txt conversion\n" if -e $pilsub;
$pilsub = "varcall.txt/pileup.txt" if $usepil && !$pilsub;

die "Usage: $0 [opts] varcall-files...
    -m       Minimum number of samples with a variant (1)
    -d       Output a distance matrix instead (1=het, 2=hom, 3=indel)
    -p       Use associated pileup info for reference depth, default is varcall.txt -> pileup.txt
    -s       Strip all path info
    -f REF   Use reference for sort order (necessary if the original file was not lexically sorted!)
    -P SUBST Use associated pileup info for reference depth, and define SUBST
" if !@ARGV;

my @in;  my @pil; my $i = 0;

my %refsort;
if ($faidx) {
    $faidx = "$faidx.fai" if $faidx !~ /\.fai$/;
    open(IN, $faidx) || die "can't read $faidx: $!";
    my $i = 0;
    while(<IN>) {
        my ($id) = split /\t/;
        $refsort{$id}=++$i;
    }
    die unless %refsort;
}

for (@ARGV) {
    if (defined $pilsub) {
        my $pil = $_;
        my ($from, $to) = split /[\/=]/, $pilsub;
        die "-PSUB must be from/to: $pilsub\n" unless $to;
        if ($pil =~ s/$from$/$to/) {
            if (-s $pil) {
                open($pil[$i], $pil)||die"open $pil:$!\n";
            } elsif (-s "$pil.gz") {
                open($pil[$i], "gunzip -c $pil.gz|")||die"open $pil:$!\n";
            } else {
                die "file name $pil does not exist";
            }
        } else {
            die "file name $pil does not match $pilsub";
        }
    }
    open($in[$i++], $_)||die"open $_:$!\n";
}

# print header
print "chr\tpos\tref" if !$distance;
print "chr-pos" if $distance;
# strip common path info
my $pre = common_prefix(@ARGV); $pre =~ s/[^\/]+$//; $pre = quotemeta($pre);
my $suf = common_suffix(@ARGV); $suf =~ s/^[^.]+//; $suf = quotemeta($suf);
for (my $i = 0; $i < @in; ++$i ){
    print "\t";
    my $th = $ARGV[$i];
    if ($strip == -1) {
        $th =~ s/^$pre//;
    } elsif ($strip) {
        $th =~ s/^.*\///;
    }
    $th =~ s/$suf$//;
    print $th;
}
print "\n";

my $ok = 1;
my @keep;
my $cnt = 0;
while ($ok) {
$ok = 0;

my ($l, @d);
my ($min_i, $min_id, $min_pos);
# for each file
for (my $i = 0; $i < @in; ++$i ) {
    # read if needed
    if (!defined($keep[$i])) {
        $l = $in[$i]->getline; chomp $l;
        next unless $l;
        # id, pos, ref, depth, skipped, pct, call1, call2...
        $d[$i] = [split /\t/, $l];
    } else {
        # reuse last rounds read
        $d[$i] = $keep[$i];
        next unless $d[$i];
    }
    # find the min id
    $ok = 1;
    if ( ( $d[$i][0] && !defined($min_id) ) ||
         ( chrlt($d[$i][0],$min_id) ) || 
         ( ($d[$i][0] eq $min_id) && ($d[$i][1] < $min_pos) )
       ) {
        # minimum id
        $min_i = $i;
        $min_id = $d[$i][0];
        $min_pos = $d[$i][1];
    }
}

last if !$ok;

die "Error: no id found: ", Dumper \@d if !$min_id;

my $seen;
for ($i = 0; $i < @in; ++$i ){
    if ($d[$i] && ($d[$i][0] eq $min_id && $d[$i][1] eq $min_pos) ) {
        ++$seen;
        $keep[$i] = undef;                  # read next
    } else {
        $keep[$i] = $d[$i];                 # don't read
    }
}

# if you don't see enough in common, skip
if ($seen < $min_seen) {
    next;
}

my @p;
if (@pil) {
    # seek all the pileups
    for ($i = 0; $i < @pil; ++$i ){
        do {
            # note: chr, pos, ref, depth, pil
            $p[$i] = [split /\t/, $pil[$i]->getline];
        } while (   ( chrlt($p[$i][0], $min_id) ) ||
                    ( ($p[$i][0] eq $min_id) && ($p[$i][1] < $min_pos) )
                );
    }
}

# print the variant
my $ref = $d[$min_i][2];
print "$min_id-$min_pos" if $distance;
print "$min_id\t$min_pos\t", $ref if !$distance;

for ($i = 0; $i < @in; ++$i ){
    print "\t";
    if ($d[$i] && ($d[$i][0] eq $min_id && $d[$i][1] eq $min_pos) ) {
        # output data
        # todo support alternate matrix formats
        my $count = $#{$d[$i]};
        if ($distance) {
            for (@{$d[$i]}[6 .. $count]) {
                $distance = 2;
                if (uc($ref) eq substr($_,0,1)) {
                    --$distance;
                }
                if (substr($_,0,1) eq '*') {
                    ++$distance;
                }
                if (substr($_,0,1) eq '+') {
                    ++$distance;
                }
            }
            print $distance;
        } else {
            print join ";", sort { substr($b,index($b,':')+1) <=> substr($a,index($a,':')+1) } @{$d[$i]}[6 .. $count];
        }
    } else {
        # output blank, keep read for later
        if ($distance) {
            print 0;
        } else {
            if ($p[$i] && $p[$i][0] eq $min_id && $p[$i][1] eq $min_pos) {
                if ($p[$i][2] =~ /^[atgc]$/oi) {
                    my $depth = $p[$i][4] =~ tr/,.//;
                    if ($depth < 4) {
                        print "n/a";
                    } else {
                        my $sq = 0;
                        for(my $j=0;$j<length($p[$i][5]);++$j) {
                            $sq+=ord(substr($p[$i][5],$j,1))-33;
                        }
                        $sq = int($sq/length($p[$i][5]));
                        print uc($p[$i][2]), ":$depth,$sq";
                    }
                } else {
                    print "n/a";
                }
            } else {
                print "";
            }
        }
    }
}

print "\n";
};


sub common_suffix {
    my $comm = shift @_;
    while ($_ = shift @_) {
        $_ = substr($_,-length($comm)) if (length($_) > length($comm));
        $comm = substr($comm,-length($_)) if (length($_) < length($comm));
        if (( $_ ^ $comm ) =~ /(\0*)$/) {
            $comm = substr($comm, -length($1));
        } else {
            return undef;
        }
    }
    return $comm;
}

sub common_prefix {
    my $comm = shift @_;
    while ($_ = shift @_) {
        $_ = substr($_,0,length($comm)) if (length($_) > length($comm));
        $comm = substr($comm,0,length($_)) if (length($_) < length($comm));
        if (( $_ ^ $comm ) =~ /^(\0*)/) {
            $comm = substr($comm, 0, length($1));
        } else {
            return undef;
        }
    }
    return $comm;
}

sub chrlt {
    my ($a, $b) = @_;
    if ($faidx) {
        return 0 if $a eq $b;
        return $refsort{$a} < $refsort{$b};
    } else {
        $a =~ tr/:/\0/;
        $b =~ tr/:/\0/;
        return $a lt $b;
    }
}

