#!/usr/bin/perl -w
# struct_harvest.pl
#  May 2007 - Oct 2009
# dearl (a) soe ucsc edu
# v0.3
# 
# see inline POD for details, or try:
#   ./struct_harvest.pl --help
#   ./struct_harvest.pl --man
#
########################################
#
# This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
########################################
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use File::Glob ':glob';
use File::Basename;
use Statistics::Descriptive;

#####
sub usage;
sub versionCheck;
sub harvest;
sub evanno;
sub clumpp;
sub gnuplot;
sub renameFiles;
sub help;

#####
my $IN_DIR;
my $OUT_DIR;
my ($isHarvest, $isEvanno, $isCLUMPP,  $isGNUPLOT);
my ($isOptionVersion, $isOptionHelp, $isOptionMan, $isOptionDebug);
my $isOptionRename;
my $VERSION = 'v0.3, Oct 2009.';
GetOptions('dir=s'     => \$IN_DIR
           ,'out=s'    => \$OUT_DIR
           ,'harvest'  => \$isHarvest
           ,'evanno'   => \$isEvanno
           ,'clumpp'   => \$isCLUMPP
           ,'gnuplot'  => \$isGNUPLOT
           ,'rename'   => \$isOptionRename
           ,'version'  => \$isOptionVersion
           ,'help'     => \$isOptionHelp
           ,'man'      => \$isOptionMan
           ,'debug'    => \$isOptionDebug);
pod2usage(-verbose => 2) if $isOptionMan;
versionCheck($VERSION) if($isOptionVersion);
help() if($isOptionHelp);
usage() unless($IN_DIR);
usage() unless( (($OUT_DIR) && ($isHarvest || $isEvanno || $isCLUMPP || $isGNUPLOT))  || $isOptionRename);
usage() unless( -d $IN_DIR);
if($OUT_DIR){
    if(! -d $OUT_DIR){
        mkdir($OUT_DIR) or die "$0: unable to make directory, $OUT_DIR. $!; $?\n";
    }
    unless ($OUT_DIR =~ /\/$/){
        $OUT_DIR="$OUT_DIR/";
    }
}

############################################################

=pod

=head1 struct_harvest.pl

F<struct_harvest.pl> is a tool to extract information from STRUCTURE Results
folders. This is the offline version of
http://taylor0.biology.ucla.edu/struct_harvest/

=head1 SYNOPSIS

./struct_harvest.pl [options]
 Input:
   --dir     [specify the path to your STRUCTURE Results folder]
   --out     [specify the path to an out directory. If it doesn\'t exist,
              it will be created.]
   --rename  [a flag. Will rename your results to remove whitespace and will
              standardize numbers, i.e. 5 -> 0005]
   --harvest [a flag. Runs the Harvester. Output is out_harvest.txt]
   --evanno  [a flag. Runs the Evanno method. Output is out_evanno.summary.dat]
   --clumpp  [a flag. Generates clumpp (min K - max K).indfiles.]
   --gnuplot [a flag. Generates .dat files and .p files for use with gnuplot]
   --help    [a flag. Prints this message]
   --man     [a flag. Prints full documentation]
   --version [a flag. Prints current version and web address]

=head1 DESCRIPTION

F<struct_harvest.pl> 
The script takes the paths to your Results directory and an output directory
and then, depending on the flags you pass, extracts data from the results
and writes them to the output directory.

STRUCTURE:
  J. Pritchard, M. Stephens, P. Donnelly. 2000. Genetics 155:945-959.
    http://www.genetics.org/cgi/content/full/155/2/945
    http://pritch.bsd.uchicago.edu/structure.html
CLUMPP:
  M. Jakobsson, N. Rosenberg 2007. CLUMPP: a cluster matching and permutation
    program for dealing with label switching and multimodality in analysis of
    population structure. Bioinformatics 23(14): 1801-1806.
    http://bioinformatics.oxfordjournals.org/cgi/content/full/23/14/1801
    http://rosenberglab.bioinformatics.med.umich.edu/clumpp.html
Evanno Method:
  G. Evanno, S. Regnaut, J. Goudet 2005 Detecting the number of clusters of
    individuals using the software structure: a simulation study. Molecular
    Ecology 14(8): 2611-2620.
    http://doi.wiley.com/10.1111/j.1365-294X.2005.02553.x
GNUPLOT:
  http://www.gnuplot.info/

=head1 AUTHOR

Dent A. Earl, dearl (a) soe ucsc edu

=head1 DATE

October 2009

=head1 LICENSE
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut

############################################################

################################################################################
# main
my %data; # $data{$k}{$fileName}{runNum} = run number
#                               {ln}     = Ln(P)
#                               {mean}   = mean
#                               {var}    = variance
# harvest() runs no matter what, only prints if $isHarvest
renameFiles($IN_DIR) if($isOptionRename);
harvest($IN_DIR, $OUT_DIR, \%data, $isHarvest); 
evanno($IN_DIR,  $OUT_DIR, \%data, $isGNUPLOT) if ($isEvanno);
clumpp($IN_DIR,  $OUT_DIR, \%data)             if ($isCLUMPP);
gnuplot($IN_DIR, $OUT_DIR, \%data)             if ($isGNUPLOT);

# end main
################################################################################
##############################
#
sub usage{
    print "USAGE: $0 --dir <STRUCTURE results directory> --out <output directory, will be created if does not exist> \n\t[optional: --harvest --evanno --clumpp --gnuplot --rename --help --man --version]\n";
    exit(2);
}
##############################
#
sub versionCheck{
    print "$0 $_[0]\n";
    print "  http://users.soe.ucsc.edu/~dearl/software/struct_harvest/\n";
    exit(0);
}
##############################
#
sub help{
    print <<END_HELP;
$0
--dir     <specify the path to your STRUCTURE Results folder>
--out     <specify the path to an out directory. If it doesn\'t exist, it will be created.>
--rename  [a flag. Will rename your results files to remove whitespace and will standardize numbers, i.e. 5 -> 0005]
--harvest [a flag. Runs the Harvester. Output is out_harvest.txt]
--evanno  [a flag. Runs the Evanno method. Output is out_evanno.summary.dat]
--clumpp  [a flag. Generates clumpp (min K - max K).indfiles.]
--gnuplot [a flag. Generates .dat files and .p files for use with gnuplot]
--help    [a flag. Prints this message]
--man     [a flag. Prints full documentation]
--version [a flag. Prints current version and web address]
END_HELP
    exit(0);
}
##############################
#
sub harvest{
    my ($IN_DIR, $OUT_DIR, $data_ref, $isHarvest)=@_;
    my @jobs = bsd_glob("$IN_DIR/*.txt", GLOB_TILDE);
    my $numJobs = @jobs;
    if($isHarvest){
        my $tmpNme = $OUT_DIR."out_harvest.txt";
        open(OUT,">$tmpNme") || die("Cannot Open File: $!");
        my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time);
        printf OUT "This output file was generated at:\n%4d-%02d-%02d %02d:%02d:%02d\n", $year+1900,$mon+1,$mday,$hour,$min,$sec;
    }
    my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE);
    print OUT "\nFile Name\tRun #\t#Pops\tEst. Ln Prob of Data\tMean value of Ln likelihood\tVariance of ln likedlihood\n\n" if($isHarvest);
    for my $rFile (@fileList) { 
        open (FILE, $rFile) or die "ERROR - Unable to open '$rFile': $!\n";
        my ($fName, $fPath, $runNum);
        if(($runNum) = $rFile =~ /\S+_(\d+)_f/) {
            $runNum=~ s/^[0]+//;
            ($fName, $fPath) = fileparse($rFile);
            print OUT "$fName\t$runNum\t" if($isHarvest);
            
        }
        my ($k, $ln, $mean, $var);
        while (my $line = <FILE>) {
            chomp($line);
            if($line =~ /(\S+)\s*populations assumed/) { 
                $k = $1;
                print OUT "$k\t" if($isHarvest);
                $$data_ref{$k}{$fName}{runNum}=$runNum;
            }
            if ($line =~ /Estimated Ln Prob of Data\s*=\s*(\S+)/) { 
                $ln = $1;
                print OUT "$ln\t" if($isHarvest);
                $$data_ref{$k}{$fName}{ln}=$ln;
            }
            if ($line =~ /Mean value of ln likelihood\s*=\s*(\S+)/) {
                $mean = $1;
                print OUT "$mean\t" if($isHarvest);
                $$data_ref{$k}{$fName}{mean}=$mean;
            }
            if ($line =~ /Variance of ln likelihood\s*=\s*(\S+)/) { 
                $var = $1;
                print OUT "$var\n" if($isHarvest);
                $$data_ref{$k}{$fName}{var}=$var;
            }
        }
        close(FILE) if($isHarvest);
    }
    print OUT "\nSoftware written by Dent Earl dearl (a) soe ucsc edu, BME Dept. UCSC\nGo Slugs\n" if($isHarvest);
    close(OUT) if($isHarvest);
}

##############################
# this section of the code changes the naming format.
sub renameFiles{
    $IN_DIR=$_[0];
    my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE);
    for my $elm (@fileList){ 
        my ($fName, $fPath, $fSuffix) = fileparse($elm,'_f');
        my  $oldName=$fName;
        $fName =~ s/ /\_/g; # ditch all extraneous spaces
        if($fName =~ /(\S+)\_(\d+)/) {
            my $pre=$1;
            my $num=$2;
            $num += 0; # try to get rid of any existing leading 0's
            if($num<10){
                $num="000$num";
            }elsif($num < 100){
                $num="00$num";
            }elsif($num < 1000){
                $num="0$num";
            }
            my  $newName = "$pre\_$num\_f";
            if (-e $newName) {
                warn "can't rename $oldName to $newName: $newName already exists $?\n";
            }else{
                rename("$elm", "$fPath$newName") or warn "rename $elm to $fPath$newName failed: $!\n";
            }
        }
    }
}
##############################
#
sub evanno{
    my ($IN_DIR, $OUT_DIR, $data_ref, $isGNUPLOT)=@_;
    my $inBlock   = 0;
    my $btBlock   = 0;
    my $nearBlock = 0;
    my $promo     = 0;
    my $maxK      = 0;
    my $minK      = 10**10;
    my $reps      = 0;
    my $runs      = 0;
    my $name = "out_evanno";
    my @orderedMatrix;
    
    for my $k (sort {$a <=> $b} keys %{$data_ref}){
        if ($k > $maxK){
            $maxK = $k;
            $reps = 0;
        }
        if ($k < $minK){
            $minK = $k;
        }
        for my $key( sort {$$data_ref{$k}{$a}{runNum} <=> $$data_ref{$k}{$b}{runNum}} keys %{$$data_ref{$k}} ){
            $runs++;
            $reps++;
            push(@orderedMatrix, [$k, $$data_ref{$k}{$key}{ln}]);
        }
    }

    open(OUTsummary,">$OUT_DIR$name.summary.dat") || die("$0 : Cannot Open '$OUT_DIR$name.summary.txt' : $!");
    
    if (($reps != 0) && ($runs % $reps == 0) && ($reps!=1) && ((($maxK-$minK)+1)*$reps == $runs ) &&(($maxK-$minK) > 1) && ($isGNUPLOT) ){
        open(OUTln,">$OUT_DIR$name.ln.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTlnprime,">$OUT_DIR$name.lnprime.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTlnprime2,">$OUT_DIR$name.lnprime2.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTdeltak,">$OUT_DIR$name.deltak.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTScriptlnprime,">$OUT_DIR$name.lnprime.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTScriptlnprime2,">$OUT_DIR$name.lnprime2.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        open(OUTScriptdeltak,">$OUT_DIR$name.deltak.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!");
        

#########################
## Calculate Mean and Standard Deviation
        my @LnMeans;
        my @stdDevs;
        my $lastK;
        my $stat;
        my $i=0;
        my $k=$minK;
        print OUTln "#********************************************\n#K\tL(K) Mean\tStdev\n" if($isGNUPLOT);
        $stat = Statistics::Descriptive::Full->new();
        $stat->add_data($orderedMatrix[0][1]);
        $lastK=$orderedMatrix[0][0];
        for(my $i=1; $i< @orderedMatrix; $i++){
            if($lastK == $orderedMatrix[$i][0]){
                $stat->add_data($orderedMatrix[$i][1]);
            }else{
                if($lastK){
                    push(@LnMeans, $stat->mean());
                    push(@stdDevs, $stat->standard_deviation());
                    print OUTln sprintf("%d\t%.3f\t%.3f\n",$k++,$LnMeans[$#LnMeans],$stdDevs[$#stdDevs]);
                }
                $stat = Statistics::Descriptive::Full->new();
                $stat->add_data($orderedMatrix[$i][1]);
                $lastK=$orderedMatrix[$i][0];
            }
        }
        push(@LnMeans, $stat->mean());
        push(@stdDevs, $stat->standard_deviation());
        print OUTln sprintf("%d\t%.3f\t%.3f\n",$k++,$LnMeans[$#LnMeans],$stdDevs[$#stdDevs]);
        close(OUTln);

#########################
##  Ln'(K) Here 
        my @LnPrimes;
        for(my $i = 1; $i <= $maxK-$minK;$i++){
            for (my $j=0; $j<$reps; $j++){
                my $index = ($i*$reps)+$j;
                $LnPrimes[(($i-1)*$reps)+$j] = ($orderedMatrix[$index][1]) - ($orderedMatrix[$index-$reps][1]);
#                print "\t$LnPrimes[(($i-1)*$reps)+$j]\n";
            }
        }
        my @LnPrimeMeans;
        my @primeStandardDeviations;
        print OUTlnprime "#********************************************\n#K\tL'(K) Mean\tStdev\n" if($isGNUPLOT);
        for (my $i=0; $i<($maxK-$minK);$i++){
            $stat = Statistics::Descriptive::Full->new();
            for (my $j=0; $j<$reps; $j++){
                my $tmp = $LnPrimes[($i*$reps)+$j];
                $stat->add_data($tmp);
            }
            push(@LnPrimeMeans, $stat->mean());
            push(@primeStandardDeviations, $stat->standard_deviation());
            print OUTlnprime sprintf("%d\t%.3f\t%.3f\n",$i+$minK+1,$LnPrimeMeans[$#LnPrimeMeans],
                                     $primeStandardDeviations[$#primeStandardDeviations]);
        }
        close(OUTlnprime);
#########################
##  |Ln''(K)| Here      
        my @LnPrimePrimes;
        for(my $i = 1; $i < ($maxK-$minK); $i++){
            for (my $j=0; $j < $reps; $j++){
                my $index = ($i*$reps)+$j;
                $LnPrimePrimes[(($i-1)*$reps)+$j] = ($LnPrimes[$index-$reps]) - ($LnPrimes[$index]);
                $LnPrimePrimes[(($i-1)*$reps)+$j] = sqrt($LnPrimePrimes[(($i-1)*$reps)+$j]**2);
#                print "$LnPrimePrimes[(($i-1)*$reps)+$j]\n";
            }
        }
        my @LnPrimePrimeMeans;
        my @primePrimeStandardDeviations;
        print OUTlnprime2"#********************************************\n#K\t|L''(K)| Mean\tStdev\n" if($isGNUPLOT);
        for (my $i=0; $i< ($maxK-$minK - 1);$i++){
            $stat = Statistics::Descriptive::Full->new();
            for (my $j=0; $j<$reps; $j++){
                my $value = $LnPrimePrimes[($i*$reps)+$j];
                $stat->add_data( $value );
            }
            $LnPrimePrimeMeans[$i] = $stat->mean();
            $primePrimeStandardDeviations[$i]  = $stat->standard_deviation();
            print OUTlnprime2 sprintf("%d\t%.3f\t%.3f\n",($i+1)+$minK,$LnPrimePrimeMeans[$i],
                                     $primePrimeStandardDeviations[$i]) if($isGNUPLOT);         
        }
        close(OUTlnprime2);
#########################
## preDeltaK Here       
        my @preDeltaKs;
        for(my $i = 1; $i < $maxK-$minK;$i++){
            for (my $j=0;$j<$reps;$j++){
                my $index = ($i*$reps)+$j;
                $preDeltaKs[(($i-1)*$reps)+$j] = sqrt((($orderedMatrix[$index+$reps][1]) - 
                                                       (2*$orderedMatrix[$index][1])+$orderedMatrix[$index-$reps][1])**2);
            }
        }
#########################
## deltaKs Here 
        my @deltaKs;
        print OUTdeltak "#********************************************\n#K\tDelta(K) Mean\n" if($isGNUPLOT);
        for (my $i=$minK-$minK;$i<$maxK-$minK-1;$i++){
            for (my $j=0;$j<$reps;$j++){
                $deltaKs[$i] += $preDeltaKs[($i*$reps)+$j];
            }
            $deltaKs[$i] /= $reps;
            $deltaKs[$i] /= $stdDevs[$i+1];
            print OUTdeltak sprintf("%d\t%.3f\n" , $i+$minK+1,$deltaKs[$i]) if($isGNUPLOT); # debug line
        }
        close(OUTdeltak);
        
#########################
## Final Summary Here
        print OUTsummary "#********************************************\n#Run\tK\tLnP(K)\tL'(K)\t|L''(K)|\n";
        for (my $i=0;$i<$reps;$i++){
            print OUTsummary sprintf("%d\t%d\t%.1f\tXXXXX\tXXXXX\n", $i+1,$orderedMatrix[$i][0],$orderedMatrix[$i][1]);
        }
        for (my $i=$reps;$i<2*$reps;$i++){
            print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\t%.1f\n", $i+1,$orderedMatrix[$i][0],$orderedMatrix[$i][1], $LnPrimes[$i-$reps],$preDeltaKs[$i-$reps]);
        }
        for (my $i=2*$reps;$i<($runs-$reps);$i++){
            print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\t%.1f\n", $i+1, $orderedMatrix[$i][0] , $orderedMatrix[$i][1], $LnPrimes[$i-$reps] , $preDeltaKs[$i-$reps]);
        }
        for (my $i=$runs-$reps;$i<$runs;$i++){
            print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\tXXXXX\n", $i+1, $orderedMatrix[$i][0] , $orderedMatrix[$i][1], $LnPrimes[$i-$reps]);
        }
        my $minplot = $minK - 1;
        my $maxplot = $maxK + 1;
        if($isGNUPLOT){
        print OUTScriptlnprime <<ENDSCRIPT;
set terminal png size 640,480 enhanced font \'/usr/share/fonts/bitstream-vera/Vera.ttf\' \\
xffffff x000000 x404040 xff0000
set output \"$OUT_DIR$name.2.png\"
set autoscale
unset log
unset label
set xrange [$minplot \: $maxplot]
set xtic auto
set ytic auto
set title \"Rate of Change of the likelihood distribution (mean +-SD)\"
set xlabel\"K\"
set ylabel \"L\'(K)\"
set nokey
plot \"$name.lnprime.dat\" using 1:2:3 with yerrorbars 
ENDSCRIPT
        close(OUTScriptlnprime);
        print OUTScriptlnprime2 <<ENDSCRIPT;
set terminal png size 640,480 enhanced font \'/usr/share/fonts/bitstream-vera/Vera.ttf\' \\
xffffff x000000 x404040 xff0000
set output \"$OUT_DIR$name.3.png\"
set autoscale
unset log
unset label
set xrange [$minplot \: $maxplot]
set xtic auto
set ytic auto
set title \"Abs val of the 2nd ord rte of chnge of the likelihood dist. (mean +-SD)\"
set xlabel\"K\"
set ylabel \"|L''(K)|\"
set nokey
plot \"$name.lnprime2.dat\" using 1:2:3 with yerrorbars 
ENDSCRIPT
        close(OUTScriptlnprime2);
        print OUTScriptdeltak <<ENDSCRIPT;
set terminal png size 640,480 enhanced font \'/usr/share/fonts/bitstream-vera/Vera.ttf\' \\
xffffff x000000 x404040 xff0000
set output \"$name.4.png\"
set autoscale
unset log
unset label
set xrange [$minplot \: $maxplot]
set xtic auto
set ytic auto
set title \"DeltaK = m(|L''(K)|)/s[L(K)]\"
set xlabel\"K\"
set ylabel \"Delta K\"
set nokey
plot \"$name.deltak.dat\" using 1:2 with linespoints
ENDSCRIPT
            close(OUTScriptdeltak);
        }
#############################################################   
    }else{
        warn("There is a problem with the data that is preventing Evanno method from running. See $OUT_DIR$name.log\n");
        open(OUTnono,">$OUT_DIR$name.log") || die("PERL : evannozer.pl : Cannot Open '$OUT_DIR$name.log' : $!");
        print OUTnono "There was a problem...\n";
        print OUTnono "Evanno Method:\nAll K need the same # of reps, K values must be sequenial\n";
        if($reps == 0){
            warn("$0: Warning, number of replicates is zero, unable to perform Evanno method.\n");
            print OUTnono "The number of replicates is zero. All K values must have the same number of replicates, all K values must be sequential.\n";
            return;
        }
        my $test1 = ($runs % $reps == 0);
        my $test2 = ($reps!=1);
        my $test3 = ((($maxK-$minK)+1)*$reps == $runs );
        my $test4 = (($maxK-$minK) > 1);
        print OUTnono "#Runs = $runs\t#Reps = $reps\tMaxK = $maxK\tMinK = $minK\n";
        print OUTnono "===========================\n";
        print OUTnono "TEST 1 : (runs % reps == 0) \t\t\tSTATUS : ";
        if ($test1==1){
            print OUTnono "PASS\n";
        }else{
            print OUTnono "FAIL - You need all K values to have the same # of replicates.\n";
        }
        print OUTnono "TEST 2 : (reps!=1) \t\t\t\tSTATUS : ";
        if ($test2==1){
            print OUTnono "PASS\n";
        }else{
            print OUTnono "FAIL - You need to have more than one replicate per K.\n";
        }
        print OUTnono "TEST 3 : (((maxK-minK)+1)*reps == runs ) \tSTATUS : ";
        if ($test3==1){
            print OUTnono "PASS\n";
        }else{
            print OUTnono "FAIL - Your K values need to be sequential, no missing K are allowed. (Will fail if test 1 failed)\n";
        }
        print OUTnono "TEST 4 : ((maxK-minK) > 1) \t\t\tSTATUS : ";
        if ($test4==1){
            print OUTnono "PASS\n";
        }else{
            print OUTnono "FAIL - Your min K and max K are used to generate neighboring delta K, so your total K will be 2 greater than your delta K. Therefore must have at least 3 Ks. \n";
        }
        close(OUTnono);
    }
}
##############################
#
sub clumpp{
    my ($IN_DIR, $OUT_DIR, $data_ref)=@_;
    #
    # clumpp_in.pl Sept 26 2007 @DAE
    # give it a directory and it will go through and seek out results files, open them, determine
    # what K they are, and then produce an infile for CLUMPP for each value of K that you have
    # represented in the folder.
    my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE);
    my $minK=10**10; # no one would ever run structure with this many groups.
    my $maxK=-1;      #likewise, no one would ever run structure with 0 groups.
    for my $k (sort {$a <=> $b} keys %{$data_ref}){
        if ($k > $maxK){
            $maxK = $k;
        }
        if ($k < $minK){
            $minK = $k;
        }
    }
    my $replicates;
    for (my $i=$minK; $i<=$maxK; $i++){
        $replicates = 0;
        my $tmpNme = $OUT_DIR."K$i.indfile";
        open(OUTTXT,">$tmpNme") || die("$0 : Cannot Open File: $!");
        foreach my $file (@fileList){
            chomp($file);
            open(FILE, $file) or die "$0 : Unable to open '$file': $!\n";
            my $correct=0;
            while (my $line = <FILE>) {
                chomp($line);
                if($line =~ /   (\S+) populations assumed/){
                    my $nPops = $1;
                    if ($nPops==$i){
                        $correct=1;
                    }
                }
                if($correct==1){ 
                    # we want to pull out all rows of the inferred ancestry table, 
                    # remove the label and replace it with the index of the row.
                    if($line =~ /\(\S+\)\s+\S*\s+:\s+(.*)$/) {
                        my ($index, $label, $miss) = $line =~ /^[ ]*(\d+)\s+(\S+)\s+\((\S+)\)/;
                        my($popID) = $line =~ /\)\s+(\S+)\s+:/;
                        $line =~ s/^[ ]*(\d+)\s+(\S+)/  $1 $1/g; # memorize the index (c1), then excise the label (c2),
                        if(!$popID){
                            $line =~ s/ :/ 1 :/;
                        }
#                        $line =~ s/\015\012/\n/g; # just in case we have a windows line-ending file
                        print OUTTXT "$line\n";
                        $replicates++;
                    }
                }
                
            }
            close FILE;
            if($correct==1){print OUTTXT "\n";}
        }
        close(OUTTXT);
    }
}
##############################
#
sub gnuplot{
    my ($IN_DIR, $OUT_DIR, $data_ref, $isGNUPLOT)=@_;
    my $nme1=$OUT_DIR."gnuPLotLn.p";
    my $nme2=$OUT_DIR."gnuPLotLn.dat";
    open(OUTSCRIPT,">$nme1") || die("$0: Cannot Open File: $!");
    open(OUTDAT,">$nme2") || die("$0: Cannot Open File: $!");
    my ($minK, $maxK, $runs, $reps);
    $minK=10**10;
    $maxK=-1;
    print OUTDAT "#K\tEst. Ln Prob of Data\tMean value of Ln likelihood\tVariance of ln likedlihood\n";
    for my $k (sort {$a <=> $b} keys %{$data_ref}){
        if ($k > $maxK){
            $maxK = $k;
            $reps = 0;
        }
        if ($k < $minK){
            $minK = $k;
        }
        for my $key( sort {$$data_ref{$k}{$a}{runNum} <=> $$data_ref{$k}{$b}{runNum}} keys %{$$data_ref{$k}} ){
            print OUTDAT "$k\t$$data_ref{$k}{$key}{ln}\t";
            print OUTDAT "$$data_ref{$k}{$key}{mean}\t";
            print OUTDAT "$$data_ref{$k}{$key}{var}\n";
            $runs++;
            $reps++;
        }
    }
    my $minplot = $minK - 1;
    my $maxplot = $maxK + 1;
print OUTSCRIPT <<ENDSCRIPT;
set terminal png size 640,480 enhanced font \'/usr/share/fonts/bitstream-vera/Vera.ttf\' \\
xffffff x000000 x404040 xff0000
set output \"$OUT_DIR/gnuPlot.1.png\"
set autoscale
unset log
unset label
set xrange [$minplot \: $maxplot]
set xtic auto
set ytic auto
set title \"L(K) mean(+-SD)\"
set xlabel\"K\"
set ylabel \"Mean of Est. Ln Prob. of Data\"
set nokey
plot \"$OUT_DIR/gnuPlot.dat\" using 1:2:3 with yerrorbars 
ENDSCRIPT
close(OUTSCRIPT);
}