#!/usr/bin/perl -w # struct_harvest.pl # May 2007 - Oct 2009 # dearl (a) soe ucsc edu # v0.3 # # see inline POD for details, or try: # ./struct_harvest.pl --help # ./struct_harvest.pl --man # ######################################## # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . # ######################################## use warnings; use strict; use Getopt::Long; use Pod::Usage; use File::Glob ':glob'; use File::Basename; use Statistics::Descriptive; ##### sub usage; sub versionCheck; sub harvest; sub evanno; sub clumpp; sub gnuplot; sub renameFiles; sub help; ##### my $IN_DIR; my $OUT_DIR; my ($isHarvest, $isEvanno, $isCLUMPP, $isGNUPLOT); my ($isOptionVersion, $isOptionHelp, $isOptionMan, $isOptionDebug); my $isOptionRename; my $VERSION = 'v0.3, Oct 2009.'; GetOptions('dir=s' => \$IN_DIR ,'out=s' => \$OUT_DIR ,'harvest' => \$isHarvest ,'evanno' => \$isEvanno ,'clumpp' => \$isCLUMPP ,'gnuplot' => \$isGNUPLOT ,'rename' => \$isOptionRename ,'version' => \$isOptionVersion ,'help' => \$isOptionHelp ,'man' => \$isOptionMan ,'debug' => \$isOptionDebug); pod2usage(-verbose => 2) if $isOptionMan; versionCheck($VERSION) if($isOptionVersion); help() if($isOptionHelp); usage() unless($IN_DIR); usage() unless( (($OUT_DIR) && ($isHarvest || $isEvanno || $isCLUMPP || $isGNUPLOT)) || $isOptionRename); usage() unless( -d $IN_DIR); if($OUT_DIR){ if(! -d $OUT_DIR){ mkdir($OUT_DIR) or die "$0: unable to make directory, $OUT_DIR. $!; $?\n"; } unless ($OUT_DIR =~ /\/$/){ $OUT_DIR="$OUT_DIR/"; } } ############################################################ =pod =head1 struct_harvest.pl F is a tool to extract information from STRUCTURE Results folders. This is the offline version of http://taylor0.biology.ucla.edu/struct_harvest/ =head1 SYNOPSIS ./struct_harvest.pl [options] Input: --dir [specify the path to your STRUCTURE Results folder] --out [specify the path to an out directory. If it doesn\'t exist, it will be created.] --rename [a flag. Will rename your results to remove whitespace and will standardize numbers, i.e. 5 -> 0005] --harvest [a flag. Runs the Harvester. Output is out_harvest.txt] --evanno [a flag. Runs the Evanno method. Output is out_evanno.summary.dat] --clumpp [a flag. Generates clumpp (min K - max K).indfiles.] --gnuplot [a flag. Generates .dat files and .p files for use with gnuplot] --help [a flag. Prints this message] --man [a flag. Prints full documentation] --version [a flag. Prints current version and web address] =head1 DESCRIPTION F The script takes the paths to your Results directory and an output directory and then, depending on the flags you pass, extracts data from the results and writes them to the output directory. STRUCTURE: J. Pritchard, M. Stephens, P. Donnelly. 2000. Genetics 155:945-959. http://www.genetics.org/cgi/content/full/155/2/945 http://pritch.bsd.uchicago.edu/structure.html CLUMPP: M. Jakobsson, N. Rosenberg 2007. CLUMPP: a cluster matching and permutation program for dealing with label switching and multimodality in analysis of population structure. Bioinformatics 23(14): 1801-1806. http://bioinformatics.oxfordjournals.org/cgi/content/full/23/14/1801 http://rosenberglab.bioinformatics.med.umich.edu/clumpp.html Evanno Method: G. Evanno, S. Regnaut, J. Goudet 2005 Detecting the number of clusters of individuals using the software structure: a simulation study. Molecular Ecology 14(8): 2611-2620. http://doi.wiley.com/10.1111/j.1365-294X.2005.02553.x GNUPLOT: http://www.gnuplot.info/ =head1 AUTHOR Dent A. Earl, dearl (a) soe ucsc edu =head1 DATE October 2009 =head1 LICENSE %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . =cut ############################################################ ################################################################################ # main my %data; # $data{$k}{$fileName}{runNum} = run number # {ln} = Ln(P) # {mean} = mean # {var} = variance # harvest() runs no matter what, only prints if $isHarvest renameFiles($IN_DIR) if($isOptionRename); harvest($IN_DIR, $OUT_DIR, \%data, $isHarvest); evanno($IN_DIR, $OUT_DIR, \%data, $isGNUPLOT) if ($isEvanno); clumpp($IN_DIR, $OUT_DIR, \%data) if ($isCLUMPP); gnuplot($IN_DIR, $OUT_DIR, \%data) if ($isGNUPLOT); # end main ################################################################################ ############################## # sub usage{ print "USAGE: $0 --dir --out \n\t[optional: --harvest --evanno --clumpp --gnuplot --rename --help --man --version]\n"; exit(2); } ############################## # sub versionCheck{ print "$0 $_[0]\n"; print " http://users.soe.ucsc.edu/~dearl/software/struct_harvest/\n"; exit(0); } ############################## # sub help{ print < --out --rename [a flag. Will rename your results files to remove whitespace and will standardize numbers, i.e. 5 -> 0005] --harvest [a flag. Runs the Harvester. Output is out_harvest.txt] --evanno [a flag. Runs the Evanno method. Output is out_evanno.summary.dat] --clumpp [a flag. Generates clumpp (min K - max K).indfiles.] --gnuplot [a flag. Generates .dat files and .p files for use with gnuplot] --help [a flag. Prints this message] --man [a flag. Prints full documentation] --version [a flag. Prints current version and web address] END_HELP exit(0); } ############################## # sub harvest{ my ($IN_DIR, $OUT_DIR, $data_ref, $isHarvest)=@_; my @jobs = bsd_glob("$IN_DIR/*.txt", GLOB_TILDE); my $numJobs = @jobs; if($isHarvest){ my $tmpNme = $OUT_DIR."out_harvest.txt"; open(OUT,">$tmpNme") || die("Cannot Open File: $!"); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst)=localtime(time); printf OUT "This output file was generated at:\n%4d-%02d-%02d %02d:%02d:%02d\n", $year+1900,$mon+1,$mday,$hour,$min,$sec; } my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE); print OUT "\nFile Name\tRun #\t#Pops\tEst. Ln Prob of Data\tMean value of Ln likelihood\tVariance of ln likedlihood\n\n" if($isHarvest); for my $rFile (@fileList) { open (FILE, $rFile) or die "ERROR - Unable to open '$rFile': $!\n"; my ($fName, $fPath, $runNum); if(($runNum) = $rFile =~ /\S+_(\d+)_f/) { $runNum=~ s/^[0]+//; ($fName, $fPath) = fileparse($rFile); print OUT "$fName\t$runNum\t" if($isHarvest); } my ($k, $ln, $mean, $var); while (my $line = ) { chomp($line); if($line =~ /(\S+)\s*populations assumed/) { $k = $1; print OUT "$k\t" if($isHarvest); $$data_ref{$k}{$fName}{runNum}=$runNum; } if ($line =~ /Estimated Ln Prob of Data\s*=\s*(\S+)/) { $ln = $1; print OUT "$ln\t" if($isHarvest); $$data_ref{$k}{$fName}{ln}=$ln; } if ($line =~ /Mean value of ln likelihood\s*=\s*(\S+)/) { $mean = $1; print OUT "$mean\t" if($isHarvest); $$data_ref{$k}{$fName}{mean}=$mean; } if ($line =~ /Variance of ln likelihood\s*=\s*(\S+)/) { $var = $1; print OUT "$var\n" if($isHarvest); $$data_ref{$k}{$fName}{var}=$var; } } close(FILE) if($isHarvest); } print OUT "\nSoftware written by Dent Earl dearl (a) soe ucsc edu, BME Dept. UCSC\nGo Slugs\n" if($isHarvest); close(OUT) if($isHarvest); } ############################## # this section of the code changes the naming format. sub renameFiles{ $IN_DIR=$_[0]; my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE); for my $elm (@fileList){ my ($fName, $fPath, $fSuffix) = fileparse($elm,'_f'); my $oldName=$fName; $fName =~ s/ /\_/g; # ditch all extraneous spaces if($fName =~ /(\S+)\_(\d+)/) { my $pre=$1; my $num=$2; $num += 0; # try to get rid of any existing leading 0's if($num<10){ $num="000$num"; }elsif($num < 100){ $num="00$num"; }elsif($num < 1000){ $num="0$num"; } my $newName = "$pre\_$num\_f"; if (-e $newName) { warn "can't rename $oldName to $newName: $newName already exists $?\n"; }else{ rename("$elm", "$fPath$newName") or warn "rename $elm to $fPath$newName failed: $!\n"; } } } } ############################## # sub evanno{ my ($IN_DIR, $OUT_DIR, $data_ref, $isGNUPLOT)=@_; my $inBlock = 0; my $btBlock = 0; my $nearBlock = 0; my $promo = 0; my $maxK = 0; my $minK = 10**10; my $reps = 0; my $runs = 0; my $name = "out_evanno"; my @orderedMatrix; for my $k (sort {$a <=> $b} keys %{$data_ref}){ if ($k > $maxK){ $maxK = $k; $reps = 0; } if ($k < $minK){ $minK = $k; } for my $key( sort {$$data_ref{$k}{$a}{runNum} <=> $$data_ref{$k}{$b}{runNum}} keys %{$$data_ref{$k}} ){ $runs++; $reps++; push(@orderedMatrix, [$k, $$data_ref{$k}{$key}{ln}]); } } open(OUTsummary,">$OUT_DIR$name.summary.dat") || die("$0 : Cannot Open '$OUT_DIR$name.summary.txt' : $!"); if (($reps != 0) && ($runs % $reps == 0) && ($reps!=1) && ((($maxK-$minK)+1)*$reps == $runs ) &&(($maxK-$minK) > 1) && ($isGNUPLOT) ){ open(OUTln,">$OUT_DIR$name.ln.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTlnprime,">$OUT_DIR$name.lnprime.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTlnprime2,">$OUT_DIR$name.lnprime2.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTdeltak,">$OUT_DIR$name.deltak.dat") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTScriptlnprime,">$OUT_DIR$name.lnprime.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTScriptlnprime2,">$OUT_DIR$name.lnprime2.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); open(OUTScriptdeltak,">$OUT_DIR$name.deltak.p") || die("$0 : Cannot Open '$OUT_DIR$name.out.txt' : $!"); ######################### ## Calculate Mean and Standard Deviation my @LnMeans; my @stdDevs; my $lastK; my $stat; my $i=0; my $k=$minK; print OUTln "#********************************************\n#K\tL(K) Mean\tStdev\n" if($isGNUPLOT); $stat = Statistics::Descriptive::Full->new(); $stat->add_data($orderedMatrix[0][1]); $lastK=$orderedMatrix[0][0]; for(my $i=1; $i< @orderedMatrix; $i++){ if($lastK == $orderedMatrix[$i][0]){ $stat->add_data($orderedMatrix[$i][1]); }else{ if($lastK){ push(@LnMeans, $stat->mean()); push(@stdDevs, $stat->standard_deviation()); print OUTln sprintf("%d\t%.3f\t%.3f\n",$k++,$LnMeans[$#LnMeans],$stdDevs[$#stdDevs]); } $stat = Statistics::Descriptive::Full->new(); $stat->add_data($orderedMatrix[$i][1]); $lastK=$orderedMatrix[$i][0]; } } push(@LnMeans, $stat->mean()); push(@stdDevs, $stat->standard_deviation()); print OUTln sprintf("%d\t%.3f\t%.3f\n",$k++,$LnMeans[$#LnMeans],$stdDevs[$#stdDevs]); close(OUTln); ######################### ## Ln'(K) Here my @LnPrimes; for(my $i = 1; $i <= $maxK-$minK;$i++){ for (my $j=0; $j<$reps; $j++){ my $index = ($i*$reps)+$j; $LnPrimes[(($i-1)*$reps)+$j] = ($orderedMatrix[$index][1]) - ($orderedMatrix[$index-$reps][1]); # print "\t$LnPrimes[(($i-1)*$reps)+$j]\n"; } } my @LnPrimeMeans; my @primeStandardDeviations; print OUTlnprime "#********************************************\n#K\tL'(K) Mean\tStdev\n" if($isGNUPLOT); for (my $i=0; $i<($maxK-$minK);$i++){ $stat = Statistics::Descriptive::Full->new(); for (my $j=0; $j<$reps; $j++){ my $tmp = $LnPrimes[($i*$reps)+$j]; $stat->add_data($tmp); } push(@LnPrimeMeans, $stat->mean()); push(@primeStandardDeviations, $stat->standard_deviation()); print OUTlnprime sprintf("%d\t%.3f\t%.3f\n",$i+$minK+1,$LnPrimeMeans[$#LnPrimeMeans], $primeStandardDeviations[$#primeStandardDeviations]); } close(OUTlnprime); ######################### ## |Ln''(K)| Here my @LnPrimePrimes; for(my $i = 1; $i < ($maxK-$minK); $i++){ for (my $j=0; $j < $reps; $j++){ my $index = ($i*$reps)+$j; $LnPrimePrimes[(($i-1)*$reps)+$j] = ($LnPrimes[$index-$reps]) - ($LnPrimes[$index]); $LnPrimePrimes[(($i-1)*$reps)+$j] = sqrt($LnPrimePrimes[(($i-1)*$reps)+$j]**2); # print "$LnPrimePrimes[(($i-1)*$reps)+$j]\n"; } } my @LnPrimePrimeMeans; my @primePrimeStandardDeviations; print OUTlnprime2"#********************************************\n#K\t|L''(K)| Mean\tStdev\n" if($isGNUPLOT); for (my $i=0; $i< ($maxK-$minK - 1);$i++){ $stat = Statistics::Descriptive::Full->new(); for (my $j=0; $j<$reps; $j++){ my $value = $LnPrimePrimes[($i*$reps)+$j]; $stat->add_data( $value ); } $LnPrimePrimeMeans[$i] = $stat->mean(); $primePrimeStandardDeviations[$i] = $stat->standard_deviation(); print OUTlnprime2 sprintf("%d\t%.3f\t%.3f\n",($i+1)+$minK,$LnPrimePrimeMeans[$i], $primePrimeStandardDeviations[$i]) if($isGNUPLOT); } close(OUTlnprime2); ######################### ## preDeltaK Here my @preDeltaKs; for(my $i = 1; $i < $maxK-$minK;$i++){ for (my $j=0;$j<$reps;$j++){ my $index = ($i*$reps)+$j; $preDeltaKs[(($i-1)*$reps)+$j] = sqrt((($orderedMatrix[$index+$reps][1]) - (2*$orderedMatrix[$index][1])+$orderedMatrix[$index-$reps][1])**2); } } ######################### ## deltaKs Here my @deltaKs; print OUTdeltak "#********************************************\n#K\tDelta(K) Mean\n" if($isGNUPLOT); for (my $i=$minK-$minK;$i<$maxK-$minK-1;$i++){ for (my $j=0;$j<$reps;$j++){ $deltaKs[$i] += $preDeltaKs[($i*$reps)+$j]; } $deltaKs[$i] /= $reps; $deltaKs[$i] /= $stdDevs[$i+1]; print OUTdeltak sprintf("%d\t%.3f\n" , $i+$minK+1,$deltaKs[$i]) if($isGNUPLOT); # debug line } close(OUTdeltak); ######################### ## Final Summary Here print OUTsummary "#********************************************\n#Run\tK\tLnP(K)\tL'(K)\t|L''(K)|\n"; for (my $i=0;$i<$reps;$i++){ print OUTsummary sprintf("%d\t%d\t%.1f\tXXXXX\tXXXXX\n", $i+1,$orderedMatrix[$i][0],$orderedMatrix[$i][1]); } for (my $i=$reps;$i<2*$reps;$i++){ print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\t%.1f\n", $i+1,$orderedMatrix[$i][0],$orderedMatrix[$i][1], $LnPrimes[$i-$reps],$preDeltaKs[$i-$reps]); } for (my $i=2*$reps;$i<($runs-$reps);$i++){ print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\t%.1f\n", $i+1, $orderedMatrix[$i][0] , $orderedMatrix[$i][1], $LnPrimes[$i-$reps] , $preDeltaKs[$i-$reps]); } for (my $i=$runs-$reps;$i<$runs;$i++){ print OUTsummary sprintf("%d\t%d\t%.1f\t%.1f\tXXXXX\n", $i+1, $orderedMatrix[$i][0] , $orderedMatrix[$i][1], $LnPrimes[$i-$reps]); } my $minplot = $minK - 1; my $maxplot = $maxK + 1; if($isGNUPLOT){ print OUTScriptlnprime <$OUT_DIR$name.log") || die("PERL : evannozer.pl : Cannot Open '$OUT_DIR$name.log' : $!"); print OUTnono "There was a problem...\n"; print OUTnono "Evanno Method:\nAll K need the same # of reps, K values must be sequenial\n"; if($reps == 0){ warn("$0: Warning, number of replicates is zero, unable to perform Evanno method.\n"); print OUTnono "The number of replicates is zero. All K values must have the same number of replicates, all K values must be sequential.\n"; return; } my $test1 = ($runs % $reps == 0); my $test2 = ($reps!=1); my $test3 = ((($maxK-$minK)+1)*$reps == $runs ); my $test4 = (($maxK-$minK) > 1); print OUTnono "#Runs = $runs\t#Reps = $reps\tMaxK = $maxK\tMinK = $minK\n"; print OUTnono "===========================\n"; print OUTnono "TEST 1 : (runs % reps == 0) \t\t\tSTATUS : "; if ($test1==1){ print OUTnono "PASS\n"; }else{ print OUTnono "FAIL - You need all K values to have the same # of replicates.\n"; } print OUTnono "TEST 2 : (reps!=1) \t\t\t\tSTATUS : "; if ($test2==1){ print OUTnono "PASS\n"; }else{ print OUTnono "FAIL - You need to have more than one replicate per K.\n"; } print OUTnono "TEST 3 : (((maxK-minK)+1)*reps == runs ) \tSTATUS : "; if ($test3==1){ print OUTnono "PASS\n"; }else{ print OUTnono "FAIL - Your K values need to be sequential, no missing K are allowed. (Will fail if test 1 failed)\n"; } print OUTnono "TEST 4 : ((maxK-minK) > 1) \t\t\tSTATUS : "; if ($test4==1){ print OUTnono "PASS\n"; }else{ print OUTnono "FAIL - Your min K and max K are used to generate neighboring delta K, so your total K will be 2 greater than your delta K. Therefore must have at least 3 Ks. \n"; } close(OUTnono); } } ############################## # sub clumpp{ my ($IN_DIR, $OUT_DIR, $data_ref)=@_; # # clumpp_in.pl Sept 26 2007 @DAE # give it a directory and it will go through and seek out results files, open them, determine # what K they are, and then produce an infile for CLUMPP for each value of K that you have # represented in the folder. my @fileList = bsd_glob("$IN_DIR*_f", GLOB_TILDE); my $minK=10**10; # no one would ever run structure with this many groups. my $maxK=-1; #likewise, no one would ever run structure with 0 groups. for my $k (sort {$a <=> $b} keys %{$data_ref}){ if ($k > $maxK){ $maxK = $k; } if ($k < $minK){ $minK = $k; } } my $replicates; for (my $i=$minK; $i<=$maxK; $i++){ $replicates = 0; my $tmpNme = $OUT_DIR."K$i.indfile"; open(OUTTXT,">$tmpNme") || die("$0 : Cannot Open File: $!"); foreach my $file (@fileList){ chomp($file); open(FILE, $file) or die "$0 : Unable to open '$file': $!\n"; my $correct=0; while (my $line = ) { chomp($line); if($line =~ / (\S+) populations assumed/){ my $nPops = $1; if ($nPops==$i){ $correct=1; } } if($correct==1){ # we want to pull out all rows of the inferred ancestry table, # remove the label and replace it with the index of the row. if($line =~ /\(\S+\)\s+\S*\s+:\s+(.*)$/) { my ($index, $label, $miss) = $line =~ /^[ ]*(\d+)\s+(\S+)\s+\((\S+)\)/; my($popID) = $line =~ /\)\s+(\S+)\s+:/; $line =~ s/^[ ]*(\d+)\s+(\S+)/ $1 $1/g; # memorize the index (c1), then excise the label (c2), if(!$popID){ $line =~ s/ :/ 1 :/; } # $line =~ s/\015\012/\n/g; # just in case we have a windows line-ending file print OUTTXT "$line\n"; $replicates++; } } } close FILE; if($correct==1){print OUTTXT "\n";} } close(OUTTXT); } } ############################## # sub gnuplot{ my ($IN_DIR, $OUT_DIR, $data_ref, $isGNUPLOT)=@_; my $nme1=$OUT_DIR."gnuPLotLn.p"; my $nme2=$OUT_DIR."gnuPLotLn.dat"; open(OUTSCRIPT,">$nme1") || die("$0: Cannot Open File: $!"); open(OUTDAT,">$nme2") || die("$0: Cannot Open File: $!"); my ($minK, $maxK, $runs, $reps); $minK=10**10; $maxK=-1; print OUTDAT "#K\tEst. Ln Prob of Data\tMean value of Ln likelihood\tVariance of ln likedlihood\n"; for my $k (sort {$a <=> $b} keys %{$data_ref}){ if ($k > $maxK){ $maxK = $k; $reps = 0; } if ($k < $minK){ $minK = $k; } for my $key( sort {$$data_ref{$k}{$a}{runNum} <=> $$data_ref{$k}{$b}{runNum}} keys %{$$data_ref{$k}} ){ print OUTDAT "$k\t$$data_ref{$k}{$key}{ln}\t"; print OUTDAT "$$data_ref{$k}{$key}{mean}\t"; print OUTDAT "$$data_ref{$k}{$key}{var}\n"; $runs++; $reps++; } } my $minplot = $minK - 1; my $maxplot = $maxK + 1; print OUTSCRIPT <