#!perl
use Config;
use File::Basename qw(&basename &dirname);
use File::Spec;
use FindBin '$Bin';
use Cwd;

my $dir = dirname($0);
$file   = shift || File::Spec->catfile($dir,basename($0, '.PL','.PLS')).".pl";

open OUT,">$file" or die "Can't create $file: $!";

print "Extracting $file (with variable substitutions)\n";

my $startperl = $Config{startperl} ne '#!perl' 
  ? $Config{startperl}
  : "#!$Config{perlpath}";

print OUT <<"!GROK!THIS!";
$startperl

!GROK!THIS!

# In the following, perl variables are not expanded during extraction.

print OUT <<'!NO!SUBS!';
# $Id: bulk_load_xrt.PLS,v 1.4 2005/06/13 15:01:21 junjun Exp $

use strict;

use DBI;
use IO::File;
use Getopt::Long;

use constant MYSQL => 'mysql';

use constant GDATA      => 'gdata';
use constant GCLASS     => 'gclass';
use constant GATTRIBUTE => 'gattribute';
use constant GRELATIONSHIP => 'grelationship';


use Bio::DB::XRT;


package main;

my ($DSN,$FORCE,$NO_PARSE,$PONLY,$PREFIX,$NO_WIPE,$USER,$PASSWORD);

GetOptions ('database:s'   => \$DSN,
	    'create'           => \$FORCE,
	    'no_parse'         => \$NO_PARSE,
	    'parse_only'       => \$PONLY,
	    'prefix:s'         => \$PREFIX,
	    'no_wipe'          => \$NO_WIPE,
	    'user:s'           => \$USER,
	    'password:s'       => \$PASSWORD,
	   ) or die <<USAGE;
Usage: $0 [options] <xrt file 1> <xrt file 2> ...

 Options:
   --database <dsn>      Mysql database name
   --create              Reinitialize/create data tables without asking
   --user                Username to log in as
   --password            Password to use for authentication
   --no_parse            Use the previously parsed file to load into db
   --parse_only          Only parse the xrt files, don't load them
   --no_wipe             Load xrt into the db without wipe out the existing data
   --prefix              Specify the prefix of the temp files. (/[a-zA-Z0-9]/)

Options can be abbreviated.  For example, you can use -d for
--database.

USAGE
;

die "\nError: only 0-9, a-z, A-Z are allowed for prefix!\n\n" unless ($PREFIX =~ /^[a-zA-Z0-9]*$/);
$PREFIX .= '.' if (length($PREFIX) > 0);

$DSN ||= 'test';

unless ($FORCE or $PONLY or $NO_WIPE) {
  open (TTY,"/dev/tty") or die "/dev/tty: $!\n";
  print STDERR "This operation will delete all existing data in database $DSN.  Continue? ";
  my $f = <TTY>;
  die "Aborted\n" unless $f =~ /^[yY]/;
  close TTY;
}

my (@auth,$AUTH);
if (defined $USER) {
  push @auth,(-user=>$USER);
  $AUTH .= " -u$USER";
}
if (defined $PASSWORD) {
  push @auth,(-pass=>$PASSWORD);
  $AUTH .= " -p$PASSWORD";
}


my $db = Bio::DB::XRT->new(-dsn => "dbi:mysql:$DSN", @auth)
  or die "Can't open database: ", Bio::DB::XRT->error,"\n";

$db->db_initialize(1) unless ($NO_WIPE or $PONLY);

my $tmpdir = `pwd`;  ## choose the current dir for tmp files                                        
chomp $tmpdir;                                                                                      
my $re = system("touch $tmpdir/$PREFIX"."xrttmp 2> /dev/null");                                     
if ($re) { # no write permission here                                                               
  $tmpdir = $ENV{TMPDIR} || $ENV{TMP} || $ENV{HOME} || '/tmp';
}                                                                                                   
warn ("\nTemp file directory: $tmpdir\n\n");

my %FH;
my @files = (GDATA,GCLASS,GATTRIBUTE,GRELATIONSHIP);
my $FEATURES    = 0;

unless ($NO_PARSE) {
   my $xrttmp = "$tmpdir/$PREFIX"."xrttmp";
   my $xrtFH = IO::File->new("$xrttmp", ">") or die "Unable to create temp file at $tmpdir: $!\n";
   $xrtFH->autoflush;
   
   FILE: foreach (@ARGV) {
   
      my $file = $_;
      my @F = split/\//,$file;
      my $filename = $F[@F-1];
      warn ("Parsing file: $filename...\n");
      my $class = '';
      if ($filename=~/^([^\.]+)/) {$class = $1}
      
      open(IN,"< $file");
      my $count=0;
      my @columns = ();
      while(<IN>){
          chomp;
          my $line = $_;
          unless ($count) {
              @columns = split(/\t/,$line);
              unless ($columns[0] =~ /^ID$/i || $columns[0] =~ /^\#ID$/i){
                  warn("***Error***: First column must be ID! File '$filename' has been skipped!\n") ;
                  close(IN);
                  next FILE;
              }
          }else{
            next if /^#/;
            my @values = split(/\t/,$line);
            for (my $i=0; $i<@values; $i++){
              if(!($columns[$i]=~/^#.*/) && $values[$i] ne ''){
                  $xrtFH->print("$values[0]\t$class\t$columns[$i]\t$values[$i]\n");
              }
            }
          }
          $count++;
      }
      close(IN);
   
   }


   foreach (@files) {
     $FH{$_} = IO::File->new("$tmpdir/$PREFIX$_",">") or die $_,": $!";
     $FH{$_}->autoflush;
   }
   
   my $CID     = 1;
   my $AID     = 1;
   my $GID     = 1;
   my %CLASSID     = ();
   my %ATTRIBUTEID  = ();
   my %DONE        = ();
   
   if ($NO_WIPE) {  ##########  THIS PART is copied from Gavin's bulk_reload_gff.pl
      # If I could figure out how to use DBI nicely here, I would, but for
      # Now I"ll just hack it... it's a command line script anyway, and its
      # not performance senstive
      my $command = "${\MYSQL} $AUTH -e \"SELECT 'Yak' as pattern,max(gid)+1 FROM gdata\" $DSN";
      my $result = `$command`;
      $result=~m/Yak\t([0-9]*)/;
      $GID=$1;

      my $result = `${\MYSQL} $AUTH -e \"SELECT * FROM gclass\" $DSN`;
      my @dbclasses = split /\n/, $result;
      shift @dbclasses; # Discard Header
      foreach(@dbclasses){
          chomp;
          my @class = split /\t/;
          $CLASSID{uc($class[1])}=$class[0] unless $DONE{"C".$class[0]}++;
          $CID = $class[0] + 1 unless ($CID > $class[0])
      }

      my $result = `${\MYSQL} $AUTH -e \"SELECT * FROM gattribute\" $DSN`;
      my @dbattributes = split /\n/,$result;
      shift @dbattributes; # Discard Header
      foreach(@dbattributes){
          chomp;
          my @attribute = split /\t/;
          $ATTRIBUTEID{lc($attribute[1])}=$attribute[0] unless $DONE{"A".$attribute[0]}++;
          $AID = $attribute[0] + 1 unless ($AID > $attribute[0])
      }

      warn "Starting GID is $GID\nStarting CID is $CID\nStarting AID is $AID\n";
   }
   
   my $count;
   @ARGV = ("$xrttmp");

   while (<>) {
     chomp;
     next if /^\#/;
     my ($ref,$class,$attribute,$values) = split "\t";
     # handle value parsing, value can have multiple entries in it
     my @values = split(/\\t|\&\;/,$values);
     foreach(@values){
       my $value = $_;
       if ($attribute =~ /^[P|C]_ID\/(.+)$/i) {
         my $class1 = $1;
         my $cid1     = $CLASSID{uc($class)}      ||= $CID++;
         my $cid2     = $CLASSID{uc($class1)}     ||= $CID++;
         $FH{ GCLASS() }->print(    join("\t",$cid1,$class),"\n"              ) unless $DONE{"C$cid1"}++;
         $FH{ GCLASS() }->print(    join("\t",$cid2,$class1),"\n"              ) unless $DONE{"C$cid2"}++;
         if ($attribute =~ /^P_ID/i){
             $FH{ GRELATIONSHIP()  }->print(    join("\t",$cid2,$value,$cid1,$ref),"\n"   );
         }else{
             $FH{ GRELATIONSHIP()  }->print(    join("\t",$cid1,$ref,$cid2,$value),"\n"   );
         }
       }else{
         $FEATURES++;
         my $gid     = $GID++;
         my $cid     = $CLASSID{uc($class)} ||= $CID++;
         my $aid     = $ATTRIBUTEID{lc($attribute)} ||= $AID++;
  
         $FH{ GDATA()  }->print(    join("\t",$gid,$ref,$cid,$aid,$value),"\n"   );
         $FH{ GCLASS() }->print(    join("\t",$cid,$class),"\n"              ) unless $DONE{"C$cid"}++;
         $FH{ GATTRIBUTE()  }->print(    join("\t",$aid,$attribute),"\n"       ) unless $DONE{"A$aid"}++;
       
         if ( $gid % 1000 == 0) {
           print STDERR "$gid entries parsed...";
           print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
         }
       }
     }
   }
   $_->close foreach values %FH;

   unlink $xrttmp;
}
   

if ($PONLY) {
  print "\nParsing is done.\n";
  exit;
}

warn "Loading feature data.  You may see duplicate key warnings here...\n";

my $success = 1;

unless ($NO_WIPE) {
  # drop the index, that will make the data loading faster
  $success &&= system("${\MYSQL} $AUTH -e 'ALTER TABLE gdata DROP INDEX gfulltext' $DSN") == 0;
  $success &&= system("${\MYSQL} $AUTH -e 'ALTER TABLE gdata DROP INDEX gvalue' $DSN") == 0;
}

foreach (@files) {
  my $command =<<END;
${\MYSQL} $AUTH
-e "lock tables $_ write; load data infile '$tmpdir/$PREFIX$_' ignore into table $_; unlock tables"
$DSN
END
;
  $command =~ s/\n/ /g;
  warn "loading $PREFIX$_...\n";
  $success &&= system($command) == 0;
  unlink "$tmpdir/$_" unless ($PONLY or $NO_PARSE);
}

unless ($NO_WIPE) {
  # recreate the index after data has been loaded
  warn "creating index...\n";
  $success &&= system("${\MYSQL} $AUTH -e 'ALTER TABLE gdata ADD FULLTEXT gfulltext (gvalue)' $DSN") == 0;
  $success &&= system("${\MYSQL} $AUTH -e 'ALTER TABLE gdata ADD INDEX (gvalue(100))' $DSN") == 0;
}

warn "done...\n";

if ($success) {
  print "SUCCESS: ".($NO_PARSE ? "no_parse option used, unknown number of" : $FEATURES)." rows successfully loaded\n";
  exit 0;
} else {
  print "FAILURE: Please see standard error for details\n";
  exit -1;
}

__END__


=head1 AUTHOR

Junjun Zhang E<lt>junjun@genet.sickkids.on.caE<gt>.

Copyright (c) 2004 The Centre for Applied Genomics.

This script is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.  See DISCLAIMER for
disclaimers of warranty.

=cut

!NO!SUBS!
close OUT or die "Can't close $file: $!";
chmod 0755, $file or die "Can't reset permissions for $file: $!\n";
exec("$Config{'eunicefix'} $file") if $Config{'eunicefix'} ne ':';
