#!/bin/perl
# Originally by David Strubbe, September 2010
# perl script for running testsuite with PBS scheduler from buildbot

use Time::HiRes qw(gettimeofday tv_interval);

# ---- tunable settings (all durations are in seconds) ----
$pbs_monitor_interval  = 30;          # pause between two successive qstat polls
$tail_monitor_interval = 0.5;         # handed to tail as its --sleep-interval
$write_interval        = 5 * 60;      # minimum gap between queue-status printouts while the job is not yet running
$output_file           = "test.out";  # file that is followed with tail once the job writes to it

# reference point for the timeout check
$starttime = [gettimeofday];

# ---- command line: job.scr timeout_seconds jobtitle end_delay [queue] ----
# exactly four mandatory arguments, plus an optional fifth one naming the queue
my $argc = scalar(@ARGV);
unless($argc == 4 || $argc == 5) {
    print "Number of arguments = $argc\n";
    die "Usage: perl buildbot_pbs.pl job.scr timeout_seconds jobtitle end_delay [queue]\n";
}
($script, $timeout_seconds, $jobtitle, $end_delay) = @ARGV[0..3];

# $queue is either a ready-made "-q name" option for qsub or the empty string
$queue = ($argc == 5) ? "-q " . $ARGV[4] : "";

print "Using job script $script with title $jobtitle.\n";
# files that PBS is asked (via -e / -o below) to use for the job's stderr and stdout
$stderr = $jobtitle . ".err";
$stdout = $jobtitle . ".out";

# Remove leftovers from a previous run before submitting:
# on some systems, if these files are already present there may be a PBS error
# also, clearing things out makes sure the timestamps will mean what we think they do
# unlink is used instead of spawning "rm -f" so the names are never re-parsed
# by a shell; like rm -f, it silently skips files that do not exist.
unlink($stderr, $stdout, $output_file);

# Submit the job. The options are placed before the script operand, matching
# the documented synopsis "qsub [options] script", so they are honored even if
# qsub's option parser stops at the first non-option argument.
# Only the last line of qsub's output is kept; it is used afterwards as the
# job identifier for qstat and qdel.
$jobname = `qsub -N $jobtitle -e $stderr -o $stdout $queue $script | tail -1`;
chomp $jobname;
if($jobname eq "") {
    # nothing on standard output: no identifier to monitor, so give up
    die("Job submission failed.\n");
}
print "Job name = $jobname\n";

# ---- poll the job until it is done, or die when the timeout expires ----
# State carried across polls (flags are 0 = not yet, 1 = yes):
$last_write = 0;     # value of $elapsed when the queue status was last printed
$running = 0;        # qstat has reported state "R" at least once
$writing = 0;        # tail -f has been started on $output_file
$test_finished = 0;  # "Exit status" has appeared in the job's standard output file
$job_finished = 0;   # job is unknown to qstat, or listed in state C or E
# NOTE(review): a job that goes from queued to finished between two polls is
# never seen in state "R", so tail is never started and its output is not echoed.
do {
    sleep($pbs_monitor_interval);
    # qstat's stderr is captured separately through a scratch file
    $qstat = `qstat $jobname 2> qstat_err_`;
    $qstat_err = `cat qstat_err_`;

    # this means qstat has failed, and all bets are off
    if($qstat_err ne "" && $qstat_err !~ /Unknown Job Id/) {
        print "qstat error: " . $qstat_err . "\n";
    }

    chomp $qstat; # there may be an end of line here otherwise
    @lines = split(/\n/, $qstat);
    # Forget the previous poll's result first: if this qstat call produced no
    # job line, the state must be treated as unknown, not as the last state seen.
    @fields = ();
    # search to be robust against extra junk before the info we need
    # (the job's own line is taken to be two lines below the "Job id" header)
    for $iline (0..$#lines) {
        if($lines[$iline] =~ /Job id/ && $iline + 2 <= $#lines) {
            @fields = split(' ', $lines[$iline+2]);
        }
    }
    # fifth column of the job line is used as the state letter; "" when unknown
    my $state = defined($fields[4]) ? $fields[4] : "";

    $currenttime = [gettimeofday];
    $elapsed = tv_interval($starttime, $currenttime);

    if($running == 0 && $state eq "R") {
        print "Job is running.\n";
        $running = 1;
    }

    if($running == 1 && $writing == 0) {
        # don't tail before MPI job has started writing
        # otherwise some of file from previous run may be written out
        # sometimes this age will never go negative, only less than a second but positive
        # (-M is the file's age in days relative to this script's start time,
        #  so the 1e-4 threshold corresponds to roughly 8.6 seconds)
        if(-e $output_file && -M $output_file <= 1e-4) {
            $writing = 1;
            print "Parallel job output:\n===========================================\n";
            system "tail -n +1 --pid=$$ --sleep-interval=$tail_monitor_interval -f $output_file &";
            # giving PID of this Perl script will make tail stop when this script does
            # -n +1 is needed so we start at the beginning of the file
        }
    }

    if($elapsed > $timeout_seconds) {
        system("qdel $jobname");
        # presumably gives the job and tail time to flush remaining output -- confirm
        sleep($end_delay);
        print "===========================================\n";
        print $qstat . "\n";
        # the normal cleanup after the loop is never reached from here,
        # so remove the qstat scratch file before aborting
        unlink("qstat_err_");
        die("Job has not completed before timeout. Killing job.\n");
    }

    # while the job has not started running, show the queue status every $write_interval seconds
    if($running == 0 && $elapsed - $last_write > $write_interval) {
        $last_write = $elapsed;
        print $qstat . "\n";
    }

    # we cannot stop until the job standard out file is present
    if(-e $stdout) {
        # if the standard error does not say the equivalent of
        #   qstat: Unknown Job Id 583497.sched-00.scs.lbl.gov
        # then qstat has failed and we don't know if the job is finished!
        # we will not consider the job done in this case, and continue waiting.
        if(($qstat eq "" && $qstat_err =~ /Unknown Job Id/) || $state eq "C" || $state eq "E") {
            $job_finished = 1;
        }

        $status = `cat $stdout`;
        if($status =~ /Exit status/) {
            $test_finished = 1;
        }
    }

    # Seconds since $output_file was last modified. When the file does not
    # exist, stat returns an empty list and the age becomes the full epoch
    # time, i.e. "very old", so a missing file never keeps the loop alive.
    my @output_stat = stat($output_file);
    $fileage = time - (@output_stat ? $output_stat[9] : 0);

} while (($job_finished == 0 && $test_finished == 0) || $fileage < $end_delay);
# We stop when the job is no longer running (i.e. not in queue, or listed as completed or exiting)
# or "Exit status" has appeared in the standard out, and its standard out file exists,
# and the job has not written to the output file in the last few seconds.

# ---- final report; dies with "Test failed." unless the job reported exit status 0 ----

# remove the scratch file that held qstat's stderr during monitoring
# (unlink stays silent if the file is already gone)
unlink("qstat_err_");

print "===========================================
Parallel job has finished.\n";

if(! -e $output_file) {
    # without the file, $fileage is not a real age (it was computed from a
    # failed stat), so do not report it as a number of seconds
    print "Output file $output_file does not exist.\n";
} elsif($fileage >= $end_delay) {
    print "No write to output for $fileage seconds (> $end_delay).\n";
}

# temporary debugging info below
if($job_finished == 0) {
    print "No, job did not finish!\n";
}
if($test_finished == 0) {
    print "No, test has not finished!\n";
}
# end temp debug info

# echo the job script's standard error, but only if that file exists and is non-empty
if(-s $stderr) {
    print "\nStandard error from job script:\n";
    # list form: the file name is passed to cat directly, not through a shell
    system("cat", $stderr);
    print "===========================================\n";
}

# the job counts as successful only if its standard output contains "Exit status = 0";
# a missing or unreadable file leaves $status empty and therefore counts as a failure
$status = "";
if(open(my $stdout_fh, '<', $stdout)) {
    local $/;   # read the whole file in one go
    my $content = <$stdout_fh>;
    $status = $content if defined $content;
    close($stdout_fh);
}
if($status !~ /Exit status = 0/) {
    # show the last qstat output to help diagnose the failure
    print $qstat . "\n";
    die("Test failed.\n");
}
