#!/bin/bash
# synchronize a set of machines.
#
# optparam prefix "-t" value The amount of time to wait.
# reqparam value barrier-name A name for the barrier. (This will allow overlapping sets of machines to participate in barriers that may end up adjacent in time)
# reqparam value repeat The IP of the hosts to synchronize
#
#ARGS:
# barrier [ -t timeout ] barrier-name server client1 client2 ... clientn
#    The timeout is specified in seconds.
#    All the clients must be able to connect to the server
#    Clients and servers may be specified by IP or name if DNS is working
#
# Theory of operation:
#
#   There will be a set of machines that want to synchronize.  The first one in
# the list will act as the server.  All the other machines will connect to
# the server in order to accomplish the synchronization.  If the server is ready
# before any particular client, there isn't a problem --  The server will
# wait for each client to connect before continuing.  If a particular client
# is ready before the server there is a potential problem.  The server may
# need to reboot an arbitrary number of times after that particular client is
# ready.  To solve this problem, there is a two phase synchronization.  In the
# first phase, each client creates a named pipe, and waits for the keyword 
# from the server.  The clients will only get this keyword when the server has
# entered the barrier script, and is ready to proceed.  Because a named pipe
# is blocking, the server will not be able to continue to the next step until
# all clients have connected.  Once the server knows all the clients are ready
# it will send the secondary keyword to each of the clients.  At that point,
# all the machines will successfully exit the script.
#

# Load the autobench environment.  If /etc/autobench.conf cannot be sourced,
# fall back to sourcing a file called "functions" (found through the shell's
# normal `.` lookup).
# NOTE(review): presumably this is what provides log, check_status, AUTODIR,
# MYIP and DOT_JOB_ID used further down -- confirm against the config.
. /etc/autobench.conf || . functions

# A little hack -- the only way to get the script to actually exit on the
# timeout is to kill it (exit doesn't work).  But the return code when it's
# killed is 128 + the signal number.  It should return 86 when something goes
# wrong.  So this little snippet calls itself with a leading "-r" marker and
# maps every non-zero status of that inner run to 86.
#
# "$0" is quoted so the re-invocation also works when the script lives in a
# path containing whitespace.  $* is deliberately left unquoted: the arguments
# are re-split on whitespace exactly as before, so a caller that hands over
# the whole host list as one string keeps working.
if [ "$1" != "-r" ]; then
  # Outer invocation: run the real work in a child and translate its status.
  # shellcheck disable=SC2048,SC2086
  "$0" -r $*
  if [ $? -ne 0 ]; then
    exit 86
  fi
  exit 0
else
  # Inner invocation: drop the "-r" marker and fall through to the real work.
  shift 1
fi

# Argument handling ###########################################################
# An optional leading "-t <value>" overrides the default timeout of 300; what
# is left afterwards must hold at least the barrier name and the server.
TIMEOUT=300
case "$1" in
  -t)
    TIMEOUT=$2
    shift 2
    ;;
esac

[ $# -ge 2 ] || {
  echo "You must specify at least barrier-name server"
  exit 1
}

# Barrier identity and working directory.
#   NAME    - barrier name from the command line with $DOT_JOB_ID appended
#   BARBASE - barrier directory without the $AUTODIR prefix
#   BARDIR  - barrier directory on this machine ($AUTODIR + BARBASE)
NAME=$1$DOT_JOB_ID
BARBASE=/var/tmp/barrier/$NAME
BARDIR=$AUTODIR$BARBASE
shift

# Without our own address we cannot tell which role to play.  "false" sets a
# failing status for check_status to act on.
# NOTE(review): check_status is defined outside this file; presumably it
# reports the message and aborts when the previous command failed -- confirm.
if [ -z "$MYIP" ]; then
    false
    check_status "unable to determine my IP"
fi

# After the shift above, the first remaining argument is the server.
SERVERIP=$1

# setup sleep for barrier timeout
# The watchdog sleeps through a link called "barriersleep" so that its sleep
# process has a recognisable name.  The link is (re)created whenever it is not
# usable: -x also catches a dangling link, and "ln -sf" replaces it instead of
# failing with "File exists".
if [ ! -x "$AUTODIR/scripts/bin/barriersleep" ]; then
  ln -sf "$(command -v sleep)" "$AUTODIR/scripts/bin/barriersleep"
fi


# Before we go into a potentially infinite loop, set the timeout
#
# timeout LABEL
#   Watchdog body; it is started in the background right below.  It sleeps
#   for $TIMEOUT through the barriersleep link.  If it gets past the sleep it
#   logs the timeout (tagged with LABEL), terminates the ssh processes whose
#   command line mentions autobench.conf, and finally kills the main script.
#   Inside the backgrounded function $$ still expands to the PID of the main
#   script, which is why "kill $$" ends the barrier itself.
function timeout () {
  local ssh_pids

  "$AUTODIR/scripts/bin/barriersleep" "$TIMEOUT"

  log "Barrier script timed out ($1)"

  # Take the PID from the first whitespace-separated field: ps pads the PID
  # column with leading blanks, so splitting on single spaces would yield an
  # empty field.  The [s] bracket keeps this awk process from matching its
  # own command line in the ps listing.
  ssh_pids=$(ps waxf | awk '/[s]sh.*autobench.conf/ { printf "%s ", $1 }')
  ssh_pids=${ssh_pids% }
  log "Killing ssh at PID=$ssh_pids"
  if [ -n "$ssh_pids" ]; then
    # One word per PID is intended here.
    # shellcheck disable=SC2086
    kill $ssh_pids
  fi

  kill $$ 2> /dev/null
}
timeout "$NAME" &
# Remembered so the normal exit paths can cancel the watchdog.
TIMEOUT_PID=$!

# Role dispatch.  The first remaining argument is the server, every further
# argument is a client.  Host entries are compared as exact strings, so an
# address that is merely a prefix of a listed one (10.0.0.1 vs 10.0.0.11)
# is not mistaken for a participant.
#
# NOTE(review): "killall barriersleep" below terminates every process named
# barriersleep on this host, not only the one started by this barrier --
# confirm that concurrent barriers on one host are not expected.
if [ "$MYIP" = "$SERVERIP" ]; then
  # We are the server

  # Wait for all the other machines to be ready.  Writing to a named pipe
  # blocks until the matching client opens it for reading, so this loop only
  # finishes once every client has shown up.
  mkdir -p "$BARDIR"
  for ip in "$@"; do
    if [ "$ip" != "$MYIP" ]; then
      # We don't need to wait for ourself
      if [ ! -e "$BARDIR/${ip}-pipe" ]; then
        # The client may have created the pipe already; ignore that race.
        mknod "$BARDIR/${ip}-pipe" p 2> /dev/null
      fi
      log "Barrier $NAME: Server waiting for $ip"
      echo "ready" > "$BARDIR/${ip}-pipe"
    fi
  done

  # They're all ready, tell them they can go
  for ip in "$@"; do
    if [ "$ip" != "$MYIP" ]; then
      # We don't need to wait for ourself
      log "Barrier $NAME: Server telling $ip to go"
      echo "go" > "$BARDIR/${ip}-pipe"
    fi
  done

  # cleanup
  rm -Rf -- "$BARDIR"

  kill "$TIMEOUT_PID"
  killall barriersleep

elif printf '%s\n' "${@:2}" | grep -qxF -- "$MYIP"; then
  # We are a participating client

  # This assumes that the local machine is running openssh.
  # The current config is copied aside before the server entry is appended;
  # nothing in this script restores that copy.
  mkdir -p ~/.ssh/
  if [ -f ~/.ssh/config ]; then
    cp ~/.ssh/config ~/.ssh/config-autobench-backup
  fi
  touch ~/.ssh/config-autobench-backup
  {
    echo "Host $SERVERIP"
    echo "  StrictHostKeyChecking no"
    echo "  PasswordAuthentication no"
  } >> ~/.ssh/config

  # A little support function
  # heartbeat to keep connection alive, also handles known_hosts issue on 1st connect
  function say_hi {
    while true; do
      echo "yes"
      sleep 1
    done
  }

  # Wait for the server to be ready.  The remote command creates our pipe in
  # the server's barrier directory if needed and reads one message from it;
  # the loop retries until ssh succeeds and that message is "ready".
  while true; do
    log "Barrier $NAME: Client waiting for ready"
    RES=$(say_hi | ssh "$SERVERIP" -i ~/.ssh/autobench-id \
       '. /etc/autobench.conf ; \
        AUTODIR=${AUTODIR:-/autobench} && \
        REMOTE_BARDIR=$AUTODIR'"$BARBASE"' &&\
        mkdir -p $REMOTE_BARDIR && \
        mknod $REMOTE_BARDIR/'"${MYIP}-pipe"' p 2> /dev/null ; \
        cat $REMOTE_BARDIR/'"${MYIP}-pipe")
    RC=$?
    #log "BARRIER $NAME: result=$RES rc=$RC"
    if [ "$RC" -eq 0 ] && [ "$RES" = "ready" ]; then
      break
    fi
  done

  # We're ready -- that means the server isn't going to reboot anymore, now
  # we just have to wait for all the clients to connect to the server, then it
  # will tell us go.
  log "Barrier $NAME: Client waiting for go"
  RES=$(ssh "$SERVERIP" -i ~/.ssh/autobench-id \
     '. /etc/autobench.conf ; \
      AUTODIR=${AUTODIR:-/autobench} && \
      REMOTE_BARDIR=$AUTODIR'"$BARBASE"' &&\
      cat $REMOTE_BARDIR/'"${MYIP}-pipe"' ; \
      rc=$? ; \
      exit $rc')
  RC=$?
  kill "$TIMEOUT_PID"
  killall barriersleep
  #log "BARRIER $NAME: result=$RES rc=$RC"
  if [ "$RC" -ne 0 ] || [ "$RES" != "go" ]; then
    # Something bad happened.
    exit 42
  fi
else
  kill "$TIMEOUT_PID"
  killall barriersleep
  # We aren't participating
  log "Barrier $NAME: Not participating"
fi

