/* Fully optimized solution to the N-queens problem.
   This implementation removes the flag structure used to time
   the two optimizations (Wirth's O(1) validity check and the
   use of permutation vectors).  It also condenses two heavily
   heavily used into macros for to remove the function call
   overhead:
      Mark  --- modified to do both Mark and Unmark
      Valid --- the check of the diagonal attacks

   Author:    Timothy J. Rolfe
   Language:  C
*/
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <malloc.h>
#include <ctype.h>
#include <string.h>
#include "cpuTimes.h"

long int Nunique = 0,   // Accumulate results here
         Ntotal  = 0;

enum{FALSE,TRUE} TRACE=FALSE;// Enable/disable tracing execution

// These need to be modified outside of Nqueens
short *Diag = NULL, *AntiD = NULL;

/* Mark a particular [R][C] board cell as in-use or available     */
/* Massively used; inline to eliminate the function call overhead */
/*
inline void Mark (
     int R, int C, int size,
     short Diag[], short AntiD[], int Flag)
{
   int Idx;

// Diagonal:  Row-Col == constant
   Idx = R - C + size-1;
   Diag[Idx] = Flag;
// AntiDiagonal:  Row+Col == constant
   Idx = R + C;
   AntiD[Idx] = Flag;
}
--- for straight C, make a #define macro   */
#define Mark(R,C,size,Diag,AntiD,Flag) \
   {  Diag[R-C+size-1]=Flag; AntiD[R+C]=Flag;  }
/*
Here are the definitions of the public fields of MPI_Status:
    int MPI_SOURCE;
    int MPI_TAG;
    int MPI_ERROR;
} MPI_Status;
*/

#define  INIT   1  // Message to client:  size and [0]
#define  DATA   2  // Message from client with results
#define  EXIT   4  // Message from client with CPU time
                   // Also to client, giving permission to exit

// Server process:  send jobs to client compute engines and
// receives results back.
void StartQueens (int size, double *clientTime)
{  int   col, k,
         commBuffer[2], // Communication buffer -- size, [0]
         Count[2],      // Counts back from the clients
         limit = (size+1) / 2,  // Mirror images done automatically
         nProc,         // size of the communicator
         proc,          // For loop [1..nProc-1] within initial message
         nActive;       // Number of active processes
   MPI_Status Status;

   if (TRACE) puts("Server process has entered StartQueens");
   MPI_Comm_size (MPI_COMM_WORLD, &nProc);
   commBuffer[0] = size;
// Send initial configurations to all client processes --- or to those
// needed in case not all are required.
   for ( col = 0, proc = 1; proc < nProc && col < limit; proc++, col++ )
   {  commBuffer[1] = col;
      if (TRACE) printf ("Sending client %d job %d,%d\n", proc,
                         commBuffer[0], commBuffer[1]);
      MPI_Send (commBuffer, 2, MPI_INT, proc, INIT, MPI_COMM_WORLD);
   }
   nActive = proc-1;    // Since rank==0 is not used
   if (proc < nProc)    // More processes than jobs
   {  int dmy[2] = {0, 0};   // Termination message to unused processes
      while (proc < nProc)
         MPI_Send (dmy, 2, MPI_INT, proc++, INIT, MPI_COMM_WORLD);
   }
   if (TRACE) puts ("Server beginning to wait on results");
// Receive back results and send out new problems
   while ( col < limit )
   {  MPI_Recv(Count, 2, MPI_INT, MPI_ANY_SOURCE , DATA,
               MPI_COMM_WORLD, &Status);
      proc = Status.MPI_SOURCE;
      if (TRACE) printf ("Received results from client %d (%d, %d)\n",
                 proc, Count[0], Count[1]);
      Nunique += Count[0];
      Ntotal  += Count[1];
      commBuffer[1] = col++;
      if (TRACE) printf ("Sending client %d job %d,%d\n", proc,
                         commBuffer[0], commBuffer[1]);
      MPI_Send (commBuffer, 2, MPI_INT, proc, INIT, MPI_COMM_WORLD);
   }
// Finally, receive back pending results and send termination
// indication (message with size of zero).
   commBuffer[0] = 0;
   while (nActive > 0)
   {
      if (TRACE) printf ("%d pending\n", nActive);
      MPI_Recv(Count, 2, MPI_INT, MPI_ANY_SOURCE , DATA,
               MPI_COMM_WORLD, &Status);
      --nActive;
      proc = Status.MPI_SOURCE;
      if (TRACE) printf ("Received results from client %d (%d, %d)\n",
                 proc, Count[0], Count[1]);
      Nunique += Count[0];
      Ntotal  += Count[1];
      if (TRACE) printf ("Sending client %d termination message\n", proc);
      MPI_Send (commBuffer, 2, MPI_INT, proc, INIT, MPI_COMM_WORLD);
   }
   for (k = 1; k < nProc; k++)
   {  double time;
      if (TRACE) printf("Waiting for time #%d\n", k);
      MPI_Recv(&time, 1, MPI_DOUBLE, MPI_ANY_SOURCE , EXIT,
               MPI_COMM_WORLD, &Status);
      proc = Status.MPI_SOURCE;
      clientTime[proc] = time;
      if (TRACE) printf("Client %d sent time %3.3lf\n", proc, time);
   }
   for (proc = 1; proc < nProc; proc++)
   {  MPI_Send(&proc, 0, MPI_INT, proc, EXIT, MPI_COMM_WORLD);
      if (TRACE) printf ("Sending EXIT to %d\n", proc);
   }
   if (TRACE) puts("Exiting StartQueens.");
}

// Macro to swap datatype x, locations y and z
#define  swap(x,y,z) { x temp = y; y = z; z = temp;  }

// Prototype for forward referencing
void Nqueens (int*, int*, int, int);

// Client processes receive problems to process from the
// server and then return their results to the server.
void ProcessQueens(int myPos)
{
   int  nCells = 0, size, k, col,
        buffer[2];
   int *board = NULL, *trial = NULL;   // Allow for realloc use
   double startTime, endTime, elapsed;
   MPI_Status Status;

   if (TRACE) printf("Client %d has entered ProcessQueens.\n", myPos);
   startTime = cpuClock();
   MPI_Recv(buffer, 2, MPI_INT, 0, INIT, MPI_COMM_WORLD, &Status);
   if (TRACE) printf("Client %d has received problem: %d and %d\n",
                     myPos, buffer[0], buffer[1]);
   fflush(stdout);
   size = buffer[0];
   col  = buffer[1];
// As long as a valid problem is in hand, do the processing.
// The server sends a size of zero as a termination message
   while (size > 0)
   {  int Count[2];

      // Generate the arrays
      if ( size > nCells )
      {  int idx;

         board = (int*) calloc (size, sizeof *board);
         trial = (int*) calloc (size*2, sizeof *trial);
      // Allocate the boolean arrays Diag and AntiD
      // Note that calloc automatically fills with FALSE (0)
         Diag  = (short*) calloc (2*(size-1), sizeof *Diag);
         AntiD = (short*) calloc (2*(size-1), sizeof *AntiD);
         if ( !AntiD )
         {
            puts ("realloc failed --- out of memory!");
            exit (11);
         }
      // Initial permutation generated
      // Since trial is scratch space, it is filled by Nqueens.
         for (idx = 0; idx < size; idx++)
            board[idx] = idx;

         nCells = size;
      }
   // Zero out the counters for THIS problem start.
      Nunique = 0,
      Ntotal  = 0;
      swap (int, board[0], board[col]);
   // CRITICAL:  mark [0] as used, and then as unused
      Mark (0, board[0], size, Diag, AntiD, TRUE);
      if (TRACE) printf("%d calling Nqueens\n", myPos); fflush(stdout);
      Nqueens (board, trial, size, 1);
      Mark (0, board[0], size, Diag, AntiD, FALSE);
      swap (int, board[0], board[col]);     // Undo the swap
   // Put the data into the communication vector
      Count[0] = Nunique;  Count[1] = Ntotal;
      if (TRACE) printf("Client %d sending results (%d, %d).\n",
                        myPos, Count[0], Count[1]);
      MPI_Send(Count,  2, MPI_INT, 0, DATA, MPI_COMM_WORLD);
   // Get the next job --- or the termination message.
      if (TRACE) printf("Client %d waiting for job,\n", myPos);
      MPI_Recv(buffer, 2, MPI_INT, 0, INIT, MPI_COMM_WORLD, &Status);
      size = buffer[0];
      col  = buffer[1];
   }
// Return the total CPU time required.
   endTime = cpuClock();
   elapsed = endTime - startTime;
   if (TRACE) printf("Client %d sending time %3.3lf.\n", myPos, elapsed);
   MPI_Send(&elapsed, 1, MPI_DOUBLE, 0, EXIT, MPI_COMM_WORLD);
// Final hand-shake:  get permission to terminate
   MPI_Recv(buffer, 0, MPI_INT, 0, EXIT, MPI_COMM_WORLD, &Status);
}

/****************************************************************/
/*            And finally, all the Nqueens logic                */
/****************************************************************/

/* Check two vectors for equality; return first inequality (a la strncmp) */
int intncmp (int L[], int R[], int N)
{
   int Idx;

   for (Idx = 0; Idx < N; Idx++)
      if ( L[Idx] - R[Idx] )
         return L[Idx]-R[Idx];
   return 0;
}

/* Rotate +90 or -90: */
void Rotate(int R[], int C[], int N, int Neg)
{
   int Idx, Jdx;

   Jdx = Neg ? 0 : N-1;
   for (Idx = 0; Idx < N; Neg ? Jdx++ : Jdx--)
      C[Idx++] = R[Jdx];
   Jdx = Neg ? N-1 : 0;
   for (Idx = 0; Idx < N; Neg ? Jdx-- : Jdx++)
      R[C[Idx++]] = Jdx;
}

/* Vertical mirror:  reflect each row across the middle */
void Vmirror(int R[], int N)
{
   int Idx;

   for (Idx = 0; Idx < N; Idx++)
      R[Idx] = (N-1) - R[Idx];
   return;
}

/* Check the symmetries.  Return 0 if this is not the 1st */
/* solution in the set of equivalent solutions; otherwise */
/* return the number of equivalent solutions.             */
int SymmetryOps(
    int Board[],     /* The fully-populated board         */
    int Trial[],     /* Used for symmetry checks          */
                     /* Holds its own scratch space too!  */
    int Size)        /* Number of cells in a row/column   */
{  int  Idx;         /* Loop variable; intncmp result     */
   int  Nequiv;      /* Number equivalent boards          */
   int *Scratch=&Trial[Size];   /* Scratch space          */

/* Copy; Trial will be subjected to the transformations   */
   for (Idx = 0; Idx < Size; Idx++)
      Trial[Idx] = Board[Idx];

/* 90 degrees --- clockwise (4th parameter of Rotate is FALSE)*/
   Rotate (Trial, Scratch, Size, 0);
   Idx = intncmp (Board, Trial, Size);
   if (Idx > 0) return 0;
   if ( Idx == 0 )  /* No change on 90 degree rotation        */
      Nequiv = 1;
   else                                       /*  180 degrees */
   {  Rotate (Trial, Scratch, Size, 0);
      Idx = intncmp (Board, Trial, Size);
      if (Idx > 0) return 0;
      if ( Idx == 0 ) /* No change on 180 degree rotation     */
         Nequiv = 2;
      else                                    /* 270 degrees  */
      {  Rotate (Trial, Scratch, Size, 0);
         Idx = intncmp (Board, Trial, Size);
         if (Idx > 0) return 0;
         Nequiv = 4;
      }
   }
/* Copy the board into Trial for mirror checks */
   for (Idx = 0; Idx < Size; Idx++)
      Trial[Idx] = Board[Idx];
/* Reflect -- vertical mirror */
   Vmirror (Trial, Size);
   Idx = intncmp (Board, Trial, Size);
   if (Idx > 0) return 0;
   if ( Nequiv > 1 )    // I.e., no four-fold rotational symmetry
   {
/* -90 degrees --- equiv. to diagonal mirror */
      Rotate (Trial, Scratch, Size, -1);
      Idx = intncmp (Board, Trial, Size);
      if (Idx > 0) return 0;
      if ( Nequiv > 2 ) // I.e., no two-fold rotational symmetry
      {
/* -180 degrees --- equiv. to horizontal mirror */
         Rotate (Trial, Scratch, Size, -1);
         Idx = intncmp (Board, Trial, Size);
         if (Idx > 0) return 0;
/* -270 degrees --- equiv. to anti-diagonal mirror */
         Rotate (Trial, Scratch, Size, -1);
         Idx = intncmp (Board, Trial, Size);
         if (Idx > 0) return 0;
      }
   }
/* WE HAVE A GOOD ONE! */
   return Nequiv * 2;
}

/* Test the validity of this particular partial board.            */
/* Massively used; inline to eliminate the function call overhead */
/*
inline int Valid (int Board[], int Size, int Row,
                  short Diag[], short AntiD[] )
{
   int Idx;        // Index into Diag[] / AntiD[]
   int Chk;        // Occupied flag

// Diagonal:  Row-Col == constant
   Idx = Row - Board[Row] + Size-1;
   Chk = Diag[Idx];
// AntiDiagonal:  Row+Col == constant
   Idx = Row + Board[Row];
   Chk = Chk || AntiD[Idx];
   return !Chk;    // Valid if NOT any occupied
}
 --- for straight C, make a #define macro   */
#define Valid(Board, Size, Row, Diag, AntiD) \
!( Diag[Row-Board[Row]+Size-1] || AntiD[Row+Board[Row]] )

/* Process the partial (or complete) board for the indicated Row */
void Nqueens (int Board[], int Trial[], int Size, int Row)
{
   int Idx, Lim, Vtemp;

/* Check for a partial board. */
   if (Row < Size-1)
   {
      if (Valid (Board, Size, Row, Diag, AntiD))
      {
         Mark (Row, Board[Row], Size, Diag, AntiD, TRUE);
         Nqueens (Board, Trial, Size, Row+1);
         Mark (Row, Board[Row], Size, Diag, AntiD, FALSE);
      }
/*    Rejection of vertical mirror images means that row zero */
/*    only needs to be processed to the middle.               */
      Lim = Row ? Size : (Size+1)/2 ;
      for (Idx = Row+1; Idx < Lim; Idx++)
      {
         Vtemp = Board[Idx];
         Board[Idx] = Board[Row];
         Board[Row] = Vtemp;
         if (Valid (Board, Size, Row, Diag, AntiD))
         {
            Mark (Row, Board[Row], Size, Diag, AntiD, TRUE);
            Nqueens (Board, Trial, Size, Row+1);
            Mark (Row, Board[Row], Size, Diag, AntiD, FALSE);
         }
      }
/*    Regenerate original vector from Row to Size-1:  */
      Vtemp = Board[Row];
      for (Idx = Row+1; Idx < Size; Idx++)
         Board[Idx-1] = Board[Idx];
      Board[Idx-1] = Vtemp;
   }
/* This is a complete board.  Do the symmetry checks */
   else
   {
      if ( !Valid (Board, Size, Row, Diag, AntiD) )
         return;
      Idx = SymmetryOps (Board, Trial, Size);
      if (Idx)
      {
         Nunique++;
         Ntotal += Idx;
      }
   }
   return;
}

int main(int argc, char *argv[])
{
   int    nProc,                    // Processes in the communicator
          proc;                     // loop variable
   MPI_Status Status;               // Return status from MPI
   int    rc;                       // Status  code from MPI_Xxx() call
   int    myPos;                    // My own position

   rc = MPI_Init(&argc, &argv);
   if (rc != MPI_SUCCESS)
   {  puts ("MPI_Init failed"); exit(-1);  }

   rc = MPI_Comm_rank (MPI_COMM_WORLD, &myPos);
   rc = MPI_Comm_size (MPI_COMM_WORLD, &nProc);
   if (TRACE)
   {
      printf ("Process %d of %d started.\n", myPos, nProc);
      fflush (stdout);
   }
   if ( myPos == 0 )    // I.e., this is the server/master/host
   {
      int    size,
             k;
      FILE  *fptr;
      double ClockT, CPU[2], Clock[2], Lapsed,
            *clientTime = (double*) calloc(nProc, sizeof *clientTime);

#ifdef DEBUG
      puts("Server has entered its part of main");
#endif
      if (argc < 2)
      {
         fputs ("size:  ", stdout);
         scanf ("%d", &size);
      }
      else
      {
         size = atoi(argv[1]);
      }

      getTimes(Clock, CPU);
#ifdef DEBUG
      puts ("Server is calling StartQueens.");
#endif
      StartQueens (size, clientTime);
#ifdef DEBUG
      puts ("Server is back from StartQueens.");
#endif
      getTimes(&Clock[1], &CPU[1]);
      Lapsed = CPU[1] - CPU[0];
      ClockT = Clock[1] - Clock[0];
printf ("%22.15lg %22.15lg %22.15lg\n%22.15lg %22.15lg %22.15lg\n",
       Clock[1], Clock[0], ClockT, CPU[1], CPU[0], Lapsed);
      printf ("%3d ==> %10ld  %10ld  %15.8lg  %15.8lg\n",
              size, Nunique, Ntotal, Lapsed, ClockT);
      for (k = 1; k < nProc; k++)
         printf ("%15.7lg", clientTime[k]);
      putchar('\n');
   }
   else  // I.e., this is the client/slave/node
      ProcessQueens(myPos);
#ifdef DEBUG
   printf ("Process %d is back\n", myPos);
#endif
   MPI_Finalize();
   exit(0);
}
