#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "shmem.h"
// for region markers
#include "esp.h"


// Number of ints carried by one message (the length of the source and sink buffers).
#define MSG_ELEMS 4096
// Size of one message in bytes.  Fully parenthesized so it expands safely inside
// larger expressions (e.g. x / MSG_SIZE).
#define MSG_SIZE (MSG_ELEMS * sizeof(int))

#define PE(x) (x)
#define concat(a, b) a##b
#define MAX_NODES 128
// Iteration count: used for the passes inside calibrationLoop() and for the calls made by main().
#define CAL_LOOPS 5
#define HOST_NAME_MAX 128


// Floored modulo.  C's % operator truncates toward zero, so its result takes the sign
// of 'a'; this helper returns a result with the sign of 'b' instead (e.g. mod(-1, 4) == 3),
// which is what ring-index arithmetic needs.
//
// Returns 0 when b == 0 (mathematically undefined; 0 is returned rather than dividing by zero).
int mod(int a, int b)
{
	// Division by zero is undefined behaviour; report 0 instead.
	if (b == 0) {
		return 0;
	}
	// Every integer is a multiple of -1.  Handled separately because INT_MIN % -1
	// overflows in C.
	if (b == -1) {
		return 0;
	}
	int ret = a % b;
	// A non-zero remainder whose sign differs from b's is shifted by one period so the
	// result lands in [0, b) for b > 0 or (b, 0] for b < 0.  The operands have opposite
	// signs here, so the addition cannot overflow.
	if (ret != 0 && ((ret < 0) != (b < 0))) {
		ret += b;
	}
	return ret;
}

//  GOAL:  PE[0] starts the ring off by putting its data into the next PE.  Every PE then waits
//  	for its own flag, validates the data it received from the previous PE, and forwards its
//  	own data to the next PE; the last PE sends back to PE[0].
//  	Basically a simple synchronized token ring, repeated CAL_LOOPS times per PE.
//
//  Returns 0 on success; -1 if fewer than 2 PEs are running, the sync flags cannot be
//  allocated, or the received data fails validation.
//
//  NOTE(review): shmem_free() is a collective operation in OpenSHMEM, so an error return taken
//  by a single PE may leave the other PEs blocked -- confirm how failures should propagate.
int calibrationLoop()
{
	int sourceData[MSG_ELEMS];		// non-symmetric: only ever the local source of a put
	static int sinkData[MSG_ELEMS];		// marked static, so symmetric: remote PEs put into it

	int myPE   = shmem_my_pe();
	int numPEs = shmem_n_pes();
	int status = 0;

	if (numPEs < 2) {
		printf("Not enough PEs to play! Require at least 2 PEs.  Exiting.\n");
		return -1;
	}

	// Each PE sends a pattern derived from its own rank; the sink starts with a marker value
	// that never matches the expected pattern's first element from any sender.
	for (int x = 0; x < MSG_ELEMS; x++) {
		sourceData[x] = x + myPE;
		sinkData[x] = 0xABBA;
	}

	// Ring neighbours do not change between iterations.
	int sourcePE = mod(myPE - 1, numPEs);
	int destPE   = mod(myPE + 1, numPEs);

	// allocate a symmetric integer per PE to be used to sync between PEs.
	int *sync = (int *)shmem_calloc(numPEs, sizeof(int));
	if (sync == NULL) {
		printf("PE[%d] ERROR: shmem_calloc failed for %d sync flags.\n", myPE, numPEs);  fflush(stdout);
		return -1;
	}

	shmem_sync_all();

	if (0 == myPE)
	{
		printf("PE[0] starting things off by calling shmem_int_put(sinkData, sourceData, %d, %d) \n", 
                                                    MSG_ELEMS, destPE); fflush(stdout);
		esp_enter("SendingData");
		shmem_int_put(sinkData, sourceData, MSG_ELEMS, destPE);
		esp_exit("SendingData");

		// Order the payload ahead of the flag: without the fence the flag update may be
		// delivered to destPE before the data it announces.
		shmem_fence();
		shmem_atomic_set(&sync[destPE], destPE + 1, destPE);
	}

	shmem_sync_all();
	shmem_quiet();

	// Sanity check on the ring arithmetic before relying on sourcePE for validation.
	if (sourcePE < 0) {
		printf("\n\n*********** PE[%d] ERROR:  Calculated sourcePE invalid: %d ********** \n", myPE, sourcePE);
		status = -1;
		goto cleanup;
	}

	for (int calLoops = 0; calLoops < CAL_LOOPS ; calLoops++)
	{
		int errcnt = 0;

		// block on my element; the sender writes (my rank + 1) so that 0 means "not signalled"
		esp_enter("WaitingForData");
		shmem_int_wait_until(&sync[myPE], SHMEM_CMP_EQ, myPE + 1);
		esp_exit("WaitingForData");

		// reset it so the next lap of the ring blocks again
		sync[myPE] = 0;

		// Read & Verify: the previous PE sent x + its rank in element x.
		for (int x = 0; x < MSG_ELEMS; x++)
		{
			if (sinkData[x] != (x + sourcePE)) {
				printf("\n\n***** PE[%d] ERROR: sinkData[%d] = 0x%x, expected 0x%x. *****\n", 
                                            myPE, x, sinkData[x], x + sourcePE);  fflush(stdout);
				errcnt++;
			}
		}
		if (errcnt) {
			printf("PE[%d] FAIL: !!! ENCOUNTERED %d ERRORS !!! \n", myPE, errcnt);  fflush(stdout);
			status = -1;
			goto cleanup;
		}

		esp_enter("SendingData");
		shmem_int_put(sinkData, sourceData, MSG_ELEMS, destPE);	// note: is #elems, not data size!
		esp_exit("SendingData");

		// Signal the destPE we have sent.  The fence keeps the flag from overtaking the data.
		shmem_fence();
		shmem_atomic_set(&sync[destPE], destPE + 1, destPE);
	}

cleanup:
	shmem_free(sync);
	return status;
}


// Entry point: initializes SHMEM, runs calibrationLoop() up to CAL_LOOPS times (stopping at
// the first failure), prints a workload summary on PE 0 and finalizes SHMEM.
//
// Returns EXIT_SUCCESS when every iteration passed, EXIT_FAILURE otherwise, so that a
// failed run is visible to the launcher through the exit status.
int main(void)
{
	int status = EXIT_SUCCESS;

	// Initialize SHMEM
	shmem_init();
	
	int me = shmem_my_pe();
	int numPEs = shmem_n_pes();

	// Let everyone in
	shmem_barrier_all();
	shmem_sync_all();

	for (int x = 0; x < CAL_LOOPS; x++)
	{
		if (0 == me) {
			printf("Iteration: %d \n", x);  fflush(stdout);
		}
		if (0 != calibrationLoop()) {
			printf("PE[%d]: ************TEST FAILED************** :( \n", me);
			status = EXIT_FAILURE;
			break;
		}
		shmem_sync_all();					
		shmem_quiet();
	}
	if (0 == me) {
		// NOTE(review): calibrationLoop() itself iterates CAL_LOOPS times and is called
		// CAL_LOOPS times here, so the byte count below may understate the traffic -- confirm.
		printf("\n\nWORKLOAD (sanity) DETAILS: %d PEs each sent and received %d bytes of data.\n", 
                            numPEs, (int)(MSG_SIZE * CAL_LOOPS));  fflush(stdout);
	}

	// Wrap it up
	shmem_finalize();
	return status;
}

