Assignment 1: Traveling Salesperson Genetic Algorithm

The following is sample code that performs island-to-island exchanges (migration) on the host.

/*	island.cu

	simple example of island exchanges via host
*/

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <stdlib.h>

#define	PES	(1024*2)
#define	POP	50

typedef struct {
	int	num[POP][PES];
	int	best[PES];
	int	worst[PES];
} gdata_t;

__global__ void
island(gdata_t *p, const int pes)
{
    // pick best & worst
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < pes)
    {
	int best = 0;
	int worst = 0;
	for (int j = 1; j < POP; ++j)
	{
	    if (p->num[best][i] < p->num[j][i]) {
		best = j;
	    }
	    if (p->num[worst][i] >= p->num[j][i]) {
		worst = j;
	    }
	}
	p->best[i] = best;
	p->worst[i] = worst;
    }
}

/**
 * Host main routine
 */
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;
    int gbest, agree, pass = 0;

    // Print the vector length to be used, and compute its size
    int pes = PES;
    int pop = POP;
    size_t size = sizeof(gdata_t);

    // Allocate the host data
    gdata_t *hp = (gdata_t *)malloc(size);

    // Verify that allocations succeeded
    if (hp == NULL)
    {
        fprintf(stderr, "Failed to allocate host data!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host data
    for (int i = 0; i < pop; ++i)
    {
	for (int j = 0; j < pes; ++j)
	{
	    hp->num[i][j] = rand();
	}
    }

    // Allocate the device data
    gdata_t *dp = NULL;
    err = cudaMalloc((void **)&dp, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device data (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

// loop until all nodes agree
do {

    // Copy the host data to device
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(dp, hp, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =(pes + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    island<<>>(dp, pes);
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch island kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result back to the host memory
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(hp, dp, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy data back from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Communicate between islands
    // Pick global best & do all agree?
    gbest = hp->num[ hp->best[0] ][0];
    agree = 1;
    for (int i = 0; i < pes; ++i)
    {
	int whofrom = rand() % pes;

	// Communicate between islands
	hp->num[ hp->worst[i] ][i] = hp->num[ hp->best[whofrom] ][whofrom];

	// Pick best
        if (hp->num[ hp->best[i] ][i] != gbest)
	{
	    agree = 0;
	    if (hp->num[ hp->best[i] ][i] > gbest) gbest = hp->num[ hp->best[i] ][i];
	}
    }

    fprintf(stderr, "Pass %d\n", ++pass);

} while (!agree);

    printf("All agree!\n");

    // Free host memory
    free(hp);

    printf("Done\n");
    return 0;
}


GPU Computing