diff --git a/README.md b/README.md
index 98dd9a8..fca931f 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,73 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xincheng Zhang
+* Tested on: Windows 10, i7-4702HQ @ 2.20GHz 8GB, GTX 870M 3072MB (Personal Laptop)
 
-### (TODO: Your README)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+## ScreenShot
+
+___
+
+
+![](https://github.com/XinCastle/Project1-CUDA-Flocking/blob/master/images/2017.09.11.23.34.16.gif)
+
+
+* This result is tested under the following settings:
+* number of particles: 5000
+* blocksize: 128
+* coherent-uniform
+* rule1Distance 5.0f
+* rule2Distance 3.0f
+* rule3Distance 5.0f
+* rule1Scale 0.01f
+* rule2Scale 0.1f
+* maxSpeed 1.0f
+
+
+## Performance Analysis
+
+___
+
+The performance of **Naive Boids**, **Uniform Boids** and **Coherent Uniform Boids** is measured by FPS.
+The following is the diagram comparing these three methods.
+
+![](https://github.com/XinCastle/Project1-CUDA-Flocking/blob/master/images/diagram.png)
+
+** Note:I find the movement of mine looks the same as the example. However, the frame rates of uniform boids
+ and coherent uniform boids are much slower than I expect. I checked my algorithm but wasn't able to tell why
+ it happens.
+
+ In conclusion, when the boids number is more than 1000, the performance order is: Coherent Uniform Boids > Uniform Boids > Naive Boids
+
+
+### Questions
+
+___
+* For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+1. As the number of boids increases, the performance becomes worse for all the three methods. As for how much they are affected, the order
+is: Naive Boids > Uniform Boids ~=(almost equal to) Cohereent Uniform Boids. I think the reason is because as we add more boids in the system,
+there are more neighbors around a particle so that the whole algorithm requires more time for calculation. The naive approach is the least
+efficient, so it decreases faster than the other two methods.
+
+
+* For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+2. As the blocksize becomes larger, the frame rate slightly drops. I think the reason is the following:
+It's less efficient to fetch and access data in larger memory. Therefore, as the blocksize becomes larger, it takes more and more time for
+the system to access the memory.
+
+
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+3. Yes, the coherent uniform grid has better performance than the uniform grid because reading data which is stored in contiguous memory is faster.
+Therefore, the main reason of the improvement is because of contiguous memory.
+
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not?
+
+4. Yes, when the number of neighboring cells becomes 27 rather than 8, the performance becomes better. I think it's because as cell width decreases,
+the total particles defined as "neighboring particles" significantly decreases. Therfore, the time cost decreases although we have more cells to loop
+in the function.
\ No newline at end of file
diff --git a/images/2017.09.11.23.34.16.gif b/images/2017.09.11.23.34.16.gif
new file mode 100644
index 0000000..27b1dc5
Binary files /dev/null and b/images/2017.09.11.23.34.16.gif differ
diff --git a/images/diagram.png b/images/diagram.png
new file mode 100644
index 0000000..03fcaf5
Binary files /dev/null and b/images/diagram.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..7fd936d 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -86,6 +86,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
 
+glm::vec3 *dev_posAf;
+glm::vec3 *dev_velAf;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +172,19 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+
+
+  cudaMalloc((void**)&dev_posAf, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_posAfterShuffle failed!");
+
+  cudaMalloc((void**)&dev_velAf, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_velAfterShuffle failed!");
+
   cudaThreadSynchronize();
 }
 
@@ -231,9 +247,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+	glm::vec3 perceived_center (0,0,0);
+	glm::vec3 c (0,0,0);
+	glm::vec3 perceived_velocity (0,0,0);
+	glm::vec3 rule1(0, 0, 0);
+	glm::vec3 rule2(0, 0, 0);
+	glm::vec3 rule3(0, 0, 0);
+	glm::vec3 vc(0, 0, 0);
+	int N1 = 0, N2 = 0, N3 = 0;
+
+	for (int i = 0; i < N; i++)
+	{
+		if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule1Distance)
+		{
+			N1++;
+			perceived_center += pos[i];
+		}
+		if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule2Distance)
+		{
+			if (glm::length(pos[i] - pos[iSelf]) < 100)
+			{
+				c -= (pos[i] - pos[iSelf]);
+			}
+		}
+		if (i != iSelf && glm::length(pos[i] - pos[iSelf]) < rule3Distance)
+		{
+			N3++;
+			perceived_velocity += vel[i];
+		}
+	}
+
+	if (N1 <= 0)
+	{
+		rule1 = glm::vec3(0,0,0);
+	}
+	else
+	{
+		perceived_center /= N1;
+		rule1 = (perceived_center - pos[iSelf]) * rule1Scale;
+	}
+
   // Rule 2: boids try to stay a distance d away from each other
+
+	rule2 = c * rule2Scale;
+
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+	if (N3 <= 0)
+	{
+		rule3 = glm::vec3(0, 0, 0);
+	}
+	else
+	{
+		perceived_velocity /= N3;
+		rule3 = perceived_velocity * rule3Scale;
+	}
+
+	vc = rule1 + rule2 + rule3;
+  return vc;
 }
 
 /**
@@ -245,6 +316,16 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
   // Compute a new velocity based on pos and vel1
   // Clamp the speed
   // Record the new velocity into vel2. Question: why NOT vel1?
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index <= N)
+	{
+		vel2[index] = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+		if (glm::length(vel2[index]) > maxSpeed)
+		{
+			vel2[index] = vel2[index] / glm::length(vel2[index])*maxSpeed;
+		}
+	}
+	//Why not vel1? because we need the former velocity to compute new velocity in rule3.
 }
 
 /**
@@ -289,6 +370,19 @@ __global__ void kernComputeIndices(int N, int gridResolution,
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+	int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (id < N)
+	{
+		int x = (pos[id].x - gridMin.x)*inverseCellWidth;
+		int y = (pos[id].y - gridMin.y)*inverseCellWidth;
+		int z = (pos[id].z - gridMin.z)*inverseCellWidth;
+		gridIndices[id] = gridIndex3Dto1D(x, y, z, gridResolution);
+		indices[id] = id;
+	}
+	else
+	{
+		return;
+	}
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +400,27 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index < N)
+	{
+		if (index == 0)
+		{
+			gridCellStartIndices[particleGridIndices[0]] = 0;
+		}
+		else if (index == N - 1)
+		{
+			gridCellEndIndices[particleGridIndices[N - 1]] = N - 1;
+		}
+		else if (particleGridIndices[index] != particleGridIndices[index - 1])
+		{
+			gridCellStartIndices[particleGridIndices[index]] = index;
+			gridCellEndIndices[particleGridIndices[index-1]] = index-1;
+		}
+	}
+	else
+	{
+		return;
+	}
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,8 +437,132 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N)
+	{
+		return;
+	}
+	int particleIndex = particleArrayIndices[index];
+	float x = (pos[particleIndex].x - gridMin.x) * inverseCellWidth;
+	float y = (pos[particleIndex].y - gridMin.y) * inverseCellWidth;
+	float z = (pos[particleIndex].z - gridMin.z) * inverseCellWidth;
+
+	int dx = (x - (int)x) > 0.5f ? 1 : -1;
+	int dy = (y - (int)y) > 0.5f ? 1 : -1;
+	int dz = (x - (int)z) > 0.5f ? 1 : -1;
+
+	int neighborsIndex[8];
+	neighborsIndex[0] = gridIndex3Dto1D(x, y, z, gridResolution);
+	neighborsIndex[1] = gridIndex3Dto1D(x + dx, y, z, gridResolution);
+	neighborsIndex[2] = gridIndex3Dto1D(x + dx, y + dy, z, gridResolution);
+	neighborsIndex[3] = gridIndex3Dto1D(x + dx, y, z + dz, gridResolution);
+	neighborsIndex[4] = gridIndex3Dto1D(x + dx, y + dy, z + dz, gridResolution);
+	neighborsIndex[5] = gridIndex3Dto1D(x, y + dy, z, gridResolution);
+	neighborsIndex[6] = gridIndex3Dto1D(x, y + dy, z + dz, gridResolution);
+	neighborsIndex[7] = gridIndex3Dto1D(x, y, z + dz, gridResolution);
+
+	glm::vec3 v1(0.0f, 0.0f, 0.0f);
+	glm::vec3 v2(0.0f, 0.0f, 0.0f);
+	glm::vec3 v3(0.0f, 0.0f, 0.0f);
+
+	glm::vec3 c(0.0f, 0.0f, 0.0f);
+	glm::vec3 perceived_center(0.0f,0.0f,0.0f);
+	glm::vec3 perceived_velocity(0.0f, 0.0f, 0.0f);
+
+	int gridCount = gridResolution * gridResolution * gridResolution;
+
+	int r1 = 0 , r3 = 0;
+
+	for (int i = 0; i < 8; i++)
+	{
+		if (neighborsIndex[i] < 0 || neighborsIndex[i] >= gridCount)
+		{
+			continue;
+		}
+		if (gridCellStartIndices[neighborsIndex[i]] != -1 && gridCellEndIndices[neighborsIndex[i]] != -1)
+		{
+			for (int j = gridCellStartIndices[neighborsIndex[i]]; j <= gridCellEndIndices[neighborsIndex[i]]; j++)
+			{
+				int k = particleArrayIndices[j];
+				//if the particle is not itself ***see the rule definition in 1.2
+				if (particleIndex != k)
+				{
+					float distance = glm::length(pos[k] - pos[particleIndex]);
+
+					if (distance < rule1Distance)
+					{
+
+						perceived_center += pos[k];
+						//printf("I'm here\n");
+						r1 ++;
+					}
+					if (distance < rule2Distance)
+					{
+						if (distance < 100)
+						{
+							c -= (pos[k] - pos[particleIndex]);
+						}
+					}
+					if (distance < rule3Distance)
+					{
+						perceived_velocity += vel1[k];
+						r3 ++;
+					}
+				}
+			}
+		}
+	}
+
+	if (r1 > 0)
+	{
+		perceived_center /= r1;
+		v1 = (perceived_center - pos[particleIndex])* rule1Scale;
+	}
+
+	v2 = c * rule2Scale;
+
+	if (r3 > 0)
+	{
+		perceived_velocity /= r3;
+		v3 = perceived_velocity * rule3Scale;
+	}
+
+	
+	glm::vec3 finVel = vel1[particleIndex] + v1 +v2 +v3;
+
+
+
+	//clamp
+	if (glm::length(finVel) > maxSpeed)
+	{
+		finVel = glm::normalize(finVel) * maxSpeed;
+	}
+
+	vel2[particleIndex] = finVel;
+
 }
 
+
+__global__ void kernRearrange(
+	int N, int* particleArrayIndices, glm::vec3* pos, glm::vec3* posAf,
+	glm::vec3* vel, glm::vec3* velAf)
+{
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N) {
+		return;
+	}
+	posAf[index] = pos[particleArrayIndices[index]];
+	velAf[index] = vel[particleArrayIndices[index]];
+
+
+
+}
+
+
+
+
 __global__ void kernUpdateVelNeighborSearchCoherent(
   int N, int gridResolution, glm::vec3 gridMin,
   float inverseCellWidth, float cellWidth,
@@ -341,6 +580,110 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+
+	
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+	if (index >= N)
+	{
+		return;
+	}
+
+	float x = (pos[index].x - gridMin.x) * inverseCellWidth;
+	float y = (pos[index].y - gridMin.y) * inverseCellWidth;
+	float z = (pos[index].z - gridMin.z) * inverseCellWidth;
+
+	int dx = ((x - (int)x) >= 0.5f) ? 1 : -1;
+	int dy = ((y - (int)y) >= 0.5f) ? 1 : -1;
+	int dz = ((z - (int)z) >= 0.5f) ? 1 : -1;
+
+	int neighborsIndex[8];
+	neighborsIndex[0] = gridIndex3Dto1D(x, y, z, gridResolution);
+	neighborsIndex[1] = gridIndex3Dto1D(x + dx, y, z, gridResolution);
+	neighborsIndex[2] = gridIndex3Dto1D(x + dx, y + dy, z, gridResolution);
+	neighborsIndex[3] = gridIndex3Dto1D(x + dx, y, z + dz, gridResolution);
+	neighborsIndex[4] = gridIndex3Dto1D(x + dx, y + dy, z + dz, gridResolution);
+	neighborsIndex[5] = gridIndex3Dto1D(x, y + dy, z, gridResolution);
+	neighborsIndex[6] = gridIndex3Dto1D(x, y + dy, z + dz, gridResolution);
+	neighborsIndex[7] = gridIndex3Dto1D(x, y, z + dz, gridResolution);
+
+	glm::vec3 v1(0.0f, 0.0f, 0.0f);
+	glm::vec3 v2(0.0f, 0.0f, 0.0f);
+	glm::vec3 v3(0.0f, 0.0f, 0.0f);
+
+	glm::vec3 c(0.f);
+	glm::vec3 perceived_center(0.f);
+	glm::vec3 perceived_velocity(0.f);
+
+	int r1=0, r3 = 0;
+
+	int gridCount = gridResolution * gridResolution * gridResolution;
+	
+	for (int i = 0; i < 8; i++)
+	{
+		if (neighborsIndex[i] < 0 || neighborsIndex[i] >= gridCount)
+		{
+			continue;
+		}
+		if (gridCellStartIndices[neighborsIndex[i]] != -1 && gridCellEndIndices[neighborsIndex[i]] != -1)
+		{
+			for (int j = gridCellStartIndices[neighborsIndex[i]]; j <= gridCellEndIndices[neighborsIndex[i]]; j++)
+			{
+				//if the particle is not itself ***see the rule definition in 1.2
+				if (index != j)
+				{
+					float distance = glm::length(pos[j] - pos[index]);
+
+					if (distance < rule1Distance)
+					{
+
+						perceived_center += pos[j];
+						//printf("I'm here\n");
+						r1++;
+					}
+					if (distance < rule2Distance)
+					{
+						if (distance < 100)
+						{
+							c -= (pos[j] - pos[index]);
+						}
+					}
+					if (distance < rule3Distance)
+					{
+						perceived_velocity += vel1[j];
+						r3++;
+					}
+				}
+			}
+		}
+	}
+
+	if (r1 > 0)
+	{
+		perceived_center /= r1;
+		v1 = (perceived_center - pos[index])* rule1Scale;
+	}
+
+	v2 = c * rule2Scale;
+
+	if (r3 > 0)
+	{
+		perceived_velocity /= r3;
+		v3 = perceived_velocity * rule3Scale;
+	}
+
+
+	glm::vec3 finVel = vel1[index] + v1 + v2 + v3;
+
+
+
+	//clamp
+	if (glm::length(finVel) > maxSpeed)
+	{
+		finVel = glm::normalize(finVel) * maxSpeed;
+	}
+
+	vel2[index] = finVel;
+
 }
 
 /**
@@ -349,6 +692,12 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+	int N = numObjects;
+	dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize);
+	kernUpdateVelocityBruteForce << <fullBlocksPerGrid, blockSize >> > (N, dev_pos, dev_vel1, dev_vel2);
+	kernUpdatePos << <fullBlocksPerGrid, blockSize >> > (N, dt, dev_pos, dev_vel2);
+
+	std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +713,29 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+	kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+	dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+	dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+	
+	dim3 BlocksPerGridCell = (gridCellCount + blockSize - 1) / blockSize;
+
+	kernResetIntBuffer << <BlocksPerGridCell, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer << <BlocksPerGridCell, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+
+
+
+	kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+	kernUpdateVelNeighborSearchScattered << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+	kernUpdatePos << <fullBlocksPerGrid, blockSize >> >(numObjects, dt, dev_pos, dev_vel2);
+
+	std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +754,38 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+
+
+	dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+	kernComputeIndices << <fullBlocksPerGrid, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+	dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+	dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+	dim3 BlocksPerGridCell = (gridCellCount + blockSize - 1) / blockSize;
+
+	kernResetIntBuffer << <BlocksPerGridCell, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer << <BlocksPerGridCell, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1);
+
+
+
+	kernIdentifyCellStartEnd << <fullBlocksPerGrid, blockSize >> > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+	kernRearrange<< <fullBlocksPerGrid, blockSize >> >(numObjects,
+		dev_particleArrayIndices, dev_pos, dev_posAf, dev_vel1, dev_velAf);
+
+	kernUpdateVelNeighborSearchCoherent << <fullBlocksPerGrid, blockSize >> >(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth,
+		dev_gridCellStartIndices, dev_gridCellEndIndices, dev_posAf, dev_velAf, dev_vel2);
+
+	std::swap(dev_pos, dev_posAf);
+
+	kernUpdatePos << <fullBlocksPerGrid, blockSize >> >(numObjects, dt, dev_pos, dev_vel2);
+
+	std::swap(dev_vel1, dev_vel2);
+
+
 }
 
 void Boids::endSimulation() {
@@ -390,6 +794,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+
+  cudaFree(dev_posAf);
+  cudaFree(dev_velAf);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index a29471d..a7c2c99 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,8 +14,8 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;