How to Build High-Performance GPU-Accelerated Simulations and Differentiable Physics Workflows Using NVIDIA Warp Kernels

angles = np.linspace(0.0, 2.0 * np.pi, n_particles, endpoint=False, dtype=np.float32)
px0_np = 0.4 * np.cos(angles).astype(np.float32)
py0_np = (0.7 + 0.15 * np.sin(angles)).astype(np.float32)
vx0_np = (-0.8 * np.sin(angles)).astype(np.float32)
vy0_np = (0.8 * np.cos(angles)).astype(np.float32)


px0_wp = wp.array(px0_np, dtype=wp.float32, device=device)
py0_wp = wp.array(py0_np, dtype=wp.float32, device=device)
vx0_wp = wp.array(vx0_np, dtype=wp.float32, device=device)
vy0_wp = wp.array(vy0_np, dtype=wp.float32, device=device)


state_size = (steps + 1) * n_particles
px_wp = wp.empty(state_size, dtype=wp.float32, device=device)
py_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vx_wp = wp.empty(state_size, dtype=wp.float32, device=device)
vy_wp = wp.empty(state_size, dtype=wp.float32, device=device)


wp.launch(
   kernel=init_particles_kernel,
   dim=n_particles,
   inputs=[n_particles, px0_wp, py0_wp, vx0_wp, vy0_wp],
   outputs=[px_wp, py_wp, vx_wp, vy_wp],
   device=device,
)


wp.launch(
   kernel=simulate_particles_kernel,
   dim=steps * n_particles,
   inputs=[n_particles, dt, gravity, damping, bounce, radius],
   outputs=[px_wp, py_wp, vx_wp, vy_wp],
   device=device,
)
wp.synchronize()


px_traj = px_wp.numpy().reshape(steps + 1, n_particles)
py_traj = py_wp.numpy().reshape(steps + 1, n_particles)


sample_ids = np.linspace(0, n_particles - 1, 16, dtype=int)
plt.figure(figsize=(8, 6))
for idx in sample_ids:
   plt.plot(px_traj[:, idx], py_traj[:, idx], linewidth=1.5)
plt.axhline(radius, linestyle="--")
plt.xlim(-1.05, 1.05)
plt.ylim(0.0, 1.25)
plt.title(f"Warp particle trajectories on {device}")
plt.xlabel("x")
plt.ylabel("y")
plt.show()


proj_steps = 180
proj_dt = np.float32(0.025)
proj_g = np.float32(-9.8)
target_x = np.float32(3.8)
target_y = np.float32(0.0)


vx_value = np.float32(2.0)
vy_value = np.float32(6.5)
lr = 0.08
iters = 60


loss_history = []
vx_history = []
vy_history = []


for it in range(iters):
   init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)
   init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device, requires_grad=True)


   x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
   y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
   vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
   vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device, requires_grad=True)
   loss_wp = wp.zeros(1, dtype=wp.float32, device=device, requires_grad=True)


   tape = wp.Tape()
   with tape:
       wp.launch(
           kernel=init_projectile_kernel,
           dim=1,
           inputs=[],
           outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, init_vx_wp, init_vy_wp],
           device=device,
       )
       wp.launch(
           kernel=projectile_step_kernel,
           dim=proj_steps,
           inputs=[proj_dt, proj_g],
           outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
           device=device,
       )
       wp.launch(
           kernel=projectile_loss_kernel,
           dim=1,
           inputs=[proj_steps, target_x, target_y],
           outputs=[x_hist_wp, y_hist_wp, loss_wp],
           device=device,
       )


   tape.backward(loss=loss_wp)
   wp.synchronize()


   current_loss = float(loss_wp.numpy()[0])
   grad_vx = float(init_vx_wp.grad.numpy()[0])
   grad_vy = float(init_vy_wp.grad.numpy()[0])


   vx_value = np.float32(vx_value - lr * grad_vx)
   vy_value = np.float32(vy_value - lr * grad_vy)


   loss_history.append(current_loss)
   vx_history.append(float(vx_value))
   vy_history.append(float(vy_value))


   if it % 10 == 0 or it == iters - 1:
       print(f"iter={it:02d} loss={current_loss:.6f} vx={vx_value:.4f} vy={vy_value:.4f} grad=({grad_vx:.4f}, {grad_vy:.4f})")


final_init_vx_wp = wp.array(np.array([vx_value], dtype=np.float32), dtype=wp.float32, device=device)
final_init_vy_wp = wp.array(np.array([vy_value], dtype=np.float32), dtype=wp.float32, device=device)
x_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
y_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vx_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)
vy_hist_wp = wp.zeros(proj_steps + 1, dtype=wp.float32, device=device)


wp.launch(
   kernel=init_projectile_kernel,
   dim=1,
   inputs=[],
   outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp, final_init_vx_wp, final_init_vy_wp],
   device=device,
)
wp.launch(
   kernel=projectile_step_kernel,
   dim=proj_steps,
   inputs=[proj_dt, proj_g],
   outputs=[x_hist_wp, y_hist_wp, vx_hist_wp, vy_hist_wp],
   device=device,
)
wp.synchronize()


x_path = x_hist_wp.numpy()
y_path = y_hist_wp.numpy()


fig = plt.figure(figsize=(15, 4))


ax1 = fig.add_subplot(1, 3, 1)
ax1.plot(loss_history)
ax1.set_title("Optimization loss")
ax1.set_xlabel("Iteration")
ax1.set_ylabel("Squared distance")


ax2 = fig.add_subplot(1, 3, 2)
ax2.plot(vx_history, label="vx")
ax2.plot(vy_history, label="vy")
ax2.set_title("Learned initial velocity")
ax2.set_xlabel("Iteration")
ax2.legend()


ax3 = fig.add_subplot(1, 3, 3)
ax3.plot(x_path, y_path, linewidth=2)
ax3.scatter([target_x], [target_y], s=80, marker="x")
ax3.set_title("Differentiable projectile trajectory")
ax3.set_xlabel("x")
ax3.set_ylabel("y")
ax3.set_ylim(-0.1, max(1.0, float(np.max(y_path)) + 0.3))


plt.tight_layout()
plt.show()


final_dx = float(x_path[-1] - target_x)
final_dy = float(y_path[-1] - target_y)
final_dist = math.sqrt(final_dx * final_dx + final_dy * final_dy)
print(f"Final target miss distance: {final_dist:.6f}")
print(f"Optimized initial velocity: vx={vx_value:.6f}, vy={vy_value:.6f}")

Source_link

No Result