import gr import gr.pygr as mlab import numba as nb import numpy as np import math import radar_rust from IPython import embed from numba import cuda from tqdm import tqdm from radar_nb import aot_func, aot_func_parallel def py_func(srcs, t): m = np.zeros((200, 200), np.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m def np_func(srcs, t): m = np.zeros((200, 200), np.float64) H, W = m.shape ys = np.arange(H).reshape(-1, 1).repeat(W, 1).reshape(-1) xs = np.arange(W).reshape(1, -1).repeat(H, 0).reshape(-1) ds = np.sqrt( (ys.reshape(-1, 1) - srcs.y.reshape(1, -1)) ** 2 + (xs.reshape(-1, 1) - srcs.x.reshape(1, -1)) ** 2 ) for i in range(len(srcs)): m[ys, xs] += np.cos(2.0 * np.pi * (ds[:, i] + t) + t * srcs.phase[i]) return m spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])) @nb.njit(nb.float64[:, :](spec[:], nb.float64)) def jit_func(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m @nb.njit(nb.float64[:, :](spec[:], nb.float64), fastmath=True) def jit_func_fast(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m def rust_func(srcs, t): buff = np.zeros((200, 200), dtype=np.float32) radar_rust.func(buff, srcs.y, srcs.x, srcs.phase, t) return buff def cuda_func(srcs, t): blockdim = 1024, 1 griddim = int(math.ceil(float(200 * 200) / blockdim[0])), 1 stream = cuda.stream() d_buff = cuda.device_array((200, 200), stream=stream) d_ys = cuda.to_device(np.ascontiguousarray(srcs.y), stream) d_xs = cuda.to_device(np.ascontiguousarray(srcs.x), stream) d_ps = cuda.to_device(np.ascontiguousarray(srcs.phase), stream) cuda_kernel[griddim, blockdim, stream](d_buff, d_ys, d_xs, d_ps, t) buff = d_buff.copy_to_host(stream=stream) stream.synchronize() return buff @cuda.jit((nb.float64[:, :], nb.int32[:], nb.int32[:], nb.int32[:], nb.float64)) def cuda_kernel(buff, src_ys, src_xs, src_ps, t): pos = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x y = pos // 200 x = pos % 200 for i in range(len(src_ys)): src_y, src_x, src_p = src_ys[i], src_xs[i], src_ps[i] dist = math.sqrt((y - src_y) ** 2 + (x - src_x) ** 2) buff[y, x] += math.cos(2.0 * np.pi * (dist + t) + t * src_p) if __name__ == "__main__": spec = np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]) srcs = np.array( [(98, 0, -2), (99, 0, -1), (100, 0, 0), (101, 0, 1), (102, 0, 2),], dtype=spec, ).view(np.recarray) # gr.beginprint("rader.mp4") exprs = [ ("cuda", cuda_func), ("aot-parallel", aot_func_parallel), ("aot", aot_func), ("rust", rust_func), ("jit", jit_func), ("jit-fast", jit_func_fast), ("np", np_func), ("py", py_func), ] for name, func in exprs: for t in tqdm(np.arange(0, 100, 0.01), desc=name): m = func(srcs, t) mlab.imshow(m) # gr.endprint()