# return 타입이 같을 때 @nb.jit(nb.types.UniTuple(nb.float64[:],2)(nb.float64[:]),nopython=True) def f(a) : return a,a @nb.jit('UniTuple(float64[:], 2)(float64[:])',nopython=True) def f(a) : return a,a # return 타입이 다를 때 @nb.jit(nb.types.Tuple((nb.float64[:], nb.float64[:,:]))(nb.float64[:], nb.float64[:,:]),nopython=True) def f(a, b) : return a, b @nb.jit('Tuple((float64[:], float64[:,:]))(float64[:], float64[:,:])',nopython=True) def f(a, b) : return a, b # https://stackoverflow.com/questions/30363253/multiple-output-and-numba-signatures
import gr import gr.pygr as mlab import numba as nb import numpy as np import math import radar_rust from IPython import embed from numba import cuda from tqdm import tqdm from radar_nb import aot_func, aot_func_parallel def py_func(srcs, t): m = np.zeros((200, 200), np.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m def np_func(srcs, t): m = np.zeros((200, 200), np.float64) H, W = m.shape ys = np.arange(H).reshape(-1, 1).repeat(W, 1).reshape(-1) xs = np.arange(W).reshape(1, -1).repeat(H, 0).reshape(-1) ds = np.sqrt( (ys.reshape(-1, 1) - srcs.y.reshape(1, -1)) ** 2 + (xs.reshape(-1, 1) - srcs.x.reshape(1, -1)) ** 2 ) for i in range(len(srcs)): m[ys, xs] += np.cos(2.0 * np.pi * (ds[:, i] + t) + t * srcs.phase[i]) return m spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])) @nb.njit(nb.float64[:, :](spec[:], nb.float64)) def jit_func(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m @nb.njit(nb.float64[:, :](spec[:], nb.float64), fastmath=True) def jit_func_fast(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m def rust_func(srcs, t): buff = np.zeros((200, 200), dtype=np.float32) radar_rust.func(buff, srcs.y, srcs.x, srcs.phase, t) return buff def cuda_func(srcs, t): blockdim = 1024, 1 griddim = int(math.ceil(float(200 * 200) / blockdim[0])), 1 stream = cuda.stream() d_buff = cuda.device_array((200, 200), stream=stream) d_ys = cuda.to_device(np.ascontiguousarray(srcs.y), stream) d_xs = cuda.to_device(np.ascontiguousarray(srcs.x), stream) d_ps = cuda.to_device(np.ascontiguousarray(srcs.phase), stream) cuda_kernel[griddim, blockdim, stream](d_buff, d_ys, d_xs, d_ps, t) buff = d_buff.copy_to_host(stream=stream) stream.synchronize() return buff @cuda.jit((nb.float64[:, :], nb.int32[:], nb.int32[:], nb.int32[:], nb.float64)) def cuda_kernel(buff, src_ys, src_xs, src_ps, t): pos = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x y = pos // 200 x = pos % 200 for i in range(len(src_ys)): src_y, src_x, src_p = src_ys[i], src_xs[i], src_ps[i] dist = math.sqrt((y - src_y) ** 2 + (x - src_x) ** 2) buff[y, x] += math.cos(2.0 * np.pi * (dist + t) + t * src_p) if __name__ == "__main__": spec = np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]) srcs = np.array( [(98, 0, -2), (99, 0, -1), (100, 0, 0), (101, 0, 1), (102, 0, 2),], dtype=spec, ).view(np.recarray) # gr.beginprint("rader.mp4") exprs = [ ("cuda", cuda_func), ("aot-parallel", aot_func_parallel), ("aot", aot_func), ("rust", rust_func), ("jit", jit_func), ("jit-fast", jit_func_fast), ("np", np_func), ("py", py_func), ] for name, func in exprs: for t in tqdm(np.arange(0, 100, 0.01), desc=name): m = func(srcs, t) mlab.imshow(m) # gr.endprint()
import sys from pathlib import Path import numba as nb import numpy as np from numba.core.dispatcher import Dispatcher from numba.core.registry import CPUDispatcher from numba.pycc import CC spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])) @nb.njit(nb.float64[:, :](spec[:], nb.float64)) def aot_func(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in range(H): for x in range(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m @nb.njit(nb.float64[:, :](spec[:], nb.float64), parallel=True) def aot_func_parallel(srcs, t): m = np.zeros((200, 200), nb.float64) H, W = m.shape for y in nb.prange(H): for x in nb.prange(W): for src in srcs: dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2) m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase) return m if __name__ == "__main__": for p in Path().glob("**/*.pyd"): p.unlink() mod_name = Path(__file__).stem cc = CC(mod_name) # Uncomment the following line to print out the compilation steps # cc.verbose = True mod = __import__(mod_name) mod = sys.modules[mod_name] for name, func in mod.__dict__.items(): if isinstance(func, CPUDispatcher): print(f"compile: {name}") # signature = func.signatures[0] # signature = func.get_function_type().signature signature = func.nopython_signatures[0] deco = cc.export(func.__name__, signature) deco(func.py_func) cc.compile()
[package] name = "radar" version = "0.1.0" authors = ["Hyunsoo <rex8312@gmail.com>"] edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] name = "radar_rust" crate-type = ["cdylib"] [dependencies] numpy = "0.13" ndarray = "0.15" [dependencies.pyo3] version = "0.13" features = ["extension-module"]
# src/lib.rs use ndarray::{ArrayView1, ArrayViewMut2}; use numpy::{PyArray2, PyReadonlyArray1}; use pyo3::prelude::{pymodule, PyModule, PyResult, Python}; use std::f32::consts::PI; #[pymodule] fn radar_rust(_py: Python<'_>, m: &PyModule) -> PyResult<()> { // mutable example (no return) fn func( mut buff: ArrayViewMut2<f32>, ys: ArrayView1<i32>, xs: ArrayView1<i32>, ps: ArrayView1<i32>, t: f32, ) { let shape = buff.shape(); let height = shape[0]; let width = shape[1]; for y in 0..height { for x in 0..width { for i in 0..ys.len() { let src_y = ys[i] as f32; let src_x = xs[i] as f32; let src_p = ps[i] as f32; let dist = ((y as f32 - src_y).powi(2) + (x as f32 - src_x).powi(2)).sqrt(); buff[[y, x]] += (2.0 * PI * (dist + t) + t * src_p as f32).cos(); } } } } // wrapper of `mult` #[pyfn(m, "func")] fn func_py( buff: &PyArray2<f32>, ys: PyReadonlyArray1<i32>, xs: PyReadonlyArray1<i32>, ps: PyReadonlyArray1<i32>, t: f32, ) { let buff = unsafe { buff.as_array_mut() }; let ys = ys.as_array(); let xs = xs.as_array(); let ps = ps.as_array(); func(buff, ys, xs, ps, t); } Ok(()) }