====== Numba ======
===== multiple returns =====
# return 타입이 같을 때
@nb.jit(nb.types.UniTuple(nb.float64[:],2)(nb.float64[:]),nopython=True)
def f(a) :
return a,a
@nb.jit('UniTuple(float64[:], 2)(float64[:])',nopython=True)
def f(a) :
return a,a
# return 타입이 다를 때
@nb.jit(nb.types.Tuple((nb.float64[:], nb.float64[:,:]))(nb.float64[:], nb.float64[:,:]),nopython=True)
def f(a, b) :
return a, b
@nb.jit('Tuple((float64[:], float64[:,:]))(float64[:], float64[:,:])',nopython=True)
def f(a, b) :
return a, b
# https://stackoverflow.com/questions/30363253/multiple-output-and-numba-signatures
===== multi-threading =====
* https://numba.pydata.org/numba-doc/dev/user/examples.html#multi-threading
===== structured array & ahead-of-time compile =====
import gr
import gr.pygr as mlab
import numba as nb
import numpy as np
import math
import radar_rust
from IPython import embed
from numba import cuda
from tqdm import tqdm
from radar_nb import aot_func, aot_func_parallel
def py_func(srcs, t):
m = np.zeros((200, 200), np.float64)
H, W = m.shape
for y in range(H):
for x in range(W):
for src in srcs:
dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
return m
def np_func(srcs, t):
m = np.zeros((200, 200), np.float64)
H, W = m.shape
ys = np.arange(H).reshape(-1, 1).repeat(W, 1).reshape(-1)
xs = np.arange(W).reshape(1, -1).repeat(H, 0).reshape(-1)
ds = np.sqrt(
(ys.reshape(-1, 1) - srcs.y.reshape(1, -1)) ** 2
+ (xs.reshape(-1, 1) - srcs.x.reshape(1, -1)) ** 2
)
for i in range(len(srcs)):
m[ys, xs] += np.cos(2.0 * np.pi * (ds[:, i] + t) + t * srcs.phase[i])
return m
spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]))
@nb.njit(nb.float64[:, :](spec[:], nb.float64))
def jit_func(srcs, t):
m = np.zeros((200, 200), nb.float64)
H, W = m.shape
for y in range(H):
for x in range(W):
for src in srcs:
dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
return m
@nb.njit(nb.float64[:, :](spec[:], nb.float64), fastmath=True)
def jit_func_fast(srcs, t):
m = np.zeros((200, 200), nb.float64)
H, W = m.shape
for y in range(H):
for x in range(W):
for src in srcs:
dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
return m
def rust_func(srcs, t):
buff = np.zeros((200, 200), dtype=np.float32)
radar_rust.func(buff, srcs.y, srcs.x, srcs.phase, t)
return buff
def cuda_func(srcs, t):
blockdim = 1024, 1
griddim = int(math.ceil(float(200 * 200) / blockdim[0])), 1
stream = cuda.stream()
d_buff = cuda.device_array((200, 200), stream=stream)
d_ys = cuda.to_device(np.ascontiguousarray(srcs.y), stream)
d_xs = cuda.to_device(np.ascontiguousarray(srcs.x), stream)
d_ps = cuda.to_device(np.ascontiguousarray(srcs.phase), stream)
cuda_kernel[griddim, blockdim, stream](d_buff, d_ys, d_xs, d_ps, t)
buff = d_buff.copy_to_host(stream=stream)
stream.synchronize()
return buff
@cuda.jit((nb.float64[:, :], nb.int32[:], nb.int32[:], nb.int32[:], nb.float64))
def cuda_kernel(buff, src_ys, src_xs, src_ps, t):
pos = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
y = pos // 200
x = pos % 200
for i in range(len(src_ys)):
src_y, src_x, src_p = src_ys[i], src_xs[i], src_ps[i]
dist = math.sqrt((y - src_y) ** 2 + (x - src_x) ** 2)
buff[y, x] += math.cos(2.0 * np.pi * (dist + t) + t * src_p)
if __name__ == "__main__":
spec = np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])
srcs = np.array(
[(98, 0, -2), (99, 0, -1), (100, 0, 0), (101, 0, 1), (102, 0, 2),], dtype=spec,
).view(np.recarray)
# gr.beginprint("rader.mp4")
exprs = [
("cuda", cuda_func),
("aot-parallel", aot_func_parallel),
("aot", aot_func),
("rust", rust_func),
("jit", jit_func),
("jit-fast", jit_func_fast),
("np", np_func),
("py", py_func),
]
for name, func in exprs:
for t in tqdm(np.arange(0, 100, 0.01), desc=name):
m = func(srcs, t)
mlab.imshow(m)
# gr.endprint()
import sys
from pathlib import Path
import numba as nb
import numpy as np
from numba.core.dispatcher import Dispatcher
from numba.core.registry import CPUDispatcher
from numba.pycc import CC
spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]))
@nb.njit(nb.float64[:, :](spec[:], nb.float64))
def aot_func(srcs, t):
m = np.zeros((200, 200), nb.float64)
H, W = m.shape
for y in range(H):
for x in range(W):
for src in srcs:
dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
return m
@nb.njit(nb.float64[:, :](spec[:], nb.float64), parallel=True)
def aot_func_parallel(srcs, t):
m = np.zeros((200, 200), nb.float64)
H, W = m.shape
for y in nb.prange(H):
for x in nb.prange(W):
for src in srcs:
dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
return m
if __name__ == "__main__":
for p in Path().glob("**/*.pyd"):
p.unlink()
mod_name = Path(__file__).stem
cc = CC(mod_name)
# Uncomment the following line to print out the compilation steps
# cc.verbose = True
mod = __import__(mod_name)
mod = sys.modules[mod_name]
for name, func in mod.__dict__.items():
if isinstance(func, CPUDispatcher):
print(f"compile: {name}")
# signature = func.signatures[0]
# signature = func.get_function_type().signature
signature = func.nopython_signatures[0]
deco = cc.export(func.__name__, signature)
deco(func.py_func)
cc.compile()
* rust 초기화 및 lib 빌드
* cargo init
* maturin develop
* maturin build --release
* pip install --force-reinstall ...
[package]
name = "radar"
version = "0.1.0"
authors = ["Hyunsoo "]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "radar_rust"
crate-type = ["cdylib"]
[dependencies]
numpy = "0.13"
ndarray = "0.15"
[dependencies.pyo3]
version = "0.13"
features = ["extension-module"]
# src/lib.rs
use ndarray::{ArrayView1, ArrayViewMut2};
use numpy::{PyArray2, PyReadonlyArray1};
use pyo3::prelude::{pymodule, PyModule, PyResult, Python};
use std::f32::consts::PI;
#[pymodule]
fn radar_rust(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
// mutable example (no return)
fn func(
mut buff: ArrayViewMut2,
ys: ArrayView1,
xs: ArrayView1,
ps: ArrayView1,
t: f32,
) {
let shape = buff.shape();
let height = shape[0];
let width = shape[1];
for y in 0..height {
for x in 0..width {
for i in 0..ys.len() {
let src_y = ys[i] as f32;
let src_x = xs[i] as f32;
let src_p = ps[i] as f32;
let dist = ((y as f32 - src_y).powi(2) + (x as f32 - src_x).powi(2)).sqrt();
buff[[y, x]] += (2.0 * PI * (dist + t) + t * src_p as f32).cos();
}
}
}
}
// wrapper of `mult`
#[pyfn(m, "func")]
fn func_py(
buff: &PyArray2,
ys: PyReadonlyArray1,
xs: PyReadonlyArray1,
ps: PyReadonlyArray1,
t: f32,
) {
let buff = unsafe { buff.as_array_mut() };
let ys = ys.as_array();
let xs = xs.as_array();
let ps = ps.as_array();
func(buff, ys, xs, ps, t);
}
Ok(())
}
{{tag>"python최적화" python numba}}