====== Numba ======

===== multiple returns =====

<code python>
# return 타입이 같을 때

@nb.jit(nb.types.UniTuple(nb.float64[:],2)(nb.float64[:]),nopython=True)
def f(a) :
    return a,a

@nb.jit('UniTuple(float64[:], 2)(float64[:])',nopython=True)
def f(a) :
    return a,a
    
# return 타입이 다를 때 
@nb.jit(nb.types.Tuple((nb.float64[:], nb.float64[:,:]))(nb.float64[:], nb.float64[:,:]),nopython=True)
def f(a, b) :
    return a, b

@nb.jit('Tuple((float64[:], float64[:,:]))(float64[:], float64[:,:])',nopython=True)
def f(a, b) :
    return a, b

# https://stackoverflow.com/questions/30363253/multiple-output-and-numba-signatures
</code>

===== multi-threading =====

  * https://numba.pydata.org/numba-doc/dev/user/examples.html#multi-threading

===== structured array & ahead-of-time compile =====


<code python radar.py>
import gr
import gr.pygr as mlab
import numba as nb
import numpy as np
import math
import radar_rust
from IPython import embed
from numba import cuda
from tqdm import tqdm

from radar_nb import aot_func, aot_func_parallel


def py_func(srcs, t):
    m = np.zeros((200, 200), np.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


def np_func(srcs, t):
    m = np.zeros((200, 200), np.float64)
    H, W = m.shape
    ys = np.arange(H).reshape(-1, 1).repeat(W, 1).reshape(-1)
    xs = np.arange(W).reshape(1, -1).repeat(H, 0).reshape(-1)
    ds = np.sqrt(
        (ys.reshape(-1, 1) - srcs.y.reshape(1, -1)) ** 2
        + (xs.reshape(-1, 1) - srcs.x.reshape(1, -1)) ** 2
    )
    for i in range(len(srcs)):
        m[ys, xs] += np.cos(2.0 * np.pi * (ds[:, i] + t) + t * srcs.phase[i])
    return m


spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]))


@nb.njit(nb.float64[:, :](spec[:], nb.float64))
def jit_func(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


@nb.njit(nb.float64[:, :](spec[:], nb.float64), fastmath=True)
def jit_func_fast(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


def rust_func(srcs, t):
    buff = np.zeros((200, 200), dtype=np.float32)
    radar_rust.func(buff, srcs.y, srcs.x, srcs.phase, t)
    return buff


def cuda_func(srcs, t):

    blockdim = 1024, 1
    griddim = int(math.ceil(float(200 * 200) / blockdim[0])), 1
    stream = cuda.stream()

    d_buff = cuda.device_array((200, 200), stream=stream)
    d_ys = cuda.to_device(np.ascontiguousarray(srcs.y), stream)
    d_xs = cuda.to_device(np.ascontiguousarray(srcs.x), stream)
    d_ps = cuda.to_device(np.ascontiguousarray(srcs.phase), stream)

    cuda_kernel[griddim, blockdim, stream](d_buff, d_ys, d_xs, d_ps, t)

    buff = d_buff.copy_to_host(stream=stream)
    stream.synchronize()
    return buff


@cuda.jit((nb.float64[:, :], nb.int32[:], nb.int32[:], nb.int32[:], nb.float64))
def cuda_kernel(buff, src_ys, src_xs, src_ps, t):

    pos = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    y = pos // 200
    x = pos % 200

    for i in range(len(src_ys)):
        src_y, src_x, src_p = src_ys[i], src_xs[i], src_ps[i]
        dist = math.sqrt((y - src_y) ** 2 + (x - src_x) ** 2)
        buff[y, x] += math.cos(2.0 * np.pi * (dist + t) + t * src_p)


if __name__ == "__main__":

    spec = np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])
    srcs = np.array(
        [(98, 0, -2), (99, 0, -1), (100, 0, 0), (101, 0, 1), (102, 0, 2),], dtype=spec,
    ).view(np.recarray)

    # gr.beginprint("rader.mp4")
    exprs = [
        ("cuda", cuda_func),
        ("aot-parallel", aot_func_parallel),
        ("aot", aot_func),
        ("rust", rust_func),
        ("jit", jit_func),
        ("jit-fast", jit_func_fast),
        ("np", np_func),
        ("py", py_func),
    ]
    for name, func in exprs:
        for t in tqdm(np.arange(0, 100, 0.01), desc=name):
            m = func(srcs, t)
            mlab.imshow(m)
    # gr.endprint()
</code>

<code python radar_nb.py>
import sys
from pathlib import Path

import numba as nb
import numpy as np
from numba.core.dispatcher import Dispatcher
from numba.core.registry import CPUDispatcher
from numba.pycc import CC


spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]))


@nb.njit(nb.float64[:, :](spec[:], nb.float64))
def aot_func(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


@nb.njit(nb.float64[:, :](spec[:], nb.float64), parallel=True)
def aot_func_parallel(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in nb.prange(H):
        for x in nb.prange(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


if __name__ == "__main__":

    for p in Path().glob("**/*.pyd"):
        p.unlink()

    mod_name = Path(__file__).stem
    cc = CC(mod_name)

    # Uncomment the following line to print out the compilation steps
    # cc.verbose = True

    mod = __import__(mod_name)
    mod = sys.modules[mod_name]
    for name, func in mod.__dict__.items():
        if isinstance(func, CPUDispatcher):
            print(f"compile: {name}")
            # signature = func.signatures[0]
            # signature = func.get_function_type().signature
            signature = func.nopython_signatures[0]
            deco = cc.export(func.__name__, signature)
            deco(func.py_func)

    cc.compile()
</code>

  * rust 초기화 및 lib 빌드
    * cargo init
    * maturin develop
    * maturin build --release
    * pip install --force-reinstall ...


<code toml>
[package]
name = "radar"
version = "0.1.0"
authors = ["Hyunsoo <rex8312@gmail.com>"]
edition = "2018"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html


[lib]
name = "radar_rust"
crate-type = ["cdylib"]

[dependencies]
numpy = "0.13"
ndarray = "0.15"

[dependencies.pyo3]
version = "0.13"
features = ["extension-module"]
</code>

<code rust>
# src/lib.rs
use ndarray::{ArrayView1, ArrayViewMut2};
use numpy::{PyArray2, PyReadonlyArray1};
use pyo3::prelude::{pymodule, PyModule, PyResult, Python};
use std::f32::consts::PI;

#[pymodule]
fn radar_rust(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
    // mutable example (no return)
    fn func(
        mut buff: ArrayViewMut2<f32>,
        ys: ArrayView1<i32>,
        xs: ArrayView1<i32>,
        ps: ArrayView1<i32>,
        t: f32,
    ) {
        let shape = buff.shape();
        let height = shape[0];
        let width = shape[1];

        for y in 0..height {
            for x in 0..width {
                for i in 0..ys.len() {
                    let src_y = ys[i] as f32;
                    let src_x = xs[i] as f32;
                    let src_p = ps[i] as f32;
                    let dist = ((y as f32 - src_y).powi(2) + (x as f32 - src_x).powi(2)).sqrt();
                    buff[[y, x]] += (2.0 * PI * (dist + t) + t * src_p as f32).cos();
                }
            }
        }
    }

    // wrapper of `mult`
    #[pyfn(m, "func")]
    fn func_py(
        buff: &PyArray2<f32>,
        ys: PyReadonlyArray1<i32>,
        xs: PyReadonlyArray1<i32>,
        ps: PyReadonlyArray1<i32>,
        t: f32,
    ) {
        let buff = unsafe { buff.as_array_mut() };
        let ys = ys.as_array();
        let xs = xs.as_array();
        let ps = ps.as_array();
        func(buff, ys, xs, ps, t);
    }

    Ok(())
}
</code>

{{tag>"python최적화" python numba}}