import gr
import gr.pygr as mlab
import numba as nb
import numpy as np
import math
import radar_rust
from IPython import embed
from numba import cuda
from tqdm import tqdm

from radar_nb import aot_func, aot_func_parallel


def py_func(srcs, t):
    m = np.zeros((200, 200), np.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


def np_func(srcs, t):
    m = np.zeros((200, 200), np.float64)
    H, W = m.shape
    ys = np.arange(H).reshape(-1, 1).repeat(W, 1).reshape(-1)
    xs = np.arange(W).reshape(1, -1).repeat(H, 0).reshape(-1)
    ds = np.sqrt(
        (ys.reshape(-1, 1) - srcs.y.reshape(1, -1)) ** 2
        + (xs.reshape(-1, 1) - srcs.x.reshape(1, -1)) ** 2
    )
    for i in range(len(srcs)):
        m[ys, xs] += np.cos(2.0 * np.pi * (ds[:, i] + t) + t * srcs.phase[i])
    return m


spec = nb.from_dtype(np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)]))


@nb.njit(nb.float64[:, :](spec[:], nb.float64))
def jit_func(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


@nb.njit(nb.float64[:, :](spec[:], nb.float64), fastmath=True)
def jit_func_fast(srcs, t):
    m = np.zeros((200, 200), nb.float64)
    H, W = m.shape

    for y in range(H):
        for x in range(W):
            for src in srcs:
                dist = np.sqrt((y - src.y) ** 2 + (x - src.x) ** 2)
                m[y, x] += np.cos(2.0 * np.pi * (dist + t) + t * src.phase)
    return m


def rust_func(srcs, t):
    buff = np.zeros((200, 200), dtype=np.float32)
    radar_rust.func(buff, srcs.y, srcs.x, srcs.phase, t)
    return buff


def cuda_func(srcs, t):

    blockdim = 1024, 1
    griddim = int(math.ceil(float(200 * 200) / blockdim[0])), 1
    stream = cuda.stream()

    d_buff = cuda.device_array((200, 200), stream=stream)
    d_ys = cuda.to_device(np.ascontiguousarray(srcs.y), stream)
    d_xs = cuda.to_device(np.ascontiguousarray(srcs.x), stream)
    d_ps = cuda.to_device(np.ascontiguousarray(srcs.phase), stream)

    cuda_kernel[griddim, blockdim, stream](d_buff, d_ys, d_xs, d_ps, t)

    buff = d_buff.copy_to_host(stream=stream)
    stream.synchronize()
    return buff


@cuda.jit((nb.float64[:, :], nb.int32[:], nb.int32[:], nb.int32[:], nb.float64))
def cuda_kernel(buff, src_ys, src_xs, src_ps, t):

    pos = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    y = pos // 200
    x = pos % 200

    for i in range(len(src_ys)):
        src_y, src_x, src_p = src_ys[i], src_xs[i], src_ps[i]
        dist = math.sqrt((y - src_y) ** 2 + (x - src_x) ** 2)
        buff[y, x] += math.cos(2.0 * np.pi * (dist + t) + t * src_p)


if __name__ == "__main__":

    spec = np.dtype([("y", np.int32), ("x", np.int32), ("phase", np.int32)])
    srcs = np.array(
        [(98, 0, -2), (99, 0, -1), (100, 0, 0), (101, 0, 1), (102, 0, 2),], dtype=spec,
    ).view(np.recarray)

    # gr.beginprint("rader.mp4")
    exprs = [
        ("cuda", cuda_func),
        ("aot-parallel", aot_func_parallel),
        ("aot", aot_func),
        ("rust", rust_func),
        ("jit", jit_func),
        ("jit-fast", jit_func_fast),
        ("np", np_func),
        ("py", py_func),
    ]
    for name, func in exprs:
        for t in tqdm(np.arange(0, 100, 0.01), desc=name):
            m = func(srcs, t)
            mlab.imshow(m)
    # gr.endprint()