Porting a NumPy script to the GPU with CuPy often boils down to replacing `np` with `cp`, in vim terms `:%s/🛬/🚆/g` and `:%s/🚗/🚲/g` 😃, although a blind substitution can betray you, much like replacing `x` with `3y` 😐.
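As a minimal sketch of that drop-in porting (the array names and sizes here are ours, not from the original):

```python
import numpy as np
import cupy as cp

a_np = np.random.rand(1000).astype(np.float32)
s_np = np.sin(a_np).sum()        # NumPy, on the CPU

a_cp = cp.asarray(a_np)          # host -> device
s_cp = cp.sin(a_cp).sum()        # same code, np -> cp, runs on the GPU

print(float(s_np), float(s_cp))  # should agree to float32 precision
```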
Beyond the drop-in approach, CuPy exposes custom kernels. The first flavour, `cp.ElementwiseKernel`, compiles a C-like expression applied element by element:

```python
import cupy as cp

N = 256
hadamard_kern = cp.ElementwiseKernel('float32 x, float32 val',  # inputs
                                     'float32 y',               # output
                                     'y = x + val',             # body
                                     'hadamard_kern')           # name
x = cp.ones(N, dtype='float32')
y = cp.ones(N, dtype='float32')
hadamard_kern(x, cp.float32(3.0), y)
if cp.any(abs(y - 4.0) > 1e-12):
    print("****** ERROR ******")
```
The same increment can be written as a CUDA-style kernel with `cupyx.jit.rawkernel`. The launch configuration, `(nb_block,)` blocks of `(block_size,)` threads, is given at call time, and each thread computes its global index as `blockDim.x * blockIdx.x + threadIdx.x`; the kernel itself is declared with the `@rawkernel()` decorator:

```python
import cupy as cp
from cupyx.jit import blockDim, blockIdx, threadIdx, rawkernel
import sys
N = 256
@rawkernel()
def kern(u, val):
    tid = blockDim.x * blockIdx.x + threadIdx.x
    u[tid] += val
x = cp.ones(N, dtype='float32')
kern((1,), (N,), (x, cp.float32(3.0)))
if cp.any(abs(x - 4.0) > 1e-12):
    print("****** ERROR ******")
    sys.exit(-1)
```
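This works because `N` is an exact multiple of the block size. For an arbitrary `N`, the usual pattern is ceiling division plus a bounds check in the kernel; a minimal sketch reusing the imports above (the names `kern_guarded` and `block_size` are ours):

```python
@rawkernel()
def kern_guarded(u, val, size):
    tid = blockDim.x * blockIdx.x + threadIdx.x
    if tid < size:  # guard against the last, partially-filled block
        u[tid] += val

block_size = 128                               # assumed block size
nb_block = (N + block_size - 1) // block_size  # ceiling division
kern_guarded((nb_block,), (block_size,), (x, cp.float32(3.0), cp.uint32(N)))
```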
A third option is `cp.RawKernel`, whose body is plain CUDA C compiled at runtime:

```python
import cupy as cp

N = 256
michel_kern = cp.RawKernel(
r'''
extern "C" __global__
void kern(float* u, const float val) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    u[i] += val;
}
''', 'kern')
x = cp.ones(N, dtype='float32')
michel_kern((1,), (N,), (x, cp.float32(3.0)))
if cp.any(abs(x - 4.0) > 1e-12):
    print("****** ERROR ******")
```

These building blocks now drive a real computation: a Jacobi sweep over an image \(B\) read from a PGM file,

\[ X_{ij} = \frac{1}{4} \left(X_{(i-1)j} + X_{(i+1)j} + X_{i(j-1)} + X_{i(j+1)} - B_{ij}\right) \]
A pure NumPy implementation on the CPU:

```python
import numpy as np
from pgm import readpgm

def jacobi_cpu_mat(ap, a, b):
    ap[1:-1, 1:-1] = 0.25 * (a[0:-2, 1:-1] + a[2:, 1:-1]
                             + a[1:-1, 0:-2] + a[1:-1, 2:] - b)

blur = readpgm("jean_zay_gris_txiki_edge.pgm")
m, n = blur.shape
X = np.zeros((m+2, n+2), dtype='float32')
X[1:-1, 1:-1] = np.float32(blur)
X2 = np.copy(X)
for i in range(150000):
    jacobi_cpu_mat(X2, X, blur)
    X, X2 = X2, X
```
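The sweep count is fixed at 150000; convergence can also be monitored cheaply if desired. An illustrative variant of the loop (our addition, not in the original):

```python
for i in range(150000):
    jacobi_cpu_mat(X2, X, blur)
    X, X2 = X2, X
    if i % 10000 == 0:
        print(i, np.abs(X - X2).max())  # size of the latest update
```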
On the GPU, the very same function works on CuPy arrays; only the array module changes (the sweep is renamed `jacobi_gpu_mat` to match the call below):

```python
import cupy as cp
from pgm import readpgm

def jacobi_gpu_mat(ap, a, b):
    ap[1:-1, 1:-1] = 0.25 * (a[0:-2, 1:-1] + a[2:, 1:-1]
                             + a[1:-1, 0:-2] + a[1:-1, 2:] - b)

blur = cp.asarray(readpgm("jean_zay_gris_txiki_edge.pgm"), dtype='float32')
m, n = blur.shape
X = cp.zeros((m+2, n+2), dtype='float32')
X[1:-1, 1:-1] = blur
X2 = cp.copy(X)
for i in range(150000):
    jacobi_gpu_mat(X2, X, blur)
    X, X2 = X2, X
X_h = cp.asnumpy(X)  # device -> host
```
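Kernel launches are asynchronous: the loop above only enqueues work, and `cp.asnumpy` waits for it implicitly. Before reading a clock, synchronize explicitly; a one-line sketch:

```python
cp.cuda.Device().synchronize()  # block until all queued GPU work has finished
```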
Each drop-in sweep evaluates several sliced sub-expressions, each costing a kernel launch and a temporary. Fusing the whole stencil into a single `cupyx.jit` kernel avoids that:

```python
import numpy as np
import cupy as cp
from cupyx import jit
from pgm import readpgm

@jit.rawkernel()
def jit_kern(ap, a, blur, m, n):
    i = jit.threadIdx.x + jit.blockIdx.x * jit.blockDim.x
    j = jit.threadIdx.y + jit.blockIdx.y * jit.blockDim.y
    if i >= cp.uint(1) and i < cp.uint(m+1) and j >= cp.uint(1) and j < cp.uint(n+1):
        ap[i, j] = 0.25 * (a[i-1, j] + a[i+1, j] + a[i, j-1] + a[i, j+1] - blur[i-1, j-1])

blur = readpgm("jean_zay_gris_txiki_edge.pgm")
m, n = blur.shape
X = np.zeros((m+2, n+2), dtype='float32')
X[1:-1, 1:-1] = np.float32(blur)
X_d, X2_d, blur_d = map(cp.asarray, [X, X, blur]) # host -> device
blocks = 32, 32
grid = tuple((x[0] + 2 - 1) // x[1] + 1 for x in zip((m, n), blocks))  # enough blocks to cover the padded array
for i in range(150000):
    jit_kern(grid, blocks, (X2_d, X_d, blur_d, m, n))
    X_d, X2_d = X2_d, X_d  # swap pointers
X = cp.asnumpy(X_d)  # device -> host
```

The same stencil as a `cp.RawKernel` in CUDA C, with the padded array addressed through explicit 1D index arithmetic:

```python
laplace_kern = cp.RawKernel(
r'''
extern "C" __global__ void laplace_gpu(float *newBuff, float *buff, const int *blur, int m, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int j = threadIdx.y + blockIdx.y * blockDim.y;
    if ((i >= 1) && (i < (m+1)) &&
        (j >= 1) && (j < (n+1))) {
        newBuff[i * (n+2) + j] = 0.25 * (buff[(i-1) * (n+2) + j] + buff[(i+1) * (n+2) + j]
                                       + buff[i * (n+2) + j-1] + buff[i * (n+2) + j+1]
                                       - blur[(i-1) * n + j-1]);
    }
}
''', 'laplace_gpu')
(...)
X_d, X2_d = map(cp.asarray, [X, X])
blur_d = cp.asarray(blur, dtype='int32')
blocks = 32, 32
grid = tuple((x[0] + 2 - 1) // x[1] + 1 for x in zip((m, n), blocks))
for i in range(150000):
    laplace_kern(grid, blocks, (X2_d, X_d, blur_d, cp.int32(m), cp.int32(n)))  # scalars must match the kernel's C types
    X_d, X2_d = X2_d, X_d
X = cp.asnumpy(X_d)
```

Neighbouring threads re-read the same cells from global memory; the classic optimization is to stage each block's tile (plus its halo) in shared memory first:

```python
laplace_shared_kern = cp.RawKernel(
r'''
extern "C" __global__ void laplace_shared_gpu(float *newBuff, float *buff, const int *blur, int m, int n) {
    extern __shared__ float tile[];
    (...)
}
''', 'laplace_shared_gpu')
(...)
shared_mem_bytes = (blocks[0]+2) * (blocks[1]+2) + (blocks[0] * blocks[1])  # number of floats; x4 below for float32 bytes
for i in range(150000):
    laplace_shared_kern(grid, blocks, (X2_d, X_d, blur_d, cp.int32(m), cp.int32(n)),
                        shared_mem=4*shared_mem_bytes)
    X_d, X2_d = X2_d, X_d
X = cp.asnumpy(X_d)
```

Timings observed on three machines, for the NumPy baseline and the four CuPy variants (drop-in `np`→`cp`, `cupyx.jit` kernel, raw kernel, raw kernel with shared memory):

| GPU | CPU | NumPy | CuPy drop-in | JIT kernel | Raw kernel | Raw + shared |
|---|---|---|---|---|---|---|
| T2000 | i9-10885H | 52.0s | 17.5s | 12.9s | 5.1s | 7.3s |
| A40 | AMD 7343 | 29.3s | 10.4s | 12.3s | 1.2s | 1.5s |
| H100 | AMD 9254 | 26.7s | 7.8s | 9.3s | 0.8s | 1.1s |
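The body of `laplace_shared_gpu` is elided above. Purely as an illustration, here is one way such a tile-staging kernel could look; this is our hedged sketch (variable `laplace_shared_sketch` included), not the original code:

```python
laplace_shared_sketch = cp.RawKernel(r'''
extern "C" __global__ void laplace_shared_gpu(float *newBuff, float *buff, const int *blur, int m, int n) {
    extern __shared__ float tile[];
    int tx = threadIdx.x, ty = threadIdx.y;
    int i = tx + blockIdx.x * blockDim.x;
    int j = ty + blockIdx.y * blockDim.y;
    int w = blockDim.y + 2;                              /* tile row width, halo included */
    if (i < m+2 && j < n+2) {
        tile[(tx+1) * w + ty+1] = buff[i * (n+2) + j];   /* own cell */
        if (tx == 0 && i >= 1)               tile[ty+1]              = buff[(i-1) * (n+2) + j];
        if (tx == blockDim.x-1 && i+1 < m+2) tile[(tx+2) * w + ty+1] = buff[(i+1) * (n+2) + j];
        if (ty == 0 && j >= 1)               tile[(tx+1) * w]        = buff[i * (n+2) + j-1];
        if (ty == blockDim.y-1 && j+1 < n+2) tile[(tx+1) * w + ty+2] = buff[i * (n+2) + j+1];
    }
    __syncthreads();                                     /* tile is complete for the whole block */
    if (i >= 1 && i < m+1 && j >= 1 && j < n+1) {
        newBuff[i * (n+2) + j] = 0.25f * (tile[tx * w + ty+1] + tile[(tx+2) * w + ty+1]
                                        + tile[(tx+1) * w + ty] + tile[(tx+1) * w + ty+2]
                                        - blur[(i-1) * n + j-1]);
    }
}
''', 'laplace_shared_gpu')
```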
The same two examples can be written in PyOpenCL, which targets any OpenCL device. First the increment:

```python
import numpy as np
import sys
import pyopencl as cl
N = 256
x = np.ones(N, dtype=np.float32)
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
x_d = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=x)
kern = cl.Program(ctx, """
__kernel void incr(__global float *vec, const float val)
{
    int i = get_global_id(0);
    vec[i] += val;
}
""").build()
kern.incr(queue, x.shape, None, x_d, np.float32(3.0))
cl.enqueue_copy(queue, x, x_d)
if np.any(abs(x - 4.0) > 1e-12):
    print("******* ERROR ******")
    sys.exit(-1)
```

Then the Jacobi sweep; note that here `blur` is the padded array, so it is indexed with the same `(n+2)` row stride as `buff`:

```python
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
kern = cl.Program(ctx, """
__kernel void incr(__global float *newBuff, __global const float *buff, __global const float *blur, int m, int n)
{
    int i = get_global_id(0); int j = get_global_id(1);
    if ((i >= 1) && (i < (m+1)) && (j >= 1) && (j < (n+1))) {
        newBuff[i * (n+2) + j] = 0.25 * (buff[(i-1) * (n+2) + j] + buff[(i+1) * (n+2) + j]
                                       + buff[i * (n+2) + j-1] + buff[i * (n+2) + j+1]
                                       - blur[i * (n+2) + j]);
    }
}
""").build()
blur_inside = readpgm("jean_zay_gris_txiki_edge.pgm")
m, n = blur_inside.shape
X, X2 = (np.zeros((m+2, n+2), dtype='float32') for _ in range(2))
X[1:-1, 1:-1] = np.float32(blur_inside)
blur = X.copy()  # padded copy of the image, same (m+2, n+2) layout as X
X_d, X2_d = map(lambda z: cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=z), [X,X2])
blur_d = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=blur)
for i in range(150000):
    kern.incr(queue, X.shape, None, X2_d, X_d, blur_d, np.int32(m), np.int32(n))
    X_d, X2_d = X2_d, X_d
cl.enqueue_copy(queue, X, X_d)
```

| NumPy | PyOpenCL |
|---|---|
| 26.8s | 8.0s |
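OpenCL can also time kernels with event profiling; a minimal sketch (our code), recreating the queue with profiling enabled:

```python
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
evt = kern.incr(queue, X.shape, None, X2_d, X_d, blur_d, np.int32(m), np.int32(n))
evt.wait()  # the kernel call returns an event
print((evt.profile.end - evt.profile.start) * 1e-9, "s")  # nanoseconds -> seconds
```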
Numba offers the same kernel style with `@jit`, compiling Python source directly for the GPU:

```python
import numba
from numba.cuda import jit, threadIdx, blockIdx, blockDim, to_device, grid
import numpy as np
@jit
def increment(a, val):
    # i = threadIdx.x + blockIdx.x * blockDim.x  (manual equivalent)
    i = grid(1)  # Numba computes the global thread index
    if i < a.size:
        a[i] += val

N = 256
x = np.ones(N)
x_d = to_device(x)
increment[1,N](x_d, np.float32(3.0))
x = x_d.copy_to_host()
if np.any(abs(x - 4.0) > 1e-12):
    print("****** ERROR ******")
```
The Jacobi sweep in Numba follows the same pattern, with `grid(2)` giving both indices at once:

```python
import numpy as np
from numba.cuda import jit, to_device, grid
from pgm import readpgm
@jit
def kern(ap, a, blur, m, n):
    i, j = grid(2)
    if i >= 1 and i < m+1 and j >= 1 and j < n+1:
        ap[i, j] = 0.25 * (a[i-1, j] + a[i+1, j] + a[i, j-1] + a[i, j+1] - blur[i-1, j-1])

if __name__ == "__main__":
    blur = readpgm("jean_zay_gris_txiki_edge.pgm")
    m, n = blur.shape
    x = np.zeros((m+2, n+2), dtype='float32')
    x[1:-1, 1:-1] = np.float32(blur)
    x_d, x2_d, blur_d = map(to_device, [x, x, blur])
    blocks = 32, 32
    grids = tuple((b[0] + 2 - 1) // b[1] + 1 for b in zip((m, n), blocks))
    for i in range(150000):
        kern[grids, blocks](x2_d, x_d, blur_d, m, n)
        x_d, x2_d = x2_d, x_d
    x = x_d.copy_to_host()
```

| GPU | CPU | NumPy | Numba | CuPy drop-in | CuPy raw kernel |
|---|---|---|---|---|---|
| A40 | AMD 7343 | 29.3s | 8.3s | 10.4s | 1.2s |
| H100 | AMD 9254 | 26.7s | 5.3s | 7.8s | 0.8s |
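As with CuPy, Numba launches are asynchronous, so a fair measurement must wait for the device before reading the clock; a minimal harness around the loop above (our code):

```python
import time
from numba import cuda

t0 = time.perf_counter()
for i in range(150000):
    kern[grids, blocks](x2_d, x_d, blur_d, m, n)
    x_d, x2_d = x2_d, x_d
cuda.synchronize()  # drain the GPU queue before stopping the clock
print(f"{time.perf_counter() - t0:.1f}s")
```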
DistributedArray in CuPy