PyCUDA Complex Matrix Multiplication - C code vs Python code
Based on my reading of the PyCUDA documentation, the samples, and the book on CUDA by Kirk and Hwu, I have implemented a CUDA C-based complex matrix multiplication program and have also written a version in PyCUDA. The C code produces correct results; the Python code doesn't.
To be clear, the Python code is taken from the samples (MatrixmulTiled) and has been modified to handle complex numbers using cuFloatComplex from "cuComplex.h". Before the modification, it was correctly multiplying real-valued matrices.
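To rule out the header itself, here is a minimal single-thread test of the cuComplex.h helpers under PyCUDA (a sketch only: the kernel name mul_one exists just for this test, and it assumes cuComplex.h is on nvcc's include path):

    import numpy as np
    import pycuda.autoinit
    from pycuda import compiler, gpuarray

    # One thread multiplies two complex scalars with the cuComplex.h helpers.
    mod = compiler.SourceModule("""
    #include <cuComplex.h>
    __global__ void mul_one(cuFloatComplex *a, cuFloatComplex *b, cuFloatComplex *c)
    {
        c[0] = cuCmulf(a[0], b[0]);
    }
    """)
    mul_one = mod.get_function("mul_one")

    a = gpuarray.to_gpu(np.array([1 + 1j], dtype=np.complex64))
    b = gpuarray.to_gpu(np.array([2 - 3j], dtype=np.complex64))
    c = gpuarray.empty(1, np.complex64)
    mul_one(a, b, c, block=(1, 1, 1), grid=(1, 1))
    print(c.get())   # expect [ 5.-1.j]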
So I can't figure out the error. The Python code is:
    # Attempt at matrix multiplication with complex numbers
    import pycuda.autoinit
    from pycuda import driver, compiler, gpuarray, tools
    import numpy as np
    from time import *

    kernel_code_template = """
    #include <cuComplex.h>

    __global__ void MatrixMulKernel(cuFloatComplex *A, cuFloatComplex *B, cuFloatComplex *C)
    {
      const uint wA = %(MATRIX_SIZE)s;
      const uint wB = %(MATRIX_SIZE)s;

      // Block index
      const uint bx = blockIdx.x;
      const uint by = blockIdx.y;

      // Thread index
      const uint tx = threadIdx.x;
      const uint ty = threadIdx.y;

      // Index of the first sub-matrix of A processed by the block
      const uint aBegin = wA * %(BLOCK_SIZE)s * by;
      // Index of the last sub-matrix of A processed by the block
      const uint aEnd = aBegin + wA - 1;
      // Step size used to iterate through the sub-matrices of A
      const uint aStep = %(BLOCK_SIZE)s;

      // Index of the first sub-matrix of B processed by the block
      const int bBegin = %(BLOCK_SIZE)s * bx;
      // Step size used to iterate through the sub-matrices of B
      const uint bStep = %(BLOCK_SIZE)s * wB;

      // The element of the block sub-matrix that is computed by the thread
      cuFloatComplex Csub = make_cuFloatComplex(0,0);

      // Loop over the sub-matrices of A and B required to compute the block sub-matrix
      for (int a = aBegin, b = bBegin;
           a <= aEnd;
           a += aStep, b += bStep)
        {
          // Shared memory for the sub-matrix of A
          __shared__ cuFloatComplex As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
          // Shared memory for the sub-matrix of B
          __shared__ cuFloatComplex Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

          // Load the matrices from global memory to shared memory;
          // each thread loads one element of each matrix
          As[ty][tx] = make_cuFloatComplex(cuCrealf(A[a + wA*ty + tx]), cuCimagf(A[a + wA*ty + tx]));
          Bs[ty][tx] = make_cuFloatComplex(cuCrealf(B[b + wB*ty + tx]), cuCimagf(B[b + wB*ty + tx]));

          // Synchronize to make sure the matrices are loaded
          __syncthreads();

          // Multiply the two matrices;
          // each thread computes one element of the block sub-matrix
          for (int k = 0; k < %(BLOCK_SIZE)s; ++k)
            {
              Csub = cuCaddf(Csub, cuCmulf(As[ty][k], Bs[k][tx]));
            }

          // Synchronize to make sure that the preceding computation is
          // done before loading two new sub-matrices of A and B in the next iteration
          __syncthreads();
        }

      // Write the block sub-matrix to global memory;
      // each thread writes one element
      const uint c = wB * %(BLOCK_SIZE)s * by + %(BLOCK_SIZE)s * bx;
      C[c + wB*ty + tx] = make_cuFloatComplex(cuCrealf(Csub), cuCimagf(Csub));
    }
    """

    MATRIX_SIZE = 4
    TILE_SIZE = 2
    BLOCK_SIZE = TILE_SIZE

    a_cpu = np.zeros(shape=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.complex)
    b_cpu = np.zeros(shape=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.complex)
    a_cpu[:,:] = 1 + 1j*0
    b_cpu[:,:] = 1 + 1j*2

    # Compute the reference on the CPU to verify the GPU computation
    t1 = time()
    c_cpu = np.dot(a_cpu, b_cpu)
    t2 = time()
    t_cpu = t2 - t1

    # Transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    # Create an empty gpuarray for the result (C = A * B)
    c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex)

    # Get the kernel code from the template
    # by specifying the constant MATRIX_SIZE
    kernel_code = kernel_code_template % {
        'MATRIX_SIZE': MATRIX_SIZE,
        'BLOCK_SIZE': BLOCK_SIZE,
    }

    # Compile the kernel code
    mod = compiler.SourceModule(kernel_code)

    # Get the kernel function from the compiled module
    matrixmul = mod.get_function("MatrixMulKernel")

    # Call the kernel on the card
    t1 = time()
    matrixmul(
        # inputs
        a_gpu, b_gpu,
        # output
        c_gpu,
        # grid of multiple blocks
        grid = (MATRIX_SIZE/TILE_SIZE, MATRIX_SIZE/TILE_SIZE),
        # block of multiple threads
        block = (TILE_SIZE, TILE_SIZE, 1),
    )
    t2 = time()
    t_gpu = t2 - t1

    # Print the results
    print("-" * 80)
    print("Matrix A (GPU):")
    print(a_gpu.get())
    print("-" * 80)
    print("Matrix B (GPU):")
    print(b_gpu.get())
    print("-" * 80)
    print("Matrix C (GPU):")
    print(c_gpu.get())
    print("-" * 80)
    print("Matrix C (CPU):")
    print(c_cpu)
    print("-" * 80)
    print("CPU-GPU difference:")
    print(c_cpu - c_gpu.get())
    print("CPU time ", t_cpu)
    print("GPU time ", t_gpu)

    np.allclose(c_cpu, c_gpu.get())
The C code is:
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuComplex.h>

    /* __global__ void MatrixMulKernel(...)
     * The main matrix multiplication kernel;
     * executed on the device (GPU).
     */
    __global__ void MatrixMulKernel(cuFloatComplex *Md, cuFloatComplex *Nd, cuFloatComplex *Pd, int Width, int TILE_WIDTH)
    {
      // Calculate the row index of the Pd element and M
      int Row = blockIdx.y*TILE_WIDTH + threadIdx.y;
      // Calculate the column index of the Pd element and N
      int Col = blockIdx.x*TILE_WIDTH + threadIdx.x;

      cuFloatComplex Pvalue = make_cuFloatComplex(0,0);
      // Each thread computes one element of the block sub-matrix
      for (int k = 0; k < Width; k++)
        {
          Pvalue = cuCaddf(Pvalue, cuCmulf(Md[Row*Width + k], Nd[k*Width + Col]));
        }

      Pd[Row*Width + Col] = make_cuFloatComplex(cuCrealf(Pvalue), cuCimagf(Pvalue));
    }

    /* void MatrixMultiplication(...)
     * Stub function for the matrix multiplication kernel;
     * executed on the host. Takes the inputs from the main() function,
     * declares device memory, copies the data to the device, invokes the
     * kernel, and copies the result from the device back to the host.
     */
    void MatrixMultiplication(cuFloatComplex *M, cuFloatComplex *N, cuFloatComplex *P, int Width, int TILE_WIDTH)
    {
      int size = Width*Width*sizeof(cuFloatComplex);
      cuFloatComplex *Md, *Nd, *Pd;

      // Transfer M and N to device memory
      cudaMalloc((void**) &Md, size);
      cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
      cudaMalloc((void**) &Nd, size);
      cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);

      // Allocate P on the device
      cudaMalloc((void**) &Pd, size);

      // Setup the execution configuration
      dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH);
      dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);

      // Launch the device computation kernel
      MatrixMulKernel<<<dimGrid,dimBlock>>>(Md, Nd, Pd, Width, TILE_WIDTH);

      // Transfer P from device to host
      cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

      // Free the device matrices
      cudaFree(Md);
      cudaFree(Nd);
      cudaFree(Pd);
    }

    /* void PrintMatrix(...)
     * Helper function used to print a matrix.
     */
    void PrintMatrix(cuFloatComplex *M, int Width)
    {
      for (int i = 0; i < Width; i++)
        {
          for (int j = 0; j < Width; j++)
            {
              printf(" %f+i%f", cuCrealf(M[i*Width + j]), cuCimagf(M[i*Width + j]));
            }
          printf("\n");
        }
    }

    int main()
    {
      /* Define the dimension of the matrices (Width x Width) */
      int Width = 4;

      /* Define the dimension of a tile;
       * should be less than 512 */
      int TILE_WIDTH = 2;

      /* Define pointers for the row-major matrices */
      cuFloatComplex *M, *N, *P;
      M = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
      N = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));
      P = (cuFloatComplex *)malloc(Width*Width*sizeof(cuFloatComplex));

      /* Fill the matrices arbitrarily */
      for (int i = 0; i < Width; i++)
        {
          for (int j = 0; j < Width; j++)
            {
              M[i*Width + j] = make_cuFloatComplex(1,0);
              N[i*Width + j] = make_cuFloatComplex(1,2);
              P[i*Width + j] = make_cuFloatComplex(0,0);
            }
        }

      /* Print the matrices using the helper function */
      printf("Matrix M \n\n");
      PrintMatrix(M, Width);
      printf("\nMatrix N \n\n");
      PrintMatrix(N, Width);

      /* Call the stub function for matrix multiplication */
      MatrixMultiplication(M, N, P, Width, TILE_WIDTH);

      printf("\nMatrix P \n\n");
      PrintMatrix(P, Width);

      free(M);
      free(N);
      free(P);

      return 0;
    }
The output of the Python code is:
    Matrix C (GPU):
    [[  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j
        1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]
     [  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j
        1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]
     [ -9.01080877e+306 -5.19870527e+306j  -1.45379609e+307 -8.65694841e+306j
       -4.14125486e+306 -2.15325816e+306j  -5.83708063e+306 -3.25935506e+306j]
     [ -1.44828853e+306 -1.44828853e+306j  -2.32949855e+306 -2.32949855e+306j
       -3.78945180e+306 -3.78945180e+306j  -6.54203686e+306 -6.54203686e+306j]]
    --------------------------------------------------------------------------------
    Matrix C (CPU):
    [[ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
     [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
     [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]
     [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]]
The C output is:
    Matrix P

     4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
     4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
     4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
     4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000 4.000000+i8.000000
I'd appreciate it if you could point out the error in the Python code. I'm trying to meet deadlines for my thesis, and since the rest of my code is in Python, I have no time to port it to C.

Thanks!
======================
EDIT: Problem solved
This was a precision issue. It was fixed by replacing the host transfer and empty matrix creation code with the following:
    # Transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))
    b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))

    # Create an empty gpuarray for the result (C = A * B)
    c_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.complex64)
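This also makes sense of the garbage values above: cuFloatComplex packs two 32-bit floats (8 bytes per element), while np.complex is NumPy's complex128 (two 64-bit doubles, 16 bytes per element), so the kernel was reinterpreting double-precision bytes as float pairs. The size mismatch is easy to confirm from Python:

    import numpy as np

    # cuFloatComplex is two 32-bit floats: 8 bytes per element.
    print(np.dtype(np.complex64).itemsize)    # 8  -- matches cuFloatComplex
    print(np.dtype(np.complex128).itemsize)   # 16 -- what np.complex aliases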
Hope this is helpful.
But I still find a problem in using PyCUDA to multiply large complex-valued matrices. For instance, taking MATRIX_SIZE = 5040 and TILE_SIZE = 16 (which may not be the best choice from the standpoint of the hardware), CUDA C manages to multiply the matrices, but PyCUDA crashes. Why should this happen?
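I haven't found the cause yet. As a first debugging step (a sketch only, not a diagnosis), one can check the launch configuration against the device limits and synchronize right after the launch, so that any launch failure is reported at the launch site rather than later:

    import pycuda.autoinit
    import pycuda.driver as driver

    dev = pycuda.autoinit.device
    attr = driver.device_attribute

    # MATRIX_SIZE/TILE_SIZE = 5040/16 = 315 blocks per grid dimension.
    print("max grid x:", dev.get_attribute(attr.MAX_GRID_DIM_X))
    print("max grid y:", dev.get_attribute(attr.MAX_GRID_DIM_Y))
    print("max threads/block:", dev.get_attribute(attr.MAX_THREADS_PER_BLOCK))

    # Kernel launches are asynchronous; an explicit synchronize surfaces
    # launch failures immediately instead of at the next implicit sync.
    driver.Context.synchronize()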