PyCUDA Complex Matrix Multiplication - C code vs Python code -


based on reading of pycuda documentation, samples , book on cuda kirk , hwu, have implemented cuda c-based complex matrix multiplication program , have written version in pycuda. c code produces correct results, python code doesn't.

to clear, python code taken samples (matrixmultiled) , has been modified handle complex numbers using cucomplexfloat "cucomplex.h". before modification, correctly multiplying real-valued matrices.

so can't figure out error. python code is

# attempt matrix multiplication complex numbers import pycuda.autoinit pycuda import driver, compiler, gpuarray, tools     import numpy np time import *  kernel_code_template = """             #include <cucomplex.h>     __global__ void matrixmulkernel(cufloatcomplex *a, cufloatcomplex *b, cufloatcomplex *c)     {           const uint wa = %(matrix_size)s;           const uint wb = %(matrix_size)s;            // block index           const uint bx = blockidx.x;           const uint = blockidx.y;            // thread index           const uint tx = threadidx.x;           const uint ty = threadidx.y;            // index of first sub-matrix of processed block           const uint abegin = wa * %(block_size)s * by;           // index of last sub-matrix of processed block           const uint aend   = abegin + wa - 1;           // step size used iterate through sub-matrices of           const uint astep = %(block_size)s;            // index of first sub-matrix of b processed block           const int bbegin = %(block_size)s * bx;           // step size used iterate through sub-matrcies of b           const uint bstep = %(block_size)s * wb;            // element of block sub-matrix computed thread           cufloatcomplex csub = make_cufloatcomplex(0,0);           // loop on sub-matrices of , b required compute block sub-matrix           (int = abegin, b = bbegin;                <= aend;            += astep, b += bstep)           {                // shared memory sub-matrix of            __shared__ cufloatcomplex as[%(block_size)s][%(block_size)s];            // shared memory sub-matrix of b            __shared__ cufloatcomplex bs[%(block_size)s][%(block_size)s];             // load matrices global memory shared memory;            // each thread loads 1 element of each matrix            as[ty][tx] = make_cufloatcomplex(cucrealf(a[a + wa*ty + tx]),cucimagf(a[a + wa*ty + tx]));            bs[ty][tx] = make_cufloatcomplex(cucrealf(b[b + wb*ty + tx]),cucimagf(b[b + wa*ty + tx]));             // synchronize make sure matrices loaded            __syncthreads();             // multiply 2 matrcies            // each thread computes 1 element of block sub-matrix            for(int k = 0; k < %(block_size)s; ++k)            {                 csub = cucaddf(csub,cucmulf(as[ty][k],bs[k][tx]));            }              // synchronize make sure preceding computation            // done before loading 2 new sub-matrices of , b in next iteration            __syncthreads();          }           // write block sub-matrix global memory          // each thread writes 1 element          const uint c = wb * %(block_size)s * + %(block_size)s * bx;          c[c + wb*ty + tx] = make_cufloatcomplex(cucrealf(csub), cucimagf(csub));     }     """  matrix_size = 4 tile_size  = 2 block_size = tile_size  a_cpu = np.zeros(shape=(matrix_size,matrix_size)).astype(np.complex) b_cpu = np.zeros(shape=(matrix_size,matrix_size)).astype(np.complex)  a_cpu[:,:] = 1 + 1j*0 b_cpu[:,:] = 1 + 1j*2  # compute reference on cpu verify gpu computation t1 = time() c_cpu = np.dot(a_cpu, b_cpu) t2 = time() t_cpu = t2-t1  # transfer host (cpu) memory device (gpu) memory a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu)  # create empty gpuarry result (c = * b) c_gpu = gpuarray.empty((matrix_size, matrix_size), np.complex)  # kernel code template # specifying constant matrix_size kernel_code = kernel_code_template % {         'matrix_size': matrix_size,         'block_size': block_size,         }  # compile kernel code mod = compiler.sourcemodule(kernel_code)  # kernel function compiled module matrixmul = mod.get_function("matrixmulkernel")  # call kernel on card t1 = time() matrixmul(         # inputs     a_gpu, b_gpu,     # output     c_gpu,     # grid of multiple blocks     grid = (matrix_size/tile_size, matrix_size/tile_size),     # block of multiple threads     block = (tile_size, tile_size, 1),     ) t2 = time() t_gpu = t2-t1  # print results print("-" * 80) print("matrix (gpu): ") print(a_gpu.get())  print("-" * 80) print("matrix b (gpu): ") print(b_gpu.get())  print("-" * 80) print("matrix c (gpu): ") print(c_gpu.get())  print("-" * 80) print("matrix c (cpu): ") print(c_cpu)  print("-" * 80) print("cpu-gpu difference: ") print(c_cpu-c_gpu.get())  print("cpu time ", t_cpu) print("gpu time ", t_gpu)  np.allclose(c_cpu, c_gpu.get() )             

the c-code is

 #include <stdio.h>  #include <stdlib.h>  #include <cucomplex.h>  /* __global__ void matrixmulkernel(...)  * main matrix multiplication kernel  * executed on device (gpu).  */ __global__ void matrixmulkernel(cufloatcomplex *md, cufloatcomplex *nd, cufloatcomplex *pd, int width, int tile_width) {     // calculate row index of pd element , m     int row = blockidx.y*tile_width + threadidx.y;     // calculate column index of pd element , n     int col = blockidx.x*tile_width + threadidx.x;      cufloatcomplex pvalue = make_cufloatcomplex(0,0);     // each thread computes 1 element of block sub-matrix     for(int k = 0; k < width; k++)     {         pvalue = cucaddf(pvalue,cucmulf(md[row*width + k],nd[k*width + col]));     }     pd[row*width + col] = make_cufloatcomplex(cucrealf(pvalue),cucimagf(pvalue)); }      /* void matrixmultiplication(...)  * stub function matrix multiplication kernel  * executed on host. takes inputs main() function  * , declares memory, copies data device, invokes kernel  * , copies result device host.  */ void matrixmultiplication(cufloatcomplex *m, cufloatcomplex *n, cufloatcomplex *p, int width, int tile_width) {     int size = width*width*sizeof(cufloatcomplex);     cufloatcomplex *md, *nd, *pd;      // transfer m , n device memory     cudamalloc((void**) &md, size);     cudamemcpy(md, m, size, cudamemcpyhosttodevice);     cudamalloc((void**) &nd, size);     cudamemcpy(nd, n, size, cudamemcpyhosttodevice);      // allocate p on device     cudamalloc((void**) &pd, size);      // setup execution configuration     dim3 dimgrid(width/tile_width, width/tile_width);     dim3 dimblock(tile_width, tile_width);     // launch device computation kernel     matrixmulkernel<<<dimgrid,dimblock>>>(md, nd, pd, width, tile_width);      // transfer p device host     cudamemcpy(p, pd, size, cudamemcpydevicetohost);      // free device matrices     cudafree(md);     cudafree(nd);     cudafree(pd); }      /* void printmatrix(..)  * function used print matrix  */ void printmatrix(cufloatcomplex *m, int width) {     for(int = 0; < width; i++)     {     for(int j = 0; j < width; j++)         {         printf("  %f+i%f", cucrealf(m[i*width + j]), cucimagf(m[i*width + j]));     }         printf("\n"); } }  int main() { /* define dimension of matrix (width x width) */     int width = 4; /* define dimension of tile      * should less 512      */     int tile_width = 2;     /* define pointers row major matrices */     cufloatcomplex *m, *n, *p;      m = (cufloatcomplex *)malloc(width*width*sizeof(cufloatcomplex));     n = (cufloatcomplex *)malloc(width*width*sizeof(cufloatcomplex));     p = (cufloatcomplex *)malloc(width*width*sizeof(cufloatcomplex));      /* fill matrices arbitrarily */     for(int = 0; < width; i++)     {         for(int j = 0; j < width; j++)         {             m[i*width + j] = make_cufloatcomplex(1,0);             n[i*width + j] = make_cufloatcomplex(1,2);             p[i*width + j] = make_cufloatcomplex(0,0);         }     }      /* code print matrices using helper function */     printf("matrix m \n\n");     printmatrix(m, width);     printf("\nmatrix n \n\n");     printmatrix(n, width);      /* call stub function matrix multiplication */     matrixmultiplication(m, n, p, width, tile_width);      printf("\nmatrix p \n\n");     printmatrix(p, width);      free(m);     free(n);     free(p);      return 0; }     

the output of python code is

matrix c (gpu):  [[  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j     1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]      [  1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j     1.59878214e-314 +1.59926782e-314j   1.59878214e-314 +1.59926782e-314j]  [ -9.01080877e+306 -5.19870527e+306j  -1.45379609e+307 -8.65694841e+306j    -4.14125486e+306 -2.15325816e+306j  -5.83708063e+306 -3.25935506e+306j]  [ -1.44828853e+306 -1.44828853e+306j  -2.32949855e+306 -2.32949855e+306j    -3.78945180e+306 -3.78945180e+306j  -6.54203686e+306 -6.54203686e+306j]] -------------------------------------------------------------------------------- matrix c (cpu):  [[ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]  [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]  [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]  [ 4.+8.j  4.+8.j  4.+8.j  4.+8.j]] 

the c-output is

matrix p     4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000   4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000   4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000   4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000  4.000000+i8.000000 

i'd appreciate if please point out error in python code. trying meet deadlines thesis, , rest of code in python, have no time port c.

thanks!

======================

edit: problem solved

this precision issue. fixed replacing host transfer , empty matrix creation code following...

    # transfer host (cpu) memory device (gpu) memory     a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64))     b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))      # create empty gpuarry result (c = * b)     c_gpu = gpuarray.empty((matrix_size, matrix_size), np.complex64) 

hope helpful.

this precision issue. fixed replacing host transfer , empty matrix creation code following...

# transfer host (cpu) memory device (gpu) memory a_gpu = gpuarray.to_gpu(a_cpu.astype(np.complex64)) b_gpu = gpuarray.to_gpu(b_cpu.astype(np.complex64))  # create empty gpuarry result (c = * b) c_gpu = gpuarray.empty((matrix_size, matrix_size), np.complex64) 

but find problem in using pycuda multiply large complex valued matrices. instance, taking matrix_size = 5040 , tile_size = 16 (which may not choice standpoint of hardware), cuda-c manage multiply matrices, pycuda crashes. why should happen?


Comments

Popular posts from this blog

basic authentication with http post params android -

vb.net - Virtual Keyboard commands -

css - Firefox for ubuntu renders wrong colors -