python - Genetic cellular automata with PyCuda, how to efficiently pass a lot of data per cell to CUDA kernel? -

I'm developing genetic cellular automata using PyCUDA. Each cell will have a lot of genome data, along with cell parameters. I'm wondering what would be the most efficient way to 1) pass the cells' data to a CUDA kernel, and 2) process that data.

I began with one particularly bad (IMO), yet still working, solution: passing each parameter in a separate array, then processing them with a switch-case and a lot of duplicated code.

Then I realized that I could end up with a pretty large number of parameters per kernel function, and decided to rewrite it.

The second solution was to store the whole bunch of a cell's parameters in a single array with an extra dimension. That is much more elegant in code, but surprisingly the code runs 10x slower!

To make it more clear, here is the full list of data that needs to be stored per cell:

(fc, mc, tc): 3x (int) - cell's current 'flavor', mass and temperature
(rfc, rmc, rtc): 3x (int) - cell's current registers
(fi, mi, ti) for each neighbour: 8*3x (int) - incoming values
(rfi, rmi, rti) for each neighbour: 8*3x (int) - incoming values
gate orientation: 1x (uchar)
execution pointer: 1x (uchar)
current micro-operations memory: 32x (uchar)
last step's micro-operations memory: 32x (uchar)

I'm splitting each automata step into 2 phases. The first (emit phase) calculates (fi, mi, ti) for each of a cell's neighbours. The second (absorb phase) blends the 8x (fi, mi, ti) values with the current cell's state. No genome or registers are implemented yet, but I need that data to be passed in the future.

So, the code of the first solution was:

# Solution #1: the cell state lives in `cells` (one int3 per cell: x = flavor,
# y = mass, z = temperature) and every neighbour direction gets its own int3
# array (`dcells0` .. `dcells7`) holding the values emitted towards that cell.
#
# NOTE(review): `ElementwiseKernel` is presumably
# pycuda.elementwise.ElementwiseKernel, imported outside this snippet. Inside an
# elementwise kernel body `i` is the index of the element being processed.

mk = 64     # divisor used in the temperature delta computed by calc_d()
tk = 1000   # temperature cap; also scales the emitted share of mass/temperature

# Emit phase: every cell splits part of its mass between its 8 neighbours
# (torus topology) and stores (flavor, mass share, temperature) in the
# per-direction array slot of the receiving cell.
emit_gpu = ElementwiseKernel(
    "int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, "
    "int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h",
    """
    // Cells are indexed as x * h + y.
    int x = i / h;
    int y = i % h;

    int3 cell = cells[i];
    float m = (float) cell.y;
    float t = (float) cell.z;
    // Emitted mass: the whole mass once t reaches tk, proportionally less below.
    int mi = (int) (fmin(1, t / tk) * m);
    cells[i].y -= mi;
    cells[i].z -= (int) (t * fmin(1, t / tk));

    int fi = cell.x;
    int mbase = mi / 8;   // equal share for every neighbour
    int mpart = mi % 8;   // remainder, handed out one unit at a time
    int madd;
    int ti = cell.z;      // temperature before the decrement above
    int ii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      // Start the walk over the 3x3 neighbourhood at an offset given by the
      // flavor, so the remainder units go to flavor-dependent directions.
      int c = (cc + fi) % 9;
      if (c == 4) continue;            // 4 is the cell itself
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo;
      if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo;
      // Fixed: the wrap used to test `xo >= w`, which can never be true after
      // the x wrap above, so yo == h produced a wrong or out-of-range index.
      if (yo >= h) yo = yo - h;
      ii = xo * h + yo;
      if (mpart > 0) { madd = 1; mpart--;} else madd = 0;
      switch(c) {
        case 0: dcells0[ii] = make_int3(fi, mbase + madd, ti); break;
        case 1: dcells1[ii] = make_int3(fi, mbase + madd, ti); break;
        case 2: dcells2[ii] = make_int3(fi, mbase + madd, ti); break;
        case 3: dcells3[ii] = make_int3(fi, mbase + madd, ti); break;
        case 5: dcells4[ii] = make_int3(fi, mbase + madd, ti); break;
        case 6: dcells5[ii] = make_int3(fi, mbase + madd, ti); break;
        case 7: dcells6[ii] = make_int3(fi, mbase + madd, ti); break;
        case 8: dcells7[ii] = make_int3(fi, mbase + madd, ti); break;
        default: break;
      }
    }
    """,
    "ca_prepare",
    preamble="""
#define tk %s
""" % tk)

# One absorb step, instantiated once per neighbour array via str.format (the
# template deliberately contains no braces besides the {k} placeholder).
# The temperature clamp is now applied after every neighbour; the original
# omitted it for dcells0 only, unlike the loop-based variants of this kernel.
_absorb_step = """
    dcell = dcells{k}[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > tk) cell.z = tk;
"""

# Absorb phase: blend the 8 incoming (flavor, mass, temperature) triples into
# the cell's own state and write the resulting colour to `img`.
absorb_gpu = ElementwiseKernel(
    "int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, "
    "int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, "
    "int w, int h",
    "\n    int3 cell = cells[i];\n    int3 dcell;\n"
    + "".join(_absorb_step.format(k=k) for k in range(8))
    + "\n    cells[i] = cell;\n    img[i] = hsv2rgb(cell);\n",
    "ca_calc",
    preamble="""
#include <math.h>
#define mk %s
#define tk %s

// Component-wise sum of two int3 values.
__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

// Deltas (flavor, mass, temperature) caused by absorbing the incoming triple
// (fi, mi, ti) into a cell whose current state is (fc, mc, tc).
__device__ int3 calc_d(int fc, int mc, int tc, int fi, int mi, int ti) {
    // Flavor is an angle in degrees: shift fc so the difference is the short way round.
    int df = fi - fc;
    if (df > 180) fc += 360;
    if (df < -180) fc -= 360;
    // Fraction of the combined mass contributed by the incoming value.
    float sm = mi + mc;
    if (sm != 0) sm = mi / sm; else sm = 0;
    df = (int) (fi - fc) * sm;
    int dm = mi;
    int dt = fabs((float) (fi - fc)) * fmin((float) mc, (float) mi) / mk + (ti - tc) * sm;
    return make_int3(df, dm, dt);
}

__device__ uint hsv2rgb(int3 pixel) {
    // body skipped for brevity
}
""" % (mk, tk))  # fixed: two placeholders, so two values (the stray, undefined `ram` raised TypeError)

The second and current solution:

# Solution #2: everything belonging to one cell is stored contiguously in a
# flat int array, `cn` ints per cell. Offsets used below:
#   +0, +1, +2      : fc, mc, tc (flavor, mass, temperature)
#   +6 + 3*k + 0..2 : incoming (fi, mi, ti) from neighbour slot k, k = 0..7
#
# NOTE(review): `ElementwiseKernel` is presumably
# pycuda.elementwise.ElementwiseKernel, imported outside this snippet. Inside an
# elementwise kernel body `i` is the index of the element being processed.

mk = 64         # divisor used in the temperature delta of the absorb phase
tk = 1000       # temperature cap; also scales the emitted share of mass/temperature
cell_len = 120  # number of parameters per cell

# Emit phase: split part of the cell's mass between its 8 neighbours (torus
# topology) and write (flavor, mass share, temperature) into the matching
# incoming slot of each receiving cell.
emit_gpu = ElementwiseKernel(
    "int *cells, int w, int h",
    """
    // Cells are indexed as x * h + y; ii is the first int of this cell's record.
    int x = i / h;
    int y = i % h;
    int ii = i * cn;

    int fc = cells[ii];
    int mc = cells[ii+1];
    int tc = cells[ii+2];
    float m = (float) mc;
    float t = (float) tc;
    // Emitted mass: the whole mass once t reaches tk, proportionally less below.
    int mi = (int) (fmin(1, t / tk) * m);
    cells[ii+1] = mc - mi;
    cells[ii+2] = tc - (int) (t * fmin(1, t / tk));

    int mbase = mi / 8;   // equal share for every neighbour
    int mpart = mi % 8;   // remainder, handed out one unit at a time
    int madd;
    int iii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      // Start the walk over the 3x3 neighbourhood at a flavor-dependent offset.
      int c = (cc + fc) % 9;
      if (c == 4) continue;            // 4 is the cell itself
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      // Fixed: the upper wrap used to test `xo >= w`, which is never true after
      // the x wrap above, so yo == h addressed the wrong (or no) cell.
      if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
      if (mpart > 0) { madd = 1; mpart--;} else madd = 0;
      if (c > 4) c--;                  // map 0..8 without the centre onto slots 0..7
      iii = (xo * h + yo) * cn + 6 + c*3;

      cells[iii] = fc;
      cells[iii+1] = mbase + madd;
      cells[iii+2] = tc;
    }
    """,
    "ca_emit",
    preamble="""
#define tk %s
#define cn %s
""" % (tk, cell_len))

# Absorb phase: blend the 8 incoming triples into the cell's own state and
# write the resulting colour to `img`.
absorb_gpu = ElementwiseKernel(
    "int *cells, int *img, int w, int h",
    """
    int ii = i * cn;
    int fc = cells[ii];
    int mc = cells[ii+1];
    int tc = cells[ii+2];

    for (int c = 0; c < 8; c++) {
      int iii = ii + c * 3 + 6;
      int fi = cells[iii];
      int mi = cells[iii+1];
      int ti = cells[iii+2];

      // Flavor is an angle in degrees: shift fc so the difference is the short way round.
      int df = fi - fc;
      if (df > 180) fc += 360;
      if (df < -180) fc -= 360;
      // Fraction of the combined mass contributed by the incoming value.
      float sm = mi + mc;
      if (sm != 0) sm = mi / sm; else sm = 0;
      df = (int) (fi - fc) * sm;
      int dm = mi;
      int dt = fabs((float) (fi - fc)) * fmin((float) mc, (float) mi) / mk + (ti - tc) * sm;
      fc += df;
      mc += dm;
      tc += dt;
      fc = fc % 360;
      if (fc < 0) fc += 360;
      if (tc > tk) tc = tk;
    }

    cells[ii] = fc;
    cells[ii+1] = mc;
    cells[ii+2] = tc;
    // NOTE(review): offset 18 coincides with the first int of incoming slot 4
    // (6 + 4*3); kept as in the original, confirm this counter is intended.
    cells[ii+18] = (cells[ii+18] + 1) % 8;

    img[i] = hsv2rgb(fc, tc, mc);
    """,
    "ca_absorb",
    preamble="""
#include <math.h>
#define mk %s
#define tk %s
#define cn %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // body skipped for brevity
}
""" % (mk, tk, cell_len))

Both variants produce the same CA behaviour, but the latter runs much slower.

gtx titan:

field size: 1900x1080 cells
solution #1: ~200 steps/s
solution #2: ~20 steps/s

gt 630m:

field size: 1600x900 cells
solution #1: ~7.8 steps/s
solution #2: ~1.5 steps/s

Please feel free to play with both solutions if you need to:

solution #1 full source

solution #2 full source

Any clues or advice are welcome:

Why did the performance slow down?
Is it possible to raise the performance of solution #2 to at least the level of #1?
Or would another solution be better?

OK, I managed to figure out how to run the second solution 15x faster. The following changes were made:

Converted the main parameters array from int to int4. This made it even faster than the solution with int3, although some space is left unused (the .w component). [3x speedup]
Repacked related parameters into width x height groups, so the shape changed from (width, height, n) to (n, width, height). This made memory access more efficient, since elements inside a group tend to be processed together. [5x speedup]

The optimized code looks like this:

# Optimized solution: `cells` is an int4 array organised as consecutive planes
# of one int4 per cell. Plane 0 holds the cell state (x = flavor, y = mass,
# z = temperature, w unused); planes 1..8 hold the incoming values from the
# 8 neighbours. `cn` is the distance, in int4 elements, between two planes.
#
# NOTE(review): `ElementwiseKernel` is presumably
# pycuda.elementwise.ElementwiseKernel, imported outside this snippet; `cn` is
# presumably w * h, confirm against the caller. Inside an elementwise kernel
# body `i` is the index of the element being processed.

mk = 64     # divisor used in the temperature delta of the absorb phase
tk = 1000   # temperature cap; also scales the emitted share of mass/temperature

# Emit phase: split part of the cell's mass between its 8 neighbours (torus
# topology) and write (flavor, mass share, temperature, 0) into the plane that
# belongs to the respective direction, at the receiving cell's position.
emit_gpu = ElementwiseKernel(
    "int4 *cells, int w, int h, int cn",
    """
    // Cells are indexed as x * h + y within a plane.
    int x = i / h;
    int y = i % h;

    int4 cell = cells[i];
    int fc = cell.x;
    int mc = cell.y;
    int tc = cell.z;
    float m = (float) mc;
    float t = (float) tc;
    // Emitted mass: the whole mass once t reaches tk, proportionally less below.
    int mi = (int) (fmin(1, t / tk) * m);
    cells[i] = make_int4(fc, mc - mi, tc - (int) (t * fmin(1, t / tk)), 0);

    int mbase = mi / 8;   // equal share for every neighbour
    int mpart = mi % 8;   // remainder, handed out one unit at a time
    int madd;
    int ii;
    int xo, yo;

    int cnn = 0;          // offset of the current neighbour plane
    for (int dx = -1; dx < 2; dx++) {
      xo = x + dx;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      for (int dy = -1; dy < 2; dy++) {
        if (dx == 0 && dy == 0) continue;   // skip the cell itself
        cnn += cn;
        yo = y + dy;
        if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
        if (mpart > 0) { madd = 1; mpart--;} else madd = 0;
        ii = (xo * h + yo) + cnn;
        cells[ii] = make_int4(fc, mbase + madd, tc, 0);
      }
    }
    """,
    "ca_emit",
    # Fixed: the preamble no longer defines a `cn` macro. `cn` is a kernel
    # argument here, so the macro would have rewritten the parameter name in
    # the generated signature, and `cell_len` is not defined in this version.
    preamble="""
#define tk %s
""" % tk)

# Absorb phase: blend the 8 incoming int4 values (one per plane) into the
# cell's own state and write the resulting colour to `img`.
absorb_gpu = ElementwiseKernel(
    "int4 *cells, int *img, int w, int h, int cn",
    """
    int ii = i;
    int4 cell = cells[i];
    int fc = cell.x;
    int mc = cell.y;
    int tc = cell.z;

    for (int c = 0; c < 8; c++) {
      ii += cn;            // same cell position in the next plane
      cell = cells[ii];
      int fi = cell.x;
      int mi = cell.y;
      int ti = cell.z;

      // Flavor is an angle in degrees: shift fc so the difference is the short way round.
      int df = fi - fc;
      if (df > 180) fc += 360;
      if (df < -180) fc -= 360;
      // Fraction of the combined mass contributed by the incoming value.
      float sm = mi + mc;
      if (sm != 0) sm = mi / sm; else sm = 0;
      df = (int) (fi - fc) * sm;
      int dm = mi;
      int dt = fabs((float) (fi - fc)) * fmin((float) mc, (float) mi) / mk + (ti - tc) * sm;
      fc += df;
      mc += dm;
      tc += dt;
      fc = fc % 360;
      if (fc < 0) fc += 360;
      if (tc > tk) tc = tk;
    }

    cells[i] = make_int4(fc, mc, tc, 0);
    img[i] = hsv2rgb(fc, tc, mc);
    """,
    "ca_absorb",
    preamble="""
#include <math.h>
#define mk %s
#define tk %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // body skipped for brevity
}
""" % (mk, tk))

Thanks to Park Young-Bae for the clues on repacking, and to Alexey Shchepin for pointing out some optimization issues.

Search This Blog

Deter

python - Genetic cellular automata with PyCuda, how to efficiently pass a lot of data per cell to CUDA kernel? -

Comments

Post a Comment

Popular posts from this blog

javascript - IndexedDB error: Uncaught DataCloneError: Failed to execute 'put' on 'IDBObjectStore': An object could not be cloned -

java - Unable to make sub reports with Jasper -

twitter bootstrap - Boostrap minimum scale with rotativa PDF -