/*
 * Example of how to use the mxGPUArray API in a MEX file. This example shows
 * how to write a MEX function that takes a gpuArray input and returns a
 * gpuArray output, e.g. B=mexFunction(A).
 *
 * Copyright 2012 The MathWorks, Inc.
 *
 * NOTE(review): the paragraph above reads like template boilerplate. What is
 * actually visible in this file: the __global__ kernels Conv1D and
 * cleanup_spikes are declared here, and the host code launches Conv1D,
 * bestFilter, cleanup_spikes and average_snips in that order, copies
 * d_counter back to the host with cudaMemcpy, and assigns plhs[0] from
 * mxGPUCreateMxArrayOnGPU(dWU). Confirm and update the description.
 *
 * NOTE(review): this text appears to have lost every span that originally sat
 * between a '<' and a '>' character. Symptoms: #include directives with no
 * header name, kernel launches written as <<>> with no launch configuration,
 * and fused conditions such as "if(tid01e-10)". The bodies of Conv1D and
 * cleanup_spikes are therefore incomplete, and the headers of bestFilter,
 * average_snips and the host entry point are missing. Presumably the file
 * must be restored from its original before it can compile - TODO confirm.
 */ #include #include #include #include #include #include "mex.h" #include "gpu/mxGPUArray.h" #include #include #include using namespace std; const int Nthreads = 1024, NchanMax = 128, NrankMax = 3; ////////////////////////////////////////////////////////////////////////////////////////// __global__ void Conv1D(const double *Params, const float *data, const float *W, float *conv_sig){ __shared__ float sW[81*NrankMax], sdata[(Nthreads+81)*NrankMax]; float x; int tid, nt0, tid0, bid, i, nid, Nrank, NT, Nfilt; tid = threadIdx.x; bid = blockIdx.x; Nfilt = (int) Params[1]; NT = (int) Params[0]; Nrank = (int) Params[6]; nt0 = (int) Params[9]; if(tid0){ for (i=0; i Cbest){ Cbest = Cf; xb = Ci - mu[i] * lam[i]; /// (lam[i] + 1); ibest = i; } } if (Cbest > Th*Th){ err[tid0] = Cbest; xbest[tid0] = xb; ftype[tid0] = ibest; } } } ////////////////////////////////////////////////////////////////////////////////////////// __global__ void cleanup_spikes(const double *Params, const float *xbest, const float *err, const int *ftype, const bool *UtU, int *st, int *id, float *x, float *C, int *counter, float *nsp){ int lockout, curr_token, indx, maxFR, Nfilt, NTOT, tid, bid, NT, tid0, j; volatile __shared__ float sdata[Nthreads+2*81+1]; volatile __shared__ int id_sh[Nthreads+2*81+1]; bool flag=0; float err0; lockout = (int) Params[9] - 1; tid = threadIdx.x; bid = blockIdx.x; NT = (int) Params[0]; Nfilt = (int) Params[1]; maxFR = (int) Params[3]; tid0 = bid * Nthreads; if(tid01e-10){ flag = 0; for(j=-lockout;j<=lockout;j++) if(sdata[tid+lockout+j]>err0) if (UtU[curr_token*Nfilt + id_sh[tid+lockout+j]]){ flag = 1; break; } if(flag==0){ indx = atomicAdd(&counter[0], 1); if (indx>>(d_Params, d_data, d_W, d_dout); bestFilter<<>>(d_Params, d_dout, d_mu, d_lam, 
d_nu, d_xbest, d_err, d_ftype); cleanup_spikes<<>>(d_Params, d_xbest, d_err, d_ftype, d_UtU, d_st, d_id, d_x, d_C, d_counter, d_nsp); dim3 block(nt0, 1024/nt0); average_snips<<>>( d_Params, d_st, d_id, d_x, d_counter, d_dataraw, d_dWU); cudaMemcpy(counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost); plhs[0] = mxGPUCreateMxArrayOnGPU(dWU); float *x, *C; int *st, *id; int minSize; if (counter[0]