/* * Example of how to use the mxGPUArray API in a MEX file. This example shows * how to write a MEX function that takes a gpuArray input and returns a * gpuArray output, e.g. B=mexFunction(A). * * Copyright 2012 The MathWorks, Inc. */ #include #include #include #include #include #include "mex.h" #include "gpu/mxGPUArray.h" #include #include #include using namespace std; const int Nthreads = 1024, NchanMax = 128, block = 32, NrankMax = 3; ////////////////////////////////////////////////////////////////////////////////////////// __global__ void Conv1D(const double *Params, const float *data, const float *W, float *conv_sig){ volatile __shared__ float sW[81*NrankMax], sdata[(Nthreads+81)*NrankMax]; float x; int tid, tid0, bid, i, nid, Nrank, NT, Nfilt, nt0; tid = threadIdx.x; bid = blockIdx.x; Nfilt = (int) Params[1]; NT = (int) Params[0]; Nrank = (int) Params[6]; nt0 = (int) Params[9]; if(tid0){ for (i=0; i Cbest + 1e-6){ Cbest = Cf; xb = Ci - lam[i] * mu[i]; // /(lam[i] + 1); ibest = i; } } if (Cbest > Th*Th){ err[tid0] = Cbest; xbest[tid0] = xb; ftype[tid0] = ibest; } } } ////////////////////////////////////////////////////////////////////////////////////////// __global__ void cleanup_spikes(const double *Params, const float *xbest, const float *err, const int *ftype, int *st, int *id, float *x, float *C, int *counter){ int lockout, indx, maxFR, NTOT, tid, bid, NT, tid0, j; volatile __shared__ float sdata[Nthreads+2*81+1]; bool flag=0; float err0; lockout = (int) Params[9] - 1; tid = threadIdx.x; bid = blockIdx.x; NT = (int) Params[0]; maxFR = (int) Params[3]; tid0 = bid * Nthreads; if(tid01e-10){ flag = 0; for(j=-lockout;j<=lockout;j++) if(sdata[tid+lockout+j]>err0){ flag = 1; break; } if(flag==0){ indx = atomicAdd(&counter[0], 1); if (indx=0 & tcurr>>(d_Params, d_data, d_W, d_dout); for(int k=0;k<(int) Params[4];k++){ cudaMemset(d_err, 0, NT * sizeof(float)); cudaMemset(d_ftype, 0, NT * sizeof(int)); cudaMemset(d_xbest, 0, NT * sizeof(float)); // compute the best filter bestFilter<<>>( d_Params, d_dout, d_mu, d_lam, d_nu, d_xbest, d_err, d_ftype); // ignore peaks that are smaller than another nearby peak cleanup_spikes<<>>(d_Params, d_xbest, d_err, d_ftype, d_st, d_id, d_x, d_C, d_counter); // add new spikes to 2nd counter cudaMemcpy(counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost); if (counter[0]>maxFR){ counter[0] = maxFR; cudaMemcpy(d_counter, counter, sizeof(int), cudaMemcpyHostToDevice); } // extract template features before subtraction extractFEAT<<>>(d_Params, d_st, d_id, d_x, d_counter, d_dout, d_WtW, d_lam, d_mu,d_feat); // subtract the detected spikes subSpikes<<>>(d_Params, d_st, d_id, d_x, d_counter, d_dout, d_WtW); // update 1st counter from 2nd counter cudaMemcpy(d_counter+1, d_counter, sizeof(int), cudaMemcpyDeviceToDevice); if(counter[0]==maxFR) break; } // extractFEAT<<>>(d_Params, d_st, d_id, d_x, d_counter, d_dout, d_WtW, d_feat); float *x, *C, *feat; int *st, *id; int minSize; if (counter[0]