Actual source code: veccuda2.cu
petsc-3.14.5 2021-03-03
1: /*
2: Implements the sequential cuda vectors.
3: */
5: #define PETSC_SKIP_SPINLOCK
6: #define PETSC_SKIP_CXX_COMPLEX_FIX
8: #include <petscconf.h>
9: #include <petsc/private/vecimpl.h>
10: #include <../src/vec/vec/impls/dvecimpl.h>
11: #include <petsc/private/cudavecimpl.h>
13: #include <cuda_runtime.h>
14: #include <thrust/device_ptr.h>
15: #include <thrust/transform.h>
16: #include <thrust/functional.h>
18: /*
19: Allocates space for the vector array on the GPU if it does not exist.
20:    Does NOT change the offload mask (v->offloadmask) for the vector
21: Does NOT zero the CUDA array
23: */
24: PetscErrorCode VecCUDAAllocateCheck(Vec v)
25: {
27: cudaError_t err;
28: Vec_CUDA *veccuda;
29: PetscBool option_set;
32: if (!v->spptr) {
33: PetscReal pinned_memory_min;
34: PetscMalloc(sizeof(Vec_CUDA),&v->spptr);
35: veccuda = (Vec_CUDA*)v->spptr;
36: err = cudaMalloc((void**)&veccuda->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRCUDA(err);
37: veccuda->GPUarray = veccuda->GPUarray_allocated;
38: veccuda->stream = 0; /* using default stream */
39: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
40: if (v->data && ((Vec_Seq*)v->data)->array) {
41: v->offloadmask = PETSC_OFFLOAD_CPU;
42: } else {
43: v->offloadmask = PETSC_OFFLOAD_GPU;
44: }
45: }
46: pinned_memory_min = 0;
48: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
49:      Note: This same code is duplicated in VecCreate_SeqCUDA_Private() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
50: PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECCUDA Options","Vec");
51: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
52: if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
53: PetscOptionsEnd();
54: }
55: return(0);
56: }
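/*
   A minimal usage sketch for the pinned-memory threshold registered above (hypothetical
   application code; the option name comes from the PetscOptionsReal() call in this
   function and VecSetPinnedMemoryMin() from the manual page it references):

     ./app -vec_type seqcuda -vec_pinned_memory_min 100000          command-line form

     ierr = VecSetPinnedMemoryMin(v,100000);CHKERRQ(ierr);          programmatic form

   Host buffers of at least this many bytes for such a vector may then be allocated as
   page-locked (pinned) memory (see the PetscMallocSetCUDAHost()/PetscMallocResetCUDAHost()
   pair used later in this file), which can speed up the cudaMemcpy() host/device transfers
   performed by VecCUDACopyToGPU()/VecCUDACopyFromGPU() below.
*/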
58: /* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
59: PetscErrorCode VecCUDACopyToGPU(Vec v)
60: {
62: cudaError_t err;
63: Vec_CUDA *veccuda;
64: PetscScalar *varray;
68: VecCUDAAllocateCheck(v);
69: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
70: PetscLogEventBegin(VEC_CUDACopyToGPU,v,0,0,0);
71: veccuda = (Vec_CUDA*)v->spptr;
72: varray = veccuda->GPUarray;
73: err = cudaMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
74: PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
75: PetscLogEventEnd(VEC_CUDACopyToGPU,v,0,0,0);
76: v->offloadmask = PETSC_OFFLOAD_BOTH;
77: }
78: return(0);
79: }
81: PetscErrorCode VecCUDACopyToGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
82: {
83: PetscScalar *varray;
84: PetscErrorCode ierr;
85: cudaError_t err;
86: PetscScalar *cpuPtr, *gpuPtr;
87: Vec_Seq *s;
88: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
89: PetscInt lowestIndex,n;
93: VecCUDAAllocateCheck(v);
94: if (v->offloadmask == PETSC_OFFLOAD_CPU) {
95: s = (Vec_Seq*)v->data;
96: if (mode & SCATTER_REVERSE) {
97: lowestIndex = ptop_scatter->sendLowestIndex;
98: n = ptop_scatter->ns;
99: } else {
100: lowestIndex = ptop_scatter->recvLowestIndex;
101: n = ptop_scatter->nr;
102: }
104: PetscLogEventBegin(VEC_CUDACopyToGPUSome,v,0,0,0);
105: varray = ((Vec_CUDA*)v->spptr)->GPUarray;
106: gpuPtr = varray + lowestIndex;
107: cpuPtr = s->array + lowestIndex;
109: /* Note : this code copies the smallest contiguous chunk of data
110: containing ALL of the indices */
111: err = cudaMemcpy(gpuPtr,cpuPtr,n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
112: PetscLogCpuToGpu(n*sizeof(PetscScalar));
114: /* Set the buffer states */
115: v->offloadmask = PETSC_OFFLOAD_BOTH;
116: PetscLogEventEnd(VEC_CUDACopyToGPUSome,v,0,0,0);
117: }
118: return(0);
119: }
121: /*
122: VecCUDACopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
123: */
124: PetscErrorCode VecCUDACopyFromGPU(Vec v)
125: {
127: cudaError_t err;
128: Vec_CUDA *veccuda;
129: PetscScalar *varray;
133: VecCUDAAllocateCheckHost(v);
134: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
135: PetscLogEventBegin(VEC_CUDACopyFromGPU,v,0,0,0);
136: veccuda = (Vec_CUDA*)v->spptr;
137: varray = veccuda->GPUarray;
138: err = cudaMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
139: PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
140: PetscLogEventEnd(VEC_CUDACopyFromGPU,v,0,0,0);
141: v->offloadmask = PETSC_OFFLOAD_BOTH;
142: }
143: return(0);
144: }
146: /* Note that this function only copies *some* of the values up from the GPU to CPU,
147:    which means that we need to recombine the data at some point before using any of the standard functions.
148:    We could add another few flag types to keep track of this, or treat it like VecGetArray()/VecRestoreArray(),
149:    where you always have to call them in pairs.
150: */
151: PetscErrorCode VecCUDACopyFromGPUSome(Vec v, PetscCUDAIndices ci,ScatterMode mode)
152: {
153: const PetscScalar *varray, *gpuPtr;
154: PetscErrorCode ierr;
155: cudaError_t err;
156: PetscScalar *cpuPtr;
157: Vec_Seq *s;
158: VecScatterCUDAIndices_PtoP ptop_scatter = (VecScatterCUDAIndices_PtoP)ci->scatter;
159: PetscInt lowestIndex,n;
163: VecCUDAAllocateCheckHost(v);
164: if (v->offloadmask == PETSC_OFFLOAD_GPU) {
165: PetscLogEventBegin(VEC_CUDACopyFromGPUSome,v,0,0,0);
166: if (mode & SCATTER_REVERSE) {
167: lowestIndex = ptop_scatter->recvLowestIndex;
168: n = ptop_scatter->nr;
169: } else {
170: lowestIndex = ptop_scatter->sendLowestIndex;
171: n = ptop_scatter->ns;
172: }
174: varray=((Vec_CUDA*)v->spptr)->GPUarray;
175: s = (Vec_Seq*)v->data;
176: gpuPtr = varray + lowestIndex;
177: cpuPtr = s->array + lowestIndex;
179: /* Note : this code copies the smallest contiguous chunk of data
180: containing ALL of the indices */
181: err = cudaMemcpy(cpuPtr,gpuPtr,n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
182: PetscLogGpuToCpu(n*sizeof(PetscScalar));
184: VecCUDARestoreArrayRead(v,&varray);
185: PetscLogEventEnd(VEC_CUDACopyFromGPUSome,v,0,0,0);
186: }
187: return(0);
188: }
190: /*MC
191: VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
193: Options Database Keys:
194: . -vec_type seqcuda - sets the vector type to VECSEQCUDA during a call to VecSetFromOptions()
196: Level: beginner
198: .seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq(), VecSetPinnedMemoryMin()
199: M*/
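/*
   A minimal creation sketch for VECSEQCUDA (hypothetical application code, assuming a
   PETSc build configured with CUDA; the vector length 100 is arbitrary):

     #include <petscvec.h>

     Vec            x;
     PetscErrorCode ierr;

     ierr = VecCreate(PETSC_COMM_SELF,&x);CHKERRQ(ierr);
     ierr = VecSetSizes(x,PETSC_DECIDE,100);CHKERRQ(ierr);
     ierr = VecSetType(x,VECSEQCUDA);CHKERRQ(ierr);     (or VecSetFromOptions(x) together with -vec_type seqcuda)
     ierr = VecSet(x,1.0);CHKERRQ(ierr);                (dispatches to VecSet_SeqCUDA() below)
     ierr = VecDestroy(&x);CHKERRQ(ierr);
*/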
201: PetscErrorCode VecAYPX_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
202: {
203: const PetscScalar *xarray;
204: PetscScalar *yarray;
205: PetscErrorCode ierr;
206: PetscBLASInt one = 1,bn = 0;
207: PetscScalar sone = 1.0;
208: cublasHandle_t cublasv2handle;
209: cublasStatus_t cberr;
210: cudaError_t err;
213: PetscCUBLASGetHandle(&cublasv2handle);
214: PetscBLASIntCast(yin->map->n,&bn);
215: VecCUDAGetArrayRead(xin,&xarray);
216: VecCUDAGetArray(yin,&yarray);
217: PetscLogGpuTimeBegin();
218: if (alpha == (PetscScalar)0.0) {
219: err = cudaMemcpy(yarray,xarray,bn*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
220: } else if (alpha == (PetscScalar)1.0) {
221: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
222: PetscLogGpuFlops(1.0*yin->map->n);
223: } else {
224: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
225: cberr = cublasXaxpy(cublasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
226: PetscLogGpuFlops(2.0*yin->map->n);
227: }
228: err = WaitForCUDA();CHKERRCUDA(err);
229: PetscLogGpuTimeEnd();
230: VecCUDARestoreArrayRead(xin,&xarray);
231: VecCUDARestoreArray(yin,&yarray);
232: return(0);
233: }
235: PetscErrorCode VecAXPY_SeqCUDA(Vec yin,PetscScalar alpha,Vec xin)
236: {
237: const PetscScalar *xarray;
238: PetscScalar *yarray;
239: PetscErrorCode ierr;
240: PetscBLASInt one = 1,bn = 0;
241: cublasHandle_t cublasv2handle;
242: cublasStatus_t cberr;
243: PetscBool yiscuda,xiscuda;
244: cudaError_t err;
247: if (alpha == (PetscScalar)0.0) return(0);
248: PetscCUBLASGetHandle(&cublasv2handle);
249: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
250: PetscObjectTypeCompareAny((PetscObject)xin,&xiscuda,VECSEQCUDA,VECMPICUDA,"");
251: if (xiscuda && yiscuda) {
252: PetscBLASIntCast(yin->map->n,&bn);
253: VecCUDAGetArrayRead(xin,&xarray);
254: VecCUDAGetArray(yin,&yarray);
255: PetscLogGpuTimeBegin();
256: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
257: err = WaitForCUDA();CHKERRCUDA(err);
258: PetscLogGpuTimeEnd();
259: VecCUDARestoreArrayRead(xin,&xarray);
260: VecCUDARestoreArray(yin,&yarray);
261: PetscLogGpuFlops(2.0*yin->map->n);
262: } else {
263: VecAXPY_Seq(yin,alpha,xin);
264: }
265: return(0);
266: }
268: PetscErrorCode VecPointwiseDivide_SeqCUDA(Vec win, Vec xin, Vec yin)
269: {
270: PetscInt n = xin->map->n;
271: const PetscScalar *xarray=NULL,*yarray=NULL;
272: PetscScalar *warray=NULL;
273: thrust::device_ptr<const PetscScalar> xptr,yptr;
274: thrust::device_ptr<PetscScalar> wptr;
275: PetscErrorCode ierr;
276: cudaError_t err;
279: VecCUDAGetArrayWrite(win,&warray);
280: VecCUDAGetArrayRead(xin,&xarray);
281: VecCUDAGetArrayRead(yin,&yarray);
282: PetscLogGpuTimeBegin();
283: try {
284: wptr = thrust::device_pointer_cast(warray);
285: xptr = thrust::device_pointer_cast(xarray);
286: yptr = thrust::device_pointer_cast(yarray);
287: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
288: err = WaitForCUDA();CHKERRCUDA(err);
289: } catch (char *ex) {
290: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
291: }
292: PetscLogGpuTimeEnd();
293: PetscLogGpuFlops(n);
294: VecCUDARestoreArrayRead(xin,&xarray);
295: VecCUDARestoreArrayRead(yin,&yarray);
296: VecCUDARestoreArrayWrite(win,&warray);
297: return(0);
298: }
300: PetscErrorCode VecWAXPY_SeqCUDA(Vec win,PetscScalar alpha,Vec xin, Vec yin)
301: {
302: const PetscScalar *xarray=NULL,*yarray=NULL;
303: PetscScalar *warray=NULL;
304: PetscErrorCode ierr;
305: PetscBLASInt one = 1,bn = 0;
306: cublasHandle_t cublasv2handle;
307: cublasStatus_t cberr;
308: cudaError_t err;
311: PetscCUBLASGetHandle(&cublasv2handle);
312: PetscBLASIntCast(win->map->n,&bn);
313: if (alpha == (PetscScalar)0.0) {
314: VecCopy_SeqCUDA(yin,win);
315: } else {
316: VecCUDAGetArrayRead(xin,&xarray);
317: VecCUDAGetArrayRead(yin,&yarray);
318: VecCUDAGetArrayWrite(win,&warray);
319: PetscLogGpuTimeBegin();
320: err = cudaMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
321: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRCUBLAS(cberr);
322: err = WaitForCUDA();CHKERRCUDA(err);
323: PetscLogGpuTimeEnd();
324: PetscLogGpuFlops(2*win->map->n);
325: VecCUDARestoreArrayRead(xin,&xarray);
326: VecCUDARestoreArrayRead(yin,&yarray);
327: VecCUDARestoreArrayWrite(win,&warray);
328: }
329: return(0);
330: }
332: PetscErrorCode VecMAXPY_SeqCUDA(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
333: {
335: cudaError_t err;
336: PetscInt n = xin->map->n,j,j_rem;
337: PetscScalar alpha0,alpha1,alpha2,alpha3;
340: PetscLogGpuFlops(nv*2.0*n);
341: switch (j_rem=nv&0x3) {
342: case 3:
343: alpha0 = alpha[0];
344: alpha1 = alpha[1];
345: alpha2 = alpha[2];
346: alpha += 3;
347: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
348: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
349: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
350: y += 3;
351: break;
352: case 2:
353: alpha0 = alpha[0];
354: alpha1 = alpha[1];
355: alpha +=2;
356: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
357: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
358: y +=2;
359: break;
360: case 1:
361: alpha0 = *alpha++;
362: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
363: y +=1;
364: break;
365: }
366: for (j=j_rem; j<nv; j+=4) {
367: alpha0 = alpha[0];
368: alpha1 = alpha[1];
369: alpha2 = alpha[2];
370: alpha3 = alpha[3];
371: alpha += 4;
372: VecAXPY_SeqCUDA(xin,alpha0,y[0]);
373: VecAXPY_SeqCUDA(xin,alpha1,y[1]);
374: VecAXPY_SeqCUDA(xin,alpha2,y[2]);
375: VecAXPY_SeqCUDA(xin,alpha3,y[3]);
376: y += 4;
377: }
378: err = WaitForCUDA();CHKERRCUDA(err);
379: return(0);
380: }
382: PetscErrorCode VecDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
383: {
384: const PetscScalar *xarray,*yarray;
385: PetscErrorCode ierr;
386: PetscBLASInt one = 1,bn = 0;
387: cublasHandle_t cublasv2handle;
388: cublasStatus_t cerr;
391: PetscCUBLASGetHandle(&cublasv2handle);
392: PetscBLASIntCast(yin->map->n,&bn);
393: VecCUDAGetArrayRead(xin,&xarray);
394: VecCUDAGetArrayRead(yin,&yarray);
395: /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
396: PetscLogGpuTimeBegin();
397: cerr = cublasXdot(cublasv2handle,bn,yarray,one,xarray,one,z);CHKERRCUBLAS(cerr);
398: PetscLogGpuTimeEnd();
399: if (xin->map->n >0) {
400: PetscLogGpuFlops(2.0*xin->map->n-1);
401: }
402: VecCUDARestoreArrayRead(xin,&xarray);
403: VecCUDARestoreArrayRead(yin,&yarray);
404: return(0);
405: }
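/*
   Worked form of the argument swap in VecDot_SeqCUDA() above: for complex scalars PETSc
   defines VecDot(x,y) = y^H x = sum_i x_i*PetscConj(y_i), while cublasXdot() (mapping to
   cublasZdotc()/cublasCdotc() in the complex case) conjugates its FIRST vector argument,
   computing sum_i conj(a_i)*b_i. Calling it as cublasXdot(handle,n,y,1,x,1,z) therefore
   yields sum_i conj(y_i)*x_i = y^H x, matching the PETSc convention; for real scalars the
   ordering does not matter.
*/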
407: //
408: // CUDA kernels for MDot to follow
409: //
411: // set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
412: #define MDOT_WORKGROUP_SIZE 128
413: #define MDOT_WORKGROUP_NUM 128
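/*
   Sketch of the two-stage reduction implemented by the kernels below (the result layout is
   taken from the kernel epilogues and the host-side summation loops in VecMDot_SeqCUDA()):

     stage 1 (device): each of the MDOT_WORKGROUP_NUM blocks strides over its slice of x and
       of the y vectors, accumulates per-thread partial sums in shared memory, halves the
       number of active threads each step, and finally thread 0 writes one partial result per
       dot product:  group_results[j*gridDim.x + blockIdx.x]  for dot product j.

     stage 2 (host): after a cudaMemcpy of the MDOT_WORKGROUP_NUM partial results per dot
       product, the host sums them into z[current_y_index + j].

   The in-block reduction assumes blockDim.x (= MDOT_WORKGROUP_SIZE) is a power of 2, as the
   comment above notes.
*/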
415: #if !defined(PETSC_USE_COMPLEX)
416: // M = 2:
417: __global__ void VecMDot_SeqCUDA_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
418: PetscInt size, PetscScalar *group_results)
419: {
420: __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
421: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
422: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
423: PetscInt vec_start_index = blockIdx.x * entries_per_group;
424: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
426: PetscScalar entry_x = 0;
427: PetscScalar group_sum0 = 0;
428: PetscScalar group_sum1 = 0;
429: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
430: entry_x = x[i]; // load only once from global memory!
431: group_sum0 += entry_x * y0[i];
432: group_sum1 += entry_x * y1[i];
433: }
434: tmp_buffer[threadIdx.x] = group_sum0;
435: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
437: // parallel reduction
438: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
439: __syncthreads();
440: if (threadIdx.x < stride) {
441: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
442: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
443: }
444: }
446: // write result of group to group_results
447: if (threadIdx.x == 0) {
448: group_results[blockIdx.x] = tmp_buffer[0];
449: group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
450: }
451: }
453: // M = 3:
454: __global__ void VecMDot_SeqCUDA_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
455: PetscInt size, PetscScalar *group_results)
456: {
457: __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
458: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
459: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
460: PetscInt vec_start_index = blockIdx.x * entries_per_group;
461: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
463: PetscScalar entry_x = 0;
464: PetscScalar group_sum0 = 0;
465: PetscScalar group_sum1 = 0;
466: PetscScalar group_sum2 = 0;
467: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
468: entry_x = x[i]; // load only once from global memory!
469: group_sum0 += entry_x * y0[i];
470: group_sum1 += entry_x * y1[i];
471: group_sum2 += entry_x * y2[i];
472: }
473: tmp_buffer[threadIdx.x] = group_sum0;
474: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
475: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
477: // parallel reduction
478: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
479: __syncthreads();
480: if (threadIdx.x < stride) {
481: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
482: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
483: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
484: }
485: }
487: // write result of group to group_results
488: if (threadIdx.x == 0) {
489: group_results[blockIdx.x ] = tmp_buffer[0];
490: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
491: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
492: }
493: }
495: // M = 4:
496: __global__ void VecMDot_SeqCUDA_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
497: PetscInt size, PetscScalar *group_results)
498: {
499: __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
500: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
501: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
502: PetscInt vec_start_index = blockIdx.x * entries_per_group;
503: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
505: PetscScalar entry_x = 0;
506: PetscScalar group_sum0 = 0;
507: PetscScalar group_sum1 = 0;
508: PetscScalar group_sum2 = 0;
509: PetscScalar group_sum3 = 0;
510: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
511: entry_x = x[i]; // load only once from global memory!
512: group_sum0 += entry_x * y0[i];
513: group_sum1 += entry_x * y1[i];
514: group_sum2 += entry_x * y2[i];
515: group_sum3 += entry_x * y3[i];
516: }
517: tmp_buffer[threadIdx.x] = group_sum0;
518: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
519: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
520: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
522: // parallel reduction
523: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
524: __syncthreads();
525: if (threadIdx.x < stride) {
526: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
527: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
528: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
529: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
530: }
531: }
533: // write result of group to group_results
534: if (threadIdx.x == 0) {
535: group_results[blockIdx.x ] = tmp_buffer[0];
536: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
537: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
538: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
539: }
540: }
542: // M = 8:
543: __global__ void VecMDot_SeqCUDA_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
544: const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
545: PetscInt size, PetscScalar *group_results)
546: {
547: __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
548: PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
549: entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
550: PetscInt vec_start_index = blockIdx.x * entries_per_group;
551: PetscInt vec_stop_index = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size
553: PetscScalar entry_x = 0;
554: PetscScalar group_sum0 = 0;
555: PetscScalar group_sum1 = 0;
556: PetscScalar group_sum2 = 0;
557: PetscScalar group_sum3 = 0;
558: PetscScalar group_sum4 = 0;
559: PetscScalar group_sum5 = 0;
560: PetscScalar group_sum6 = 0;
561: PetscScalar group_sum7 = 0;
562: for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
563: entry_x = x[i]; // load only once from global memory!
564: group_sum0 += entry_x * y0[i];
565: group_sum1 += entry_x * y1[i];
566: group_sum2 += entry_x * y2[i];
567: group_sum3 += entry_x * y3[i];
568: group_sum4 += entry_x * y4[i];
569: group_sum5 += entry_x * y5[i];
570: group_sum6 += entry_x * y6[i];
571: group_sum7 += entry_x * y7[i];
572: }
573: tmp_buffer[threadIdx.x] = group_sum0;
574: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;
575: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
576: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
577: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
578: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
579: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
580: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;
582: // parallel reduction
583: for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
584: __syncthreads();
585: if (threadIdx.x < stride) {
586: tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
587: tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
588: tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
589: tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
590: tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
591: tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
592: tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
593: tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
594: }
595: }
597: // write result of group to group_results
598: if (threadIdx.x == 0) {
599: group_results[blockIdx.x ] = tmp_buffer[0];
600: group_results[blockIdx.x + gridDim.x] = tmp_buffer[ MDOT_WORKGROUP_SIZE];
601: group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
602: group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
603: group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
604: group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
605: group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
606: group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
607: }
608: }
609: #endif /* !defined(PETSC_USE_COMPLEX) */
611: PetscErrorCode VecMDot_SeqCUDA(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
612: {
613: PetscErrorCode ierr;
614: PetscInt i,n = xin->map->n,current_y_index = 0;
615: const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
616: PetscScalar *group_results_gpu;
617: #if !defined(PETSC_USE_COMPLEX)
618: PetscInt j;
619: PetscScalar group_results_cpu[MDOT_WORKGROUP_NUM * 8]; // we process at most eight vectors in one kernel
620: #endif
621: cudaError_t cuda_ierr;
622: PetscBLASInt one = 1,bn = 0;
623: cublasHandle_t cublasv2handle;
624: cublasStatus_t cberr;
627: PetscCUBLASGetHandle(&cublasv2handle);
628: PetscBLASIntCast(xin->map->n,&bn);
629: if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqCUDA not positive.");
630: /* Handle the case of local size zero first */
631: if (!xin->map->n) {
632: for (i=0; i<nv; ++i) z[i] = 0;
633: return(0);
634: }
636: // allocate scratchpad memory for the results of individual work groups:
637:   cuda_ierr = cudaMalloc((void**)&group_results_gpu, sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8);CHKERRCUDA(cuda_ierr);
639: VecCUDAGetArrayRead(xin,&xptr);
641: while (current_y_index < nv)
642: {
643: switch (nv - current_y_index) {
645: case 7:
646: case 6:
647: case 5:
648: case 4:
649: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
650: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
651: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
652: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
653: PetscLogGpuTimeBegin();
654: #if defined(PETSC_USE_COMPLEX)
655: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
656: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
657: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
658: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
659: PetscLogGpuTimeEnd();
660: #else
661: // run kernel:
662: VecMDot_SeqCUDA_kernel4<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu);
663: PetscLogGpuTimeEnd();
665:       // copy results back to the host
666:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 4,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
668: // sum group results into z:
669: for (j=0; j<4; ++j) {
670: z[current_y_index + j] = 0;
671: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
672: }
673: #endif
674: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
675: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
676: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
677: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
678: current_y_index += 4;
679: break;
681: case 3:
682: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
683: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
684: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
686: PetscLogGpuTimeBegin();
687: #if defined(PETSC_USE_COMPLEX)
688: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
689: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
690: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
691: PetscLogGpuTimeEnd();
692: #else
693: // run kernel:
694: VecMDot_SeqCUDA_kernel3<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu);
695: PetscLogGpuTimeEnd();
697:       // copy results back to the host
698:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 3,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
700: // sum group results into z:
701: for (j=0; j<3; ++j) {
702: z[current_y_index + j] = 0;
703: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
704: }
705: #endif
706: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
707: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
708: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
709: current_y_index += 3;
710: break;
712: case 2:
713: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
714: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
715: PetscLogGpuTimeBegin();
716: #if defined(PETSC_USE_COMPLEX)
717: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
718: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
719: PetscLogGpuTimeEnd();
720: #else
721: // run kernel:
722: VecMDot_SeqCUDA_kernel2<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,n,group_results_gpu);
723: PetscLogGpuTimeEnd();
725:       // copy results back to the host
726:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 2,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
728: // sum group results into z:
729: for (j=0; j<2; ++j) {
730: z[current_y_index + j] = 0;
731: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
732: }
733: #endif
734: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
735: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
736: current_y_index += 2;
737: break;
739: case 1:
740: VecCUDAGetArrayRead(yin[current_y_index],&y0ptr);
741: PetscLogGpuTimeBegin();
742: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
743: PetscLogGpuTimeEnd();
744: VecCUDARestoreArrayRead(yin[current_y_index],&y0ptr);
745: current_y_index += 1;
746: break;
748: default: // 8 or more vectors left
749: VecCUDAGetArrayRead(yin[current_y_index ],&y0ptr);
750: VecCUDAGetArrayRead(yin[current_y_index+1],&y1ptr);
751: VecCUDAGetArrayRead(yin[current_y_index+2],&y2ptr);
752: VecCUDAGetArrayRead(yin[current_y_index+3],&y3ptr);
753: VecCUDAGetArrayRead(yin[current_y_index+4],&y4ptr);
754: VecCUDAGetArrayRead(yin[current_y_index+5],&y5ptr);
755: VecCUDAGetArrayRead(yin[current_y_index+6],&y6ptr);
756: VecCUDAGetArrayRead(yin[current_y_index+7],&y7ptr);
757: PetscLogGpuTimeBegin();
758: #if defined(PETSC_USE_COMPLEX)
759: cberr = cublasXdot(cublasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRCUBLAS(cberr);
760: cberr = cublasXdot(cublasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRCUBLAS(cberr);
761: cberr = cublasXdot(cublasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRCUBLAS(cberr);
762: cberr = cublasXdot(cublasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRCUBLAS(cberr);
763: cberr = cublasXdot(cublasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRCUBLAS(cberr);
764: cberr = cublasXdot(cublasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRCUBLAS(cberr);
765: cberr = cublasXdot(cublasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRCUBLAS(cberr);
766: cberr = cublasXdot(cublasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRCUBLAS(cberr);
767: PetscLogGpuTimeEnd();
768: #else
769: // run kernel:
770: VecMDot_SeqCUDA_kernel8<<<MDOT_WORKGROUP_NUM,MDOT_WORKGROUP_SIZE>>>(xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu);
771: PetscLogGpuTimeEnd();
773:       // copy results back to the host
774:       cuda_ierr = cudaMemcpy(group_results_cpu,group_results_gpu,sizeof(PetscScalar) * MDOT_WORKGROUP_NUM * 8,cudaMemcpyDeviceToHost);CHKERRCUDA(cuda_ierr);
776: // sum group results into z:
777: for (j=0; j<8; ++j) {
778: z[current_y_index + j] = 0;
779: for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[current_y_index + j] += group_results_cpu[i];
780: }
781: #endif
782: VecCUDARestoreArrayRead(yin[current_y_index ],&y0ptr);
783: VecCUDARestoreArrayRead(yin[current_y_index+1],&y1ptr);
784: VecCUDARestoreArrayRead(yin[current_y_index+2],&y2ptr);
785: VecCUDARestoreArrayRead(yin[current_y_index+3],&y3ptr);
786: VecCUDARestoreArrayRead(yin[current_y_index+4],&y4ptr);
787: VecCUDARestoreArrayRead(yin[current_y_index+5],&y5ptr);
788: VecCUDARestoreArrayRead(yin[current_y_index+6],&y6ptr);
789: VecCUDARestoreArrayRead(yin[current_y_index+7],&y7ptr);
790: current_y_index += 8;
791: break;
792: }
793: }
794: VecCUDARestoreArrayRead(xin,&xptr);
796:   cuda_ierr = cudaFree(group_results_gpu);CHKERRCUDA(cuda_ierr);
797: PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
798: return(0);
799: }
801: #undef MDOT_WORKGROUP_SIZE
802: #undef MDOT_WORKGROUP_NUM
804: PetscErrorCode VecSet_SeqCUDA(Vec xin,PetscScalar alpha)
805: {
806: PetscInt n = xin->map->n;
807: PetscScalar *xarray = NULL;
808: thrust::device_ptr<PetscScalar> xptr;
809: PetscErrorCode ierr;
810: cudaError_t err;
813: VecCUDAGetArrayWrite(xin,&xarray);
814: PetscLogGpuTimeBegin();
815: if (alpha == (PetscScalar)0.0) {
816: err = cudaMemset(xarray,0,n*sizeof(PetscScalar));CHKERRCUDA(err);
817: } else {
818: try {
819: xptr = thrust::device_pointer_cast(xarray);
820: thrust::fill(xptr,xptr+n,alpha);
821: } catch (char *ex) {
822: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
823: }
824: }
825: err = WaitForCUDA();CHKERRCUDA(err);
826: PetscLogGpuTimeEnd();
827: VecCUDARestoreArrayWrite(xin,&xarray);
828: return(0);
829: }
831: PetscErrorCode VecScale_SeqCUDA(Vec xin,PetscScalar alpha)
832: {
833: PetscScalar *xarray;
835: PetscBLASInt one = 1,bn = 0;
836: cublasHandle_t cublasv2handle;
837: cublasStatus_t cberr;
838: cudaError_t err;
841: if (alpha == (PetscScalar)0.0) {
842: VecSet_SeqCUDA(xin,alpha);
843: err = WaitForCUDA();CHKERRCUDA(err);
844: } else if (alpha != (PetscScalar)1.0) {
845: PetscCUBLASGetHandle(&cublasv2handle);
846: PetscBLASIntCast(xin->map->n,&bn);
847: VecCUDAGetArray(xin,&xarray);
848: PetscLogGpuTimeBegin();
849: cberr = cublasXscal(cublasv2handle,bn,&alpha,xarray,one);CHKERRCUBLAS(cberr);
850: VecCUDARestoreArray(xin,&xarray);
851: err = WaitForCUDA();CHKERRCUDA(err);
852: PetscLogGpuTimeEnd();
853: }
854: PetscLogGpuFlops(xin->map->n);
855: return(0);
856: }
858: PetscErrorCode VecTDot_SeqCUDA(Vec xin,Vec yin,PetscScalar *z)
859: {
860: const PetscScalar *xarray,*yarray;
861: PetscErrorCode ierr;
862: PetscBLASInt one = 1,bn = 0;
863: cublasHandle_t cublasv2handle;
864: cublasStatus_t cerr;
867: PetscCUBLASGetHandle(&cublasv2handle);
868: PetscBLASIntCast(xin->map->n,&bn);
869: VecCUDAGetArrayRead(xin,&xarray);
870: VecCUDAGetArrayRead(yin,&yarray);
871: PetscLogGpuTimeBegin();
872: cerr = cublasXdotu(cublasv2handle,bn,xarray,one,yarray,one,z);CHKERRCUBLAS(cerr);
873: PetscLogGpuTimeEnd();
874: if (xin->map->n > 0) {
875: PetscLogGpuFlops(2.0*xin->map->n-1);
876: }
877: VecCUDARestoreArrayRead(yin,&yarray);
878: VecCUDARestoreArrayRead(xin,&xarray);
879: return(0);
880: }
882: PetscErrorCode VecCopy_SeqCUDA(Vec xin,Vec yin)
883: {
884: const PetscScalar *xarray;
885: PetscScalar *yarray;
886: PetscErrorCode ierr;
887: cudaError_t err;
890: if (xin != yin) {
891: if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
892: PetscBool yiscuda;
894: PetscObjectTypeCompareAny((PetscObject)yin,&yiscuda,VECSEQCUDA,VECMPICUDA,"");
895: VecCUDAGetArrayRead(xin,&xarray);
896: if (yiscuda) {
897: VecCUDAGetArrayWrite(yin,&yarray);
898: } else {
899: VecGetArrayWrite(yin,&yarray);
900: }
901: PetscLogGpuTimeBegin();
902: if (yiscuda) {
903: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
904: } else {
905: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
906: }
907: err = WaitForCUDA();CHKERRCUDA(err);
908: PetscLogGpuTimeEnd();
909: VecCUDARestoreArrayRead(xin,&xarray);
910: if (yiscuda) {
911: VecCUDARestoreArrayWrite(yin,&yarray);
912: } else {
913: VecRestoreArrayWrite(yin,&yarray);
914: }
915: } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
916: /* copy in CPU if we are on the CPU */
917: VecCopy_SeqCUDA_Private(xin,yin);
918: } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
919: /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
920: if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
921: /* copy in CPU */
922: VecCopy_SeqCUDA_Private(xin,yin);
923: } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
924: /* copy in GPU */
925: VecCUDAGetArrayRead(xin,&xarray);
926: VecCUDAGetArrayWrite(yin,&yarray);
927: PetscLogGpuTimeBegin();
928: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
929: PetscLogGpuTimeEnd();
930: VecCUDARestoreArrayRead(xin,&xarray);
931: VecCUDARestoreArrayWrite(yin,&yarray);
932: } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
933:       /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
934:          default to copying on the GPU (this is an arbitrary choice) */
935: VecCUDAGetArrayRead(xin,&xarray);
936: VecCUDAGetArrayWrite(yin,&yarray);
937: PetscLogGpuTimeBegin();
938: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
939: PetscLogGpuTimeEnd();
940: VecCUDARestoreArrayRead(xin,&xarray);
941: VecCUDARestoreArrayWrite(yin,&yarray);
942: } else {
943: VecCopy_SeqCUDA_Private(xin,yin);
944: }
945: }
946: }
947: return(0);
948: }
950: PetscErrorCode VecSwap_SeqCUDA(Vec xin,Vec yin)
951: {
953: PetscBLASInt one = 1,bn = 0;
954: PetscScalar *xarray,*yarray;
955: cublasHandle_t cublasv2handle;
956: cublasStatus_t cberr;
957: cudaError_t err;
960: PetscCUBLASGetHandle(&cublasv2handle);
961: PetscBLASIntCast(xin->map->n,&bn);
962: if (xin != yin) {
963: VecCUDAGetArray(xin,&xarray);
964: VecCUDAGetArray(yin,&yarray);
965: PetscLogGpuTimeBegin();
966: cberr = cublasXswap(cublasv2handle,bn,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
967: err = WaitForCUDA();CHKERRCUDA(err);
968: PetscLogGpuTimeEnd();
969: VecCUDARestoreArray(xin,&xarray);
970: VecCUDARestoreArray(yin,&yarray);
971: }
972: return(0);
973: }
975: PetscErrorCode VecAXPBY_SeqCUDA(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
976: {
977: PetscErrorCode ierr;
978: PetscScalar a = alpha,b = beta;
979: const PetscScalar *xarray;
980: PetscScalar *yarray;
981: PetscBLASInt one = 1, bn = 0;
982: cublasHandle_t cublasv2handle;
983: cublasStatus_t cberr;
984: cudaError_t err;
987: PetscCUBLASGetHandle(&cublasv2handle);
988: PetscBLASIntCast(yin->map->n,&bn);
989: if (a == (PetscScalar)0.0) {
990: VecScale_SeqCUDA(yin,beta);
991: } else if (b == (PetscScalar)1.0) {
992: VecAXPY_SeqCUDA(yin,alpha,xin);
993: } else if (a == (PetscScalar)1.0) {
994: VecAYPX_SeqCUDA(yin,beta,xin);
995: } else if (b == (PetscScalar)0.0) {
996: VecCUDAGetArrayRead(xin,&xarray);
997: VecCUDAGetArray(yin,&yarray);
998: PetscLogGpuTimeBegin();
999: err = cudaMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice);CHKERRCUDA(err);
1000: cberr = cublasXscal(cublasv2handle,bn,&alpha,yarray,one);CHKERRCUBLAS(cberr);
1001: err = WaitForCUDA();CHKERRCUDA(err);
1002: PetscLogGpuTimeEnd();
1003: PetscLogGpuFlops(xin->map->n);
1004: VecCUDARestoreArrayRead(xin,&xarray);
1005: VecCUDARestoreArray(yin,&yarray);
1006: } else {
1007: VecCUDAGetArrayRead(xin,&xarray);
1008: VecCUDAGetArray(yin,&yarray);
1009: PetscLogGpuTimeBegin();
1010: cberr = cublasXscal(cublasv2handle,bn,&beta,yarray,one);CHKERRCUBLAS(cberr);
1011: cberr = cublasXaxpy(cublasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRCUBLAS(cberr);
1012: VecCUDARestoreArrayRead(xin,&xarray);
1013: VecCUDARestoreArray(yin,&yarray);
1014: err = WaitForCUDA();CHKERRCUDA(err);
1015: PetscLogGpuTimeEnd();
1016: PetscLogGpuFlops(3.0*xin->map->n);
1017: }
1018: return(0);
1019: }
1021: PetscErrorCode VecAXPBYPCZ_SeqCUDA(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
1022: {
1024: cudaError_t err;
1025: PetscInt n = zin->map->n;
1028: if (gamma == (PetscScalar)1.0) {
1029:     /* z = a*x + b*y + z */
1030: VecAXPY_SeqCUDA(zin,alpha,xin);
1031: VecAXPY_SeqCUDA(zin,beta,yin);
1032: PetscLogGpuFlops(4.0*n);
1033: } else {
1034: /* z = a*x + b*y + c*z */
1035: VecScale_SeqCUDA(zin,gamma);
1036: VecAXPY_SeqCUDA(zin,alpha,xin);
1037: VecAXPY_SeqCUDA(zin,beta,yin);
1038: PetscLogGpuFlops(5.0*n);
1039: }
1040: err = WaitForCUDA();CHKERRCUDA(err);
1041: return(0);
1042: }
1044: PetscErrorCode VecPointwiseMult_SeqCUDA(Vec win,Vec xin,Vec yin)
1045: {
1046: PetscInt n = win->map->n;
1047: const PetscScalar *xarray,*yarray;
1048: PetscScalar *warray;
1049: thrust::device_ptr<const PetscScalar> xptr,yptr;
1050: thrust::device_ptr<PetscScalar> wptr;
1051: PetscErrorCode ierr;
1052: cudaError_t err;
1055: VecCUDAGetArray(win,&warray);
1056: VecCUDAGetArrayRead(xin,&xarray);
1057: VecCUDAGetArrayRead(yin,&yarray);
1058: PetscLogGpuTimeBegin();
1059: try {
1060: wptr = thrust::device_pointer_cast(warray);
1061: xptr = thrust::device_pointer_cast(xarray);
1062: yptr = thrust::device_pointer_cast(yarray);
1063: thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
1064: err = WaitForCUDA();CHKERRCUDA(err);
1065: } catch (char *ex) {
1066: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1067: }
1068: PetscLogGpuTimeEnd();
1069: VecCUDARestoreArrayRead(xin,&xarray);
1070: VecCUDARestoreArrayRead(yin,&yarray);
1071: VecCUDARestoreArray(win,&warray);
1072: PetscLogGpuFlops(n);
1073: return(0);
1074: }
1076: /* should do infinity norm in cuda */
1078: PetscErrorCode VecNorm_SeqCUDA(Vec xin,NormType type,PetscReal *z)
1079: {
1080: PetscErrorCode ierr;
1081: PetscInt n = xin->map->n;
1082: PetscBLASInt one = 1, bn = 0;
1083: const PetscScalar *xarray;
1084: cublasHandle_t cublasv2handle;
1085: cublasStatus_t cberr;
1086: cudaError_t err;
1089: PetscCUBLASGetHandle(&cublasv2handle);
1090: PetscBLASIntCast(n,&bn);
1091: if (type == NORM_2 || type == NORM_FROBENIUS) {
1092: VecCUDAGetArrayRead(xin,&xarray);
1093: PetscLogGpuTimeBegin();
1094: cberr = cublasXnrm2(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1095: PetscLogGpuTimeEnd();
1096: VecCUDARestoreArrayRead(xin,&xarray);
1097: PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
1098: } else if (type == NORM_INFINITY) {
1099: int i;
1100: VecCUDAGetArrayRead(xin,&xarray);
1101: PetscLogGpuTimeBegin();
1102: cberr = cublasIXamax(cublasv2handle,bn,xarray,one,&i);CHKERRCUBLAS(cberr);
1103: PetscLogGpuTimeEnd();
1104: if (bn) {
1105: PetscScalar zs;
1106: err = cudaMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
1107: *z = PetscAbsScalar(zs);
1108: } else *z = 0.0;
1109: VecCUDARestoreArrayRead(xin,&xarray);
1110: } else if (type == NORM_1) {
1111: VecCUDAGetArrayRead(xin,&xarray);
1112: PetscLogGpuTimeBegin();
1113: cberr = cublasXasum(cublasv2handle,bn,xarray,one,z);CHKERRCUBLAS(cberr);
1114: VecCUDARestoreArrayRead(xin,&xarray);
1115: PetscLogGpuTimeEnd();
1116: PetscLogGpuFlops(PetscMax(n-1.0,0.0));
1117: } else if (type == NORM_1_AND_2) {
1118: VecNorm_SeqCUDA(xin,NORM_1,z);
1119: VecNorm_SeqCUDA(xin,NORM_2,z+1);
1120: }
1121: return(0);
1122: }
1124: PetscErrorCode VecDotNorm2_SeqCUDA(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
1125: {
1126: PetscErrorCode ierr;
1127: cudaError_t err;
1128: PetscReal n = s->map->n;
1129: const PetscScalar *sarray,*tarray;
1132: VecCUDAGetArrayRead(s,&sarray);
1133: VecCUDAGetArrayRead(t,&tarray);
1134: VecDot_SeqCUDA(s,t,dp);
1135: VecDot_SeqCUDA(t,t,nm);
1136: VecCUDARestoreArrayRead(s,&sarray);
1137: VecCUDARestoreArrayRead(t,&tarray);
1138: err = WaitForCUDA();CHKERRCUDA(err);
1139: PetscLogGpuFlops(4.0*n);
1140: return(0);
1141: }
1143: PetscErrorCode VecDestroy_SeqCUDA(Vec v)
1144: {
1146: cudaError_t err;
1149: if (v->spptr) {
1150: if (((Vec_CUDA*)v->spptr)->GPUarray_allocated) {
1151: err = cudaFree(((Vec_CUDA*)v->spptr)->GPUarray_allocated);CHKERRCUDA(err);
1152: ((Vec_CUDA*)v->spptr)->GPUarray_allocated = NULL;
1153: }
1154: if (((Vec_CUDA*)v->spptr)->stream) {
1155: err = cudaStreamDestroy(((Vec_CUDA*)v->spptr)->stream);CHKERRCUDA(err);
1156: }
1157: }
1158: VecDestroy_SeqCUDA_Private(v);
1159: PetscFree(v->spptr);
1160: return(0);
1161: }
1163: #if defined(PETSC_USE_COMPLEX)
1164: struct conjugate
1165: {
1166: __host__ __device__
1167: PetscScalar operator()(PetscScalar x)
1168: {
1169: return PetscConj(x);
1170: }
1171: };
1172: #endif
1174: PetscErrorCode VecConjugate_SeqCUDA(Vec xin)
1175: {
1176: #if defined(PETSC_USE_COMPLEX)
1177: PetscScalar *xarray;
1178: PetscErrorCode ierr;
1179: PetscInt n = xin->map->n;
1180: thrust::device_ptr<PetscScalar> xptr;
1181: cudaError_t err;
1184: VecCUDAGetArray(xin,&xarray);
1185: PetscLogGpuTimeBegin();
1186: try {
1187: xptr = thrust::device_pointer_cast(xarray);
1188: thrust::transform(xptr,xptr+n,xptr,conjugate());
1189: err = WaitForCUDA();CHKERRCUDA(err);
1190: } catch (char *ex) {
1191: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
1192: }
1193: PetscLogGpuTimeEnd();
1194: VecCUDARestoreArray(xin,&xarray);
1195: #else
1197: #endif
1198: return(0);
1199: }
1201: PetscErrorCode VecGetLocalVector_SeqCUDA(Vec v,Vec w)
1202: {
1204: cudaError_t err;
1211: if (w->data) {
1212: if (((Vec_Seq*)w->data)->array_allocated) {
1213: if (w->pinned_memory) {
1214: PetscMallocSetCUDAHost();
1215: }
1216: PetscFree(((Vec_Seq*)w->data)->array_allocated);
1217: if (w->pinned_memory) {
1218: PetscMallocResetCUDAHost();
1219: w->pinned_memory = PETSC_FALSE;
1220: }
1221: }
1222: ((Vec_Seq*)w->data)->array = NULL;
1223: ((Vec_Seq*)w->data)->unplacedarray = NULL;
1224: }
1225: if (w->spptr) {
1226: if (((Vec_CUDA*)w->spptr)->GPUarray) {
1227: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1228: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1229: }
1230:     if (((Vec_CUDA*)w->spptr)->stream) {
1231: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1232: }
1233: PetscFree(w->spptr);
1234: }
1236: if (v->petscnative) {
1237: PetscFree(w->data);
1238: w->data = v->data;
1239: w->offloadmask = v->offloadmask;
1240: w->pinned_memory = v->pinned_memory;
1241: w->spptr = v->spptr;
1242: PetscObjectStateIncrease((PetscObject)w);
1243: } else {
1244: VecGetArray(v,&((Vec_Seq*)w->data)->array);
1245: w->offloadmask = PETSC_OFFLOAD_CPU;
1246: VecCUDAAllocateCheck(w);
1247: }
1248: return(0);
1249: }
1251: PetscErrorCode VecRestoreLocalVector_SeqCUDA(Vec v,Vec w)
1252: {
1254: cudaError_t err;
1261: if (v->petscnative) {
1262: v->data = w->data;
1263: v->offloadmask = w->offloadmask;
1264: v->pinned_memory = w->pinned_memory;
1265: v->spptr = w->spptr;
1266: VecCUDACopyFromGPU(v);
1267: PetscObjectStateIncrease((PetscObject)v);
1268: w->data = 0;
1269: w->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1270: w->spptr = 0;
1271: } else {
1272: VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
1273: if ((Vec_CUDA*)w->spptr) {
1274: err = cudaFree(((Vec_CUDA*)w->spptr)->GPUarray);CHKERRCUDA(err);
1275: ((Vec_CUDA*)w->spptr)->GPUarray = NULL;
1276:       if (((Vec_CUDA*)w->spptr)->stream) {
1277: err = cudaStreamDestroy(((Vec_CUDA*)w->spptr)->stream);CHKERRCUDA(err);
1278: }
1279: PetscFree(w->spptr);
1280: }
1281: }
1282: return(0);
1283: }