1: /* A template file for the CUDA Programming Model (CUPM) initialization, to be included in init.c. CUPM is either CUDA or HIP. */
3: PetscBool PetscCUPMSynchronize = PETSC_FALSE;
4: PetscBool PetscCUPMInitialized = PETSC_FALSE;
6: static PetscBool PetscNotUseCUPM = PETSC_FALSE; /* Assert the code will not use this type of devices */
8: /* Device validation after it is lazily initialized */
9: static PetscErrorCode PetscCUPMValidate(void) 10: {
11: PetscBool mpi_gpu_awareness;
14: if (use_gpu_aware_mpi) {
15: #if defined(PETSC_HAVE_OMPI_MAJOR_VERSION) && defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
16: /* Trust OpenMPI's compile time gpu query interface */
17: mpi_gpu_awareness = PETSC_TRUE;
18: #else
19: /* For other MPI implementations without gpu query API, we do a GPU MPI call to see if it segfaults.
20: Note that Spectrum MPI sets OMPI_MAJOR_VERSION and is CUDA-aware, but does not have MPIX_CUDA_AWARE_SUPPORT.
21: */
22: mpi_gpu_awareness = PetscMPICUPMAwarenessCheck();
23: #endif
24: if (!mpi_gpu_awareness) {
25: (*PetscErrorPrintf)("PETSc is configured with GPU support, but your MPI is not GPU-aware. For better performance, please use a GPU-aware MPI.\n");
26: (*PetscErrorPrintf)("If you do not care, add option -use_gpu_aware_mpi 0. To not see the message again, add the option to your .petscrc, OR add it to the env var PETSC_OPTIONS.\n");
27: (*PetscErrorPrintf)("If you do care, for IBM Spectrum MPI on OLCF Summit, you may need jsrun --smpiargs=-gpu.\n");
28: (*PetscErrorPrintf)("For OpenMPI, you need to configure it --with-cuda (https://www.open-mpi.org/faq/?category=buildcuda)\n");
29: (*PetscErrorPrintf)("For MVAPICH2-GDR, you need to set MV2_USE_CUDA=1 (http://mvapich.cse.ohio-state.edu/userguide/gdr/)\n");
30: (*PetscErrorPrintf)("For Cray-MPICH, you need to set MPICH_RDMA_ENABLED_CUDA=1 (https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/)\n");
31: PETSCABORT(PETSC_COMM_SELF,PETSC_ERR_LIB);
32: }
33: }
34: return(0);
35: }
37: /*@C
38: PetscCUDAInitializeCheck - Check if CUDA is initialized. If not, initialize it.
40: Logically collective
42: Level: beginner
44: Notes:
45: In PETSc lazy device initialization, PETSc calls this function right before creating the first CUDA/HIP object.
46: It can be used by application developers who want to lazily initialize CUDA/HIP when they start to use it (which may before a PETSc CUDA/HIP object is created.)
48: .seealso: PetscCUDAInitialize(), PetscHIPInitialize(), PetscHIPInitializeCheck()
49: @*/
50: PETSC_EXTERN PetscErrorCodePetscCUDAInitializeCheck(void);
53: /*@C
54: PetscHIPInitializeCheck - Check if HIP is initialized. If not, initialize it.
56: Logically collective
58: Level: beginner
60: Notes:
61: See notes of PetscCUDAInitializeCheck() for details.
63: .seealso: PetscHIPInitialize(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
64: @*/
65: PETSC_EXTERN PetscErrorCodePetscCUDAInitializeCheck(void);
67: PetscErrorCode PetscCUPMInitializeCheck(void) 68: {
69: PetscErrorCode ierr;
70: cupmError_t cerr;
71: int devId,devCount;
72: PetscMPIInt rank;
73: static PetscBool cupmValdidateChecked = PETSC_FALSE;
76: if (PetscNotUseCUPM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"You asserted the code wouldn't use devices with -device_set none, but now trying to create a device object. Remove this option or see manpage of PetscCUPMInitialize().");
77: if (!PetscCUPMInitialized) {
78: cerr = cupmGetDeviceCount(&devCount);CHKERRCUPM(cerr);
79: if (devCount > 1) {
80: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
81: cupmGetLastError(); /* Reset the last error */
82: if (cerr == cupmSuccess) { /* It implies device runtime has not been initialized? */
83: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
84: devId = rank % devCount;
85: for (int i=0; i<3; i++) {
86: cerr = cupmSetDevice(devId);
87: if (cerr == cupmSuccess) break;
88: if (cerr != cupmErrorMemoryAllocation && cerr != cupmErrorLaunchOutOfResources) CHKERRCUPM(cerr);
89: if (i < 2) {PetscSleep(3);}
90: }
91: if (cerr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU_RESOURCE,"Unable to initialize the GPU");
92: } else if (cerr == cupmErrorSetOnActiveProcess) {
93: /* It implies user has initialized device runtime outside of petsc. We do nothing to respect the device choice. */
94: }
95: }
96: PetscCUPMBLASInitializeHandle();
97: PetscCUPMSOLVERDnInitializeHandle();
98: PetscCUPMInitialized = PETSC_TRUE;
99: #if defined(PETSC_HAVE_KOKKOS)
100: PetscKokkosInitialize_Private();
101: PetscBeganKokkos = PETSC_TRUE;
102: #endif
103: }
105: if (!cupmValdidateChecked) {
106: PetscCUPMValidate();
107: cupmValdidateChecked = PETSC_TRUE;
108: }
109: PetscCreatedGpuObjects = PETSC_TRUE;
110: return(0);
111: }
113: /*@C
114: PetscCUDAInitialize - Initializes CUDA (eagerly in PetscInitialize() or soon after PetscInitialize()) and cuBLAS/cuSPARSE libraries on the device
116: Logically collective
118: Input Parameter:
119: + comm - the MPI communicator that will utilize the devices
120: - device - the device assigned to current MPI process. Special values like PETSC_DECIDE or PETSC_DEFAULT have special meanings (see details below)
122: Options Database:
123: + -cuda_device <device> - the device assigned to current MPI rank. <device> is case-insensitive and can be:
124: NONE (or none, or -3) : the code will not use any device, otherwise it will error out;
125: PETSC_DEFAULT(or DEFAULT, or -2) : do not explicitly set device, i.e., use whatever device already set by user (probably before PetscInitialize()). Init device runtime etc;
126: PETSC_DECIDE (or DECIDE, or -1) : assign MPI ranks in comm to available devices in round-robin, and init device runtime etc on the selected device;
127: >= 0 integer : assign the device with this id to current MPI process. Error out if <device> is invalid. Init device runtime etc on this device;
128: With PETSC_{DECIDE, DEFAULT}, if there are actually no devices, the code can still run, but it will error out when trying to create device objects.
129: . -cuda_view - view information about the devices.
130: . -cuda_synchronize - wait at the end of asynchronize device calls so that their time gets credited to the current event. With -log_view, the default is true, otherwise false.
131: . -log_view - logging, however if alone or combined with `-cuda_set_device DEFAULT | DECIDE | >=0 int`, will init device; if combined with `-cuda_set_device none`, won't init device.
132: - -use_gpu_aware_mpi - assume the MPI is device/GPU-aware when communicating data on devices. Default true.
134: Level: beginner
136: Notes:
137: Unless the input parameter <device> = -3, this routine initializes the CUDA device. It also initializes the cuBLAS/cuSPARSE libraries, which
138: takes a lot of time. Initializing them early helps avoid skewing timings in -log_view.
140: If this routine is triggered by command line options, it is called in PetscInitialize(). If users want to directly call it, they should call it immediately after PetscInitialize().
142: If this is not called then the CUDA initialization is delayed until first creation of a CUDA object and this can affect the timing since they happen asynchronously on different nodes and take a lot of time.
144: .seealso: PetscCUDAInitializeCheck(), PetscHIPInitialize(), PetscHIPInitializeCheck()
145: @*/
146: PETSC_EXTERN PetscErrorCodePetscCUDAInitialize(MPI_Comm comm,PetscInt device);
148: /*@C
149: PetscHIPInitialize - Initializes HIP (eagerly in PetscInitialize() or soon after PetscInitialize()) and hipBLAS/hipSPARSE libraries on the device
151: Logically collective
153: Input Parameter:
154: (see notes)
156: Options Database:
157: (see notes)
159: Level: beginner
161: Notes:
162: The functionality, parameters and options database of this routine are similar to that of PetscCUDAInitialize(), except that the option names
163: are -hip_device, -hip_view, -hip_synchronize instead. See manpage of PetscCUDAInitialize() for details.
165: .seealso: PetscHIPInitializeCheck(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
166: @*/
167: PETSC_EXTERN PetscErrorCodePetscHIPInitialize(MPI_Comm comm,PetscInt device);
169: PetscErrorCode PetscCUPMInitialize(MPI_Comm comm,PetscInt device)170: {
171: PetscErrorCode ierr;
172: cupmError_t cerr;
173: int devId,devCount=0;
174: const PetscInt PETSC_NONE=-3; /* Unlike PETSC_DECIDE, we don't have a macro PETSC_NONE in petsc headers */
175: PetscMPIInt rank;
178: if (!PetscCUPMInitialized) {
179: cerr = cupmGetDeviceCount(&devCount);
180: cupmGetLastError(); /* Reset the last error */
181: if (cerr != cupmSuccess) devCount = 0;
182: if (device >= 0) { /* User wants to use this specific device */
183: cerr = cupmSetDeviceFlags(cupmDeviceMapHost); /* Allow it to fail since user might have already initialized the device. */
184: cupmGetLastError(); /* Reset the last error */
185: cerr = cupmSetDevice((int)device);CHKERRCUPM(cerr);
186: } else if (device == PETSC_DECIDE) { /* Assign MPI ranks to available devices in round-robin */
187: if (devCount > 0) { /* Allow no device as long as user does not use devices */
188: /* Set the device flags so that it can map host memory */
189: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);CHKERRCUPM(cerr);
190: MPI_Comm_rank(comm,&rank);
191: devId = rank % devCount;
192: cerr = cupmSetDevice(devId);CHKERRCUPM(cerr);
193: }
194: } else if (device == PETSC_DEFAULT) {
195: /* Do nothing, i.e., use whatever device set by user before PetscInitialize() */
196: } else if (device == PETSC_NONE) {
197: PetscNotUseCUPM = PETSC_TRUE; /* Assert the code won't use devices even there are */
198: } else SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Wrong device (%D) passed to -device_set <dev>. Must be NONE(-3),PETSC_DEFAULT(-2),PETSC_DECIDE(-1) or a non-negative integer.",device);
200: if (devCount > 0 && device != PETSC_NONE) {
201: /* Do costly device handles initialization here to not to distort petsc logging later */
202: PetscCUPMBLASInitializeHandle();
203: PetscCUPMSOLVERDnInitializeHandle();
204: PetscCUPMInitialized = PETSC_TRUE;
205: }
206: }
207: return(0);
208: }
210: /*
211: The routine works as a driver to initialize and view the device
213: Input Parameter:
214: initDevice: True if user explicitly has -cuda/hip_set_device xxx
215: device: Significant when <initDeivce>. Basically, it is the integer presentation of the xxx above
216: logView: True if -log_view or -log_summary
217: devView: True if -{cuda,hip}_view
218: */
219: static PetscErrorCode PetscCUPMInitializeAndView(PetscBool initDevice,PetscInt device,PetscBool logView,PetscBool devView)220: {
221: PetscErrorCode ierr;
222: cupmError_t cerr;
223: PetscMPIInt rank;
224: int devId,devCount;
225: cupmDeviceProp prop;
228: PetscCUPMSynchronize = logView;
229: if (initDevice) {PetscCUPMInitialize(PETSC_COMM_WORLD,device);}
230: else if (logView) { /* With -log_view, we want to do costly gpu runtime initialization early so that not to distort the timing later. */
231: devCount = 0;
232: cerr = cupmGetDeviceCount(&devCount);
233: cupmGetLastError(); /* Reset the last error */
234: if (cerr == cupmSuccess && devCount >= 1) { /* There are devices */
235: devId = 0;
236: if (devCount > 1) { /* Decide which device to init when there are multiple */
237: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
238: cupmGetLastError(); /* Reset the last error */
239: if (cerr == cupmSuccess) { /* It implies gpu runtime has not been initialized */
240: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
241: devId = rank % devCount;
242: cerr = cupmSetDevice(devId);CHKERRCUPM(cerr);
243: } else if (cerr == cupmErrorSetOnActiveProcess) {
244: /* It means user initialized gpu runtime outside of petsc. We respect the device choice. */
245: cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
246: }
247: }
248: PetscCUPMInitialize(PETSC_COMM_WORLD,(PetscInt)devId);
249: }
250: }
252: if (devView) {
253: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
254: cerr = cupmGetDeviceCount(&devCount);CHKERRCUPM(cerr);
255: for (devId = 0; devId < devCount; ++devId) {
256: cerr = cupmGetDeviceProperties(&prop,devId);CHKERRCUPM(cerr);
257: PetscPrintf(PETSC_COMM_WORLD, "device %d: %s\n", devId, prop.name);
258: }
259: cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
260: PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Using device %d.\n",rank,devId);
261: PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT);
262: }
263: return(0);
264: }
266: /*
267: The routine checks user's device related options and initializes the device if instructed.
269: Input Parameter:
270: logView: True if -log_view or -log_summary
271: */
272: static PetscErrorCode PetscOptionsCheckCUPM(PetscBool logView)273: {
275: PetscBool initDevice = PETSC_FALSE,devView = PETSC_FALSE,devNone = PETSC_FALSE;
276: PetscInt device = 0;
277: char devStr[32]={0};
278: #if defined(PETSC_HAVE_KOKKOS)
279: PetscBool set,kinited,devDefault;
280: #endif
283: #if defined(PETSC_HAVE_KOKKOS)
284: PetscKokkosIsInitialized_Private(&kinited);
285: if (kinited) { /* Check if Petsc device options conform with Kokkos' device */
286: PetscOptionsGetString(NULL,NULL,cupmSetDeviceStr,devStr,sizeof(devStr),&set);
287: if (set) { /* If users have initialized Kokkos themselves, but also had e.g., -cuda_set_device XXX, for simplicity, make sure XXX is DEFAULT */
288: PetscStrcasecmp("DEFAULT",devStr,&devDefault);
289: if (!devDefault) {PetscStrcasecmp("PETSC_DEFAULT",devStr,&devDefault);}
290: if (!devDefault) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Kokkos was initialized before PetscInitialize(), but you have "cupmSetDeviceStr" %s. Remove the option or use "cupmSetDeviceStr" default.",devStr);
291: } else { /* If users did not have e.g., '-cuda_set_device XXX', insert one here so that petsc can continue its own device initialization */
292: PetscOptionsSetValue(NULL,cupmSetDeviceStr,"DEFAULT");
293: }
294: }
295: #endif
297: PetscOptionsBegin(PETSC_COMM_WORLD,NULL,cupmOptionsStr,"Sys");
298: PetscOptionsString(cupmSetDeviceStr,NULL,PetscCUPMInitializeStr,devStr,devStr,sizeof(devStr),&initDevice);
299: PetscStrcasecmp("none",devStr,&devNone);
300: if (devNone) device = -3; /* -3 is the locally used PETSC_NONE in Petsc{CUDA/HIP}Initialize() */
301: else {PetscOptionsInt(cupmSetDeviceStr,"Set which MPI ranks to use which devices",PetscCUPMInitializeStr,device,&device,&initDevice);}
302: PetscOptionsBool(cupmSynchronizeStr,"Wait for the device to complete operations before returning to the CPU (on by default with -log_summary or -log_view)",NULL,PetscCUPMSynchronize,&PetscCUPMSynchronize,NULL);
303: PetscOptionsName(cupmViewStr,"Display device information and assignments",NULL,&devView);
304: PetscOptionsEnd();
305: PetscCUPMInitializeAndView(initDevice,device,logView,devView);
307: #if defined(PETSC_HAVE_KOKKOS)
308: if (PetscCUPMInitialized && !kinited) {
309: PetscKokkosInitialize_Private();
310: PetscBeganKokkos = PETSC_TRUE;
311: }
312: #endif
313: return(0);
314: }