-
Notifications
You must be signed in to change notification settings - Fork 0
/
GPUtransfer-basic.cu
110 lines (90 loc) · 4.14 KB
/
GPUtransfer-basic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <vector>
// Transfer one checkpoint shard between two GPUs on the same node and measure the time
// CHECK_CUDA(call): evaluate a CUDA runtime API call and, if it does not
// return cudaSuccess, print a file:line diagnostic with the driver's error
// string and terminate the process. The do/while(0) wrapper makes the macro
// behave as a single statement so it is safe inside unbraced if/else.
// Note: `call` is parenthesized at expansion so operator precedence inside
// the argument cannot change the meaning of the assignment.
#define CHECK_CUDA(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " - " << cudaGetErrorString(err) << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Load a raw binary checkpoint (a flat dump of float values) from `filename`
// into `weights`, replacing any previous contents.
//
// Exits the process with EXIT_FAILURE if the file cannot be opened, its size
// cannot be determined, its size is not a whole number of floats (truncated or
// corrupt checkpoint), or the read itself fails.
//
// Fix over the previous version: the old code resized `weights` to
// fileSize / sizeof(float) elements (rounded down) but then read the full
// fileSize bytes, overflowing the buffer by up to sizeof(float)-1 bytes when
// the file size was not a multiple of sizeof(float). The size is now
// validated before reading.
void loadCheckpointFromDisk(const std::string& filename, std::vector<float>& weights) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Failed to open checkpoint file: " << filename << std::endl;
        exit(EXIT_FAILURE);
    }
    file.seekg(0, std::ios::end);
    std::streamoff endPos = file.tellg();
    if (endPos < 0) {
        std::cerr << "Failed to determine size of checkpoint file: " << filename << std::endl;
        exit(EXIT_FAILURE);
    }
    size_t fileSize = static_cast<size_t>(endPos);
    if (fileSize % sizeof(float) != 0) {
        std::cerr << "Checkpoint file size (" << fileSize
                  << " bytes) is not a multiple of sizeof(float); file may be corrupt: "
                  << filename << std::endl;
        exit(EXIT_FAILURE);
    }
    file.seekg(0, std::ios::beg);
    weights.resize(fileSize / sizeof(float));
    // Safe: fileSize is an exact multiple of sizeof(float), so the buffer
    // holds exactly fileSize bytes.
    file.read(reinterpret_cast<char*>(weights.data()), static_cast<std::streamsize>(fileSize));
    if (!file) {
        std::cerr << "Failed to read checkpoint file: " << filename << std::endl;
        exit(EXIT_FAILURE);
    }
    std::cout << "Loaded checkpoint of size " << weights.size() * sizeof(float) << " bytes in CPU memory." << std::endl;
}
// Transfer one checkpoint shard from GPU 0 to GPU 1 on the same node and
// report the elapsed time (and effective bandwidth) of the device-to-device
// copy. Requires at least two GPUs with peer (P2P) access between them.
int main() {
    // Path to the checkpoint shard: a raw dump of float values.
    const std::string checkpointFile = "/work/hdd/bdof/nkanamarla/models/LLAMA3checkpointbinformat/LLAMA3checkpoint.bin";

    // Stage the full shard in host memory.
    std::vector<float> weights;
    loadCheckpointFromDisk(checkpointFile, weights);

    // A peer-to-peer copy needs two devices.
    int deviceCount;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < 2) {
        std::cerr << "This program requires at least two GPUs." << std::endl;
        return EXIT_FAILURE;
    }
    int srcDevice = 0;
    int dstDevice = 1;

    // Allocate the source buffer and upload the weights (host-to-device).
    // NOTE(review): this is pageable host memory; pinned memory
    // (cudaMallocHost) would speed up this H2D staging copy, but it is not
    // part of the timed experiment below.
    CHECK_CUDA(cudaSetDevice(srcDevice));
    float* d_srcWeights;
    size_t dataSize = weights.size() * sizeof(float);
    CHECK_CUDA(cudaMalloc(&d_srcWeights, dataSize));
    CHECK_CUDA(cudaMemcpy(d_srcWeights, weights.data(), dataSize, cudaMemcpyHostToDevice));
    std::cout << "Checkpoint transferred to GPU " << srcDevice << " of size " << dataSize << " bytes." << std::endl;

    // Enable peer access so cudaMemcpyPeer can take a direct P2P path.
    // cudaDeviceEnablePeerAccess grants the *current* device access to the
    // named peer, so we must be on dstDevice to establish the same
    // dstDevice -> srcDevice direction that cudaDeviceCanAccessPeer queried.
    // (The previous code enabled the opposite direction, from srcDevice.)
    int canAccessPeer;
    CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, dstDevice, srcDevice));
    if (canAccessPeer) {
        CHECK_CUDA(cudaSetDevice(dstDevice));
        CHECK_CUDA(cudaDeviceEnablePeerAccess(srcDevice, 0));
    } else {
        std::cerr << "Peer access not supported between GPU " << srcDevice << " and GPU " << dstDevice << "." << std::endl;
        return EXIT_FAILURE;
    }

    // Destination buffer on the second GPU.
    CHECK_CUDA(cudaSetDevice(dstDevice));
    float* d_dstWeights;
    CHECK_CUDA(cudaMalloc(&d_dstWeights, dataSize));

    // Time the GPU-to-GPU copy with CUDA events on the default stream of the
    // current (destination) device.
    std::cout << "Begin experiment simulating GPU cache checkpointing from GPU " << srcDevice << " to GPU " << dstDevice << "." << std::endl;
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaEventCreate(&start));
    CHECK_CUDA(cudaEventCreate(&stop));
    CHECK_CUDA(cudaEventRecord(start, 0));
    CHECK_CUDA(cudaMemcpyPeer(d_dstWeights, dstDevice, d_srcWeights, srcDevice, dataSize));
    CHECK_CUDA(cudaEventRecord(stop, 0));
    CHECK_CUDA(cudaEventSynchronize(stop));
    float milliseconds = 0;
    CHECK_CUDA(cudaEventElapsedTime(&milliseconds, start, stop));
    std::cout << "GPU-to-GPU transfer took " << milliseconds << " ms." << std::endl;
    if (milliseconds > 0.0f) {
        // bytes / ms * 1e3 -> bytes/s; divide by 1e9 -> GB/s.
        double gbPerSec = (static_cast<double>(dataSize) / 1e9) / (milliseconds / 1e3);
        std::cout << "Effective bandwidth: " << gbPerSec << " GB/s." << std::endl;
    }

    // Cleanup. cudaFree accepts pointers from either device under unified
    // addressing; peer access is disabled from dstDevice, mirroring how it
    // was enabled above.
    CHECK_CUDA(cudaFree(d_srcWeights));
    CHECK_CUDA(cudaFree(d_dstWeights));
    CHECK_CUDA(cudaEventDestroy(start));
    CHECK_CUDA(cudaEventDestroy(stop));
    CHECK_CUDA(cudaSetDevice(dstDevice));
    CHECK_CUDA(cudaDeviceDisablePeerAccess(srcDevice));
    std::cout << "Cleanup complete. Exiting program." << std::endl;
    return 0;
}