Skip to content

Commit e528cfd

Browse files
sergeisakov and mhucka authored
Add options for cuStateVecEx. (#1007)
This pull request adds the possibility to set options for cuStateVecEx and also refactors how options are passed from Python to the C++ backends in the pybind interface. --------- Co-authored-by: Michael Hucka <mhucka@google.com>
1 parent 73e9809 commit e528cfd

23 files changed

Lines changed: 313 additions & 320 deletions

apps/qsim_base_custatevecex.cu

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,19 @@ struct Options {
3232
std::string circuit_file;
3333
unsigned maxtime = std::numeric_limits<unsigned>::max();
3434
unsigned seed = 1;
35+
unsigned lbuf = 30;
3536
unsigned verbosity = 0;
3637
};
3738

3839
Options GetOptions(int argc, char* argv[]) {
39-
constexpr char usage[] = "usage:\n ./qsim_base -c circuit -d maxtime "
40-
"-s seed -v verbosity\n";
40+
constexpr char usage[] = "usage:\n ./qsim_base_custatevecex.x -c circuit "
41+
"-d maxtime -s seed -l lbuf -v verbosity\n";
4142

4243
Options opt;
4344

4445
int k;
4546

46-
while ((k = getopt(argc, argv, "c:d:s:v:")) != -1) {
47+
while ((k = getopt(argc, argv, "c:d:s:l:v:")) != -1) {
4748
switch (k) {
4849
case 'c':
4950
opt.circuit_file = optarg;
@@ -54,6 +55,9 @@ Options GetOptions(int argc, char* argv[]) {
5455
case 's':
5556
opt.seed = std::atoi(optarg);
5657
break;
58+
case 'l':
59+
opt.lbuf = std::atoi(optarg);
60+
break;
5761
case 'v':
5862
opt.verbosity = std::atoi(optarg);
5963
break;
@@ -112,8 +116,14 @@ int main(int argc, char* argv[]) {
112116
using Simulator = qsim::SimulatorCuStateVecEx<fp_type>;
113117
using StateSpace = Simulator::StateSpace;
114118

115-
explicit Factory(unsigned verbosity = 0) : verbosity(verbosity) {
116-
mp.initialize();
119+
explicit Factory(uint64_t transfer_buffer_size, unsigned verbosity = 0)
120+
: verbosity(verbosity) {
121+
MultiProcessCuStateVecEx::Parameter param = {transfer_buffer_size};
122+
mp.Initialize(param);
123+
124+
if (verbosity > 2 && mp.Initialized()) {
125+
qsim::IO::messagef("# transfer_buf_size=%lu\n", transfer_buffer_size);
126+
}
117127
}
118128

119129
StateSpace CreateStateSpace() const {
@@ -136,7 +146,7 @@ int main(int argc, char* argv[]) {
136146
using State = StateSpace::State;
137147
using Runner = CuStateVecExRunner<IO, Factory>;
138148

139-
Factory factory(opt.verbosity);
149+
Factory factory(uint64_t{1} << opt.lbuf, opt.verbosity);
140150

141151
StateSpace state_space = factory.CreateStateSpace();
142152
State state = state_space.Create(circuit.num_qubits);

docs/cirq_interface.md

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,20 +183,29 @@ To compile with the NVIDIA cuStateVec library (v1.0.0 or higher is required),
183183
set the environment variable `CUQUANTUM_ROOT` to the path to the cuStateVec
184184
library.
185185

186-
`QSimOptions` provides five parameters to configure GPU execution. `use_gpu`
186+
`QSimOptions` provides six parameters to configure GPU execution. `use_gpu`
187187
is required to enable GPU execution:
188188
* `use_gpu`: if True, use GPU instead of CPU for simulation.
189189
* `gpu_mode`: use CUDA if set to 0 (default value), use the NVIDIA cuStateVec
190190
library if set to 1, or use the NVIDIA cuStateVecEx library if set to any other value.
191191

192192
In the case of the NVIDIA cuStateVecEx library, simulations can be performed
193-
in multi-device / multi-node environments.
193+
in multi-device / multi-node environments. A CUDA-aware MPI library is required
194+
for multi-node execution. Currently, only Open MPI is supported.
194195

195-
If `use_gpu` is set and `gpu_mode` is set to 0, the remaining parameters can
196+
If `use_gpu` is set and `gpu_mode` is set to 0, two parameters can
196197
optionally be set to fine-tune StateSpace performance for a specific device.
197198
In most cases, the default values provide good performance.
198199
* `gpu_state_threads`: number of threads per CUDA block to use for the GPU
199200
StateSpace. This must be a power of 2 in the range [32, 1024].
200201
* `gpu_data_blocks`: number of data blocks to use for the GPU StateSpace.
201202
Below 16 data blocks, performance is noticeably reduced.
202203

204+
If `use_gpu` is set and `gpu_mode` is set to 2 or greater (cuStateVecEx), two
205+
parameters can be set to adjust the transfer buffer size for MPI communication
206+
and to select the network type.
207+
* `gpu_cusvex_log_buf_size`: log2 of the buffer size. Default value is 30,
208+
i.e. the buffer size is 2^30 bytes.
209+
* `gpu_cusvex_network_type`: for multi-device, the device network type:
210+
0=Switch (default), 1=FullMesh. For multi-process, the layered network type:
211+
0=SuperPOD (default), 1=GB200NVL, 2=SwitchTree, 3=Communicator.

lib/multiprocess_custatevecex.h

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,11 @@ struct MultiProcessCuStateVecEx {
3939
};
4040

4141
struct Parameter {
42-
uint64_t transfer_buffer_size = 16777216;
42+
uint64_t transfer_buffer_size = uint64_t{1} << 30;
4343
NetworkType network_type = kSuperPod;
4444
};
4545

46-
MultiProcessCuStateVecEx(Parameter param = Parameter{16777216, kSuperPod})
47-
: param_(param), communicator_(nullptr), initialized_(false) {}
46+
MultiProcessCuStateVecEx() : communicator_(nullptr), initialized_(false) {}
4847

4948
~MultiProcessCuStateVecEx() {
5049
if (communicator_) {
@@ -55,23 +54,29 @@ struct MultiProcessCuStateVecEx {
5554
custatevecExCommunicatorFinalize(&status);
5655
}
5756

58-
custatevecExCommunicatorDescriptor_t communicator() const {
57+
custatevecExCommunicatorDescriptor_t Communicator() const {
5958
return communicator_;
6059
}
6160

62-
unsigned num_processes() const {
61+
unsigned NumProcesses() const {
6362
return num_processes_;
6463
}
6564

66-
unsigned rank() const {
65+
unsigned Rank() const {
6766
return rank_;
6867
}
6968

70-
bool initialized() const {
69+
static bool ValidNetworkType(unsigned network_type) {
70+
return network_type < 4;
71+
}
72+
73+
bool Initialized() const {
7174
return initialized_;
7275
}
7376

74-
void initialize() {
77+
void Initialize(Parameter param) {
78+
param_ = param;
79+
7580
int argc = 0;
7681
char** argv = nullptr;
7782

@@ -109,7 +114,7 @@ struct MultiProcessCuStateVecEx {
109114
num_global_qubits_ = get_num_global_qubits(num_processes);
110115

111116
unsigned num_acc_global_qubits = 0;
112-
auto network_layers = get_network_layers(param_.network_type);
117+
auto network_layers = GetNetworkLayers(param_.network_type);
113118

114119
num_global_qubits_per_layer_.reserve(2);
115120
global_index_bit_classes_.reserve(2);
@@ -149,7 +154,7 @@ struct MultiProcessCuStateVecEx {
149154
initialized_ = true;
150155
}
151156

152-
auto create_sv_config(unsigned num_qubits, cudaDataType_t data_type) const {
157+
auto CreateSVConfig(unsigned num_qubits, cudaDataType_t data_type) const {
153158
custatevecExDictionaryDescriptor_t sv_config = nullptr;
154159

155160
if (!initialized_ ||
@@ -187,7 +192,7 @@ struct MultiProcessCuStateVecEx {
187192

188193
using NetworkLayers = std::vector<NetworkLayer>;
189194

190-
static NetworkLayers get_network_layers(NetworkType id) {
195+
static NetworkLayers GetNetworkLayers(NetworkType id) {
191196
switch (id) {
192197
case kSuperPod:
193198
return {{CUSTATEVEC_EX_GLOBAL_INDEX_BIT_CLASS_INTERPROC_P2P, 3},

lib/statespace_custatevecex.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ class StateSpaceCuStateVecEx :
130130
unsigned required_rank = k / size;
131131

132132
if (state.distr_type() != Base::kMultiProcess
133-
|| Base::mp.rank() == required_rank) {
133+
|| Base::mp.Rank() == required_rank) {
134134
ErrorCheck(custatevecExStateVectorGetState(
135135
state.get(), buf, kStateDataType, k, k + 1, 1));
136136
}
@@ -139,7 +139,7 @@ class StateSpaceCuStateVecEx :
139139

140140
if (state.distr_type() == Base::kMultiProcess) {
141141
auto cuda_type = GetCudaType<std::complex<fp_type>>();
142-
auto comm = Base::mp.communicator();
142+
auto comm = Base::mp.Communicator();
143143
ErrorCheck(comm->intf->bcast(comm, buf, 1, cuda_type, required_rank));
144144
}
145145

@@ -161,7 +161,7 @@ class StateSpaceCuStateVecEx :
161161
unsigned required_rank = k / size;
162162

163163
if (state.distr_type() != Base::kMultiProcess
164-
|| Base::mp.rank() == required_rank) {
164+
|| Base::mp.Rank() == required_rank) {
165165
ErrorCheck(custatevecExStateVectorSetState(
166166
state.get(), buf, kStateDataType, k, k + 1, 1));
167167
}
@@ -183,7 +183,7 @@ class StateSpaceCuStateVecEx :
183183
unsigned required_rank = k / size;
184184

185185
if (state.distr_type() != Base::kMultiProcess
186-
|| Base::mp.rank() == required_rank) {
186+
|| Base::mp.Rank() == required_rank) {
187187
ErrorCheck(custatevecExStateVectorSetState(
188188
state.get(), buf, kStateDataType, k, k + 1, 1));
189189
}

lib/vectorspace_custatevecex.h

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,7 @@ class VectorSpaceCuStateVecEx {
245245
ResultType local_r = callback(k, res);
246246

247247
auto cuda_type = GetCudaType<ResultType>();
248-
auto comm = mp_->communicator();
248+
auto comm = mp_->Communicator();
249249
ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
250250

251251
return r;
@@ -338,7 +338,7 @@ class VectorSpaceCuStateVecEx {
338338
ResultType local_r = callback(k, res1, res2);
339339

340340
auto cuda_type = GetCudaType<ResultType>();
341-
auto comm = mp_->communicator();
341+
auto comm = mp_->Communicator();
342342
ErrorCheck(comm->intf->allreduce(comm, &local_r, &r, 1, cuda_type));
343343

344344
return r;
@@ -385,16 +385,16 @@ class VectorSpaceCuStateVecEx {
385385
Vector Create(unsigned num_qubits) const {
386386
custatevecExStateVectorDescriptor_t state_vec;
387387
custatevecExDictionaryDescriptor_t sv_config
388-
= mp.create_sv_config(num_qubits, kStateDataType);
388+
= mp.CreateSVConfig(num_qubits, kStateDataType);
389389

390390
unsigned num_substates = 1;
391391
DistributionType distr_type = kNoDistr;
392392

393393
if (sv_config != nullptr) {
394394
ErrorCheck(custatevecExStateVectorCreateMultiProcess(
395-
&state_vec, sv_config, nullptr, mp.communicator(), nullptr));
395+
&state_vec, sv_config, nullptr, mp.Communicator(), nullptr));
396396

397-
num_substates = mp.num_processes();
397+
num_substates = mp.NumProcesses();
398398
distr_type = kMultiProcess;
399399

400400
if (param.verbosity > 2) {
@@ -480,6 +480,10 @@ class VectorSpaceCuStateVecEx {
480480
return vector.get() == nullptr;
481481
}
482482

483+
static bool ValidDeviceNetworkType(unsigned network_type) {
484+
return network_type < 2;
485+
}
486+
483487
bool Copy(const Vector& src, Vector& dest) const {
484488
if (src.num_qubits() != dest.num_qubits()) {
485489
return false;
@@ -503,15 +507,15 @@ class VectorSpaceCuStateVecEx {
503507
bool Copy(const Vector& src, fp_type* dest) const {
504508
if (src.distr_type() == kMultiProcess) {
505509
uint64_t size = (uint64_t{1} << src.num_qubits()) / src.num_substates();
506-
uint64_t offset = size * mp.rank();
510+
uint64_t offset = size * mp.Rank();
507511

508512
ErrorCheck(custatevecExStateVectorGetState(
509513
src.get(), dest + 2 * offset, kStateDataType,
510514
offset, offset + size, 1));
511515
ErrorCheck(custatevecExStateVectorSynchronize(src.get()));
512516

513517
auto cuda_type = GetCudaType<std::complex<fp_type>>();
514-
auto comm = mp.communicator();
518+
auto comm = mp.Communicator();
515519
ErrorCheck(comm->intf->allgather(
516520
comm, dest + 2 * offset, dest, size, cuda_type));
517521
} else {
@@ -529,7 +533,7 @@ class VectorSpaceCuStateVecEx {
529533
bool Copy(const fp_type* src, Vector& dest) const {
530534
if (dest.distr_type() == kMultiProcess) {
531535
uint64_t size = (uint64_t{1} << dest.num_qubits()) / dest.num_substates();
532-
uint64_t offset = size * mp.rank();
536+
uint64_t offset = size * mp.Rank();
533537

534538
ErrorCheck(custatevecExStateVectorSetState(
535539
dest.get(), src + 2 * offset, kStateDataType,
@@ -560,7 +564,7 @@ class VectorSpaceCuStateVecEx {
560564

561565
if (dest.distr_type() == kMultiProcess) {
562566
size /= dest.num_substates();
563-
uint64_t offset = size * mp.rank();
567+
uint64_t offset = size * mp.Rank();
564568

565569
ErrorCheck(custatevecExStateVectorSetState(
566570
dest.get(), src + 2 * offset, kStateDataType,

pybind_interface/avx2/pybind_main_avx2.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@ namespace qsim {
2727
using Simulator = SimulatorAVX<For>;
2828

2929
struct Factory {
30-
// num_state_threads and num_dblocks are unused, but kept for consistency
31-
// with the GPU Factory.
32-
Factory(
33-
unsigned num_sim_threads,
34-
unsigned num_state_threads,
35-
unsigned num_dblocks) : num_threads(num_sim_threads) {}
30+
explicit Factory(const py::dict& options) {
31+
num_threads = ParseOptions<unsigned>(options, "t\0");
32+
}
3633

3734
using Simulator = qsim::Simulator<For>;
3835
using StateSpace = Simulator::StateSpace;

pybind_interface/avx512/pybind_main_avx512.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@ namespace qsim {
2727
using Simulator = SimulatorAVX512<For>;
2828

2929
struct Factory {
30-
// num_state_threads and num_dblocks are unused, but kept for consistency
31-
// with the GPU Factory.
32-
Factory(
33-
unsigned num_sim_threads,
34-
unsigned num_state_threads,
35-
unsigned num_dblocks) : num_threads(num_sim_threads) {}
30+
explicit Factory(const py::dict& options) {
31+
num_threads = ParseOptions<unsigned>(options, "t\0");
32+
}
3633

3734
using Simulator = qsim::Simulator<For>;
3835
using StateSpace = Simulator::StateSpace;

pybind_interface/basic/pybind_main_basic.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,9 @@ namespace qsim {
2727
using Simulator = SimulatorBasic<For>;
2828

2929
struct Factory {
30-
// num_state_threads and num_dblocks are unused, but kept for consistency
31-
// with the GPU Factory.
32-
Factory(
33-
unsigned num_sim_threads,
34-
unsigned num_state_threads,
35-
unsigned num_dblocks) : num_threads(num_sim_threads) {}
30+
explicit Factory(const py::dict& options) {
31+
num_threads = ParseOptions<unsigned>(options, "t\0");
32+
}
3633

3734
using Simulator = qsim::Simulator<For>;
3835
using StateSpace = Simulator::StateSpace;

pybind_interface/cuda/pybind_main_cuda.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,9 @@ namespace qsim {
2424
using Simulator = SimulatorCUDA<float>;
2525

2626
struct Factory {
27-
Factory(
28-
unsigned num_sim_threads,
29-
unsigned num_state_threads,
30-
unsigned num_dblocks
31-
) {
32-
ss_params.num_threads = num_state_threads;
33-
ss_params.num_dblocks = num_dblocks;
27+
explicit Factory(const py::dict& options) {
28+
ss_params.num_threads = ParseOptions<unsigned>(options, "gsst\0");
29+
ss_params.num_dblocks = ParseOptions<unsigned>(options, "gdb\0");
3430
}
3531

3632
using Simulator = qsim::Simulator;

pybind_interface/custatevec/CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,10 @@ include_directories($ENV{CUQUANTUM_ROOT}/include)
4545
link_directories($ENV{CUQUANTUM_ROOT}/lib $ENV{CUQUANTUM_ROOT}/lib64)
4646

4747
add_library(qsim_custatevec MODULE pybind_main_custatevec.cpp)
48-
target_link_libraries(qsim_custatevec -lcustatevec -lcublas)
49-
5048
set_target_properties(qsim_custatevec PROPERTIES
5149
PREFIX "${PYTHON_MODULE_PREFIX}"
5250
SUFFIX "${PYTHON_MODULE_EXTENSION}"
5351
)
5452
set_source_files_properties(pybind_main_custatevec.cpp PROPERTIES LANGUAGE CUDA)
5553

56-
target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config)
54+
target_link_libraries(qsim_custatevec PRIVATE qsim_openmp_config -lcustatevec -lcublas)

0 commit comments

Comments
 (0)