Skip to content

Commit 7436cb8

Browse files
committed
Fix Parallel for MacOs
1 parent 3296a09 commit 7436cb8

1 file changed

Lines changed: 101 additions & 42 deletions

File tree

include/CXXGraph/Utility/Parallel.hpp

Lines changed: 101 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,100 +18,159 @@
1818
/***********************************************************/
1919

2020
#pragma once
21+
22+
// ── Standard includes (always available) ──────────────────────────────────────
2123
#include <algorithm>
22-
#include <execution>
2324
#include <functional>
2425
#include <numeric>
25-
#ifdef CXXGRAPH_WITH_OPENMP
26-
#include <omp.h>
26+
#include <vector>
27+
28+
// ── Backend selection ─────────────────────────────────────────────────────────
29+
//
30+
// Three backends are supported, in priority order:
31+
//
32+
// 1. OpenMP — activated by -DCXXGRAPH_WITH_OPENMP.
33+
// Uses #pragma omp for all loop primitives.
34+
// parallel_sort falls back to sequential std::sort because
35+
// OpenMP provides no parallel sort primitive.
36+
//
37+
// 2. PSTL/TBB — activated automatically when <execution> + a parallel
38+
// runtime (Intel oneTBB) are present, detected via the
39+
// standard feature-test macro __cpp_lib_parallel_algorithm.
40+
// Uses std::execution::par_unseq for all primitives.
41+
//
42+
// 3. Sequential — fallback when neither OpenMP nor PSTL is available
43+
// (e.g. Apple Clang without TBB, or any platform lacking a
44+
// parallel STL runtime). All primitives run single-threaded.
45+
//
46+
// IMPORTANT: Do NOT use std::execution::par_unseq unconditionally.
47+
// Apple Clang / libc++ on macOS does NOT provide parallel execution policies
48+
// unless TBB is linked AND detected via __cpp_lib_parallel_algorithm.
49+
// Using std::execution::par_unseq without that check is a hard compile error
50+
// on macOS (including arm64) and on any libc++ build without a PSTL backend.
51+
52+
#if defined(CXXGRAPH_WITH_OPENMP)
53+
// ── OpenMP path ───────────────────────────────────────────────────────────────
54+
#include <omp.h>
55+
// PSTL is not needed and must not be assumed present in OpenMP mode.
56+
#define CXXGRAPH_HAS_PSTL 0
57+
2758
#else
28-
#include <thread>
29-
#endif
59+
// ── PSTL / sequential path ────────────────────────────────────────────────────
60+
#include <thread>
61+
// <algorithm> already included above; it defines __cpp_lib_parallel_algorithm
62+
// when the implementation ships a parallel STL backend.
63+
// <execution> provides the std::execution policy objects themselves.
64+
#include <execution>
65+
66+
// __cpp_lib_parallel_algorithm is defined by the standard library in
67+
// <algorithm> (and others) when parallel execution policies are available.
68+
// Value 201603L means the feature is present per P0024R2 (C++17).
69+
//
70+
// Compilers/platforms that define this but do NOT actually parallelise
71+
// (e.g. libstdc++ without TBB) will fall back to sequential at runtime,
72+
// which is still correct and safe.
73+
#if defined(__cpp_lib_parallel_algorithm) && \
74+
__cpp_lib_parallel_algorithm >= 201603L
75+
#define CXXGRAPH_HAS_PSTL 1
76+
#else
77+
#define CXXGRAPH_HAS_PSTL 0
78+
#endif
79+
80+
#endif // CXXGRAPH_WITH_OPENMP
3081

3182
namespace CXXGraph {
3283
namespace Parallel {
3384

34-
// ── parallel_for_each ────────────────────────────────────────────────────────
85+
// ── parallel_for_each ────────────────────────────────────────────────────────
3586
// Applies f to every element of [first, last) in parallel.
3687
// The execution order is unspecified; f must not write shared state without
37-
// synchronisation.
38-
88+
// explicit synchronisation.
3989
template <typename RandomIt, typename UnaryFn>
4090
void parallel_for_each(RandomIt first, RandomIt last, UnaryFn f) {
// Calls f(*it) for every iterator in [first, last). Exactly one of the three
// branches below is compiled in, selected by CXXGRAPH_WITH_OPENMP and
// CXXGRAPH_HAS_PSTL; f is taken by value and nothing is returned.
41-
#ifdef CXXGRAPH_WITH_OPENMP
42-
#pragma omp parallel for schedule(dynamic)
91+
#if defined(CXXGRAPH_WITH_OPENMP)
92+
#pragma omp parallel for schedule(dynamic)
4393
for (auto it = first; it < last; ++it) f(*it);
// The `it < last` comparison means this branch only compiles for iterators
// that provide operator< (i.e. random-access), as the RandomIt name implies.
44-
#else
94+
#elif CXXGRAPH_HAS_PSTL
4595
std::for_each(std::execution::par_unseq, first, last, f);
// NOTE(review): par_unseq also permits vectorised / interleaved calls of f on
// a single thread; under that policy f should not lock mutexes or otherwise
// block — confirm that callers' functors comply.
96+
#else
97+
std::for_each(first, last, f); // sequential fallback
4698
#endif
4799
}
48100

49-
// ── parallel_for ─────────────────────────────────────────────────────────────
50-
// Parallel loop over integer index range [begin, end).
101+
// ── parallel_for ─────────────────────────────────────────────────────────────
102+
// Parallel loop over an integer index range [begin, end).
51103
// Preserves the existing manual-thread API feel from concurrency_bfs.
52-
53104
template <typename IndexT, typename UnaryFn>
54105
void parallel_for(IndexT begin, IndexT end, UnaryFn f) {
// Calls f(i) for every index i in [begin, end). The backend is fixed at
// compile time: an OpenMP loop, a par_unseq std::for_each over a materialised
// index vector, or a plain sequential loop.
55-
#ifdef CXXGRAPH_WITH_OPENMP
56-
#pragma omp parallel for schedule(dynamic)
106+
#if defined(CXXGRAPH_WITH_OPENMP)
107+
#pragma omp parallel for schedule(dynamic)
57108
for (IndexT i = begin; i < end; ++i) f(i);
58-
#else
59-
// Build an index range and dispatch via par_unseq
109+
#elif CXXGRAPH_HAS_PSTL
60110
std::vector<IndexT> indices(static_cast<std::size_t>(end - begin));
// NOTE(review): unlike the loop-based branches, which simply run zero
// iterations when end <= begin, this cast turns a negative `end - begin`
// into a very large std::size_t and requests a huge allocation. Callers
// appear to be expected to pass begin <= end — confirm or add a guard.
61111
std::iota(indices.begin(), indices.end(), begin);
// indices now holds begin, begin+1, ..., end-1; this costs one allocation of
// (end - begin) elements per call.
62112
std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), f);
113+
#else
114+
// Sequential fallback: plain loop — no index vector allocation needed.
115+
for (IndexT i = begin; i < end; ++i) f(i);
63116
#endif
64117
}
65118

66-
// ── parallel_sort
67-
// ───────────────────────────────────────────────────────────── Sorts [first,
68-
// last) in parallel using comp.
69-
119+
// ── parallel_sort ─────────────────────────────────────────────────────────────
120+
// Sorts [first, last) using comp in parallel.
121+
//
122+
// OpenMP note: OpenMP (through 5.2) does not provide a parallel sort
123+
// primitive. Using std::execution::par_unseq inside an OpenMP translation
124+
// unit is dangerous on platforms where the PSTL backend is absent (Apple
125+
// Clang / libc++ without TBB). We therefore use sequential std::sort in the
126+
// OpenMP path. The calling algorithms (Kruskal, Welsh-Powell) already obtain
127+
// their speedup from the parallel loop primitives above.
70128
template <typename RandomIt, typename Compare>
71129
void parallel_sort(RandomIt first, RandomIt last, Compare comp) {
// Sorts [first, last) in place with std::sort, ordering elements by comp.
// Only the CXXGRAPH_HAS_PSTL branch passes an execution policy; the OpenMP
// and fallback branches both call the plain sequential std::sort.
72-
#ifdef CXXGRAPH_WITH_OPENMP
73-
// OpenMP doesn't provide sort; fall through to std::execution
130+
#if defined(CXXGRAPH_WITH_OPENMP)
131+
// Sequential sort — OpenMP has no parallel sort primitive.
132+
// Do NOT fall through to std::execution here: par_unseq may be unavailable
133+
// (Apple Clang without TBB) and would cause a hard compile error.
134+
std::sort(first, last, comp);
135+
#elif CXXGRAPH_HAS_PSTL
74136
std::sort(std::execution::par_unseq, first, last, comp);
// NOTE(review): with par_unseq, comp may be invoked concurrently and
// unsequenced; it is presumably expected to be free of side effects and
// locking — confirm against callers.
75137
#else
76-
std::sort(std::execution::par_unseq, first, last, comp);
138+
std::sort(first, last, comp); // sequential fallback
77139
#endif
78140
}
79141

80-
// Default comparator overload
142+
// Default comparator overload.
81143
template <typename RandomIt>
82144
void parallel_sort(RandomIt first, RandomIt last) {
// Convenience overload: forwards to the three-argument parallel_sort with a
// default-constructed std::less<>, i.e. elements are ordered by operator<.
83145
parallel_sort(first, last, std::less<>{});
84146
}
85147

86-
// ── parallel_transform
87-
// ──────────────────────────────────────────────────────── Applies f to each
88-
// element of [first, last) and writes results to d_first.
89-
148+
// ── parallel_transform ────────────────────────────────────────────────────────
149+
// Applies f to each element of [first, last) and writes results to d_first.
90150
template <typename InputIt, typename OutputIt, typename UnaryFn>
91151
void parallel_transform(InputIt first, InputIt last, OutputIt d_first,
92152
UnaryFn f) {
// Writes f(x) for each x in [first, last) to the range starting at d_first,
// keeping element positions aligned. Returns nothing: the end-of-output
// iterator produced by std::transform is discarded.
93-
#ifdef CXXGRAPH_WITH_OPENMP
94-
#pragma omp parallel for schedule(static)
95-
for (auto it = first; it < last; ++it) {
153+
#if defined(CXXGRAPH_WITH_OPENMP)
154+
#pragma omp parallel for schedule(static)
155+
for (auto it = first; it < last; ++it)
96156
*(d_first + std::distance(first, it)) = f(*it);
// Each iteration derives its own output slot from the offset of `it`, so
// iterations do not depend on one another. Despite the InputIt / OutputIt
// names, `it < last` and `d_first + n` require random-access iterators in
// this branch.
97-
}
98-
#else
157+
#elif CXXGRAPH_HAS_PSTL
99158
std::transform(std::execution::par_unseq, first, last, d_first, f);
159+
#else
160+
std::transform(first, last, d_first, f); // sequential fallback
100161
#endif
101162
}
102163

103-
// ── available_threads
104-
// ───────────────────────────────────────────────────────── Returns the number
105-
// of threads the runtime will use. Mirrors the Globals::threads pattern used by
106-
// the partitioning layer.
107-
164+
// ── available_threads ─────────────────────────────────────────────────────────
165+
// Returns omp_get_max_threads() when built with OpenMP, otherwise
// std::thread::hardware_concurrency().
166+
// Mirrors the Globals::threads pattern used by the partitioning layer.
108167
inline unsigned int available_threads() {
// Reports a thread count for the selected backend; takes no arguments and
// has no side effects visible here.
109-
#ifdef CXXGRAPH_WITH_OPENMP
168+
#if defined(CXXGRAPH_WITH_OPENMP)
110169
return static_cast<unsigned int>(omp_get_max_threads());
// omp_get_max_threads() returns int, hence the cast to the unsigned return
// type.
111170
#else
112171
return std::thread::hardware_concurrency();
// This branch also covers the sequential fallback (CXXGRAPH_HAS_PSTL == 0),
// where the loop primitives run single-threaded, so the value is the
// hardware's concurrency hint rather than the number of threads actually
// used. NOTE(review): hardware_concurrency() may return 0 when the value is
// not computable — confirm callers tolerate a zero result.
113172
#endif
114173
}
115174

116175
} // namespace Parallel
117-
} // namespace CXXGraph
176+
} // namespace CXXGraph

0 commit comments

Comments
 (0)