1818/* **********************************************************/
1919
2020#pragma once
21+
22+ // ── Standard includes (always available) ──────────────────────────────────────
2123#include < algorithm>
22- #include < execution>
2324#include < functional>
2425#include < numeric>
25- #ifdef CXXGRAPH_WITH_OPENMP
26- #include < omp.h>
26+ #include < vector>
27+
28+ // ── Backend selection ─────────────────────────────────────────────────────────
29+ //
30+ // Three backends are supported, in priority order:
31+ //
32+ // 1. OpenMP — activated by -DCXXGRAPH_WITH_OPENMP.
33+ // Uses #pragma omp for all loop primitives.
34+ // parallel_sort falls back to sequential std::sort because
35+ // OpenMP provides no parallel sort primitive.
36+ //
37+ // 2. PSTL/TBB — activated automatically when <execution> + a parallel
38+ // runtime (Intel oneTBB) are present, detected via the
39+ // standard feature-test macro __cpp_lib_parallel_algorithm.
40+ // Uses std::execution::par_unseq for all primitives.
41+ //
42+ // 3. Sequential — fallback when neither OpenMP nor PSTL is available
43+ // (e.g. Apple Clang without TBB, or any platform lacking a
44+ // parallel STL runtime). All primitives run single-threaded.
45+ //
46+ // IMPORTANT: Do NOT use std::execution::par_unseq unconditionally.
47+ // Apple Clang / libc++ on macOS does NOT provide parallel execution policies
48+ // unless TBB is linked AND detected via __cpp_lib_parallel_algorithm.
49+ // Calling std::execution::par_unseq without that check is a hard compile error
50+ // on macOS, arm64, and any libc++ build without a PSTL backend.
51+
// ── Backend detection ─────────────────────────────────────────────────────────
// Defines CXXGRAPH_HAS_PSTL to exactly 0 or 1 so it can be tested with a plain
// `#if` / `#elif`.
//
//   * CXXGRAPH_WITH_OPENMP defined: <omp.h> is included and CXXGRAPH_HAS_PSTL
//     is forced to 0. <execution> is never included in this mode, so PSTL must
//     not be assumed present.
//   * Otherwise: <thread> is included, and <execution> is included only if the
//     toolchain actually ships it. CXXGRAPH_HAS_PSTL becomes 1 only when the
//     header was found AND the standard library advertises
//     __cpp_lib_parallel_algorithm >= 201603L (P0024R2, C++17).
//
// Standard libraries that define the macro but do not actually parallelise
// still run the algorithms correctly, just sequentially.
#if defined(CXXGRAPH_WITH_OPENMP)

// ── OpenMP path ───────────────────────────────────────────────────────────────
#include <omp.h>
#define CXXGRAPH_HAS_PSTL 0

#else  // !CXXGRAPH_WITH_OPENMP

// ── PSTL / sequential path ────────────────────────────────────────────────────
#include <thread>

// Probe for <execution> rather than including it unconditionally: an
// unconditional include is a hard error on a toolchain without the header,
// which would defeat the sequential fallback below.
#if defined(__has_include)
#if __has_include(<execution>)
#include <execution>
// The feature-test macro is provided by <algorithm>, which this header already
// includes before reaching this point.
#if defined(__cpp_lib_parallel_algorithm) && \
    __cpp_lib_parallel_algorithm >= 201603L
#define CXXGRAPH_HAS_PSTL 1
#endif
#endif  // __has_include(<execution>)
#endif  // defined(__has_include)

// Anything that did not positively detect a parallel STL runs sequentially.
#ifndef CXXGRAPH_HAS_PSTL
#define CXXGRAPH_HAS_PSTL 0
#endif

#endif  // CXXGRAPH_WITH_OPENMP
3081
3182namespace CXXGraph {
3283namespace Parallel {
3384
// ── parallel_for_each ─────────────────────────────────────────────────────────
// Invokes f once on every element of [first, last). Invocations may run
// concurrently and in any order.
//
// Backend dispatch:
//   * CXXGRAPH_WITH_OPENMP defined → OpenMP worksharing loop, dynamic schedule
//                                    (requires random-access iterators).
//   * CXXGRAPH_HAS_PSTL non-zero   → std::for_each with par_unseq.
//   * otherwise                    → plain sequential traversal.
//
// f must not write shared state without explicit synchronisation.
// NOTE(review): under par_unseq the standard restricts what f may do (for
// example, blocking on a mutex is not allowed) — confirm callers comply.
template <typename RandomIt, typename UnaryFn>
void parallel_for_each(RandomIt first, RandomIt last, UnaryFn f) {
#if defined(CXXGRAPH_WITH_OPENMP)
  const auto count = last - first;
#pragma omp parallel for schedule(dynamic)
  for (decltype(last - first) idx = 0; idx < count; ++idx) f(first[idx]);
#elif CXXGRAPH_HAS_PSTL
  std::for_each(std::execution::par_unseq, first, last, f);
#else
  // Sequential fallback: walk the range directly.
  while (first != last) {
    f(*first);
    ++first;
  }
#endif
}
48100
// ── parallel_for ──────────────────────────────────────────────────────────────
// Calls f(i) for every index i in the half-open range [begin, end).
// Invocations may run concurrently and in any order on the parallel backends.
// Preserves the existing manual-thread API feel from concurrency_bfs.
//
// An empty or inverted range (begin >= end) is a no-op on every backend.
//
// Backend dispatch:
//   * CXXGRAPH_WITH_OPENMP defined → OpenMP worksharing loop, dynamic schedule.
//   * CXXGRAPH_HAS_PSTL non-zero   → materialises the indices in a vector and
//                                    runs std::for_each with par_unseq.
//   * otherwise                    → plain sequential loop, no allocation.
template <typename IndexT, typename UnaryFn>
void parallel_for(IndexT begin, IndexT end, UnaryFn f) {
  // Guard shared by all backends. Without it the PSTL branch would convert a
  // negative (or wrapped unsigned) `end - begin` into an enormous vector size,
  // whereas the loop-based branches simply execute zero iterations.
  if (!(begin < end)) return;

#if defined(CXXGRAPH_WITH_OPENMP)
#pragma omp parallel for schedule(dynamic)
  for (IndexT i = begin; i < end; ++i) f(i);
#elif CXXGRAPH_HAS_PSTL
  // The parallel algorithms need an iterator range, so build one holding
  // begin, begin + 1, ..., end - 1.
  std::vector<IndexT> indices(static_cast<std::size_t>(end - begin));
  std::iota(indices.begin(), indices.end(), begin);
  std::for_each(std::execution::par_unseq, indices.begin(), indices.end(), f);
#else
  // Sequential fallback: plain loop — no index vector allocation needed.
  for (IndexT i = begin; i < end; ++i) f(i);
#endif
}
65118
// ── parallel_sort ─────────────────────────────────────────────────────────────
// Sorts [first, last) according to comp.
//
// Only the PSTL backend sorts in parallel (std::sort with par_unseq). Both the
// OpenMP build and the sequential fallback use the ordinary std::sort.
//
// OpenMP note: OpenMP provides no parallel sort primitive, and the execution
// policies must not be assumed available in an OpenMP build (<execution> is
// not included there), so that path deliberately never reaches par_unseq.
template <typename RandomIt, typename Compare>
void parallel_sort(RandomIt first, RandomIt last, Compare comp) {
#if !defined(CXXGRAPH_WITH_OPENMP) && CXXGRAPH_HAS_PSTL
  std::sort(std::execution::par_unseq, first, last, comp);
#else
  // OpenMP build or no parallel STL detected: single-threaded sort.
  std::sort(first, last, comp);
#endif
}
79141
// Convenience overload: sorts [first, last) in ascending order by forwarding
// to the comparator overload with a transparent std::less<>.
template <typename RandomIt>
void parallel_sort(RandomIt first, RandomIt last) {
  using Ascending = std::less<>;
  parallel_sort(first, last, Ascending{});
}
85147
// ── parallel_transform ────────────────────────────────────────────────────────
// Writes f(x) for each x in [first, last) to the range starting at d_first,
// keeping positions aligned (the i-th input produces the i-th output).
//
// Backend dispatch:
//   * CXXGRAPH_WITH_OPENMP defined → OpenMP worksharing loop, static schedule
//                                    (requires random-access iterators).
//   * CXXGRAPH_HAS_PSTL non-zero   → std::transform with par_unseq.
//   * otherwise                    → plain sequential traversal.
template <typename InputIt, typename OutputIt, typename UnaryFn>
void parallel_transform(InputIt first, InputIt last, OutputIt d_first,
                        UnaryFn f) {
#if defined(CXXGRAPH_WITH_OPENMP)
  const auto count = last - first;
#pragma omp parallel for schedule(static)
  for (decltype(last - first) idx = 0; idx < count; ++idx)
    d_first[idx] = f(first[idx]);
#elif CXXGRAPH_HAS_PSTL
  std::transform(std::execution::par_unseq, first, last, d_first, f);
#else
  // Sequential fallback: advance input and output in lock-step.
  while (first != last) {
    *d_first = f(*first);
    ++first;
    ++d_first;
  }
#endif
}
102163
// ── available_threads ─────────────────────────────────────────────────────────
// Returns the number of threads the runtime will use; the result is always at
// least 1.
//   * OpenMP build: omp_get_max_threads().
//   * Otherwise:    std::thread::hardware_concurrency(), which is allowed to
//                   return 0 when the value cannot be determined. That case is
//                   reported as 1 so callers can safely use the result as a
//                   thread count or a divisor.
// Per the original note, this mirrors the Globals::threads pattern used by the
// partitioning layer.
inline unsigned int available_threads() {
#if defined(CXXGRAPH_WITH_OPENMP)
  return static_cast<unsigned int>(omp_get_max_threads());
#else
  const unsigned int detected = std::thread::hardware_concurrency();
  return detected != 0u ? detected : 1u;
#endif
}
115174
116175} // namespace Parallel
117- } // namespace CXXGraph
176+ } // namespace CXXGraph
0 commit comments