3 // Copyright (C) 2007 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
43 #include <parallel/basic_iterator.h>
44 #include <bits/stl_algo.h>
46 #include <parallel/parallel.h>
47 #include <parallel/base.h>
48 #include <parallel/random_number.h>
49 #include <parallel/timing.h>
51 namespace __gnu_parallel
53 /** @brief Type to hold the index of a bin.
55 * Since many variables of this type are allocated, it should be
56 * chosen as small as possible.
58 typedef unsigned short bin_index;
60 /** @brief Data known to every thread participating in
61 __gnu_parallel::parallel_random_shuffle(). */
62 template<typename RandomAccessIterator>
63 struct DRandomShufflingGlobalData
65 typedef std::iterator_traits<RandomAccessIterator> traits_type;
66 typedef typename traits_type::value_type value_type;
67 typedef typename traits_type::difference_type difference_type;
69 /** @brief Begin iterator of the source. */
70 RandomAccessIterator& source;
72 /** @brief Temporary arrays for each thread. */
73 value_type** temporaries;
75 /** @brief Two-dimensional array to hold the thread-bin distribution.
77 * Dimensions (num_threads + 1) x (num_bins + 1). */
78 difference_type** dist;
80 /** @brief Start indexes of the threads' chunks. */
81 difference_type* starts;
83 /** @brief Number of the thread that will further process the
85 thread_index_t* bin_proc;
87 /** @brief Number of bins to distribute to. */
90 /** @brief Number of bits needed to address the bins. */
93 /** @brief Constructor. */
94 DRandomShufflingGlobalData(RandomAccessIterator& _source)
98 /** @brief Local data for a thread participating in
99 __gnu_parallel::parallel_random_shuffle().
101 template<typename RandomAccessIterator, typename RandomNumberGenerator>
104 /** @brief Number of threads participating in total. */
107 /** @brief Number of owning thread. */
110 /** @brief Begin index for bins taken care of by this thread. */
111 bin_index bins_begin;
113 /** @brief End index for bins taken care of by this thread. */
116 /** @brief Random seed for this thread. */
119 /** @brief Pointer to global data. */
120 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
123 /** @brief Generate a random number in @c [0,2^logp).
124 * @param logp Logarithm (basis 2) of the upper range bound.
125 * @param rng Random number generator to use.
127 template<typename RandomNumberGenerator>
128 inline int random_number_pow2(int logp, RandomNumberGenerator& rng)
130 return rng.genrand_bits(logp);
133 /** @brief Random shuffle code executed by each thread.
134 * @param pus Arary of thread-local data records. */
135 template<typename RandomAccessIterator, typename RandomNumberGenerator>
136 inline void parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* pus)
138 typedef std::iterator_traits<RandomAccessIterator> traits_type;
139 typedef typename traits_type::value_type value_type;
140 typedef typename traits_type::difference_type difference_type;
142 Timing<sequential_tag> t;
145 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
146 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
147 thread_index_t iam = d->iam;
149 // Indexing: dist[bin][processor]
150 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
151 bin_index* oracles = new bin_index[length];
152 difference_type* dist = new difference_type[sd->num_bins + 1];
153 bin_index* bin_proc = new bin_index[sd->num_bins];
154 value_type** temporaries = new value_type*[d->num_threads];
156 // Compute oracles and count appearances.
157 for (bin_index b = 0; b < sd->num_bins + 1; b++)
159 int num_bits = sd->num_bits;
161 random_number rng(d->seed);
164 for (difference_type i = 0; i < length; i++)
166 bin_index oracle = random_number_pow2(num_bits, rng);
169 // To allow prefix (partial) sum.
173 for (bin_index b = 0; b < sd->num_bins + 1; b++)
174 sd->dist[b][iam + 1] = dist[b];
184 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
185 // total number of items in bin s
186 for (bin_index s = 0; s < sd->num_bins; s++)
187 partial_sum(sd->dist[s + 1], sd->dist[s + 1] + d->num_threads + 1, sd->dist[s + 1]);
194 sequence_index_t offset = 0, global_offset = 0;
195 for (bin_index s = 0; s < d->bins_begin; s++)
196 global_offset += sd->dist[s + 1][d->num_threads];
200 for (bin_index s = d->bins_begin; s < d->bins_end; s++)
202 for (int t = 0; t < d->num_threads + 1; t++)
203 sd->dist[s + 1][t] += offset;
204 offset = sd->dist[s + 1][d->num_threads];
207 sd->temporaries[iam] = new value_type[offset];
215 // Draw local copies to avoid false sharing.
216 for (bin_index b = 0; b < sd->num_bins + 1; b++)
217 dist[b] = sd->dist[b][iam];
218 for (bin_index b = 0; b < sd->num_bins; b++)
219 bin_proc[b] = sd->bin_proc[b];
220 for (thread_index_t t = 0; t < d->num_threads; t++)
221 temporaries[t] = sd->temporaries[t];
223 RandomAccessIterator source = sd->source;
224 difference_type start = sd->starts[iam];
226 // Distribute according to oracles, second main loop.
227 for (difference_type i = 0; i < length; i++)
229 bin_index target_bin = oracles[i];
230 thread_index_t target_p = bin_proc[target_bin];
232 // Last column [d->num_threads] stays unchanged.
233 temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
239 delete[] temporaries;
247 // Shuffle bins internally.
248 for (bin_index b = d->bins_begin; b < d->bins_end; b++)
250 value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
251 * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
252 sequential_random_shuffle(begin, end, rng);
253 std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
256 delete[] sd->temporaries[iam];
263 /** @brief Round up to the next greater power of 2.
264 * @param x Integer to round up */
266 T round_up_to_pow2(T x)
271 return (T)1 << (log2(x - 1) + 1);
274 /** @brief Main parallel random shuffle step.
275 * @param begin Begin iterator of sequence.
276 * @param end End iterator of sequence.
277 * @param n Length of sequence.
278 * @param num_threads Number of threads to use.
279 * @param rng Random number generator to use.
281 template<typename RandomAccessIterator, typename RandomNumberGenerator>
283 parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
285 typedef std::iterator_traits<RandomAccessIterator> traits_type;
286 typedef typename traits_type::value_type value_type;
287 typedef typename traits_type::difference_type difference_type;
292 num_threads = static_cast<thread_index_t>(n);
294 bin_index num_bins, num_bins_cache;
296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
297 // Try the L1 cache first.
300 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
301 num_bins_cache = round_up_to_pow2(num_bins_cache);
303 // No more buckets than TLB entries, power of 2
304 // Power of 2 and at least one element per bin, at most the TLB size.
305 num_bins = std::min(n, (difference_type)num_bins_cache);
307 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
308 // 2 TLB entries needed per bin.
309 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
311 num_bins = round_up_to_pow2(num_bins);
313 if (num_bins < num_bins_cache)
316 // Now try the L2 cache
318 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
319 num_bins_cache = round_up_to_pow2(num_bins_cache);
321 // No more buckets than TLB entries, power of 2.
322 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
323 // Power of 2 and at least one element per bin, at most the TLB size.
324 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
325 // 2 TLB entries needed per bin.
326 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
328 num_bins = round_up_to_pow2(num_bins);
329 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
333 num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
335 if (num_threads <= 1)
336 return sequential_random_shuffle(begin, end, rng);
338 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
340 DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
342 sd.temporaries = new value_type*[num_threads];
343 //sd.oracles = new bin_index[n];
344 sd.dist = new difference_type*[num_bins + 1];
345 sd.bin_proc = new thread_index_t[num_bins];
346 for (bin_index b = 0; b < num_bins + 1; b++)
347 sd.dist[b] = new difference_type[num_threads + 1];
348 for (bin_index b = 0; b < (num_bins + 1); b++)
353 difference_type* starts = sd.starts = new difference_type[num_threads + 1];
355 sd.num_bins = num_bins;
356 sd.num_bits = log2(num_bins);
358 difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
359 int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
360 for (int i = 0; i < num_threads; i++)
363 start += (i < split) ? (chunk_length + 1) : chunk_length;
364 int j = pus[i].bins_begin = bin_cursor;
366 // Range of bins for this processor.
367 bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
368 pus[i].bins_end = bin_cursor;
369 for (; j < bin_cursor; j++)
371 pus[i].num_threads = num_threads;
373 pus[i].seed = rng(std::numeric_limits<uint32>::max());
376 starts[num_threads] = start;
378 // Now shuffle in parallel.
379 #pragma omp parallel num_threads(num_threads)
380 parallel_random_shuffle_drs_pu(pus);
383 delete[] sd.bin_proc;
384 for (int s = 0; s < (num_bins + 1); s++)
387 delete[] sd.temporaries;
392 /** @brief Sequential cache-efficient random shuffle.
393 * @param begin Begin iterator of sequence.
394 * @param end End iterator of sequence.
395 * @param rng Random number generator to use.
397 template<typename RandomAccessIterator, typename RandomNumberGenerator>
399 sequential_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator& rng)
401 typedef std::iterator_traits<RandomAccessIterator> traits_type;
402 typedef typename traits_type::value_type value_type;
403 typedef typename traits_type::difference_type difference_type;
405 difference_type n = end - begin;
407 bin_index num_bins, num_bins_cache;
409 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
410 // Try the L1 cache first, must fit into L1.
411 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
412 num_bins_cache = round_up_to_pow2(num_bins_cache);
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins = std::min(n, (difference_type)num_bins_cache);
417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
419 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
421 num_bins = round_up_to_pow2(num_bins);
423 if (num_bins < num_bins_cache)
426 // Now try the L2 cache, must fit into L2.
427 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
428 num_bins_cache = round_up_to_pow2(num_bins_cache);
430 // No more buckets than TLB entries, power of 2
431 // Power of 2 and at least one element per bin, at most the TLB size.
432 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
434 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
435 // 2 TLB entries needed per bin
436 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
438 num_bins = round_up_to_pow2(num_bins);
439 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
443 int num_bits = log2(num_bins);
447 value_type* target = new value_type[n];
448 bin_index* oracles = new bin_index[n];
449 difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
451 for (int b = 0; b < num_bins + 1; b++)
454 Timing<sequential_tag> t;
457 random_number bitrng(rng(0xFFFFFFFF));
459 for (difference_type i = 0; i < n; i++)
461 bin_index oracle = random_number_pow2(num_bits, bitrng);
464 // To allow prefix (partial) sum.
471 partial_sum(dist0, dist0 + num_bins + 1, dist0);
473 for (int b = 0; b < num_bins + 1; b++)
478 // Distribute according to oracles.
479 for (difference_type i = 0; i < n; i++)
480 target[(dist0[oracles[i]])++] = *(begin + i);
482 for (int b = 0; b < num_bins; b++)
484 sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
496 __gnu_sequential::random_shuffle(begin, end, rng);
499 /** @brief Parallel random public call.
500 * @param begin Begin iterator of sequence.
501 * @param end End iterator of sequence.
502 * @param rng Random number generator to use.
504 template<typename RandomAccessIterator, typename RandomNumberGenerator>
506 parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator rng = random_number())
508 typedef std::iterator_traits<RandomAccessIterator> traits_type;
509 typedef typename traits_type::difference_type difference_type;
510 difference_type n = end - begin;
511 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;